From e09b41010ba33a20a87472ee821fa407a5b8da36 Mon Sep 17 00:00:00 2001 From: José Pekkarinen Date: Mon, 11 Apr 2016 10:41:07 +0300 Subject: These changes are the raw update to linux-4.4.6-rt14. Kernel sources are taken from kernel.org, and rt patch from the rt wiki download page. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During the rebasing, the following patch collided: Force tick interrupt and get rid of softirq magic(I70131fb85). Collisions have been removed because its logic was found on the source already. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen --- kernel/net/6lowpan/Makefile | 2 +- kernel/net/6lowpan/core.c | 45 + kernel/net/6lowpan/iphc.c | 707 ++++-- kernel/net/6lowpan/nhc.c | 16 +- kernel/net/6lowpan/nhc.h | 14 +- kernel/net/6lowpan/nhc_udp.c | 35 +- kernel/net/8021q/vlan.c | 96 + kernel/net/8021q/vlan_core.c | 14 +- kernel/net/8021q/vlan_dev.c | 3 +- kernel/net/9p/client.c | 8 + kernel/net/9p/trans_rdma.c | 34 +- kernel/net/9p/trans_virtio.c | 1 + kernel/net/Kconfig | 11 + kernel/net/Makefile | 3 + kernel/net/appletalk/ddp.c | 2 +- kernel/net/atm/br2684.c | 9 +- kernel/net/atm/clip.c | 3 + kernel/net/atm/common.c | 4 +- kernel/net/atm/common.h | 2 +- kernel/net/atm/pvc.c | 2 +- kernel/net/atm/svc.c | 2 +- kernel/net/ax25/af_ax25.c | 38 +- kernel/net/ax25/ax25_in.c | 3 +- kernel/net/ax25/ax25_ip.c | 1 - kernel/net/ax25/ax25_out.c | 1 - kernel/net/ax25/ax25_subr.c | 1 + kernel/net/ax25/ax25_uid.c | 1 - kernel/net/batman-adv/Makefile | 6 +- kernel/net/batman-adv/bat_algo.h | 2 +- kernel/net/batman-adv/bat_iv_ogm.c | 336 +-- kernel/net/batman-adv/bitarray.c | 12 +- kernel/net/batman-adv/bitarray.h | 18 +- kernel/net/batman-adv/bridge_loop_avoidance.c | 144 +- kernel/net/batman-adv/bridge_loop_avoidance.h | 14 +- kernel/net/batman-adv/debugfs.c | 47 +- kernel/net/batman-adv/debugfs.h | 42 +- kernel/net/batman-adv/distributed-arp-table.c | 137 +- 
kernel/net/batman-adv/distributed-arp-table.h | 23 +- kernel/net/batman-adv/fragmentation.c | 57 +- kernel/net/batman-adv/fragmentation.h | 11 +- kernel/net/batman-adv/gateway_client.c | 124 +- kernel/net/batman-adv/gateway_client.h | 14 +- kernel/net/batman-adv/gateway_common.c | 80 +- kernel/net/batman-adv/gateway_common.h | 8 +- kernel/net/batman-adv/hard-interface.c | 84 +- kernel/net/batman-adv/hard-interface.h | 25 +- kernel/net/batman-adv/hash.c | 14 +- kernel/net/batman-adv/hash.h | 45 +- kernel/net/batman-adv/icmp_socket.c | 41 +- kernel/net/batman-adv/icmp_socket.h | 8 +- kernel/net/batman-adv/main.c | 187 +- kernel/net/batman-adv/main.h | 86 +- kernel/net/batman-adv/multicast.c | 145 +- kernel/net/batman-adv/multicast.h | 6 +- kernel/net/batman-adv/network-coding.c | 134 +- kernel/net/batman-adv/network-coding.h | 11 +- kernel/net/batman-adv/originator.c | 298 +-- kernel/net/batman-adv/originator.h | 45 +- kernel/net/batman-adv/packet.h | 209 +- kernel/net/batman-adv/routing.c | 81 +- kernel/net/batman-adv/routing.h | 10 +- kernel/net/batman-adv/send.c | 51 +- kernel/net/batman-adv/send.h | 20 +- kernel/net/batman-adv/soft-interface.c | 112 +- kernel/net/batman-adv/soft-interface.h | 9 +- kernel/net/batman-adv/sysfs.c | 66 +- kernel/net/batman-adv/sysfs.h | 10 +- kernel/net/batman-adv/translation-table.c | 482 ++-- kernel/net/batman-adv/translation-table.h | 38 +- kernel/net/batman-adv/types.h | 161 +- kernel/net/bluetooth/6lowpan.c | 214 +- kernel/net/bluetooth/Kconfig | 5 + kernel/net/bluetooth/Makefile | 6 +- kernel/net/bluetooth/a2mp.c | 17 +- kernel/net/bluetooth/a2mp.h | 19 + kernel/net/bluetooth/af_bluetooth.c | 10 +- kernel/net/bluetooth/amp.c | 134 +- kernel/net/bluetooth/amp.h | 14 + kernel/net/bluetooth/bnep/sock.c | 2 +- kernel/net/bluetooth/cmtp/capi.c | 8 +- kernel/net/bluetooth/cmtp/sock.c | 2 +- kernel/net/bluetooth/hci_conn.c | 427 +++- kernel/net/bluetooth/hci_core.c | 401 ++- kernel/net/bluetooth/hci_event.c | 372 +-- 
kernel/net/bluetooth/hci_request.c | 137 +- kernel/net/bluetooth/hci_request.h | 4 + kernel/net/bluetooth/hci_sock.c | 113 +- kernel/net/bluetooth/hidp/core.c | 15 + kernel/net/bluetooth/hidp/sock.c | 2 +- kernel/net/bluetooth/l2cap_core.c | 43 +- kernel/net/bluetooth/l2cap_sock.c | 104 +- kernel/net/bluetooth/lib.c | 32 + kernel/net/bluetooth/mgmt.c | 814 +++--- kernel/net/bluetooth/rfcomm/core.c | 2 +- kernel/net/bluetooth/rfcomm/sock.c | 28 +- kernel/net/bluetooth/sco.c | 63 +- kernel/net/bluetooth/smp.c | 253 +- kernel/net/bluetooth/smp.h | 1 + kernel/net/bridge/Makefile | 2 + kernel/net/bridge/br.c | 25 +- kernel/net/bridge/br_device.c | 16 +- kernel/net/bridge/br_fdb.c | 242 +- kernel/net/bridge/br_forward.c | 62 +- kernel/net/bridge/br_if.c | 8 +- kernel/net/bridge/br_input.c | 35 +- kernel/net/bridge/br_ioctl.c | 3 +- kernel/net/bridge/br_mdb.c | 159 +- kernel/net/bridge/br_multicast.c | 399 +-- kernel/net/bridge/br_netfilter.c | 1140 --------- kernel/net/bridge/br_netfilter_hooks.c | 1039 ++++++++ kernel/net/bridge/br_netfilter_ipv6.c | 244 ++ kernel/net/bridge/br_netlink.c | 597 ++++- kernel/net/bridge/br_private.h | 240 +- kernel/net/bridge/br_stp.c | 42 +- kernel/net/bridge/br_stp_bpdu.c | 12 +- kernel/net/bridge/br_stp_if.c | 32 +- kernel/net/bridge/br_stp_timer.c | 4 +- kernel/net/bridge/br_sysfs_br.c | 11 +- kernel/net/bridge/br_sysfs_if.c | 2 +- kernel/net/bridge/br_vlan.c | 800 +++--- kernel/net/bridge/netfilter/ebt_log.c | 2 +- kernel/net/bridge/netfilter/ebt_nflog.c | 2 +- kernel/net/bridge/netfilter/ebt_stp.c | 6 +- kernel/net/bridge/netfilter/ebtable_broute.c | 8 +- kernel/net/bridge/netfilter/ebtable_filter.c | 13 +- kernel/net/bridge/netfilter/ebtable_nat.c | 13 +- kernel/net/bridge/netfilter/ebtables.c | 20 +- kernel/net/bridge/netfilter/nf_tables_bridge.c | 20 +- kernel/net/bridge/netfilter/nft_reject_bridge.c | 19 +- kernel/net/caif/caif_dev.c | 2 +- kernel/net/caif/caif_socket.c | 25 +- kernel/net/can/af_can.c | 2 +- kernel/net/can/bcm.c 
| 15 +- kernel/net/can/gw.c | 68 +- kernel/net/ceph/auth_x.c | 36 +- kernel/net/ceph/ceph_common.c | 87 +- kernel/net/ceph/crush/crush.c | 13 +- kernel/net/ceph/crush/crush_ln_table.h | 32 +- kernel/net/ceph/crush/hash.c | 8 +- kernel/net/ceph/crush/mapper.c | 148 +- kernel/net/ceph/crypto.c | 10 +- kernel/net/ceph/crypto.h | 4 +- kernel/net/ceph/messenger.c | 291 ++- kernel/net/ceph/mon_client.c | 50 +- kernel/net/ceph/osd_client.c | 140 +- kernel/net/ceph/osdmap.c | 2 +- kernel/net/ceph/pagevec.c | 5 +- kernel/net/core/Makefile | 1 + kernel/net/core/datagram.c | 2 +- kernel/net/core/dev.c | 491 +++- kernel/net/core/dst.c | 127 +- kernel/net/core/ethtool.c | 15 +- kernel/net/core/fib_rules.c | 34 +- kernel/net/core/filter.c | 543 +++- kernel/net/core/flow_dissector.c | 820 +++++-- kernel/net/core/gen_estimator.c | 13 +- kernel/net/core/lwtunnel.c | 249 ++ kernel/net/core/neighbour.c | 68 +- kernel/net/core/net-sysfs.c | 62 +- kernel/net/core/net-traces.c | 1 + kernel/net/core/net_namespace.c | 133 +- kernel/net/core/netclassid_cgroup.c | 31 +- kernel/net/core/netevent.c | 5 +- kernel/net/core/netpoll.c | 31 +- kernel/net/core/netprio_cgroup.c | 9 +- kernel/net/core/pktgen.c | 133 +- kernel/net/core/ptp_classifier.c | 16 +- kernel/net/core/request_sock.c | 88 +- kernel/net/core/rtnetlink.c | 406 ++- kernel/net/core/scm.c | 9 + kernel/net/core/secure_seq.c | 2 +- kernel/net/core/skbuff.c | 421 ++-- kernel/net/core/sock.c | 176 +- kernel/net/core/sock_diag.c | 97 +- kernel/net/core/stream.c | 12 +- kernel/net/core/sysctl_net_core.c | 10 + kernel/net/core/timestamping.c | 6 +- kernel/net/core/tso.c | 18 +- kernel/net/core/utils.c | 70 +- kernel/net/dcb/dcbnl.c | 30 +- kernel/net/dccp/ackvec.c | 12 +- kernel/net/dccp/ccid.c | 3 +- kernel/net/dccp/dccp.h | 16 +- kernel/net/dccp/diag.c | 1 + kernel/net/dccp/ipv4.c | 94 +- kernel/net/dccp/ipv6.c | 175 +- kernel/net/dccp/minisocks.c | 22 +- kernel/net/dccp/output.c | 17 +- kernel/net/dccp/probe.c | 11 +- 
kernel/net/dccp/proto.c | 5 +- kernel/net/decnet/af_decnet.c | 19 +- kernel/net/decnet/dn_neigh.c | 23 +- kernel/net/decnet/dn_nsp_in.c | 7 +- kernel/net/decnet/dn_nsp_out.c | 4 +- kernel/net/decnet/dn_route.c | 38 +- kernel/net/decnet/dn_rules.c | 1 - kernel/net/decnet/netfilter/dn_rtmsg.c | 2 +- kernel/net/dns_resolver/dns_key.c | 20 +- kernel/net/dns_resolver/dns_query.c | 9 +- kernel/net/dns_resolver/internal.h | 8 + kernel/net/dsa/dsa.c | 216 +- kernel/net/dsa/dsa_priv.h | 8 +- kernel/net/dsa/slave.c | 479 +++- kernel/net/dsa/tag_brcm.c | 15 +- kernel/net/dsa/tag_dsa.c | 12 +- kernel/net/dsa/tag_edsa.c | 12 +- kernel/net/dsa/tag_trailer.c | 14 +- kernel/net/ethernet/eth.c | 19 +- kernel/net/hsr/hsr_device.c | 4 +- kernel/net/ieee802154/6lowpan/6lowpan_i.h | 25 +- kernel/net/ieee802154/6lowpan/core.c | 199 +- kernel/net/ieee802154/6lowpan/reassembly.c | 174 +- kernel/net/ieee802154/6lowpan/rx.c | 378 ++- kernel/net/ieee802154/6lowpan/tx.c | 100 +- kernel/net/ieee802154/Kconfig | 5 + kernel/net/ieee802154/core.c | 14 +- kernel/net/ieee802154/core.h | 1 + kernel/net/ieee802154/header_ops.c | 20 +- kernel/net/ieee802154/nl-mac.c | 39 +- kernel/net/ieee802154/nl-phy.c | 10 +- kernel/net/ieee802154/nl802154.c | 1620 +++++++++++- kernel/net/ieee802154/rdev-ops.h | 165 ++ kernel/net/ieee802154/socket.c | 30 +- kernel/net/ieee802154/sysfs.c | 38 + kernel/net/ieee802154/trace.h | 79 +- kernel/net/ipv4/Kconfig | 34 +- kernel/net/ipv4/Makefile | 3 +- kernel/net/ipv4/af_inet.c | 89 +- kernel/net/ipv4/ah4.c | 4 +- kernel/net/ipv4/arp.c | 147 +- kernel/net/ipv4/datagram.c | 2 +- kernel/net/ipv4/devinet.c | 25 +- kernel/net/ipv4/esp4.c | 201 +- kernel/net/ipv4/fib_frontend.c | 130 +- kernel/net/ipv4/fib_lookup.h | 1 + kernel/net/ipv4/fib_rules.c | 6 +- kernel/net/ipv4/fib_semantics.c | 520 +++- kernel/net/ipv4/fib_trie.c | 90 +- kernel/net/ipv4/fou.c | 35 +- kernel/net/ipv4/geneve.c | 453 ---- kernel/net/ipv4/gre_demux.c | 235 +- kernel/net/ipv4/gre_offload.c | 3 +- 
kernel/net/ipv4/icmp.c | 32 +- kernel/net/ipv4/igmp.c | 214 +- kernel/net/ipv4/inet_connection_sock.c | 297 ++- kernel/net/ipv4/inet_diag.c | 152 +- kernel/net/ipv4/inet_fragment.c | 46 +- kernel/net/ipv4/inet_hashtables.c | 137 +- kernel/net/ipv4/inet_timewait_sock.c | 73 +- kernel/net/ipv4/inetpeer.c | 20 +- kernel/net/ipv4/ip_forward.c | 37 +- kernel/net/ipv4/ip_fragment.c | 120 +- kernel/net/ipv4/ip_gre.c | 464 +++- kernel/net/ipv4/ip_input.c | 50 +- kernel/net/ipv4/ip_output.c | 204 +- kernel/net/ipv4/ip_sockglue.c | 54 +- kernel/net/ipv4/ip_tunnel.c | 37 +- kernel/net/ipv4/ip_tunnel_core.c | 258 +- kernel/net/ipv4/ip_vti.c | 2 +- kernel/net/ipv4/ipconfig.c | 34 +- kernel/net/ipv4/ipip.c | 8 +- kernel/net/ipv4/ipmr.c | 39 +- kernel/net/ipv4/netfilter.c | 16 +- kernel/net/ipv4/netfilter/Kconfig | 17 +- kernel/net/ipv4/netfilter/Makefile | 3 + kernel/net/ipv4/netfilter/arp_tables.c | 131 +- kernel/net/ipv4/netfilter/arptable_filter.c | 7 +- kernel/net/ipv4/netfilter/ip_tables.c | 170 +- kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c | 17 +- kernel/net/ipv4/netfilter/ipt_ECN.c | 2 +- kernel/net/ipv4/netfilter/ipt_REJECT.c | 2 +- kernel/net/ipv4/netfilter/ipt_SYNPROXY.c | 35 +- kernel/net/ipv4/netfilter/ipt_ah.c | 2 +- kernel/net/ipv4/netfilter/ipt_rpfilter.c | 11 +- kernel/net/ipv4/netfilter/iptable_filter.c | 9 +- kernel/net/ipv4/netfilter/iptable_mangle.c | 19 +- kernel/net/ipv4/netfilter/iptable_nat.c | 26 +- kernel/net/ipv4/netfilter/iptable_raw.c | 9 +- kernel/net/ipv4/netfilter/iptable_security.c | 12 +- .../net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 20 +- .../net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 8 +- kernel/net/ipv4/netfilter/nf_defrag_ipv4.c | 42 +- kernel/net/ipv4/netfilter/nf_dup_ipv4.c | 106 + kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 48 +- kernel/net/ipv4/netfilter/nf_nat_pptp.c | 2 +- kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c | 2 +- kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c | 2 +- kernel/net/ipv4/netfilter/nf_reject_ipv4.c 
| 6 +- kernel/net/ipv4/netfilter/nf_tables_arp.c | 6 +- kernel/net/ipv4/netfilter/nf_tables_ipv4.c | 10 +- kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c | 22 +- kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c | 8 +- kernel/net/ipv4/netfilter/nft_dup_ipv4.c | 110 + kernel/net/ipv4/netfilter/nft_masq_ipv4.c | 2 +- kernel/net/ipv4/netfilter/nft_redir_ipv4.c | 2 +- kernel/net/ipv4/netfilter/nft_reject_ipv4.c | 5 +- kernel/net/ipv4/ping.c | 7 +- kernel/net/ipv4/proc.c | 4 + kernel/net/ipv4/raw.c | 28 +- kernel/net/ipv4/route.c | 354 ++- kernel/net/ipv4/syncookies.c | 33 +- kernel/net/ipv4/sysctl_net_ipv4.c | 59 +- kernel/net/ipv4/tcp.c | 189 +- kernel/net/ipv4/tcp_bic.c | 2 +- kernel/net/ipv4/tcp_cdg.c | 433 ++++ kernel/net/ipv4/tcp_cong.c | 27 +- kernel/net/ipv4/tcp_cubic.c | 26 +- kernel/net/ipv4/tcp_dctcp.c | 26 +- kernel/net/ipv4/tcp_diag.c | 8 +- kernel/net/ipv4/tcp_fastopen.c | 75 +- kernel/net/ipv4/tcp_highspeed.c | 2 +- kernel/net/ipv4/tcp_htcp.c | 2 +- kernel/net/ipv4/tcp_hybla.c | 2 +- kernel/net/ipv4/tcp_illinois.c | 2 +- kernel/net/ipv4/tcp_input.c | 532 ++-- kernel/net/ipv4/tcp_ipv4.c | 273 +- kernel/net/ipv4/tcp_metrics.c | 83 +- kernel/net/ipv4/tcp_minisocks.c | 88 +- kernel/net/ipv4/tcp_offload.c | 4 +- kernel/net/ipv4/tcp_output.c | 278 ++- kernel/net/ipv4/tcp_recovery.c | 109 + kernel/net/ipv4/tcp_scalable.c | 2 +- kernel/net/ipv4/tcp_timer.c | 25 +- kernel/net/ipv4/tcp_vegas.c | 6 +- kernel/net/ipv4/tcp_veno.c | 2 +- kernel/net/ipv4/tcp_yeah.c | 2 +- kernel/net/ipv4/udp.c | 21 +- kernel/net/ipv4/udp_diag.c | 2 + kernel/net/ipv4/udp_tunnel.c | 33 +- kernel/net/ipv4/xfrm4_input.c | 7 +- kernel/net/ipv4/xfrm4_output.c | 13 +- kernel/net/ipv4/xfrm4_policy.c | 113 +- kernel/net/ipv6/Kconfig | 30 +- kernel/net/ipv6/Makefile | 2 + kernel/net/ipv6/addrconf.c | 512 ++-- kernel/net/ipv6/addrconf_core.c | 11 +- kernel/net/ipv6/addrlabel.c | 2 +- kernel/net/ipv6/af_inet6.c | 34 +- kernel/net/ipv6/ah6.c | 4 +- kernel/net/ipv6/datagram.c | 19 +- 
kernel/net/ipv6/esp6.c | 201 +- kernel/net/ipv6/exthdrs.c | 5 +- kernel/net/ipv6/fib6_rules.c | 25 +- kernel/net/ipv6/icmp.c | 44 +- kernel/net/ipv6/ila.c | 229 ++ kernel/net/ipv6/inet6_connection_sock.c | 98 +- kernel/net/ipv6/inet6_hashtables.c | 19 +- kernel/net/ipv6/ip6_fib.c | 65 +- kernel/net/ipv6/ip6_flowlabel.c | 9 +- kernel/net/ipv6/ip6_gre.c | 106 +- kernel/net/ipv6/ip6_input.c | 20 +- kernel/net/ipv6/ip6_offload.c | 12 + kernel/net/ipv6/ip6_output.c | 268 +- kernel/net/ipv6/ip6_tunnel.c | 149 +- kernel/net/ipv6/ip6_udp_tunnel.c | 13 +- kernel/net/ipv6/ip6_vti.c | 2 +- kernel/net/ipv6/ip6mr.c | 32 +- kernel/net/ipv6/ipv6_sockglue.c | 33 +- kernel/net/ipv6/mcast.c | 11 +- kernel/net/ipv6/mcast_snoop.c | 216 ++ kernel/net/ipv6/mip6.c | 16 +- kernel/net/ipv6/ndisc.c | 94 +- kernel/net/ipv6/netfilter.c | 9 +- kernel/net/ipv6/netfilter/Kconfig | 17 +- kernel/net/ipv6/netfilter/Makefile | 3 + kernel/net/ipv6/netfilter/ip6_tables.c | 162 +- kernel/net/ipv6/netfilter/ip6t_REJECT.c | 11 +- kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c | 31 +- kernel/net/ipv6/netfilter/ip6t_rpfilter.c | 6 +- kernel/net/ipv6/netfilter/ip6table_filter.c | 6 +- kernel/net/ipv6/netfilter/ip6table_mangle.c | 18 +- kernel/net/ipv6/netfilter/ip6table_nat.c | 26 +- kernel/net/ipv6/netfilter/ip6table_raw.c | 6 +- kernel/net/ipv6/netfilter/ip6table_security.c | 7 +- .../net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 20 +- .../net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 12 +- kernel/net/ipv6/netfilter/nf_conntrack_reasm.c | 33 +- kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 32 +- kernel/net/ipv6/netfilter/nf_dup_ipv6.c | 82 + kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 48 +- kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 2 +- kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c | 2 +- kernel/net/ipv6/netfilter/nf_reject_ipv6.c | 6 +- kernel/net/ipv6/netfilter/nf_tables_ipv6.c | 10 +- kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c | 22 +- 
kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c | 14 +- kernel/net/ipv6/netfilter/nft_dup_ipv6.c | 108 + kernel/net/ipv6/netfilter/nft_redir_ipv6.c | 3 +- kernel/net/ipv6/netfilter/nft_reject_ipv6.c | 7 +- kernel/net/ipv6/output_core.c | 38 +- kernel/net/ipv6/raw.c | 31 +- kernel/net/ipv6/reassembly.c | 30 +- kernel/net/ipv6/route.c | 932 ++++--- kernel/net/ipv6/sit.c | 28 +- kernel/net/ipv6/syncookies.c | 34 +- kernel/net/ipv6/sysctl_net_ipv6.c | 23 +- kernel/net/ipv6/tcp_ipv6.c | 291 ++- kernel/net/ipv6/tunnel6.c | 12 +- kernel/net/ipv6/udp.c | 22 +- kernel/net/ipv6/xfrm6_input.c | 4 +- kernel/net/ipv6/xfrm6_mode_tunnel.c | 5 +- kernel/net/ipv6/xfrm6_output.c | 40 +- kernel/net/ipv6/xfrm6_policy.c | 92 +- kernel/net/ipx/af_ipx.c | 2 +- kernel/net/irda/af_irda.c | 8 +- kernel/net/irda/ircomm/ircomm_tty.c | 31 +- kernel/net/irda/irlmp.c | 2 +- kernel/net/irda/timer.c | 4 +- kernel/net/iucv/af_iucv.c | 24 +- kernel/net/iucv/iucv.c | 12 +- kernel/net/key/af_key.c | 51 +- kernel/net/l2tp/l2tp_core.c | 26 +- kernel/net/l2tp/l2tp_core.h | 3 + kernel/net/l2tp/l2tp_eth.c | 1 + kernel/net/l2tp/l2tp_ip.c | 1 + kernel/net/l2tp/l2tp_ip6.c | 9 +- kernel/net/l2tp/l2tp_netlink.c | 25 +- kernel/net/l2tp/l2tp_ppp.c | 5 +- kernel/net/l3mdev/Kconfig | 10 + kernel/net/l3mdev/Makefile | 5 + kernel/net/l3mdev/l3mdev.c | 92 + kernel/net/llc/af_llc.c | 6 +- kernel/net/llc/llc_conn.c | 6 +- kernel/net/mac80211/Kconfig | 17 +- kernel/net/mac80211/Makefile | 2 +- kernel/net/mac80211/aes_ccm.c | 33 +- kernel/net/mac80211/aes_cmac.c | 17 - kernel/net/mac80211/aes_gcm.c | 33 +- kernel/net/mac80211/aes_gmac.c | 14 +- kernel/net/mac80211/agg-rx.c | 10 +- kernel/net/mac80211/agg-tx.c | 22 +- kernel/net/mac80211/cfg.c | 489 ++-- kernel/net/mac80211/cfg.h | 9 - kernel/net/mac80211/chan.c | 41 +- kernel/net/mac80211/debugfs.c | 179 +- kernel/net/mac80211/debugfs_key.c | 70 +- kernel/net/mac80211/debugfs_netdev.c | 75 +- kernel/net/mac80211/debugfs_sta.c | 95 +- kernel/net/mac80211/driver-ops.c | 
309 +++ kernel/net/mac80211/driver-ops.h | 341 +-- kernel/net/mac80211/ethtool.c | 32 +- kernel/net/mac80211/event.c | 27 - kernel/net/mac80211/ibss.c | 34 +- kernel/net/mac80211/ieee80211_i.h | 120 +- kernel/net/mac80211/iface.c | 134 +- kernel/net/mac80211/key.c | 97 +- kernel/net/mac80211/key.h | 10 +- kernel/net/mac80211/led.c | 268 +- kernel/net/mac80211/led.h | 44 +- kernel/net/mac80211/main.c | 64 +- kernel/net/mac80211/mesh.c | 98 +- kernel/net/mac80211/mesh.h | 14 +- kernel/net/mac80211/mesh_hwmp.c | 117 +- kernel/net/mac80211/mesh_pathtbl.c | 8 +- kernel/net/mac80211/mesh_plink.c | 345 +-- kernel/net/mac80211/mesh_ps.c | 42 +- kernel/net/mac80211/mesh_sync.c | 16 +- kernel/net/mac80211/mlme.c | 704 +++--- kernel/net/mac80211/ocb.c | 4 +- kernel/net/mac80211/offchannel.c | 8 +- kernel/net/mac80211/pm.c | 32 +- kernel/net/mac80211/rate.c | 333 ++- kernel/net/mac80211/rate.h | 66 +- kernel/net/mac80211/rc80211_minstrel.c | 2 +- kernel/net/mac80211/rc80211_minstrel_debugfs.c | 12 +- kernel/net/mac80211/rc80211_minstrel_ht.c | 15 +- kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c | 12 +- kernel/net/mac80211/rx.c | 435 ++-- kernel/net/mac80211/scan.c | 71 +- kernel/net/mac80211/sta_info.c | 175 +- kernel/net/mac80211/sta_info.h | 251 +- kernel/net/mac80211/status.c | 326 +-- kernel/net/mac80211/tdls.c | 379 ++- kernel/net/mac80211/trace.h | 94 +- kernel/net/mac80211/tx.c | 620 ++++- kernel/net/mac80211/util.c | 299 +-- kernel/net/mac80211/vht.c | 59 +- kernel/net/mac80211/wpa.c | 102 +- kernel/net/mac802154/Kconfig | 1 + kernel/net/mac802154/Makefile | 4 +- kernel/net/mac802154/cfg.c | 367 ++- kernel/net/mac802154/driver-ops.h | 96 +- kernel/net/mac802154/ieee802154_i.h | 20 +- kernel/net/mac802154/iface.c | 296 ++- kernel/net/mac802154/llsec.c | 65 +- kernel/net/mac802154/mac_cmd.c | 42 +- kernel/net/mac802154/main.c | 39 +- kernel/net/mac802154/mib.c | 63 +- kernel/net/mac802154/rx.c | 31 +- kernel/net/mac802154/trace.c | 9 + kernel/net/mac802154/trace.h | 
272 ++ kernel/net/mac802154/tx.c | 34 +- kernel/net/mac802154/util.c | 13 +- kernel/net/mpls/Kconfig | 8 +- kernel/net/mpls/Makefile | 1 + kernel/net/mpls/af_mpls.c | 796 ++++-- kernel/net/mpls/internal.h | 81 +- kernel/net/mpls/mpls_gso.c | 2 + kernel/net/mpls/mpls_iptunnel.c | 231 ++ kernel/net/netfilter/Kconfig | 52 +- kernel/net/netfilter/Makefile | 3 +- kernel/net/netfilter/core.c | 233 +- kernel/net/netfilter/ipset/ip_set_bitmap_gen.h | 61 +- kernel/net/netfilter/ipset/ip_set_bitmap_ip.c | 58 +- kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c | 119 +- kernel/net/netfilter/ipset/ip_set_bitmap_port.c | 45 +- kernel/net/netfilter/ipset/ip_set_core.c | 404 +-- kernel/net/netfilter/ipset/ip_set_getport.c | 19 +- kernel/net/netfilter/ipset/ip_set_hash_gen.h | 756 +++--- kernel/net/netfilter/ipset/ip_set_hash_ip.c | 72 +- kernel/net/netfilter/ipset/ip_set_hash_ipmark.c | 87 +- kernel/net/netfilter/ipset/ip_set_hash_ipport.c | 98 +- kernel/net/netfilter/ipset/ip_set_hash_ipportip.c | 91 +- kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c | 96 +- kernel/net/netfilter/ipset/ip_set_hash_mac.c | 30 +- kernel/net/netfilter/ipset/ip_set_hash_net.c | 73 +- kernel/net/netfilter/ipset/ip_set_hash_netiface.c | 250 +- kernel/net/netfilter/ipset/ip_set_hash_netnet.c | 158 +- kernel/net/netfilter/ipset/ip_set_hash_netport.c | 86 +- .../net/netfilter/ipset/ip_set_hash_netportnet.c | 188 +- kernel/net/netfilter/ipset/ip_set_list_set.c | 427 ++-- kernel/net/netfilter/ipset/pfxlen.c | 16 +- kernel/net/netfilter/ipvs/Kconfig | 11 + kernel/net/netfilter/ipvs/Makefile | 1 + kernel/net/netfilter/ipvs/ip_vs_app.c | 36 +- kernel/net/netfilter/ipvs/ip_vs_conn.c | 91 +- kernel/net/netfilter/ipvs/ip_vs_core.c | 566 +++-- kernel/net/netfilter/ipvs/ip_vs_ctl.c | 502 ++-- kernel/net/netfilter/ipvs/ip_vs_est.c | 20 +- kernel/net/netfilter/ipvs/ip_vs_ftp.c | 27 +- kernel/net/netfilter/ipvs/ip_vs_lblc.c | 3 +- kernel/net/netfilter/ipvs/ip_vs_lblcr.c | 3 +- kernel/net/netfilter/ipvs/ip_vs_nfct.c 
| 5 +- kernel/net/netfilter/ipvs/ip_vs_ovf.c | 86 + kernel/net/netfilter/ipvs/ip_vs_pe_sip.c | 2 +- kernel/net/netfilter/ipvs/ip_vs_proto.c | 33 +- kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 32 +- kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c | 58 +- kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c | 61 +- kernel/net/netfilter/ipvs/ip_vs_proto_udp.c | 49 +- kernel/net/netfilter/ipvs/ip_vs_sched.c | 14 +- kernel/net/netfilter/ipvs/ip_vs_sh.c | 45 +- kernel/net/netfilter/ipvs/ip_vs_sync.c | 374 +-- kernel/net/netfilter/ipvs/ip_vs_xmit.c | 145 +- kernel/net/netfilter/nf_conntrack_core.c | 197 +- kernel/net/netfilter/nf_conntrack_expect.c | 22 +- kernel/net/netfilter/nf_conntrack_h323_main.c | 4 +- kernel/net/netfilter/nf_conntrack_labels.c | 34 +- kernel/net/netfilter/nf_conntrack_netlink.c | 331 ++- kernel/net/netfilter/nf_conntrack_pptp.c | 3 +- kernel/net/netfilter/nf_conntrack_proto_dccp.c | 2 +- kernel/net/netfilter/nf_conntrack_proto_generic.c | 10 +- kernel/net/netfilter/nf_conntrack_proto_gre.c | 3 +- kernel/net/netfilter/nf_conntrack_proto_sctp.c | 103 +- kernel/net/netfilter/nf_conntrack_proto_tcp.c | 2 +- kernel/net/netfilter/nf_conntrack_proto_udp.c | 1 + kernel/net/netfilter/nf_conntrack_proto_udplite.c | 1 + kernel/net/netfilter/nf_conntrack_seqadj.c | 9 +- kernel/net/netfilter/nf_conntrack_standalone.c | 39 +- kernel/net/netfilter/nf_internals.h | 1 + kernel/net/netfilter/nf_log.c | 9 +- kernel/net/netfilter/nf_nat_core.c | 28 +- kernel/net/netfilter/nf_nat_proto_dccp.c | 2 +- kernel/net/netfilter/nf_nat_proto_tcp.c | 2 +- kernel/net/netfilter/nf_nat_proto_udp.c | 2 +- kernel/net/netfilter/nf_nat_proto_udplite.c | 2 +- kernel/net/netfilter/nf_nat_redirect.c | 2 +- kernel/net/netfilter/nf_queue.c | 55 +- kernel/net/netfilter/nf_synproxy_core.c | 24 +- kernel/net/netfilter/nf_tables_api.c | 221 +- kernel/net/netfilter/nf_tables_core.c | 10 +- kernel/net/netfilter/nf_tables_netdev.c | 256 ++ kernel/net/netfilter/nfnetlink.c | 54 +- 
kernel/net/netfilter/nfnetlink_acct.c | 71 +- kernel/net/netfilter/nfnetlink_cttimeout.c | 34 + kernel/net/netfilter/nfnetlink_log.c | 91 +- kernel/net/netfilter/nfnetlink_queue.c | 1444 +++++++++++ kernel/net/netfilter/nfnetlink_queue_core.c | 1362 ---------- kernel/net/netfilter/nfnetlink_queue_ct.c | 113 - kernel/net/netfilter/nft_compat.c | 26 +- kernel/net/netfilter/nft_counter.c | 124 +- kernel/net/netfilter/nft_ct.c | 1 + kernel/net/netfilter/nft_dynset.c | 5 +- kernel/net/netfilter/nft_limit.c | 188 +- kernel/net/netfilter/nft_log.c | 3 +- kernel/net/netfilter/nft_meta.c | 44 +- kernel/net/netfilter/nft_payload.c | 57 +- kernel/net/netfilter/nft_queue.c | 2 +- kernel/net/netfilter/nft_reject_inet.c | 19 +- kernel/net/netfilter/x_tables.c | 85 +- kernel/net/netfilter/xt_CT.c | 46 +- kernel/net/netfilter/xt_IDLETIMER.c | 1 + kernel/net/netfilter/xt_LOG.c | 2 +- kernel/net/netfilter/xt_NFLOG.c | 2 +- kernel/net/netfilter/xt_TCPMSS.c | 16 +- kernel/net/netfilter/xt_TCPOPTSTRIP.c | 2 +- kernel/net/netfilter/xt_TEE.c | 168 +- kernel/net/netfilter/xt_TPROXY.c | 30 +- kernel/net/netfilter/xt_addrtype.c | 6 +- kernel/net/netfilter/xt_connlabel.c | 16 +- kernel/net/netfilter/xt_connlimit.c | 13 +- kernel/net/netfilter/xt_ipvs.c | 5 +- kernel/net/netfilter/xt_mark.c | 1 + kernel/net/netfilter/xt_nfacct.c | 2 +- kernel/net/netfilter/xt_osf.c | 2 +- kernel/net/netfilter/xt_owner.c | 6 +- kernel/net/netfilter/xt_recent.c | 2 +- kernel/net/netfilter/xt_set.c | 47 +- kernel/net/netfilter/xt_socket.c | 73 +- kernel/net/netlink/af_netlink.c | 303 ++- kernel/net/netlink/genetlink.c | 14 +- kernel/net/netrom/af_netrom.c | 4 +- kernel/net/netrom/nr_route.c | 1 - kernel/net/nfc/af_nfc.c | 2 +- kernel/net/nfc/core.c | 4 +- kernel/net/nfc/digital_core.c | 3 +- kernel/net/nfc/hci/core.c | 3 +- kernel/net/nfc/hci/llc.c | 2 + kernel/net/nfc/llcp.h | 2 +- kernel/net/nfc/llcp_core.c | 2 +- kernel/net/nfc/llcp_sock.c | 10 +- kernel/net/nfc/nci/Kconfig | 9 +- kernel/net/nfc/nci/Makefile 
| 6 +- kernel/net/nfc/nci/core.c | 217 +- kernel/net/nfc/nci/data.c | 13 + kernel/net/nfc/nci/hci.c | 176 +- kernel/net/nfc/nci/ntf.c | 11 + kernel/net/nfc/nci/rsp.c | 11 + kernel/net/nfc/nci/spi.c | 11 +- kernel/net/nfc/nci/uart.c | 494 ++++ kernel/net/nfc/netlink.c | 146 +- kernel/net/nfc/nfc.h | 7 +- kernel/net/nfc/rawsock.c | 7 +- kernel/net/openvswitch/Kconfig | 4 +- kernel/net/openvswitch/Makefile | 6 +- kernel/net/openvswitch/actions.c | 306 ++- kernel/net/openvswitch/conntrack.c | 790 ++++++ kernel/net/openvswitch/conntrack.h | 93 + kernel/net/openvswitch/datapath.c | 140 +- kernel/net/openvswitch/datapath.h | 23 +- kernel/net/openvswitch/dp_notify.c | 7 +- kernel/net/openvswitch/flow.c | 45 +- kernel/net/openvswitch/flow.h | 91 +- kernel/net/openvswitch/flow_netlink.c | 458 +++- kernel/net/openvswitch/flow_netlink.h | 19 +- kernel/net/openvswitch/flow_table.c | 11 +- kernel/net/openvswitch/vport-geneve.c | 183 +- kernel/net/openvswitch/vport-gre.c | 248 +- kernel/net/openvswitch/vport-internal_dev.c | 145 +- kernel/net/openvswitch/vport-netdev.c | 154 +- kernel/net/openvswitch/vport-netdev.h | 15 +- kernel/net/openvswitch/vport-vxlan.c | 238 +- kernel/net/openvswitch/vport-vxlan.h | 11 - kernel/net/openvswitch/vport.c | 224 +- kernel/net/openvswitch/vport.h | 80 +- kernel/net/packet/af_packet.c | 562 +++-- kernel/net/packet/internal.h | 18 +- kernel/net/phonet/af_phonet.c | 6 +- kernel/net/phonet/pep.c | 2 +- kernel/net/rds/af_rds.c | 77 +- kernel/net/rds/bind.c | 129 +- kernel/net/rds/connection.c | 47 +- kernel/net/rds/ib.c | 67 +- kernel/net/rds/ib.h | 113 +- kernel/net/rds/ib_cm.c | 185 +- kernel/net/rds/ib_rdma.c | 171 +- kernel/net/rds/ib_recv.c | 198 +- kernel/net/rds/ib_send.c | 245 +- kernel/net/rds/ib_stats.c | 22 +- kernel/net/rds/iw.c | 14 +- kernel/net/rds/iw.h | 9 +- kernel/net/rds/iw_cm.c | 14 +- kernel/net/rds/iw_rdma.c | 140 +- kernel/net/rds/iw_send.c | 177 +- kernel/net/rds/rdma.c | 9 +- kernel/net/rds/rdma_transport.c | 53 +- 
kernel/net/rds/rds.h | 43 +- kernel/net/rds/send.c | 80 +- kernel/net/rds/tcp.c | 181 +- kernel/net/rds/tcp.h | 7 +- kernel/net/rds/tcp_connect.c | 9 +- kernel/net/rds/tcp_listen.c | 65 +- kernel/net/rds/tcp_recv.c | 11 +- kernel/net/rds/tcp_send.c | 8 +- kernel/net/rds/threads.c | 2 + kernel/net/rds/transport.c | 27 +- kernel/net/rfkill/Kconfig | 3 +- kernel/net/rfkill/core.c | 44 +- kernel/net/rfkill/rfkill-gpio.c | 25 +- kernel/net/rose/af_rose.c | 7 +- kernel/net/rose/rose_link.c | 1 - kernel/net/rose/rose_route.c | 1 - kernel/net/rxrpc/af_rxrpc.c | 4 +- kernel/net/rxrpc/ar-ack.c | 4 +- kernel/net/rxrpc/ar-connection.c | 6 +- kernel/net/rxrpc/ar-internal.h | 4 +- kernel/net/rxrpc/ar-key.c | 32 +- kernel/net/rxrpc/ar-local.c | 4 +- kernel/net/rxrpc/ar-output.c | 4 +- kernel/net/rxrpc/ar-security.c | 4 +- kernel/net/rxrpc/ar-transport.c | 4 +- kernel/net/rxrpc/rxkad.c | 16 +- kernel/net/sched/Kconfig | 11 + kernel/net/sched/Makefile | 1 + kernel/net/sched/act_api.c | 52 +- kernel/net/sched/act_bpf.c | 93 +- kernel/net/sched/act_connmark.c | 12 +- kernel/net/sched/act_csum.c | 3 +- kernel/net/sched/act_gact.c | 44 +- kernel/net/sched/act_ipt.c | 3 +- kernel/net/sched/act_mirred.c | 80 +- kernel/net/sched/act_nat.c | 10 +- kernel/net/sched/act_pedit.c | 13 +- kernel/net/sched/act_simple.c | 3 +- kernel/net/sched/act_skbedit.c | 3 +- kernel/net/sched/act_vlan.c | 3 +- kernel/net/sched/cls_bpf.c | 98 +- kernel/net/sched/cls_cgroup.c | 23 +- kernel/net/sched/cls_flow.c | 43 +- kernel/net/sched/cls_flower.c | 697 ++++++ kernel/net/sched/cls_rsvp.h | 18 +- kernel/net/sched/cls_tcindex.c | 29 +- kernel/net/sched/em_ipset.c | 5 +- kernel/net/sched/em_meta.c | 138 +- kernel/net/sched/sch_api.c | 90 +- kernel/net/sched/sch_atm.c | 2 +- kernel/net/sched/sch_blackhole.c | 15 +- kernel/net/sched/sch_cbq.c | 2 +- kernel/net/sched/sch_choke.c | 94 +- kernel/net/sched/sch_codel.c | 15 +- kernel/net/sched/sch_drr.c | 2 +- kernel/net/sched/sch_dsmark.c | 65 +- 
kernel/net/sched/sch_fifo.c | 2 +- kernel/net/sched/sch_fq.c | 13 +- kernel/net/sched/sch_fq_codel.c | 61 +- kernel/net/sched/sch_generic.c | 60 +- kernel/net/sched/sch_gred.c | 26 +- kernel/net/sched/sch_hfsc.c | 2 +- kernel/net/sched/sch_hhf.c | 30 +- kernel/net/sched/sch_htb.c | 8 +- kernel/net/sched/sch_ingress.c | 59 +- kernel/net/sched/sch_mq.c | 4 +- kernel/net/sched/sch_mqprio.c | 4 +- kernel/net/sched/sch_multiq.c | 2 +- kernel/net/sched/sch_netem.c | 4 +- kernel/net/sched/sch_plug.c | 9 +- kernel/net/sched/sch_prio.c | 2 +- kernel/net/sched/sch_qfq.c | 6 +- kernel/net/sched/sch_sfb.c | 28 +- kernel/net/sched/sch_sfq.c | 31 +- kernel/net/sctp/associola.c | 22 +- kernel/net/sctp/auth.c | 4 +- kernel/net/sctp/ipv6.c | 31 +- kernel/net/sctp/outqueue.c | 2 + kernel/net/sctp/protocol.c | 93 +- kernel/net/sctp/sm_make_chunk.c | 28 +- kernel/net/sctp/sm_sideeffect.c | 48 +- kernel/net/sctp/sm_statefuns.c | 13 +- kernel/net/sctp/socket.c | 90 +- kernel/net/sctp/sysctl.c | 2 +- kernel/net/sctp/transport.c | 2 +- kernel/net/socket.c | 36 +- kernel/net/sunrpc/Kconfig | 28 +- kernel/net/sunrpc/Makefile | 5 +- kernel/net/sunrpc/auth.c | 2 +- kernel/net/sunrpc/auth_gss/auth_gss.c | 13 +- kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c | 8 +- kernel/net/sunrpc/auth_unix.c | 2 +- kernel/net/sunrpc/backchannel_rqst.c | 156 +- kernel/net/sunrpc/bc_svc.c | 63 - kernel/net/sunrpc/cache.c | 158 +- kernel/net/sunrpc/clnt.c | 114 +- kernel/net/sunrpc/debugfs.c | 78 + kernel/net/sunrpc/sched.c | 18 +- kernel/net/sunrpc/svc.c | 169 +- kernel/net/sunrpc/svc_xprt.c | 10 +- kernel/net/sunrpc/svcsock.c | 40 +- kernel/net/sunrpc/sysctl.c | 23 +- kernel/net/sunrpc/xprt.c | 7 +- kernel/net/sunrpc/xprtrdma/Makefile | 15 +- kernel/net/sunrpc/xprtrdma/backchannel.c | 394 +++ kernel/net/sunrpc/xprtrdma/fmr_ops.c | 120 +- kernel/net/sunrpc/xprtrdma/frwr_ops.c | 343 ++- kernel/net/sunrpc/xprtrdma/module.c | 46 + kernel/net/sunrpc/xprtrdma/physical_ops.c | 31 +- 
kernel/net/sunrpc/xprtrdma/rpc_rdma.c | 353 +-- kernel/net/sunrpc/xprtrdma/svc_rdma.c | 14 +- kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c | 140 +- kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 141 +- kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c | 117 +- kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c | 256 +- kernel/net/sunrpc/xprtrdma/transport.c | 149 +- kernel/net/sunrpc/xprtrdma/verbs.c | 1029 +++----- kernel/net/sunrpc/xprtrdma/xprt_rdma.h | 129 +- kernel/net/sunrpc/xprtsock.c | 501 ++-- kernel/net/switchdev/switchdev.c | 1290 ++++++++-- kernel/net/sysctl_net.c | 6 +- kernel/net/tipc/addr.c | 7 - kernel/net/tipc/addr.h | 8 + kernel/net/tipc/bcast.c | 963 ++------ kernel/net/tipc/bcast.h | 120 +- kernel/net/tipc/bearer.c | 142 +- kernel/net/tipc/bearer.h | 14 +- kernel/net/tipc/core.c | 13 +- kernel/net/tipc/core.h | 55 +- kernel/net/tipc/discover.c | 150 +- kernel/net/tipc/link.c | 2595 +++++++++----------- kernel/net/tipc/link.h | 234 +- kernel/net/tipc/msg.c | 201 +- kernel/net/tipc/msg.h | 133 +- kernel/net/tipc/name_distr.c | 6 +- kernel/net/tipc/name_table.c | 34 +- kernel/net/tipc/net.c | 7 +- kernel/net/tipc/netlink_compat.c | 139 +- kernel/net/tipc/node.c | 1131 +++++++-- kernel/net/tipc/node.h | 127 +- kernel/net/tipc/server.c | 6 +- kernel/net/tipc/socket.c | 409 +-- kernel/net/tipc/socket.h | 2 +- kernel/net/tipc/subscr.c | 251 +- kernel/net/tipc/subscr.h | 18 +- kernel/net/tipc/udp_media.c | 29 +- kernel/net/unix/af_unix.c | 678 ++++- kernel/net/unix/diag.c | 2 +- kernel/net/unix/garbage.c | 17 +- kernel/net/vmw_vsock/af_vsock.c | 51 +- kernel/net/vmw_vsock/vmci_transport.c | 181 +- kernel/net/vmw_vsock/vmci_transport.h | 4 +- kernel/net/wimax/op-rfkill.c | 3 +- kernel/net/wireless/Kconfig | 10 + kernel/net/wireless/chan.c | 100 +- kernel/net/wireless/core.c | 13 +- kernel/net/wireless/core.h | 7 + kernel/net/wireless/mlme.c | 75 +- kernel/net/wireless/nl80211.c | 566 ++++- kernel/net/wireless/rdev-ops.h | 2 + kernel/net/wireless/reg.c | 379 
+-- kernel/net/wireless/scan.c | 61 +- kernel/net/wireless/sme.c | 4 +- kernel/net/wireless/sysfs.c | 14 +- kernel/net/wireless/trace.h | 33 +- kernel/net/wireless/util.c | 3 +- kernel/net/wireless/wext-core.c | 52 +- kernel/net/x25/af_x25.c | 8 +- kernel/net/xfrm/xfrm_algo.c | 28 + kernel/net/xfrm/xfrm_input.c | 16 +- kernel/net/xfrm/xfrm_output.c | 34 +- kernel/net/xfrm/xfrm_policy.c | 173 +- kernel/net/xfrm/xfrm_state.c | 4 +- kernel/net/xfrm/xfrm_user.c | 57 +- 850 files changed, 51807 insertions(+), 29448 deletions(-) create mode 100644 kernel/net/6lowpan/core.c delete mode 100644 kernel/net/bridge/br_netfilter.c create mode 100644 kernel/net/bridge/br_netfilter_hooks.c create mode 100644 kernel/net/bridge/br_netfilter_ipv6.c create mode 100644 kernel/net/core/lwtunnel.c delete mode 100644 kernel/net/ipv4/geneve.c create mode 100644 kernel/net/ipv4/netfilter/nf_dup_ipv4.c create mode 100644 kernel/net/ipv4/netfilter/nft_dup_ipv4.c create mode 100644 kernel/net/ipv4/tcp_cdg.c create mode 100644 kernel/net/ipv4/tcp_recovery.c create mode 100644 kernel/net/ipv6/ila.c create mode 100644 kernel/net/ipv6/mcast_snoop.c create mode 100644 kernel/net/ipv6/netfilter/nf_dup_ipv6.c create mode 100644 kernel/net/ipv6/netfilter/nft_dup_ipv6.c create mode 100644 kernel/net/l3mdev/Kconfig create mode 100644 kernel/net/l3mdev/Makefile create mode 100644 kernel/net/l3mdev/l3mdev.c delete mode 100644 kernel/net/mac80211/cfg.h create mode 100644 kernel/net/mac80211/driver-ops.c delete mode 100644 kernel/net/mac80211/event.c create mode 100644 kernel/net/mac802154/trace.c create mode 100644 kernel/net/mac802154/trace.h create mode 100644 kernel/net/mpls/mpls_iptunnel.c create mode 100644 kernel/net/netfilter/ipvs/ip_vs_ovf.c create mode 100644 kernel/net/netfilter/nf_tables_netdev.c create mode 100644 kernel/net/netfilter/nfnetlink_queue.c delete mode 100644 kernel/net/netfilter/nfnetlink_queue_core.c delete mode 100644 kernel/net/netfilter/nfnetlink_queue_ct.c create mode 100644 
kernel/net/nfc/nci/uart.c create mode 100644 kernel/net/openvswitch/conntrack.c create mode 100644 kernel/net/openvswitch/conntrack.h delete mode 100644 kernel/net/openvswitch/vport-vxlan.h create mode 100644 kernel/net/sched/cls_flower.c delete mode 100644 kernel/net/sunrpc/bc_svc.c create mode 100644 kernel/net/sunrpc/xprtrdma/backchannel.c create mode 100644 kernel/net/sunrpc/xprtrdma/module.c (limited to 'kernel/net') diff --git a/kernel/net/6lowpan/Makefile b/kernel/net/6lowpan/Makefile index eb8baa72a..c6ffc55ee 100644 --- a/kernel/net/6lowpan/Makefile +++ b/kernel/net/6lowpan/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_6LOWPAN) += 6lowpan.o -6lowpan-y := iphc.o nhc.o +6lowpan-y := core.o iphc.o nhc.o #rfc6282 nhcs obj-$(CONFIG_6LOWPAN_NHC_DEST) += nhc_dest.o diff --git a/kernel/net/6lowpan/core.c b/kernel/net/6lowpan/core.c new file mode 100644 index 000000000..83b19e072 --- /dev/null +++ b/kernel/net/6lowpan/core.c @@ -0,0 +1,45 @@ +/* This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Authors: + * (C) 2015 Pengutronix, Alexander Aring + */ + +#include + +#include + +void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype) +{ + dev->addr_len = EUI64_ADDR_LEN; + dev->type = ARPHRD_6LOWPAN; + dev->mtu = IPV6_MIN_MTU; + dev->priv_flags |= IFF_NO_QUEUE; + + lowpan_priv(dev)->lltype = lltype; +} +EXPORT_SYMBOL(lowpan_netdev_setup); + +static int __init lowpan_module_init(void) +{ + request_module_nowait("ipv6"); + + request_module_nowait("nhc_dest"); + request_module_nowait("nhc_fragment"); + request_module_nowait("nhc_hop"); + request_module_nowait("nhc_ipv6"); + request_module_nowait("nhc_mobility"); + request_module_nowait("nhc_routing"); + request_module_nowait("nhc_udp"); + + return 0; +} +module_init(lowpan_module_init); + +MODULE_LICENSE("GPL"); diff --git a/kernel/net/6lowpan/iphc.c b/kernel/net/6lowpan/iphc.c index 94a375c04..346b5c1a9 100644 --- a/kernel/net/6lowpan/iphc.c +++ b/kernel/net/6lowpan/iphc.c @@ -48,38 +48,179 @@ #include #include -#include #include + #include #include -#include + +/* special link-layer handling */ +#include #include "nhc.h" +/* Values of fields within the IPHC encoding first byte */ +#define LOWPAN_IPHC_TF_MASK 0x18 +#define LOWPAN_IPHC_TF_00 0x00 +#define LOWPAN_IPHC_TF_01 0x08 +#define LOWPAN_IPHC_TF_10 0x10 +#define LOWPAN_IPHC_TF_11 0x18 + +#define LOWPAN_IPHC_NH 0x04 + +#define LOWPAN_IPHC_HLIM_MASK 0x03 +#define LOWPAN_IPHC_HLIM_00 0x00 +#define LOWPAN_IPHC_HLIM_01 0x01 +#define LOWPAN_IPHC_HLIM_10 0x02 +#define LOWPAN_IPHC_HLIM_11 0x03 + +/* Values of fields within the IPHC encoding second byte */ +#define LOWPAN_IPHC_CID 0x80 + +#define LOWPAN_IPHC_SAC 0x40 + +#define LOWPAN_IPHC_SAM_MASK 0x30 +#define LOWPAN_IPHC_SAM_00 0x00 +#define LOWPAN_IPHC_SAM_01 0x10 +#define LOWPAN_IPHC_SAM_10 0x20 +#define LOWPAN_IPHC_SAM_11 0x30 + +#define LOWPAN_IPHC_M 0x08 + +#define LOWPAN_IPHC_DAC 0x04 + +#define LOWPAN_IPHC_DAM_MASK 0x03 +#define LOWPAN_IPHC_DAM_00 0x00 +#define 
LOWPAN_IPHC_DAM_01 0x01 +#define LOWPAN_IPHC_DAM_10 0x02 +#define LOWPAN_IPHC_DAM_11 0x03 + +/* ipv6 address based on mac + * second bit-flip (Universe/Local) is done according RFC2464 + */ +#define is_addr_mac_addr_based(a, m) \ + ((((a)->s6_addr[8]) == (((m)[0]) ^ 0x02)) && \ + (((a)->s6_addr[9]) == (m)[1]) && \ + (((a)->s6_addr[10]) == (m)[2]) && \ + (((a)->s6_addr[11]) == (m)[3]) && \ + (((a)->s6_addr[12]) == (m)[4]) && \ + (((a)->s6_addr[13]) == (m)[5]) && \ + (((a)->s6_addr[14]) == (m)[6]) && \ + (((a)->s6_addr[15]) == (m)[7])) + +/* check whether we can compress the IID to 16 bits, + * it's possible for unicast addresses with first 49 bits are zero only. + */ +#define lowpan_is_iid_16_bit_compressable(a) \ + ((((a)->s6_addr16[4]) == 0) && \ + (((a)->s6_addr[10]) == 0) && \ + (((a)->s6_addr[11]) == 0xff) && \ + (((a)->s6_addr[12]) == 0xfe) && \ + (((a)->s6_addr[13]) == 0)) + +/* check whether the 112-bit gid of the multicast address is mappable to: */ + +/* 48 bits, FFXX::00XX:XXXX:XXXX */ +#define lowpan_is_mcast_addr_compressable48(a) \ + ((((a)->s6_addr16[1]) == 0) && \ + (((a)->s6_addr16[2]) == 0) && \ + (((a)->s6_addr16[3]) == 0) && \ + (((a)->s6_addr16[4]) == 0) && \ + (((a)->s6_addr[10]) == 0)) + +/* 32 bits, FFXX::00XX:XXXX */ +#define lowpan_is_mcast_addr_compressable32(a) \ + ((((a)->s6_addr16[1]) == 0) && \ + (((a)->s6_addr16[2]) == 0) && \ + (((a)->s6_addr16[3]) == 0) && \ + (((a)->s6_addr16[4]) == 0) && \ + (((a)->s6_addr16[5]) == 0) && \ + (((a)->s6_addr[12]) == 0)) + +/* 8 bits, FF02::00XX */ +#define lowpan_is_mcast_addr_compressable8(a) \ + ((((a)->s6_addr[1]) == 2) && \ + (((a)->s6_addr16[1]) == 0) && \ + (((a)->s6_addr16[2]) == 0) && \ + (((a)->s6_addr16[3]) == 0) && \ + (((a)->s6_addr16[4]) == 0) && \ + (((a)->s6_addr16[5]) == 0) && \ + (((a)->s6_addr16[6]) == 0) && \ + (((a)->s6_addr[14]) == 0)) + +static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr, + const void *lladdr) +{ + /* fe:80::XXXX:XXXX:XXXX:XXXX + * 
\_________________/ + * hwaddr + */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + memcpy(&ipaddr->s6_addr[8], lladdr, EUI64_ADDR_LEN); + /* second bit-flip (Universe/Local) + * is done according RFC2464 + */ + ipaddr->s6_addr[8] ^= 0x02; +} + +static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr, + const void *lladdr) +{ + const struct ieee802154_addr *addr = lladdr; + u8 eui64[EUI64_ADDR_LEN] = { }; + + switch (addr->mode) { + case IEEE802154_ADDR_LONG: + ieee802154_le64_to_be64(eui64, &addr->extended_addr); + iphc_uncompress_eui64_lladdr(ipaddr, eui64); + break; + case IEEE802154_ADDR_SHORT: + /* fe:80::ff:fe00:XXXX + * \__/ + * short_addr + * + * Universe/Local bit is zero. + */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + ipaddr->s6_addr[11] = 0xFF; + ipaddr->s6_addr[12] = 0xFE; + ieee802154_le16_to_be16(&ipaddr->s6_addr16[7], + &addr->short_addr); + break; + default: + /* should never handled and filtered by 802154 6lowpan */ + WARN_ON_ONCE(1); + break; + } +} + /* Uncompress address function for source and * destination address(non-multicast). * - * address_mode is sam value or dam value. 
+ * address_mode is the masked value for sam or dam value */ -static int uncompress_addr(struct sk_buff *skb, - struct in6_addr *ipaddr, const u8 address_mode, - const u8 *lladdr, const u8 addr_type, - const u8 addr_len) +static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev, + struct in6_addr *ipaddr, u8 address_mode, + const void *lladdr) { bool fail; switch (address_mode) { - case LOWPAN_IPHC_ADDR_00: + /* SAM and DAM are the same here */ + case LOWPAN_IPHC_DAM_00: /* for global link addresses */ fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16); break; - case LOWPAN_IPHC_ADDR_01: + case LOWPAN_IPHC_SAM_01: + case LOWPAN_IPHC_DAM_01: /* fe:80::XXXX:XXXX:XXXX:XXXX */ ipaddr->s6_addr[0] = 0xFE; ipaddr->s6_addr[1] = 0x80; fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8); break; - case LOWPAN_IPHC_ADDR_02: + case LOWPAN_IPHC_SAM_10: + case LOWPAN_IPHC_DAM_10: /* fe:80::ff:fe00:XXXX */ ipaddr->s6_addr[0] = 0xFE; ipaddr->s6_addr[1] = 0x80; @@ -87,38 +228,16 @@ static int uncompress_addr(struct sk_buff *skb, ipaddr->s6_addr[12] = 0xFE; fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2); break; - case LOWPAN_IPHC_ADDR_03: + case LOWPAN_IPHC_SAM_11: + case LOWPAN_IPHC_DAM_11: fail = false; - switch (addr_type) { - case IEEE802154_ADDR_LONG: - /* fe:80::XXXX:XXXX:XXXX:XXXX - * \_________________/ - * hwaddr - */ - ipaddr->s6_addr[0] = 0xFE; - ipaddr->s6_addr[1] = 0x80; - memcpy(&ipaddr->s6_addr[8], lladdr, addr_len); - /* second bit-flip (Universe/Local) - * is done according RFC2464 - */ - ipaddr->s6_addr[8] ^= 0x02; - break; - case IEEE802154_ADDR_SHORT: - /* fe:80::ff:fe00:XXXX - * \__/ - * short_addr - * - * Universe/Local bit is zero. 
- */ - ipaddr->s6_addr[0] = 0xFE; - ipaddr->s6_addr[1] = 0x80; - ipaddr->s6_addr[11] = 0xFF; - ipaddr->s6_addr[12] = 0xFE; - ipaddr->s6_addr16[7] = htons(*((u16 *)lladdr)); + switch (lowpan_priv(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + iphc_uncompress_802154_lladdr(ipaddr, lladdr); break; default: - pr_debug("Invalid addr_type set\n"); - return -EINVAL; + iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + break; } break; default: @@ -142,24 +261,25 @@ static int uncompress_addr(struct sk_buff *skb, */ static int uncompress_context_based_src_addr(struct sk_buff *skb, struct in6_addr *ipaddr, - const u8 sam) + u8 address_mode) { - switch (sam) { - case LOWPAN_IPHC_ADDR_00: + switch (address_mode) { + case LOWPAN_IPHC_SAM_00: /* unspec address :: * Do nothing, address is already :: */ break; - case LOWPAN_IPHC_ADDR_01: + case LOWPAN_IPHC_SAM_01: /* TODO */ - case LOWPAN_IPHC_ADDR_02: + case LOWPAN_IPHC_SAM_10: /* TODO */ - case LOWPAN_IPHC_ADDR_03: + case LOWPAN_IPHC_SAM_11: /* TODO */ - netdev_warn(skb->dev, "SAM value 0x%x not supported\n", sam); + netdev_warn(skb->dev, "SAM value 0x%x not supported\n", + address_mode); return -EINVAL; default: - pr_debug("Invalid sam value: 0x%x\n", sam); + pr_debug("Invalid sam value: 0x%x\n", address_mode); return -EINVAL; } @@ -175,11 +295,11 @@ static int uncompress_context_based_src_addr(struct sk_buff *skb, */ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb, struct in6_addr *ipaddr, - const u8 dam) + u8 address_mode) { bool fail; - switch (dam) { + switch (address_mode) { case LOWPAN_IPHC_DAM_00: /* 00: 128 bits. The full address * is carried in-line. 
@@ -211,7 +331,7 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb, fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[15], 1); break; default: - pr_debug("DAM value has a wrong value: 0x%x\n", dam); + pr_debug("DAM value has a wrong value: 0x%x\n", address_mode); return -EINVAL; } @@ -226,77 +346,142 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb, return 0; } -/* TTL uncompression values */ -static const u8 lowpan_ttl_values[] = { 0, 1, 64, 255 }; - -int -lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, - const u8 *saddr, const u8 saddr_type, - const u8 saddr_len, const u8 *daddr, - const u8 daddr_type, const u8 daddr_len, - u8 iphc0, u8 iphc1) +/* get the ecn values from iphc tf format and set it to ipv6hdr */ +static inline void lowpan_iphc_tf_set_ecn(struct ipv6hdr *hdr, const u8 *tf) { - struct ipv6hdr hdr = {}; - u8 tmp, num_context = 0; - int err; + /* get the two higher bits which is ecn */ + u8 ecn = tf[0] & 0xc0; - raw_dump_table(__func__, "raw skb data dump uncompressed", - skb->data, skb->len); + /* ECN takes 0x30 in hdr->flow_lbl[0] */ + hdr->flow_lbl[0] |= (ecn >> 2); +} - /* another if the CID flag is set */ - if (iphc1 & LOWPAN_IPHC_CID) { - pr_debug("CID flag is set, increase header with one\n"); - if (lowpan_fetch_skb(skb, &num_context, sizeof(num_context))) - return -EINVAL; - } +/* get the dscp values from iphc tf format and set it to ipv6hdr */ +static inline void lowpan_iphc_tf_set_dscp(struct ipv6hdr *hdr, const u8 *tf) +{ + /* DSCP is at place after ECN */ + u8 dscp = tf[0] & 0x3f; - hdr.version = 6; + /* The four highest bits need to be set at hdr->priority */ + hdr->priority |= ((dscp & 0x3c) >> 2); + /* The two lower bits is part of hdr->flow_lbl[0] */ + hdr->flow_lbl[0] |= ((dscp & 0x03) << 6); +} - /* Traffic Class and Flow Label */ - switch ((iphc0 & LOWPAN_IPHC_TF) >> 3) { - /* Traffic Class and FLow Label carried in-line - * ECN + DSCP + 4-bit Pad + Flow Label (4 bytes) +/* get 
the flow label values from iphc tf format and set it to ipv6hdr */ +static inline void lowpan_iphc_tf_set_lbl(struct ipv6hdr *hdr, const u8 *lbl) +{ + /* flow label is always some array started with lower nibble of + * flow_lbl[0] and followed with two bytes afterwards. Inside inline + * data the flow_lbl position can be different, which will be handled + * by lbl pointer. E.g. case "01" vs "00" the traffic class is 8 bit + * shifted, the different lbl pointer will handle that. + * + * The flow label will started at lower nibble of flow_lbl[0], the + * higher nibbles are part of DSCP + ECN. */ - case 0: /* 00b */ - if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp))) + hdr->flow_lbl[0] |= lbl[0] & 0x0f; + memcpy(&hdr->flow_lbl[1], &lbl[1], 2); +} + +/* lowpan_iphc_tf_decompress - decompress the traffic class. + * This function will return zero on success, a value lower than zero if + * failed. + */ +static int lowpan_iphc_tf_decompress(struct sk_buff *skb, struct ipv6hdr *hdr, + u8 val) +{ + u8 tf[4]; + + /* Traffic Class and Flow Label */ + switch (val) { + case LOWPAN_IPHC_TF_00: + /* ECN + DSCP + 4-bit Pad + Flow Label (4 bytes) */ + if (lowpan_fetch_skb(skb, tf, 4)) return -EINVAL; - memcpy(&hdr.flow_lbl, &skb->data[0], 3); - skb_pull(skb, 3); - hdr.priority = ((tmp >> 2) & 0x0f); - hdr.flow_lbl[0] = ((tmp >> 2) & 0x30) | (tmp << 6) | - (hdr.flow_lbl[0] & 0x0f); + /* 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |ECN| DSCP | rsv | Flow Label | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ + lowpan_iphc_tf_set_ecn(hdr, tf); + lowpan_iphc_tf_set_dscp(hdr, tf); + lowpan_iphc_tf_set_lbl(hdr, &tf[1]); break; - /* Traffic class carried in-line - * ECN + DSCP (1 byte), Flow Label is elided - */ - case 2: /* 10b */ - if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp))) + case LOWPAN_IPHC_TF_01: + /* ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided. 
*/ + if (lowpan_fetch_skb(skb, tf, 3)) return -EINVAL; - hdr.priority = ((tmp >> 2) & 0x0f); - hdr.flow_lbl[0] = ((tmp << 6) & 0xC0) | ((tmp >> 2) & 0x30); + /* 1 2 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |ECN|rsv| Flow Label | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ + lowpan_iphc_tf_set_ecn(hdr, tf); + lowpan_iphc_tf_set_lbl(hdr, &tf[0]); break; - /* Flow Label carried in-line - * ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided - */ - case 1: /* 01b */ - if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp))) + case LOWPAN_IPHC_TF_10: + /* ECN + DSCP (1 byte), Flow Label is elided. */ + if (lowpan_fetch_skb(skb, tf, 1)) return -EINVAL; - hdr.flow_lbl[0] = (skb->data[0] & 0x0F) | ((tmp >> 2) & 0x30); - memcpy(&hdr.flow_lbl[1], &skb->data[0], 2); - skb_pull(skb, 2); + /* 0 1 2 3 4 5 6 7 + * +-+-+-+-+-+-+-+-+ + * |ECN| DSCP | + * +-+-+-+-+-+-+-+-+ + */ + lowpan_iphc_tf_set_ecn(hdr, tf); + lowpan_iphc_tf_set_dscp(hdr, tf); break; - /* Traffic Class and Flow Label are elided */ - case 3: /* 11b */ + case LOWPAN_IPHC_TF_11: + /* Traffic Class and Flow Label are elided */ break; default: - break; + WARN_ON_ONCE(1); + return -EINVAL; } + return 0; +} + +/* TTL uncompression values */ +static const u8 lowpan_ttl_values[] = { + [LOWPAN_IPHC_HLIM_01] = 1, + [LOWPAN_IPHC_HLIM_10] = 64, + [LOWPAN_IPHC_HLIM_11] = 255, +}; + +int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, + const void *daddr, const void *saddr) +{ + struct ipv6hdr hdr = {}; + u8 iphc0, iphc1; + int err; + + raw_dump_table(__func__, "raw skb data dump uncompressed", + skb->data, skb->len); + + if (lowpan_fetch_skb(skb, &iphc0, sizeof(iphc0)) || + lowpan_fetch_skb(skb, &iphc1, sizeof(iphc1))) + return -EINVAL; + + /* another if the CID flag is set */ + if (iphc1 & LOWPAN_IPHC_CID) + return -ENOTSUPP; + + hdr.version = 6; + + err = lowpan_iphc_tf_decompress(skb, &hdr, + iphc0 & 
LOWPAN_IPHC_TF_MASK); + if (err < 0) + return err; + /* Next Header */ - if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) { + if (!(iphc0 & LOWPAN_IPHC_NH)) { /* Next header is carried inline */ if (lowpan_fetch_skb(skb, &hdr.nexthdr, sizeof(hdr.nexthdr))) return -EINVAL; @@ -306,35 +491,30 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, } /* Hop Limit */ - if ((iphc0 & 0x03) != LOWPAN_IPHC_TTL_I) { - hdr.hop_limit = lowpan_ttl_values[iphc0 & 0x03]; + if ((iphc0 & LOWPAN_IPHC_HLIM_MASK) != LOWPAN_IPHC_HLIM_00) { + hdr.hop_limit = lowpan_ttl_values[iphc0 & LOWPAN_IPHC_HLIM_MASK]; } else { if (lowpan_fetch_skb(skb, &hdr.hop_limit, sizeof(hdr.hop_limit))) return -EINVAL; } - /* Extract SAM to the tmp variable */ - tmp = ((iphc1 & LOWPAN_IPHC_SAM) >> LOWPAN_IPHC_SAM_BIT) & 0x03; - if (iphc1 & LOWPAN_IPHC_SAC) { /* Source address context based uncompression */ pr_debug("SAC bit is set. Handle context based source address.\n"); - err = uncompress_context_based_src_addr(skb, &hdr.saddr, tmp); + err = uncompress_context_based_src_addr(skb, &hdr.saddr, + iphc1 & LOWPAN_IPHC_SAM_MASK); } else { /* Source address uncompression */ pr_debug("source address stateless compression\n"); - err = uncompress_addr(skb, &hdr.saddr, tmp, saddr, - saddr_type, saddr_len); + err = uncompress_addr(skb, dev, &hdr.saddr, + iphc1 & LOWPAN_IPHC_SAM_MASK, saddr); } /* Check on error of previous branch */ if (err) return -EINVAL; - /* Extract DAM to the tmp variable */ - tmp = ((iphc1 & LOWPAN_IPHC_DAM_11) >> LOWPAN_IPHC_DAM_BIT) & 0x03; - /* check for Multicast Compression */ if (iphc1 & LOWPAN_IPHC_M) { if (iphc1 & LOWPAN_IPHC_DAC) { @@ -342,22 +522,22 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, /* TODO: implement this */ } else { err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr, - tmp); + iphc1 & LOWPAN_IPHC_DAM_MASK); if (err) return -EINVAL; } } else { - err = uncompress_addr(skb, &hdr.daddr, tmp, daddr, - daddr_type, daddr_len); + err = 
uncompress_addr(skb, dev, &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); pr_debug("dest: stateless compression mode %d dest %pI6c\n", - tmp, &hdr.daddr); + iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr); if (err) return -EINVAL; } /* Next header data uncompression */ - if (iphc0 & LOWPAN_IPHC_NH_C) { + if (iphc0 & LOWPAN_IPHC_NH) { err = lowpan_nhc_do_uncompression(skb, dev, &hdr); if (err < 0) return err; @@ -367,7 +547,18 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, return err; } - hdr.payload_len = htons(skb->len); + switch (lowpan_priv(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_802154_cb(skb)->d_size) + hdr.payload_len = htons(lowpan_802154_cb(skb)->d_size - + sizeof(struct ipv6hdr)); + else + hdr.payload_len = htons(skb->len); + break; + default: + hdr.payload_len = htons(skb->len); + break; + } pr_debug("skb headroom size = %d, data length = %d\n", skb_headroom(skb), skb->len); @@ -387,42 +578,176 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, } EXPORT_SYMBOL_GPL(lowpan_header_decompress); -static u8 lowpan_compress_addr_64(u8 **hc_ptr, u8 shift, - const struct in6_addr *ipaddr, - const unsigned char *lladdr) +static const u8 lowpan_iphc_dam_to_sam_value[] = { + [LOWPAN_IPHC_DAM_00] = LOWPAN_IPHC_SAM_00, + [LOWPAN_IPHC_DAM_01] = LOWPAN_IPHC_SAM_01, + [LOWPAN_IPHC_DAM_10] = LOWPAN_IPHC_SAM_10, + [LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11, +}; + +static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr, + const unsigned char *lladdr, bool sam) { - u8 val = 0; + u8 dam = LOWPAN_IPHC_DAM_00; if (is_addr_mac_addr_based(ipaddr, lladdr)) { - val = 3; /* 0-bits */ + dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ pr_debug("address compression 0 bits\n"); } else if (lowpan_is_iid_16_bit_compressable(ipaddr)) { /* compress IID to 16 bits xxxx::XXXX */ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2); - val = 2; /* 16-bits */ + dam = LOWPAN_IPHC_DAM_10; /* 16-bits */ 
raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)", *hc_ptr - 2, 2); } else { /* do not compress IID => xxxx::IID */ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8); - val = 1; /* 64-bits */ + dam = LOWPAN_IPHC_DAM_01; /* 64-bits */ raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", *hc_ptr - 8, 8); } - return rol8(val, shift); + if (sam) + return lowpan_iphc_dam_to_sam_value[dam]; + else + return dam; +} + +/* lowpan_iphc_get_tc - get the ECN + DCSP fields in hc format */ +static inline u8 lowpan_iphc_get_tc(const struct ipv6hdr *hdr) +{ + u8 dscp, ecn; + + /* hdr->priority contains the higher bits of dscp, lower are part of + * flow_lbl[0]. Note ECN, DCSP is swapped in ipv6 hdr. + */ + dscp = (hdr->priority << 2) | ((hdr->flow_lbl[0] & 0xc0) >> 6); + /* ECN is at the two lower bits from first nibble of flow_lbl[0] */ + ecn = (hdr->flow_lbl[0] & 0x30); + /* for pretty debug output, also shift ecn to get the ecn value */ + pr_debug("ecn 0x%02x dscp 0x%02x\n", ecn >> 4, dscp); + /* ECN is at 0x30 now, shift it to have ECN + DCSP */ + return (ecn << 2) | dscp; +} + +/* lowpan_iphc_is_flow_lbl_zero - check if flow label is zero */ +static inline bool lowpan_iphc_is_flow_lbl_zero(const struct ipv6hdr *hdr) +{ + return ((!(hdr->flow_lbl[0] & 0x0f)) && + !hdr->flow_lbl[1] && !hdr->flow_lbl[2]); +} + +/* lowpan_iphc_tf_compress - compress the traffic class which is set by + * ipv6hdr. Return the corresponding format identifier which is used. + */ +static u8 lowpan_iphc_tf_compress(u8 **hc_ptr, const struct ipv6hdr *hdr) +{ + /* get ecn dscp data in a byteformat as: ECN(hi) + DSCP(lo) */ + u8 tc = lowpan_iphc_get_tc(hdr), tf[4], val; + + /* printout the traffic class in hc format */ + pr_debug("tc 0x%02x\n", tc); + + if (lowpan_iphc_is_flow_lbl_zero(hdr)) { + if (!tc) { + /* 11: Traffic Class and Flow Label are elided. */ + val = LOWPAN_IPHC_TF_11; + } else { + /* 10: ECN + DSCP (1 byte), Flow Label is elided. 
+ * + * 0 1 2 3 4 5 6 7 + * +-+-+-+-+-+-+-+-+ + * |ECN| DSCP | + * +-+-+-+-+-+-+-+-+ + */ + lowpan_push_hc_data(hc_ptr, &tc, sizeof(tc)); + val = LOWPAN_IPHC_TF_10; + } + } else { + /* check if dscp is zero, it's after the first two bit */ + if (!(tc & 0x3f)) { + /* 01: ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided + * + * 1 2 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |ECN|rsv| Flow Label | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ + memcpy(&tf[0], &hdr->flow_lbl[0], 3); + /* zero the highest 4-bits, contains DCSP + ECN */ + tf[0] &= ~0xf0; + /* set ECN */ + tf[0] |= (tc & 0xc0); + + lowpan_push_hc_data(hc_ptr, tf, 3); + val = LOWPAN_IPHC_TF_01; + } else { + /* 00: ECN + DSCP + 4-bit Pad + Flow Label (4 bytes) + * + * 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |ECN| DSCP | rsv | Flow Label | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ + memcpy(&tf[0], &tc, sizeof(tc)); + /* highest nibble of flow_lbl[0] is part of DSCP + ECN + * which will be the 4-bit pad and will be filled with + * zeros afterwards. 
+ */ + memcpy(&tf[1], &hdr->flow_lbl[0], 3); + /* zero the 4-bit pad, which is reserved */ + tf[1] &= ~0xf0; + + lowpan_push_hc_data(hc_ptr, tf, 4); + val = LOWPAN_IPHC_TF_00; + } + } + + return val; } -int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) +static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr, + const struct in6_addr *ipaddr) { - u8 tmp, iphc0, iphc1, *hc_ptr; + u8 val; + + if (lowpan_is_mcast_addr_compressable8(ipaddr)) { + pr_debug("compressed to 1 octet\n"); + /* use last byte */ + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[15], 1); + val = LOWPAN_IPHC_DAM_11; + } else if (lowpan_is_mcast_addr_compressable32(ipaddr)) { + pr_debug("compressed to 4 octets\n"); + /* second byte + the last three */ + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[1], 1); + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[13], 3); + val = LOWPAN_IPHC_DAM_10; + } else if (lowpan_is_mcast_addr_compressable48(ipaddr)) { + pr_debug("compressed to 6 octets\n"); + /* second byte + the last five */ + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[1], 1); + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[11], 5); + val = LOWPAN_IPHC_DAM_01; + } else { + pr_debug("using full address\n"); + lowpan_push_hc_data(hc_ptr, ipaddr->s6_addr, 16); + val = LOWPAN_IPHC_DAM_00; + } + + return val; +} + +int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, + const void *daddr, const void *saddr) +{ + u8 iphc0, iphc1, *hc_ptr; struct ipv6hdr *hdr; - u8 head[100] = {}; + u8 head[LOWPAN_IPHC_MAX_HC_BUF_LEN] = {}; int ret, addr_type; - if (type != ETH_P_IPV6) + if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; hdr = ipv6_hdr(skb); @@ -446,63 +771,26 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, /* TODO: context lookup */ - raw_dump_inline(__func__, "saddr", - (unsigned char *)_saddr, IEEE802154_ADDR_LEN); - raw_dump_inline(__func__, "daddr", 
- (unsigned char *)_daddr, IEEE802154_ADDR_LEN); + raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN); + raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN); raw_dump_table(__func__, "sending raw skb network uncompressed packet", skb->data, skb->len); - /* Traffic class, flow label - * If flow label is 0, compress it. If traffic class is 0, compress it - * We have to process both in the same time as the offset of traffic - * class depends on the presence of version and flow label - */ - - /* hc format of TC is ECN | DSCP , original one is DSCP | ECN */ - tmp = (hdr->priority << 4) | (hdr->flow_lbl[0] >> 4); - tmp = ((tmp & 0x03) << 6) | (tmp >> 2); - - if (((hdr->flow_lbl[0] & 0x0F) == 0) && - (hdr->flow_lbl[1] == 0) && (hdr->flow_lbl[2] == 0)) { - /* flow label can be compressed */ - iphc0 |= LOWPAN_IPHC_FL_C; - if ((hdr->priority == 0) && - ((hdr->flow_lbl[0] & 0xF0) == 0)) { - /* compress (elide) all */ - iphc0 |= LOWPAN_IPHC_TC_C; - } else { - /* compress only the flow label */ - *hc_ptr = tmp; - hc_ptr += 1; - } - } else { - /* Flow label cannot be compressed */ - if ((hdr->priority == 0) && - ((hdr->flow_lbl[0] & 0xF0) == 0)) { - /* compress only traffic class */ - iphc0 |= LOWPAN_IPHC_TC_C; - *hc_ptr = (tmp & 0xc0) | (hdr->flow_lbl[0] & 0x0F); - memcpy(hc_ptr + 1, &hdr->flow_lbl[1], 2); - hc_ptr += 3; - } else { - /* compress nothing */ - memcpy(hc_ptr, hdr, 4); - /* replace the top byte with new ECN | DSCP format */ - *hc_ptr = tmp; - hc_ptr += 4; - } - } + /* Traffic Class, Flow Label compression */ + iphc0 |= lowpan_iphc_tf_compress(&hc_ptr, hdr); /* NOTE: payload length is always compressed */ /* Check if we provide the nhc format for nexthdr and compression * functionality. If not nexthdr is handled inline and not compressed. 
*/ - ret = lowpan_nhc_check_compression(skb, hdr, &hc_ptr, &iphc0); - if (ret < 0) - return ret; + ret = lowpan_nhc_check_compression(skb, hdr, &hc_ptr); + if (ret == -ENOENT) + lowpan_push_hc_data(&hc_ptr, &hdr->nexthdr, + sizeof(hdr->nexthdr)); + else + iphc0 |= LOWPAN_IPHC_NH; /* Hop limit * if 1: compress, encoding is 01 @@ -512,13 +800,13 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, */ switch (hdr->hop_limit) { case 1: - iphc0 |= LOWPAN_IPHC_TTL_1; + iphc0 |= LOWPAN_IPHC_HLIM_01; break; case 64: - iphc0 |= LOWPAN_IPHC_TTL_64; + iphc0 |= LOWPAN_IPHC_HLIM_10; break; case 255: - iphc0 |= LOWPAN_IPHC_TTL_255; + iphc0 |= LOWPAN_IPHC_HLIM_11; break; default: lowpan_push_hc_data(&hc_ptr, &hdr->hop_limit, @@ -532,9 +820,8 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, iphc1 |= LOWPAN_IPHC_SAC; } else { if (addr_type & IPV6_ADDR_LINKLOCAL) { - iphc1 |= lowpan_compress_addr_64(&hc_ptr, - LOWPAN_IPHC_SAM_BIT, - &hdr->saddr, _saddr); + iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->saddr, + saddr, true); pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n", &hdr->saddr, iphc1); } else { @@ -548,38 +835,12 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, if (addr_type & IPV6_ADDR_MULTICAST) { pr_debug("destination address is multicast: "); iphc1 |= LOWPAN_IPHC_M; - if (lowpan_is_mcast_addr_compressable8(&hdr->daddr)) { - pr_debug("compressed to 1 octet\n"); - iphc1 |= LOWPAN_IPHC_DAM_11; - /* use last byte */ - lowpan_push_hc_data(&hc_ptr, - &hdr->daddr.s6_addr[15], 1); - } else if (lowpan_is_mcast_addr_compressable32(&hdr->daddr)) { - pr_debug("compressed to 4 octets\n"); - iphc1 |= LOWPAN_IPHC_DAM_10; - /* second byte + the last three */ - lowpan_push_hc_data(&hc_ptr, - &hdr->daddr.s6_addr[1], 1); - lowpan_push_hc_data(&hc_ptr, - &hdr->daddr.s6_addr[13], 3); - } else if (lowpan_is_mcast_addr_compressable48(&hdr->daddr)) { - pr_debug("compressed to 6 octets\n"); - iphc1 |= 
LOWPAN_IPHC_DAM_01; - /* second byte + the last five */ - lowpan_push_hc_data(&hc_ptr, - &hdr->daddr.s6_addr[1], 1); - lowpan_push_hc_data(&hc_ptr, - &hdr->daddr.s6_addr[11], 5); - } else { - pr_debug("using full address\n"); - iphc1 |= LOWPAN_IPHC_DAM_00; - lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16); - } + iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr, &hdr->daddr); } else { if (addr_type & IPV6_ADDR_LINKLOCAL) { /* TODO: context lookup */ - iphc1 |= lowpan_compress_addr_64(&hc_ptr, - LOWPAN_IPHC_DAM_BIT, &hdr->daddr, _daddr); + iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->daddr, + daddr, false); pr_debug("dest address unicast link-local %pI6c " "iphc1 0x%02x\n", &hdr->daddr, iphc1); } else { @@ -589,7 +850,7 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, } /* next header compression */ - if (iphc0 & LOWPAN_IPHC_NH_C) { + if (iphc0 & LOWPAN_IPHC_NH) { ret = lowpan_nhc_do_compression(skb, hdr, &hc_ptr); if (ret < 0) return ret; @@ -610,19 +871,3 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, return 0; } EXPORT_SYMBOL_GPL(lowpan_header_compress); - -static int __init lowpan_module_init(void) -{ - request_module_nowait("nhc_dest"); - request_module_nowait("nhc_fragment"); - request_module_nowait("nhc_hop"); - request_module_nowait("nhc_ipv6"); - request_module_nowait("nhc_mobility"); - request_module_nowait("nhc_routing"); - request_module_nowait("nhc_udp"); - - return 0; -} -module_init(lowpan_module_init); - -MODULE_LICENSE("GPL"); diff --git a/kernel/net/6lowpan/nhc.c b/kernel/net/6lowpan/nhc.c index fd20fc51a..7008d53e4 100644 --- a/kernel/net/6lowpan/nhc.c +++ b/kernel/net/6lowpan/nhc.c @@ -95,23 +95,20 @@ static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb) } int lowpan_nhc_check_compression(struct sk_buff *skb, - const struct ipv6hdr *hdr, u8 **hc_ptr, - u8 *iphc0) + const struct ipv6hdr *hdr, u8 **hc_ptr) { struct lowpan_nhc *nhc; + int ret = 0; 
spin_lock_bh(&lowpan_nhc_lock); nhc = lowpan_nexthdr_nhcs[hdr->nexthdr]; - if (nhc && nhc->compress) - *iphc0 |= LOWPAN_IPHC_NH_C; - else - lowpan_push_hc_data(hc_ptr, &hdr->nexthdr, - sizeof(hdr->nexthdr)); + if (!(nhc && nhc->compress)) + ret = -ENOENT; spin_unlock_bh(&lowpan_nhc_lock); - return 0; + return ret; } int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr, @@ -157,7 +154,8 @@ out: return ret; } -int lowpan_nhc_do_uncompression(struct sk_buff *skb, struct net_device *dev, +int lowpan_nhc_do_uncompression(struct sk_buff *skb, + const struct net_device *dev, struct ipv6hdr *hdr) { struct lowpan_nhc *nhc; diff --git a/kernel/net/6lowpan/nhc.h b/kernel/net/6lowpan/nhc.h index ed44938eb..803041400 100644 --- a/kernel/net/6lowpan/nhc.h +++ b/kernel/net/6lowpan/nhc.h @@ -8,8 +8,6 @@ #include #include -#define LOWPAN_NHC_MAX_ID_LEN 1 - /** * LOWPAN_NHC - helper macro to generate nh id fields and lowpan_nhc struct * @@ -88,19 +86,16 @@ struct lowpan_nhc *lowpan_nhc_by_nexthdr(u8 nexthdr); /** * lowpan_nhc_check_compression - checks if we support compression format. If - * we support the nhc by nexthdr field, the 6LoWPAN iphc NHC bit will be - * set. If we don't support nexthdr will be added as inline data to the - * 6LoWPAN header. + * we support the nhc by nexthdr field, the function will return 0. If we + * don't support the nhc by nexthdr this function will return -ENOENT. * * @skb: skb of 6LoWPAN header to read nhc and replace header. * @hdr: ipv6hdr to check the nexthdr value * @hc_ptr: pointer for 6LoWPAN header which should increment at the end of * replaced header. 
- * @iphc0: iphc0 pointer to set the 6LoWPAN NHC bit */ int lowpan_nhc_check_compression(struct sk_buff *skb, - const struct ipv6hdr *hdr, u8 **hc_ptr, - u8 *iphc0); + const struct ipv6hdr *hdr, u8 **hc_ptr); /** * lowpan_nhc_do_compression - calling compress callback for nhc @@ -121,7 +116,8 @@ int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr, * @dev: netdevice for print logging information. * @hdr: ipv6hdr for setting nexthdr value. */ -int lowpan_nhc_do_uncompression(struct sk_buff *skb, struct net_device *dev, +int lowpan_nhc_do_uncompression(struct sk_buff *skb, + const struct net_device *dev, struct ipv6hdr *hdr); /** diff --git a/kernel/net/6lowpan/nhc_udp.c b/kernel/net/6lowpan/nhc_udp.c index c6bcaeb42..69537a2ea 100644 --- a/kernel/net/6lowpan/nhc_udp.c +++ b/kernel/net/6lowpan/nhc_udp.c @@ -17,7 +17,27 @@ #include "nhc.h" -#define LOWPAN_NHC_UDP_IDLEN 1 +#define LOWPAN_NHC_UDP_MASK 0xF8 +#define LOWPAN_NHC_UDP_ID 0xF0 +#define LOWPAN_NHC_UDP_IDLEN 1 + +#define LOWPAN_NHC_UDP_4BIT_PORT 0xF0B0 +#define LOWPAN_NHC_UDP_4BIT_MASK 0xFFF0 +#define LOWPAN_NHC_UDP_8BIT_PORT 0xF000 +#define LOWPAN_NHC_UDP_8BIT_MASK 0xFF00 + +/* values for port compression, _with checksum_ ie bit 5 set to 0 */ + +/* all inline */ +#define LOWPAN_NHC_UDP_CS_P_00 0xF0 +/* source 16bit inline, dest = 0xF0 + 8 bit inline */ +#define LOWPAN_NHC_UDP_CS_P_01 0xF1 +/* source = 0xF0 + 8bit inline, dest = 16 bit inline */ +#define LOWPAN_NHC_UDP_CS_P_10 0xF2 +/* source & dest = 0xF0B + 4bit inline */ +#define LOWPAN_NHC_UDP_CS_P_11 0xF3 +/* checksum elided */ +#define LOWPAN_NHC_UDP_CS_C 0x04 static int udp_uncompress(struct sk_buff *skb, size_t needed) { @@ -71,7 +91,18 @@ static int udp_uncompress(struct sk_buff *skb, size_t needed) * here, we obtain the hint from the remaining size of the * frame */ - uh.len = htons(skb->len + sizeof(struct udphdr)); + switch (lowpan_priv(skb->dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_802154_cb(skb)->d_size) 
+ uh.len = htons(lowpan_802154_cb(skb)->d_size - + sizeof(struct ipv6hdr)); + else + uh.len = htons(skb->len + sizeof(struct udphdr)); + break; + default: + uh.len = htons(skb->len + sizeof(struct udphdr)); + break; + } pr_debug("uncompressed UDP length: src = %d", ntohs(uh.len)); /* replace the compressed UDP head by the uncompressed UDP diff --git a/kernel/net/8021q/vlan.c b/kernel/net/8021q/vlan.c index 59555f0f8..d2cd9de4b 100644 --- a/kernel/net/8021q/vlan.c +++ b/kernel/net/8021q/vlan.c @@ -618,6 +618,92 @@ out: return err; } +static struct sk_buff **vlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff *p, **pp = NULL; + struct vlan_hdr *vhdr; + unsigned int hlen, off_vlan; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_vlan = skb_gro_offset(skb); + hlen = off_vlan + sizeof(*vhdr); + vhdr = skb_gro_header_fast(skb, off_vlan); + if (skb_gro_header_hard(skb, hlen)) { + vhdr = skb_gro_header_slow(skb, hlen, off_vlan); + if (unlikely(!vhdr)) + goto out; + } + + type = vhdr->h_vlan_encapsulated_proto; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) + goto out_unlock; + + flush = 0; + + for (p = *head; p; p = p->next) { + struct vlan_hdr *vhdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); + if (compare_vlan_header(vhdr, vhdr2)) + NAPI_GRO_CB(p)->same_flow = 0; + } + + skb_gro_pull(skb, sizeof(*vhdr)); + skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int vlan_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); + __be16 type = vhdr->h_vlan_encapsulated_proto; + struct packet_offload *ptype; + int err = -ENOENT; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = 
ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); + + rcu_read_unlock(); + return err; +} + +static struct packet_offload vlan_packet_offloads[] __read_mostly = { + { + .type = cpu_to_be16(ETH_P_8021Q), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, + { + .type = cpu_to_be16(ETH_P_8021AD), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, +}; + static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); @@ -645,6 +731,7 @@ static struct pernet_operations vlan_net_ops = { static int __init vlan_proto_init(void) { int err; + unsigned int i; pr_info("%s v%s\n", vlan_fullname, vlan_version); @@ -668,6 +755,9 @@ static int __init vlan_proto_init(void) if (err < 0) goto err5; + for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) + dev_add_offload(&vlan_packet_offloads[i]); + vlan_ioctl_set(vlan_ioctl_handler); return 0; @@ -685,7 +775,13 @@ err0: static void __exit vlan_cleanup_module(void) { + unsigned int i; + vlan_ioctl_set(NULL); + + for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) + dev_remove_offload(&vlan_packet_offloads[i]); + vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); diff --git a/kernel/net/8021q/vlan_core.c b/kernel/net/8021q/vlan_core.c index 61bf2a06e..e2ed69850 100644 --- a/kernel/net/8021q/vlan_core.c +++ b/kernel/net/8021q/vlan_core.c @@ -30,7 +30,9 @@ bool vlan_do_receive(struct sk_buff **skbp) skb->pkt_type = PACKET_HOST; } - if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) { + if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) && + !netif_is_macvlan_port(vlan_dev) && + !netif_is_bridge_port(vlan_dev)) { unsigned int offset = skb->data - skb_mac_header(skb); /* @@ -206,7 +208,10 @@ static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid, return -ENOMEM; if 
(vlan_hw_filter_capable(dev, vid_info)) { - err = ops->ndo_vlan_rx_add_vid(dev, proto, vid); + if (netif_device_present(dev)) + err = ops->ndo_vlan_rx_add_vid(dev, proto, vid); + else + err = -ENODEV; if (err) { kfree(vid_info); return err; @@ -264,7 +269,10 @@ static void __vlan_vid_del(struct vlan_info *vlan_info, int err; if (vlan_hw_filter_capable(dev, vid_info)) { - err = ops->ndo_vlan_rx_kill_vid(dev, proto, vid); + if (netif_device_present(dev)) + err = ops->ndo_vlan_rx_kill_vid(dev, proto, vid); + else + err = -ENODEV; if (err) { pr_warn("failed to kill vid %04x/%d for device %s\n", proto, vid, dev->name); diff --git a/kernel/net/8021q/vlan_dev.c b/kernel/net/8021q/vlan_dev.c index 01d7ba840..fded86508 100644 --- a/kernel/net/8021q/vlan_dev.c +++ b/kernel/net/8021q/vlan_dev.c @@ -791,10 +791,9 @@ void vlan_setup(struct net_device *dev) { ether_setup(dev); - dev->priv_flags |= IFF_802_1Q_VLAN; + dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); - dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; dev->destructor = vlan_dev_free; diff --git a/kernel/net/9p/client.c b/kernel/net/9p/client.c index fcf6fe063..ea79ee9a7 100644 --- a/kernel/net/9p/client.c +++ b/kernel/net/9p/client.c @@ -1584,6 +1584,10 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) p9_free_req(clnt, req); break; } + if (rsize < count) { + pr_err("bogus RREAD count (%d > %d)\n", count, rsize); + count = rsize; + } p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); if (!count) { @@ -1652,6 +1656,10 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) p9_free_req(clnt, req); break; } + if (rsize < count) { + pr_err("bogus RWRITE count (%d > %d)\n", count, rsize); + count = rsize; + } p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); diff --git a/kernel/net/9p/trans_rdma.c b/kernel/net/9p/trans_rdma.c index 3533d2a53..52b4a2f99 100644 --- 
a/kernel/net/9p/trans_rdma.c +++ b/kernel/net/9p/trans_rdma.c @@ -94,8 +94,6 @@ struct p9_trans_rdma { struct ib_pd *pd; struct ib_qp *qp; struct ib_cq *cq; - struct ib_mr *dma_mr; - u32 lkey; long timeout; int sq_depth; struct semaphore sq_sem; @@ -382,9 +380,6 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma) if (!rdma) return; - if (rdma->dma_mr && !IS_ERR(rdma->dma_mr)) - ib_dereg_mr(rdma->dma_mr); - if (rdma->qp && !IS_ERR(rdma->qp)) ib_destroy_qp(rdma->qp); @@ -415,7 +410,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) sge.addr = c->busa; sge.length = client->msize; - sge.lkey = rdma->lkey; + sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; c->wc_op = IB_WC_RECV; @@ -506,7 +501,7 @@ dont_need_post_recv: sge.addr = c->busa; sge.length = c->req->tc->size; - sge.lkey = rdma->lkey; + sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; c->wc_op = IB_WC_SEND; @@ -647,7 +642,7 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; - struct ib_device_attr devattr; + struct ib_cq_init_attr cq_attr = {}; /* Parse the transport specific mount options */ err = parse_opts(args, &opts); @@ -660,8 +655,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) return -ENOMEM; /* Create the RDMA CM ID */ - rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP, - IB_QPT_RC); + rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(rdma->cm_id)) goto error; @@ -699,15 +694,11 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) goto error; - /* Query the device attributes */ - err = ib_query_device(rdma->cm_id->device, &devattr); - if (err) - goto error; - /* Create the Completion Queue */ + cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1; rdma->cq = 
ib_create_cq(rdma->cm_id->device, cq_comp_handler, cq_event_handler, client, - opts.sq_depth + opts.rq_depth + 1, 0); + &cq_attr); if (IS_ERR(rdma->cq)) goto error; ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); @@ -717,17 +708,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (IS_ERR(rdma->pd)) goto error; - /* Cache the DMA lkey in the transport */ - rdma->dma_mr = NULL; - if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) - rdma->lkey = rdma->cm_id->device->local_dma_lkey; - else { - rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rdma->dma_mr)) - goto error; - rdma->lkey = rdma->dma_mr->lkey; - } - /* Create the Queue Pair */ memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; diff --git a/kernel/net/9p/trans_virtio.c b/kernel/net/9p/trans_virtio.c index 9dd49ca67..6e70ddb15 100644 --- a/kernel/net/9p/trans_virtio.c +++ b/kernel/net/9p/trans_virtio.c @@ -704,6 +704,7 @@ static void p9_virtio_remove(struct virtio_device *vdev) mutex_unlock(&virtio_9p_lock); + vdev->config->reset(vdev); vdev->config->del_vqs(vdev); sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); diff --git a/kernel/net/Kconfig b/kernel/net/Kconfig index 44dd5786e..127da94ae 100644 --- a/kernel/net/Kconfig +++ b/kernel/net/Kconfig @@ -45,6 +45,9 @@ config COMPAT_NETLINK_MESSAGES Newly written code should NEVER need this option but do compat-independent messages instead! +config NET_INGRESS + bool + menu "Networking options" source "net/packet/Kconfig" @@ -229,6 +232,7 @@ source "net/netlink/Kconfig" source "net/mpls/Kconfig" source "net/hsr/Kconfig" source "net/switchdev/Kconfig" +source "net/l3mdev/Kconfig" config RPS bool @@ -371,6 +375,13 @@ source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +config LWTUNNEL + bool "Network light weight tunnels" + ---help--- + This feature provides an infrastructure to support light weight + tunnels like mpls. 
There is no netdevice associated with a light + weight tunnel endpoint. Tunnel encapsulation parameters are stored + with light weight tunnel state associated with fib routes. endif # if NET diff --git a/kernel/net/Makefile b/kernel/net/Makefile index 3995613e5..a5d04098d 100644 --- a/kernel/net/Makefile +++ b/kernel/net/Makefile @@ -74,3 +74,6 @@ obj-$(CONFIG_HSR) += hsr/ ifneq ($(CONFIG_NET_SWITCHDEV),) obj-y += switchdev/ endif +ifneq ($(CONFIG_NET_L3_MASTER_DEV),) +obj-y += l3mdev/ +endif diff --git a/kernel/net/appletalk/ddp.c b/kernel/net/appletalk/ddp.c index 3b7ad43c7..d5871ac49 100644 --- a/kernel/net/appletalk/ddp.c +++ b/kernel/net/appletalk/ddp.c @@ -1030,7 +1030,7 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto); + sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern); if (!sk) goto out; rc = 0; diff --git a/kernel/net/atm/br2684.c b/kernel/net/atm/br2684.c index cc78538d1..aa0047c5c 100644 --- a/kernel/net/atm/br2684.c +++ b/kernel/net/atm/br2684.c @@ -802,13 +802,10 @@ static int br2684_seq_show(struct seq_file *seq, void *v) (brdev->payload == p_bridged) ? 
"bridged" : "routed", brvcc->copies_failed, brvcc->copies_needed); #ifdef CONFIG_ATM_BR2684_IPFILTER -#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte] -#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3) if (brvcc->filter.netmask != 0) - seq_printf(seq, " filter=%d.%d.%d.%d/" - "%d.%d.%d.%d\n", bs(prefix), bs(netmask)); -#undef bs -#undef b1 + seq_printf(seq, " filter=%pI4/%pI4\n", + &brvcc->filter.prefix, + &brvcc->filter.netmask); #endif /* CONFIG_ATM_BR2684_IPFILTER */ } return 0; diff --git a/kernel/net/atm/clip.c b/kernel/net/atm/clip.c index 17e55dfec..e07f551a8 100644 --- a/kernel/net/atm/clip.c +++ b/kernel/net/atm/clip.c @@ -317,6 +317,9 @@ static int clip_constructor(struct neighbour *neigh) static int clip_encap(struct atm_vcc *vcc, int mode) { + if (!CLIP_VCC(vcc)) + return -EBADFD; + CLIP_VCC(vcc)->encap = mode; return 0; } diff --git a/kernel/net/atm/common.c b/kernel/net/atm/common.c index ed0466637..49a872db7 100644 --- a/kernel/net/atm/common.c +++ b/kernel/net/atm/common.c @@ -141,7 +141,7 @@ static struct proto vcc_proto = { .release_cb = vcc_release_cb, }; -int vcc_create(struct net *net, struct socket *sock, int protocol, int family) +int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern) { struct sock *sk; struct atm_vcc *vcc; @@ -149,7 +149,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family) sock->sk = NULL; if (sock->type == SOCK_STREAM) return -EINVAL; - sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto); + sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); diff --git a/kernel/net/atm/common.h b/kernel/net/atm/common.h index 4d6f5b206..959436b87 100644 --- a/kernel/net/atm/common.h +++ b/kernel/net/atm/common.h @@ -10,7 +10,7 @@ #include /* for poll_table */ -int vcc_create(struct net *net, struct socket *sock, int protocol, int family); +int vcc_create(struct net *net, struct socket *sock, int 
protocol, int family, int kern); int vcc_release(struct socket *sock); int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/kernel/net/atm/pvc.c b/kernel/net/atm/pvc.c index ae0324021..040207ec3 100644 --- a/kernel/net/atm/pvc.c +++ b/kernel/net/atm/pvc.c @@ -136,7 +136,7 @@ static int pvc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &pvc_proto_ops; - return vcc_create(net, sock, protocol, PF_ATMPVC); + return vcc_create(net, sock, protocol, PF_ATMPVC, kern); } static const struct net_proto_family pvc_family_ops = { diff --git a/kernel/net/atm/svc.c b/kernel/net/atm/svc.c index 1ba23f501..3fa0a9ee9 100644 --- a/kernel/net/atm/svc.c +++ b/kernel/net/atm/svc.c @@ -660,7 +660,7 @@ static int svc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &svc_proto_ops; - error = vcc_create(net, sock, protocol, AF_ATMSVC); + error = vcc_create(net, sock, protocol, AF_ATMSVC, kern); if (error) return error; ATM_SD(sock)->local.sas_family = AF_ATMSVC; diff --git a/kernel/net/ax25/af_ax25.c b/kernel/net/ax25/af_ax25.c index 330c1f4a5..fbd0acf80 100644 --- a/kernel/net/ax25/af_ax25.c +++ b/kernel/net/ax25/af_ax25.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -58,7 +57,7 @@ static const struct proto_ops ax25_proto_ops; static void ax25_free_sock(struct sock *sk) { - ax25_cb_put(ax25_sk(sk)); + ax25_cb_put(sk_to_ax25(sk)); } /* @@ -307,7 +306,7 @@ void ax25_destroy_socket(ax25_cb *ax25) while ((skb = skb_dequeue(&ax25->sk->sk_receive_queue)) != NULL) { if (skb->sk != ax25->sk) { /* A pending connection */ - ax25_cb *sax25 = ax25_sk(skb->sk); + ax25_cb *sax25 = sk_to_ax25(skb->sk); /* Queue the unaccepted socket for death */ sock_orphan(skb->sk); @@ -552,7 +551,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; 
lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); switch (optname) { case AX25_WINDOW: @@ -698,7 +697,7 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname, length = min_t(unsigned int, maxlen, sizeof(int)); lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); switch (optname) { case AX25_WINDOW: @@ -797,7 +796,7 @@ out: static struct proto ax25_proto = { .name = "AX25", .owner = THIS_MODULE, - .obj_size = sizeof(struct sock), + .obj_size = sizeof(struct ax25_sock), }; static int ax25_create(struct net *net, struct socket *sock, int protocol, @@ -806,6 +805,9 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; ax25_cb *ax25; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; @@ -855,11 +857,11 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto); + sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern); if (sk == NULL) return -ENOMEM; - ax25 = sk->sk_protinfo = ax25_create_cb(); + ax25 = ax25_sk(sk)->cb = ax25_create_cb(); if (!ax25) { sk_free(sk); return -ENOMEM; @@ -881,7 +883,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) struct sock *sk; ax25_cb *ax25, *oax25; - sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; @@ -911,7 +913,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); - oax25 = ax25_sk(osk); + oax25 = sk_to_ax25(osk); ax25->modulus = oax25->modulus; ax25->backoff = oax25->backoff; @@ -939,7 +941,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) } } - sk->sk_protinfo = ax25; + ax25_sk(sk)->cb = ax25; sk->sk_destruct = ax25_free_sock; ax25->sk 
= sk; @@ -957,7 +959,7 @@ static int ax25_release(struct socket *sock) sock_hold(sk); sock_orphan(sk); lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); if (sk->sk_type == SOCK_SEQPACKET) { switch (ax25->state) { @@ -1067,7 +1069,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); if (!sock_flag(sk, SOCK_ZAPPED)) { err = -EINVAL; goto out; @@ -1114,7 +1116,7 @@ static int __must_check ax25_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; - ax25_cb *ax25 = ax25_sk(sk), *ax25t; + ax25_cb *ax25 = sk_to_ax25(sk), *ax25t; struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; ax25_digi *digi = NULL; int ct = 0, err = 0; @@ -1395,7 +1397,7 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, memset(fsa, 0, sizeof(*fsa)); lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); if (peer != 0) { if (sk->sk_state != TCP_ESTABLISHED) { @@ -1447,7 +1449,7 @@ static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) return -EINVAL; lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); if (sock_flag(sk, SOCK_ZAPPED)) { err = -EADDRNOTAVAIL; @@ -1622,7 +1624,7 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (skb == NULL) goto out; - if (!ax25_sk(sk)->pidincl) + if (!sk_to_ax25(sk)->pidincl) skb_pull(skb, 1); /* Remove PID */ skb_reset_transport_header(skb); @@ -1763,7 +1765,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCAX25GETINFO: case SIOCAX25GETINFOOLD: { - ax25_cb *ax25 = ax25_sk(sk); + ax25_cb *ax25 = sk_to_ax25(sk); struct ax25_info_struct ax25_info; ax25_info.t1 = ax25->t1 / HZ; diff --git a/kernel/net/ax25/ax25_in.c b/kernel/net/ax25/ax25_in.c index 7ed8ab724..bb5a0e4e9 100644 --- a/kernel/net/ax25/ax25_in.c +++ b/kernel/net/ax25/ax25_in.c @@ -23,7 +23,6 @@ 
#include #include #include -#include #include #include #include @@ -354,7 +353,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, return 0; } - ax25 = ax25_sk(make); + ax25 = sk_to_ax25(make); skb_set_owner_r(skb, make); skb_queue_head(&sk->sk_receive_queue, skb); diff --git a/kernel/net/ax25/ax25_ip.c b/kernel/net/ax25/ax25_ip.c index 7c646bb2c..b563a3f5f 100644 --- a/kernel/net/ax25/ax25_ip.c +++ b/kernel/net/ax25/ax25_ip.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/net/ax25/ax25_out.c b/kernel/net/ax25/ax25_out.c index be2acab9b..8ddd41baa 100644 --- a/kernel/net/ax25/ax25_out.c +++ b/kernel/net/ax25/ax25_out.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/net/ax25/ax25_subr.c b/kernel/net/ax25/ax25_subr.c index 1997538a5..3b78e8473 100644 --- a/kernel/net/ax25/ax25_subr.c +++ b/kernel/net/ax25/ax25_subr.c @@ -264,6 +264,7 @@ void ax25_disconnect(ax25_cb *ax25, int reason) { ax25_clear_queues(ax25); + ax25_stop_heartbeat(ax25); ax25_stop_t1timer(ax25); ax25_stop_t2timer(ax25); ax25_stop_t3timer(ax25); diff --git a/kernel/net/ax25/ax25_uid.c b/kernel/net/ax25/ax25_uid.c index 71c4badbc..4ad2fb7bc 100644 --- a/kernel/net/ax25/ax25_uid.c +++ b/kernel/net/ax25/ax25_uid.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/net/batman-adv/Makefile b/kernel/net/batman-adv/Makefile index eb7d8c038..21434ab79 100644 --- a/kernel/net/batman-adv/Makefile +++ b/kernel/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: # # Marek Lindner, Simon Wunderlich # @@ -20,7 +20,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv.o batman-adv-y += bat_iv_ogm.o batman-adv-y += bitarray.o batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o -batman-adv-y += debugfs.o +batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o batman-adv-y += fragmentation.o batman-adv-y += gateway_client.o @@ -29,6 +29,7 @@ batman-adv-y += hard-interface.o batman-adv-y += hash.o batman-adv-y += icmp_socket.o batman-adv-y += main.o +batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += routing.o @@ -36,4 +37,3 @@ batman-adv-y += send.o batman-adv-y += soft-interface.o batman-adv-y += sysfs.o batman-adv-y += translation-table.o -batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o diff --git a/kernel/net/batman-adv/bat_algo.h b/kernel/net/batman-adv/bat_algo.h index 4e49666f8..4e59cf3eb 100644 --- a/kernel/net/batman-adv/bat_algo.h +++ b/kernel/net/batman-adv/bat_algo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/kernel/net/batman-adv/bat_iv_ogm.c b/kernel/net/batman-adv/bat_iv_ogm.c index 00e00e09b..912d9c36f 100644 --- a/kernel/net/batman-adv/bat_iv_ogm.c +++ b/kernel/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,20 +15,50 @@ * along with this program; if not, see . 
*/ +#include "bat_algo.h" #include "main.h" -#include "translation-table.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitarray.h" +#include "hard-interface.h" +#include "hash.h" +#include "network-coding.h" #include "originator.h" +#include "packet.h" #include "routing.h" -#include "gateway_common.h" -#include "gateway_client.h" -#include "hard-interface.h" #include "send.h" -#include "bat_algo.h" -#include "network-coding.h" +#include "translation-table.h" /** * enum batadv_dup_status - duplicate status - * @BATADV_NO_DUP: the packet is a duplicate + * @BATADV_NO_DUP: the packet is no duplicate * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the * neighbor) * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor @@ -47,24 +77,25 @@ enum batadv_dup_status { * @lq_index: index to store the value at * @value: value to store in the ring buffer */ -static void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, - uint8_t value) +static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) { lq_recv[*lq_index] = value; *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE; } /** - * batadv_ring_buffer_set - compute the average of all non-zero values stored + * batadv_ring_buffer_avg - compute the average of all non-zero values stored * in the given ring buffer * @lq_recv: pointer to the ring buffer * * Returns computed average value. 
*/ -static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]) +static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) { - const uint8_t *ptr; - uint16_t count = 0, i = 0, sum = 0; + const u8 *ptr; + u16 count = 0; + u16 i = 0; + u16 sum = 0; ptr = lq_recv; @@ -81,7 +112,7 @@ static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]) if (count == 0) return 0; - return (uint8_t)(sum / count); + return (u8)(sum / count); } /** @@ -123,14 +154,14 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, kfree(orig_node->bat_iv.bcast_own); orig_node->bat_iv.bcast_own = data_ptr; - data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC); + data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); if (!data_ptr) { kfree(orig_node->bat_iv.bcast_own); goto unlock; } memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, - (max_if_num - 1) * sizeof(uint8_t)); + (max_if_num - 1) * sizeof(u8)); kfree(orig_node->bat_iv.bcast_own_sum); orig_node->bat_iv.bcast_own_sum = data_ptr; @@ -183,19 +214,19 @@ free_bcast_own: if (max_if_num == 0) goto free_own_sum; - data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC); + data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); if (!data_ptr) { kfree(orig_node->bat_iv.bcast_own); goto unlock; } memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, - del_if_num * sizeof(uint8_t)); + del_if_num * sizeof(u8)); - if_offset = (del_if_num + 1) * sizeof(uint8_t); - memcpy((char *)data_ptr + del_if_num * sizeof(uint8_t), + if_offset = (del_if_num + 1) * sizeof(u8); + memcpy((char *)data_ptr + del_if_num * sizeof(u8), orig_node->bat_iv.bcast_own_sum + if_offset, - (max_if_num - del_if_num) * sizeof(uint8_t)); + (max_if_num - del_if_num) * sizeof(u8)); free_own_sum: kfree(orig_node->bat_iv.bcast_own_sum); @@ -218,7 +249,7 @@ unlock: * If the object does not exists it is created an initialised. 
*/ static struct batadv_orig_node * -batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr) +batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int size, hash_added; @@ -238,7 +269,7 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr) if (!orig_node->bat_iv.bcast_own) goto free_orig_node; - size = bat_priv->num_ifaces * sizeof(uint8_t); + size = bat_priv->num_ifaces * sizeof(u8); orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC); if (!orig_node->bat_iv.bcast_own_sum) goto free_orig_node; @@ -261,43 +292,17 @@ free_orig_node: static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, - const uint8_t *neigh_addr, + const u8 *neigh_addr, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_neigh_node *neigh_node, *tmp_neigh_node; + struct batadv_neigh_node *neigh_node; - neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node); + neigh_node = batadv_neigh_node_new(orig_node, hard_iface, neigh_addr); if (!neigh_node) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) { - kfree(neigh_node); - neigh_node = NULL; - goto out; - } - neigh_node->orig_node = orig_neigh; - neigh_node->if_incoming = hard_iface; - - spin_lock_bh(&orig_node->neigh_list_lock); - tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface, - neigh_addr); - if (!tmp_neigh_node) { - hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); - } else { - kfree(neigh_node); - batadv_hardif_free_ref(hard_iface); - neigh_node = tmp_neigh_node; - } - spin_unlock_bh(&orig_node->neigh_list_lock); - - if (!tmp_neigh_node) - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Creating new neighbor %pM for orig_node %pM on interface %s\n", - neigh_addr, orig_node->orig, - hard_iface->net_dev->name); out: return neigh_node; @@ -307,8 +312,7 
@@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; unsigned char *ogm_buff; - uint32_t random_seqno; - int res = -ENOMEM; + u32 random_seqno; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); @@ -317,7 +321,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) - goto out; + return -ENOMEM; hard_iface->bat_iv.ogm_buff = ogm_buff; @@ -329,10 +333,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; - res = 0; - -out: - return res; + return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) @@ -383,8 +384,7 @@ static unsigned long batadv_iv_ogm_fwd_send_time(void) } /* apply hop penalty for a normal link */ -static uint8_t batadv_hop_penalty(uint8_t tq, - const struct batadv_priv *bat_priv) +static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) { int hop_penalty = atomic_read(&bat_priv->hop_penalty); int new_tq; @@ -396,8 +396,8 @@ static uint8_t batadv_hop_penalty(uint8_t tq, } /* is there another aggregated packet here? 
*/ -static int batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, - __be16 tvlv_len) +static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, + __be16 tvlv_len) { int next_buff_pos = 0; @@ -413,12 +413,12 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - char *fwd_str; - uint8_t packet_num; - int16_t buff_pos; + const char *fwd_str; + u8 packet_num; + s16 buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; - uint8_t *packet_pos; + u8 *packet_pos; if (hard_iface->if_status != BATADV_IF_ACTIVE) return; @@ -451,7 +451,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, batadv_ogm_packet->orig, ntohl(batadv_ogm_packet->seqno), batadv_ogm_packet->tq, batadv_ogm_packet->ttl, - (batadv_ogm_packet->flags & BATADV_DIRECTLINK ? + ((batadv_ogm_packet->flags & BATADV_DIRECTLINK) ? "on" : "off"), hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); @@ -548,58 +548,62 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, * - the send time is within our MAX_AGGREGATION_MS time * - the resulting packet wont be bigger than * MAX_AGGREGATION_BYTES + * otherwise aggregation is not possible */ - if (time_before(send_time, forw_packet->send_time) && - time_after_eq(aggregation_end_time, forw_packet->send_time) && - (aggregated_bytes <= BATADV_MAX_AGGREGATION_BYTES)) { - /* check aggregation compatibility - * -> direct link packets are broadcasted on - * their interface only - * -> aggregate packet if the current packet is - * a "global" packet as well as the base - * packet - */ - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto out; - - /* packet is not leaving on the same interface. 
*/ - if (forw_packet->if_outgoing != if_outgoing) - goto out; + if (!time_before(send_time, forw_packet->send_time) || + !time_after_eq(aggregation_end_time, forw_packet->send_time)) + return false; + + if (aggregated_bytes > BATADV_MAX_AGGREGATION_BYTES) + return false; + + /* packet is not leaving on the same interface. */ + if (forw_packet->if_outgoing != if_outgoing) + return false; + + /* check aggregation compatibility + * -> direct link packets are broadcasted on + * their interface only + * -> aggregate packet if the current packet is + * a "global" packet as well as the base + * packet + */ + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + return false; - /* packets without direct link flag and high TTL - * are flooded through the net - */ - if ((!directlink) && - (!(batadv_ogm_packet->flags & BATADV_DIRECTLINK)) && - (batadv_ogm_packet->ttl != 1) && - - /* own packets originating non-primary - * interfaces leave only that interface - */ - ((!forw_packet->own) || - (forw_packet->if_incoming == primary_if))) { - res = true; - goto out; - } + /* packets without direct link flag and high TTL + * are flooded through the net + */ + if (!directlink && + !(batadv_ogm_packet->flags & BATADV_DIRECTLINK) && + batadv_ogm_packet->ttl != 1 && + + /* own packets originating non-primary + * interfaces leave only that interface + */ + (!forw_packet->own || + forw_packet->if_incoming == primary_if)) { + res = true; + goto out; + } - /* if the incoming packet is sent via this one - * interface only - we still can aggregate - */ - if ((directlink) && - (new_bat_ogm_packet->ttl == 1) && - (forw_packet->if_incoming == if_incoming) && - - /* packets from direct neighbors or - * own secondary interface packets - * (= secondary interface packets in general) - */ - (batadv_ogm_packet->flags & BATADV_DIRECTLINK || - (forw_packet->own && - forw_packet->if_incoming != primary_if))) { - res = true; - goto out; - } + /* if the incoming packet is sent via 
this one + * interface only - we still can aggregate + */ + if (directlink && + new_bat_ogm_packet->ttl == 1 && + forw_packet->if_incoming == if_incoming && + + /* packets from direct neighbors or + * own secondary interface packets + * (= secondary interface packets in general) + */ + (batadv_ogm_packet->flags & BATADV_DIRECTLINK || + (forw_packet->own && + forw_packet->if_incoming != primary_if))) { + res = true; + goto out; } out: @@ -642,19 +646,16 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "batman packet queue full\n"); - goto out; + goto out_free_outgoing; } } forw_packet_aggr = kmalloc(sizeof(*forw_packet_aggr), GFP_ATOMIC); - if (!forw_packet_aggr) { - if (!own_packet) - atomic_inc(&bat_priv->batman_queue_left); - goto out; - } + if (!forw_packet_aggr) + goto out_nomem; - if ((atomic_read(&bat_priv->aggregated_ogms)) && - (packet_len < BATADV_MAX_AGGREGATION_BYTES)) + if (atomic_read(&bat_priv->aggregated_ogms) && + packet_len < BATADV_MAX_AGGREGATION_BYTES) skb_size = BATADV_MAX_AGGREGATION_BYTES; else skb_size = packet_len; @@ -662,12 +663,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, skb_size += ETH_HLEN; forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size); - if (!forw_packet_aggr->skb) { - if (!own_packet) - atomic_inc(&bat_priv->batman_queue_left); - kfree(forw_packet_aggr); - goto out; - } + if (!forw_packet_aggr->skb) + goto out_free_forw_packet; forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); @@ -699,7 +696,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, send_time - jiffies); return; -out: +out_free_forw_packet: + kfree(forw_packet_aggr); +out_nomem: + if (!own_packet) + atomic_inc(&bat_priv->batman_queue_left); +out_free_outgoing: batadv_hardif_free_ref(if_outgoing); out_free_incoming: 
batadv_hardif_free_ref(if_incoming); @@ -752,13 +754,13 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned long max_aggregation_jiffies; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; - direct_link = batadv_ogm_packet->flags & BATADV_DIRECTLINK ? 1 : 0; + direct_link = !!(batadv_ogm_packet->flags & BATADV_DIRECTLINK); max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* find position for the packet in the forward queue */ spin_lock_bh(&bat_priv->forw_bat_list_lock); /* own packets are not to be aggregated */ - if ((atomic_read(&bat_priv->aggregated_ogms)) && (!own_packet)) { + if (atomic_read(&bat_priv->aggregated_ogms) && !own_packet) { hlist_for_each_entry(forw_packet_pos, &bat_priv->forw_bat_list, list) { if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet, @@ -807,7 +809,7 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); - uint16_t tvlv_len; + u16 tvlv_len; if (batadv_ogm_packet->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); @@ -866,9 +868,9 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) struct hlist_head *head; struct batadv_orig_node *orig_node; unsigned long *word; - uint32_t i; + u32 i; size_t word_index; - uint8_t *w; + u8 *w; int if_num; for (i = 0; i < hash->size; i++) { @@ -897,8 +899,8 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *primary_if, *tmp_hard_iface; int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len; - uint32_t seqno; - uint16_t tvlv_len = 0; + u32 seqno; + u16 tvlv_len = 0; unsigned long send_time; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -917,7 +919,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) batadv_ogm_packet->tvlv_len = htons(tvlv_len); /* 
change sequence number to network order */ - seqno = (uint32_t)atomic_read(&hard_iface->bat_iv.ogm_seqno); + seqno = (u32)atomic_read(&hard_iface->bat_iv.ogm_seqno); batadv_ogm_packet->seqno = htonl(seqno); atomic_inc(&hard_iface->bat_iv.ogm_seqno); @@ -940,7 +942,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) rcu_read_lock(); list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { if (tmp_hard_iface->soft_iface != hard_iface->soft_iface) - continue; + continue; batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, tmp_hard_iface, 1, send_time); @@ -976,13 +978,14 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; - struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL; + struct batadv_neigh_node *neigh_node = NULL; + struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_node_tmp; int if_num; - uint8_t sum_orig, sum_neigh; - uint8_t *neigh_addr; - uint8_t tq_avg; + u8 sum_orig, sum_neigh; + u8 *neigh_addr; + u8 tq_avg; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "update_originator(): Searching and updating originator entry of received packet\n"); @@ -1034,9 +1037,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, batadv_orig_node_free_ref(orig_tmp); if (!neigh_node) goto unlock; - } else + } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Updating existing last-hop neighbor of originator\n"); + } rcu_read_unlock(); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); @@ -1081,7 +1085,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, * won't consider it either */ if (router_ifinfo && - (neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg)) { + neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { orig_node_tmp = router->orig_node; 
spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); if_num = router->if_incoming->if_num; @@ -1133,8 +1137,8 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; - uint8_t total_count; - uint8_t orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; + u8 total_count; + u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; int tq_asym_penalty, inv_asym_penalty, if_num, ret = 0; unsigned int combined_tq; @@ -1280,13 +1284,13 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; int is_dup; - int32_t seq_diff; + s32 seq_diff; int need_update = 0; int set_mark; enum batadv_dup_status ret = BATADV_NO_DUP; - uint32_t seqno = ntohl(batadv_ogm_packet->seqno); - uint8_t *neigh_addr; - uint8_t packet_count; + u32 seqno = ntohl(batadv_ogm_packet->seqno); + u8 *neigh_addr; + u8 packet_count; unsigned long *bitmap; orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig); @@ -1356,8 +1360,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_node_free_ref(orig_node); - if (orig_ifinfo) - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_free_ref(orig_ifinfo); return ret; } @@ -1376,7 +1379,8 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); - struct batadv_neigh_node *router = NULL, *router_router = NULL; + struct batadv_neigh_node *router = NULL; + struct batadv_neigh_node *router_router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *orig_neigh_router = 
NULL; @@ -1388,7 +1392,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, bool sameseq, similar_ttl; struct sk_buff *skb_priv; struct ethhdr *ethhdr; - uint8_t *prev_sender; + u8 *prev_sender; int is_bidirect; /* create a private copy of the skb, as some functions change tq value @@ -1570,7 +1574,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_orig_node *orig_neigh_node, *orig_node; struct batadv_hard_iface *hard_iface; struct batadv_ogm_packet *ogm_packet; - uint32_t if_incoming_seqno; + u32 if_incoming_seqno; bool has_directlink_flag; struct ethhdr *ethhdr; bool is_my_oldorig = false; @@ -1643,9 +1647,9 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, if (is_my_orig) { unsigned long *word; int offset; - int32_t bit_pos; - int16_t if_num; - uint8_t *weight; + s32 bit_pos; + s16 if_num; + u8 *weight; orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); @@ -1721,7 +1725,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_ogm_packet *ogm_packet; - uint8_t *packet_pos; + u8 *packet_pos; int ogm_offset; bool ret; @@ -1805,7 +1809,7 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, unsigned long last_seen_jiffies; struct hlist_head *head; int batman_count = 0; - uint32_t i; + u32 i; seq_printf(seq, " %-15s %s (%s/%i) %17s [%10s]: %20s ...\n", "Originator", "last-seen", "#", BATADV_TQ_MAX_VALUE, @@ -1873,7 +1877,7 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; - uint8_t tq1, tq2; + u8 tq1, tq2; int diff; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); @@ -1915,7 +1919,7 @@ batadv_iv_ogm_neigh_is_eob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo 
*neigh1_ifinfo, *neigh2_ifinfo; - uint8_t tq1, tq2; + u8 tq1, tq2; bool ret; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); diff --git a/kernel/net/batman-adv/bitarray.c b/kernel/net/batman-adv/bitarray.c index e3da07a64..25cbc36e9 100644 --- a/kernel/net/batman-adv/bitarray.c +++ b/kernel/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -15,13 +15,13 @@ * along with this program; if not, see . */ -#include "main.h" #include "bitarray.h" +#include "main.h" -#include +#include /* shift the packet array by n places. */ -static void batadv_bitmap_shift_left(unsigned long *seq_bits, int32_t n) +static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) { if (n <= 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE) return; @@ -35,8 +35,8 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, int32_t n) * 1 if the window was moved (either new or very old) * 0 if the window was not moved/shifted. */ -int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, - int32_t seq_num_diff, int set_mark) +int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, + int set_mark) { struct batadv_priv *bat_priv = priv; diff --git a/kernel/net/batman-adv/bitarray.h b/kernel/net/batman-adv/bitarray.h index 2acaafe60..0226b220f 100644 --- a/kernel/net/batman-adv/bitarray.h +++ b/kernel/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -18,13 +18,19 @@ #ifndef _NET_BATMAN_ADV_BITARRAY_H_ #define _NET_BATMAN_ADV_BITARRAY_H_ +#include "main.h" + +#include +#include +#include + /* Returns 1 if the corresponding bit in the given seq_bits indicates true * and curr_seqno is within range of last_seqno. Otherwise returns 0. 
*/ static inline int batadv_test_bit(const unsigned long *seq_bits, - uint32_t last_seqno, uint32_t curr_seqno) + u32 last_seqno, u32 curr_seqno) { - int32_t diff; + s32 diff; diff = last_seqno - curr_seqno; if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE) @@ -33,7 +39,7 @@ static inline int batadv_test_bit(const unsigned long *seq_bits, } /* turn corresponding bit on, so we can remember that we got the packet */ -static inline void batadv_set_bit(unsigned long *seq_bits, int32_t n) +static inline void batadv_set_bit(unsigned long *seq_bits, s32 n) { /* if too old, just drop it */ if (n < 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE) @@ -45,7 +51,7 @@ static inline void batadv_set_bit(unsigned long *seq_bits, int32_t n) /* receive and process one packet, returns 1 if received seq_num is considered * new, 0 if old */ -int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, - int32_t seq_num_diff, int set_mark); +int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, + int set_mark); #endif /* _NET_BATMAN_ADV_BITARRAY_H_ */ diff --git a/kernel/net/batman-adv/bridge_loop_avoidance.c b/kernel/net/batman-adv/bridge_loop_avoidance.c index ac4b96ecc..f5d2fe5e3 100644 --- a/kernel/net/batman-adv/bridge_loop_avoidance.c +++ b/kernel/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -15,21 +15,43 @@ * along with this program; if not, see . 
*/ -#include "main.h" -#include "hash.h" -#include "hard-interface.h" -#include "originator.h" #include "bridge_loop_avoidance.h" -#include "translation-table.h" -#include "send.h" +#include "main.h" -#include +#include +#include +#include #include +#include +#include +#include #include -#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -static const uint8_t batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; +#include "hard-interface.h" +#include "hash.h" +#include "originator.h" +#include "packet.h" +#include "translation-table.h" + +static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; static void batadv_bla_periodic_work(struct work_struct *work); static void @@ -37,34 +59,25 @@ batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw); /* return the index of the claim */ -static inline uint32_t batadv_choose_claim(const void *data, uint32_t size) +static inline u32 batadv_choose_claim(const void *data, u32 size) { struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; - uint32_t hash = 0; - - hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr)); - hash = batadv_hash_bytes(hash, &claim->vid, sizeof(claim->vid)); + u32 hash = 0; - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&claim->addr, sizeof(claim->addr), hash); + hash = jhash(&claim->vid, sizeof(claim->vid), hash); return hash % size; } /* return the index of the backbone gateway */ -static inline uint32_t batadv_choose_backbone_gw(const void *data, - uint32_t size) +static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) { const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; - uint32_t hash = 0; + u32 hash = 0; - hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr)); - hash = batadv_hash_bytes(hash, 
&claim->vid, sizeof(claim->vid)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&claim->addr, sizeof(claim->addr), hash); + hash = jhash(&claim->vid, sizeof(claim->vid), hash); return hash % size; } @@ -75,7 +88,8 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node, { const void *data1 = container_of(node, struct batadv_bla_backbone_gw, hash_entry); - const struct batadv_bla_backbone_gw *gw1 = data1, *gw2 = data2; + const struct batadv_bla_backbone_gw *gw1 = data1; + const struct batadv_bla_backbone_gw *gw2 = data2; if (!batadv_compare_eth(gw1->orig, gw2->orig)) return 0; @@ -92,7 +106,8 @@ static int batadv_compare_claim(const struct hlist_node *node, { const void *data1 = container_of(node, struct batadv_bla_claim, hash_entry); - const struct batadv_bla_claim *cl1 = data1, *cl2 = data2; + const struct batadv_bla_claim *cl1 = data1; + const struct batadv_bla_claim *cl2 = data2; if (!batadv_compare_eth(cl1->addr, cl2->addr)) return 0; @@ -112,21 +127,17 @@ batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw) } /* finally deinitialize the claim */ -static void batadv_claim_free_rcu(struct rcu_head *rcu) +static void batadv_claim_release(struct batadv_bla_claim *claim) { - struct batadv_bla_claim *claim; - - claim = container_of(rcu, struct batadv_bla_claim, rcu); - batadv_backbone_gw_free_ref(claim->backbone_gw); - kfree(claim); + kfree_rcu(claim, rcu); } /* free a claim, call claim_free_rcu if its the last reference */ static void batadv_claim_free_ref(struct batadv_bla_claim *claim) { if (atomic_dec_and_test(&claim->refcount)) - call_rcu(&claim->rcu, batadv_claim_free_rcu); + batadv_claim_release(claim); } /** @@ -178,8 +189,8 @@ static struct batadv_bla_claim * Returns claim if found or NULL otherwise. 
*/ static struct batadv_bla_backbone_gw * -batadv_backbone_hash_find(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid) +batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, + unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; @@ -255,14 +266,14 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) * @vid: the VLAN ID * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...) */ -static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, +static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, unsigned short vid, int claimtype) { struct sk_buff *skb; struct ethhdr *ethhdr; struct batadv_hard_iface *primary_if; struct net_device *soft_iface; - uint8_t *hw_src; + u8 *hw_src; struct batadv_bla_claim_dst local_claim_dest; __be32 zeroip = 0; @@ -290,13 +301,13 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, * with XX = claim type * and YY:YY = group id */ - (uint8_t *)&local_claim_dest); + (u8 *)&local_claim_dest); if (!skb) goto out; ethhdr = (struct ethhdr *)skb->data; - hw_src = (uint8_t *)ethhdr + ETH_HLEN + sizeof(struct arphdr); + hw_src = (u8 *)ethhdr + ETH_HLEN + sizeof(struct arphdr); /* now we pretend that the client would have sent this ... */ switch (claimtype) { @@ -369,7 +380,7 @@ out: * be found. 
*/ static struct batadv_bla_backbone_gw * -batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, uint8_t *orig, +batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid, bool own_backbone) { struct batadv_bla_backbone_gw *entry; @@ -538,7 +549,7 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) { - uint8_t mac[ETH_ALEN]; + u8 mac[ETH_ALEN]; __be16 crc; memcpy(mac, batadv_announce_mac, 4); @@ -557,7 +568,7 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, * @backbone_gw: the backbone gateway which claims it */ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, - const uint8_t *mac, const unsigned short vid, + const u8 *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { struct batadv_bla_claim *claim; @@ -621,7 +632,7 @@ claim_free_ref: * given mac address and vid. 
*/ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, - const uint8_t *mac, const unsigned short vid) + const u8 *mac, const unsigned short vid) { struct batadv_bla_claim search_claim, *claim; @@ -645,12 +656,11 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, } /* check for ANNOUNCE frame, return 1 if handled */ -static int batadv_handle_announce(struct batadv_priv *bat_priv, - uint8_t *an_addr, uint8_t *backbone_addr, - unsigned short vid) +static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, + u8 *backbone_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; - uint16_t crc; + u16 crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) return 0; @@ -694,8 +704,8 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, /* check for REQUEST frame, return 1 if handled */ static int batadv_handle_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - uint8_t *backbone_addr, - struct ethhdr *ethhdr, unsigned short vid) + u8 *backbone_addr, struct ethhdr *ethhdr, + unsigned short vid) { /* check for REQUEST frame */ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest)) @@ -718,8 +728,8 @@ static int batadv_handle_request(struct batadv_priv *bat_priv, /* check for UNCLAIM frame, return 1 if handled */ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - uint8_t *backbone_addr, - uint8_t *claim_addr, unsigned short vid) + u8 *backbone_addr, u8 *claim_addr, + unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -747,7 +757,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, /* check for CLAIM frame, return 1 if handled */ static int batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - uint8_t *backbone_addr, uint8_t *claim_addr, + u8 *backbone_addr, u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -791,10 +801,10 @@ 
static int batadv_handle_claim(struct batadv_priv *bat_priv, */ static int batadv_check_claim_group(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - uint8_t *hw_src, uint8_t *hw_dst, + u8 *hw_src, u8 *hw_dst, struct ethhdr *ethhdr) { - uint8_t *backbone_addr; + u8 *backbone_addr; struct batadv_orig_node *orig_node; struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; @@ -863,7 +873,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; - uint8_t *hw_src, *hw_dst; + u8 *hw_src, *hw_dst; struct vlan_hdr *vhdr, vhdr_buf; struct ethhdr *ethhdr; struct arphdr *arphdr; @@ -909,7 +919,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, /* pskb_may_pull() may have modified the pointers, get ethhdr again */ ethhdr = eth_hdr(skb); - arphdr = (struct arphdr *)((uint8_t *)ethhdr + headlen); + arphdr = (struct arphdr *)((u8 *)ethhdr + headlen); /* Check whether the ARP frame carries a valid * IP information @@ -923,7 +933,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, if (arphdr->ar_pln != 4) return 0; - hw_src = (uint8_t *)arphdr + sizeof(struct arphdr); + hw_src = (u8 *)arphdr + sizeof(struct arphdr); hw_dst = hw_src + ETH_ALEN + 4; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; @@ -1224,9 +1234,9 @@ static struct lock_class_key batadv_backbone_hash_lock_class_key; int batadv_bla_init(struct batadv_priv *bat_priv) { int i; - uint8_t claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00}; + u8 claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00}; struct batadv_hard_iface *primary_if; - uint16_t crc; + u16 crc; unsigned long entrytime; spin_lock_init(&bat_priv->bla.bcast_duplist_lock); @@ -1354,7 +1364,7 @@ out: * * Returns true if orig is a backbone for this vid, false otherwise. 
*/ -bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig, +bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; @@ -1633,9 +1643,9 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) struct batadv_bla_claim *claim; struct batadv_hard_iface *primary_if; struct hlist_head *head; - uint32_t i; + u32 i; bool is_own; - uint8_t *primary_addr; + u8 *primary_addr; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) @@ -1678,9 +1688,9 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) struct batadv_hard_iface *primary_if; struct hlist_head *head; int secs, msecs; - uint32_t i; + u32 i; bool is_own; - uint8_t *primary_addr; + u8 *primary_addr; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) diff --git a/kernel/net/batman-adv/bridge_loop_avoidance.h b/kernel/net/batman-adv/bridge_loop_avoidance.h index 43c985d92..025152b34 100644 --- a/kernel/net/batman-adv/bridge_loop_avoidance.h +++ b/kernel/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. 
contributors: * * Simon Wunderlich * @@ -18,6 +18,13 @@ #ifndef _NET_BATMAN_ADV_BLA_H_ #define _NET_BATMAN_ADV_BLA_H_ +#include "main.h" + +#include + +struct seq_file; +struct sk_buff; + #ifdef CONFIG_BATMAN_ADV_BLA int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, bool is_bcast); @@ -28,7 +35,7 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset); -bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig, +bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb); @@ -74,8 +81,7 @@ static inline int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, } static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, - uint8_t *orig, - unsigned short vid) + u8 *orig, unsigned short vid) { return false; } diff --git a/kernel/net/batman-adv/debugfs.c b/kernel/net/batman-adv/debugfs.c index a4972874c..c4c1e8030 100644 --- a/kernel/net/batman-adv/debugfs.c +++ b/kernel/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -15,21 +15,42 @@ * along with this program; if not, see . 
*/ +#include "debugfs.h" #include "main.h" +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux/wait.h */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include "debugfs.h" -#include "translation-table.h" -#include "originator.h" -#include "hard-interface.h" -#include "gateway_common.h" -#include "gateway_client.h" -#include "soft-interface.h" -#include "icmp_socket.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" +#include "gateway_client.h" +#include "icmp_socket.h" #include "network-coding.h" +#include "originator.h" +#include "translation-table.h" static struct dentry *batadv_debugfs; @@ -482,11 +503,7 @@ rem_attr: debugfs_remove_recursive(hard_iface->debug_dir); hard_iface->debug_dir = NULL; out: -#ifdef CONFIG_DEBUG_FS return -ENOMEM; -#else - return 0; -#endif /* CONFIG_DEBUG_FS */ } /** @@ -541,11 +558,7 @@ rem_attr: debugfs_remove_recursive(bat_priv->debug_dir); bat_priv->debug_dir = NULL; out: -#ifdef CONFIG_DEBUG_FS return -ENOMEM; -#else - return 0; -#endif /* CONFIG_DEBUG_FS */ } void batadv_debugfs_del_meshif(struct net_device *dev) diff --git a/kernel/net/batman-adv/debugfs.h b/kernel/net/batman-adv/debugfs.h index 37c4d6ddd..80ab8d6f0 100644 --- a/kernel/net/batman-adv/debugfs.h +++ b/kernel/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -18,8 +18,16 @@ #ifndef _NET_BATMAN_ADV_DEBUGFS_H_ #define _NET_BATMAN_ADV_DEBUGFS_H_ +#include "main.h" + +#include + +struct net_device; + #define BATADV_DEBUGFS_SUBDIR "batman_adv" +#if IS_ENABLED(CONFIG_DEBUG_FS) + void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); @@ -27,4 +35,36 @@ void batadv_debugfs_del_meshif(struct net_device *dev); int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); +#else + +static inline void batadv_debugfs_init(void) +{ +} + +static inline void batadv_debugfs_destroy(void) +{ +} + +static inline int batadv_debugfs_add_meshif(struct net_device *dev) +{ + return 0; +} + +static inline void batadv_debugfs_del_meshif(struct net_device *dev) +{ +} + +static inline +int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) +{ + return 0; +} + +static inline +void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) +{ +} + +#endif + #endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */ diff --git a/kernel/net/batman-adv/distributed-arp-table.c b/kernel/net/batman-adv/distributed-arp-table.c index aad022dd1..a49c705fb 100644 --- a/kernel/net/batman-adv/distributed-arp-table.c +++ b/kernel/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Antonio Quartulli * @@ -15,18 +15,37 @@ * along with this program; if not, see . 
*/ -#include +#include "distributed-arp-table.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "main.h" -#include "hash.h" -#include "distributed-arp-table.h" #include "hard-interface.h" +#include "hash.h" #include "originator.h" #include "send.h" -#include "types.h" #include "translation-table.h" static void batadv_dat_purge(struct work_struct *work); @@ -83,7 +102,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, struct batadv_dat_entry *dat_entry; struct hlist_node *node_tmp; struct hlist_head *head; - uint32_t i; + u32 i; if (!bat_priv->dat.hash) return; @@ -149,11 +168,11 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) * * Returns the value of the hw_src field in the ARP packet. */ -static uint8_t *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) +static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) { - uint8_t *addr; + u8 *addr; - addr = (uint8_t *)(skb->data + hdr_size); + addr = (u8 *)(skb->data + hdr_size); addr += ETH_HLEN + sizeof(struct arphdr); return addr; @@ -178,7 +197,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) * * Returns the value of the hw_dst field in the ARP packet. */ -static uint8_t *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) +static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) { return batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN + 4; } @@ -202,13 +221,26 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) * * Returns the selected index in the hash table for the given data. 
*/ -static uint32_t batadv_hash_dat(const void *data, uint32_t size) +static u32 batadv_hash_dat(const void *data, u32 size) { - uint32_t hash = 0; + u32 hash = 0; const struct batadv_dat_entry *dat = data; + const unsigned char *key; + u32 i; + + key = (const unsigned char *)&dat->ip; + for (i = 0; i < sizeof(dat->ip); i++) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } - hash = batadv_hash_bytes(hash, &dat->ip, sizeof(dat->ip)); - hash = batadv_hash_bytes(hash, &dat->vid, sizeof(dat->vid)); + key = (const unsigned char *)&dat->vid; + for (i = 0; i < sizeof(dat->vid); i++) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } hash += (hash << 3); hash ^= (hash >> 11); @@ -233,7 +265,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, struct hlist_head *head; struct batadv_dat_entry to_find, *dat_entry, *dat_entry_tmp = NULL; struct batadv_hashtable *hash = bat_priv->dat.hash; - uint32_t index; + u32 index; if (!hash) return NULL; @@ -268,7 +300,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, * @vid: VLAN identifier */ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, - uint8_t *mac_addr, unsigned short vid) + u8 *mac_addr, unsigned short vid) { struct batadv_dat_entry *dat_entry; int hash_added; @@ -325,11 +357,11 @@ out: * @msg: message to print together with the debugging information */ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, - uint16_t type, int hdr_size, char *msg) + u16 type, int hdr_size, char *msg) { struct batadv_unicast_4addr_packet *unicast_4addr_packet; struct batadv_bcast_packet *bcast_pkt; - uint8_t *orig_addr; + u8 *orig_addr; __be32 ip_src, ip_dst; if (msg) @@ -392,7 +424,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, #else static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, - uint16_t type, int hdr_size, char *msg) + u16 type, int hdr_size, 
char *msg) { } @@ -422,7 +454,7 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, int j; /* check if orig node candidate is running DAT */ - if (!(candidate->capabilities & BATADV_ORIG_CAPA_HAS_DAT)) + if (!test_bit(BATADV_ORIG_CAPA_HAS_DAT, &candidate->capabilities)) goto out; /* Check if this node has already been selected... */ @@ -465,7 +497,8 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, int select, batadv_dat_addr_t ip_key, batadv_dat_addr_t *last_max) { - batadv_dat_addr_t max = 0, tmp_max = 0; + batadv_dat_addr_t max = 0; + batadv_dat_addr_t tmp_max = 0; struct batadv_orig_node *orig_node, *max_orig_node = NULL; struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; @@ -533,6 +566,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) int select; batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key; struct batadv_dat_candidate *res; + struct batadv_dat_entry dat; if (!bat_priv->orig_hash) return NULL; @@ -542,7 +576,9 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) if (!res) return NULL; - ip_key = (batadv_dat_addr_t)batadv_hash_dat(&ip_dst, + dat.ip = ip_dst; + dat.vid = 0; + ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat, BATADV_DAT_ADDR_MAX); batadv_dbg(BATADV_DBG_DAT, bat_priv, @@ -677,14 +713,13 @@ void batadv_dat_status_update(struct net_device *net_dev) */ static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, - void *tvlv_value, - uint16_t tvlv_value_len) + u8 flags, + void *tvlv_value, u16 tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) - orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_DAT; + clear_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities); else - orig->capabilities |= BATADV_ORIG_CAPA_HAS_DAT; + set_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities); } /** @@ -755,7 +790,7 @@ int batadv_dat_cache_seq_print_text(struct 
seq_file *seq, void *offset) struct hlist_head *head; unsigned long last_seen_jiffies; int last_seen_msecs, last_seen_secs, last_seen_mins; - uint32_t i; + u32 i; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) @@ -798,14 +833,14 @@ out: * * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise. */ -static uint16_t batadv_arp_get_type(struct batadv_priv *bat_priv, - struct sk_buff *skb, int hdr_size) +static u16 batadv_arp_get_type(struct batadv_priv *bat_priv, + struct sk_buff *skb, int hdr_size) { struct arphdr *arphdr; struct ethhdr *ethhdr; __be32 ip_src, ip_dst; - uint8_t *hw_src, *hw_dst; - uint16_t type = 0; + u8 *hw_src, *hw_dst; + u16 type = 0; /* pull the ethernet header */ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN))) @@ -902,9 +937,9 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb) { - uint16_t type = 0; + u16 type = 0; __be32 ip_dst, ip_src; - uint8_t *hw_src; + u8 *hw_src; bool ret = false; struct batadv_dat_entry *dat_entry = NULL; struct sk_buff *skb_new; @@ -990,9 +1025,9 @@ out: bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) { - uint16_t type; + u16 type; __be32 ip_src, ip_dst; - uint8_t *hw_src; + u8 *hw_src; struct sk_buff *skb_new; struct batadv_dat_entry *dat_entry = NULL; bool ret = false; @@ -1068,9 +1103,9 @@ out: void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, struct sk_buff *skb) { - uint16_t type; + u16 type; __be32 ip_src, ip_dst; - uint8_t *hw_src, *hw_dst; + u8 *hw_src, *hw_dst; int hdr_size = 0; unsigned short vid; @@ -1107,14 +1142,17 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * @hdr_size: size of the encapsulation header + * + * Returns true if the packet 
was snooped and consumed by DAT. False if the + * packet has to be delivered to the interface */ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) { - uint16_t type; + u16 type; __be32 ip_src, ip_dst; - uint8_t *hw_src, *hw_dst; - bool ret = false; + u8 *hw_src, *hw_dst; + bool dropped = false; unsigned short vid; if (!atomic_read(&bat_priv->distributed_arp_table)) @@ -1143,12 +1181,17 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, /* if this REPLY is directed to a client of mine, let's deliver the * packet to the interface */ - ret = !batadv_is_my_client(bat_priv, hw_dst, vid); + dropped = !batadv_is_my_client(bat_priv, hw_dst, vid); + + /* if this REPLY is sent on behalf of a client of mine, let's drop the + * packet because the client will reply by itself + */ + dropped |= batadv_is_my_client(bat_priv, hw_src, vid); out: - if (ret) + if (dropped) kfree_skb(skb); - /* if ret == false -> packet has to be delivered to the interface */ - return ret; + /* if dropped == false -> deliver to the interface */ + return dropped; } /** @@ -1162,7 +1205,7 @@ out: bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet) { - uint16_t type; + u16 type; __be32 ip_dst; struct batadv_dat_entry *dat_entry = NULL; bool ret = false; diff --git a/kernel/net/batman-adv/distributed-arp-table.h b/kernel/net/batman-adv/distributed-arp-table.h index 2fe0764c6..26d4a525a 100644 --- a/kernel/net/batman-adv/distributed-arp-table.h +++ b/kernel/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. 
contributors: * * Antonio Quartulli * @@ -18,12 +18,19 @@ #ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ #define _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ -#ifdef CONFIG_BATMAN_ADV_DAT +#include "main.h" + +#include +#include +#include -#include "types.h" #include "originator.h" +#include "packet.h" -#include +struct seq_file; +struct sk_buff; + +#ifdef CONFIG_BATMAN_ADV_DAT /* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */ #define BATADV_DAT_ADDR_MAX ((batadv_dat_addr_t)~(batadv_dat_addr_t)0) @@ -47,7 +54,7 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, static inline void batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node) { - uint32_t addr; + u32 addr; addr = batadv_choose_orig(orig_node->orig, BATADV_DAT_ADDR_MAX); orig_node->dat_addr = (batadv_dat_addr_t)addr; @@ -62,7 +69,7 @@ static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if) { - uint32_t addr; + u32 addr; addr = batadv_choose_orig(primary_if->net_dev->dev_addr, BATADV_DAT_ADDR_MAX); @@ -82,7 +89,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset); * Updates the ethtool statistics for the received packet if it is a DAT subtype */ static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv, - uint8_t subtype) + u8 subtype) { switch (subtype) { case BATADV_P_DAT_DHT_GET: @@ -162,7 +169,7 @@ static inline void batadv_dat_free(struct batadv_priv *bat_priv) } static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv, - uint8_t subtype) + u8 subtype) { } diff --git a/kernel/net/batman-adv/fragmentation.c b/kernel/net/batman-adv/fragmentation.c index 3d1dcaa3e..700c96c82 100644 --- a/kernel/net/batman-adv/fragmentation.c +++ b/kernel/net/batman-adv/fragmentation.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2015 B.A.T.M.A.N. 
contributors: * * Martin Hundebøll * @@ -15,12 +15,29 @@ * along with this program; if not, see . */ -#include "main.h" #include "fragmentation.h" -#include "send.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hard-interface.h" #include "originator.h" +#include "packet.h" #include "routing.h" -#include "hard-interface.h" +#include "send.h" #include "soft-interface.h" /** @@ -50,7 +67,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node, bool (*check_cb)(struct batadv_frag_table_entry *)) { struct batadv_frag_table_entry *chain; - uint8_t i; + u8 i; for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) { chain = &orig_node->fragments[i]; @@ -94,8 +111,10 @@ static int batadv_frag_size_limit(void) * without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, - uint16_t seqno) + u16 seqno) { + lockdep_assert_held(&chain->lock); + if (chain->seqno == seqno) return false; @@ -129,8 +148,8 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, struct batadv_frag_list_entry *frag_entry_new = NULL, *frag_entry_curr; struct batadv_frag_list_entry *frag_entry_last = NULL; struct batadv_frag_packet *frag_packet; - uint8_t bucket; - uint16_t seqno, hdr_size = sizeof(struct batadv_frag_packet); + u8 bucket; + u16 seqno, hdr_size = sizeof(struct batadv_frag_packet); bool ret = false; /* Linearize packet to avoid linearizing 16 packets in a row when doing @@ -161,6 +180,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, hlist_add_head(&frag_entry_new->list, &chain->head); chain->size = skb->len - hdr_size; chain->timestamp = jiffies; + chain->total_size = ntohs(frag_packet->total_size); ret = true; goto out; } @@ -195,9 +215,11 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, out: if (chain->size > 
batadv_frag_size_limit() || - ntohs(frag_packet->total_size) > batadv_frag_size_limit()) { + chain->total_size != ntohs(frag_packet->total_size) || + chain->total_size > batadv_frag_size_limit()) { /* Clear chain if total size of either the list or the packet - * exceeds the maximum size of one merged packet. + * exceeds the maximum size of one merged packet. Don't allow + * packets to have different total_size. */ batadv_frag_clear_chain(&chain->head); chain->size = 0; @@ -228,19 +250,13 @@ err: * Returns the merged skb or NULL on error. */ static struct sk_buff * -batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb) +batadv_frag_merge_packets(struct hlist_head *chain) { struct batadv_frag_packet *packet; struct batadv_frag_list_entry *entry; struct sk_buff *skb_out = NULL; int size, hdr_size = sizeof(struct batadv_frag_packet); - /* Make sure incoming skb has non-bogus data. */ - packet = (struct batadv_frag_packet *)skb->data; - size = ntohs(packet->total_size); - if (size > batadv_frag_size_limit()) - goto free; - /* Remove first entry, as this is the destination for the rest of the * fragments. */ @@ -249,6 +265,9 @@ batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb) skb_out = entry->skb; kfree(entry); + packet = (struct batadv_frag_packet *)skb_out->data; + size = ntohs(packet->total_size); + /* Make room for the rest of the fragments. 
*/ if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) { kfree_skb(skb_out); @@ -304,7 +323,7 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb, if (hlist_empty(&head)) goto out; - skb_out = batadv_frag_merge_packets(&head, *skb); + skb_out = batadv_frag_merge_packets(&head); if (!skb_out) goto out_err; @@ -335,7 +354,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, struct batadv_orig_node *orig_node_dst = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_frag_packet *packet; - uint16_t total_size; + u16 total_size; bool ret = false; packet = (struct batadv_frag_packet *)skb->data; diff --git a/kernel/net/batman-adv/fragmentation.h b/kernel/net/batman-adv/fragmentation.h index d848cf667..8b9877e70 100644 --- a/kernel/net/batman-adv/fragmentation.h +++ b/kernel/net/batman-adv/fragmentation.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll * @@ -18,6 +18,15 @@ #ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_ #define _NET_BATMAN_ADV_FRAGMENTATION_H_ +#include "main.h" + +#include +#include +#include +#include + +struct sk_buff; + void batadv_frag_purge_orig(struct batadv_orig_node *orig, bool (*check_cb)(struct batadv_frag_table_entry *)); bool batadv_frag_skb_fwd(struct sk_buff *skb, diff --git a/kernel/net/batman-adv/gateway_client.c b/kernel/net/batman-adv/gateway_client.c index 090828cf1..e6c8382c7 100644 --- a/kernel/net/batman-adv/gateway_client.c +++ b/kernel/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -15,18 +15,37 @@ * along with this program; if not, see . 
*/ -#include "main.h" -#include "sysfs.h" #include "gateway_client.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "gateway_common.h" #include "hard-interface.h" #include "originator.h" -#include "translation-table.h" +#include "packet.h" #include "routing.h" -#include -#include -#include -#include +#include "sysfs.h" +#include "translation-table.h" /* These are the offsets of the "hw type" and "hw address length" in the dhcp * packet starting at the beginning of the dhcp header @@ -133,20 +152,14 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) struct batadv_neigh_node *router; struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; - uint32_t max_gw_factor = 0, tmp_gw_factor = 0; - uint32_t gw_divisor; - uint8_t max_tq = 0; - uint8_t tq_avg; + u64 max_gw_factor = 0; + u64 tmp_gw_factor = 0; + u8 max_tq = 0; + u8 tq_avg; struct batadv_orig_node *orig_node; - gw_divisor = BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE; - gw_divisor *= 64; - rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - if (gw_node->deleted) - continue; - orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) @@ -167,7 +180,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) tmp_gw_factor = tq_avg * tq_avg; tmp_gw_factor *= gw_node->bandwidth_down; tmp_gw_factor *= 100 * 100; - tmp_gw_factor /= gw_divisor; + tmp_gw_factor >>= 18; if ((tmp_gw_factor > max_gw_factor) || ((tmp_gw_factor == max_gw_factor) && @@ -247,7 +260,8 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) void batadv_gw_election(struct batadv_priv *bat_priv) { - struct batadv_gw_node *curr_gw = NULL, *next_gw = NULL; + struct batadv_gw_node *curr_gw = NULL; + struct batadv_gw_node 
*next_gw = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; char gw_addr[18] = { '\0' }; @@ -331,8 +345,9 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_neigh_ifinfo *router_orig_tq = NULL; struct batadv_neigh_ifinfo *router_gw_tq = NULL; struct batadv_orig_node *curr_gw_orig; - struct batadv_neigh_node *router_gw = NULL, *router_orig = NULL; - uint8_t gw_tq_avg, orig_tq_avg; + struct batadv_neigh_node *router_gw = NULL; + struct batadv_neigh_node *router_orig = NULL; + u8 gw_tq_avg, orig_tq_avg; curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); if (!curr_gw_orig) @@ -419,6 +434,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, INIT_HLIST_NODE(&gw_node->list); gw_node->orig_node = orig_node; + gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); + gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); atomic_set(&gw_node->refcount, 1); spin_lock_bh(&bat_priv->gw.list_lock); @@ -452,9 +469,6 @@ batadv_gw_node_get(struct batadv_priv *bat_priv, if (gw_node_tmp->orig_node != orig_node) continue; - if (gw_node_tmp->deleted) - continue; - if (!atomic_inc_not_zero(&gw_node_tmp->refcount)) continue; @@ -504,9 +518,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - gw_node->deleted = 0; if (ntohl(gateway->bandwidth_down) == 0) { - gw_node->deleted = jiffies; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Gateway %pM removed from gateway list\n", orig_node->orig); @@ -514,14 +526,21 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, /* Note: We don't need a NULL check here, since curr_gw never * gets dereferenced. 
*/ + spin_lock_bh(&bat_priv->gw.list_lock); + hlist_del_init_rcu(&gw_node->list); + spin_unlock_bh(&bat_priv->gw.list_lock); + + batadv_gw_node_free_ref(gw_node); + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); + + if (curr_gw) + batadv_gw_node_free_ref(curr_gw); } out: - if (curr_gw) - batadv_gw_node_free_ref(curr_gw); if (gw_node) batadv_gw_node_free_ref(gw_node); } @@ -537,39 +556,18 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, batadv_gw_node_update(bat_priv, orig_node, &gateway); } -void batadv_gw_node_purge(struct batadv_priv *bat_priv) +void batadv_gw_node_free(struct batadv_priv *bat_priv) { - struct batadv_gw_node *gw_node, *curr_gw; + struct batadv_gw_node *gw_node; struct hlist_node *node_tmp; - unsigned long timeout = msecs_to_jiffies(2 * BATADV_PURGE_TIMEOUT); - int do_reselect = 0; - - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); spin_lock_bh(&bat_priv->gw.list_lock); - hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.list, list) { - if (((!gw_node->deleted) || - (time_before(jiffies, gw_node->deleted + timeout))) && - atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE) - continue; - - if (curr_gw == gw_node) - do_reselect = 1; - - hlist_del_rcu(&gw_node->list); + hlist_del_init_rcu(&gw_node->list); batadv_gw_node_free_ref(gw_node); } - spin_unlock_bh(&bat_priv->gw.list_lock); - - /* gw_reselect() needs to acquire the gw_list_lock */ - if (do_reselect) - batadv_gw_reselect(bat_priv); - - if (curr_gw) - batadv_gw_node_free_ref(curr_gw); } /* fails if orig_node has no router */ @@ -633,9 +631,6 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - if (gw_node->deleted) - continue; - /* fails if orig_node has no router */ if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0) continue; @@ -670,7 +665,7 @@ out: */ enum batadv_dhcp_recipient 
batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, - uint8_t *chaddr) + u8 *chaddr) { enum batadv_dhcp_recipient ret = BATADV_DHCP_NO; struct ethhdr *ethhdr; @@ -680,7 +675,7 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, struct vlan_ethhdr *vhdr; int chaddr_offset; __be16 proto; - uint8_t *p; + u8 *p; /* check for ethernet header */ if (!pskb_may_pull(skb, *header_len + ETH_HLEN)) @@ -733,11 +728,6 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) return BATADV_DHCP_NO; - /* skb->data might have been reallocated by pskb_may_pull() */ - ethhdr = eth_hdr(skb); - if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) - ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN); - udphdr = (struct udphdr *)(skb->data + *header_len); *header_len += sizeof(*udphdr); @@ -795,13 +785,15 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb) { - struct batadv_neigh_node *neigh_curr = NULL, *neigh_old = NULL; + struct batadv_neigh_node *neigh_curr = NULL; + struct batadv_neigh_node *neigh_old = NULL; struct batadv_orig_node *orig_dst_node = NULL; - struct batadv_gw_node *gw_node = NULL, *curr_gw = NULL; + struct batadv_gw_node *gw_node = NULL; + struct batadv_gw_node *curr_gw = NULL; struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo; struct ethhdr *ethhdr = (struct ethhdr *)skb->data; bool out_of_range = false; - uint8_t curr_tq_avg; + u8 curr_tq_avg; unsigned short vid; vid = batadv_get_vid(skb, 0); diff --git a/kernel/net/batman-adv/gateway_client.h b/kernel/net/batman-adv/gateway_client.h index 7ee53bb7d..fa9527785 100644 --- a/kernel/net/batman-adv/gateway_client.h +++ b/kernel/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -18,6 +18,14 @@ #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ #define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ +#include "main.h" + +#include + +struct batadv_tvlv_gateway_data; +struct seq_file; +struct sk_buff; + void batadv_gw_check_client_stop(struct batadv_priv *bat_priv); void batadv_gw_reselect(struct batadv_priv *bat_priv); void batadv_gw_election(struct batadv_priv *bat_priv); @@ -30,11 +38,11 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, struct batadv_tvlv_gateway_data *gateway); void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); -void batadv_gw_node_purge(struct batadv_priv *bat_priv); +void batadv_gw_node_free(struct batadv_priv *bat_priv); int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, - uint8_t *chaddr); + u8 *chaddr); #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/kernel/net/batman-adv/gateway_common.c b/kernel/net/batman-adv/gateway_common.c index 88a1bc380..0cb5e6b6f 100644 --- a/kernel/net/batman-adv/gateway_common.c +++ b/kernel/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -15,9 +15,20 @@ * along with this program; if not, see . */ -#include "main.h" #include "gateway_common.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include + #include "gateway_client.h" +#include "packet.h" /** * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download @@ -30,11 +41,11 @@ * Returns false on parse error and true otherwise. 
*/ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, - uint32_t *down, uint32_t *up) + u32 *down, u32 *up) { enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT; char *slash_ptr, *tmp_ptr; - long ldown, lup; + u64 ldown, lup; int ret; slash_ptr = strchr(buff, '/'); @@ -52,7 +63,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, *tmp_ptr = '\0'; } - ret = kstrtol(buff, 10, &ldown); + ret = kstrtou64(buff, 10, &ldown); if (ret) { batadv_err(net_dev, "Download speed of gateway mode invalid: %s\n", @@ -62,14 +73,31 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, switch (bw_unit_type) { case BATADV_BW_UNIT_MBIT: - *down = ldown * 10; + /* prevent overflow */ + if (U64_MAX / 10 < ldown) { + batadv_err(net_dev, + "Download speed of gateway mode too large: %s\n", + buff); + return false; + } + + ldown *= 10; break; case BATADV_BW_UNIT_KBIT: default: - *down = ldown / 100; + ldown = div_u64(ldown, 100); break; } + if (U32_MAX < ldown) { + batadv_err(net_dev, + "Download speed of gateway mode too large: %s\n", + buff); + return false; + } + + *down = ldown; + /* we also got some upload info */ if (slash_ptr) { bw_unit_type = BATADV_BW_UNIT_KBIT; @@ -85,7 +113,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, *tmp_ptr = '\0'; } - ret = kstrtol(slash_ptr + 1, 10, &lup); + ret = kstrtou64(slash_ptr + 1, 10, &lup); if (ret) { batadv_err(net_dev, "Upload speed of gateway mode invalid: %s\n", @@ -95,13 +123,30 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, switch (bw_unit_type) { case BATADV_BW_UNIT_MBIT: - *up = lup * 10; + /* prevent overflow */ + if (U64_MAX / 10 < lup) { + batadv_err(net_dev, + "Upload speed of gateway mode too large: %s\n", + slash_ptr + 1); + return false; + } + + lup *= 10; break; case BATADV_BW_UNIT_KBIT: default: - *up = lup / 100; + lup = div_u64(lup, 100); break; } + + if (U32_MAX < 
lup) { + batadv_err(net_dev, + "Upload speed of gateway mode too large: %s\n", + slash_ptr + 1); + return false; + } + + *up = lup; } return true; @@ -115,7 +160,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) { struct batadv_tvlv_gateway_data gw; - uint32_t down, up; + u32 down, up; char gw_mode; gw_mode = atomic_read(&bat_priv->gw_mode); @@ -140,7 +185,10 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, size_t count) { struct batadv_priv *bat_priv = netdev_priv(net_dev); - uint32_t down_curr, up_curr, down_new = 0, up_new = 0; + u32 down_curr; + u32 up_curr; + u32 down_new = 0; + u32 up_new = 0; bool ret; down_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_down); @@ -148,7 +196,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, ret = batadv_parse_gw_bandwidth(net_dev, buff, &down_new, &up_new); if (!ret) - goto end; + return -EINVAL; if (!down_new) down_new = 1; @@ -172,7 +220,6 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, atomic_set(&bat_priv->gw.bandwidth_up, up_new); batadv_gw_tvlv_container_update(bat_priv); -end: return count; } @@ -186,9 +233,8 @@ end: */ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, - void *tvlv_value, - uint16_t tvlv_value_len) + u8 flags, + void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_gateway_data gateway, *gateway_ptr; diff --git a/kernel/net/batman-adv/gateway_common.h b/kernel/net/batman-adv/gateway_common.h index aa5116561..ab893e318 100644 --- a/kernel/net/batman-adv/gateway_common.h +++ b/kernel/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -18,6 +18,12 @@ #ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_ #define _NET_BATMAN_ADV_GATEWAY_COMMON_H_ +#include "main.h" + +#include + +struct net_device; + enum batadv_gw_modes { BATADV_GW_MODE_OFF, BATADV_GW_MODE_CLIENT, diff --git a/kernel/net/batman-adv/hard-interface.c b/kernel/net/batman-adv/hard-interface.c index baf1f9843..f11345e16 100644 --- a/kernel/net/batman-adv/hard-interface.c +++ b/kernel/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,22 +15,36 @@ * along with this program; if not, see . */ -#include "main.h" -#include "distributed-arp-table.h" #include "hard-interface.h" -#include "soft-interface.h" -#include "send.h" -#include "translation-table.h" -#include "routing.h" -#include "sysfs.h" -#include "debugfs.h" -#include "originator.h" -#include "hash.h" -#include "bridge_loop_avoidance.h" -#include "gateway_client.h" +#include "main.h" +#include +#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bridge_loop_avoidance.h" +#include "debugfs.h" +#include "distributed-arp-table.h" +#include "gateway_client.h" +#include "originator.h" +#include "packet.h" +#include "send.h" +#include "soft-interface.h" +#include "sysfs.h" +#include "translation-table.h" void batadv_hardif_free_rcu(struct rcu_head *rcu) { @@ -238,6 +252,44 @@ static void batadv_check_known_mac_addr(const struct net_device *net_dev) rcu_read_unlock(); } +/** + * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom + * @soft_iface: netdev struct of the mesh interface + */ +static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) +{ + const struct batadv_hard_iface *hard_iface; + unsigned short lower_header_len = ETH_HLEN; + unsigned short 
lower_headroom = 0; + unsigned short lower_tailroom = 0; + unsigned short needed_headroom; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) + continue; + + if (hard_iface->soft_iface != soft_iface) + continue; + + lower_header_len = max_t(unsigned short, lower_header_len, + hard_iface->net_dev->hard_header_len); + + lower_headroom = max_t(unsigned short, lower_headroom, + hard_iface->net_dev->needed_headroom); + + lower_tailroom = max_t(unsigned short, lower_tailroom, + hard_iface->net_dev->needed_tailroom); + } + rcu_read_unlock(); + + needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); + needed_headroom += batadv_max_header_len(); + + soft_iface->needed_headroom = needed_headroom; + soft_iface->needed_tailroom = lower_tailroom; +} + int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); @@ -460,6 +512,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, "Not using interface %s (retrying later): interface not active\n", hard_iface->net_dev->name); + batadv_hardif_recalc_extra_skbroom(soft_iface); + /* begin scheduling originator messages on that interface */ batadv_schedule_bat_ogm(hard_iface); @@ -514,6 +568,9 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_purge_outstanding_packets(bat_priv, hard_iface); dev_put(hard_iface->soft_iface); + netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); + batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); + /* nobody uses this interface anymore */ if (!bat_priv->num_ifaces) { batadv_gw_check_client_stop(bat_priv); @@ -522,7 +579,6 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_softif_destroy_sysfs(hard_iface->soft_iface); } - netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); hard_iface->soft_iface = NULL; 
batadv_hardif_free_ref(hard_iface); diff --git a/kernel/net/batman-adv/hard-interface.h b/kernel/net/batman-adv/hard-interface.h index 1918cd50b..7b12ea8ea 100644 --- a/kernel/net/batman-adv/hard-interface.h +++ b/kernel/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,17 @@ #ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_ #define _NET_BATMAN_ADV_HARD_INTERFACE_H_ +#include "main.h" + +#include +#include +#include +#include +#include +#include + +struct net_device; + enum batadv_hard_if_state { BATADV_IF_NOT_IN_USE, BATADV_IF_TO_BE_REMOVED, @@ -64,18 +75,6 @@ batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu); } -/** - * batadv_hardif_free_ref_now - decrement the hard interface refcounter and - * possibly free it (without rcu callback) - * @hard_iface: the hard interface to free - */ -static inline void -batadv_hardif_free_ref_now(struct batadv_hard_iface *hard_iface) -{ - if (atomic_dec_and_test(&hard_iface->refcount)) - batadv_hardif_free_rcu(&hard_iface->rcu); -} - static inline struct batadv_hard_iface * batadv_primary_if_get_selected(struct batadv_priv *bat_priv) { diff --git a/kernel/net/batman-adv/hash.c b/kernel/net/batman-adv/hash.c index 7c1c63080..2ea6a18d7 100644 --- a/kernel/net/batman-adv/hash.c +++ b/kernel/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -15,13 +15,17 @@ * along with this program; if not, see . 
*/ -#include "main.h" #include "hash.h" +#include "main.h" + +#include +#include +#include /* clears the hash */ static void batadv_hash_init(struct batadv_hashtable *hash) { - uint32_t i; + u32 i; for (i = 0; i < hash->size; i++) { INIT_HLIST_HEAD(&hash->table[i]); @@ -38,7 +42,7 @@ void batadv_hash_destroy(struct batadv_hashtable *hash) } /* allocates and clears the hash */ -struct batadv_hashtable *batadv_hash_new(uint32_t size) +struct batadv_hashtable *batadv_hash_new(u32 size) { struct batadv_hashtable *hash; @@ -69,7 +73,7 @@ free_hash: void batadv_hash_set_lock_class(struct batadv_hashtable *hash, struct lock_class_key *key) { - uint32_t i; + u32 i; for (i = 0; i < hash->size; i++) lockdep_set_class(&hash->list_locks[i], key); diff --git a/kernel/net/batman-adv/hash.h b/kernel/net/batman-adv/hash.h index 539fc1266..377626250 100644 --- a/kernel/net/batman-adv/hash.h +++ b/kernel/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -18,7 +18,16 @@ #ifndef _NET_BATMAN_ADV_HASH_H_ #define _NET_BATMAN_ADV_HASH_H_ +#include "main.h" + +#include #include +#include +#include +#include +#include + +struct lock_class_key; /* callback to a compare function. 
should compare 2 element datas for their * keys, return 0 if same and not 0 if not same @@ -30,17 +39,17 @@ typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *, * based on the key in the data of the first * argument and the size the second */ -typedef uint32_t (*batadv_hashdata_choose_cb)(const void *, uint32_t); +typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32); typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *); struct batadv_hashtable { struct hlist_head *table; /* the hashtable itself with the buckets */ spinlock_t *list_locks; /* spinlock for each hash list entry */ - uint32_t size; /* size of hashtable */ + u32 size; /* size of hashtable */ }; /* allocates and clears the hash */ -struct batadv_hashtable *batadv_hash_new(uint32_t size); +struct batadv_hashtable *batadv_hash_new(u32 size); /* set class key for all locks */ void batadv_hash_set_lock_class(struct batadv_hashtable *hash, @@ -60,7 +69,7 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, struct hlist_head *head; struct hlist_node *node, *node_tmp; spinlock_t *list_lock; /* spinlock to protect write access */ - uint32_t i; + u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -79,28 +88,6 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, batadv_hash_destroy(hash); } -/** - * batadv_hash_bytes - hash some bytes and add them to the previous hash - * @hash: previous hash value - * @data: data to be hashed - * @size: number of bytes to be hashed - * - * Returns the new hash value. 
- */ -static inline uint32_t batadv_hash_bytes(uint32_t hash, const void *data, - uint32_t size) -{ - const unsigned char *key = data; - int i; - - for (i = 0; i < size; i++) { - hash += key[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - return hash; -} - /** * batadv_hash_add - adds data to the hashtable * @hash: storage hash table @@ -118,7 +105,7 @@ static inline int batadv_hash_add(struct batadv_hashtable *hash, const void *data, struct hlist_node *data_node) { - uint32_t index; + u32 index; int ret = -1; struct hlist_head *head; struct hlist_node *node; @@ -162,7 +149,7 @@ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, batadv_hashdata_choose_cb choose, void *data) { - uint32_t index; + u32 index; struct hlist_node *node; struct hlist_head *head; void *data_save = NULL; diff --git a/kernel/net/batman-adv/icmp_socket.c b/kernel/net/batman-adv/icmp_socket.c index 161ef8f17..bcabb5e3f 100644 --- a/kernel/net/batman-adv/icmp_socket.c +++ b/kernel/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -15,14 +15,39 @@ * along with this program; if not, see . 
*/ +#include "icmp_socket.h" #include "main.h" + +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux/wait.h */ +#include #include -#include "icmp_socket.h" -#include "send.h" -#include "hash.h" -#include "originator.h" +#include +#include +#include +#include +#include +#include + #include "hard-interface.h" +#include "originator.h" +#include "packet.h" +#include "send.h" static struct batadv_socket_client *batadv_socket_client_hash[256]; @@ -158,7 +183,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, struct batadv_orig_node *orig_node = NULL; struct batadv_neigh_node *neigh_node = NULL; size_t packet_len = sizeof(struct batadv_icmp_packet); - uint8_t *addr; + u8 *addr; if (len < sizeof(struct batadv_icmp_header)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -312,8 +337,8 @@ err: } /** - * batadv_socket_receive_packet - schedule an icmp packet to be sent to userspace - * on an icmp socket. + * batadv_socket_receive_packet - schedule an icmp packet to be sent to + * userspace on an icmp socket. * @socket_client: the socket this packet belongs to * @icmph: pointer to the header of the icmp packet * @icmp_len: total length of the icmp packet diff --git a/kernel/net/batman-adv/icmp_socket.h b/kernel/net/batman-adv/icmp_socket.h index 0c33950aa..e937143f0 100644 --- a/kernel/net/batman-adv/icmp_socket.h +++ b/kernel/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -18,6 +18,12 @@ #ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_ #define _NET_BATMAN_ADV_ICMP_SOCKET_H_ +#include "main.h" + +#include + +struct batadv_icmp_header; + #define BATADV_ICMP_SOCKET "socket" void batadv_socket_init(void); diff --git a/kernel/net/batman-adv/main.c b/kernel/net/batman-adv/main.c index 12fc77bef..d7f17c1aa 100644 --- a/kernel/net/batman-adv/main.c +++ b/kernel/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,31 +15,54 @@ * along with this program; if not, see . */ +#include "main.h" + +#include +#include +#include #include -#include +#include +#include +#include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "main.h" -#include "sysfs.h" +#include + +#include "bat_algo.h" +#include "bridge_loop_avoidance.h" #include "debugfs.h" +#include "distributed-arp-table.h" +#include "gateway_client.h" +#include "gateway_common.h" +#include "hard-interface.h" +#include "icmp_socket.h" +#include "multicast.h" +#include "network-coding.h" +#include "originator.h" +#include "packet.h" #include "routing.h" #include "send.h" -#include "originator.h" #include "soft-interface.h" -#include "icmp_socket.h" #include "translation-table.h" -#include "hard-interface.h" -#include "gateway_client.h" -#include "bridge_loop_avoidance.h" -#include "distributed-arp-table.h" -#include "multicast.h" -#include "gateway_common.h" -#include "hash.h" -#include "bat_algo.h" -#include "network-coding.h" -#include "fragmentation.h" /* List manipulations on hardif_list have to be rtnl_lock()'ed, * list traversals just rcu-locked @@ -126,7 +149,7 @@ int batadv_mesh_init(struct net_device *soft_iface) 
INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv6_list); #endif INIT_LIST_HEAD(&bat_priv->tt.changes_list); - INIT_LIST_HEAD(&bat_priv->tt.req_list); + INIT_HLIST_HEAD(&bat_priv->tt.req_list); INIT_LIST_HEAD(&bat_priv->tt.roam_list); #ifdef CONFIG_BATMAN_ADV_MCAST INIT_HLIST_HEAD(&bat_priv->mcast.mla_list); @@ -176,7 +199,7 @@ void batadv_mesh_free(struct net_device *soft_iface) batadv_purge_outstanding_packets(bat_priv, NULL); - batadv_gw_node_purge(bat_priv); + batadv_gw_node_free(bat_priv); batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); @@ -209,10 +232,13 @@ void batadv_mesh_free(struct net_device *soft_iface) * interfaces in the current mesh * @bat_priv: the bat priv with all the soft interface information * @addr: the address to check + * + * Returns 'true' if the mac address was found, false otherwise. */ -int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) +bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) { const struct batadv_hard_iface *hard_iface; + bool is_my_mac = false; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { @@ -223,12 +249,12 @@ int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) continue; if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) { - rcu_read_unlock(); - return 1; + is_my_mac = true; + break; } } rcu_read_unlock(); - return 0; + return is_my_mac; } /** @@ -362,7 +388,7 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct batadv_priv *bat_priv; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *hard_iface; - uint8_t idx; + u8 idx; int ret; hard_iface = container_of(ptype, struct batadv_hard_iface, @@ -471,7 +497,7 @@ static void batadv_recv_handler_init(void) } int -batadv_recv_handler_register(uint8_t packet_type, +batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)) { @@ -487,7 +513,7 @@ 
batadv_recv_handler_register(uint8_t packet_type, return 0; } -void batadv_recv_handler_unregister(uint8_t packet_type) +void batadv_recv_handler_unregister(u8 packet_type) { batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet; } @@ -510,14 +536,12 @@ static struct batadv_algo_ops *batadv_algo_get(char *name) int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) { struct batadv_algo_ops *bat_algo_ops_tmp; - int ret; bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); if (bat_algo_ops_tmp) { pr_info("Trying to register already registered routing algorithm: %s\n", bat_algo_ops->name); - ret = -EEXIST; - goto out; + return -EEXIST; } /* all algorithms must implement all ops (for now) */ @@ -531,32 +555,26 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) !bat_algo_ops->bat_neigh_is_equiv_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); - ret = -EINVAL; - goto out; + return -EINVAL; } INIT_HLIST_NODE(&bat_algo_ops->list); hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); - ret = 0; -out: - return ret; + return 0; } int batadv_algo_select(struct batadv_priv *bat_priv, char *name) { struct batadv_algo_ops *bat_algo_ops; - int ret = -EINVAL; bat_algo_ops = batadv_algo_get(name); if (!bat_algo_ops) - goto out; + return -EINVAL; bat_priv->bat_algo_ops = bat_algo_ops; - ret = 0; -out: - return ret; + return 0; } int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) @@ -566,7 +584,7 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) seq_puts(seq, "Available routing algorithms:\n"); hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { - seq_printf(seq, "%s\n", bat_algo_ops->name); + seq_printf(seq, " * %s\n", bat_algo_ops->name); } return 0; @@ -625,8 +643,7 @@ batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler) * Returns tvlv handler if found or NULL otherwise. 
*/ static struct batadv_tvlv_handler -*batadv_tvlv_handler_get(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version) +*batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; @@ -674,8 +691,7 @@ static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv) * Returns tvlv container if found or NULL otherwise. */ static struct batadv_tvlv_container -*batadv_tvlv_container_get(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version) +*batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; @@ -706,10 +722,10 @@ static struct batadv_tvlv_container * * Returns size of all currently registered tvlv containers in bytes. */ -static uint16_t batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) +static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; - uint16_t tvlv_len = 0; + u16 tvlv_len = 0; hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_len += sizeof(struct batadv_tvlv_hdr); @@ -722,13 +738,17 @@ static uint16_t batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) /** * batadv_tvlv_container_remove - remove tvlv container from the tvlv container * list + * @bat_priv: the bat priv with all the soft interface information * @tvlv: the to be removed tvlv container * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). 
*/ -static void batadv_tvlv_container_remove(struct batadv_tvlv_container *tvlv) +static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, + struct batadv_tvlv_container *tvlv) { + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + if (!tvlv) return; @@ -747,13 +767,13 @@ static void batadv_tvlv_container_remove(struct batadv_tvlv_container *tvlv) * @version: tvlv container type to unregister */ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version) + u8 type, u8 version) { struct batadv_tvlv_container *tvlv; spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv = batadv_tvlv_container_get(bat_priv, type, version); - batadv_tvlv_container_remove(tvlv); + batadv_tvlv_container_remove(bat_priv, tvlv); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); } @@ -770,8 +790,8 @@ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, * content is going to replace the old one. */ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version, - void *tvlv_value, uint16_t tvlv_value_len) + u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_container *tvlv_old, *tvlv_new; @@ -792,7 +812,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); - batadv_tvlv_container_remove(tvlv_old); + batadv_tvlv_container_remove(bat_priv, tvlv_old); hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); } @@ -819,15 +839,15 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); /* keep old buffer if kmalloc should fail */ - if (new_buff) { - memcpy(new_buff, *packet_buff, min_packet_len); - kfree(*packet_buff); - *packet_buff = new_buff; - *packet_buff_len =
min_packet_len + additional_packet_len; - return true; - } + if (!new_buff) + return false; + + memcpy(new_buff, *packet_buff, min_packet_len); + kfree(*packet_buff); + *packet_buff = new_buff; + *packet_buff_len = min_packet_len + additional_packet_len; - return false; + return true; } /** @@ -844,14 +864,13 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, * * Returns size of all appended tvlv containers in bytes. */ -uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, - unsigned char **packet_buff, - int *packet_buff_len, - int packet_min_len) +u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, + unsigned char **packet_buff, + int *packet_buff_len, int packet_min_len) { struct batadv_tvlv_container *tvlv; struct batadv_tvlv_hdr *tvlv_hdr; - uint16_t tvlv_value_len; + u16 tvlv_value_len; void *tvlv_value; bool ret; @@ -876,7 +895,7 @@ uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, tvlv_hdr->len = tvlv->tvlv_hdr.len; tvlv_value = tvlv_hdr + 1; memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len)); - tvlv_value = (uint8_t *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); + tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); } end: @@ -903,8 +922,8 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, bool ogm_source, struct batadv_orig_node *orig_node, - uint8_t *src, uint8_t *dst, - void *tvlv_value, uint16_t tvlv_value_len) + u8 *src, u8 *dst, + void *tvlv_value, u16 tvlv_value_len) { if (!tvlv_handler) return NET_RX_SUCCESS; @@ -955,13 +974,13 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, bool ogm_source, struct batadv_orig_node *orig_node, - uint8_t *src, uint8_t *dst, - void *tvlv_value, uint16_t tvlv_value_len) + u8 *src, u8 *dst, + void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_handler *tvlv_handler; struct 
batadv_tvlv_hdr *tvlv_hdr; - uint16_t tvlv_value_cont_len; - uint8_t cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; + u16 tvlv_value_cont_len; + u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; int ret = NET_RX_SUCCESS; while (tvlv_value_len >= sizeof(*tvlv_hdr)) { @@ -983,7 +1002,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, tvlv_value_cont_len); if (tvlv_handler) batadv_tvlv_handler_free_ref(tvlv_handler); - tvlv_value = (uint8_t *)tvlv_value + tvlv_value_cont_len; + tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } @@ -1017,7 +1036,7 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { void *tvlv_value; - uint16_t tvlv_value_len; + u16 tvlv_value_len; if (!batadv_ogm_packet) return; @@ -1049,14 +1068,14 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, void (*optr)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, + u8 flags, void *tvlv_value, - uint16_t tvlv_value_len), + u16 tvlv_value_len), int (*uptr)(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, + u8 *src, u8 *dst, void *tvlv_value, - uint16_t tvlv_value_len), - uint8_t type, uint8_t version, uint8_t flags) + u16 tvlv_value_len), + u8 type, u8 version, u8 flags) { struct batadv_tvlv_handler *tvlv_handler; @@ -1091,7 +1110,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, * @version: tvlv handler version to be unregistered */ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version) + u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler; @@ -1117,9 +1136,9 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length */ -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src, - uint8_t *dst, uint8_t type, uint8_t 
version, - void *tvlv_value, uint16_t tvlv_value_len) +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, + u8 *dst, u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len) { struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; struct batadv_tvlv_hdr *tvlv_hdr; diff --git a/kernel/net/batman-adv/main.h b/kernel/net/batman-adv/main.h index 4d2318829..ebd8af0a1 100644 --- a/kernel/net/batman-adv/main.h +++ b/kernel/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2015.0" +#define BATADV_SOURCE_VERSION "2015.2" #endif /* B.A.T.M.A.N. parameters */ @@ -44,7 +44,7 @@ #define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */ #define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */ -#define BATADV_DAT_ENTRY_TIMEOUT (5*60000) /* 5 mins in milliseconds */ +#define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */ /* sliding packet range of received originator messages in sequence numbers * (should be a multiple of our word size) */ @@ -163,28 +163,26 @@ enum batadv_uev_type { /* Kernel headers */ -#include /* mutex */ -#include /* needed by all modules */ -#include /* netdevice */ -#include /* ethernet address classification */ -#include /* ethernet header */ -#include /* poll_table */ -#include /* kernel threads */ -#include /* schedule types */ -#include /* workqueue */ +#include +#include /* for packet.h */ +#include +#include +#include +#include /* for packet.h */ +#include +#include +#include #include -#include -#include /* struct sock */ -#include /* ipv6 address stuff */ -#include -#include #include -#include #include #include "types.h" -#define BATADV_PRINT_VID(vid) (vid & BATADV_VLAN_HAS_TAG ? 
\ +struct batadv_ogm_packet; +struct seq_file; +struct sk_buff; + +#define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? \ (int)(vid & VLAN_VID_MASK) : -1) extern char batadv_routing_algo[]; @@ -195,7 +193,7 @@ extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *soft_iface); void batadv_mesh_free(struct net_device *soft_iface); -int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr); +bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hard_iface * batadv_seq_print_text_primary_if_get(struct seq_file *seq); int batadv_max_header_len(void); @@ -204,10 +202,10 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev); int -batadv_recv_handler_register(uint8_t packet_type, +batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); -void batadv_recv_handler_unregister(uint8_t packet_type); +void batadv_recv_handler_unregister(u8 packet_type); int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); int batadv_algo_select(struct batadv_priv *bat_priv, char *name); int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); @@ -279,7 +277,7 @@ static inline void _batadv_dbg(int type __always_unused, * * note: can't use ether_addr_equal() as it requires aligned memory */ -static inline int batadv_compare_eth(const void *data1, const void *data2) +static inline bool batadv_compare_eth(const void *data1, const void *data2) { return ether_addr_equal_unaligned(data1, data2); } @@ -306,7 +304,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, * they handle overflows/underflows and can correctly check for a * predecessor/successor unless the variable sequence number has grown by * more then 2**(bitwidth(x)-1)-1. 
- * This means that for a uint8_t with the maximum value 255, it would think: + * This means that for a u8 with the maximum value 255, it would think: * - when adding nothing - it is neither a predecessor nor a successor * - before adding more than 127 to the starting value - it is a predecessor, * - when adding 128 - it is neither a predecessor nor a successor, @@ -329,10 +327,9 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) /* Sum and return the cpu-local counters for index 'idx' */ -static inline uint64_t batadv_sum_counter(struct batadv_priv *bat_priv, - size_t idx) +static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { - uint64_t *counters, sum = 0; + u64 *counters, sum = 0; int cpu; for_each_possible_cpu(cpu) { @@ -350,39 +347,38 @@ static inline uint64_t batadv_sum_counter(struct batadv_priv *bat_priv, #define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0])) void batadv_tvlv_container_register(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version, - void *tvlv_value, uint16_t tvlv_value_len); -uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, - unsigned char **packet_buff, - int *packet_buff_len, - int packet_min_len); + u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len); +u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, + unsigned char **packet_buff, + int *packet_buff_len, int packet_min_len); void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_orig_node *orig_node); void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version); + u8 type, u8 version); void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, void (*optr)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, + u8 flags, void *tvlv_value, - uint16_t 
tvlv_value_len), + u16 tvlv_value_len), int (*uptr)(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, + u8 *src, u8 *dst, void *tvlv_value, - uint16_t tvlv_value_len), - uint8_t type, uint8_t version, uint8_t flags); + u16 tvlv_value_len), + u8 type, u8 version, u8 flags); void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version); + u8 type, u8 version); int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, bool ogm_source, struct batadv_orig_node *orig_node, - uint8_t *src, uint8_t *dst, - void *tvlv_buff, uint16_t tvlv_buff_len); -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src, - uint8_t *dst, uint8_t type, uint8_t version, - void *tvlv_value, uint16_t tvlv_value_len); + u8 *src, u8 *dst, + void *tvlv_buff, u16 tvlv_buff_len); +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, + u8 *dst, u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len); unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len); bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid); diff --git a/kernel/net/batman-adv/multicast.c b/kernel/net/batman-adv/multicast.c index b24e4bb64..eb76386f8 100644 --- a/kernel/net/batman-adv/multicast.c +++ b/kernel/net/batman-adv/multicast.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -15,10 +15,36 @@ * along with this program; if not, see . 
*/ -#include "main.h" #include "multicast.h" -#include "originator.h" -#include "hard-interface.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "packet.h" #include "translation-table.h" /** @@ -64,7 +90,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, * Returns true if the given address is already in the given list. * Otherwise returns false. */ -static bool batadv_mcast_mla_is_duplicate(uint8_t *mcast_addr, +static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; @@ -78,15 +104,19 @@ static bool batadv_mcast_mla_is_duplicate(uint8_t *mcast_addr, /** * batadv_mcast_mla_list_free - free a list of multicast addresses + * @bat_priv: the bat priv with all the soft interface information * @mcast_list: the list to free * * Removes and frees all items in the given mcast_list. 
*/ -static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) +static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv, + struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; + lockdep_assert_held(&bat_priv->tt.commit_lock); + hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { hlist_del(&mcast_entry->list); kfree(mcast_entry); @@ -109,6 +139,8 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; + lockdep_assert_held(&bat_priv->tt.commit_lock); + hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, list) { if (mcast_list && @@ -139,6 +171,8 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; + lockdep_assert_held(&bat_priv->tt.commit_lock); + if (!mcast_list) return; @@ -243,7 +277,7 @@ update: batadv_mcast_mla_tt_add(bat_priv, &mcast_list); out: - batadv_mcast_mla_list_free(&mcast_list); + batadv_mcast_mla_list_free(bat_priv, &mcast_list); } /** @@ -565,19 +599,28 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, * orig, has toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. 
*/ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t mcast_flags) + u8 mcast_flags) { + struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; + struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; + + lockdep_assert_held(&orig->mcast_handler_lock); + /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_add_head_rcu(&orig->mcast_want_all_unsnoopables_node, - &bat_priv->mcast.want_all_unsnoopables_list); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) && @@ -585,7 +628,10 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_del_rcu(&orig->mcast_want_all_unsnoopables_node); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } @@ -598,19 +644,28 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. 
*/ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t mcast_flags) + u8 mcast_flags) { + struct hlist_node *node = &orig->mcast_want_all_ipv4_node; + struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; + + lockdep_assert_held(&orig->mcast_handler_lock); + /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_add_head_rcu(&orig->mcast_want_all_ipv4_node, - &bat_priv->mcast.want_all_ipv4_list); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) && @@ -618,7 +673,10 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, atomic_dec(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_del_rcu(&orig->mcast_want_all_ipv4_node); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } @@ -631,19 +689,28 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. 
*/ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t mcast_flags) + u8 mcast_flags) { + struct hlist_node *node = &orig->mcast_want_all_ipv6_node; + struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; + + lockdep_assert_held(&orig->mcast_handler_lock); + /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_add_head_rcu(&orig->mcast_want_all_ipv6_node, - &bat_priv->mcast.want_all_ipv6_list); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) && @@ -651,7 +718,10 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, atomic_dec(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_del_rcu(&orig->mcast_want_all_ipv6_node); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } @@ -666,47 +736,50 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, */ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, + u8 flags, void *tvlv_value, - uint16_t tvlv_value_len) + u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); - uint8_t mcast_flags = BATADV_NO_FLAGS; + u8 mcast_flags = BATADV_NO_FLAGS; bool orig_initialized; - orig_initialized = orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST; + if (orig_mcast_enabled && tvlv_value && + (tvlv_value_len >= sizeof(mcast_flags))) + mcast_flags = 
*(u8 *)tvlv_value; + + spin_lock_bh(&orig->mcast_handler_lock); + orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig->capa_initialized); /* If mcast support is turned on decrease the disabled mcast node * counter only if we had increased it for this node before. If this * is a completely new orig_node no need to decrease the counter. */ if (orig_mcast_enabled && - !(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST)) { + !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { if (orig_initialized) atomic_dec(&bat_priv->mcast.num_disabled); - orig->capabilities |= BATADV_ORIG_CAPA_HAS_MCAST; + set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); /* If mcast support is being switched off or if this is an initial * OGM without mcast support then increase the disabled mcast * node counter. */ } else if (!orig_mcast_enabled && - (orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST || + (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) || !orig_initialized)) { atomic_inc(&bat_priv->mcast.num_disabled); - orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_MCAST; + clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } - orig->capa_initialized |= BATADV_ORIG_CAPA_HAS_MCAST; - - if (orig_mcast_enabled && tvlv_value && - (tvlv_value_len >= sizeof(mcast_flags))) - mcast_flags = *(uint8_t *)tvlv_value; + set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; + spin_unlock_bh(&orig->mcast_handler_lock); } /** @@ -740,11 +813,15 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) { struct batadv_priv *bat_priv = orig->bat_priv; - if (!(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST) && - orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST) + spin_lock_bh(&orig->mcast_handler_lock); + + if 
(!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) && + test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized)) atomic_dec(&bat_priv->mcast.num_disabled); batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); + + spin_unlock_bh(&orig->mcast_handler_lock); } diff --git a/kernel/net/batman-adv/multicast.h b/kernel/net/batman-adv/multicast.h index 3a44ebdb4..8f3cb04b9 100644 --- a/kernel/net/batman-adv/multicast.h +++ b/kernel/net/batman-adv/multicast.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -18,6 +18,10 @@ #ifndef _NET_BATMAN_ADV_MULTICAST_H_ #define _NET_BATMAN_ADV_MULTICAST_H_ +#include "main.h" + +struct sk_buff; + /** * batadv_forw_mode - the way a packet should be forwarded as * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic diff --git a/kernel/net/batman-adv/network-coding.c b/kernel/net/batman-adv/network-coding.c index 127cc4d73..d0956f726 100644 --- a/kernel/net/batman-adv/network-coding.c +++ b/kernel/net/batman-adv/network-coding.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * @@ -15,15 +15,45 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/ +#include "network-coding.h" +#include "main.h" + +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include "main.h" +#include "hard-interface.h" #include "hash.h" -#include "network-coding.h" -#include "send.h" #include "originator.h" -#include "hard-interface.h" +#include "packet.h" #include "routing.h" +#include "send.h" static struct lock_class_key batadv_nc_coding_hash_lock_class_key; static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; @@ -100,14 +130,13 @@ void batadv_nc_status_update(struct net_device *net_dev) */ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, - void *tvlv_value, - uint16_t tvlv_value_len) + u8 flags, + void *tvlv_value, u16 tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) - orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_NC; + clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); else - orig->capabilities |= BATADV_ORIG_CAPA_HAS_NC; + set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); } /** @@ -155,7 +184,7 @@ err: */ void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) { - atomic_set(&bat_priv->network_coding, 1); + atomic_set(&bat_priv->network_coding, 0); bat_priv->nc.min_tq = 200; bat_priv->nc.max_fwd_delay = 10; bat_priv->nc.max_buffer_time = 200; @@ -174,28 +203,25 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) } /** - * batadv_nc_node_free_rcu - rcu callback to free an nc node and remove - * its refcount on the orig_node - * @rcu: rcu pointer of the nc node + * batadv_nc_node_release - release nc_node from lists and queue for free after + * rcu grace period + * @nc_node: the nc node to free */ -static void batadv_nc_node_free_rcu(struct rcu_head *rcu) +static void 
batadv_nc_node_release(struct batadv_nc_node *nc_node) { - struct batadv_nc_node *nc_node; - - nc_node = container_of(rcu, struct batadv_nc_node, rcu); batadv_orig_node_free_ref(nc_node->orig_node); - kfree(nc_node); + kfree_rcu(nc_node, rcu); } /** - * batadv_nc_node_free_ref - decrements the nc node refcounter and possibly - * frees it + * batadv_nc_node_free_ref - decrement the nc node refcounter and possibly + * release it * @nc_node: the nc node to free */ static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node) { if (atomic_dec_and_test(&nc_node->refcount)) - call_rcu(&nc_node->rcu, batadv_nc_node_free_rcu); + batadv_nc_node_release(nc_node); } /** @@ -275,7 +301,7 @@ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, * max_buffer time */ return batadv_has_timed_out(nc_path->last_valid, - bat_priv->nc.max_buffer_time*10); + bat_priv->nc.max_buffer_time * 10); } /** @@ -352,7 +378,7 @@ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; - uint32_t i; + u32 i; if (!hash) return; @@ -388,7 +414,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, struct hlist_node *node_tmp; struct batadv_nc_path *nc_path; spinlock_t *lock; /* Protects lists in hash */ - uint32_t i; + u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -448,19 +474,13 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, * * Returns the selected index in the hash table for the given data. 
*/ -static uint32_t batadv_nc_hash_choose(const void *data, uint32_t size) +static u32 batadv_nc_hash_choose(const void *data, u32 size) { const struct batadv_nc_path *nc_path = data; - uint32_t hash = 0; - - hash = batadv_hash_bytes(hash, &nc_path->prev_hop, - sizeof(nc_path->prev_hop)); - hash = batadv_hash_bytes(hash, &nc_path->next_hop, - sizeof(nc_path->next_hop)); + u32 hash = 0; - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash); + hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash); return hash % size; } @@ -563,6 +583,8 @@ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, unsigned long timeout = bat_priv->nc.max_buffer_time; bool res = false; + lockdep_assert_held(&nc_path->packet_list_lock); + /* Packets are added to tail, so the remaining packets did not time * out and we can stop processing the current queue */ @@ -599,6 +621,8 @@ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, { unsigned long timeout = bat_priv->nc.max_fwd_delay; + lockdep_assert_held(&nc_path->packet_list_lock); + /* Packets are added to tail, so the remaining packets did not time * out and we can stop processing the current queue */ @@ -720,8 +744,8 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, struct batadv_ogm_packet *ogm_packet) { struct batadv_orig_ifinfo *orig_ifinfo; - uint32_t last_real_seqno; - uint8_t last_ttl; + u32 last_real_seqno; + u8 last_ttl; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT); if (!orig_ifinfo) @@ -849,8 +873,8 @@ free: } /** - * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node structs - * (best called on incoming OGMs) + * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node + * structs (best called on incoming OGMs) * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node originating the ogm packet * 
@orig_neigh_node: neighboring orig node from which we received the ogm packet @@ -864,14 +888,15 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, struct batadv_ogm_packet *ogm_packet, int is_single_hop_neigh) { - struct batadv_nc_node *in_nc_node = NULL, *out_nc_node = NULL; + struct batadv_nc_node *in_nc_node = NULL; + struct batadv_nc_node *out_nc_node = NULL; /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) goto out; /* check if orig node is network coding enabled */ - if (!(orig_node->capabilities & BATADV_ORIG_CAPA_HAS_NC)) + if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities)) goto out; /* accept ogms from 'good' neighbors and single hop neighbors */ @@ -914,8 +939,8 @@ out: */ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, - uint8_t *src, - uint8_t *dst) + u8 *src, + u8 *dst) { int hash_added; struct batadv_nc_path *nc_path, nc_path_key; @@ -967,9 +992,9 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, * selection of a receiver with slightly lower TQ than the other * @tq: to be weighted tq value */ -static uint8_t batadv_nc_random_weight_tq(uint8_t tq) +static u8 batadv_nc_random_weight_tq(u8 tq) { - uint8_t rand_val, rand_tq; + u8 rand_val, rand_tq; get_random_bytes(&rand_val, sizeof(rand_val)); @@ -1014,7 +1039,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct batadv_nc_packet *nc_packet, struct batadv_neigh_node *neigh_node) { - uint8_t tq_weighted_neigh, tq_weighted_coding, tq_tmp; + u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp; struct sk_buff *skb_dest, *skb_src; struct batadv_unicast_packet *packet1; struct batadv_unicast_packet *packet2; @@ -1023,7 +1048,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct batadv_neigh_node *router_coding = NULL; struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; struct batadv_neigh_ifinfo 
*router_coding_ifinfo = NULL; - uint8_t *first_source, *first_dest, *second_source, *second_dest; + u8 *first_source, *first_dest, *second_source, *second_dest; __be32 packet_id1, packet_id2; size_t count; bool res = false; @@ -1207,8 +1232,7 @@ out: * * Returns true if coding of a decoded packet is allowed. */ -static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, - uint8_t *dst, uint8_t *src) +static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) { if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src)) return false; @@ -1231,7 +1255,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, struct batadv_nc_node *in_nc_node, struct batadv_nc_node *out_nc_node, struct sk_buff *skb, - uint8_t *eth_dst) + u8 *eth_dst) { struct batadv_nc_path *nc_path, nc_path_key; struct batadv_nc_packet *nc_packet_out = NULL; @@ -1297,8 +1321,8 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, static struct batadv_nc_packet * batadv_nc_skb_src_search(struct batadv_priv *bat_priv, struct sk_buff *skb, - uint8_t *eth_dst, - uint8_t *eth_src, + u8 *eth_dst, + u8 *eth_src, struct batadv_nc_node *in_nc_node) { struct batadv_orig_node *orig_node; @@ -1338,7 +1362,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv, */ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, struct sk_buff *skb, - uint8_t *eth_dst_new) + u8 *eth_dst_new) { struct ethhdr *ethhdr; @@ -1614,7 +1638,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_unicast_packet *unicast_packet; struct batadv_coded_packet coded_packet_tmp; struct ethhdr *ethhdr, ethhdr_tmp; - uint8_t *orig_dest, ttl, ttvn; + u8 *orig_dest, ttl, ttvn; unsigned int coding_len; int err; @@ -1706,7 +1730,7 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, struct batadv_hashtable *hash = bat_priv->nc.decoding_hash; struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL; struct batadv_nc_path *nc_path, 
nc_path_key; - uint8_t *dest, *source; + u8 *dest, *source; __be32 packet_id; int index; diff --git a/kernel/net/batman-adv/network-coding.h b/kernel/net/batman-adv/network-coding.h index 358c0d686..8f6d4ad87 100644 --- a/kernel/net/batman-adv/network-coding.h +++ b/kernel/net/batman-adv/network-coding.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * @@ -18,6 +18,15 @@ #ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ #define _NET_BATMAN_ADV_NETWORK_CODING_H_ +#include "main.h" + +#include <linux/types.h> + +struct batadv_ogm_packet; +struct net_device; +struct seq_file; +struct sk_buff; + #ifdef CONFIG_BATMAN_ADV_NC void batadv_nc_status_update(struct net_device *net_dev); diff --git a/kernel/net/batman-adv/originator.c b/kernel/net/batman-adv/originator.c index 90e805aba..17851d3aa 100644 --- a/kernel/net/batman-adv/originator.c +++ b/kernel/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,19 +15,32 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/ +#include "originator.h" #include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "distributed-arp-table.h" -#include "originator.h" -#include "hash.h" -#include "translation-table.h" -#include "routing.h" +#include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" -#include "soft-interface.h" -#include "bridge_loop_avoidance.h" -#include "network-coding.h" -#include "fragmentation.h" +#include "hash.h" #include "multicast.h" +#include "network-coding.h" +#include "routing.h" +#include "translation-table.h" /* hash class keys */ static struct lock_class_key batadv_orig_hash_lock_class_key; @@ -58,7 +71,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, struct batadv_orig_node_vlan *vlan = NULL, *tmp; rcu_read_lock(); - list_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) { + hlist_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) { if (tmp->vid != vid) continue; @@ -106,7 +119,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, atomic_set(&vlan->refcount, 2); vlan->vid = vid; - list_add_rcu(&vlan->list, &orig_node->vlan_list); + hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); out: spin_unlock_bh(&orig_node->vlan_list_lock); @@ -150,86 +163,66 @@ err: } /** - * batadv_neigh_ifinfo_free_rcu - free the neigh_ifinfo object - * @rcu: rcu pointer of the neigh_ifinfo object - */ -static void batadv_neigh_ifinfo_free_rcu(struct rcu_head *rcu) -{ - struct batadv_neigh_ifinfo *neigh_ifinfo; - - neigh_ifinfo = container_of(rcu, struct batadv_neigh_ifinfo, rcu); - - if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref_now(neigh_ifinfo->if_outgoing); - - kfree(neigh_ifinfo); -} - -/** - * batadv_neigh_ifinfo_free_now - decrement the refcounter and possibly free - * the neigh_ifinfo (without rcu callback) + * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue 
for + * free after rcu grace period * @neigh_ifinfo: the neigh_ifinfo object to release */ static void -batadv_neigh_ifinfo_free_ref_now(struct batadv_neigh_ifinfo *neigh_ifinfo) +batadv_neigh_ifinfo_release(struct batadv_neigh_ifinfo *neigh_ifinfo) { - if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - batadv_neigh_ifinfo_free_rcu(&neigh_ifinfo->rcu); + if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) + batadv_hardif_free_ref(neigh_ifinfo->if_outgoing); + + kfree_rcu(neigh_ifinfo, rcu); } /** - * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly free + * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo) { if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - call_rcu(&neigh_ifinfo->rcu, batadv_neigh_ifinfo_free_rcu); + batadv_neigh_ifinfo_release(neigh_ifinfo); } /** * batadv_neigh_node_free_rcu - free the neigh_node - * @rcu: rcu pointer of the neigh_node + * batadv_neigh_node_release - release neigh_node from lists and queue for + * free after rcu grace period + * @neigh_node: neigh neighbor to free */ -static void batadv_neigh_node_free_rcu(struct rcu_head *rcu) +static void batadv_neigh_node_release(struct batadv_neigh_node *neigh_node) { struct hlist_node *node_tmp; - struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; + struct batadv_algo_ops *bao; - neigh_node = container_of(rcu, struct batadv_neigh_node, rcu); + bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { - batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo); + batadv_neigh_ifinfo_free_ref(neigh_ifinfo); } - batadv_hardif_free_ref_now(neigh_node->if_incoming); - kfree(neigh_node); -} + if (bao->bat_neigh_free) + bao->bat_neigh_free(neigh_node); -/** - * batadv_neigh_node_free_ref_now - decrement 
the neighbors refcounter - * and possibly free it (without rcu callback) - * @neigh_node: neigh neighbor to free - */ -static void -batadv_neigh_node_free_ref_now(struct batadv_neigh_node *neigh_node) -{ - if (atomic_dec_and_test(&neigh_node->refcount)) - batadv_neigh_node_free_rcu(&neigh_node->rcu); + batadv_hardif_free_ref(neigh_node->if_incoming); + + kfree_rcu(neigh_node, rcu); } /** * batadv_neigh_node_free_ref - decrement the neighbors refcounter - * and possibly free it + * and possibly release it * @neigh_node: neigh neighbor to free */ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) { if (atomic_dec_and_test(&neigh_node->refcount)) - call_rcu(&neigh_node->rcu, batadv_neigh_node_free_rcu); + batadv_neigh_node_release(neigh_node); } /** @@ -423,41 +416,6 @@ out: return neigh_ifinfo; } -/** - * batadv_neigh_node_new - create and init a new neigh_node object - * @hard_iface: the interface where the neighbour is connected to - * @neigh_addr: the mac address of the neighbour interface - * @orig_node: originator object representing the neighbour - * - * Allocates a new neigh_node object and initialises all the generic fields. - * Returns the new object or NULL on failure. 
- */ -struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const uint8_t *neigh_addr, - struct batadv_orig_node *orig_node) -{ - struct batadv_neigh_node *neigh_node; - - neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC); - if (!neigh_node) - goto out; - - INIT_HLIST_NODE(&neigh_node->list); - INIT_HLIST_HEAD(&neigh_node->ifinfo_list); - spin_lock_init(&neigh_node->ifinfo_lock); - - ether_addr_copy(neigh_node->addr, neigh_addr); - neigh_node->if_incoming = hard_iface; - neigh_node->orig_node = orig_node; - - /* extra reference for return */ - atomic_set(&neigh_node->refcount, 2); - -out: - return neigh_node; -} - /** * batadv_neigh_node_get - retrieve a neighbour from the list * @orig_node: originator which the neighbour belongs to @@ -468,10 +426,10 @@ out: * which is connected through the provided hard interface. * Returns NULL if the neighbour is not found. */ -struct batadv_neigh_node * +static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, const struct batadv_hard_iface *hard_iface, - const uint8_t *addr) + const u8 *addr) { struct batadv_neigh_node *tmp_neigh_node, *res = NULL; @@ -495,108 +453,152 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, } /** - * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object - * @rcu: rcu pointer of the orig_ifinfo object + * batadv_neigh_node_new - create and init a new neigh_node object + * @orig_node: originator object representing the neighbour + * @hard_iface: the interface where the neighbour is connected to + * @neigh_addr: the mac address of the neighbour interface + * + * Allocates a new neigh_node object and initialises all the generic fields. + * Returns the new object or NULL on failure. 
*/ -static void batadv_orig_ifinfo_free_rcu(struct rcu_head *rcu) +struct batadv_neigh_node * +batadv_neigh_node_new(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) { - struct batadv_orig_ifinfo *orig_ifinfo; - struct batadv_neigh_node *router; + struct batadv_neigh_node *neigh_node; - orig_ifinfo = container_of(rcu, struct batadv_orig_ifinfo, rcu); + neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); + if (neigh_node) + goto out; - if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref_now(orig_ifinfo->if_outgoing); + neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC); + if (!neigh_node) + goto out; - /* this is the last reference to this object */ - router = rcu_dereference_protected(orig_ifinfo->router, true); - if (router) - batadv_neigh_node_free_ref_now(router); - kfree(orig_ifinfo); + if (!atomic_inc_not_zero(&hard_iface->refcount)) { + kfree(neigh_node); + neigh_node = NULL; + goto out; + } + + INIT_HLIST_NODE(&neigh_node->list); + INIT_HLIST_HEAD(&neigh_node->ifinfo_list); + spin_lock_init(&neigh_node->ifinfo_lock); + + ether_addr_copy(neigh_node->addr, neigh_addr); + neigh_node->if_incoming = hard_iface; + neigh_node->orig_node = orig_node; + + /* extra reference for return */ + atomic_set(&neigh_node->refcount, 2); + + spin_lock_bh(&orig_node->neigh_list_lock); + hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + spin_unlock_bh(&orig_node->neigh_list_lock); + + batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, + "Creating new neighbor %pM for orig_node %pM on interface %s\n", + neigh_addr, orig_node->orig, hard_iface->net_dev->name); + +out: + return neigh_node; } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free - * the orig_ifinfo (without rcu callback) + * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for + * free after rcu grace period * @orig_ifinfo: the orig_ifinfo object to release */ 
-static void -batadv_orig_ifinfo_free_ref_now(struct batadv_orig_ifinfo *orig_ifinfo) +static void batadv_orig_ifinfo_release(struct batadv_orig_ifinfo *orig_ifinfo) { - if (atomic_dec_and_test(&orig_ifinfo->refcount)) - batadv_orig_ifinfo_free_rcu(&orig_ifinfo->rcu); + struct batadv_neigh_node *router; + + if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) + batadv_hardif_free_ref(orig_ifinfo->if_outgoing); + + /* this is the last reference to this object */ + router = rcu_dereference_protected(orig_ifinfo->router, true); + if (router) + batadv_neigh_node_free_ref(router); + + kfree_rcu(orig_ifinfo, rcu); } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free + * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly release * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo) { if (atomic_dec_and_test(&orig_ifinfo->refcount)) - call_rcu(&orig_ifinfo->rcu, batadv_orig_ifinfo_free_rcu); + batadv_orig_ifinfo_release(orig_ifinfo); } +/** + * batadv_orig_node_free_rcu - free the orig_node + * @rcu: rcu pointer of the orig_node + */ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) { - struct hlist_node *node_tmp; - struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; - struct batadv_orig_ifinfo *orig_ifinfo; orig_node = container_of(rcu, struct batadv_orig_node, rcu); + batadv_mcast_purge_orig(orig_node); + + batadv_frag_purge_orig(orig_node, NULL); + + if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) + orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); + + kfree(orig_node->tt_buff); + kfree(orig_node); +} + +/** + * batadv_orig_node_release - release orig_node from lists and queue for + * free after rcu grace period + * @orig_node: the orig node to free + */ +static void batadv_orig_node_release(struct batadv_orig_node *orig_node) +{ + struct hlist_node *node_tmp; + struct batadv_neigh_node 
*neigh_node; + struct batadv_orig_ifinfo *orig_ifinfo; + spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref_now(neigh_node); + batadv_neigh_node_free_ref(neigh_node); } hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { hlist_del_rcu(&orig_ifinfo->list); - batadv_orig_ifinfo_free_ref_now(orig_ifinfo); + batadv_orig_ifinfo_free_ref(orig_ifinfo); } spin_unlock_bh(&orig_node->neigh_list_lock); - batadv_mcast_purge_orig(orig_node); - /* Free nc_nodes */ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); - batadv_frag_purge_orig(orig_node, NULL); - - if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) - orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); - - kfree(orig_node->tt_buff); - kfree(orig_node); + call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } /** * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly - * schedule an rcu callback for freeing it + * release it * @orig_node: the orig node to free */ void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) { if (atomic_dec_and_test(&orig_node->refcount)) - call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); -} - -/** - * batadv_orig_node_free_ref_now - decrement the orig node refcounter and - * possibly free it (without rcu callback) - * @orig_node: the orig node to free - */ -void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node) -{ - if (atomic_dec_and_test(&orig_node->refcount)) - batadv_orig_node_free_rcu(&orig_node->rcu); + batadv_orig_node_release(orig_node); } void batadv_originator_free(struct batadv_priv *bat_priv) @@ -606,7 +608,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; 
- uint32_t i; + u32 i; if (!hash) return; @@ -641,7 +643,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * Returns the newly created object or NULL on failure. */ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, - const uint8_t *addr) + const u8 *addr) { struct batadv_orig_node *orig_node; struct batadv_orig_node_vlan *vlan; @@ -656,7 +658,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, return NULL; INIT_HLIST_HEAD(&orig_node->neigh_list); - INIT_LIST_HEAD(&orig_node->vlan_list); + INIT_HLIST_HEAD(&orig_node->vlan_list); INIT_HLIST_HEAD(&orig_node->ifinfo_list); spin_lock_init(&orig_node->bcast_seqno_lock); spin_lock_init(&orig_node->neigh_list_lock); @@ -678,8 +680,13 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, orig_node->last_seen = jiffies; reset_time = jiffies - 1 - msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); orig_node->bcast_seqno_reset = reset_time; + #ifdef CONFIG_BATMAN_ADV_MCAST orig_node->mcast_flags = BATADV_NO_FLAGS; + INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node); + INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node); + INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node); + spin_lock_init(&orig_node->mcast_handler_lock); #endif /* create a vlan object for the "untagged" LAN */ @@ -958,7 +965,7 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv) struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; - uint32_t i; + u32 i; if (!hash) return; @@ -987,7 +994,6 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv) spin_unlock_bh(list_lock); } - batadv_gw_node_purge(bat_priv); batadv_gw_election(bat_priv); } @@ -1092,7 +1098,7 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; - uint32_t i; + u32 i; int ret; /* 
resize all orig nodes because orig_node->bcast_own(_sum) depend on @@ -1129,7 +1135,7 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, struct batadv_hard_iface *hard_iface_tmp; struct batadv_orig_node *orig_node; struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; - uint32_t i; + u32 i; int ret; /* resize all orig nodes because orig_node->bcast_own(_sum) depend on diff --git a/kernel/net/batman-adv/originator.h b/kernel/net/batman-adv/originator.h index aa4a43696..a5c37882b 100644 --- a/kernel/net/batman-adv/originator.h +++ b/kernel/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,24 +18,32 @@ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ #define _NET_BATMAN_ADV_ORIGINATOR_H_ +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include + #include "hash.h" +struct seq_file; + int batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); -void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, - const uint8_t *addr); + const u8 *addr); struct batadv_neigh_node * -batadv_neigh_node_get(const struct batadv_orig_node *orig_node, - const struct batadv_hard_iface *hard_iface, - const uint8_t *addr); -struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const uint8_t *neigh_addr, - struct batadv_orig_node *orig_node); +batadv_neigh_node_new(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr); void batadv_neigh_node_free_ref(struct 
batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, @@ -73,22 +81,11 @@ void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan); /* hashfunction to choose an entry in a hash table of given size * hash algorithm from http://en.wikipedia.org/wiki/Hash_table */ -static inline uint32_t batadv_choose_orig(const void *data, uint32_t size) +static inline u32 batadv_choose_orig(const void *data, u32 size) { - const unsigned char *key = data; - uint32_t hash = 0; - size_t i; - - for (i = 0; i < 6; i++) { - hash += key[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + u32 hash = 0; + hash = jhash(data, ETH_ALEN, hash); return hash % size; } diff --git a/kernel/net/batman-adv/packet.h b/kernel/net/batman-adv/packet.h index b81fbbf21..11f996b39 100644 --- a/kernel/net/batman-adv/packet.h +++ b/kernel/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,9 @@ #ifndef _NET_BATMAN_ADV_PACKET_H_ #define _NET_BATMAN_ADV_PACKET_H_ +#include +#include + /** * enum batadv_packettype - types for batman-adv encapsulated packets * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. 
IV @@ -194,8 +197,8 @@ enum batadv_tvlv_type { * transport the claim type and the group id */ struct batadv_bla_claim_dst { - uint8_t magic[3]; /* FF:43:05 */ - uint8_t type; /* bla_claimframe */ + u8 magic[3]; /* FF:43:05 */ + u8 type; /* bla_claimframe */ __be16 group; /* group id */ }; @@ -210,16 +213,16 @@ struct batadv_bla_claim_dst { * @tvlv_len: length of tvlv data following the ogm header */ struct batadv_ogm_packet { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t flags; - __be32 seqno; - uint8_t orig[ETH_ALEN]; - uint8_t prev_sender[ETH_ALEN]; - uint8_t reserved; - uint8_t tq; - __be16 tvlv_len; + u8 packet_type; + u8 version; + u8 ttl; + u8 flags; + __be32 seqno; + u8 orig[ETH_ALEN]; + u8 prev_sender[ETH_ALEN]; + u8 reserved; + u8 tq; + __be16 tvlv_len; /* __packed is not needed as the struct size is divisible by 4, * and the largest data type in this struct has a size of 4. */ @@ -243,14 +246,14 @@ struct batadv_ogm_packet { * members are padded the same way as they are in real packets. 
*/ struct batadv_icmp_header { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t msg_type; /* see ICMP message types above */ - uint8_t dst[ETH_ALEN]; - uint8_t orig[ETH_ALEN]; - uint8_t uid; - uint8_t align[3]; + u8 packet_type; + u8 version; + u8 ttl; + u8 msg_type; /* see ICMP message types above */ + u8 dst[ETH_ALEN]; + u8 orig[ETH_ALEN]; + u8 uid; + u8 align[3]; }; /** @@ -266,15 +269,15 @@ struct batadv_icmp_header { * @seqno: ICMP sequence number */ struct batadv_icmp_packet { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t msg_type; /* see ICMP message types above */ - uint8_t dst[ETH_ALEN]; - uint8_t orig[ETH_ALEN]; - uint8_t uid; - uint8_t reserved; - __be16 seqno; + u8 packet_type; + u8 version; + u8 ttl; + u8 msg_type; /* see ICMP message types above */ + u8 dst[ETH_ALEN]; + u8 orig[ETH_ALEN]; + u8 uid; + u8 reserved; + __be16 seqno; }; #define BATADV_RR_LEN 16 @@ -293,16 +296,16 @@ struct batadv_icmp_packet { * @rr: route record array */ struct batadv_icmp_packet_rr { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t msg_type; /* see ICMP message types above */ - uint8_t dst[ETH_ALEN]; - uint8_t orig[ETH_ALEN]; - uint8_t uid; - uint8_t rr_cur; - __be16 seqno; - uint8_t rr[BATADV_RR_LEN][ETH_ALEN]; + u8 packet_type; + u8 version; + u8 ttl; + u8 msg_type; /* see ICMP message types above */ + u8 dst[ETH_ALEN]; + u8 orig[ETH_ALEN]; + u8 uid; + u8 rr_cur; + __be16 seqno; + u8 rr[BATADV_RR_LEN][ETH_ALEN]; }; #define BATADV_ICMP_MAX_PACKET_SIZE sizeof(struct batadv_icmp_packet_rr) @@ -328,11 +331,11 @@ struct batadv_icmp_packet_rr { * @dest: originator destination of the unicast packet */ struct batadv_unicast_packet { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t ttvn; /* destination translation table version number */ - uint8_t dest[ETH_ALEN]; + u8 packet_type; + u8 version; + u8 ttl; + u8 ttvn; /* destination translation table version number */ + u8 dest[ETH_ALEN]; /* "4 bytes 
boundary + 2 bytes" long to make the payload after the * following ethernet header again 4 bytes boundary aligned */ @@ -346,9 +349,9 @@ struct batadv_unicast_packet { */ struct batadv_unicast_4addr_packet { struct batadv_unicast_packet u; - uint8_t src[ETH_ALEN]; - uint8_t subtype; - uint8_t reserved; + u8 src[ETH_ALEN]; + u8 subtype; + u8 reserved; /* "4 bytes boundary + 2 bytes" long to make the payload after the * following ethernet header again 4 bytes boundary aligned */ @@ -367,22 +370,22 @@ struct batadv_unicast_4addr_packet { * @total_size: size of the merged packet */ struct batadv_frag_packet { - uint8_t packet_type; - uint8_t version; /* batman version field */ - uint8_t ttl; + u8 packet_type; + u8 version; /* batman version field */ + u8 ttl; #if defined(__BIG_ENDIAN_BITFIELD) - uint8_t no:4; - uint8_t reserved:4; + u8 no:4; + u8 reserved:4; #elif defined(__LITTLE_ENDIAN_BITFIELD) - uint8_t reserved:4; - uint8_t no:4; + u8 reserved:4; + u8 no:4; #else #error "unknown bitfield endianness" #endif - uint8_t dest[ETH_ALEN]; - uint8_t orig[ETH_ALEN]; - __be16 seqno; - __be16 total_size; + u8 dest[ETH_ALEN]; + u8 orig[ETH_ALEN]; + __be16 seqno; + __be16 total_size; }; /** @@ -395,12 +398,12 @@ struct batadv_frag_packet { * @orig: originator of the broadcast packet */ struct batadv_bcast_packet { - uint8_t packet_type; - uint8_t version; /* batman version field */ - uint8_t ttl; - uint8_t reserved; - __be32 seqno; - uint8_t orig[ETH_ALEN]; + u8 packet_type; + u8 version; /* batman version field */ + u8 ttl; + u8 reserved; + __be32 seqno; + u8 orig[ETH_ALEN]; /* "4 bytes boundary + 2 bytes" long to make the payload after the * following ethernet header again 4 bytes boundary aligned */ @@ -425,21 +428,21 @@ struct batadv_bcast_packet { * @coded_len: length of network coded part of the payload */ struct batadv_coded_packet { - uint8_t packet_type; - uint8_t version; /* batman version field */ - uint8_t ttl; - uint8_t first_ttvn; - /* uint8_t 
first_dest[ETH_ALEN]; - saved in mac header destination */ - uint8_t first_source[ETH_ALEN]; - uint8_t first_orig_dest[ETH_ALEN]; - __be32 first_crc; - uint8_t second_ttl; - uint8_t second_ttvn; - uint8_t second_dest[ETH_ALEN]; - uint8_t second_source[ETH_ALEN]; - uint8_t second_orig_dest[ETH_ALEN]; - __be32 second_crc; - __be16 coded_len; + u8 packet_type; + u8 version; /* batman version field */ + u8 ttl; + u8 first_ttvn; + /* u8 first_dest[ETH_ALEN]; - saved in mac header destination */ + u8 first_source[ETH_ALEN]; + u8 first_orig_dest[ETH_ALEN]; + __be32 first_crc; + u8 second_ttl; + u8 second_ttvn; + u8 second_dest[ETH_ALEN]; + u8 second_source[ETH_ALEN]; + u8 second_orig_dest[ETH_ALEN]; + __be32 second_crc; + __be16 coded_len; }; #pragma pack() @@ -456,14 +459,14 @@ struct batadv_coded_packet { * @align: 2 bytes to align the header to a 4 byte boundary */ struct batadv_unicast_tvlv_packet { - uint8_t packet_type; - uint8_t version; /* batman version field */ - uint8_t ttl; - uint8_t reserved; - uint8_t dst[ETH_ALEN]; - uint8_t src[ETH_ALEN]; - __be16 tvlv_len; - uint16_t align; + u8 packet_type; + u8 version; /* batman version field */ + u8 ttl; + u8 reserved; + u8 dst[ETH_ALEN]; + u8 src[ETH_ALEN]; + __be16 tvlv_len; + u16 align; }; /** @@ -473,9 +476,9 @@ struct batadv_unicast_tvlv_packet { * @len: tvlv container length */ struct batadv_tvlv_hdr { - uint8_t type; - uint8_t version; - __be16 len; + u8 type; + u8 version; + __be16 len; }; /** @@ -497,9 +500,9 @@ struct batadv_tvlv_gateway_data { * one batadv_tvlv_tt_vlan_data object per announced vlan */ struct batadv_tvlv_tt_data { - uint8_t flags; - uint8_t ttvn; - __be16 num_vlan; + u8 flags; + u8 ttvn; + __be16 num_vlan; }; /** @@ -510,9 +513,9 @@ struct batadv_tvlv_tt_data { * @reserved: unused, useful for alignment purposes */ struct batadv_tvlv_tt_vlan_data { - __be32 crc; - __be16 vid; - uint16_t reserved; + __be32 crc; + __be16 vid; + u16 reserved; }; /** @@ -524,9 +527,9 @@ struct 
batadv_tvlv_tt_vlan_data { * @vid: VLAN identifier */ struct batadv_tvlv_tt_change { - uint8_t flags; - uint8_t reserved[3]; - uint8_t addr[ETH_ALEN]; + u8 flags; + u8 reserved[3]; + u8 addr[ETH_ALEN]; __be16 vid; }; @@ -536,7 +539,7 @@ struct batadv_tvlv_tt_change { * @vid: VLAN identifier */ struct batadv_tvlv_roam_adv { - uint8_t client[ETH_ALEN]; + u8 client[ETH_ALEN]; __be16 vid; }; @@ -546,8 +549,8 @@ struct batadv_tvlv_roam_adv { * @reserved: reserved field */ struct batadv_tvlv_mcast_data { - uint8_t flags; - uint8_t reserved[3]; + u8 flags; + u8 reserved[3]; }; #endif /* _NET_BATMAN_ADV_PACKET_H_ */ diff --git a/kernel/net/batman-adv/routing.c b/kernel/net/batman-adv/routing.c index da83982bf..3207667e6 100644 --- a/kernel/net/batman-adv/routing.c +++ b/kernel/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,20 +15,36 @@ * along with this program; if not, see . */ -#include "main.h" #include "routing.h" -#include "send.h" -#include "soft-interface.h" -#include "hard-interface.h" -#include "icmp_socket.h" -#include "translation-table.h" -#include "originator.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitarray.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" -#include "network-coding.h" #include "fragmentation.h" - -#include +#include "hard-interface.h" +#include "icmp_socket.h" +#include "network-coding.h" +#include "originator.h" +#include "packet.h" +#include "send.h" +#include "soft-interface.h" +#include "translation-table.h" static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); @@ -129,7 +145,7 @@ out: * 0 if the packet is to be accepted * 1 if the packet is to be ignored. 
*/ -int batadv_window_protected(struct batadv_priv *bat_priv, int32_t seq_num_diff, +int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, unsigned long *last_reset) { if (seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE || @@ -637,19 +653,19 @@ out: static bool batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct batadv_unicast_packet *unicast_packet, - uint8_t *dst_addr, unsigned short vid) + u8 *dst_addr, unsigned short vid) { struct batadv_orig_node *orig_node = NULL; struct batadv_hard_iface *primary_if = NULL; bool ret = false; - uint8_t *orig_addr, orig_ttvn; + u8 *orig_addr, orig_ttvn; if (batadv_is_my_client(bat_priv, dst_addr, vid)) { primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; orig_addr = primary_if->net_dev->dev_addr; - orig_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn); + orig_ttvn = (u8)atomic_read(&bat_priv->tt.vn); } else { orig_node = batadv_transtable_search(bat_priv, NULL, dst_addr, vid); @@ -660,7 +676,7 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, goto out; orig_addr = orig_node->orig; - orig_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn); + orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn); } /* update the packet header */ @@ -682,7 +698,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, struct batadv_unicast_packet *unicast_packet; struct batadv_hard_iface *primary_if; struct batadv_orig_node *orig_node; - uint8_t curr_ttvn, old_ttvn; + u8 curr_ttvn, old_ttvn; struct ethhdr *ethhdr; unsigned short vid; int is_old_ttvn; @@ -724,7 +740,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * value is used later to check if the node which sent (or re-routed * last time) the packet had an updated information or not */ - curr_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn); + curr_ttvn = (u8)atomic_read(&bat_priv->tt.vn); if (!batadv_is_my_mac(bat_priv, unicast_packet->dest)) { orig_node = batadv_orig_hash_find(bat_priv, 
unicast_packet->dest); @@ -735,7 +751,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, if (!orig_node) return 0; - curr_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn); + curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn); batadv_orig_node_free_ref(orig_node); } @@ -817,9 +833,10 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); struct batadv_unicast_packet *unicast_packet; struct batadv_unicast_4addr_packet *unicast_4addr_packet; - uint8_t *orig_addr; + u8 *orig_addr; struct batadv_orig_node *orig_node = NULL; int check, hdr_size = sizeof(*unicast_packet); + enum batadv_subtype subtype; bool is4addr; unicast_packet = (struct batadv_unicast_packet *)skb->data; @@ -847,10 +864,20 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, /* packet for me */ if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) { if (is4addr) { - batadv_dat_inc_counter(bat_priv, - unicast_4addr_packet->subtype); - orig_addr = unicast_4addr_packet->src; - orig_node = batadv_orig_hash_find(bat_priv, orig_addr); + subtype = unicast_4addr_packet->subtype; + batadv_dat_inc_counter(bat_priv, subtype); + + /* Only payload data should be considered for speedy + * join. For example, DAT also uses unicast 4addr + * types, but those packets should not be considered + * for speedy join, since the clients do not actually + * reside at the sending originator. 
+ */ + if (subtype == BATADV_P_DATA) { + orig_addr = unicast_4addr_packet->src; + orig_node = batadv_orig_hash_find(bat_priv, + orig_addr); + } } if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb, @@ -888,7 +915,7 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; unsigned char *tvlv_buff; - uint16_t tvlv_buff_len; + u16 tvlv_buff_len; int hdr_size = sizeof(*unicast_tvlv_packet); int ret = NET_RX_DROP; @@ -991,8 +1018,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, struct ethhdr *ethhdr; int hdr_size = sizeof(*bcast_packet); int ret = NET_RX_DROP; - int32_t seq_diff; - uint32_t seqno; + s32 seq_diff; + u32 seqno; /* drop packet if it has not necessary minimum size */ if (unlikely(!pskb_may_pull(skb, hdr_size))) diff --git a/kernel/net/batman-adv/routing.h b/kernel/net/batman-adv/routing.h index 557d3d12a..204bbe495 100644 --- a/kernel/net/batman-adv/routing.h +++ b/kernel/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,12 @@ #ifndef _NET_BATMAN_ADV_ROUTING_H_ #define _NET_BATMAN_ADV_ROUTING_H_ +#include "main.h" + +#include + +struct sk_buff; + bool batadv_check_management_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, int header_len); @@ -45,7 +51,7 @@ struct batadv_neigh_node * batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); -int batadv_window_protected(struct batadv_priv *bat_priv, int32_t seq_num_diff, +int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, unsigned long *last_reset); #endif /* _NET_BATMAN_ADV_ROUTING_H_ */ diff --git a/kernel/net/batman-adv/send.c b/kernel/net/batman-adv/send.c index 3d64ed20c..f66432480 100644 --- a/kernel/net/batman-adv/send.c +++ b/kernel/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,19 +15,37 @@ * along with this program; if not, see . 
*/ +#include "send.h" #include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "distributed-arp-table.h" -#include "send.h" -#include "routing.h" -#include "translation-table.h" -#include "soft-interface.h" -#include "hard-interface.h" -#include "gateway_common.h" +#include "fragmentation.h" #include "gateway_client.h" -#include "originator.h" +#include "hard-interface.h" #include "network-coding.h" -#include "fragmentation.h" -#include "multicast.h" +#include "originator.h" +#include "routing.h" +#include "soft-interface.h" +#include "translation-table.h" static void batadv_send_outstanding_bcast_packet(struct work_struct *work); @@ -36,7 +54,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work); */ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, - const uint8_t *dst_addr) + const u8 *dst_addr) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct ethhdr *ethhdr; @@ -154,7 +172,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_unicast_packet *unicast_packet; - uint8_t ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn); + u8 ttvn = (u8)atomic_read(&orig_node->last_ttvn); if (batadv_skb_head_push(skb, hdr_size) < 0) return false; @@ -255,8 +273,8 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, unsigned short vid) { - struct ethhdr *ethhdr; struct batadv_unicast_packet *unicast_packet; + struct ethhdr *ethhdr; int ret = NET_XMIT_DROP; if (!orig_node) @@ -325,12 +343,12 @@ out: */ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, - int packet_subtype, uint8_t *dst_hint, + int packet_subtype, u8 *dst_hint, unsigned short vid) { struct ethhdr *ethhdr = 
(struct ethhdr *)skb->data; struct batadv_orig_node *orig_node; - uint8_t *src, *dst; + u8 *src, *dst; src = ethhdr->h_source; dst = ethhdr->h_dest; @@ -598,7 +616,8 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, * we delete only packets belonging to the given interface */ if ((hard_iface) && - (forw_packet->if_incoming != hard_iface)) + (forw_packet->if_incoming != hard_iface) && + (forw_packet->if_outgoing != hard_iface)) continue; spin_unlock_bh(&bat_priv->forw_bcast_list_lock); diff --git a/kernel/net/batman-adv/send.h b/kernel/net/batman-adv/send.h index 38d0ec183..82059f259 100644 --- a/kernel/net/batman-adv/send.h +++ b/kernel/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,9 +18,19 @@ #ifndef _NET_BATMAN_ADV_SEND_H_ #define _NET_BATMAN_ADV_SEND_H_ +#include "main.h" + +#include +#include + +#include "packet.h" + +struct sk_buff; +struct work_struct; + int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, - const uint8_t *dst_addr); + const u8 *dst_addr); int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); @@ -43,7 +53,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, unsigned short vid); int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, - int packet_subtype, uint8_t *dst_hint, + int packet_subtype, u8 *dst_hint, unsigned short vid); int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid); @@ -62,7 +72,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. 
*/ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, - struct sk_buff *skb, uint8_t *dst_hint, + struct sk_buff *skb, u8 *dst_hint, unsigned short vid) { return batadv_send_skb_via_tt_generic(bat_priv, skb, BATADV_UNICAST, 0, @@ -87,7 +97,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_subtype, - uint8_t *dst_hint, + u8 *dst_hint, unsigned short vid) { return batadv_send_skb_via_tt_generic(bat_priv, skb, diff --git a/kernel/net/batman-adv/soft-interface.c b/kernel/net/batman-adv/soft-interface.c index 5ec31d7de..ac4d08de5 100644 --- a/kernel/net/batman-adv/soft-interface.c +++ b/kernel/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,26 +15,50 @@ * along with this program; if not, see . 
*/ -#include "main.h" #include "soft-interface.h" -#include "hard-interface.h" -#include "distributed-arp-table.h" -#include "routing.h" -#include "send.h" -#include "debugfs.h" -#include "translation-table.h" -#include "hash.h" -#include "gateway_common.h" -#include "gateway_client.h" -#include "sysfs.h" -#include "originator.h" -#include -#include +#include "main.h" + +#include +#include +#include +#include +#include #include +#include +#include +#include #include -#include "multicast.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "bridge_loop_avoidance.h" +#include "debugfs.h" +#include "distributed-arp-table.h" +#include "gateway_client.h" +#include "gateway_common.h" +#include "hard-interface.h" +#include "multicast.h" #include "network-coding.h" +#include "packet.h" +#include "send.h" +#include "sysfs.h" +#include "translation-table.h" static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd); static void batadv_get_drvinfo(struct net_device *dev, @@ -105,8 +129,9 @@ static struct net_device_stats *batadv_interface_stats(struct net_device *dev) static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; - uint8_t old_addr[ETH_ALEN]; + u8 old_addr[ETH_ALEN]; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; @@ -115,12 +140,17 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) ether_addr_copy(dev->dev_addr, addr->sa_data); /* only modify transtable if it has been initialized before */ - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE) { - batadv_tt_local_remove(bat_priv, old_addr, BATADV_NO_FLAGS, + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + return 0; + + rcu_read_lock(); + 
hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { + batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); - batadv_tt_local_add(dev, addr->sa_data, BATADV_NO_FLAGS, + batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } + rcu_read_unlock(); return 0; } @@ -156,22 +186,23 @@ static int batadv_interface_tx(struct sk_buff *skb, struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; __be16 ethertype = htons(ETH_P_BATMAN); - static const uint8_t stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, - 0x00, 0x00}; - static const uint8_t ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, - 0x00, 0x00}; + static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, + 0x00, 0x00}; + static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, + 0x00, 0x00}; enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; - uint8_t *dst_hint = NULL, chaddr[ETH_ALEN]; + u8 *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; unsigned long brd_delay = 1; bool do_bcast = false, client_added; unsigned short vid; - uint32_t seqno; + u32 seqno; int gw_mode; enum batadv_forw_mode forw_mode; struct batadv_orig_node *mcast_single_orig = NULL; + int network_offset = ETH_HLEN; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; @@ -184,14 +215,18 @@ static int batadv_interface_tx(struct sk_buff *skb, case ETH_P_8021Q: vhdr = vlan_eth_hdr(skb); - if (vhdr->h_vlan_encapsulated_proto != ethertype) + if (vhdr->h_vlan_encapsulated_proto != ethertype) { + network_offset += VLAN_HLEN; break; + } /* fall through */ case ETH_P_BATMAN: goto dropped; } + skb_set_network_header(skb, network_offset); + if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; @@ -449,6 +484,9 @@ out: */ void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) { + if (!vlan) + return; + if 
(atomic_dec_and_test(&vlan->refcount)) { spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); hlist_del_rcu(&vlan->list); @@ -712,9 +750,9 @@ static void batadv_softif_destroy_finish(struct work_struct *work) static int batadv_softif_init_late(struct net_device *dev) { struct batadv_priv *bat_priv; - uint32_t random_seqno; + u32 random_seqno; int ret; - size_t cnt_len = sizeof(uint64_t) * BATADV_CNT_NUM; + size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; batadv_set_lockdep_class(dev); @@ -725,14 +763,14 @@ static int batadv_softif_init_late(struct net_device *dev) /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called */ - bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(uint64_t)); + bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64)); if (!bat_priv->bat_counters) return -ENOMEM; atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); #ifdef CONFIG_BATMAN_ADV_BLA - atomic_set(&bat_priv->bridge_loop_avoidance, 0); + atomic_set(&bat_priv->bridge_loop_avoidance, 1); #endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); @@ -818,7 +856,7 @@ static int batadv_softif_slave_add(struct net_device *dev, int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); - if (!hard_iface || hard_iface->soft_iface != NULL) + if (!hard_iface || hard_iface->soft_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev->name); @@ -903,14 +941,12 @@ static void batadv_softif_init_early(struct net_device *dev) dev->netdev_ops = &batadv_netdev_ops; dev->destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; - dev->tx_queue_len = 0; + dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; - /* reserve more space in the skbuff for our header */ - dev->hard_header_len = batadv_max_header_len(); /* generate random address */ 
eth_hw_addr_random(dev); @@ -1079,8 +1115,7 @@ static const struct { #endif }; -static void batadv_get_strings(struct net_device *dev, uint32_t stringset, - uint8_t *data) +static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) { if (stringset == ETH_SS_STATS) memcpy(data, batadv_counters_strings, @@ -1088,8 +1123,7 @@ static void batadv_get_strings(struct net_device *dev, uint32_t stringset, } static void batadv_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, - uint64_t *data) + struct ethtool_stats *stats, u64 *data) { struct batadv_priv *bat_priv = netdev_priv(dev); int i; diff --git a/kernel/net/batman-adv/soft-interface.h b/kernel/net/batman-adv/soft-interface.h index dbab22fd8..8e82176f4 100644 --- a/kernel/net/batman-adv/soft-interface.h +++ b/kernel/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -18,6 +18,13 @@ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #define _NET_BATMAN_ADV_SOFT_INTERFACE_H_ +#include "main.h" + +#include + +struct net_device; +struct sk_buff; + int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, struct batadv_hard_iface *recv_if, diff --git a/kernel/net/batman-adv/sysfs.c b/kernel/net/batman-adv/sysfs.c index a75dc12f9..9de3c8804 100644 --- a/kernel/net/batman-adv/sysfs.c +++ b/kernel/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -15,16 +15,35 @@ * along with this program; if not, see . 
*/ -#include "main.h" #include "sysfs.h" -#include "translation-table.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "distributed-arp-table.h" -#include "network-coding.h" -#include "originator.h" +#include "gateway_client.h" +#include "gateway_common.h" #include "hard-interface.h" +#include "network-coding.h" +#include "packet.h" #include "soft-interface.h" -#include "gateway_common.h" -#include "gateway_client.h" static struct net_device *batadv_kobj_to_netdev(struct kobject *obj) { @@ -151,7 +170,7 @@ ssize_t batadv_show_##_name(struct kobject *kobj, \ static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ batadv_store_##_name) -#define BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func) \ +#define BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func) \ ssize_t batadv_store_##_name(struct kobject *kobj, \ struct attribute *attr, char *buff, \ size_t count) \ @@ -161,24 +180,24 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ \ return __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ - &bat_priv->_name, net_dev); \ + &bat_priv->_var, net_dev); \ } -#define BATADV_ATTR_SIF_SHOW_UINT(_name) \ +#define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ ssize_t batadv_show_##_name(struct kobject *kobj, \ struct attribute *attr, char *buff) \ { \ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \ \ - return sprintf(buff, "%i\n", atomic_read(&bat_priv->_name)); \ + return sprintf(buff, "%i\n", atomic_read(&bat_priv->_var)); \ } \ /* Use this, if you are going to set [name] in the soft-interface * (bat_priv) to an unsigned integer value */ -#define BATADV_ATTR_SIF_UINT(_name, _mode, _min, _max, _post_func) \ - static BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func)\ - static BATADV_ATTR_SIF_SHOW_UINT(_name) \ +#define BATADV_ATTR_SIF_UINT(_name, 
_var, _mode, _min, _max, _post_func)\ + static BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func)\ + static BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ batadv_store_##_name) @@ -438,7 +457,7 @@ static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, struct attribute *attr, char *buff) { struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - uint32_t down, up; + u32 down, up; down = atomic_read(&bat_priv->gw.bandwidth_down); up = atomic_read(&bat_priv->gw.bandwidth_up); @@ -493,7 +512,7 @@ static ssize_t batadv_store_isolation_mark(struct kobject *kobj, { struct net_device *net_dev = batadv_kobj_to_netdev(kobj); struct batadv_priv *bat_priv = netdev_priv(net_dev); - uint32_t mark, mask; + u32 mark, mask; char *mask_ptr; /* parse the mask if it has been specified, otherwise assume the mask is @@ -540,19 +559,20 @@ BATADV_ATTR_SIF_BOOL(fragmentation, S_IRUGO | S_IWUSR, batadv_update_min_mtu); static BATADV_ATTR(routing_algo, S_IRUGO, batadv_show_bat_algo, NULL); static BATADV_ATTR(gw_mode, S_IRUGO | S_IWUSR, batadv_show_gw_mode, batadv_store_gw_mode); -BATADV_ATTR_SIF_UINT(orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER, - INT_MAX, NULL); -BATADV_ATTR_SIF_UINT(hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE, - NULL); -BATADV_ATTR_SIF_UINT(gw_sel_class, S_IRUGO | S_IWUSR, 1, BATADV_TQ_MAX_VALUE, - batadv_post_gw_reselect); +BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR, + 2 * BATADV_JITTER, INT_MAX, NULL); +BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0, + BATADV_TQ_MAX_VALUE, NULL); +BATADV_ATTR_SIF_UINT(gw_sel_class, gw_sel_class, S_IRUGO | S_IWUSR, 1, + BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect); static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, batadv_store_gw_bwidth); #ifdef CONFIG_BATMAN_ADV_MCAST BATADV_ATTR_SIF_BOOL(multicast_mode, S_IRUGO | S_IWUSR, NULL); #endif #ifdef CONFIG_BATMAN_ADV_DEBUG 
-BATADV_ATTR_SIF_UINT(log_level, S_IRUGO | S_IWUSR, 0, BATADV_DBG_ALL, NULL); +BATADV_ATTR_SIF_UINT(log_level, log_level, S_IRUGO | S_IWUSR, 0, + BATADV_DBG_ALL, NULL); #endif #ifdef CONFIG_BATMAN_ADV_NC BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR, diff --git a/kernel/net/batman-adv/sysfs.h b/kernel/net/batman-adv/sysfs.h index b715b60db..61974428a 100644 --- a/kernel/net/batman-adv/sysfs.h +++ b/kernel/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -18,6 +18,14 @@ #ifndef _NET_BATMAN_ADV_SYSFS_H_ #define _NET_BATMAN_ADV_SYSFS_H_ +#include "main.h" + +#include +#include + +struct kobject; +struct net_device; + #define BATADV_SYSFS_IF_MESH_SUBDIR "mesh" #define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv" /** diff --git a/kernel/net/batman-adv/translation-table.c b/kernel/net/batman-adv/translation-table.c index 07b263a43..83b0ca27a 100644 --- a/kernel/net/batman-adv/translation-table.c +++ b/kernel/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -15,24 +15,48 @@ * along with this program; if not, see . 
*/ -#include "main.h" #include "translation-table.h" -#include "soft-interface.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bridge_loop_avoidance.h" #include "hard-interface.h" -#include "send.h" #include "hash.h" -#include "originator.h" -#include "routing.h" -#include "bridge_loop_avoidance.h" #include "multicast.h" - -#include +#include "originator.h" +#include "packet.h" +#include "soft-interface.h" /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; static struct lock_class_key batadv_tt_global_hash_lock_class_key; -static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client, +static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node); static void batadv_tt_purge(struct work_struct *work); @@ -44,13 +68,15 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, unsigned short vid, const char *message, bool roaming); -/* returns 1 if they are the same mac addr */ +/* returns 1 if they are the same mac addr and vid */ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, hash_entry); + const struct batadv_tt_common_entry *tt1 = data1; + const struct batadv_tt_common_entry *tt2 = data2; - return batadv_compare_eth(data1, data2); + return (tt1->vid == tt2->vid) && batadv_compare_eth(data1, data2); } /** @@ -61,18 +87,14 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) * Returns the hash index where the object represented by 'data' should be * stored at. 
*/ -static inline uint32_t batadv_choose_tt(const void *data, uint32_t size) +static inline u32 batadv_choose_tt(const void *data, u32 size) { struct batadv_tt_common_entry *tt; - uint32_t hash = 0; + u32 hash = 0; tt = (struct batadv_tt_common_entry *)data; - hash = batadv_hash_bytes(hash, &tt->addr, ETH_ALEN); - hash = batadv_hash_bytes(hash, &tt->vid, sizeof(tt->vid)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&tt->addr, ETH_ALEN, hash); + hash = jhash(&tt->vid, sizeof(tt->vid), hash); return hash % size; } @@ -87,12 +109,12 @@ static inline uint32_t batadv_choose_tt(const void *data, uint32_t size) * found, NULL otherwise. */ static struct batadv_tt_common_entry * -batadv_tt_hash_find(struct batadv_hashtable *hash, const uint8_t *addr, +batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, unsigned short vid) { struct hlist_head *head; struct batadv_tt_common_entry to_search, *tt, *tt_tmp = NULL; - uint32_t index; + u32 index; if (!hash) return NULL; @@ -132,7 +154,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const uint8_t *addr, * found, NULL otherwise. */ static struct batadv_tt_local_entry * -batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr, +batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; @@ -157,7 +179,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr, * is found, NULL otherwise. */ static struct batadv_tt_global_entry * -batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr, +batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; @@ -203,7 +225,7 @@ batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry) * (excluding ourself). 
*/ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid) + const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; int count; @@ -218,20 +240,6 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, return count; } -static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_orig_list_entry *orig_entry; - - orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); - - /* We are in an rcu callback here, therefore we cannot use - * batadv_orig_node_free_ref() and its call_rcu(): - * An rcu_barrier() wouldn't wait for that to finish - */ - batadv_orig_node_free_ref_now(orig_entry->orig_node); - kfree(orig_entry); -} - /** * batadv_tt_local_size_mod - change the size by v of the local table identified * by vid @@ -295,7 +303,7 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); - list_del_rcu(&vlan->list); + hlist_del_init_rcu(&vlan->list); spin_unlock_bh(&orig_node->vlan_list_lock); batadv_orig_node_vlan_free_ref(vlan); } @@ -327,13 +335,25 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, batadv_tt_global_size_mod(orig_node, vid, -1); } +/** + * batadv_tt_orig_list_entry_release - release tt orig entry from lists and + * queue for free after rcu grace period + * @orig_entry: tt orig entry to be free'd + */ +static void +batadv_tt_orig_list_entry_release(struct batadv_tt_orig_list_entry *orig_entry) +{ + batadv_orig_node_free_ref(orig_entry->orig_node); + kfree_rcu(orig_entry, rcu); +} + static void batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) { if (!atomic_dec_and_test(&orig_entry->refcount)) return; - call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); + batadv_tt_orig_list_entry_release(orig_entry); } /** @@ -344,11 +364,11 @@ 
batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) */ static void batadv_tt_local_event(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, - uint8_t event_flags) + u8 event_flags) { struct batadv_tt_change_node *tt_change_node, *entry, *safe; struct batadv_tt_common_entry *common = &tt_local_entry->common; - uint8_t flags = common->flags | event_flags; + u8 flags = common->flags | event_flags; bool event_removed = false; bool del_op_requested, del_op_entry; @@ -428,7 +448,7 @@ static int batadv_tt_len(int changes_num) * * Returns the number of entries. */ -static uint16_t batadv_tt_entries(uint16_t tt_len) +static u16 batadv_tt_entries(u16 tt_len) { return tt_len / batadv_tt_len(1); } @@ -442,7 +462,8 @@ static uint16_t batadv_tt_entries(uint16_t tt_len) */ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { - uint16_t num_vlan = 0, tt_local_entries = 0; + u16 num_vlan = 0; + u16 tt_local_entries = 0; struct batadv_softif_vlan *vlan; int hdr_size; @@ -505,8 +526,8 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, * * Returns true if the client was successfully added, false otherwise. 
*/ -bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, - unsigned short vid, int ifindex, uint32_t mark) +bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, + unsigned short vid, int ifindex, u32 mark) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_tt_local_entry *tt_local; @@ -516,9 +537,10 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry; int hash_added, table_size, packet_size_max; - bool ret = false, roamed_back = false; - uint8_t remote_flags; - uint32_t match_mark; + bool ret = false; + bool roamed_back = false; + u8 remote_flags; + u32 match_mark; if (ifindex != BATADV_NULL_IFINDEX) in_dev = dev_get_by_index(&init_net, ifindex); @@ -575,11 +597,17 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, /* increase the refcounter of the related vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); + if (WARN(!vlan, "adding TT local entry %pM to non-existent VLAN %d", + addr, BATADV_PRINT_VID(vid))) { + kfree(tt_local); + tt_local = NULL; + goto out; + } batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n", addr, BATADV_PRINT_VID(vid), - (uint8_t)atomic_read(&bat_priv->tt.vn)); + (u8)atomic_read(&bat_priv->tt.vn)); ether_addr_copy(tt_local->common.addr, addr); /* The local entry has to be marked as NEW to avoid to send it in @@ -698,19 +726,22 @@ out: * * Return the size of the allocated buffer or 0 in case of failure. 
*/ -static uint16_t +static u16 batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, - int32_t *tt_len) + s32 *tt_len) { - uint16_t num_vlan = 0, num_entries = 0, change_offset, tvlv_len; + u16 num_vlan = 0; + u16 num_entries = 0; + u16 change_offset; + u16 tvlv_len; struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_orig_node_vlan *vlan; - uint8_t *tt_change_ptr; + u8 *tt_change_ptr; rcu_read_lock(); - list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { + hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { num_vlan++; num_entries += atomic_read(&vlan->tt.num_entries); } @@ -736,14 +767,14 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); - list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { + hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); tt_vlan++; } - tt_change_ptr = (uint8_t *)*tt_data + change_offset; + tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: @@ -769,16 +800,18 @@ out: * * Return the size of the allocated buffer or 0 in case of failure. 
*/ -static uint16_t +static u16 batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, - int32_t *tt_len) + s32 *tt_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_softif_vlan *vlan; - uint16_t num_vlan = 0, num_entries = 0, tvlv_len; - uint8_t *tt_change_ptr; + u16 num_vlan = 0; + u16 num_entries = 0; + u16 tvlv_len; + u8 *tt_change_ptr; int change_offset; rcu_read_lock(); @@ -815,7 +848,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, tt_vlan++; } - tt_change_ptr = (uint8_t *)*tt_data + change_offset; + tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: @@ -834,8 +867,9 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) struct batadv_tvlv_tt_data *tt_data; struct batadv_tvlv_tt_change *tt_change; int tt_diff_len, tt_change_len = 0; - int tt_diff_entries_num = 0, tt_diff_entries_count = 0; - uint16_t tvlv_len; + int tt_diff_entries_num = 0; + int tt_diff_entries_count = 0; + u16 tvlv_len; tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes); tt_diff_len = batadv_tt_len(tt_diff_entries_num); @@ -909,12 +943,12 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) struct batadv_softif_vlan *vlan; struct hlist_head *head; unsigned short vid; - uint32_t i; + u32 i; int last_seen_secs; int last_seen_msecs; unsigned long last_seen_jiffies; bool no_purge; - uint16_t np_flag = BATADV_TT_CLIENT_NOPURGE; + u16 np_flag = BATADV_TT_CLIENT_NOPURGE; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) @@ -922,7 +956,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n", - net_dev->name, (uint8_t)atomic_read(&bat_priv->tt.vn)); + net_dev->name, (u8)atomic_read(&bat_priv->tt.vn)); seq_printf(seq, " %-13s %s %-8s 
%-9s (%-10s)\n", "Client", "VID", "Flags", "Last seen", "CRC"); @@ -954,17 +988,17 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", tt_common_entry->addr, BATADV_PRINT_VID(tt_common_entry->vid), - (tt_common_entry->flags & - BATADV_TT_CLIENT_ROAM ? 'R' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), no_purge ? 'P' : '.', - (tt_common_entry->flags & - BATADV_TT_CLIENT_NEW ? 'N' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_PENDING ? 'X' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_WIFI ? 'W' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_NEW) ? 'N' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_PENDING) ? 'X' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), no_purge ? 0 : last_seen_secs, no_purge ? 
0 : last_seen_msecs, vlan->tt.crc); @@ -982,7 +1016,7 @@ out: static void batadv_tt_local_set_pending(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, - uint16_t flags, const char *message) + u16 flags, const char *message) { batadv_tt_local_event(bat_priv, tt_local_entry, flags); @@ -1008,13 +1042,14 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv, * * Returns the flags assigned to the local entry before being deleted */ -uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid, - const char *message, bool roaming) +u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, + unsigned short vid, const char *message, + bool roaming) { struct batadv_tt_local_entry *tt_local_entry; - uint16_t flags, curr_flags = BATADV_NO_FLAGS; + u16 flags, curr_flags = BATADV_NO_FLAGS; struct batadv_softif_vlan *vlan; + void *tt_entry_exists; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) @@ -1042,11 +1077,22 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, * immediately purge it */ batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL); - hlist_del_rcu(&tt_local_entry->common.hash_entry); + + tt_entry_exists = batadv_hash_remove(bat_priv->tt.local_hash, + batadv_compare_tt, + batadv_choose_tt, + &tt_local_entry->common); + if (!tt_entry_exists) + goto out; + + /* extra call to free the local tt entry */ batadv_tt_local_entry_free_ref(tt_local_entry); /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); + if (!vlan) + goto out; + batadv_softif_vlan_free_ref(vlan); batadv_softif_vlan_free_ref(vlan); @@ -1104,7 +1150,7 @@ static void batadv_tt_local_purge(struct batadv_priv *bat_priv, struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ - uint32_t i; + u32 i; for (i = 0; i < 
hash->size; i++) { head = &hash->table[i]; @@ -1125,7 +1171,7 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; - uint32_t i; + u32 i; if (!bat_priv->tt.local_hash) return; @@ -1147,8 +1193,10 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, tt_common_entry->vid); - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + if (vlan) { + batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + } batadv_tt_local_entry_free_ref(tt_local); } @@ -1298,15 +1346,14 @@ out: static bool batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *tt_addr, - unsigned short vid, uint16_t flags, - uint8_t ttvn) + unsigned short vid, u16 flags, u8 ttvn) { struct batadv_tt_global_entry *tt_global_entry; struct batadv_tt_local_entry *tt_local_entry; bool ret = false; int hash_added; struct batadv_tt_common_entry *common; - uint16_t local_flags; + u16 local_flags; /* ignore global entries from backbone nodes */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) @@ -1380,9 +1427,15 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, } /* if the client was temporary added before receiving the first - * OGM announcing it, we have to clear the TEMP flag + * OGM announcing it, we have to clear the TEMP flag. Also, + * remove the previous temporary orig node and re-add it + * if required. If the orig entry changed, the new one which + * is a non-temporary entry is preferred. 
*/ - common->flags &= ~BATADV_TT_CLIENT_TEMP; + if (common->flags & BATADV_TT_CLIENT_TEMP) { + batadv_tt_global_del_orig_list(tt_global_entry); + common->flags &= ~BATADV_TT_CLIENT_TEMP; + } /* the change can carry possible "attribute" flags like the * TT_CLIENT_WIFI, therefore they have to be copied in the @@ -1503,8 +1556,8 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, struct batadv_tt_common_entry *tt_common_entry; struct batadv_orig_node_vlan *vlan; struct hlist_head *head; - uint8_t last_ttvn; - uint16_t flags; + u8 last_ttvn; + u16 flags; tt_common_entry = &tt_global_entry->common; flags = tt_common_entry->flags; @@ -1528,10 +1581,10 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, BATADV_PRINT_VID(tt_global_entry->common.vid), best_entry->ttvn, best_entry->orig_node->orig, last_ttvn, vlan->tt.crc, - (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), - (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), - (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), - (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.')); + ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), + ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), + ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), + ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); batadv_orig_node_vlan_free_ref(vlan); } @@ -1560,10 +1613,10 @@ print_list: BATADV_PRINT_VID(tt_global_entry->common.vid), orig_entry->ttvn, orig_entry->orig_node->orig, last_ttvn, vlan->tt.crc, - (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), - (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), - (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), - (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.')); + ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), + ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), + ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), + ((flags & BATADV_TT_CLIENT_TEMP) ? 
'T' : '.')); batadv_orig_node_vlan_free_ref(vlan); } @@ -1578,7 +1631,7 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) struct batadv_tt_global_entry *tt_global; struct batadv_hard_iface *primary_if; struct hlist_head *head; - uint32_t i; + u32 i; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) @@ -1611,20 +1664,28 @@ out: } /** - * batadv_tt_global_del_orig_entry - remove and free an orig_entry + * _batadv_tt_global_del_orig_entry - remove and free an orig_entry * @tt_global_entry: the global entry to remove the orig_entry from * @orig_entry: the orig entry to remove and free * * Remove an orig_entry from its list in the given tt_global_entry and * free this orig_entry afterwards. + * + * Caller must hold tt_global_entry->list_lock and ensure orig_entry->list is + * part of a list. */ static void -batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, - struct batadv_tt_orig_list_entry *orig_entry) +_batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, + struct batadv_tt_orig_list_entry *orig_entry) { + lockdep_assert_held(&tt_global_entry->list_lock); + batadv_tt_global_size_dec(orig_entry->orig_node, tt_global_entry->common.vid); atomic_dec(&tt_global_entry->orig_list_count); + /* requires holding tt_global_entry->list_lock and orig_entry->list + * being part of a list + */ hlist_del_rcu(&orig_entry->list); batadv_tt_orig_list_entry_free_ref(orig_entry); } @@ -1640,7 +1701,7 @@ batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry) spin_lock_bh(&tt_global_entry->list_lock); head = &tt_global_entry->orig_list; hlist_for_each_entry_safe(orig_entry, safe, head, list) - batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); + _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); spin_unlock_bh(&tt_global_entry->list_lock); } @@ -1675,8 +1736,8 @@ batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv, orig_node->orig, 
tt_global_entry->common.addr, BATADV_PRINT_VID(vid), message); - batadv_tt_global_del_orig_entry(tt_global_entry, - orig_entry); + _batadv_tt_global_del_orig_entry(tt_global_entry, + orig_entry); } } spin_unlock_bh(&tt_global_entry->list_lock); @@ -1798,12 +1859,12 @@ out: */ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - int32_t match_vid, + s32 match_vid, const char *message) { struct batadv_tt_global_entry *tt_global; struct batadv_tt_common_entry *tt_common_entry; - uint32_t i; + u32 i; struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct hlist_node *safe; struct hlist_head *head; @@ -1843,7 +1904,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, } spin_unlock_bh(list_lock); } - orig_node->capa_initialized &= ~BATADV_ORIG_CAPA_HAS_TT; + clear_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global, @@ -1874,7 +1935,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv) struct hlist_head *head; struct hlist_node *node_tmp; spinlock_t *list_lock; /* protects write access to the hash lists */ - uint32_t i; + u32 i; char *msg = NULL; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; @@ -1915,7 +1976,7 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) struct batadv_tt_global_entry *tt_global; struct hlist_node *node_tmp; struct hlist_head *head; - uint32_t i; + u32 i; if (!bat_priv->tt.global_hash) return; @@ -1976,8 +2037,8 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, * If the two clients are AP isolated the function returns NULL. 
*/ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, - const uint8_t *src, - const uint8_t *addr, + const u8 *src, + const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry = NULL; @@ -2045,16 +2106,16 @@ out: * * Returns the checksum of the global table of a given originator. */ -static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - unsigned short vid) +static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; - uint32_t i, crc_tmp, crc = 0; - uint8_t flags; + u32 i, crc_tmp, crc = 0; + u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { @@ -2122,14 +2183,14 @@ static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, * * Returns the checksum of the local table */ -static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv, - unsigned short vid) +static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, + unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct hlist_head *head; - uint32_t i, crc_tmp, crc = 0; - uint8_t flags; + u32 i, crc_tmp, crc = 0; + u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { @@ -2171,12 +2232,13 @@ static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv, static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) { - struct batadv_tt_req_node *node, *safe; + struct batadv_tt_req_node *node; + struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); - list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { - list_del(&node->list); + hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { + hlist_del_init(&node->list); kfree(node); } @@ 
-2186,7 +2248,7 @@ static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const void *tt_buff, - uint16_t tt_buff_len) + u16 tt_buff_len) { /* Replace the old buffer only if I received something in the * last OGM (the OGM could carry no changes) @@ -2206,30 +2268,36 @@ static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv, static void batadv_tt_req_purge(struct batadv_priv *bat_priv) { - struct batadv_tt_req_node *node, *safe; + struct batadv_tt_req_node *node; + struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); - list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { + hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (batadv_has_timed_out(node->issued_at, BATADV_TT_REQUEST_TIMEOUT)) { - list_del(&node->list); + hlist_del_init(&node->list); kfree(node); } } spin_unlock_bh(&bat_priv->tt.req_list_lock); } -/* returns the pointer to the new tt_req_node struct if no request - * has already been issued for this orig_node, NULL otherwise +/** + * batadv_tt_req_node_new - search and possibly create a tt_req_node object + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node this request is being issued for + * + * Returns the pointer to the new tt_req_node struct if no request + * has already been issued for this orig_node, NULL otherwise. 
*/ static struct batadv_tt_req_node * -batadv_new_tt_req_node(struct batadv_priv *bat_priv, +batadv_tt_req_node_new(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_tt_req_node *tt_req_node_tmp, *tt_req_node = NULL; spin_lock_bh(&bat_priv->tt.req_list_lock); - list_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) { + hlist_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) { if (batadv_compare_eth(tt_req_node_tmp, orig_node) && !batadv_has_timed_out(tt_req_node_tmp->issued_at, BATADV_TT_REQUEST_TIMEOUT)) @@ -2243,7 +2311,7 @@ batadv_new_tt_req_node(struct batadv_priv *bat_priv, ether_addr_copy(tt_req_node->addr, orig_node->orig); tt_req_node->issued_at = jiffies; - list_add(&tt_req_node->list, &bat_priv->tt.req_list); + hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list); unlock: spin_unlock_bh(&bat_priv->tt.req_list_lock); return tt_req_node; @@ -2295,15 +2363,15 @@ static int batadv_tt_global_valid(const void *entry_ptr, */ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, - void *tvlv_buff, uint16_t tt_len, + void *tvlv_buff, u16 tt_len, int (*valid_cb)(const void *, const void *), void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tvlv_tt_change *tt_change; struct hlist_head *head; - uint16_t tt_tot, tt_num_entries = 0; - uint32_t i; + u16 tt_tot, tt_num_entries = 0; + u32 i; tt_tot = batadv_tt_entries(tt_len); tt_change = (struct batadv_tvlv_tt_change *)tvlv_buff; @@ -2345,11 +2413,11 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, */ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_vlan_data *tt_vlan, - uint16_t num_vlan) + u16 num_vlan) { struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp; struct batadv_orig_node_vlan *vlan; - uint32_t crc; + u32 crc; int i; /* check if each received CRC matches the locally stored one */ @@ -2404,11 +2472,11 @@ static void 
batadv_tt_global_update_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_node_vlan *vlan; - uint32_t crc; + u32 crc; /* recompute the global CRC for each VLAN */ rcu_read_lock(); - list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { + hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { /* if orig_node is a backbone node for this VLAN, don't compute * the CRC as we ignore all the global entries over it */ @@ -2434,9 +2502,9 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, */ static int batadv_send_tt_request(struct batadv_priv *bat_priv, struct batadv_orig_node *dst_orig_node, - uint8_t ttvn, + u8 ttvn, struct batadv_tvlv_tt_vlan_data *tt_vlan, - uint16_t num_vlan, bool full_table) + u16 num_vlan, bool full_table) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tt_req_node *tt_req_node = NULL; @@ -2452,7 +2520,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, /* The new tt_req will be issued only if I'm not waiting for a * reply from the same orig_node yet */ - tt_req_node = batadv_new_tt_req_node(bat_priv, dst_orig_node); + tt_req_node = batadv_tt_req_node_new(bat_priv, dst_orig_node); if (!tt_req_node) goto out; @@ -2494,7 +2562,8 @@ out: batadv_hardif_free_ref(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); - list_del(&tt_req_node->list); + /* hlist_del_init() verifies tt_req_node still is in the list */ + hlist_del_init(&tt_req_node->list); spin_unlock_bh(&bat_priv->tt.req_list_lock); kfree(tt_req_node); } @@ -2514,7 +2583,7 @@ out: */ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, - uint8_t *req_src, uint8_t *req_dst) + u8 *req_src, u8 *req_dst) { struct batadv_orig_node *req_dst_orig_node; struct batadv_orig_node *res_dst_orig_node = NULL; @@ -2522,14 +2591,14 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct 
batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tvlv_tt_vlan_data *tt_vlan; bool ret = false, full_table; - uint8_t orig_ttvn, req_ttvn; - uint16_t tvlv_len; - int32_t tt_len; + u8 orig_ttvn, req_ttvn; + u16 tvlv_len; + s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n", req_src, tt_data->ttvn, req_dst, - (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.')); + ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); /* Let's get the orig node of the REAL destination */ req_dst_orig_node = batadv_orig_hash_find(bat_priv, req_dst); @@ -2540,7 +2609,7 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, if (!res_dst_orig_node) goto out; - orig_ttvn = (uint8_t)atomic_read(&req_dst_orig_node->last_ttvn); + orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn); req_ttvn = tt_data->ttvn; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); @@ -2646,25 +2715,25 @@ out: */ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, - uint8_t *req_src) + u8 *req_src) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_hard_iface *primary_if = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_orig_node *orig_node; - uint8_t my_ttvn, req_ttvn; - uint16_t tvlv_len; + u8 my_ttvn, req_ttvn; + u16 tvlv_len; bool full_table; - int32_t tt_len; + s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n", req_src, tt_data->ttvn, - (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.')); + ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 
'F' : '.')); spin_lock_bh(&bat_priv->tt.commit_lock); - my_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn); + my_ttvn = (u8)atomic_read(&bat_priv->tt.vn); req_ttvn = tt_data->ttvn; orig_node = batadv_orig_hash_find(bat_priv, req_src); @@ -2703,7 +2772,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, bat_priv->tt.last_changeset_len); spin_unlock_bh(&bat_priv->tt.last_changeset_lock); } else { - req_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn); + req_ttvn = (u8)atomic_read(&bat_priv->tt.vn); /* allocate the tvlv, put the tt_data and all the tt_vlan_data * in the initial part @@ -2764,7 +2833,7 @@ out: */ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, - uint8_t *req_src, uint8_t *req_dst) + u8 *req_src, u8 *req_dst) { if (batadv_is_my_mac(bat_priv, req_dst)) return batadv_send_my_tt_response(bat_priv, tt_data, req_src); @@ -2775,7 +2844,7 @@ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_change *tt_change, - uint16_t tt_num_changes, uint8_t ttvn) + u16 tt_num_changes, u8 ttvn) { int i; int roams; @@ -2802,13 +2871,13 @@ static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, return; } } - orig_node->capa_initialized |= BATADV_ORIG_CAPA_HAS_TT; + set_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_change *tt_change, - uint8_t ttvn, uint8_t *resp_src, - uint16_t num_entries) + u8 ttvn, u8 *resp_src, + u16 num_entries) { struct batadv_orig_node *orig_node; @@ -2838,7 +2907,7 @@ out: static void batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - uint16_t tt_num_changes, uint8_t ttvn, + u16 tt_num_changes, u8 ttvn, struct batadv_tvlv_tt_change *tt_change) { _batadv_tt_update_changes(bat_priv, 
orig_node, tt_change, @@ -2857,7 +2926,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv, * * Returns true if the client is served by this node, false otherwise. */ -bool batadv_is_my_client(struct batadv_priv *bat_priv, const uint8_t *addr, +bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; @@ -2888,18 +2957,19 @@ out: */ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, - uint8_t *resp_src, uint16_t num_entries) + u8 *resp_src, u16 num_entries) { - struct batadv_tt_req_node *node, *safe; + struct batadv_tt_req_node *node; + struct hlist_node *safe; struct batadv_orig_node *orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; - uint8_t *tvlv_ptr = (uint8_t *)tt_data; - uint16_t change_offset; + u8 *tvlv_ptr = (u8 *)tt_data; + u16 change_offset; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n", resp_src, tt_data->ttvn, num_entries, - (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.')); + ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 
'F' : '.')); orig_node = batadv_orig_hash_find(bat_priv, resp_src); if (!orig_node) @@ -2928,10 +2998,10 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, /* Delete the tt_req_node from pending tt_requests list */ spin_lock_bh(&bat_priv->tt.req_list_lock); - list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { + hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (!batadv_compare_eth(node->addr, resp_src)) continue; - list_del(&node->list); + hlist_del_init(&node->list); kfree(node); } @@ -2977,8 +3047,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) * * returns true if the ROAMING_ADV can be sent, false otherwise */ -static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, - uint8_t *client) +static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) { struct batadv_tt_roam_node *tt_roam_node; bool ret = false; @@ -3033,7 +3102,7 @@ unlock: * for this particular roamed client has to be forwarded to the sender of the * roaming message. 
*/ -static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client, +static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node) { @@ -3111,14 +3180,14 @@ void batadv_tt_free(struct batadv_priv *bat_priv) * @enable: whether to set or unset the flag * @count: whether to increase the TT size by the number of changed entries */ -static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, - uint16_t flags, bool enable, bool count) +static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags, + bool enable, bool count) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common_entry; - uint16_t changed_num = 0; + u16 changed_num = 0; struct hlist_head *head; - uint32_t i; + u32 i; if (!hash) return; @@ -3160,7 +3229,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ - uint32_t i; + u32 i; if (!hash) return; @@ -3188,8 +3257,10 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + if (vlan) { + batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + } batadv_tt_local_entry_free_ref(tt_local); } @@ -3206,6 +3277,8 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) */ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) { + lockdep_assert_held(&bat_priv->tt.commit_lock); + /* Update multicast addresses in local translation table */ batadv_mcast_mla_update(bat_priv); @@ -3224,7 +3297,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) 
atomic_inc(&bat_priv->tt.vn); batadv_dbg(BATADV_DBG_TT, bat_priv, "Local changes committed, updating to ttvn %u\n", - (uint8_t)atomic_read(&bat_priv->tt.vn)); + (u8)atomic_read(&bat_priv->tt.vn)); /* reset the sending counter */ atomic_set(&bat_priv->tt.ogm_append_cnt, BATADV_TT_OGM_APPEND_MAX); @@ -3243,8 +3316,8 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.commit_lock); } -bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, uint8_t *src, - uint8_t *dst, unsigned short vid) +bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, + unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry = NULL; struct batadv_tt_global_entry *tt_global_entry = NULL; @@ -3292,17 +3365,18 @@ out: */ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - const void *tt_buff, uint16_t tt_num_vlan, + const void *tt_buff, u16 tt_num_vlan, struct batadv_tvlv_tt_change *tt_change, - uint16_t tt_num_changes, uint8_t ttvn) + u16 tt_num_changes, u8 ttvn) { - uint8_t orig_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn); + u8 orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn); struct batadv_tvlv_tt_vlan_data *tt_vlan; bool full_table = true; bool has_tt_init; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)tt_buff; - has_tt_init = orig_node->capa_initialized & BATADV_ORIG_CAPA_HAS_TT; + has_tt_init = test_bit(BATADV_ORIG_CAPA_HAS_TT, + &orig_node->capa_initialized); /* orig table not initialised AND first diff is in the OGM OR the ttvn * increased by one -> we can apply the attached changes @@ -3374,7 +3448,7 @@ request_table: * deleted later by a DEL or because of timeout */ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid) + u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; bool ret = false; @@ -3400,7 +3474,7 @@ out: * to keep the latter consistent with the node TTVN */ 
bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid) + u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; bool ret = false; @@ -3486,13 +3560,13 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) */ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, void *tvlv_value, - uint16_t tvlv_value_len) + u8 flags, void *tvlv_value, + u16 tvlv_value_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; - uint16_t num_entries, num_vlan; + u16 num_entries, num_vlan; if (tvlv_value_len < sizeof(*tt_data)) return; @@ -3528,12 +3602,12 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, * otherwise. */ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, + u8 *src, u8 *dst, void *tvlv_value, - uint16_t tvlv_value_len) + u16 tvlv_value_len) { struct batadv_tvlv_tt_data *tt_data; - uint16_t tt_vlan_len, tt_num_entries; + u16 tt_vlan_len, tt_num_entries; char tt_flag; bool ret; @@ -3609,9 +3683,9 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, * otherwise. 
*/ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, + u8 *src, u8 *dst, void *tvlv_value, - uint16_t tvlv_value_len) + u16 tvlv_value_len) { struct batadv_tvlv_roam_adv *roaming_adv; struct batadv_orig_node *orig_node = NULL; @@ -3693,7 +3767,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv) * otherwise */ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid) + const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt; bool ret; diff --git a/kernel/net/batman-adv/translation-table.h b/kernel/net/batman-adv/translation-table.h index ad84d7b89..abd8e116e 100644 --- a/kernel/net/batman-adv/translation-table.h +++ b/kernel/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -18,39 +18,45 @@ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ +#include "main.h" + +#include + +struct net_device; +struct seq_file; + int batadv_tt_init(struct batadv_priv *bat_priv); -bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, - unsigned short vid, int ifindex, uint32_t mark); -uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid, - const char *message, bool roaming); +bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, + unsigned short vid, int ifindex, u32 mark); +u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, + const u8 *addr, unsigned short vid, + const char *message, bool roaming); int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset); int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - int32_t match_vid, 
const char *message); + s32 match_vid, const char *message); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid); + const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, - const uint8_t *src, - const uint8_t *addr, + const u8 *src, const u8 *addr, unsigned short vid); void batadv_tt_free(struct batadv_priv *bat_priv); -bool batadv_is_my_client(struct batadv_priv *bat_priv, const uint8_t *addr, +bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); -bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, uint8_t *src, - uint8_t *dst, unsigned short vid); +bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, + unsigned short vid); void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv); bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid); + u8 *addr, unsigned short vid); bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid); + u8 *addr, unsigned short vid); void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface); bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid); bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid); + const u8 *addr, unsigned short vid); #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ diff --git a/kernel/net/batman-adv/types.h b/kernel/net/batman-adv/types.h index 9398c3fb4..d260efd70 100644 --- a/kernel/net/batman-adv/types.h +++ b/kernel/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,9 +18,23 @@ #ifndef _NET_BATMAN_ADV_TYPES_H_ #define _NET_BATMAN_ADV_TYPES_H_ +#ifndef _NET_BATMAN_ADV_MAIN_H_ +#error only "main.h" can be included directly +#endif + +#include +#include +#include +#include +#include /* for linux/wait.h */ +#include +#include +#include +#include + #include "packet.h" -#include "bitarray.h" -#include + +struct seq_file; #ifdef CONFIG_BATMAN_ADV_DAT @@ -30,7 +44,7 @@ * * *Please be careful: batadv_dat_addr_t must be UNSIGNED* */ -#define batadv_dat_addr_t uint16_t +#define batadv_dat_addr_t u16 #endif /* CONFIG_BATMAN_ADV_DAT */ @@ -89,10 +103,10 @@ struct batadv_hard_iface_bat_iv { */ struct batadv_hard_iface { struct list_head list; - int16_t if_num; + s16 if_num; char if_status; struct net_device *net_dev; - uint8_t num_bcasts; + u8 num_bcasts; struct kobject *hardif_obj; atomic_t refcount; struct packet_type batman_adv_ptype; @@ -118,8 +132,8 @@ struct batadv_orig_ifinfo { struct hlist_node list; struct batadv_hard_iface *if_outgoing; struct batadv_neigh_node __rcu *router; /* rcu protected pointer */ - uint32_t last_real_seqno; - uint8_t last_ttl; + u32 last_real_seqno; + u8 last_ttl; unsigned long batman_seqno_reset; atomic_t refcount; struct rcu_head rcu; @@ -132,13 +146,15 @@ struct batadv_orig_ifinfo { * @timestamp: time (jiffie) of last received fragment * @seqno: sequence number of the fragments in the list * @size: accumulated size of packets in list + * @total_size: expected size of the assembled packet */ struct batadv_frag_table_entry { struct hlist_head head; spinlock_t lock; /* protects head */ unsigned long timestamp; - uint16_t seqno; - uint16_t size; + u16 seqno; + u16 size; + u16 total_size; }; /** @@ -150,7 +166,7 @@ struct batadv_frag_table_entry { struct batadv_frag_list_entry { struct hlist_node list; struct sk_buff *skb; - uint8_t no; + u8 no; }; /** @@ -159,7 +175,7 @@ struct batadv_frag_list_entry { * @num_entries: number of TT entries for this 
VLAN */ struct batadv_vlan_tt { - uint32_t crc; + u32 crc; atomic_t num_entries; }; @@ -174,22 +190,23 @@ struct batadv_vlan_tt { struct batadv_orig_node_vlan { unsigned short vid; struct batadv_vlan_tt tt; - struct list_head list; + struct hlist_node list; atomic_t refcount; struct rcu_head rcu; }; /** * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members - * @bcast_own: bitfield containing the number of our OGMs this orig_node - * rebroadcasted "back" to us (relative to last_real_seqno) - * @bcast_own_sum: counted result of bcast_own + * @bcast_own: set of bitfields (one per hard interface) where each one counts + * the number of our OGMs this orig_node rebroadcasted "back" to us (relative + * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long. + * @bcast_own_sum: sum of bcast_own * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum, * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count */ struct batadv_orig_bat_iv { unsigned long *bcast_own; - uint8_t *bcast_own_sum; + u8 *bcast_own_sum; /* ogm_cnt_lock protects: bcast_own, bcast_own_sum, * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count */ @@ -204,6 +221,7 @@ struct batadv_orig_bat_iv { * @batadv_dat_addr_t: address of the orig node in the distributed hash * @last_seen: time when last packet from this node was received * @bcast_seqno_reset: time when the broadcast seqno window was reset + * @mcast_handler_lock: synchronizes mcast-capability and -flag changes * @mcast_flags: multicast flags announced by the orig node * @mcast_want_all_unsnoop_node: a list node for the * mcast.want_all_unsnoopables list @@ -242,7 +260,7 @@ struct batadv_orig_bat_iv { * @bat_iv: B.A.T.M.A.N. 
IV private structure */ struct batadv_orig_node { - uint8_t orig[ETH_ALEN]; + u8 orig[ETH_ALEN]; struct hlist_head ifinfo_list; struct batadv_orig_ifinfo *last_bonding_candidate; #ifdef CONFIG_BATMAN_ADV_DAT @@ -251,21 +269,23 @@ struct batadv_orig_node { unsigned long last_seen; unsigned long bcast_seqno_reset; #ifdef CONFIG_BATMAN_ADV_MCAST - uint8_t mcast_flags; + /* synchronizes mcast tvlv specific orig changes */ + spinlock_t mcast_handler_lock; + u8 mcast_flags; struct hlist_node mcast_want_all_unsnoopables_node; struct hlist_node mcast_want_all_ipv4_node; struct hlist_node mcast_want_all_ipv6_node; #endif - uint8_t capabilities; - uint8_t capa_initialized; + unsigned long capabilities; + unsigned long capa_initialized; atomic_t last_ttvn; unsigned char *tt_buff; - int16_t tt_buff_len; + s16 tt_buff_len; spinlock_t tt_buff_lock; /* protects tt_buff & tt_buff_len */ /* prevents from changing the table while reading it */ spinlock_t tt_lock; DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); - uint32_t last_bcast_seqno; + u32 last_bcast_seqno; struct hlist_head neigh_list; /* neigh_list_lock protects: neigh_list and router */ spinlock_t neigh_list_lock; @@ -282,7 +302,7 @@ struct batadv_orig_node { spinlock_t out_coding_list_lock; /* Protects out_coding_list */ #endif struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT]; - struct list_head vlan_list; + struct hlist_head vlan_list; spinlock_t vlan_list_lock; /* protects vlan_list */ struct batadv_orig_bat_iv bat_iv; }; @@ -296,10 +316,10 @@ struct batadv_orig_node { * (= orig node announces a tvlv of type BATADV_TVLV_MCAST) */ enum batadv_orig_capabilities { - BATADV_ORIG_CAPA_HAS_DAT = BIT(0), - BATADV_ORIG_CAPA_HAS_NC = BIT(1), - BATADV_ORIG_CAPA_HAS_TT = BIT(2), - BATADV_ORIG_CAPA_HAS_MCAST = BIT(3), + BATADV_ORIG_CAPA_HAS_DAT, + BATADV_ORIG_CAPA_HAS_NC, + BATADV_ORIG_CAPA_HAS_TT, + BATADV_ORIG_CAPA_HAS_MCAST, }; /** @@ -308,16 +328,14 @@ enum batadv_orig_capabilities { * @orig_node: 
pointer to corresponding orig node * @bandwidth_down: advertised uplink download bandwidth * @bandwidth_up: advertised uplink upload bandwidth - * @deleted: this struct is scheduled for deletion * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_gw_node { struct hlist_node list; struct batadv_orig_node *orig_node; - uint32_t bandwidth_down; - uint32_t bandwidth_up; - unsigned long deleted; + u32 bandwidth_down; + u32 bandwidth_up; atomic_t refcount; struct rcu_head rcu; }; @@ -338,7 +356,7 @@ struct batadv_gw_node { struct batadv_neigh_node { struct hlist_node list; struct batadv_orig_node *orig_node; - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; struct hlist_head ifinfo_list; spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */ struct batadv_hard_iface *if_incoming; @@ -358,11 +376,11 @@ struct batadv_neigh_node { * @real_packet_count: counted result of real_bits */ struct batadv_neigh_ifinfo_bat_iv { - uint8_t tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE]; - uint8_t tq_index; - uint8_t tq_avg; + u8 tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE]; + u8 tq_index; + u8 tq_avg; DECLARE_BITMAP(real_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); - uint8_t real_packet_count; + u8 real_packet_count; }; /** @@ -378,7 +396,7 @@ struct batadv_neigh_ifinfo { struct hlist_node list; struct batadv_hard_iface *if_outgoing; struct batadv_neigh_ifinfo_bat_iv bat_iv; - uint8_t last_ttl; + u8 last_ttl; atomic_t refcount; struct rcu_head rcu; }; @@ -391,7 +409,7 @@ struct batadv_neigh_ifinfo { */ #ifdef CONFIG_BATMAN_ADV_BLA struct batadv_bcast_duplist_entry { - uint8_t orig[ETH_ALEN]; + u8 orig[ETH_ALEN]; __be32 crc; unsigned long entrytime; }; @@ -517,13 +535,13 @@ struct batadv_priv_tt { struct list_head changes_list; struct batadv_hashtable *local_hash; struct batadv_hashtable *global_hash; - struct list_head req_list; + struct hlist_head req_list; struct list_head roam_list; spinlock_t changes_list_lock; /* protects 
changes */ spinlock_t req_list_lock; /* protects req_list */ spinlock_t roam_list_lock; /* protects roam_list */ unsigned char *last_changeset; - int16_t last_changeset_len; + s16 last_changeset_len; /* protects last_changeset & last_changeset_len */ spinlock_t last_changeset_lock; /* prevents from executing a commit while reading the table */ @@ -643,7 +661,7 @@ struct batadv_priv_mcast { struct hlist_head want_all_unsnoopables_list; struct hlist_head want_all_ipv4_list; struct hlist_head want_all_ipv6_list; - uint8_t flags; + u8 flags; bool enabled; atomic_t num_disabled; atomic_t num_want_all_unsnoopables; @@ -761,7 +779,7 @@ struct batadv_priv { atomic_t mesh_state; struct net_device *soft_iface; struct net_device_stats stats; - uint64_t __percpu *bat_counters; /* Per cpu counters */ + u64 __percpu *bat_counters; /* Per cpu counters */ atomic_t aggregated_ogms; atomic_t bonding; atomic_t fragmentation; @@ -783,8 +801,8 @@ struct batadv_priv { #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_t log_level; #endif - uint32_t isolation_mark; - uint32_t isolation_mark_mask; + u32 isolation_mark; + u32 isolation_mark_mask; atomic_t bcast_seqno; atomic_t bcast_queue_left; atomic_t batman_queue_left; @@ -850,7 +868,7 @@ struct batadv_socket_client { struct batadv_socket_packet { struct list_head list; size_t icmp_len; - uint8_t icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE]; + u8 icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE]; }; /** @@ -871,14 +889,14 @@ struct batadv_socket_packet { */ #ifdef CONFIG_BATMAN_ADV_BLA struct batadv_bla_backbone_gw { - uint8_t orig[ETH_ALEN]; + u8 orig[ETH_ALEN]; unsigned short vid; struct hlist_node hash_entry; struct batadv_priv *bat_priv; unsigned long lasttime; atomic_t wait_periods; atomic_t request_sent; - uint16_t crc; + u16 crc; atomic_t refcount; struct rcu_head rcu; }; @@ -894,7 +912,7 @@ struct batadv_bla_backbone_gw { * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_bla_claim { - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; 
unsigned short vid; struct batadv_bla_backbone_gw *backbone_gw; unsigned long lasttime; @@ -916,10 +934,10 @@ struct batadv_bla_claim { * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_tt_common_entry { - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; unsigned short vid; struct hlist_node hash_entry; - uint16_t flags; + u16 flags; unsigned long added_at; atomic_t refcount; struct rcu_head rcu; @@ -961,7 +979,7 @@ struct batadv_tt_global_entry { */ struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; - uint8_t ttvn; + u8 ttvn; struct hlist_node list; atomic_t refcount; struct rcu_head rcu; @@ -984,9 +1002,9 @@ struct batadv_tt_change_node { * @list: list node for batadv_priv_tt::req_list */ struct batadv_tt_req_node { - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; unsigned long issued_at; - struct list_head list; + struct hlist_node list; }; /** @@ -998,7 +1016,7 @@ struct batadv_tt_req_node { * @list: list node for batadv_priv_tt::roam_list */ struct batadv_tt_roam_node { - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; atomic_t counter; unsigned long first_time; struct list_head list; @@ -1015,7 +1033,7 @@ struct batadv_tt_roam_node { */ struct batadv_nc_node { struct list_head list; - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; atomic_t refcount; struct rcu_head rcu; struct batadv_orig_node *orig_node; @@ -1039,8 +1057,8 @@ struct batadv_nc_path { atomic_t refcount; struct list_head packet_list; spinlock_t packet_list_lock; /* Protects packet_list */ - uint8_t next_hop[ETH_ALEN]; - uint8_t prev_hop[ETH_ALEN]; + u8 next_hop[ETH_ALEN]; + u8 prev_hop[ETH_ALEN]; unsigned long last_valid; }; @@ -1092,11 +1110,11 @@ struct batadv_skb_cb { struct batadv_forw_packet { struct hlist_node list; unsigned long send_time; - uint8_t own; + u8 own; struct sk_buff *skb; - uint16_t packet_len; - uint32_t direct_link_flags; - uint8_t num_packets; + u16 packet_len; + u32 direct_link_flags; + u8 num_packets; struct delayed_work delayed_work; 
struct batadv_hard_iface *if_incoming; struct batadv_hard_iface *if_outgoing; @@ -1118,6 +1136,8 @@ struct batadv_forw_packet { * @bat_neigh_is_equiv_or_better: check if neigh1 is equally good or better * than neigh2 for their respective outgoing interface from the metric * prospective + * @bat_neigh_free: free the resources allocated by the routing algorithm for a + * neigh_node object * @bat_orig_print: print the originator table (optional) * @bat_orig_free: free the resources allocated by the routing algorithm for an * orig_node object @@ -1135,6 +1155,7 @@ struct batadv_algo_ops { void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface); void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface); void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet); + /* neigh_node handling API */ int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, @@ -1144,6 +1165,7 @@ struct batadv_algo_ops { struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); + void (*bat_neigh_free)(struct batadv_neigh_node *neigh); /* orig_node handling API */ void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq, struct batadv_hard_iface *hard_iface); @@ -1167,7 +1189,7 @@ struct batadv_algo_ops { */ struct batadv_dat_entry { __be32 ip; - uint8_t mac_addr[ETH_ALEN]; + u8 mac_addr[ETH_ALEN]; unsigned short vid; unsigned long last_update; struct hlist_node hash_entry; @@ -1229,14 +1251,13 @@ struct batadv_tvlv_handler { struct hlist_node list; void (*ogm_handler)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, - void *tvlv_value, uint16_t tvlv_value_len); + u8 flags, void *tvlv_value, u16 tvlv_value_len); int (*unicast_handler)(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, - void *tvlv_value, uint16_t tvlv_value_len); - uint8_t type; - uint8_t version; - uint8_t flags; + u8 *src, u8 *dst, 
+ void *tvlv_value, u16 tvlv_value_len); + u8 type; + u8 version; + u8 flags; atomic_t refcount; struct rcu_head rcu; }; diff --git a/kernel/net/bluetooth/6lowpan.c b/kernel/net/bluetooth/6lowpan.c index 1742b849f..795ddd8b2 100644 --- a/kernel/net/bluetooth/6lowpan.c +++ b/kernel/net/bluetooth/6lowpan.c @@ -21,8 +21,6 @@ #include #include -#include /* to get the address type */ - #include #include #include @@ -35,7 +33,6 @@ static struct dentry *lowpan_enable_debugfs; static struct dentry *lowpan_control_debugfs; #define IFACE_NAME_TEMPLATE "bt%d" -#define EUI64_ADDR_LEN 8 struct skb_cb { struct in6_addr addr; @@ -85,7 +82,7 @@ struct lowpan_dev { static inline struct lowpan_dev *lowpan_dev(const struct net_device *netdev) { - return netdev_priv(netdev); + return (struct lowpan_dev *)lowpan_priv(netdev)->priv; } static inline void peer_add(struct lowpan_dev *dev, struct lowpan_peer *peer) @@ -192,7 +189,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev, if (ipv6_addr_any(nexthop)) return NULL; } else { - nexthop = rt6_nexthop(rt); + nexthop = rt6_nexthop(rt, daddr); /* We need to remember the address because it is needed * by bt_xmit() when sending the packet. 
In bt_xmit(), the @@ -266,14 +263,13 @@ static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev) if (!skb_cp) return NET_RX_DROP; - return netif_rx(skb_cp); + return netif_rx_ni(skb_cp); } static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, struct l2cap_chan *chan) { const u8 *saddr, *daddr; - u8 iphc0, iphc1; struct lowpan_dev *dev; struct lowpan_peer *peer; @@ -288,22 +284,7 @@ static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, saddr = peer->eui64_addr; daddr = dev->netdev->dev_addr; - /* at least two bytes will be used for the encoding */ - if (skb->len < 2) - return -EINVAL; - - if (lowpan_fetch_skb_u8(skb, &iphc0)) - return -EINVAL; - - if (lowpan_fetch_skb_u8(skb, &iphc1)) - return -EINVAL; - - return lowpan_header_decompress(skb, netdev, - saddr, IEEE802154_ADDR_LONG, - EUI64_ADDR_LEN, daddr, - IEEE802154_ADDR_LONG, EUI64_ADDR_LEN, - iphc0, iphc1); - + return lowpan_header_decompress(skb, netdev, daddr, saddr); } static int recv_pkt(struct sk_buff *skb, struct net_device *dev, @@ -315,15 +296,20 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, if (!netif_running(dev)) goto drop; - if (dev->type != ARPHRD_6LOWPAN) + if (dev->type != ARPHRD_6LOWPAN || !skb->len) goto drop; + skb_reset_network_header(skb); + skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto drop; /* check that it's our buffer */ - if (skb->data[0] == LOWPAN_DISPATCH_IPV6) { + if (lowpan_is_ipv6(*skb_network_header(skb))) { + /* Pull off the 1-byte of 6lowpan header. */ + skb_pull(skb, 1); + /* Copy the packet so that the IPv6 header is * properly aligned. 
*/ @@ -334,8 +320,8 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; + local_skb->dev = dev; - skb_reset_network_header(local_skb); skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { @@ -348,38 +334,35 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, consume_skb(local_skb); consume_skb(skb); - } else { - switch (skb->data[0] & 0xe0) { - case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ - local_skb = skb_clone(skb, GFP_ATOMIC); - if (!local_skb) - goto drop; - - ret = iphc_decompress(local_skb, dev, chan); - if (ret < 0) { - kfree_skb(local_skb); - goto drop; - } + } else if (lowpan_is_iphc(*skb_network_header(skb))) { + local_skb = skb_clone(skb, GFP_ATOMIC); + if (!local_skb) + goto drop; - local_skb->protocol = htons(ETH_P_IPV6); - local_skb->pkt_type = PACKET_HOST; - local_skb->dev = dev; + local_skb->dev = dev; - if (give_skb_to_upper(local_skb, dev) - != NET_RX_SUCCESS) { - kfree_skb(local_skb); - goto drop; - } + ret = iphc_decompress(local_skb, dev, chan); + if (ret < 0) { + kfree_skb(local_skb); + goto drop; + } - dev->stats.rx_bytes += skb->len; - dev->stats.rx_packets++; + local_skb->protocol = htons(ETH_P_IPV6); + local_skb->pkt_type = PACKET_HOST; - consume_skb(local_skb); - consume_skb(skb); - break; - default: - break; + if (give_skb_to_upper(local_skb, dev) + != NET_RX_SUCCESS) { + kfree_skb(local_skb); + goto drop; } + + dev->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; + + consume_skb(local_skb); + consume_skb(skb); + } else { + goto drop; } return NET_RX_SUCCESS; @@ -493,8 +476,7 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev, status = 1; } - lowpan_header_compress(skb, netdev, ETH_P_IPV6, daddr, - dev->netdev->dev_addr, skb->len); + lowpan_header_compress(skb, netdev, daddr, dev->netdev->dev_addr); err = dev_hard_header(skb, netdev, 
ETH_P_IPV6, NULL, NULL, 0); if (err < 0) @@ -674,13 +656,8 @@ static struct header_ops header_ops = { static void netdev_setup(struct net_device *dev) { - dev->addr_len = EUI64_ADDR_LEN; - dev->type = ARPHRD_6LOWPAN; - dev->hard_header_len = 0; dev->needed_tailroom = 0; - dev->mtu = IPV6_MIN_MTU; - dev->tx_queue_len = 0; dev->flags = IFF_RUNNING | IFF_POINTOPOINT | IFF_MULTICAST; dev->watchdog_timeo = 0; @@ -775,24 +752,7 @@ static struct l2cap_chan *chan_create(void) chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; chan->mode = L2CAP_MODE_LE_FLOWCTL; - chan->omtu = 65535; - chan->imtu = chan->omtu; - - return chan; -} - -static struct l2cap_chan *chan_open(struct l2cap_chan *pchan) -{ - struct l2cap_chan *chan; - - chan = chan_create(); - if (!chan) - return NULL; - - chan->remote_mps = chan->omtu; - chan->mps = chan->omtu; - - chan->state = BT_CONNECTED; + chan->imtu = 1280; return chan; } @@ -848,20 +808,36 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) struct net_device *netdev; int err = 0; - netdev = alloc_netdev(sizeof(struct lowpan_dev), IFACE_NAME_TEMPLATE, - NET_NAME_UNKNOWN, netdev_setup); + netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev)), + IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, + netdev_setup); if (!netdev) return -ENOMEM; set_dev_addr(netdev, &chan->src, chan->src_type); netdev->netdev_ops = &netdev_ops; - SET_NETDEV_DEV(netdev, &chan->conn->hcon->dev); + SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); + *dev = lowpan_dev(netdev); + (*dev)->netdev = netdev; + (*dev)->hdev = chan->conn->hcon->hdev; + INIT_LIST_HEAD(&(*dev)->peers); + + spin_lock(&devices_lock); + INIT_LIST_HEAD(&(*dev)->list); + list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); + spin_unlock(&devices_lock); + + lowpan_netdev_setup(netdev, LOWPAN_LLTYPE_BTLE); + err = register_netdev(netdev); if (err < 0) { BT_INFO("register_netdev failed %d", err); + spin_lock(&devices_lock); + 
list_del_rcu(&(*dev)->list); + spin_unlock(&devices_lock); free_netdev(netdev); goto out; } @@ -871,16 +847,6 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) &chan->src, chan->src_type); set_bit(__LINK_STATE_PRESENT, &netdev->state); - *dev = netdev_priv(netdev); - (*dev)->netdev = netdev; - (*dev)->hdev = chan->conn->hcon->hdev; - INIT_LIST_HEAD(&(*dev)->peers); - - spin_lock(&devices_lock); - INIT_LIST_HEAD(&(*dev)->list); - list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); - spin_unlock(&devices_lock); - return 0; out: @@ -913,7 +879,10 @@ static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; - chan = chan_open(pchan); + chan = chan_create(); + if (!chan) + return NULL; + chan->ops = pchan->ops; BT_DBG("chan %p pchan %p", chan, pchan); @@ -928,7 +897,7 @@ static void delete_netdev(struct work_struct *work) unregister_netdev(entry->netdev); - /* The entry pointer is deleted in device_event() */ + /* The entry pointer is deleted by the netdev destructor. */ } static void chan_close_cb(struct l2cap_chan *chan) @@ -937,7 +906,7 @@ static void chan_close_cb(struct l2cap_chan *chan) struct lowpan_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; - bool last = false, removed = true; + bool last = false, remove = true; BT_DBG("chan %p conn %p", chan, chan->conn); @@ -948,7 +917,7 @@ static void chan_close_cb(struct l2cap_chan *chan) /* If conn is set, then the netdev is also there and we should * not remove it. 
*/ - removed = false; + remove = false; } spin_lock(&devices_lock); @@ -977,7 +946,7 @@ static void chan_close_cb(struct l2cap_chan *chan) ifdown(dev->netdev); - if (!removed) { + if (remove) { INIT_WORK(&entry->delete_netdev, delete_netdev); schedule_work(&entry->delete_netdev); } @@ -1059,34 +1028,23 @@ static inline __u8 bdaddr_type(__u8 type) return BDADDR_LE_RANDOM; } -static struct l2cap_chan *chan_get(void) -{ - struct l2cap_chan *pchan; - - pchan = chan_create(); - if (!pchan) - return NULL; - - pchan->ops = &bt_6lowpan_chan_ops; - - return pchan; -} - static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { - struct l2cap_chan *pchan; + struct l2cap_chan *chan; int err; - pchan = chan_get(); - if (!pchan) + chan = chan_create(); + if (!chan) return -EINVAL; - err = l2cap_chan_connect(pchan, cpu_to_le16(L2CAP_PSM_IPSP), 0, + chan->ops = &bt_6lowpan_chan_ops; + + err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, addr, dst_type); - BT_DBG("chan %p err %d", pchan, err); + BT_DBG("chan %p err %d", chan, err); if (err < 0) - l2cap_chan_put(pchan); + l2cap_chan_put(chan); return err; } @@ -1111,31 +1069,32 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) static struct l2cap_chan *bt_6lowpan_listen(void) { bdaddr_t *addr = BDADDR_ANY; - struct l2cap_chan *pchan; + struct l2cap_chan *chan; int err; if (!enable_6lowpan) return NULL; - pchan = chan_get(); - if (!pchan) + chan = chan_create(); + if (!chan) return NULL; - pchan->state = BT_LISTEN; - pchan->src_type = BDADDR_LE_PUBLIC; + chan->ops = &bt_6lowpan_chan_ops; + chan->state = BT_LISTEN; + chan->src_type = BDADDR_LE_PUBLIC; - atomic_set(&pchan->nesting, L2CAP_NESTING_PARENT); + atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); - BT_DBG("chan %p src type %d", pchan, pchan->src_type); + BT_DBG("chan %p src type %d", chan, chan->src_type); - err = l2cap_add_psm(pchan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); + err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); if 
(err) { - l2cap_chan_put(pchan); + l2cap_chan_put(chan); BT_ERR("psm cannot be added err %d", err); return NULL; } - return pchan; + return chan; } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, @@ -1159,7 +1118,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, return -ENOENT; hci_dev_lock(hdev); - hcon = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); + hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type); hci_dev_unlock(hdev); if (!hcon) @@ -1208,8 +1167,6 @@ static void disconnect_all_peers(void) list_del_rcu(&peer->list); kfree_rcu(peer, rcu); - - module_put(THIS_MODULE); } spin_unlock(&devices_lock); } @@ -1418,7 +1375,6 @@ static int device_event(struct notifier_block *unused, BT_DBG("Unregistered netdev %s %p", netdev->name, netdev); list_del(&entry->list); - kfree(entry); break; } } diff --git a/kernel/net/bluetooth/Kconfig b/kernel/net/bluetooth/Kconfig index b8c794b87..95d1a66ba 100644 --- a/kernel/net/bluetooth/Kconfig +++ b/kernel/net/bluetooth/Kconfig @@ -53,6 +53,11 @@ source "net/bluetooth/cmtp/Kconfig" source "net/bluetooth/hidp/Kconfig" +config BT_HS + bool "Bluetooth High Speed (HS) features" + depends on BT_BREDR + default y + config BT_LE bool "Bluetooth Low Energy (LE) features" depends on BT diff --git a/kernel/net/bluetooth/Makefile b/kernel/net/bluetooth/Makefile index 9a8ea232d..2b15ae8c1 100644 --- a/kernel/net/bluetooth/Makefile +++ b/kernel/net/bluetooth/Makefile @@ -12,9 +12,11 @@ obj-$(CONFIG_BT_6LOWPAN) += bluetooth_6lowpan.o bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ - hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o sco.o lib.o \ - a2mp.o amp.o ecc.o hci_request.o mgmt_util.o + hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ + ecc.o hci_request.o mgmt_util.o +bluetooth-$(CONFIG_BT_BREDR) += sco.o +bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o 
bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/kernel/net/bluetooth/a2mp.c b/kernel/net/bluetooth/a2mp.c index 5a04eb1a7..5f123c332 100644 --- a/kernel/net/bluetooth/a2mp.c +++ b/kernel/net/bluetooth/a2mp.c @@ -16,6 +16,7 @@ #include #include +#include "hci_request.h" #include "a2mp.h" #include "amp.h" @@ -286,11 +287,21 @@ static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb, return 0; } +static void read_local_amp_info_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + a2mp_send_getinfo_rsp(hdev); +} + static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb, struct a2mp_cmd *hdr) { struct a2mp_info_req *req = (void *) skb->data; struct hci_dev *hdev; + struct hci_request hreq; + int err = 0; if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; @@ -311,7 +322,11 @@ static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb, } set_bit(READ_LOC_AMP_INFO, &mgr->state); - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL); + hci_req_init(&hreq, hdev); + hci_req_add(&hreq, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL); + err = hci_req_run(&hreq, read_local_amp_info_complete); + if (err < 0) + a2mp_send_getinfo_rsp(hdev); done: if (hdev) diff --git a/kernel/net/bluetooth/a2mp.h b/kernel/net/bluetooth/a2mp.h index 296f665ad..a4ff3ea9b 100644 --- a/kernel/net/bluetooth/a2mp.h +++ b/kernel/net/bluetooth/a2mp.h @@ -130,10 +130,29 @@ struct a2mp_physlink_rsp { #define A2MP_STATUS_SECURITY_VIOLATION 0x06 struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr); + +#if IS_ENABLED(CONFIG_BT_HS) int amp_mgr_put(struct amp_mgr *mgr); struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn, struct sk_buff *skb); void a2mp_discover_amp(struct l2cap_chan *chan); +#else +static inline int amp_mgr_put(struct amp_mgr *mgr) +{ + return 0; +} + +static inline struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn, + struct sk_buff *skb) +{ + return NULL; +} + 
+static inline void a2mp_discover_amp(struct l2cap_chan *chan) +{ +} +#endif + void a2mp_send_getinfo_rsp(struct hci_dev *hdev); void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status); void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status); diff --git a/kernel/net/bluetooth/af_bluetooth.c b/kernel/net/bluetooth/af_bluetooth.c index 70f9d945f..70306cc9d 100644 --- a/kernel/net/bluetooth/af_bluetooth.c +++ b/kernel/net/bluetooth/af_bluetooth.c @@ -33,7 +33,7 @@ #include "selftest.h" -#define VERSION "2.20" +#define VERSION "2.21" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 @@ -221,7 +221,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, BT_DBG("sock %p sk %p len %zu", sock, sk, len); - if (flags & (MSG_OOB)) + if (flags & MSG_OOB) return -EOPNOTSUPP; skb = skb_recv_datagram(sk, flags, noblock, &err); @@ -271,11 +271,11 @@ static long bt_sock_data_wait(struct sock *sk, long timeo) if (signal_pending(current) || !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } __set_current_state(TASK_RUNNING); @@ -441,7 +441,7 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock, if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/kernel/net/bluetooth/amp.c b/kernel/net/bluetooth/amp.c index ee016f039..e32f34189 100644 --- a/kernel/net/bluetooth/amp.c +++ b/kernel/net/bluetooth/amp.c @@ -16,6 +16,7 @@ #include #include +#include "hci_request.h" #include "a2mp.h" #include "amp.h" @@ -220,10 +221,49 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type) return hmac_sha256(gamp_key, 
HCI_AMP_LINK_KEY_SIZE, "802b", 4, data); } +static void read_local_amp_assoc_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + struct hci_rp_read_local_amp_assoc *rp = (void *)skb->data; + struct amp_assoc *assoc = &hdev->loc_assoc; + size_t rem_len, frag_len; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + goto send_rsp; + + frag_len = skb->len - sizeof(*rp); + rem_len = __le16_to_cpu(rp->rem_len); + + if (rem_len > frag_len) { + BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len); + + memcpy(assoc->data + assoc->offset, rp->frag, frag_len); + assoc->offset += frag_len; + + /* Read other fragments */ + amp_read_loc_assoc_frag(hdev, rp->phy_handle); + + return; + } + + memcpy(assoc->data + assoc->offset, rp->frag, rem_len); + assoc->len = assoc->offset + rem_len; + assoc->offset = 0; + +send_rsp: + /* Send A2MP Rsp when all fragments are received */ + a2mp_send_getampassoc_rsp(hdev, rp->status); + a2mp_send_create_phy_link_req(hdev, rp->status); +} + void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle) { struct hci_cp_read_local_amp_assoc cp; struct amp_assoc *loc_assoc = &hdev->loc_assoc; + struct hci_request req; + int err = 0; BT_DBG("%s handle %d", hdev->name, phy_handle); @@ -231,12 +271,18 @@ void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle) cp.max_len = cpu_to_le16(hdev->amp_assoc_size); cp.len_so_far = cpu_to_le16(loc_assoc->offset); - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + err = hci_req_run_skb(&req, read_local_amp_assoc_complete); + if (err < 0) + a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); } void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr) { struct hci_cp_read_local_amp_assoc cp; + struct hci_request req; + int err = 0; memset(&hdev->loc_assoc, 0, sizeof(struct amp_assoc)); memset(&cp, 0, sizeof(cp)); @@ 
-244,7 +290,11 @@ void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr) cp.max_len = cpu_to_le16(hdev->amp_assoc_size); set_bit(READ_LOC_AMP_ASSOC, &mgr->state); - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_run_skb(&req, read_local_amp_assoc_complete); + if (err < 0) + a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); } void amp_read_loc_assoc_final_data(struct hci_dev *hdev, @@ -252,6 +302,8 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev, { struct hci_cp_read_local_amp_assoc cp; struct amp_mgr *mgr = hcon->amp_mgr; + struct hci_request req; + int err = 0; cp.phy_handle = hcon->handle; cp.len_so_far = cpu_to_le16(0); @@ -260,7 +312,25 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev, set_bit(READ_LOC_AMP_ASSOC_FINAL, &mgr->state); /* Read Local AMP Assoc final link information data */ - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_run_skb(&req, read_local_amp_assoc_complete); + if (err < 0) + a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); +} + +static void write_remote_amp_assoc_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + struct hci_rp_write_remote_amp_assoc *rp = (void *)skb->data; + + BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x", + hdev->name, rp->status, rp->phy_handle); + + if (rp->status) + return; + + amp_write_rem_assoc_continue(hdev, rp->phy_handle); } /* Write AMP Assoc data fragments, returns true with last fragment written*/ @@ -270,6 +340,7 @@ static bool amp_write_rem_assoc_frag(struct hci_dev *hdev, struct hci_cp_write_remote_amp_assoc *cp; struct amp_mgr *mgr = hcon->amp_mgr; struct amp_ctrl *ctrl; + struct hci_request req; u16 frag_len, len; ctrl = amp_ctrl_lookup(mgr, hcon->remote_id); @@ -307,7 
+378,9 @@ static bool amp_write_rem_assoc_frag(struct hci_dev *hdev, amp_ctrl_put(ctrl); - hci_send_cmd(hdev, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp); + hci_req_run_skb(&req, write_remote_amp_assoc_complete); kfree(cp); @@ -344,10 +417,37 @@ void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle) amp_write_rem_assoc_frag(hdev, hcon); } +static void create_phylink_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + struct hci_cp_create_phy_link *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK); + if (!cp) + return; + + hci_dev_lock(hdev); + + if (status) { + struct hci_conn *hcon; + + hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle); + if (hcon) + hci_conn_del(hcon); + } else { + amp_write_remote_assoc(hdev, cp->phy_handle); + } + + hci_dev_unlock(hdev); +} + void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, struct hci_conn *hcon) { struct hci_cp_create_phy_link cp; + struct hci_request req; cp.phy_handle = hcon->handle; @@ -360,13 +460,33 @@ void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, return; } - hci_send_cmd(hdev, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp); + hci_req_run(&req, create_phylink_complete); +} + +static void accept_phylink_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + struct hci_cp_accept_phy_link *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK); + if (!cp) + return; + + amp_write_remote_assoc(hdev, cp->phy_handle); } void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, struct hci_conn *hcon) { struct hci_cp_accept_phy_link cp; + struct hci_request req; cp.phy_handle = hcon->handle; @@ -379,7 +499,9 @@ void amp_accept_phylink(struct 
hci_dev *hdev, struct amp_mgr *mgr, return; } - hci_send_cmd(hdev, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp); + hci_req_run(&req, accept_phylink_complete); } void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon) diff --git a/kernel/net/bluetooth/amp.h b/kernel/net/bluetooth/amp.h index 7ea3db77b..8848f8158 100644 --- a/kernel/net/bluetooth/amp.h +++ b/kernel/net/bluetooth/amp.h @@ -44,6 +44,20 @@ void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, struct hci_conn *hcon); void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, struct hci_conn *hcon); + +#if IS_ENABLED(CONFIG_BT_HS) +void amp_create_logical_link(struct l2cap_chan *chan); +void amp_disconnect_logical_link(struct hci_chan *hchan); +#else +static inline void amp_create_logical_link(struct l2cap_chan *chan) +{ +} + +static inline void amp_disconnect_logical_link(struct hci_chan *hchan) +{ +} +#endif + void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle); void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle); void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon); diff --git a/kernel/net/bluetooth/bnep/sock.c b/kernel/net/bluetooth/bnep/sock.c index bde2bdd9e..b5116fa98 100644 --- a/kernel/net/bluetooth/bnep/sock.c +++ b/kernel/net/bluetooth/bnep/sock.c @@ -202,7 +202,7 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern); if (!sk) return -ENOMEM; diff --git a/kernel/net/bluetooth/cmtp/capi.c b/kernel/net/bluetooth/cmtp/capi.c index b0c6c6af7..9a5033877 100644 --- a/kernel/net/bluetooth/cmtp/capi.c +++ b/kernel/net/bluetooth/cmtp/capi.c @@ -100,9 +100,9 @@ static void cmtp_application_del(struct cmtp_session *session, 
struct cmtp_appli static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value) { struct cmtp_application *app; - struct list_head *p, *n; + struct list_head *p; - list_for_each_safe(p, n, &session->applications) { + list_for_each(p, &session->applications) { app = list_entry(p, struct cmtp_application, list); switch (pattern) { case CMTP_MSGNUM: @@ -511,13 +511,13 @@ static int cmtp_proc_show(struct seq_file *m, void *v) struct capi_ctr *ctrl = m->private; struct cmtp_session *session = ctrl->driverdata; struct cmtp_application *app; - struct list_head *p, *n; + struct list_head *p; seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl)); seq_printf(m, "addr %s\n", session->name); seq_printf(m, "ctrl %d\n", session->num); - list_for_each_safe(p, n, &session->applications) { + list_for_each(p, &session->applications) { app = list_entry(p, struct cmtp_application, list); seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping); } diff --git a/kernel/net/bluetooth/cmtp/sock.c b/kernel/net/bluetooth/cmtp/sock.c index d82787d41..ce86a7bae 100644 --- a/kernel/net/bluetooth/cmtp/sock.c +++ b/kernel/net/bluetooth/cmtp/sock.c @@ -205,7 +205,7 @@ static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern); if (!sk) return -ENOMEM; diff --git a/kernel/net/bluetooth/hci_conn.c b/kernel/net/bluetooth/hci_conn.c index ee5e59839..24e941092 100644 --- a/kernel/net/bluetooth/hci_conn.c +++ b/kernel/net/bluetooth/hci_conn.c @@ -59,9 +59,126 @@ static const struct sco_param esco_param_msbc[] = { { EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */ }; -static void hci_le_create_connection_cancel(struct hci_conn *conn) +/* This function requires the caller holds hdev->lock */ +static void hci_connect_le_scan_cleanup(struct hci_conn *conn) +{ + struct 
hci_conn_params *params; + struct hci_dev *hdev = conn->hdev; + struct smp_irk *irk; + bdaddr_t *bdaddr; + u8 bdaddr_type; + + bdaddr = &conn->dst; + bdaddr_type = conn->dst_type; + + /* Check if we need to convert to identity address */ + irk = hci_get_irk(hdev, bdaddr, bdaddr_type); + if (irk) { + bdaddr = &irk->bdaddr; + bdaddr_type = irk->addr_type; + } + + params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr, + bdaddr_type); + if (!params || !params->explicit_connect) + return; + + /* The connection attempt was doing scan for new RPA, and is + * in scan phase. If params are not associated with any other + * autoconnect action, remove them completely. If they are, just unmark + * them as waiting for connection, by clearing explicit_connect field. + */ + params->explicit_connect = false; + + list_del_init(&params->action); + + switch (params->auto_connect) { + case HCI_AUTO_CONN_EXPLICIT: + hci_conn_params_del(hdev, bdaddr, bdaddr_type); + /* return instead of break to avoid duplicate scan update */ + return; + case HCI_AUTO_CONN_DIRECT: + case HCI_AUTO_CONN_ALWAYS: + list_add(&params->action, &hdev->pend_le_conns); + break; + case HCI_AUTO_CONN_REPORT: + list_add(&params->action, &hdev->pend_le_reports); + break; + default: + break; + } + + hci_update_background_scan(hdev); +} + +static void hci_conn_cleanup(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) + hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); + + hci_chan_list_flush(conn); + + hci_conn_hash_del(hdev, conn); + + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + + hci_conn_del_sysfs(conn); + + debugfs_remove_recursive(conn->debugfs); + + hci_dev_put(hdev); + + hci_conn_put(conn); +} + +static void le_scan_cleanup(struct work_struct *work) +{ + struct hci_conn *conn = container_of(work, struct hci_conn, + le_scan_cleanup); + struct hci_dev *hdev = conn->hdev; + struct hci_conn *c = NULL; + + BT_DBG("%s 
hcon %p", hdev->name, conn); + + hci_dev_lock(hdev); + + /* Check that the hci_conn is still around */ + rcu_read_lock(); + list_for_each_entry_rcu(c, &hdev->conn_hash.list, list) { + if (c == conn) + break; + } + rcu_read_unlock(); + + if (c == conn) { + hci_connect_le_scan_cleanup(conn); + hci_conn_cleanup(conn); + } + + hci_dev_unlock(hdev); + hci_dev_put(hdev); + hci_conn_put(conn); +} + +static void hci_connect_le_scan_remove(struct hci_conn *conn) { - hci_send_cmd(conn->hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL); + BT_DBG("%s hcon %p", conn->hdev->name, conn); + + /* We can't call hci_conn_del/hci_conn_cleanup here since that + * could deadlock with another hci_conn_del() call that's holding + * hci_dev_lock and doing cancel_delayed_work_sync(&conn->disc_work). + * Instead, grab temporary extra references to the hci_dev and + * hci_conn and perform the necessary cleanup in a separate work + * callback. + */ + + hci_dev_hold(conn->hdev); + hci_conn_get(conn); + + schedule_work(&conn->le_scan_cleanup); } static void hci_acl_create_connection(struct hci_conn *conn) @@ -107,33 +224,8 @@ static void hci_acl_create_connection(struct hci_conn *conn) hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp); } -static void hci_acl_create_connection_cancel(struct hci_conn *conn) -{ - struct hci_cp_create_conn_cancel cp; - - BT_DBG("hcon %p", conn); - - if (conn->hdev->hci_ver < BLUETOOTH_VER_1_2) - return; - - bacpy(&cp.bdaddr, &conn->dst); - hci_send_cmd(conn->hdev, HCI_OP_CREATE_CONN_CANCEL, sizeof(cp), &cp); -} - -static void hci_reject_sco(struct hci_conn *conn) -{ - struct hci_cp_reject_sync_conn_req cp; - - cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; - bacpy(&cp.bdaddr, &conn->dst); - - hci_send_cmd(conn->hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp); -} - int hci_disconnect(struct hci_conn *conn, __u8 reason) { - struct hci_cp_disconnect cp; - BT_DBG("hcon %p", conn); /* When we are master of an established connection and it enters @@ -141,7 +233,8 @@ 
int hci_disconnect(struct hci_conn *conn, __u8 reason) * current clock offset. Processing of the result is done * within the event handling and hci_clock_offset_evt function. */ - if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER) { + if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER && + (conn->state == BT_CONNECTED || conn->state == BT_CONFIG)) { struct hci_dev *hdev = conn->hdev; struct hci_cp_read_clock_offset clkoff_cp; @@ -150,25 +243,7 @@ int hci_disconnect(struct hci_conn *conn, __u8 reason) &clkoff_cp); } - conn->state = BT_DISCONN; - - cp.handle = cpu_to_le16(conn->handle); - cp.reason = reason; - return hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp); -} - -static void hci_amp_disconn(struct hci_conn *conn) -{ - struct hci_cp_disconn_phy_link cp; - - BT_DBG("hcon %p", conn); - - conn->state = BT_DISCONN; - - cp.phy_handle = HCI_PHY_HANDLE(conn->handle); - cp.reason = hci_proto_disconn_ind(conn); - hci_send_cmd(conn->hdev, HCI_OP_DISCONN_PHY_LINK, - sizeof(cp), &cp); + return hci_abort_conn(conn, reason); } static void hci_add_sco(struct hci_conn *conn, __u16 handle) @@ -276,7 +351,7 @@ u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, } void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, - __u8 ltk[16]) + __u8 ltk[16], __u8 key_size) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_start_enc cp; @@ -288,7 +363,7 @@ void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, cp.handle = cpu_to_le16(conn->handle); cp.rand = rand; cp.ediv = ediv; - memcpy(cp.ltk, ltk, sizeof(cp.ltk)); + memcpy(cp.ltk, ltk, key_size); hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp); } @@ -334,31 +409,14 @@ static void hci_conn_timeout(struct work_struct *work) if (refcnt > 0) return; - switch (conn->state) { - case BT_CONNECT: - case BT_CONNECT2: - if (conn->out) { - if (conn->type == ACL_LINK) - hci_acl_create_connection_cancel(conn); - else if (conn->type == LE_LINK) - 
hci_le_create_connection_cancel(conn); - } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { - hci_reject_sco(conn); - } - break; - case BT_CONFIG: - case BT_CONNECTED: - if (conn->type == AMP_LINK) { - hci_amp_disconn(conn); - } else { - __u8 reason = hci_proto_disconn_ind(conn); - hci_disconnect(conn, reason); - } - break; - default: - conn->state = BT_CLOSED; - break; + /* LE connections in scanning state need special handling */ + if (conn->state == BT_CONNECT && conn->type == LE_LINK && + test_bit(HCI_CONN_SCANNING, &conn->flags)) { + hci_connect_le_scan_remove(conn); + return; } + + hci_abort_conn(conn, hci_proto_disconn_ind(conn)); } /* Enter sniff mode */ @@ -426,7 +484,7 @@ static void le_conn_timeout(struct work_struct *work) return; } - hci_le_create_connection_cancel(conn); + hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); } struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, @@ -489,6 +547,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept); INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle); INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout); + INIT_WORK(&conn->le_scan_cleanup, le_scan_cleanup); atomic_set(&conn->refcnt, 0); @@ -535,27 +594,17 @@ int hci_conn_del(struct hci_conn *conn) } } - hci_chan_list_flush(conn); - if (conn->amp_mgr) amp_mgr_put(conn->amp_mgr); - hci_conn_hash_del(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); - skb_queue_purge(&conn->data_q); - hci_conn_del_sysfs(conn); - - debugfs_remove_recursive(conn->debugfs); - - if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) - hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); - - hci_dev_put(hdev); - - hci_conn_put(conn); + /* Remove the connection from the list and cleanup its remaining + * state. 
This is a separate function since for some cases like + * BT_CONNECT_SCAN we *only* want the cleanup part without the + * rest of hci_conn_del. + */ + hci_conn_cleanup(conn); return 0; } @@ -637,15 +686,18 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct hci_conn *conn; - if (status == 0) - return; + hci_dev_lock(hdev); + + conn = hci_lookup_le_connect(hdev); + + if (!status) { + hci_connect_le_scan_cleanup(conn); + goto done; + } BT_ERR("HCI request failed to create LE connection: status 0x%2.2x", status); - hci_dev_lock(hdev); - - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); if (!conn) goto done; @@ -670,8 +722,12 @@ static void hci_req_add_le_create_conn(struct hci_request *req, if (hci_update_random_address(req, false, &own_addr_type)) return; + /* Set window to be the same value as the interval to enable + * continuous scanning. + */ cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); - cp.scan_window = cpu_to_le16(hdev->le_scan_window); + cp.scan_window = cp.scan_interval; + bacpy(&cp.peer_addr, &conn->dst); cp.peer_addr_type = conn->dst_type; cp.own_address_type = own_addr_type; @@ -685,6 +741,7 @@ static void hci_req_add_le_create_conn(struct hci_request *req, hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); conn->state = BT_CONNECT; + clear_bit(HCI_CONN_SCANNING, &conn->flags); } static void hci_req_directed_advertising(struct hci_request *req, @@ -728,7 +785,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 role) { struct hci_conn_params *params; - struct hci_conn *conn; + struct hci_conn *conn, *conn_unfinished; struct smp_irk *irk; struct hci_request req; int err; @@ -750,27 +807,30 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, * attempt, we simply update pending_sec_level and auth_type fields * and return the object found. 
*/ - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, dst); + conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); + conn_unfinished = NULL; if (conn) { - conn->pending_sec_level = sec_level; - goto done; + if (conn->state == BT_CONNECT && + test_bit(HCI_CONN_SCANNING, &conn->flags)) { + BT_DBG("will continue unfinished conn %pMR", dst); + conn_unfinished = conn; + } else { + if (conn->pending_sec_level < sec_level) + conn->pending_sec_level = sec_level; + goto done; + } } /* Since the controller supports only one LE connection attempt at a * time, we return -EBUSY if there is any connection attempt running. */ - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); - if (conn) + if (hci_lookup_le_connect(hdev)) return ERR_PTR(-EBUSY); /* When given an identity address with existing identity * resolving key, the connection needs to be established * to a resolvable random address. * - * This uses the cached random resolvable address from - * a previous scan. When no cached address is available, - * try connecting to the identity address instead. - * * Storing the resolvable random address is required here * to handle connection failures. The address will later * be resolved back into the original identity address @@ -782,15 +842,23 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, dst_type = ADDR_LE_DEV_RANDOM; } - conn = hci_conn_add(hdev, LE_LINK, dst, role); + if (conn_unfinished) { + conn = conn_unfinished; + bacpy(&conn->dst, dst); + } else { + conn = hci_conn_add(hdev, LE_LINK, dst, role); + } + if (!conn) return ERR_PTR(-ENOMEM); conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; - conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; + if (!conn_unfinished) + conn->pending_sec_level = sec_level; + hci_req_init(&req, hdev); /* Disable advertising if we're active. 
For master role @@ -854,6 +922,149 @@ create_conn: return ERR_PTR(err); } +done: + /* If this is continuation of connect started by hci_connect_le_scan, + * it already called hci_conn_hold and calling it again would mess the + * counter. + */ + if (!conn_unfinished) + hci_conn_hold(conn); + + return conn; +} + +static void hci_connect_le_scan_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + struct hci_conn *conn; + + if (!status) + return; + + BT_ERR("Failed to add device to auto conn whitelist: status 0x%2.2x", + status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + if (conn) + hci_le_conn_failed(conn, status); + + hci_dev_unlock(hdev); +} + +static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) +{ + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_le(hdev, addr, type); + if (!conn) + return false; + + if (conn->state != BT_CONNECTED) + return false; + + return true; +} + +/* This function requires the caller holds hdev->lock */ +static int hci_explicit_conn_params_set(struct hci_request *req, + bdaddr_t *addr, u8 addr_type) +{ + struct hci_dev *hdev = req->hdev; + struct hci_conn_params *params; + + if (is_connected(hdev, addr, addr_type)) + return -EISCONN; + + params = hci_conn_params_lookup(hdev, addr, addr_type); + if (!params) { + params = hci_conn_params_add(hdev, addr, addr_type); + if (!params) + return -ENOMEM; + + /* If we created new params, mark them to be deleted in + * hci_connect_le_scan_cleanup. It's different case than + * existing disabled params, those will stay after cleanup. 
+ */ + params->auto_connect = HCI_AUTO_CONN_EXPLICIT; + } + + /* We're trying to connect, so make sure params are at pend_le_conns */ + if (params->auto_connect == HCI_AUTO_CONN_DISABLED || + params->auto_connect == HCI_AUTO_CONN_REPORT || + params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { + list_del_init(&params->action); + list_add(&params->action, &hdev->pend_le_conns); + } + + params->explicit_connect = true; + __hci_update_background_scan(req); + + BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, + params->auto_connect); + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, + u8 dst_type, u8 sec_level, + u16 conn_timeout, u8 role) +{ + struct hci_conn *conn; + struct hci_request req; + int err; + + /* Let's make sure that le is enabled.*/ + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { + if (lmp_le_capable(hdev)) + return ERR_PTR(-ECONNREFUSED); + + return ERR_PTR(-EOPNOTSUPP); + } + + /* Some devices send ATT messages as soon as the physical link is + * established. To be able to handle these ATT messages, the user- + * space first establishes the connection and then starts the pairing + * process. + * + * So if a hci_conn object already exists for the following connection + * attempt, we simply update pending_sec_level and auth_type fields + * and return the object found. 
+ */ + conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); + if (conn) { + if (conn->pending_sec_level < sec_level) + conn->pending_sec_level = sec_level; + goto done; + } + + BT_DBG("requesting refresh of dst_addr"); + + conn = hci_conn_add(hdev, LE_LINK, dst, role); + if (!conn) + return ERR_PTR(-ENOMEM); + + hci_req_init(&req, hdev); + + if (hci_explicit_conn_params_set(&req, dst, dst_type) < 0) + return ERR_PTR(-EBUSY); + + conn->state = BT_CONNECT; + set_bit(HCI_CONN_SCANNING, &conn->flags); + + err = hci_req_run(&req, hci_connect_le_scan_complete); + if (err && err != -ENODATA) { + hci_conn_del(conn); + return ERR_PTR(err); + } + + conn->dst_type = dst_type; + conn->sec_level = BT_SECURITY_LOW; + conn->pending_sec_level = sec_level; + conn->conn_timeout = conn_timeout; + done: hci_conn_hold(conn); return conn; diff --git a/kernel/net/bluetooth/hci_core.c b/kernel/net/bluetooth/hci_core.c index c4802f3bd..62edbf1b1 100644 --- a/kernel/net/bluetooth/hci_core.c +++ b/kernel/net/bluetooth/hci_core.c @@ -65,13 +65,6 @@ static DEFINE_IDA(hci_index_ida); #define hci_req_lock(d) mutex_lock(&d->req_lock) #define hci_req_unlock(d) mutex_unlock(&d->req_lock) -/* ---- HCI notifications ---- */ - -static void hci_notify(struct hci_dev *hdev, int event) -{ - hci_sock_dev_event(hdev, event); -} - /* ---- HCI debugfs entries ---- */ static ssize_t dut_mode_read(struct file *file, char __user *user_buf, @@ -94,7 +87,6 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, char buf[32]; size_t buf_size = min(count, (sizeof(buf)-1)); bool enable; - int err; if (!test_bit(HCI_UP, &hdev->flags)) return -ENETDOWN; @@ -121,12 +113,8 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, if (IS_ERR(skb)) return PTR_ERR(skb); - err = -bt_to_errno(skb->data[0]); kfree_skb(skb); - if (err < 0) - return err; - hci_dev_change_flag(hdev, HCI_DUT_MODE); return count; @@ -139,6 +127,77 @@ static const struct file_operations dut_mode_fops 
= { .llseek = default_llseek, }; +static ssize_t vendor_diag_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y': 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf)-1)); + bool enable; + int err; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &enable)) + return -EINVAL; + + /* When the diagnostic flags are not persistent and the transport + * is not active, then there is no need for the vendor callback. + * + * Instead just store the desired value. If needed the setting + * will be programmed when the controller gets powered on. 
+ */ + if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && + !test_bit(HCI_RUNNING, &hdev->flags)) + goto done; + + hci_req_lock(hdev); + err = hdev->set_diag(hdev, enable); + hci_req_unlock(hdev); + + if (err < 0) + return err; + +done: + if (enable) + hci_dev_set_flag(hdev, HCI_VENDOR_DIAG); + else + hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG); + + return count; +} + +static const struct file_operations vendor_diag_fops = { + .open = simple_open, + .read = vendor_diag_read, + .write = vendor_diag_write, + .llseek = default_llseek, +}; + +static void hci_debugfs_create_basic(struct hci_dev *hdev) +{ + debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, + &dut_mode_fops); + + if (hdev->set_diag) + debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev, + &vendor_diag_fops); +} + /* ---- HCI requests ---- */ static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, @@ -449,12 +508,6 @@ static void le_setup(struct hci_request *req) /* Read LE Supported States */ hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL); - /* Read LE White List Size */ - hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, 0, NULL); - - /* Clear LE White List */ - hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); - /* LE-only controllers have LE implicitly enabled */ if (!lmp_bredr_capable(hdev)) hci_dev_set_flag(hdev, HCI_LE_ENABLED); @@ -698,7 +751,8 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) hci_setup_event_mask(req); - if (hdev->commands[6] & 0x20) { + if (hdev->commands[6] & 0x20 && + !test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) { struct hci_cp_read_stored_link_key cp; bacpy(&cp.bdaddr, BDADDR_ANY); @@ -772,6 +826,17 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); } + if (hdev->commands[26] & 0x40) { + /* Read LE White List Size */ + hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, + 0, NULL); + } + + if 
(hdev->commands[26] & 0x80) { + /* Clear LE White List */ + hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); + } + if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { /* Read LE Maximum Data Length */ hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL); @@ -854,13 +919,8 @@ static int __hci_init(struct hci_dev *hdev) if (err < 0) return err; - /* The Device Under Test (DUT) mode is special and available for - * all controller types. So just create it early on. - */ - if (hci_dev_test_flag(hdev, HCI_SETUP)) { - debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, - &dut_mode_fops); - } + if (hci_dev_test_flag(hdev, HCI_SETUP)) + hci_debugfs_create_basic(hdev); err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT); if (err < 0) @@ -937,6 +997,9 @@ static int __hci_unconf_init(struct hci_dev *hdev) if (err < 0) return err; + if (hci_dev_test_flag(hdev, HCI_SETUP)) + hci_debugfs_create_basic(hdev); + return 0; } @@ -1389,10 +1452,15 @@ static int hci_dev_do_open(struct hci_dev *hdev) goto done; } + set_bit(HCI_RUNNING, &hdev->flags); + hci_sock_dev_event(hdev, HCI_DEV_OPEN); + atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); if (hci_dev_test_flag(hdev, HCI_SETUP)) { + hci_sock_dev_event(hdev, HCI_DEV_SETUP); + if (hdev->setup) ret = hdev->setup(hdev); @@ -1433,17 +1501,28 @@ static int hci_dev_do_open(struct hci_dev *hdev) if (!ret) { if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && - !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = __hci_init(hdev); + if (!ret && hdev->post_init) + ret = hdev->post_init(hdev); + } } + /* If the HCI Reset command is clearing all diagnostic settings, + * then they need to be reprogrammed after the init procedure + * completed. 
+ */ + if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && + hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) + ret = hdev->set_diag(hdev, true); + clear_bit(HCI_INIT, &hdev->flags); if (!ret) { hci_dev_hold(hdev); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); set_bit(HCI_UP, &hdev->flags); - hci_notify(hdev, HCI_DEV_UP); + hci_sock_dev_event(hdev, HCI_DEV_UP); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG) && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && @@ -1470,6 +1549,9 @@ static int hci_dev_do_open(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + clear_bit(HCI_RUNNING, &hdev->flags); + hci_sock_dev_event(hdev, HCI_DEV_CLOSE); + hdev->close(hdev); hdev->flags &= BIT(HCI_RAW); } @@ -1553,11 +1635,14 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) BT_DBG("All LE pending actions cleared"); } -static int hci_dev_do_close(struct hci_dev *hdev) +int hci_dev_do_close(struct hci_dev *hdev) { + bool auto_off; + BT_DBG("%s %p", hdev->name, hdev); if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && test_bit(HCI_UP, &hdev->flags)) { /* Execute vendor specific shutdown routine */ if (hdev->shutdown) @@ -1595,6 +1680,11 @@ static int hci_dev_do_close(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_MGMT)) cancel_delayed_work_sync(&hdev->rpa_expired); + if (hdev->adv_instance_timeout) { + cancel_delayed_work_sync(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. 
*/ @@ -1604,10 +1694,10 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - if (!hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { - if (hdev->dev_type == HCI_BREDR) - mgmt_powered(hdev, 0); - } + auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); + + if (!auto_off && hdev->dev_type == HCI_BREDR) + mgmt_powered(hdev, 0); hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); @@ -1616,7 +1706,7 @@ static int hci_dev_do_close(struct hci_dev *hdev) smp_unregister(hdev); - hci_notify(hdev, HCI_DEV_DOWN); + hci_sock_dev_event(hdev, HCI_DEV_DOWN); if (hdev->flush) hdev->flush(hdev); @@ -1624,9 +1714,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); - if (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) && - !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && - test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { + if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) && + !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT); clear_bit(HCI_INIT, &hdev->flags); @@ -1647,6 +1736,9 @@ static int hci_dev_do_close(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + clear_bit(HCI_RUNNING, &hdev->flags); + hci_sock_dev_event(hdev, HCI_DEV_CLOSE); + /* After this point our queues are empty * and no tasks are scheduled. 
*/ hdev->close(hdev); @@ -2151,6 +2243,17 @@ static void hci_discov_off(struct work_struct *work) mgmt_discoverable_timeout(hdev); } +static void hci_adv_timeout_expire(struct work_struct *work) +{ + struct hci_dev *hdev; + + hdev = container_of(work, struct hci_dev, adv_instance_expire.work); + + BT_DBG("%s", hdev->name); + + mgmt_adv_timeout_expired(hdev); +} + void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; @@ -2614,6 +2717,130 @@ int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, return 0; } +/* This function requires the caller holds hdev->lock */ +struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv_instance; + + list_for_each_entry(adv_instance, &hdev->adv_instances, list) { + if (adv_instance->instance == instance) + return adv_instance; + } + + return NULL; +} + +/* This function requires the caller holds hdev->lock */ +struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { + struct adv_info *cur_instance; + + cur_instance = hci_find_adv_instance(hdev, instance); + if (!cur_instance) + return NULL; + + if (cur_instance == list_last_entry(&hdev->adv_instances, + struct adv_info, list)) + return list_first_entry(&hdev->adv_instances, + struct adv_info, list); + else + return list_next_entry(cur_instance, list); +} + +/* This function requires the caller holds hdev->lock */ +int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -ENOENT; + + BT_DBG("%s removing %dMR", hdev->name, instance); + + if (hdev->cur_adv_instance == instance && hdev->adv_instance_timeout) { + cancel_delayed_work(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + + list_del(&adv_instance->list); + kfree(adv_instance); + + hdev->adv_instance_cnt--; + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +void 
hci_adv_instances_clear(struct hci_dev *hdev) +{ + struct adv_info *adv_instance, *n; + + if (hdev->adv_instance_timeout) { + cancel_delayed_work(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { + list_del(&adv_instance->list); + kfree(adv_instance); + } + + hdev->adv_instance_cnt = 0; +} + +/* This function requires the caller holds hdev->lock */ +int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, + u16 adv_data_len, u8 *adv_data, + u16 scan_rsp_len, u8 *scan_rsp_data, + u16 timeout, u16 duration) +{ + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (adv_instance) { + memset(adv_instance->adv_data, 0, + sizeof(adv_instance->adv_data)); + memset(adv_instance->scan_rsp_data, 0, + sizeof(adv_instance->scan_rsp_data)); + } else { + if (hdev->adv_instance_cnt >= HCI_MAX_ADV_INSTANCES || + instance < 1 || instance > HCI_MAX_ADV_INSTANCES) + return -EOVERFLOW; + + adv_instance = kzalloc(sizeof(*adv_instance), GFP_KERNEL); + if (!adv_instance) + return -ENOMEM; + + adv_instance->pending = true; + adv_instance->instance = instance; + list_add(&adv_instance->list, &hdev->adv_instances); + hdev->adv_instance_cnt++; + } + + adv_instance->flags = flags; + adv_instance->adv_data_len = adv_data_len; + adv_instance->scan_rsp_len = scan_rsp_len; + + if (adv_data_len) + memcpy(adv_instance->adv_data, adv_data, adv_data_len); + + if (scan_rsp_len) + memcpy(adv_instance->scan_rsp_data, + scan_rsp_data, scan_rsp_len); + + adv_instance->timeout = timeout; + adv_instance->remaining_time = timeout; + + if (duration == 0) + adv_instance->duration = HCI_DEFAULT_ADV_DURATION; + else + adv_instance->duration = duration; + + BT_DBG("%s for %dMR", hdev->name, instance); + + return 0; +} + struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { @@ -2686,10 +2913,6 @@ struct hci_conn_params 
*hci_conn_params_lookup(struct hci_dev *hdev, { struct hci_conn_params *params; - /* The conn params list only contains identity addresses */ - if (!hci_is_identity_address(addr, addr_type)) - return NULL; - list_for_each_entry(params, &hdev->le_conn_params, list) { if (bacmp(&params->addr, addr) == 0 && params->addr_type == addr_type) { @@ -2706,10 +2929,6 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, { struct hci_conn_params *param; - /* The list only contains identity addresses */ - if (!hci_is_identity_address(addr, addr_type)) - return NULL; - list_for_each_entry(param, list, action) { if (bacmp(&param->addr, addr) == 0 && param->addr_type == addr_type) @@ -2725,9 +2944,6 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, { struct hci_conn_params *params; - if (!hci_is_identity_address(addr, addr_type)) - return NULL; - params = hci_conn_params_lookup(hdev, addr, addr_type); if (params) return params; @@ -2791,6 +3007,15 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev) list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; + + /* If trying to estabilish one time connection to disabled + * device, leave the params, but mark them as just once. 
+ */ + if (params->explicit_connect) { + params->auto_connect = HCI_AUTO_CONN_EXPLICIT; + continue; + } + list_del(&params->list); kfree(params); } @@ -3019,6 +3244,9 @@ struct hci_dev *hci_alloc_dev(void) hdev->manufacturer = 0xffff; /* Default to internal use */ hdev->inq_tx_power = HCI_TX_POWER_INVALID; hdev->adv_tx_power = HCI_TX_POWER_INVALID; + hdev->adv_instance_cnt = 0; + hdev->cur_adv_instance = 0x00; + hdev->adv_instance_timeout = 0; hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; @@ -3060,6 +3288,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); + INIT_LIST_HEAD(&hdev->adv_instances); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); @@ -3071,6 +3300,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_DELAYED_WORK(&hdev->discov_off, hci_discov_off); INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work); INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work); + INIT_DELAYED_WORK(&hdev->adv_instance_expire, hci_adv_timeout_expire); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); @@ -3082,7 +3312,6 @@ struct hci_dev *hci_alloc_dev(void) hci_init_sysfs(hdev); discovery_init(hdev); - adv_info_init(hdev); return hdev; } @@ -3183,7 +3412,7 @@ int hci_register_dev(struct hci_dev *hdev) if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); - hci_notify(hdev, HCI_DEV_REG); + hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); queue_work(hdev->req_workqueue, &hdev->power_on); @@ -3231,7 +3460,7 @@ void hci_unregister_dev(struct hci_dev *hdev) * pending list */ BUG_ON(!list_empty(&hdev->mgmt_pending)); - hci_notify(hdev, HCI_DEV_UNREG); + hci_sock_dev_event(hdev, HCI_DEV_UNREG); if (hdev->rfkill) { rfkill_unregister(hdev->rfkill); @@ -3253,6 +3482,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_smp_ltks_clear(hdev); 
hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); + hci_adv_instances_clear(hdev); hci_bdaddr_list_clear(&hdev->le_white_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); @@ -3267,7 +3497,7 @@ EXPORT_SYMBOL(hci_unregister_dev); /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { - hci_notify(hdev, HCI_DEV_SUSPEND); + hci_sock_dev_event(hdev, HCI_DEV_SUSPEND); return 0; } EXPORT_SYMBOL(hci_suspend_dev); @@ -3275,7 +3505,7 @@ EXPORT_SYMBOL(hci_suspend_dev); /* Resume HCI device */ int hci_resume_dev(struct hci_dev *hdev) { - hci_notify(hdev, HCI_DEV_RESUME); + hci_sock_dev_event(hdev, HCI_DEV_RESUME); return 0; } EXPORT_SYMBOL(hci_resume_dev); @@ -3307,6 +3537,13 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) return -ENXIO; } + if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT && + bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && + bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) { + kfree_skb(skb); + return -EINVAL; + } + /* Incoming skb */ bt_cb(skb)->incoming = 1; @@ -3320,6 +3557,22 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) } EXPORT_SYMBOL(hci_recv_frame); +/* Receive diagnostic message from HCI drivers */ +int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) +{ + /* Mark as diagnostic packet */ + bt_cb(skb)->pkt_type = HCI_DIAG_PKT; + + /* Time stamp */ + __net_timestamp(skb); + + skb_queue_tail(&hdev->rx_q, skb); + queue_work(hdev->workqueue, &hdev->rx_work); + + return 0; +} +EXPORT_SYMBOL(hci_recv_diag); + /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) @@ -3366,6 +3619,11 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) /* Get rid of skb owner, prior to sending to the driver. 
*/ skb_orphan(skb); + if (!test_bit(HCI_RUNNING, &hdev->flags)) { + kfree_skb(skb); + return; + } + err = hdev->send(hdev, skb); if (err < 0) { BT_ERR("%s sending frame failed (%d)", hdev->name, err); @@ -3390,7 +3648,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, /* Stand-alone HCI commands must be flagged as * single-command requests. */ - bt_cb(skb)->req.start = true; + bt_cb(skb)->hci.req_start = true; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); @@ -3416,6 +3674,25 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; } +/* Send HCI command and wait for command commplete event */ +struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout) +{ + struct sk_buff *skb; + + if (!test_bit(HCI_UP, &hdev->flags)) + return ERR_PTR(-ENETDOWN); + + bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); + + hci_req_lock(hdev); + skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout); + hci_req_unlock(hdev); + + return skb; +} +EXPORT_SYMBOL(hci_cmd_sync); + /* Send ACL data */ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) { @@ -4068,7 +4345,7 @@ static bool hci_req_is_complete(struct hci_dev *hdev) if (!skb) return true; - return bt_cb(skb)->req.start; + return bt_cb(skb)->hci.req_start; } static void hci_resend_last(struct hci_dev *hdev) @@ -4128,26 +4405,26 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, * callback would be found in hdev->sent_cmd instead of the * command queue (hdev->cmd_q). 
*/ - if (bt_cb(hdev->sent_cmd)->req.complete) { - *req_complete = bt_cb(hdev->sent_cmd)->req.complete; + if (bt_cb(hdev->sent_cmd)->hci.req_complete) { + *req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete; return; } - if (bt_cb(hdev->sent_cmd)->req.complete_skb) { - *req_complete_skb = bt_cb(hdev->sent_cmd)->req.complete_skb; + if (bt_cb(hdev->sent_cmd)->hci.req_complete_skb) { + *req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb; return; } /* Remove all pending commands belonging to this request */ spin_lock_irqsave(&hdev->cmd_q.lock, flags); while ((skb = __skb_dequeue(&hdev->cmd_q))) { - if (bt_cb(skb)->req.start) { + if (bt_cb(skb)->hci.req_start) { __skb_queue_head(&hdev->cmd_q, skb); break; } - *req_complete = bt_cb(skb)->req.complete; - *req_complete_skb = bt_cb(skb)->req.complete_skb; + *req_complete = bt_cb(skb)->hci.req_complete; + *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; kfree_skb(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); diff --git a/kernel/net/bluetooth/hci_event.c b/kernel/net/bluetooth/hci_event.c index 7b61be736..d57c11c1c 100644 --- a/kernel/net/bluetooth/hci_event.c +++ b/kernel/net/bluetooth/hci_event.c @@ -55,7 +55,12 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) wake_up_bit(&hdev->flags, HCI_INQUIRY); hci_dev_lock(hdev); - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + /* Set discovery state to stopped if we're not doing LE active + * scanning. 
+ */ + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) || + hdev->le_scan_type != LE_SCAN_ACTIVE) + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); hci_dev_unlock(hdev); hci_conn_check_pending(hdev); @@ -823,7 +828,7 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); if (rp->status) - goto a2mp_rsp; + return; hdev->amp_status = rp->amp_status; hdev->amp_total_bw = __le32_to_cpu(rp->total_bw); @@ -835,46 +840,6 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev, hdev->amp_assoc_size = __le16_to_cpu(rp->max_assoc_size); hdev->amp_be_flush_to = __le32_to_cpu(rp->be_flush_to); hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to); - -a2mp_rsp: - a2mp_send_getinfo_rsp(hdev); -} - -static void hci_cc_read_local_amp_assoc(struct hci_dev *hdev, - struct sk_buff *skb) -{ - struct hci_rp_read_local_amp_assoc *rp = (void *) skb->data; - struct amp_assoc *assoc = &hdev->loc_assoc; - size_t rem_len, frag_len; - - BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - - if (rp->status) - goto a2mp_rsp; - - frag_len = skb->len - sizeof(*rp); - rem_len = __le16_to_cpu(rp->rem_len); - - if (rem_len > frag_len) { - BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len); - - memcpy(assoc->data + assoc->offset, rp->frag, frag_len); - assoc->offset += frag_len; - - /* Read other fragments */ - amp_read_loc_assoc_frag(hdev, rp->phy_handle); - - return; - } - - memcpy(assoc->data + assoc->offset, rp->frag, rem_len); - assoc->len = assoc->offset + rem_len; - assoc->offset = 0; - -a2mp_rsp: - /* Send A2MP Rsp when all fragments are received */ - a2mp_send_getampassoc_rsp(hdev, rp->status); - a2mp_send_create_phy_link_req(hdev, rp->status); } static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, @@ -1099,7 +1064,7 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_set_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + conn = 
hci_lookup_le_connect(hdev); if (conn) queue_delayed_work(hdev->workqueue, &conn->le_conn_timeout, @@ -1409,20 +1374,6 @@ static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } -static void hci_cc_write_remote_amp_assoc(struct hci_dev *hdev, - struct sk_buff *skb) -{ - struct hci_rp_write_remote_amp_assoc *rp = (void *) skb->data; - - BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x", - hdev->name, rp->status, rp->phy_handle); - - if (rp->status) - return; - - amp_write_rem_assoc_continue(hdev, rp->phy_handle); -} - static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_rp_read_rssi *rp = (void *) skb->data; @@ -1944,47 +1895,6 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) hci_dev_unlock(hdev); } -static void hci_cs_create_phylink(struct hci_dev *hdev, u8 status) -{ - struct hci_cp_create_phy_link *cp; - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK); - if (!cp) - return; - - hci_dev_lock(hdev); - - if (status) { - struct hci_conn *hcon; - - hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle); - if (hcon) - hci_conn_del(hcon); - } else { - amp_write_remote_assoc(hdev, cp->phy_handle); - } - - hci_dev_unlock(hdev); -} - -static void hci_cs_accept_phylink(struct hci_dev *hdev, u8 status) -{ - struct hci_cp_accept_phy_link *cp; - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - if (status) - return; - - cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK); - if (!cp) - return; - - amp_write_remote_assoc(hdev, cp->phy_handle); -} - static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) { struct hci_cp_le_create_conn *cp; @@ -2005,7 +1915,8 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->peer_addr); + conn = hci_conn_hash_lookup_le(hdev, &cp->peer_addr, + cp->peer_addr_type); if (!conn) goto unlock; @@ 
-2603,6 +2514,63 @@ unlock: hci_dev_unlock(hdev); } +static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + const struct hci_rp_read_enc_key_size *rp; + struct hci_conn *conn; + u16 handle; + + BT_DBG("%s status 0x%02x", hdev->name, status); + + if (!skb || skb->len < sizeof(*rp)) { + BT_ERR("%s invalid HCI Read Encryption Key Size response", + hdev->name); + return; + } + + rp = (void *)skb->data; + handle = le16_to_cpu(rp->handle); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) + goto unlock; + + /* If we fail to read the encryption key size, assume maximum + * (which is the same we do also when this HCI command isn't + * supported. + */ + if (rp->status) { + BT_ERR("%s failed to read key size for handle %u", hdev->name, + handle); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } else { + conn->enc_key_size = rp->key_size; + } + + if (conn->state == BT_CONFIG) { + conn->state = BT_CONNECTED; + hci_connect_cfm(conn, 0); + hci_conn_drop(conn); + } else { + u8 encrypt; + + if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) + encrypt = 0x00; + else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) + encrypt = 0x02; + else + encrypt = 0x01; + + hci_encrypt_cfm(conn, 0, encrypt); + } + +unlock: + hci_dev_unlock(hdev); +} + static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_encrypt_change *ev = (void *) skb->data; @@ -2650,22 +2618,51 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; } - if (conn->state == BT_CONFIG) { - if (!ev->status) - conn->state = BT_CONNECTED; + /* In Secure Connections Only mode, do not allow any connections + * that are not encrypted with AES-CCM using a P-256 authenticated + * combination key. 
+ */ + if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && + (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || + conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { + hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); + hci_conn_drop(conn); + goto unlock; + } + + /* Try reading the encryption key size for encrypted ACL links */ + if (!ev->status && ev->encrypt && conn->type == ACL_LINK) { + struct hci_cp_read_enc_key_size cp; + struct hci_request req; - /* In Secure Connections Only mode, do not allow any - * connections that are not encrypted with AES-CCM - * using a P-256 authenticated combination key. + /* Only send HCI_Read_Encryption_Key_Size if the + * controller really supports it. If it doesn't, assume + * the default size (16). */ - if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && - (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || - conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { - hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); - hci_conn_drop(conn); - goto unlock; + if (!(hdev->commands[20] & 0x10)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + goto notify; + } + + hci_req_init(&req, hdev); + + cp.handle = cpu_to_le16(conn->handle); + hci_req_add(&req, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp); + + if (hci_req_run_skb(&req, read_enc_key_size_complete)) { + BT_ERR("Sending HCI Read Encryption Key Size failed"); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + goto notify; } + goto unlock; + } + +notify: + if (conn->state == BT_CONFIG) { + if (!ev->status) + conn->state = BT_CONNECTED; + hci_connect_cfm(conn, ev->status); hci_conn_drop(conn); } else @@ -2912,10 +2909,6 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_read_clock(hdev, skb); break; - case HCI_OP_READ_LOCAL_AMP_ASSOC: - hci_cc_read_local_amp_assoc(hdev, skb); - break; - case HCI_OP_READ_INQ_RSP_TX_POWER: hci_cc_read_inq_rsp_tx_power(hdev, skb); break; @@ -3020,10 +3013,6 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_set_adv_param(hdev, skb); 
break; - case HCI_OP_WRITE_REMOTE_AMP_ASSOC: - hci_cc_write_remote_amp_assoc(hdev, skb); - break; - case HCI_OP_READ_RSSI: hci_cc_read_rssi(hdev, skb); break; @@ -3107,14 +3096,6 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cs_setup_sync_conn(hdev, ev->status); break; - case HCI_OP_CREATE_PHY_LINK: - hci_cs_create_phylink(hdev, ev->status); - break; - - case HCI_OP_ACCEPT_PHY_LINK: - hci_cs_accept_phylink(hdev, ev->status); - break; - case HCI_OP_SNIFF_MODE: hci_cs_sniff_mode(hdev, ev->status); break; @@ -3157,7 +3138,7 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, * complete event). */ if (ev->status || - (hdev->sent_cmd && !bt_cb(hdev->sent_cmd)->req.event)) + (hdev->sent_cmd && !bt_cb(hdev->sent_cmd)->hci.req_event)) hci_req_cmd_complete(hdev, *opcode, ev->status, req_complete, req_complete_skb); @@ -3751,17 +3732,25 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, if (ev->link_type == ESCO_LINK) goto unlock; + /* When the link type in the event indicates SCO connection + * and lookup of the connection object fails, then check + * if an eSCO connection object exists. + * + * The core limits the synchronous connections to either + * SCO or eSCO. The eSCO connection is preferred and tried + * to be setup first and until successfully established, + * the link type will be hinted as eSCO. 
+ */ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr); if (!conn) goto unlock; - - conn->type = SCO_LINK; } switch (ev->status) { case 0x00: conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; + conn->type = ev->link_type; hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); @@ -4313,6 +4302,23 @@ unlock: hci_dev_unlock(hdev); } +#if IS_ENABLED(CONFIG_BT_HS) +static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_channel_selected *ev = (void *)skb->data; + struct hci_conn *hcon; + + BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle); + + skb_pull(skb, sizeof(*ev)); + + hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); + if (!hcon) + return; + + amp_read_loc_assoc_final_data(hdev, hcon); +} + static void hci_phy_link_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -4436,6 +4442,7 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, hci_dev_unlock(hdev); } +#endif static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -4454,7 +4461,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + conn = hci_lookup_le_connect(hdev); if (!conn) { conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role); if (!conn) { @@ -4647,42 +4654,49 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, /* If we're not connectable only connect devices that we have in * our pend_le_conns list. */ - params = hci_pend_le_action_lookup(&hdev->pend_le_conns, - addr, addr_type); + params = hci_pend_le_action_lookup(&hdev->pend_le_conns, addr, + addr_type); if (!params) return NULL; - switch (params->auto_connect) { - case HCI_AUTO_CONN_DIRECT: - /* Only devices advertising with ADV_DIRECT_IND are - * triggering a connection attempt. This is allowing - * incoming connections from slave devices. 
- */ - if (adv_type != LE_ADV_DIRECT_IND) + if (!params->explicit_connect) { + switch (params->auto_connect) { + case HCI_AUTO_CONN_DIRECT: + /* Only devices advertising with ADV_DIRECT_IND are + * triggering a connection attempt. This is allowing + * incoming connections from slave devices. + */ + if (adv_type != LE_ADV_DIRECT_IND) + return NULL; + break; + case HCI_AUTO_CONN_ALWAYS: + /* Devices advertising with ADV_IND or ADV_DIRECT_IND + * are triggering a connection attempt. This means + * that incoming connectioms from slave device are + * accepted and also outgoing connections to slave + * devices are established when found. + */ + break; + default: return NULL; - break; - case HCI_AUTO_CONN_ALWAYS: - /* Devices advertising with ADV_IND or ADV_DIRECT_IND - * are triggering a connection attempt. This means - * that incoming connectioms from slave device are - * accepted and also outgoing connections to slave - * devices are established when found. - */ - break; - default: - return NULL; + } } conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER); if (!IS_ERR(conn)) { - /* Store the pointer since we don't really have any + /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned + * by higher layer that tried to connect, if no then + * store the pointer since we don't really have any * other owner of the object besides the params that * triggered it. This way we can abort the connection if * the parameters get removed and keep the reference * count consistent once the connection is established. 
*/ - params->conn = hci_conn_get(conn); + + if (!params->explicit_connect) + params->conn = hci_conn_get(conn); + return conn; } @@ -4711,6 +4725,27 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, struct hci_conn *conn; bool match; u32 flags; + u8 *ptr, real_len; + + /* Find the end of the data in case the report contains padded zero + * bytes at the end causing an invalid length value. + * + * When data is NULL, len is 0 so there is no need for extra ptr + * check as 'ptr < data + 0' is already false in such case. + */ + for (ptr = data; ptr < data + len && *ptr; ptr += *ptr + 1) { + if (ptr + 1 + *ptr > data + len) + break; + } + + real_len = ptr - data; + + /* Adjust for actual length */ + if (len != real_len) { + BT_ERR_RATELIMITED("%s advertising data length corrected", + hdev->name); + len = real_len; + } /* If the direct address is present, then this report is from * a LE Direct Advertising Report event. In that case it is @@ -4955,7 +4990,8 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) goto not_found; } - memcpy(cp.ltk, ltk->val, sizeof(ltk->val)); + memcpy(cp.ltk, ltk->val, ltk->enc_size); + memset(cp.ltk + ltk->enc_size, 0, sizeof(cp.ltk) - ltk->enc_size); cp.handle = cpu_to_le16(conn->handle); conn->pending_sec_level = smp_ltk_sec_level(ltk); @@ -5119,22 +5155,6 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) } } -static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb) -{ - struct hci_ev_channel_selected *ev = (void *) skb->data; - struct hci_conn *hcon; - - BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle); - - skb_pull(skb, sizeof(*ev)); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - return; - - amp_read_loc_assoc_final_data(hdev, hcon); -} - static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 event, struct sk_buff *skb) { @@ -5189,7 +5209,7 @@ void hci_event_packet(struct hci_dev 
*hdev, struct sk_buff *skb) u8 status = 0, event = hdr->evt, req_evt = 0; u16 opcode = HCI_OP_NOP; - if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->req.event == event) { + if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->hci.req_event == event) { struct hci_command_hdr *cmd_hdr = (void *) hdev->sent_cmd->data; opcode = __le16_to_cpu(cmd_hdr->opcode); hci_req_cmd_complete(hdev, opcode, status, &req_complete, @@ -5355,14 +5375,15 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_le_meta_evt(hdev, skb); break; - case HCI_EV_CHANNEL_SELECTED: - hci_chan_selected_evt(hdev, skb); - break; - case HCI_EV_REMOTE_OOB_DATA_REQUEST: hci_remote_oob_data_request_evt(hdev, skb); break; +#if IS_ENABLED(CONFIG_BT_HS) + case HCI_EV_CHANNEL_SELECTED: + hci_chan_selected_evt(hdev, skb); + break; + case HCI_EV_PHY_LINK_COMPLETE: hci_phy_link_complete_evt(hdev, skb); break; @@ -5378,6 +5399,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) case HCI_EV_DISCONN_PHY_LINK_COMPLETE: hci_disconn_phylink_complete_evt(hdev, skb); break; +#endif case HCI_EV_NUM_COMP_BLOCKS: hci_num_comp_blocks_evt(hdev, skb); diff --git a/kernel/net/bluetooth/hci_request.c b/kernel/net/bluetooth/hci_request.c index d6025d6e6..02778c5bc 100644 --- a/kernel/net/bluetooth/hci_request.c +++ b/kernel/net/bluetooth/hci_request.c @@ -56,8 +56,8 @@ static int req_run(struct hci_request *req, hci_req_complete_t complete, return -ENODATA; skb = skb_peek_tail(&req->cmd_q); - bt_cb(skb)->req.complete = complete; - bt_cb(skb)->req.complete_skb = complete_skb; + bt_cb(skb)->hci.req_complete = complete; + bt_cb(skb)->hci.req_complete_skb = complete_skb; spin_lock_irqsave(&hdev->cmd_q.lock, flags); skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q); @@ -99,7 +99,7 @@ struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, BT_DBG("skb len %d", skb->len); bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; - bt_cb(skb)->opcode = opcode; + bt_cb(skb)->hci.opcode = opcode; return skb; } @@ 
-128,9 +128,9 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, } if (skb_queue_empty(&req->cmd_q)) - bt_cb(skb)->req.start = true; + bt_cb(skb)->hci.req_start = true; - bt_cb(skb)->req.event = event; + bt_cb(skb)->hci.req_event = event; skb_queue_tail(&req->cmd_q, skb); } @@ -175,21 +175,29 @@ static u8 update_white_list(struct hci_request *req) * command to remove it from the controller. */ list_for_each_entry(b, &hdev->le_white_list, list) { - struct hci_cp_le_del_from_white_list cp; + /* If the device is neither in pend_le_conns nor + * pend_le_reports then remove it from the whitelist. + */ + if (!hci_pend_le_action_lookup(&hdev->pend_le_conns, + &b->bdaddr, b->bdaddr_type) && + !hci_pend_le_action_lookup(&hdev->pend_le_reports, + &b->bdaddr, b->bdaddr_type)) { + struct hci_cp_le_del_from_white_list cp; + + cp.bdaddr_type = b->bdaddr_type; + bacpy(&cp.bdaddr, &b->bdaddr); - if (hci_pend_le_action_lookup(&hdev->pend_le_conns, - &b->bdaddr, b->bdaddr_type) || - hci_pend_le_action_lookup(&hdev->pend_le_reports, - &b->bdaddr, b->bdaddr_type)) { - white_list_entries++; + hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, + sizeof(cp), &cp); continue; } - cp.bdaddr_type = b->bdaddr_type; - bacpy(&cp.bdaddr, &b->bdaddr); + if (hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { + /* White list can not be used with RPAs */ + return 0x00; + } - hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, - sizeof(cp), &cp); + white_list_entries++; } /* Since all no longer valid white list entries have been @@ -317,7 +325,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) * address be updated at the next cycle. 
*/ if (hci_dev_test_flag(hdev, HCI_LE_ADV) || - hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) { + hci_lookup_le_connect(hdev)) { BT_DBG("Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return; @@ -479,7 +487,6 @@ void hci_update_page_scan(struct hci_dev *hdev) void __hci_update_background_scan(struct hci_request *req) { struct hci_dev *hdev = req->hdev; - struct hci_conn *conn; if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || @@ -529,8 +536,7 @@ void __hci_update_background_scan(struct hci_request *req) * since some controllers are not able to scan and connect at * the same time. */ - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); - if (conn) + if (hci_lookup_le_connect(hdev)) return; /* If controller is currently scanning, we stop it to ensure we @@ -566,3 +572,96 @@ void hci_update_background_scan(struct hci_dev *hdev) if (err && err != -ENODATA) BT_ERR("Failed to run HCI request: err %d", err); } + +void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, + u8 reason) +{ + switch (conn->state) { + case BT_CONNECTED: + case BT_CONFIG: + if (conn->type == AMP_LINK) { + struct hci_cp_disconn_phy_link cp; + + cp.phy_handle = HCI_PHY_HANDLE(conn->handle); + cp.reason = reason; + hci_req_add(req, HCI_OP_DISCONN_PHY_LINK, sizeof(cp), + &cp); + } else { + struct hci_cp_disconnect dc; + + dc.handle = cpu_to_le16(conn->handle); + dc.reason = reason; + hci_req_add(req, HCI_OP_DISCONNECT, sizeof(dc), &dc); + } + + conn->state = BT_DISCONN; + + break; + case BT_CONNECT: + if (conn->type == LE_LINK) { + if (test_bit(HCI_CONN_SCANNING, &conn->flags)) + break; + hci_req_add(req, HCI_OP_LE_CREATE_CONN_CANCEL, + 0, NULL); + } else if (conn->type == ACL_LINK) { + if (req->hdev->hci_ver < BLUETOOTH_VER_1_2) + break; + hci_req_add(req, HCI_OP_CREATE_CONN_CANCEL, + 6, &conn->dst); + } + break; + case BT_CONNECT2: + if (conn->type == ACL_LINK) { + struct hci_cp_reject_conn_req rej; + + 
bacpy(&rej.bdaddr, &conn->dst); + rej.reason = reason; + + hci_req_add(req, HCI_OP_REJECT_CONN_REQ, + sizeof(rej), &rej); + } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { + struct hci_cp_reject_sync_conn_req rej; + + bacpy(&rej.bdaddr, &conn->dst); + + /* SCO rejection has its own limited set of + * allowed error values (0x0D-0x0F) which isn't + * compatible with most values passed to this + * function. To be safe hard-code one of the + * values that's suitable for SCO. + */ + rej.reason = HCI_ERROR_REMOTE_LOW_RESOURCES; + + hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ, + sizeof(rej), &rej); + } + break; + default: + conn->state = BT_CLOSED; + break; + } +} + +static void abort_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) +{ + if (status) + BT_DBG("Failed to abort connection: status 0x%2.2x", status); +} + +int hci_abort_conn(struct hci_conn *conn, u8 reason) +{ + struct hci_request req; + int err; + + hci_req_init(&req, conn->hdev); + + __hci_abort_conn(&req, conn, reason); + + err = hci_req_run(&req, abort_conn_complete); + if (err && err != -ENODATA) { + BT_ERR("Failed to run HCI request: err %d", err); + return err; + } + + return 0; +} diff --git a/kernel/net/bluetooth/hci_request.h b/kernel/net/bluetooth/hci_request.h index bf6df92f4..25c7f1305 100644 --- a/kernel/net/bluetooth/hci_request.h +++ b/kernel/net/bluetooth/hci_request.h @@ -55,3 +55,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, void hci_update_background_scan(struct hci_dev *hdev); void __hci_update_background_scan(struct hci_request *req); + +int hci_abort_conn(struct hci_conn *conn, u8 reason); +void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, + u8 reason); diff --git a/kernel/net/bluetooth/hci_sock.c b/kernel/net/bluetooth/hci_sock.c index e11a5cfda..b1eb8c09a 100644 --- a/kernel/net/bluetooth/hci_sock.c +++ b/kernel/net/bluetooth/hci_sock.c @@ -120,10 +120,7 @@ static bool is_filtered_packet(struct sock 
*sk, struct sk_buff *skb) /* Apply filter */ flt = &hci_pi(sk)->filter; - if (bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) - flt_type = 0; - else - flt_type = bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS; + flt_type = bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS; if (!test_bit(flt_type, &flt->type_mask)) return true; @@ -173,6 +170,11 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) continue; if (hci_pi(sk)->channel == HCI_CHANNEL_RAW) { + if (bt_cb(skb)->pkt_type != HCI_COMMAND_PKT && + bt_cb(skb)->pkt_type != HCI_EVENT_PKT && + bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && + bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) + continue; if (is_filtered_packet(sk, skb)) continue; } else if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { @@ -279,6 +281,9 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) else opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT); break; + case HCI_DIAG_PKT: + opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); + break; default: return; } @@ -303,6 +308,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; struct hci_mon_new_index *ni; + struct hci_mon_index_info *ii; struct sk_buff *skb; __le16 opcode; @@ -312,7 +318,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) if (!skb) return NULL; - ni = (void *) skb_put(skb, HCI_MON_NEW_INDEX_SIZE); + ni = (void *)skb_put(skb, HCI_MON_NEW_INDEX_SIZE); ni->type = hdev->dev_type; ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); @@ -329,6 +335,40 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) opcode = cpu_to_le16(HCI_MON_DEL_INDEX); break; + case HCI_DEV_SETUP: + if (hdev->manufacturer == 0xffff) + return NULL; + + /* fall through */ + + case HCI_DEV_UP: + skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC); + if (!skb) + return NULL; + + ii = (void *)skb_put(skb, HCI_MON_INDEX_INFO_SIZE); + bacpy(&ii->bdaddr, &hdev->bdaddr); + ii->manufacturer = 
cpu_to_le16(hdev->manufacturer); + + opcode = cpu_to_le16(HCI_MON_INDEX_INFO); + break; + + case HCI_DEV_OPEN: + skb = bt_skb_alloc(0, GFP_ATOMIC); + if (!skb) + return NULL; + + opcode = cpu_to_le16(HCI_MON_OPEN_INDEX); + break; + + case HCI_DEV_CLOSE: + skb = bt_skb_alloc(0, GFP_ATOMIC); + if (!skb) + return NULL; + + opcode = cpu_to_le16(HCI_MON_CLOSE_INDEX); + break; + default: return NULL; } @@ -358,6 +398,28 @@ static void send_monitor_replay(struct sock *sk) if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); + + if (!test_bit(HCI_RUNNING, &hdev->flags)) + continue; + + skb = create_monitor_event(hdev, HCI_DEV_OPEN); + if (!skb) + continue; + + if (sock_queue_rcv_skb(sk, skb)) + kfree_skb(skb); + + if (test_bit(HCI_UP, &hdev->flags)) + skb = create_monitor_event(hdev, HCI_DEV_UP); + else if (hci_dev_test_flag(hdev, HCI_SETUP)) + skb = create_monitor_event(hdev, HCI_DEV_SETUP); + else + skb = NULL; + + if (skb) { + if (sock_queue_rcv_skb(sk, skb)) + kfree_skb(skb); + } } read_unlock(&hci_dev_list_lock); @@ -392,14 +454,12 @@ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) void hci_sock_dev_event(struct hci_dev *hdev, int event) { - struct hci_ev_si_device ev; - BT_DBG("hdev %s event %d", hdev->name, event); - /* Send event to monitor */ if (atomic_read(&monitor_promisc)) { struct sk_buff *skb; + /* Send event to monitor */ skb = create_monitor_event(hdev, event); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, @@ -408,10 +468,14 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event) } } - /* Send event to sockets */ - ev.event = event; - ev.dev_id = hdev->id; - hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); + if (event <= HCI_DEV_DOWN) { + struct hci_ev_si_device ev; + + /* Send event to sockets */ + ev.event = event; + ev.dev_id = hdev->id; + hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); + } if (event == HCI_DEV_UNREG) { struct sock *sk; @@ -503,9 +567,18 @@ static int hci_sock_release(struct socket 
*sock) if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { - mgmt_index_added(hdev); + /* When releasing an user channel exclusive access, + * call hci_dev_do_close directly instead of calling + * hci_dev_close to ensure the exclusive access will + * be released and the controller brought back down. + * + * The checking of HCI_AUTO_OFF is not needed in this + * case since it will have been cleared already when + * opening the user channel. + */ + hci_dev_do_close(hdev); hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); - hci_dev_close(hdev->id); + mgmt_index_added(hdev); } atomic_dec(&hdev->promisc); @@ -928,7 +1001,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, BT_DBG("sock %p, sk %p", sock, sk); - if (flags & (MSG_OOB)) + if (flags & MSG_OOB) return -EOPNOTSUPP; if (sk->sk_state == BT_CLOSED) @@ -1176,7 +1249,7 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, /* Stand-alone HCI commands must be flagged as * single-command requests. 
*/ - bt_cb(skb)->req.start = true; + bt_cb(skb)->hci.req_start = true; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); @@ -1187,6 +1260,12 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, goto drop; } + if (bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && + bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) { + err = -EINVAL; + goto drop; + } + skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } @@ -1389,7 +1468,7 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &hci_sock_ops; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern); if (!sk) return -ENOMEM; diff --git a/kernel/net/bluetooth/hidp/core.c b/kernel/net/bluetooth/hidp/core.c index 9070dfd6b..0bec4588c 100644 --- a/kernel/net/bluetooth/hidp/core.c +++ b/kernel/net/bluetooth/hidp/core.c @@ -401,6 +401,20 @@ static void hidp_idle_timeout(unsigned long arg) { struct hidp_session *session = (struct hidp_session *) arg; + /* The HIDP user-space API only contains calls to add and remove + * devices. There is no way to forward events of any kind. Therefore, + * we have to forcefully disconnect a device on idle-timeouts. This is + * unfortunate and weird API design, but it is spec-compliant and + * required for backwards-compatibility. Hence, on idle-timeout, we + * signal driver-detach events, so poll() will be woken up with an + * error-condition on both sockets. 
+ */ + + session->intr_sock->sk->sk_err = EUNATCH; + session->ctrl_sock->sk->sk_err = EUNATCH; + wake_up_interruptible(sk_sleep(session->intr_sock->sk)); + wake_up_interruptible(sk_sleep(session->ctrl_sock->sk)); + hidp_session_terminate(session); } @@ -915,6 +929,7 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, session->conn = l2cap_conn_get(conn); session->user.probe = hidp_session_probe; session->user.remove = hidp_session_remove; + INIT_LIST_HEAD(&session->user.list); session->ctrl_sock = ctrl_sock; session->intr_sock = intr_sock; skb_queue_head_init(&session->ctrl_transmit); diff --git a/kernel/net/bluetooth/hidp/sock.c b/kernel/net/bluetooth/hidp/sock.c index cb3fdde19..008ba439b 100644 --- a/kernel/net/bluetooth/hidp/sock.c +++ b/kernel/net/bluetooth/hidp/sock.c @@ -235,7 +235,7 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern); if (!sk) return -ENOMEM; diff --git a/kernel/net/bluetooth/l2cap_core.c b/kernel/net/bluetooth/l2cap_core.c index dad419782..66e8b6ee1 100644 --- a/kernel/net/bluetooth/l2cap_core.c +++ b/kernel/net/bluetooth/l2cap_core.c @@ -239,7 +239,7 @@ static u16 l2cap_alloc_cid(struct l2cap_conn *conn) else dyn_end = L2CAP_CID_DYN_END; - for (cid = L2CAP_CID_DYN_START; cid < dyn_end; cid++) { + for (cid = L2CAP_CID_DYN_START; cid <= dyn_end; cid++) { if (!__l2cap_get_chan_by_scid(conn, cid)) return cid; } @@ -1601,7 +1601,7 @@ int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user) hci_dev_lock(hdev); - if (user->list.next || user->list.prev) { + if (!list_empty(&user->list)) { ret = -EINVAL; goto out_unlock; } @@ -1631,12 +1631,10 @@ void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user) hci_dev_lock(hdev); - if (!user->list.next || !user->list.prev) + if 
(list_empty(&user->list)) goto out_unlock; - list_del(&user->list); - user->list.next = NULL; - user->list.prev = NULL; + list_del_init(&user->list); user->remove(conn, user); out_unlock: @@ -1650,9 +1648,7 @@ static void l2cap_unregister_all_users(struct l2cap_conn *conn) while (!list_empty(&conn->users)) { user = list_first_entry(&conn->users, struct l2cap_user, list); - list_del(&user->list); - user->list.next = NULL; - user->list.prev = NULL; + list_del_init(&user->list); user->remove(conn, user); } } @@ -5254,7 +5250,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, credits = __le16_to_cpu(rsp->credits); result = __le16_to_cpu(rsp->result); - if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23)) + if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23 || + dcid < L2CAP_CID_DYN_START || + dcid > L2CAP_CID_LE_DYN_END)) return -EPROTO; BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x", @@ -5274,6 +5272,11 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, switch (result) { case L2CAP_CR_SUCCESS: + if (__l2cap_get_chan_by_dcid(conn, dcid)) { + err = -EBADSLT; + break; + } + chan->ident = 0; chan->dcid = dcid; chan->omtu = mtu; @@ -5441,9 +5444,16 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, goto response_unlock; } + /* Check for valid dynamic CID range */ + if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) { + result = L2CAP_CR_INVALID_SCID; + chan = NULL; + goto response_unlock; + } + /* Check if we already have channel with that dcid */ if (__l2cap_get_chan_by_dcid(conn, scid)) { - result = L2CAP_CR_NO_MEM; + result = L2CAP_CR_SCID_IN_USE; chan = NULL; goto response_unlock; } @@ -7117,8 +7127,10 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, else role = HCI_ROLE_MASTER; - hcon = hci_connect_le(hdev, dst, dst_type, chan->sec_level, - HCI_LE_CONN_TIMEOUT, role); + hcon = hci_connect_le_scan(hdev, dst, dst_type, + chan->sec_level, + HCI_LE_CONN_TIMEOUT, + role); } else { u8 
auth_type = l2cap_get_auth_type(chan); hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type); @@ -7442,7 +7454,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) mutex_unlock(&conn->chan_lock); } -int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_hdr *hdr; @@ -7485,7 +7497,7 @@ int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ l2cap_recv_frame(conn, skb); - return 0; + return; } BT_DBG("Start: total len %d, frag len %d", len, skb->len); @@ -7544,7 +7556,6 @@ int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) drop: kfree_skb(skb); - return 0; } static struct hci_cb l2cap_cb = { diff --git a/kernel/net/bluetooth/l2cap_sock.c b/kernel/net/bluetooth/l2cap_sock.c index a7278f05e..1bb551527 100644 --- a/kernel/net/bluetooth/l2cap_sock.c +++ b/kernel/net/bluetooth/l2cap_sock.c @@ -43,7 +43,7 @@ static struct bt_sock_list l2cap_sk_list = { static const struct proto_ops l2cap_sock_ops; static void l2cap_sock_init(struct sock *sk, struct sock *parent); static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, - int proto, gfp_t prio); + int proto, gfp_t prio, int kern); bool l2cap_is_socket(struct socket *sock) { @@ -1054,18 +1054,23 @@ static void l2cap_sock_kill(struct sock *sk) sock_put(sk); } -static int __l2cap_wait_ack(struct sock *sk) +static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan) { - struct l2cap_chan *chan = l2cap_pi(sk)->chan; DECLARE_WAITQUEUE(wait, current); int err = 0; - int timeo = HZ/5; + int timeo = L2CAP_WAIT_ACK_POLL_PERIOD; + /* Timeout to prevent infinite loop */ + unsigned long timeout = jiffies + L2CAP_WAIT_ACK_TIMEOUT; add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); - while 
(chan->unacked_frames > 0 && chan->conn) { + do { + BT_DBG("Waiting for %d ACKs, timeout %04d ms", + chan->unacked_frames, time_after(jiffies, timeout) ? 0 : + jiffies_to_msecs(timeout - jiffies)); + if (!timeo) - timeo = HZ/5; + timeo = L2CAP_WAIT_ACK_POLL_PERIOD; if (signal_pending(current)) { err = sock_intr_errno(timeo); @@ -1080,7 +1085,15 @@ static int __l2cap_wait_ack(struct sock *sk) err = sock_error(sk); if (err) break; - } + + if (time_after(jiffies, timeout)) { + err = -ENOLINK; + break; + } + + } while (chan->unacked_frames > 0 && + chan->state == BT_CONNECTED); + set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; @@ -1098,41 +1111,76 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) if (!sk) return 0; + lock_sock(sk); + + if (sk->sk_shutdown) + goto shutdown_already; + + BT_DBG("Handling sock shutdown"); + + /* prevent sk structure from being freed whilst unlocked */ + sock_hold(sk); + chan = l2cap_pi(sk)->chan; - conn = chan->conn; + /* prevent chan structure from being freed whilst unlocked */ + l2cap_chan_hold(chan); BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); + if (chan->mode == L2CAP_MODE_ERTM && + chan->unacked_frames > 0 && + chan->state == BT_CONNECTED) { + err = __l2cap_wait_ack(sk, chan); + + /* After waiting for ACKs, check whether shutdown + * has already been actioned to close the L2CAP + * link such as by l2cap_disconnection_req(). 
+ */ + if (sk->sk_shutdown) + goto has_shutdown; + } + + sk->sk_shutdown = SHUTDOWN_MASK; + release_sock(sk); + + l2cap_chan_lock(chan); + conn = chan->conn; + if (conn) + /* prevent conn structure from being freed */ + l2cap_conn_get(conn); + l2cap_chan_unlock(chan); + if (conn) + /* mutex lock must be taken before l2cap_chan_lock() */ mutex_lock(&conn->chan_lock); l2cap_chan_lock(chan); - lock_sock(sk); + l2cap_chan_close(chan, 0); + l2cap_chan_unlock(chan); - if (!sk->sk_shutdown) { - if (chan->mode == L2CAP_MODE_ERTM) - err = __l2cap_wait_ack(sk); + if (conn) { + mutex_unlock(&conn->chan_lock); + l2cap_conn_put(conn); + } - sk->sk_shutdown = SHUTDOWN_MASK; + lock_sock(sk); - release_sock(sk); - l2cap_chan_close(chan, 0); - lock_sock(sk); + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + !(current->flags & PF_EXITING)) + err = bt_sock_wait_state(sk, BT_CLOSED, + sk->sk_lingertime); - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && - !(current->flags & PF_EXITING)) - err = bt_sock_wait_state(sk, BT_CLOSED, - sk->sk_lingertime); - } +has_shutdown: + l2cap_chan_put(chan); + sock_put(sk); +shutdown_already: if (!err && sk->sk_err) err = -sk->sk_err; release_sock(sk); - l2cap_chan_unlock(chan); - if (conn) - mutex_unlock(&conn->chan_lock); + BT_DBG("Sock shutdown complete err: %d", err); return err; } @@ -1193,7 +1241,7 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) } sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP, - GFP_ATOMIC); + GFP_ATOMIC, 0); if (!sk) { release_sock(parent); return NULL; @@ -1523,12 +1571,12 @@ static struct proto l2cap_proto = { }; static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, - int proto, gfp_t prio) + int proto, gfp_t prio, int kern) { struct sock *sk; struct l2cap_chan *chan; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern); if (!sk) return NULL; @@ -1574,7 +1622,7 @@ static int 
l2cap_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &l2cap_sock_ops; - sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; diff --git a/kernel/net/bluetooth/lib.c b/kernel/net/bluetooth/lib.c index b36bc0415..aa4cf64e3 100644 --- a/kernel/net/bluetooth/lib.c +++ b/kernel/net/bluetooth/lib.c @@ -151,6 +151,22 @@ void bt_info(const char *format, ...) } EXPORT_SYMBOL(bt_info); +void bt_warn(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + pr_warn("%pV", &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_warn); + void bt_err(const char *format, ...) { struct va_format vaf; @@ -166,3 +182,19 @@ void bt_err(const char *format, ...) va_end(args); } EXPORT_SYMBOL(bt_err); + +void bt_err_ratelimited(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + pr_err_ratelimited("%pV", &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_err_ratelimited); diff --git a/kernel/net/bluetooth/mgmt.c b/kernel/net/bluetooth/mgmt.c index 7fd87e713..7f2211927 100644 --- a/kernel/net/bluetooth/mgmt.c +++ b/kernel/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 9 +#define MGMT_REVISION 10 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -268,6 +268,14 @@ static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len, HCI_SOCK_TRUSTED, skip_sk); } +static u8 le_addr_type(u8 mgmt_addr_type) +{ + if (mgmt_addr_type == BDADDR_LE_PUBLIC) + return ADDR_LE_DEV_PUBLIC; + else + return ADDR_LE_DEV_RANDOM; +} + static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -832,6 +840,20 @@ static struct mgmt_pending_cmd *pending_find_data(u16 opcode, return mgmt_pending_find_data(HCI_CHANNEL_CONTROL, 
opcode, hdev, data); } +static u8 get_current_adv_instance(struct hci_dev *hdev) +{ + /* The "Set Advertising" setting supersedes the "Add Advertising" + * setting. Here we set the advertising data based on which + * setting was set. When neither apply, default to the global settings, + * represented by instance "0". + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && + !hci_dev_test_flag(hdev, HCI_ADVERTISING)) + return hdev->cur_adv_instance; + + return 0x00; +} + static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) { u8 ad_len = 0; @@ -858,19 +880,25 @@ static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) return ad_len; } -static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, + u8 *ptr) { + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + /* TODO: Set the appropriate entries based on advertising instance flags * here once flags other than 0 are supported. 
*/ - memcpy(ptr, hdev->adv_instance.scan_rsp_data, - hdev->adv_instance.scan_rsp_len); + memcpy(ptr, adv_instance->scan_rsp_data, + adv_instance->scan_rsp_len); - return hdev->adv_instance.scan_rsp_len; + return adv_instance->scan_rsp_len; } -static void update_scan_rsp_data_for_instance(struct hci_request *req, - u8 instance) +static void update_inst_scan_rsp_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; struct hci_cp_le_set_scan_rsp_data cp; @@ -882,7 +910,7 @@ static void update_scan_rsp_data_for_instance(struct hci_request *req, memset(&cp, 0, sizeof(cp)); if (instance) - len = create_instance_scan_rsp_data(hdev, cp.data); + len = create_instance_scan_rsp_data(hdev, instance, cp.data); else len = create_default_scan_rsp_data(hdev, cp.data); @@ -900,21 +928,7 @@ static void update_scan_rsp_data_for_instance(struct hci_request *req, static void update_scan_rsp_data(struct hci_request *req) { - struct hci_dev *hdev = req->hdev; - u8 instance; - - /* The "Set Advertising" setting supersedes the "Add Advertising" - * setting. Here we set the scan response data based on which - * setting was set. When neither apply, default to the global settings, - * represented by instance "0". - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && - !hci_dev_test_flag(hdev, HCI_ADVERTISING)) - instance = 0x01; - else - instance = 0x00; - - update_scan_rsp_data_for_instance(req, instance); + update_inst_scan_rsp_data(req, get_current_adv_instance(req->hdev)); } static u8 get_adv_discov_flags(struct hci_dev *hdev) @@ -941,20 +955,6 @@ static u8 get_adv_discov_flags(struct hci_dev *hdev) return 0; } -static u8 get_current_adv_instance(struct hci_dev *hdev) -{ - /* The "Set Advertising" setting supersedes the "Add Advertising" - * setting. Here we set the advertising data based on which - * setting was set. When neither apply, default to the global settings, - * represented by instance "0". 
- */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && - !hci_dev_test_flag(hdev, HCI_ADVERTISING)) - return 0x01; - - return 0x00; -} - static bool get_connectable(struct hci_dev *hdev) { struct mgmt_pending_cmd *cmd; @@ -975,41 +975,65 @@ static bool get_connectable(struct hci_dev *hdev) static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) { u32 flags; + struct adv_info *adv_instance; - if (instance > 0x01) - return 0; + if (instance == 0x00) { + /* Instance 0 always manages the "Tx Power" and "Flags" + * fields + */ + flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; - if (instance == 0x01) - return hdev->adv_instance.flags; + /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting + * corresponds to the "connectable" instance flag. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) + flags |= MGMT_ADV_FLAG_CONNECTABLE; - /* Instance 0 always manages the "Tx Power" and "Flags" fields */ - flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; + return flags; + } - /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting corresponds - * to the "connectable" instance flag. - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) - flags |= MGMT_ADV_FLAG_CONNECTABLE; + adv_instance = hci_find_adv_instance(hdev, instance); - return flags; + /* Return 0 when we got an invalid instance identifier. */ + if (!adv_instance) + return 0; + + return adv_instance->flags; } -static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) +static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) { - /* Ignore instance 0 and other unsupported instances */ - if (instance != 0x01) + u8 instance = get_current_adv_instance(hdev); + struct adv_info *adv_instance; + + /* Ignore instance 0 */ + if (instance == 0x00) + return 0; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) return 0; /* TODO: Take into account the "appearance" and "local-name" flags here. 
* These are currently being ignored as they are not supported. */ - return hdev->adv_instance.scan_rsp_len; + return adv_instance->scan_rsp_len; } static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) { + struct adv_info *adv_instance = NULL; u8 ad_len = 0, flags = 0; - u32 instance_flags = get_adv_instance_flags(hdev, instance); + u32 instance_flags; + + /* Return 0 when the current instance identifier is invalid. */ + if (instance) { + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + } + + instance_flags = get_adv_instance_flags(hdev, instance); /* The Add Advertising command allows userspace to set both the general * and limited discoverable flags. @@ -1043,12 +1067,11 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) } } - if (instance) { - memcpy(ptr, hdev->adv_instance.adv_data, - hdev->adv_instance.adv_data_len); - - ad_len += hdev->adv_instance.adv_data_len; - ptr += hdev->adv_instance.adv_data_len; + if (adv_instance) { + memcpy(ptr, adv_instance->adv_data, + adv_instance->adv_data_len); + ad_len += adv_instance->adv_data_len; + ptr += adv_instance->adv_data_len; } /* Provide Tx Power only if we can provide a valid value for it */ @@ -1065,7 +1088,7 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) return ad_len; } -static void update_adv_data_for_instance(struct hci_request *req, u8 instance) +static void update_inst_adv_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; struct hci_cp_le_set_adv_data cp; @@ -1093,10 +1116,7 @@ static void update_adv_data_for_instance(struct hci_request *req, u8 instance) static void update_adv_data(struct hci_request *req) { - struct hci_dev *hdev = req->hdev; - u8 instance = get_current_adv_instance(hdev); - - update_adv_data_for_instance(req, instance); + update_inst_adv_data(req, get_current_adv_instance(req->hdev)); } int mgmt_update_adv_data(struct hci_dev *hdev) 
@@ -1277,7 +1297,7 @@ static void enable_advertising(struct hci_request *req) if (connectable) cp.type = LE_ADV_IND; - else if (get_adv_instance_scan_rsp_len(hdev, instance)) + else if (get_cur_adv_instance_scan_rsp_len(hdev)) cp.type = LE_ADV_SCAN_IND; else cp.type = LE_ADV_NONCONN_IND; @@ -1459,27 +1479,141 @@ static void advertising_removed(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_ADVERTISING_REMOVED, hdev, &ev, sizeof(ev), sk); } -static void clear_adv_instance(struct hci_dev *hdev) +static int schedule_adv_instance(struct hci_request *req, u8 instance, + bool force) { + struct hci_dev *hdev = req->hdev; + struct adv_info *adv_instance = NULL; + u16 timeout; + + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) + return -EPERM; + + if (hdev->adv_instance_timeout) + return -EBUSY; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -ENOENT; + + /* A zero timeout means unlimited advertising. As long as there is + * only one instance, duration should be ignored. We still set a timeout + * in case further instances are being added later on. + * + * If the remaining lifetime of the instance is more than the duration + * then the timeout corresponds to the duration, otherwise it will be + * reduced to the remaining instance lifetime. + */ + if (adv_instance->timeout == 0 || + adv_instance->duration <= adv_instance->remaining_time) + timeout = adv_instance->duration; + else + timeout = adv_instance->remaining_time; + + /* The remaining time is being reduced unless the instance is being + * advertised without time limit. 
+ */ + if (adv_instance->timeout) + adv_instance->remaining_time = + adv_instance->remaining_time - timeout; + + hdev->adv_instance_timeout = timeout; + queue_delayed_work(hdev->workqueue, + &hdev->adv_instance_expire, + msecs_to_jiffies(timeout * 1000)); + + /* If we're just re-scheduling the same instance again then do not + * execute any HCI commands. This happens when a single instance is + * being advertised. + */ + if (!force && hdev->cur_adv_instance == instance && + hci_dev_test_flag(hdev, HCI_LE_ADV)) + return 0; + + hdev->cur_adv_instance = instance; + update_adv_data(req); + update_scan_rsp_data(req); + enable_advertising(req); + + return 0; +} + +static void cancel_adv_timeout(struct hci_dev *hdev) { - struct hci_request req; + if (hdev->adv_instance_timeout) { + hdev->adv_instance_timeout = 0; + cancel_delayed_work(&hdev->adv_instance_expire); + } +} - if (!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) - return; +/* For a single instance: + * - force == true: The instance will be removed even when its remaining + * lifetime is not zero. + * - force == false: the instance will be deactivated but kept stored unless + * the remaining lifetime is zero. + * + * For instance == 0x00: + * - force == true: All instances will be removed regardless of their timeout + * setting. + * - force == false: Only instances that have a timeout will be removed. + */ +static void clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, + u8 instance, bool force) +{ + struct adv_info *adv_instance, *n, *next_instance = NULL; + int err; + u8 rem_inst; - if (hdev->adv_instance.timeout) - cancel_delayed_work(&hdev->adv_instance.timeout_exp); + /* Cancel any timeout concerning the removed instance(s). 
*/ + if (!instance || hdev->cur_adv_instance == instance) + cancel_adv_timeout(hdev); - memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance)); - advertising_removed(NULL, hdev, 1); - hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); + /* Get the next instance to advertise BEFORE we remove + * the current one. This can be the same instance again + * if there is only one instance. + */ + if (instance && hdev->cur_adv_instance == instance) + next_instance = hci_get_next_instance(hdev, instance); - if (!hdev_is_powered(hdev) || + if (instance == 0x00) { + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, + list) { + if (!(force || adv_instance->timeout)) + continue; + + rem_inst = adv_instance->instance; + err = hci_remove_adv_instance(hdev, rem_inst); + if (!err) + advertising_removed(NULL, hdev, rem_inst); + } + hdev->cur_adv_instance = 0x00; + } else { + adv_instance = hci_find_adv_instance(hdev, instance); + + if (force || (adv_instance && adv_instance->timeout && + !adv_instance->remaining_time)) { + /* Don't advertise a removed instance. 
*/ + if (next_instance && + next_instance->instance == instance) + next_instance = NULL; + + err = hci_remove_adv_instance(hdev, instance); + if (!err) + advertising_removed(NULL, hdev, instance); + } + } + + if (list_empty(&hdev->adv_instances)) { + hdev->cur_adv_instance = 0x00; + hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); + } + + if (!req || !hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) return; - hci_req_init(&req, hdev); - disable_advertising(&req); - hci_req_run(&req, NULL); + if (next_instance) + schedule_adv_instance(req, next_instance->instance, false); } static int clean_up_hci_state(struct hci_dev *hdev) @@ -1497,8 +1631,7 @@ static int clean_up_hci_state(struct hci_dev *hdev) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } - if (hdev->adv_instance.timeout) - clear_adv_instance(hdev); + clear_adv_instance(hdev, NULL, 0x00, false); if (hci_dev_test_flag(hdev, HCI_LE_ADV)) disable_advertising(&req); @@ -1506,35 +1639,8 @@ static int clean_up_hci_state(struct hci_dev *hdev) discov_stopped = hci_stop_discovery(&req); list_for_each_entry(conn, &hdev->conn_hash.list, list) { - struct hci_cp_disconnect dc; - struct hci_cp_reject_conn_req rej; - - switch (conn->state) { - case BT_CONNECTED: - case BT_CONFIG: - dc.handle = cpu_to_le16(conn->handle); - dc.reason = 0x15; /* Terminated due to Power Off */ - hci_req_add(&req, HCI_OP_DISCONNECT, sizeof(dc), &dc); - break; - case BT_CONNECT: - if (conn->type == LE_LINK) - hci_req_add(&req, HCI_OP_LE_CREATE_CONN_CANCEL, - 0, NULL); - else if (conn->type == ACL_LINK) - hci_req_add(&req, HCI_OP_CREATE_CONN_CANCEL, - 6, &conn->dst); - break; - case BT_CONNECT2: - bacpy(&rej.bdaddr, &conn->dst); - rej.reason = 0x15; /* Terminated due to Power Off */ - if (conn->type == ACL_LINK) - hci_req_add(&req, HCI_OP_REJECT_CONN_REQ, - sizeof(rej), &rej); - else if (conn->type == SCO_LINK) - hci_req_add(&req, HCI_OP_REJECT_SYNC_CONN_REQ, - sizeof(rej), &rej); - break; - } + /* 0x15 == Terminated 
due to Power Off */ + __hci_abort_conn(&req, conn, 0x15); } err = hci_req_run(&req, clean_up_hci_complete); @@ -2453,6 +2559,9 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) val = !!cp->val; enabled = lmp_host_le_capable(hdev); + if (!val) + clear_adv_instance(hdev, NULL, 0x00, true); + if (!hdev_is_powered(hdev) || val == enabled) { bool changed = false; @@ -2916,9 +3025,10 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_unpair_device *cp = data; struct mgmt_rp_unpair_device rp; - struct hci_cp_disconnect dc; + struct hci_conn_params *params; struct mgmt_pending_cmd *cmd; struct hci_conn *conn; + u8 addr_type; int err; memset(&rp, 0, sizeof(rp)); @@ -2959,36 +3069,23 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, conn = NULL; err = hci_remove_link_key(hdev, &cp->addr.bdaddr); - } else { - u8 addr_type; - - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, - &cp->addr.bdaddr); - if (conn) { - /* Defer clearing up the connection parameters - * until closing to give a chance of keeping - * them if a repairing happens. - */ - set_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags); - - /* If disconnection is not requested, then - * clear the connection variable so that the - * link is not terminated. 
- */ - if (!cp->disconnect) - conn = NULL; + if (err < 0) { + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_UNPAIR_DEVICE, + MGMT_STATUS_NOT_PAIRED, &rp, + sizeof(rp)); + goto unlock; } - if (cp->addr.type == BDADDR_LE_PUBLIC) - addr_type = ADDR_LE_DEV_PUBLIC; - else - addr_type = ADDR_LE_DEV_RANDOM; + goto done; + } - hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type); + /* LE address type */ + addr_type = le_addr_type(cp->addr.type); - err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type); - } + hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type); + err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type); if (err < 0) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, MGMT_STATUS_NOT_PAIRED, &rp, @@ -2996,6 +3093,36 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } + conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, addr_type); + if (!conn) { + hci_conn_params_del(hdev, &cp->addr.bdaddr, addr_type); + goto done; + } + + /* Abort any ongoing SMP pairing */ + smp_cancel_pairing(conn); + + /* Defer clearing up the connection parameters until closing to + * give a chance of keeping them if a repairing happens. + */ + set_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags); + + /* Disable auto-connection parameters if present */ + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type); + if (params) { + if (params->explicit_connect) + params->auto_connect = HCI_AUTO_CONN_EXPLICIT; + else + params->auto_connect = HCI_AUTO_CONN_DISABLED; + } + + /* If disconnection is not requested, then clear the connection + * variable so that the link is not terminated. + */ + if (!cp->disconnect) + conn = NULL; + +done: /* If the connection variable is set, then termination of the * link is requested. 
*/ @@ -3015,9 +3142,7 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, cmd->cmd_complete = addr_cmd_complete; - dc.handle = cpu_to_le16(conn->handle); - dc.reason = 0x13; /* Remote User Terminated Connection */ - err = hci_send_cmd(hdev, HCI_OP_DISCONNECT, sizeof(dc), &dc); + err = hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); if (err < 0) mgmt_pending_remove(cmd); @@ -3065,7 +3190,8 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr); else - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr); + conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); if (!conn || conn->state == BT_OPEN || conn->state == BT_CLOSED) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, @@ -3416,14 +3542,8 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level, auth_type); } else { - u8 addr_type; - - /* Convert from L2CAP channel address type to HCI address type - */ - if (cp->addr.type == BDADDR_LE_PUBLIC) - addr_type = ADDR_LE_DEV_PUBLIC; - else - addr_type = ADDR_LE_DEV_RANDOM; + u8 addr_type = le_addr_type(cp->addr.type); + struct hci_conn_params *p; /* When pairing a new device, it is expected to remember * this device for future connections. Adding the connection @@ -3434,11 +3554,15 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, * If connection parameters already exist, then they * will be kept and this function does nothing. 
*/ - hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); + p = hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); + + if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT) + p->auto_connect = HCI_AUTO_CONN_DISABLED; - conn = hci_connect_le(hdev, &cp->addr.bdaddr, addr_type, - sec_level, HCI_LE_CONN_TIMEOUT, - HCI_ROLE_MASTER); + conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, + addr_type, sec_level, + HCI_LE_CONN_TIMEOUT, + HCI_ROLE_MASTER); } if (IS_ERR(conn)) { @@ -3564,7 +3688,8 @@ static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev, if (addr->type == BDADDR_BREDR) conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &addr->bdaddr); else - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &addr->bdaddr); + conn = hci_conn_hash_lookup_le(hdev, &addr->bdaddr, + le_addr_type(addr->type)); if (!conn) { err = mgmt_cmd_complete(sk, hdev->id, mgmt_op, @@ -4082,11 +4207,12 @@ static bool trigger_le_scan(struct hci_request *req, u16 interval, u8 *status) /* Don't let discovery abort an outgoing connection attempt * that's using directed advertising. */ - if (hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) { + if (hci_lookup_le_connect(hdev)) { *status = MGMT_STATUS_REJECTED; return false; } + cancel_adv_timeout(hdev); disable_advertising(req); } @@ -4669,6 +4795,9 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, { struct cmd_lookup match = { NULL, hdev }; struct hci_request req; + u8 instance; + struct adv_info *adv_instance; + int err; hci_dev_lock(hdev); @@ -4694,18 +4823,31 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, sock_put(match.sk); /* If "Set Advertising" was just disabled and instance advertising was - * set up earlier, then enable the advertising instance. + * set up earlier, then re-enable multi-instance advertising. 
*/ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) || + list_empty(&hdev->adv_instances)) goto unlock; + instance = hdev->cur_adv_instance; + if (!instance) { + adv_instance = list_first_entry_or_null(&hdev->adv_instances, + struct adv_info, list); + if (!adv_instance) + goto unlock; + + instance = adv_instance->instance; + } + hci_req_init(&req, hdev); - update_adv_data(&req); - enable_advertising(&req); + err = schedule_adv_instance(&req, instance, true); + + if (!err) + err = hci_req_run(&req, enable_advertising_instance); - if (hci_req_run(&req, enable_advertising_instance) < 0) + if (err) BT_ERR("Failed to re-configure advertising"); unlock: @@ -4790,10 +4932,15 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, else hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE); + cancel_adv_timeout(hdev); + if (val) { - /* Switch to instance "0" for the Set Advertising setting. */ - update_adv_data_for_instance(&req, 0); - update_scan_rsp_data_for_instance(&req, 0); + /* Switch to instance "0" for the Set Advertising setting. + * We cannot use update_[adv|scan_rsp]_data() here as the + * HCI_ADVERTISING flag is not yet set. 
+ */ + update_inst_adv_data(&req, 0x00); + update_inst_scan_rsp_data(&req, 0x00); enable_advertising(&req); } else { disable_advertising(&req); @@ -5445,14 +5592,9 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *irk = &cp->irks[i]; - u8 addr_type; - - if (irk->addr.type == BDADDR_LE_PUBLIC) - addr_type = ADDR_LE_DEV_PUBLIC; - else - addr_type = ADDR_LE_DEV_RANDOM; - hci_add_irk(hdev, &irk->addr.bdaddr, addr_type, irk->val, + hci_add_irk(hdev, &irk->addr.bdaddr, + le_addr_type(irk->addr.type), irk->val, BDADDR_ANY); } @@ -5532,12 +5674,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, for (i = 0; i < key_count; i++) { struct mgmt_ltk_info *key = &cp->keys[i]; - u8 type, addr_type, authenticated; - - if (key->addr.type == BDADDR_LE_PUBLIC) - addr_type = ADDR_LE_DEV_PUBLIC; - else - addr_type = ADDR_LE_DEV_RANDOM; + u8 type, authenticated; switch (key->type) { case MGMT_LTK_UNAUTHENTICATED: @@ -5563,9 +5700,9 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, continue; } - hci_add_ltk(hdev, &key->addr.bdaddr, addr_type, type, - authenticated, key->val, key->enc_size, key->ediv, - key->rand); + hci_add_ltk(hdev, &key->addr.bdaddr, + le_addr_type(key->addr.type), type, authenticated, + key->val, key->enc_size, key->ediv, key->rand); } err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, 0, @@ -5957,17 +6094,30 @@ static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr, switch (auto_connect) { case HCI_AUTO_CONN_DISABLED: case HCI_AUTO_CONN_LINK_LOSS: + /* If auto connect is being disabled when we're trying to + * connect to device, keep connecting. 
+ */ + if (params->explicit_connect) + list_add(&params->action, &hdev->pend_le_conns); + __hci_update_background_scan(req); break; case HCI_AUTO_CONN_REPORT: - list_add(&params->action, &hdev->pend_le_reports); + if (params->explicit_connect) + list_add(&params->action, &hdev->pend_le_conns); + else + list_add(&params->action, &hdev->pend_le_reports); __hci_update_background_scan(req); break; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: if (!is_connected(hdev, addr, addr_type)) { list_add(&params->action, &hdev->pend_le_conns); - __hci_update_background_scan(req); + /* If we are in scan phase of connecting, we were + * already added to pend_le_conns and scanning. + */ + if (params->auto_connect != HCI_AUTO_CONN_EXPLICIT) + __hci_update_background_scan(req); } break; } @@ -6064,10 +6214,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, goto added; } - if (cp->addr.type == BDADDR_LE_PUBLIC) - addr_type = ADDR_LE_DEV_PUBLIC; - else - addr_type = ADDR_LE_DEV_RANDOM; + addr_type = le_addr_type(cp->addr.type); if (cp->action == 0x02) auto_conn = HCI_AUTO_CONN_ALWAYS; @@ -6076,6 +6223,17 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, else auto_conn = HCI_AUTO_CONN_REPORT; + /* Kernel internally uses conn_params with resolvable private + * address, but Add Device allows only identity addresses. + * Make sure it is enforced before calling + * hci_conn_params_lookup. + */ + if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) { + err = cmd->cmd_complete(cmd, MGMT_STATUS_INVALID_PARAMS); + mgmt_pending_remove(cmd); + goto unlock; + } + /* If the connection parameters don't exist for this device, * they will be created and configured with defaults. 
*/ @@ -6185,10 +6343,19 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, goto complete; } - if (cp->addr.type == BDADDR_LE_PUBLIC) - addr_type = ADDR_LE_DEV_PUBLIC; - else - addr_type = ADDR_LE_DEV_RANDOM; + addr_type = le_addr_type(cp->addr.type); + + /* Kernel internally uses conn_params with resolvable private + * address, but Remove Device allows only identity addresses. + * Make sure it is enforced before calling + * hci_conn_params_lookup. + */ + if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) { + err = cmd->cmd_complete(cmd, + MGMT_STATUS_INVALID_PARAMS); + mgmt_pending_remove(cmd); + goto unlock; + } params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type); @@ -6199,7 +6366,8 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (params->auto_connect == HCI_AUTO_CONN_DISABLED) { + if (params->auto_connect == HCI_AUTO_CONN_DISABLED || + params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { err = cmd->cmd_complete(cmd, MGMT_STATUS_INVALID_PARAMS); mgmt_pending_remove(cmd); @@ -6235,6 +6403,10 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, if (p->auto_connect == HCI_AUTO_CONN_DISABLED) continue; device_removed(sk, hdev, &p->addr, p->addr_type); + if (p->explicit_connect) { + p->auto_connect = HCI_AUTO_CONN_EXPLICIT; + continue; + } list_del(&p->action); list_del(&p->list); kfree(p); @@ -6781,8 +6953,9 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, { struct mgmt_rp_read_adv_features *rp; size_t rp_len; - int err; + int err, i; bool instance; + struct adv_info *adv_instance; u32 supported_flags; BT_DBG("%s", hdev->name); @@ -6795,12 +6968,9 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, rp_len = sizeof(*rp); - /* Currently only one instance is supported, so just add 1 to the - * response length. 
- */ instance = hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE); if (instance) - rp_len++; + rp_len += hdev->adv_instance_cnt; rp = kmalloc(rp_len, GFP_ATOMIC); if (!rp) { @@ -6813,14 +6983,18 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, rp->supported_flags = cpu_to_le32(supported_flags); rp->max_adv_data_len = HCI_MAX_AD_LENGTH; rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH; - rp->max_instances = 1; + rp->max_instances = HCI_MAX_ADV_INSTANCES; - /* Currently only one instance is supported, so simply return the - * current instance number. - */ if (instance) { - rp->num_instances = 1; - rp->instance[0] = 1; + i = 0; + list_for_each_entry(adv_instance, &hdev->adv_instances, list) { + if (i >= hdev->adv_instance_cnt) + break; + + rp->instance[i] = adv_instance->instance; + i++; + } + rp->num_instances = hdev->adv_instance_cnt; } else { rp->num_instances = 0; } @@ -6882,7 +7056,10 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_pending_cmd *cmd; + struct mgmt_cp_add_advertising *cp; struct mgmt_rp_add_advertising rp; + struct adv_info *adv_instance, *n; + u8 instance; BT_DBG("status %d", status); @@ -6890,16 +7067,32 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev); - if (status) { + if (status) hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); - memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance)); - advertising_removed(cmd ? cmd->sk : NULL, hdev, 1); + + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { + if (!adv_instance->pending) + continue; + + if (!status) { + adv_instance->pending = false; + continue; + } + + instance = adv_instance->instance; + + if (hdev->cur_adv_instance == instance) + cancel_adv_timeout(hdev); + + hci_remove_adv_instance(hdev, instance); + advertising_removed(cmd ? 
cmd->sk : NULL, hdev, instance); } if (!cmd) goto unlock; - rp.instance = 0x01; + cp = cmd->param; + rp.instance = cp->instance; if (status) mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, @@ -6914,15 +7107,28 @@ unlock: hci_dev_unlock(hdev); } -static void adv_timeout_expired(struct work_struct *work) +void mgmt_adv_timeout_expired(struct hci_dev *hdev) { - struct hci_dev *hdev = container_of(work, struct hci_dev, - adv_instance.timeout_exp.work); + u8 instance; + struct hci_request req; - hdev->adv_instance.timeout = 0; + hdev->adv_instance_timeout = 0; + + instance = get_current_adv_instance(hdev); + if (instance == 0x00) + return; hci_dev_lock(hdev); - clear_adv_instance(hdev); + hci_req_init(&req, hdev); + + clear_adv_instance(hdev, &req, instance, false); + + if (list_empty(&hdev->adv_instances)) + disable_advertising(&req); + + if (!skb_queue_empty(&req.cmd_q)) + hci_req_run(&req, NULL); + hci_dev_unlock(hdev); } @@ -6934,7 +7140,10 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, u32 flags; u32 supported_flags; u8 status; - u16 timeout; + u16 timeout, duration; + unsigned int prev_instance_cnt = hdev->adv_instance_cnt; + u8 schedule_instance = 0; + struct adv_info *next_instance; int err; struct mgmt_pending_cmd *cmd; struct hci_request req; @@ -6948,12 +7157,13 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, flags = __le32_to_cpu(cp->flags); timeout = __le16_to_cpu(cp->timeout); + duration = __le16_to_cpu(cp->duration); - /* The current implementation only supports adding one instance and only - * a subset of the specified flags. + /* The current implementation only supports a subset of the specified + * flags. 
*/ supported_flags = get_supported_adv_flags(hdev); - if (cp->instance != 0x01 || (flags & ~supported_flags)) + if (flags & ~supported_flags) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); @@ -6981,38 +7191,51 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - INIT_DELAYED_WORK(&hdev->adv_instance.timeout_exp, adv_timeout_expired); - - hdev->adv_instance.flags = flags; - hdev->adv_instance.adv_data_len = cp->adv_data_len; - hdev->adv_instance.scan_rsp_len = cp->scan_rsp_len; - - if (cp->adv_data_len) - memcpy(hdev->adv_instance.adv_data, cp->data, cp->adv_data_len); - - if (cp->scan_rsp_len) - memcpy(hdev->adv_instance.scan_rsp_data, - cp->data + cp->adv_data_len, cp->scan_rsp_len); - - if (hdev->adv_instance.timeout) - cancel_delayed_work(&hdev->adv_instance.timeout_exp); + err = hci_add_adv_instance(hdev, cp->instance, flags, + cp->adv_data_len, cp->data, + cp->scan_rsp_len, + cp->data + cp->adv_data_len, + timeout, duration); + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_FAILED); + goto unlock; + } - hdev->adv_instance.timeout = timeout; + /* Only trigger an advertising added event if a new instance was + * actually added. + */ + if (hdev->adv_instance_cnt > prev_instance_cnt) + advertising_added(sk, hdev, cp->instance); - if (timeout) - queue_delayed_work(hdev->workqueue, - &hdev->adv_instance.timeout_exp, - msecs_to_jiffies(timeout * 1000)); + hci_dev_set_flag(hdev, HCI_ADVERTISING_INSTANCE); - if (!hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING_INSTANCE)) - advertising_added(sk, hdev, 1); + if (hdev->cur_adv_instance == cp->instance) { + /* If the currently advertised instance is being changed then + * cancel the current advertising and schedule the next + * instance. If there is only one instance then the overridden + * advertising data will be visible right away. 
+ */ + cancel_adv_timeout(hdev); + + next_instance = hci_get_next_instance(hdev, cp->instance); + if (next_instance) + schedule_instance = next_instance->instance; + } else if (!hdev->adv_instance_timeout) { + /* Immediately advertise the new instance if no other + * instance is currently being advertised. + */ + schedule_instance = cp->instance; + } - /* If the HCI_ADVERTISING flag is set or the device isn't powered then - * we have no HCI communication to make. Simply return. + /* If the HCI_ADVERTISING flag is set or the device isn't powered or + * there is no instance to be advertised then we have no HCI + * communication to make. Simply return. */ if (!hdev_is_powered(hdev) || - hci_dev_test_flag(hdev, HCI_ADVERTISING)) { - rp.instance = 0x01; + hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !schedule_instance) { + rp.instance = cp->instance; err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); goto unlock; @@ -7030,11 +7253,11 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - update_adv_data(&req); - update_scan_rsp_data(&req); - enable_advertising(&req); + err = schedule_adv_instance(&req, schedule_instance, true); + + if (!err) + err = hci_req_run(&req, add_advertising_complete); - err = hci_req_run(&req, add_advertising_complete); if (err < 0) mgmt_pending_remove(cmd); @@ -7048,6 +7271,7 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_pending_cmd *cmd; + struct mgmt_cp_remove_advertising *cp; struct mgmt_rp_remove_advertising rp; BT_DBG("status %d", status); @@ -7062,7 +7286,8 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status, if (!cmd) goto unlock; - rp.instance = 1; + cp = cmd->param; + rp.instance = cp->instance; mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); @@ -7077,21 +7302,21 @@ static int remove_advertising(struct sock *sk, struct 
hci_dev *hdev, { struct mgmt_cp_remove_advertising *cp = data; struct mgmt_rp_remove_advertising rp; - int err; struct mgmt_pending_cmd *cmd; struct hci_request req; + int err; BT_DBG("%s", hdev->name); - /* The current implementation only allows modifying instance no 1. A - * value of 0 indicates that all instances should be cleared. - */ - if (cp->instance > 1) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, - MGMT_STATUS_INVALID_PARAMS); - hci_dev_lock(hdev); + if (cp->instance && !hci_find_adv_instance(hdev, cp->instance)) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_REMOVE_ADVERTISING, + MGMT_STATUS_INVALID_PARAMS); + goto unlock; + } + if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || pending_find(MGMT_OP_SET_LE, hdev)) { @@ -7106,21 +7331,21 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (hdev->adv_instance.timeout) - cancel_delayed_work(&hdev->adv_instance.timeout_exp); - - memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance)); + hci_req_init(&req, hdev); - advertising_removed(sk, hdev, 1); + clear_adv_instance(hdev, &req, cp->instance, true); - hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); + if (list_empty(&hdev->adv_instances)) + disable_advertising(&req); - /* If the HCI_ADVERTISING flag is set or the device isn't powered then - * we have no HCI communication to make. Simply return. + /* If no HCI commands have been collected so far or the HCI_ADVERTISING + * flag is set or the device isn't powered then we have no HCI + * communication to make. Simply return. 
*/ - if (!hdev_is_powered(hdev) || + if (skb_queue_empty(&req.cmd_q) || + !hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) { - rp.instance = 1; + rp.instance = cp->instance; err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); @@ -7134,9 +7359,6 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - hci_req_init(&req, hdev); - disable_advertising(&req); - err = hci_req_run(&req, remove_advertising_complete); if (err < 0) mgmt_pending_remove(cmd); @@ -7361,6 +7583,7 @@ static void powered_complete(struct hci_dev *hdev, u8 status, u16 opcode) static int powered_update_hci(struct hci_dev *hdev) { struct hci_request req; + struct adv_info *adv_instance; u8 link_sec; hci_req_init(&req, hdev); @@ -7400,14 +7623,27 @@ static int powered_update_hci(struct hci_dev *hdev) * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. */ - if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) && + (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))) { update_adv_data(&req); update_scan_rsp_data(&req); } - if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && + hdev->cur_adv_instance == 0x00 && + !list_empty(&hdev->adv_instances)) { + adv_instance = list_first_entry(&hdev->adv_instances, + struct adv_info, list); + hdev->cur_adv_instance = adv_instance->instance; + } + + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) enable_advertising(&req); + else if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && + hdev->cur_adv_instance) + schedule_adv_instance(&req, hdev->cur_adv_instance, + true); restart_le_actions(&req); } @@ -7577,7 +7813,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) memset(&ev, 0, sizeof(ev)); 
/* Devices using resolvable or non-resolvable random addresses - * without providing an indentity resolving key don't require + * without providing an identity resolving key don't require * to store long term keys. Their addresses will change the * next time around. * @@ -7603,32 +7839,23 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) if (key->type == SMP_LTK) ev.key.master = 1; - memcpy(ev.key.val, key->val, sizeof(key->val)); + /* Make sure we copy only the significant bytes based on the + * encryption key size, and set the rest of the value to zeroes. + */ + memcpy(ev.key.val, key->val, key->enc_size); + memset(ev.key.val + key->enc_size, 0, + sizeof(ev.key.val) - key->enc_size); mgmt_event(MGMT_EV_NEW_LONG_TERM_KEY, hdev, &ev, sizeof(ev), NULL); } -void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk) +void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk, bool persistent) { struct mgmt_ev_new_irk ev; memset(&ev, 0, sizeof(ev)); - /* For identity resolving keys from devices that are already - * using a public address or static random address, do not - * ask for storing this key. The identity resolving key really - * is only mandatory for devices using resovlable random - * addresses. - * - * Storing all identity resolving keys has the downside that - * they will be also loaded on next boot of they system. More - * identity resolving keys, means more time during scanning is - * needed to actually resolve these addresses. 
- */ - if (bacmp(&irk->rpa, BDADDR_ANY)) - ev.store_hint = 0x01; - else - ev.store_hint = 0x00; + ev.store_hint = persistent; bacpy(&ev.rpa, &irk->rpa); bacpy(&ev.irk.addr.bdaddr, &irk->bdaddr); @@ -7646,7 +7873,7 @@ void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, memset(&ev, 0, sizeof(ev)); /* Devices using resolvable or non-resolvable random addresses - * without providing an indentity resolving key don't require + * without providing an identity resolving key don't require * to store signature resolving keys. Their addresses will change * the next time around. * @@ -8387,13 +8614,24 @@ static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) void mgmt_reenable_advertising(struct hci_dev *hdev) { struct hci_request req; + u8 instance; if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) return; + instance = get_current_adv_instance(hdev); + hci_req_init(&req, hdev); - enable_advertising(&req); + + if (instance) { + schedule_adv_instance(&req, instance, true); + } else { + update_adv_data(&req); + update_scan_rsp_data(&req); + enable_advertising(&req); + } + hci_req_run(&req, adv_enable_complete); } diff --git a/kernel/net/bluetooth/rfcomm/core.c b/kernel/net/bluetooth/rfcomm/core.c index 4fea24275..29709fbfd 100644 --- a/kernel/net/bluetooth/rfcomm/core.c +++ b/kernel/net/bluetooth/rfcomm/core.c @@ -200,7 +200,7 @@ static int rfcomm_l2sock_create(struct socket **sock) BT_DBG(""); - err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); + err = sock_create_kern(&init_net, PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); if (!err) { struct sock *sk = (*sock)->sk; sk->sk_data_ready = rfcomm_l2data_ready; diff --git a/kernel/net/bluetooth/rfcomm/sock.c b/kernel/net/bluetooth/rfcomm/sock.c index 825e8fb51..7511df723 100644 --- a/kernel/net/bluetooth/rfcomm/sock.c +++ b/kernel/net/bluetooth/rfcomm/sock.c @@ -269,12 +269,12 @@ static struct proto rfcomm_proto = { 
.obj_size = sizeof(struct rfcomm_pinfo) }; -static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct rfcomm_dlc *d; struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern); if (!sk) return NULL; @@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, sock->ops = &rfcomm_sock_ops; - sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -334,16 +334,19 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { - struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sockaddr_rc sa; struct sock *sk = sock->sk; - int chan = sa->rc_channel; - int err = 0; - - BT_DBG("sk %p %pMR", sk, &sa->rc_bdaddr); + int len, err = 0; if (!addr || addr->sa_family != AF_BLUETOOTH) return -EINVAL; + memset(&sa, 0, sizeof(sa)); + len = min_t(unsigned int, sizeof(sa), addr_len); + memcpy(&sa, addr, len); + + BT_DBG("sk %p %pMR", sk, &sa.rc_bdaddr); + lock_sock(sk); if (sk->sk_state != BT_OPEN) { @@ -358,12 +361,13 @@ static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr write_lock(&rfcomm_sk_list.lock); - if (chan && __rfcomm_get_listen_sock_by_addr(chan, &sa->rc_bdaddr)) { + if (sa.rc_channel && + __rfcomm_get_listen_sock_by_addr(sa.rc_channel, &sa.rc_bdaddr)) { err = -EADDRINUSE; } else { /* Save source address */ - bacpy(&rfcomm_pi(sk)->src, &sa->rc_bdaddr); - rfcomm_pi(sk)->channel = chan; + bacpy(&rfcomm_pi(sk)->src, &sa.rc_bdaddr); + rfcomm_pi(sk)->channel = sa.rc_channel; sk->sk_state = BT_BOUND; } @@ -969,7 +973,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct 
rfcomm_dlc * goto done; } - sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC); + sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0); if (!sk) goto done; diff --git a/kernel/net/bluetooth/sco.c b/kernel/net/bluetooth/sco.c index 4322c833e..f52bcbf2e 100644 --- a/kernel/net/bluetooth/sco.c +++ b/kernel/net/bluetooth/sco.c @@ -74,7 +74,7 @@ struct sco_pinfo { static void sco_sock_timeout(unsigned long arg) { - struct sock *sk = (struct sock *) arg; + struct sock *sk = (struct sock *)arg; BT_DBG("sock %p state %d", sk, sk->sk_state); @@ -154,13 +154,13 @@ static void sco_chan_del(struct sock *sk, int err) sock_set_flag(sk, SOCK_ZAPPED); } -static int sco_conn_del(struct hci_conn *hcon, int err) +static void sco_conn_del(struct hci_conn *hcon, int err) { struct sco_conn *conn = hcon->sco_data; struct sock *sk; if (!conn) - return 0; + return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); @@ -170,19 +170,21 @@ static int sco_conn_del(struct hci_conn *hcon, int err) sco_conn_unlock(conn); if (sk) { + sock_hold(sk); bh_lock_sock(sk); sco_sock_clear_timer(sk); sco_chan_del(sk, err); bh_unlock_sock(sk); sco_sock_kill(sk); + sock_put(sk); } hcon->sco_data = NULL; kfree(conn); - return 0; } -static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) +static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, + struct sock *parent) { BT_DBG("conn %p", conn); @@ -415,8 +417,10 @@ static void __sco_sock_close(struct sock *sk) if (sco_pi(sk)->conn->hcon) { sk->sk_state = BT_DISCONN; sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT); + sco_conn_lock(sco_pi(sk)->conn); hci_conn_drop(sco_pi(sk)->conn->hcon); sco_pi(sk)->conn->hcon = NULL; + sco_conn_unlock(sco_pi(sk)->conn); } else sco_chan_del(sk, ECONNRESET); break; @@ -460,11 +464,12 @@ static struct proto sco_proto = { .obj_size = sizeof(struct sco_pinfo) }; -static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, 
gfp_t prio) +static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, + int proto, gfp_t prio, int kern) { struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern); if (!sk) return NULL; @@ -501,7 +506,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &sco_sock_ops; - sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -509,7 +514,8 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol, return 0; } -static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, + int addr_len) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; @@ -520,6 +526,9 @@ static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_le if (!addr || addr->sa_family != AF_BLUETOOTH) return -EINVAL; + if (addr_len < sizeof(struct sockaddr_sco)) + return -EINVAL; + lock_sock(sk); if (sk->sk_state != BT_OPEN) { @@ -616,7 +625,8 @@ done: return err; } -static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flags) +static int sco_sock_accept(struct socket *sock, struct socket *newsock, + int flags) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; @@ -670,7 +680,8 @@ done: return err; } -static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer) +static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, + int *len, int peer) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; @@ -780,7 +791,8 @@ static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg, return bt_sock_recvmsg(sock, msg, len, flags); } -static int sco_sock_setsockopt(struct socket *sock, int level, int 
optname, char __user *optval, unsigned int optlen) +static int sco_sock_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; int len, err = 0; @@ -820,7 +832,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char voice.setting = sco_pi(sk)->setting; len = min_t(unsigned int, sizeof(voice), optlen); - if (copy_from_user((char *) &voice, optval, len)) { + if (copy_from_user((char *)&voice, optval, len)) { err = -EFAULT; break; } @@ -844,7 +856,8 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char return err; } -static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) +static int sco_sock_getsockopt_old(struct socket *sock, int optname, + char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct sco_options opts; @@ -904,7 +917,8 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user return err; } -static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +static int sco_sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, err = 0; @@ -929,7 +943,7 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), - (u32 __user *) optval)) + (u32 __user *)optval)) err = -EFAULT; break; @@ -962,7 +976,9 @@ static int sco_sock_shutdown(struct socket *sock, int how) if (!sk) return 0; + sock_hold(sk); lock_sock(sk); + if (!sk->sk_shutdown) { sk->sk_shutdown = SHUTDOWN_MASK; sco_sock_clear_timer(sk); @@ -973,7 +989,10 @@ static int sco_sock_shutdown(struct socket *sock, int how) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); } + release_sock(sk); + sock_put(sk); + return err; } @@ -1017,6 +1036,11 
@@ static void sco_conn_ready(struct sco_conn *conn) } else { sco_conn_lock(conn); + if (!conn->hcon) { + sco_conn_unlock(conn); + return; + } + parent = sco_get_sock_listen(&conn->hcon->src); if (!parent) { sco_conn_unlock(conn); @@ -1026,7 +1050,7 @@ static void sco_conn_ready(struct sco_conn *conn) bh_lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, - BTPROTO_SCO, GFP_ATOMIC); + BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { bh_unlock_sock(parent); sco_conn_unlock(conn); @@ -1110,7 +1134,7 @@ static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) sco_conn_del(hcon, bt_to_errno(reason)); } -int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) { struct sco_conn *conn = hcon->sco_data; @@ -1121,12 +1145,11 @@ int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) if (skb->len) { sco_recv_frame(conn, skb); - return 0; + return; } drop: kfree_skb(skb); - return 0; } static struct hci_cb sco_cb = { diff --git a/kernel/net/bluetooth/smp.c b/kernel/net/bluetooth/smp.c index 7b815bcc8..4b175df35 100644 --- a/kernel/net/bluetooth/smp.c +++ b/kernel/net/bluetooth/smp.c @@ -33,6 +33,9 @@ #include "ecc.h" #include "smp.h" +#define SMP_DEV(hdev) \ + ((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data) + /* Low-level debug macros to be used for stuff that we don't want * accidentially in dmesg, i.e. the values of the various crypto keys * and the inputs & outputs of crypto functions. 
@@ -81,6 +84,9 @@ struct smp_dev { u8 local_rand[16]; bool debug_key; + u8 min_key_size; + u8 max_key_size; + struct crypto_blkcipher *tfm_aes; struct crypto_hash *tfm_cmac; }; @@ -371,6 +377,8 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) uint8_t tmp[16], data[16]; int err; + SMP_DBG("k %16phN r %16phN", k, r); + if (!tfm) { BT_ERR("tfm %p", tfm); return -EINVAL; @@ -400,6 +408,8 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); + SMP_DBG("r %16phN", r); + return err; } @@ -410,6 +420,10 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], u8 p1[16], p2[16]; int err; + SMP_DBG("k %16phN r %16phN", k, r); + SMP_DBG("iat %u ia %6phN rat %u ra %6phN", _iat, ia, _rat, ra); + SMP_DBG("preq %7phN pres %7phN", preq, pres); + memset(p1, 0, 16); /* p1 = pres || preq || _rat || _iat */ @@ -418,10 +432,7 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], memcpy(p1 + 2, preq, 7); memcpy(p1 + 9, pres, 7); - /* p2 = padding || ia || ra */ - memcpy(p2, ra, 6); - memcpy(p2 + 6, ia, 6); - memset(p2 + 12, 0, 4); + SMP_DBG("p1 %16phN", p1); /* res = r XOR p1 */ u128_xor((u128 *) res, (u128 *) r, (u128 *) p1); @@ -433,6 +444,13 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], return err; } + /* p2 = padding || ia || ra */ + memcpy(p2, ra, 6); + memcpy(p2 + 6, ia, 6); + memset(p2 + 12, 0, 4); + + SMP_DBG("p2 %16phN", p2); + /* res = res XOR p2 */ u128_xor((u128 *) res, (u128 *) res, (u128 *) p2); @@ -477,7 +495,7 @@ static int smp_ah(struct crypto_blkcipher *tfm, const u8 irk[16], } /* The output of the random address function ah is: - * ah(h, r) = e(k, r') mod 2^24 + * ah(k, r) = e(k, r') mod 2^24 * The output of the security function e is then truncated to 24 bits * by taking the least significant 24 bits of the output of e as the * result of ah. 
@@ -696,7 +714,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, if (rsp == NULL) { req->io_capability = conn->hcon->io_capability; req->oob_flag = oob_flag; - req->max_key_size = SMP_MAX_ENC_KEY_SIZE; + req->max_key_size = SMP_DEV(hdev)->max_key_size; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -707,7 +725,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, rsp->io_capability = conn->hcon->io_capability; rsp->oob_flag = oob_flag; - rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; + rsp->max_key_size = SMP_DEV(hdev)->max_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -718,10 +736,11 @@ static void build_pairing_cmd(struct l2cap_conn *conn, static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) { struct l2cap_chan *chan = conn->smp; + struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; - if ((max_key_size > SMP_MAX_ENC_KEY_SIZE) || - (max_key_size < SMP_MIN_ENC_KEY_SIZE)) + if (max_key_size > SMP_DEV(hdev)->max_key_size || + max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; smp->enc_key_size = max_key_size; @@ -792,7 +811,6 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason) smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason), &reason); - clear_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags); mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE); if (chan->data) @@ -985,13 +1003,10 @@ static u8 smp_random(struct smp_chan *smp) smp_s1(smp->tfm_aes, smp->tk, smp->rrnd, smp->prnd, stk); - memset(stk + smp->enc_key_size, 0, - SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return SMP_UNSPECIFIED; - hci_le_start_enc(hcon, ediv, rand, stk); + hci_le_start_enc(hcon, ediv, rand, stk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; 
set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); } else { @@ -1004,9 +1019,6 @@ static u8 smp_random(struct smp_chan *smp) smp_s1(smp->tfm_aes, smp->tk, smp->prnd, smp->rrnd, stk); - memset(stk + smp->enc_key_size, 0, - SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - if (hcon->pending_sec_level == BT_SECURITY_HIGH) auth = 1; else @@ -1033,35 +1045,6 @@ static void smp_notify_keys(struct l2cap_conn *conn) struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1]; bool persistent; - if (smp->remote_irk) { - mgmt_new_irk(hdev, smp->remote_irk); - /* Now that user space can be considered to know the - * identity address track the connection based on it - * from now on (assuming this is an LE link). - */ - if (hcon->type == LE_LINK) { - bacpy(&hcon->dst, &smp->remote_irk->bdaddr); - hcon->dst_type = smp->remote_irk->addr_type; - queue_work(hdev->workqueue, &conn->id_addr_update_work); - } - - /* When receiving an indentity resolving key for - * a remote device that does not use a resolvable - * private address, just remove the key so that - * it is possible to use the controller white - * list for scanning. - * - * Userspace will have been told to not store - * this key at this point. So it is safe to - * just remove it. - */ - if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) { - list_del_rcu(&smp->remote_irk->list); - kfree_rcu(smp->remote_irk, rcu); - smp->remote_irk = NULL; - } - } - if (hcon->type == ACL_LINK) { if (hcon->key_type == HCI_LK_DEBUG_COMBINATION) persistent = false; @@ -1069,13 +1052,27 @@ static void smp_notify_keys(struct l2cap_conn *conn) persistent = !test_bit(HCI_CONN_FLUSH_KEY, &hcon->flags); } else { - /* The LTKs and CSRKs should be persistent only if both sides - * had the bonding bit set in their authentication requests. + /* The LTKs, IRKs and CSRKs should be persistent only if + * both sides had the bonding bit set in their + * authentication requests. 
*/ persistent = !!((req->auth_req & rsp->auth_req) & SMP_AUTH_BONDING); } + if (smp->remote_irk) { + mgmt_new_irk(hdev, smp->remote_irk, persistent); + + /* Now that user space can be considered to know the + * identity address track the connection based on it + * from now on (assuming this is an LE link). + */ + if (hcon->type == LE_LINK) { + bacpy(&hcon->dst, &smp->remote_irk->bdaddr); + hcon->dst_type = smp->remote_irk->addr_type; + queue_work(hdev->workqueue, &conn->id_addr_update_work); + } + } if (smp->csrk) { smp->csrk->bdaddr_type = hcon->dst_type; @@ -1144,9 +1141,6 @@ static void sc_add_ltk(struct smp_chan *smp) else auth = 0; - memset(smp->tk + smp->enc_key_size, 0, - SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - smp->ltk = hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, key_type, auth, smp->tk, smp->enc_key_size, 0, 0); @@ -1268,7 +1262,14 @@ static void smp_distribute_keys(struct smp_chan *smp) __le16 ediv; __le64 rand; - get_random_bytes(enc.ltk, sizeof(enc.ltk)); + /* Make sure we generate only the significant amount of + * bytes based on the encryption key size, and set the rest + * of the value to zeroes. 
+ */ + get_random_bytes(enc.ltk, smp->enc_key_size); + memset(enc.ltk + smp->enc_key_size, 0, + sizeof(enc.ltk) - smp->enc_key_size); + get_random_bytes(&ediv, sizeof(ediv)); get_random_bytes(&rand, sizeof(rand)); @@ -1688,7 +1689,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp, req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; - req->max_key_size = SMP_MAX_ENC_KEY_SIZE; + req->max_key_size = conn->hcon->enc_key_size; smp->remote_key_dist = remote_dist; @@ -1697,7 +1698,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp, memset(rsp, 0, sizeof(*rsp)); - rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; + rsp->max_key_size = conn->hcon->enc_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; @@ -2190,7 +2191,7 @@ static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return true; - hci_le_start_enc(hcon, key->ediv, key->rand, key->val); + hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size); hcon->enc_key_size = key->enc_size; /* We never store STKs for master role, so clear this flag */ @@ -2294,12 +2295,6 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (!conn) return 1; - chan = conn->smp; - if (!chan) { - BT_ERR("SMP security requested but not available"); - return 1; - } - if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) return 1; @@ -2313,6 +2308,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; + chan = conn->smp; + if (!chan) { + BT_ERR("SMP security requested but not available"); + return 1; + } + l2cap_chan_lock(chan); /* If SMP is already in progress ignore this request */ @@ -2363,6 +2364,32 @@ unlock: return ret; } +void smp_cancel_pairing(struct hci_conn *hcon) +{ + struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_chan *chan; + struct smp_chan *smp; + + if 
(!conn) + return; + + chan = conn->smp; + if (!chan) + return; + + l2cap_chan_lock(chan); + + smp = chan->data; + if (smp) { + if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) + smp_failure(conn, 0); + else + smp_failure(conn, SMP_UNSPECIFIED); + } + + l2cap_chan_unlock(chan); +} + static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_encrypt_info *rp = (void *) skb->data; @@ -2742,7 +2769,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) sc_add_ltk(smp); if (hcon->out) { - hci_le_start_enc(hcon, 0, 0, smp->tk); + hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; } @@ -2984,8 +3011,13 @@ static void smp_ready_cb(struct l2cap_chan *chan) BT_DBG("chan %p", chan); + /* No need to call l2cap_chan_hold() here since we already own + * the reference taken in smp_new_conn_cb(). This is just the + * first time that we tie it to a specific pointer. The code in + * l2cap_core.c ensures that there's no risk this function wont + * get called if smp_new_conn_cb was previously called. 
+ */ conn->smp = chan; - l2cap_chan_hold(chan); if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) bredr_pairing(chan); @@ -3124,6 +3156,8 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) smp->tfm_aes = tfm_aes; smp->tfm_cmac = tfm_cmac; + smp->min_key_size = SMP_MIN_ENC_KEY_SIZE; + smp->max_key_size = SMP_MAX_ENC_KEY_SIZE; create_chan: chan = l2cap_chan_create(); @@ -3246,6 +3280,94 @@ static const struct file_operations force_bredr_smp_fops = { .llseek = default_llseek, }; +static ssize_t le_min_key_size_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[4]; + + snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size); + + return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); +} + +static ssize_t le_min_key_size_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + u8 key_size; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + + sscanf(buf, "%hhu", &key_size); + + if (key_size > SMP_DEV(hdev)->max_key_size || + key_size < SMP_MIN_ENC_KEY_SIZE) + return -EINVAL; + + SMP_DEV(hdev)->min_key_size = key_size; + + return count; +} + +static const struct file_operations le_min_key_size_fops = { + .open = simple_open, + .read = le_min_key_size_read, + .write = le_min_key_size_write, + .llseek = default_llseek, +}; + +static ssize_t le_max_key_size_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[4]; + + snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->max_key_size); + + return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); +} + +static ssize_t le_max_key_size_write(struct file *file, + const char __user *user_buf, + 
size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + u8 key_size; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + + sscanf(buf, "%hhu", &key_size); + + if (key_size > SMP_MAX_ENC_KEY_SIZE || + key_size < SMP_DEV(hdev)->min_key_size) + return -EINVAL; + + SMP_DEV(hdev)->max_key_size = key_size; + + return count; +} + +static const struct file_operations le_max_key_size_fops = { + .open = simple_open, + .read = le_max_key_size_read, + .write = le_max_key_size_write, + .llseek = default_llseek, +}; + int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; @@ -3270,6 +3392,11 @@ int smp_register(struct hci_dev *hdev) hdev->smp_data = chan; + debugfs_create_file("le_min_key_size", 0644, hdev->debugfs, hdev, + &le_min_key_size_fops); + debugfs_create_file("le_max_key_size", 0644, hdev->debugfs, hdev, + &le_max_key_size_fops); + /* If the controller does not support BR/EDR Secure Connections * feature, then the BR/EDR SMP channel shall not be present. 
* diff --git a/kernel/net/bluetooth/smp.h b/kernel/net/bluetooth/smp.h index 6cf872563..ffcc70b6b 100644 --- a/kernel/net/bluetooth/smp.h +++ b/kernel/net/bluetooth/smp.h @@ -180,6 +180,7 @@ enum smp_key_pref { }; /* SMP Commands */ +void smp_cancel_pairing(struct hci_conn *hcon); bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, enum smp_key_pref key_pref); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); diff --git a/kernel/net/bridge/Makefile b/kernel/net/bridge/Makefile index fd7ee03c5..a1cda5d47 100644 --- a/kernel/net/bridge/Makefile +++ b/kernel/net/bridge/Makefile @@ -12,6 +12,8 @@ bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o +br_netfilter-y := br_netfilter_hooks.o +br_netfilter-$(subst m,y,$(CONFIG_IPV6)) += br_netfilter_ipv6.o obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o diff --git a/kernel/net/bridge/br.c b/kernel/net/bridge/br.c index 02c24cf63..3addc05b9 100644 --- a/kernel/net/bridge/br.c +++ b/kernel/net/bridge/br.c @@ -121,16 +121,16 @@ static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; -static int br_netdev_switch_event(struct notifier_block *unused, - unsigned long event, void *ptr) +/* called with RTNL */ +static int br_switchdev_event(struct notifier_block *unused, + unsigned long event, void *ptr) { - struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr); + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; - struct netdev_switch_notifier_fdb_info *fdb_info; + struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; - rtnl_lock(); p = br_port_get_rtnl(dev); if (!p) goto out; @@ -138,14 +138,14 @@ static int br_netdev_switch_event(struct notifier_block *unused, br = p->br; switch (event) { - case NETDEV_SWITCH_FDB_ADD: + case SWITCHDEV_FDB_ADD: fdb_info = ptr; 
err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid); if (err) err = notifier_from_errno(err); break; - case NETDEV_SWITCH_FDB_DEL: + case SWITCHDEV_FDB_DEL: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid); @@ -155,12 +155,11 @@ static int br_netdev_switch_event(struct notifier_block *unused, } out: - rtnl_unlock(); return err; } -static struct notifier_block br_netdev_switch_notifier = { - .notifier_call = br_netdev_switch_event, +static struct notifier_block br_switchdev_notifier = { + .notifier_call = br_switchdev_event, }; static void __net_exit br_net_exit(struct net *net) @@ -214,7 +213,7 @@ static int __init br_init(void) if (err) goto err_out3; - err = register_netdev_switch_notifier(&br_netdev_switch_notifier); + err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; @@ -235,7 +234,7 @@ static int __init br_init(void) return 0; err_out5: - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: @@ -253,7 +252,7 @@ static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); diff --git a/kernel/net/bridge/br_device.c b/kernel/net/bridge/br_device.c index 4ff77a169..2c8095a5d 100644 --- a/kernel/net/bridge/br_device.c +++ b/kernel/net/bridge/br_device.c @@ -28,6 +28,8 @@ const struct nf_br_ops __rcu *nf_br_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_br_ops); +static struct lock_class_key bridge_netdev_addr_lock_key; + /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -56,7 +58,7 @@ netdev_tx_t 
br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid)) + if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) goto out; if (is_broadcast_ether_addr(dest)) @@ -87,6 +89,11 @@ out: return NETDEV_TX_OK; } +static void br_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); +} + static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -99,6 +106,7 @@ static int br_dev_init(struct net_device *dev) err = br_vlan_init(br); if (err) free_percpu(br->stats); + br_set_lockdep_class(dev); return err; } @@ -339,6 +347,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_bridge_getlink = br_getlink, .ndo_bridge_setlink = br_setlink, .ndo_bridge_dellink = br_dellink, + .ndo_features_check = passthru_features_check, }; static void br_dev_free(struct net_device *dev) @@ -364,8 +373,7 @@ void br_dev_setup(struct net_device *dev) dev->destructor = br_dev_free; dev->ethtool_ops = &br_ethtool_ops; SET_NETDEV_DEVTYPE(dev, &br_type); - dev->tx_queue_len = 0; - dev->priv_flags = IFF_EBRIDGE; + dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; @@ -391,7 +399,7 @@ void br_dev_setup(struct net_device *dev) br->bridge_max_age = br->max_age = 20 * HZ; br->bridge_hello_time = br->hello_time = 2 * HZ; br->bridge_forward_delay = br->forward_delay = 15 * HZ; - br->ageing_time = 300 * HZ; + br->ageing_time = BR_DEFAULT_AGEING_TIME; br_netfilter_rtable_init(br); br_stp_timer_init(br); diff --git a/kernel/net/bridge/br_fdb.c b/kernel/net/bridge/br_fdb.c index 659fb9667..a642bb829 100644 --- a/kernel/net/bridge/br_fdb.c +++ b/kernel/net/bridge/br_fdb.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "br_private.h" static struct 
kmem_cache *br_fdb_cache __read_mostly; @@ -130,11 +131,28 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) } } +static void fdb_del_external_learn(struct net_bridge_fdb_entry *f) +{ + struct switchdev_obj_port_fdb fdb = { + .obj = { + .id = SWITCHDEV_OBJ_ID_PORT_FDB, + .flags = SWITCHDEV_F_DEFER, + }, + .vid = f->vlan_id, + }; + + ether_addr_copy(fdb.addr, f->addr.addr); + switchdev_port_obj_del(f->dst->dev, &fdb.obj); +} + static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) { if (f->is_static) fdb_del_hw_addr(br, f->addr.addr); + if (f->added_by_external_learn) + fdb_del_external_learn(f); + hlist_del_rcu(&f->hlist); fdb_notify(br, f, RTM_DELNEIGH); call_rcu(&f->rcu, fdb_rcu_free); @@ -146,22 +164,27 @@ static void fdb_delete_local(struct net_bridge *br, struct net_bridge_fdb_entry *f) { const unsigned char *addr = f->addr.addr; - u16 vid = f->vlan_id; + struct net_bridge_vlan_group *vg; + const struct net_bridge_vlan *v; struct net_bridge_port *op; + u16 vid = f->vlan_id; /* Maybe another port has same hw addr? */ list_for_each_entry(op, &br->port_list, list) { + vg = nbp_vlan_group(op); if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && - (!vid || nbp_vlan_find(op, vid))) { + (!vid || br_vlan_find(vg, vid))) { f->dst = op; f->added_by_user = 0; return; } } + vg = br_vlan_group(br); + v = br_vlan_find(vg, vid); /* Maybe bridge device has same hw addr? 
*/ if (p && ether_addr_equal(br->dev->dev_addr, addr) && - (!vid || br_vlan_find(br, vid))) { + (!vid || (v && br_vlan_should_use(v)))) { f->dst = NULL; f->added_by_user = 0; return; @@ -186,14 +209,14 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { + struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; - struct net_port_vlans *pv = nbp_get_vlan_info(p); - bool no_vlan = !pv; + struct net_bridge_vlan *v; int i; - u16 vid; spin_lock_bh(&br->hash_lock); + vg = nbp_vlan_group(p); /* Search all chains since old address/hash is unknown */ for (i = 0; i < BR_HASH_SIZE; i++) { struct hlist_node *h; @@ -209,7 +232,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) * configured, we can safely be done at * this point. */ - if (no_vlan) + if (!vg || !vg->num_vlans) goto insert; } } @@ -219,15 +242,15 @@ insert: /* insert new address, may fail if invalid address or dup. */ fdb_insert(br, p, newaddr, 0); - if (no_vlan) + if (!vg || !vg->num_vlans) goto done; /* Now add entries for every VLAN configured on the port. * This function runs under RTNL so the bitmap will not change * from under us. 
*/ - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) - fdb_insert(br, p, newaddr, vid); + list_for_each_entry(v, &vg->vlan_list, vlist) + fdb_insert(br, p, newaddr, v->vid); done: spin_unlock_bh(&br->hash_lock); @@ -235,9 +258,9 @@ done: void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) { + struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; - struct net_port_vlans *pv; - u16 vid = 0; + struct net_bridge_vlan *v; spin_lock_bh(&br->hash_lock); @@ -247,20 +270,18 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); - + vg = br_vlan_group(br); + if (!vg || !vg->num_vlans) + goto out; /* Now remove and add entries for every VLAN configured on the * bridge. This function runs under RTNL so the bitmap will not * change from under us. */ - pv = br_get_vlan_info(br); - if (!pv) - goto out; - - for_each_set_bit_from(vid, pv->vlan_bitmap, VLAN_N_VID) { - f = __br_fdb_get(br, br->dev->dev_addr, vid); + list_for_each_entry(v, &vg->vlan_list, vlist) { + f = __br_fdb_get(br, br->dev->dev_addr, v->vid); if (f && f->is_local && !f->dst) fdb_delete_local(br, NULL, f); - fdb_insert(br, NULL, newaddr, vid); + fdb_insert(br, NULL, newaddr, v->vid); } out: spin_unlock_bh(&br->hash_lock); @@ -282,6 +303,8 @@ void br_fdb_cleanup(unsigned long _data) unsigned long this_timer; if (f->is_static) continue; + if (f->added_by_external_learn) + continue; this_timer = f->updated + delay; if (time_before_eq(this_timer, jiffies)) fdb_delete(br, f); @@ -313,9 +336,11 @@ void br_fdb_flush(struct net_bridge *br) /* Flush all entries referring to a specific port. 
* if do_all is set also flush static entries + * if vid is set delete all entries that match the vlan_id */ void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, + u16 vid, int do_all) { int i; @@ -330,8 +355,9 @@ void br_fdb_delete_by_port(struct net_bridge *br, if (f->dst != p) continue; - if (f->is_static && !do_all) - continue; + if (!do_all) + if (f->is_static || (vid && f->vlan_id != vid)) + continue; if (f->is_local) fdb_delete_local(br, p, f); @@ -469,7 +495,9 @@ static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head, static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, struct net_bridge_port *source, const unsigned char *addr, - __u16 vid) + __u16 vid, + unsigned char is_local, + unsigned char is_static) { struct net_bridge_fdb_entry *fdb; @@ -478,8 +506,8 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, memcpy(fdb->addr.addr, addr, ETH_ALEN); fdb->dst = source; fdb->vlan_id = vid; - fdb->is_local = 0; - fdb->is_static = 0; + fdb->is_local = is_local; + fdb->is_static = is_static; fdb->added_by_user = 0; fdb->added_by_external_learn = 0; fdb->updated = fdb->used = jiffies; @@ -510,11 +538,10 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, fdb_delete(br, fdb); } - fdb = fdb_create(head, source, addr, vid); + fdb = fdb_create(head, source, addr, vid, 1, 1); if (!fdb) return -ENOMEM; - fdb->is_local = fdb->is_static = 1; fdb_add_hw_addr(br, addr); fdb_notify(br, fdb, RTM_NEWNEIGH); return 0; @@ -571,7 +598,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } else { spin_lock(&br->hash_lock); if (likely(!fdb_find(head, addr, vid))) { - fdb = fdb_create(head, source, addr, vid); + fdb = fdb_create(head, source, addr, vid, 0, 0); if (fdb) { if (unlikely(added_by_user)) fdb->added_by_user = 1; @@ -585,13 +612,14 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } } -static int 
fdb_to_nud(const struct net_bridge_fdb_entry *fdb) +static int fdb_to_nud(const struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb) { if (fdb->is_local) return NUD_PERMANENT; else if (fdb->is_static) return NUD_NOARP; - else if (has_expired(fdb->dst->br, fdb)) + else if (has_expired(br, fdb)) return NUD_STALE; else return NUD_REACHABLE; @@ -617,7 +645,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_flags = fdb->added_by_external_learn ? NTF_EXT_LEARNED : 0; ndm->ndm_type = 0; ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; - ndm->ndm_state = fdb_to_nud(fdb); + ndm->ndm_state = fdb_to_nud(br, fdb); if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr)) goto nla_put_failure; @@ -736,12 +764,18 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, struct net_bridge_fdb_entry *fdb; bool modified = false; + /* If the port cannot learn allow only local and static entries */ + if (!(state & NUD_PERMANENT) && !(state & NUD_NOARP) && + !(source->state == BR_STATE_LEARNING || + source->state == BR_STATE_FORWARDING)) + return -EPERM; + fdb = fdb_find(head, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) return -ENOENT; - fdb = fdb_create(head, source, addr, vid); + fdb = fdb_create(head, source, addr, vid, 0, 0); if (!fdb) return -ENOMEM; @@ -756,7 +790,7 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, } } - if (fdb_to_nud(fdb) != state) { + if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { fdb->is_local = 1; if (!fdb->is_static) { @@ -816,9 +850,11 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags) { - struct net_bridge_port *p; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; + struct net_bridge_vlan *v; + struct net_bridge *br = NULL; int err = 0; - struct net_port_vlans *pv; if (!(ndm->ndm_state & 
(NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) { pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); @@ -830,34 +866,51 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - p = br_port_get_rtnl(dev); - if (p == NULL) { - pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", - dev->name); - return -EINVAL; + if (dev->priv_flags & IFF_EBRIDGE) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } else { + p = br_port_get_rtnl(dev); + if (!p) { + pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + vg = nbp_vlan_group(p); } - pv = nbp_get_vlan_info(p); if (vid) { - if (!pv || !test_bit(vid, pv->vlan_bitmap)) { - pr_info("bridge: RTM_NEWNEIGH with unconfigured " - "vlan %d on port %s\n", vid, dev->name); + v = br_vlan_find(vg, vid); + if (!v || !br_vlan_should_use(v)) { + pr_info("bridge: RTM_NEWNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } /* VID was specified, so use it. */ - err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); + if (dev->priv_flags & IFF_EBRIDGE) + err = br_fdb_insert(br, NULL, addr, vid); + else + err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); } else { - err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); - if (err || !pv) + if (dev->priv_flags & IFF_EBRIDGE) + err = br_fdb_insert(br, NULL, addr, 0); + else + err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); + if (err || !vg || !vg->num_vlans) goto out; /* We have vlans configured on this port and user didn't * specify a VLAN. To be nice, add/update entry for every * vlan on this port. 
*/ - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + if (dev->priv_flags & IFF_EBRIDGE) + err = br_fdb_insert(br, NULL, addr, v->vid); + else + err = __br_fdb_add(ndm, p, addr, nlh_flags, + v->vid); if (err) goto out; } @@ -867,13 +920,41 @@ out: return err; } -static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vlan) +static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, + u16 vid) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; + struct net_bridge_fdb_entry *fdb; + + fdb = fdb_find(head, addr, vid); + if (!fdb) + return -ENOENT; + + fdb_delete(br, fdb); + return 0; +} + +static int __br_fdb_delete_by_addr(struct net_bridge *br, + const unsigned char *addr, u16 vid) +{ + int err; + + spin_lock_bh(&br->hash_lock); + err = fdb_delete_by_addr(br, addr, vid); + spin_unlock_bh(&br->hash_lock); + + return err; +} + +static int fdb_delete_by_addr_and_port(struct net_bridge_port *p, + const u8 *addr, u16 vlan) { + struct net_bridge *br = p->br; struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)]; struct net_bridge_fdb_entry *fdb; fdb = fdb_find(head, addr, vlan); - if (!fdb) + if (!fdb || fdb->dst != p) return -ENOENT; fdb_delete(br, fdb); @@ -886,7 +967,7 @@ static int __br_fdb_delete(struct net_bridge_port *p, int err; spin_lock_bh(&p->br->hash_lock); - err = fdb_delete_by_addr(p->br, addr, vid); + err = fdb_delete_by_addr_and_port(p, addr, vid); spin_unlock_bh(&p->br->hash_lock); return err; @@ -897,38 +978,53 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { - struct net_bridge_port *p; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; + struct net_bridge_vlan *v; + struct net_bridge *br = NULL; int err; - struct net_port_vlans *pv; - p = br_port_get_rtnl(dev); - 
if (p == NULL) { - pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", - dev->name); - return -EINVAL; + if (dev->priv_flags & IFF_EBRIDGE) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } else { + p = br_port_get_rtnl(dev); + if (!p) { + pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + vg = nbp_vlan_group(p); } - pv = nbp_get_vlan_info(p); if (vid) { - if (!pv || !test_bit(vid, pv->vlan_bitmap)) { - pr_info("bridge: RTM_DELNEIGH with unconfigured " - "vlan %d on port %s\n", vid, dev->name); + v = br_vlan_find(vg, vid); + if (!v) { + pr_info("bridge: RTM_DELNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } - err = __br_fdb_delete(p, addr, vid); + if (dev->priv_flags & IFF_EBRIDGE) + err = __br_fdb_delete_by_addr(br, addr, vid); + else + err = __br_fdb_delete(p, addr, vid); } else { err = -ENOENT; - err &= __br_fdb_delete(p, addr, 0); - if (!pv) + if (dev->priv_flags & IFF_EBRIDGE) + err = __br_fdb_delete_by_addr(br, addr, 0); + else + err &= __br_fdb_delete(p, addr, 0); + + if (!vg || !vg->num_vlans) goto out; - /* We have vlans configured on this port and user didn't - * specify a VLAN. To be nice, add/update entry for every - * vlan on this port. 
- */ - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - err &= __br_fdb_delete(p, addr, vid); + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + if (dev->priv_flags & IFF_EBRIDGE) + err = __br_fdb_delete_by_addr(br, addr, v->vid); + else + err &= __br_fdb_delete(p, addr, v->vid); } } out: @@ -1004,7 +1100,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, head = &br->hash[br_mac_hash(addr, vid)]; fdb = fdb_find(head, addr, vid); if (!fdb) { - fdb = fdb_create(head, p, addr, vid); + fdb = fdb_create(head, p, addr, vid, 0, 0); if (!fdb) { err = -ENOMEM; goto err_unlock; diff --git a/kernel/net/bridge/br_forward.c b/kernel/net/bridge/br_forward.c index 0ff6e1bbc..fcdb86dd5 100644 --- a/kernel/net/bridge/br_forward.c +++ b/kernel/net/bridge/br_forward.c @@ -30,30 +30,47 @@ static int deliver_clone(const struct net_bridge_port *prev, static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { + struct net_bridge_vlan_group *vg; + + vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - br_allowed_egress(p->br, nbp_get_vlan_info(p), skb) && - p->state == BR_STATE_FORWARDING; + br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING; } -int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb) +int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { - if (!is_skb_forwardable(skb->dev, skb)) { - kfree_skb(skb); - } else { - skb_push(skb, ETH_HLEN); - br_drop_fake_rtable(skb); - skb_sender_cpu_clear(skb); - dev_queue_xmit(skb); + if (!is_skb_forwardable(skb->dev, skb)) + goto drop; + + skb_push(skb, ETH_HLEN); + br_drop_fake_rtable(skb); + skb_sender_cpu_clear(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + (skb->protocol == htons(ETH_P_8021Q) || + skb->protocol == htons(ETH_P_8021AD))) { + int depth; + + if (!__vlan_get_protocol(skb, skb->protocol, &depth)) + goto drop; + + 
skb_set_network_header(skb, depth); } + dev_queue_xmit(skb); + + return 0; + +drop: + kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); -int br_forward_finish(struct sock *sk, struct sk_buff *skb) +int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, sk, skb, - NULL, skb->dev, + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, + net, sk, skb, NULL, skb->dev, br_dev_queue_push_xmit); } @@ -61,7 +78,10 @@ EXPORT_SYMBOL_GPL(br_forward_finish); static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { - skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb); + struct net_bridge_vlan_group *vg; + + vg = nbp_vlan_group_rcu(to); + skb = br_handle_vlan(to->br, vg, skb); if (!skb) return; @@ -77,13 +97,14 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) return; } - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb, - NULL, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, + dev_net(skb->dev), NULL, skb,NULL, skb->dev, br_forward_finish); } static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) { + struct net_bridge_vlan_group *vg; struct net_device *indev; if (skb_warn_if_lro(skb)) { @@ -91,7 +112,8 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) return; } - skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb); + vg = nbp_vlan_group_rcu(to); + skb = br_handle_vlan(to->br, vg, skb); if (!skb) return; @@ -99,8 +121,8 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) skb->dev = to->dev; skb_forward_csum(skb); - NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, NULL, skb, - indev, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, + dev_net(indev), NULL, skb, indev, skb->dev, br_forward_finish); } @@ -119,7 +141,7 @@ EXPORT_SYMBOL_GPL(br_deliver); /* called with rcu_read_lock */ void br_forward(const struct net_bridge_port *to, 
struct sk_buff *skb, struct sk_buff *skb0) { - if (should_deliver(to, skb)) { + if (to && should_deliver(to, skb)) { if (skb0) deliver_clone(to, skb, __br_forward); else diff --git a/kernel/net/bridge/br_if.c b/kernel/net/bridge/br_if.c index 1849d96b3..ec02f5869 100644 --- a/kernel/net/bridge/br_if.c +++ b/kernel/net/bridge/br_if.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "br_private.h" @@ -249,7 +250,9 @@ static void del_nbp(struct net_bridge_port *p) list_del_rcu(&p->list); nbp_vlan_flush(p); - br_fdb_delete_by_port(br, p, 1); + br_fdb_delete_by_port(br, p, 0, 1); + switchdev_deferred_process(); + nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); @@ -278,9 +281,10 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } - br_fdb_delete_by_port(br, NULL, 1); + br_fdb_delete_by_port(br, NULL, 0, 1); br_vlan_flush(br); + br_multicast_dev_del(br); del_timer_sync(&br->gc_timer); br_sysfs_delbr(br->dev); diff --git a/kernel/net/bridge/br_input.c b/kernel/net/bridge/br_input.c index f921a5dce..f7fba7410 100644 --- a/kernel/net/bridge/br_input.c +++ b/kernel/net/bridge/br_input.c @@ -26,38 +26,44 @@ br_should_route_hook_t __rcu *br_should_route_hook __read_mostly; EXPORT_SYMBOL(br_should_route_hook); +static int +br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + return netif_receive_skb(skb); +} + static int br_pass_frame_up(struct sk_buff *skb) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); + struct net_bridge_vlan_group *vg; struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); - struct net_port_vlans *pv; u64_stats_update_begin(&brstats->syncp); brstats->rx_packets++; brstats->rx_bytes += skb->len; u64_stats_update_end(&brstats->syncp); + vg = br_vlan_group_rcu(br); /* Bridge is just like any other port. 
Make sure the * packet is allowed except in promisc modue when someone * may be running packet capture. */ - pv = br_get_vlan_info(br); if (!(brdev->flags & IFF_PROMISC) && - !br_allowed_egress(br, pv, skb)) { + !br_allowed_egress(vg, skb)) { kfree_skb(skb); return NET_RX_DROP; } indev = skb->dev; skb->dev = brdev; - skb = br_handle_vlan(br, pv, skb); + skb = br_handle_vlan(br, vg, skb); if (!skb) return NET_RX_DROP; - return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb, - indev, NULL, - netif_receive_skb_sk); + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, + dev_net(indev), NULL, skb, indev, NULL, + br_netif_receive_skb); } static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, @@ -120,7 +126,7 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, } /* note: already called with rcu_read_lock */ -int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb) +int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const unsigned char *dest = eth_hdr(skb)->h_dest; struct net_bridge_port *p = br_port_get_rcu(skb->dev); @@ -134,7 +140,7 @@ int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb) if (!p || p->state == BR_STATE_DISABLED) goto drop; - if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid)) + if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) goto out; /* insert into forwarding database after filtering to avoid spoofing */ @@ -208,7 +214,7 @@ drop: EXPORT_SYMBOL_GPL(br_handle_frame_finish); /* note: already called with rcu_read_lock */ -static int br_handle_local_finish(struct sock *sk, struct sk_buff *skb) +static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; @@ -278,8 +284,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } /* Deliver packet to local host only */ - if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb, - skb->dev, 
NULL, br_handle_local_finish)) { + if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, + br_handle_local_finish)) { return RX_HANDLER_CONSUMED; /* consumed by filter */ } else { *pskb = skb; @@ -303,8 +310,8 @@ forward: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; - NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, br_handle_frame_finish); break; default: diff --git a/kernel/net/bridge/br_ioctl.c b/kernel/net/bridge/br_ioctl.c index 8d423bc64..263b4de4d 100644 --- a/kernel/net/bridge/br_ioctl.c +++ b/kernel/net/bridge/br_ioctl.c @@ -200,8 +200,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - br->ageing_time = clock_t_to_jiffies(args[1]); - return 0; + return br_set_ageing_time(br, args[1]); case BRCTL_GET_PORT_INFO: { diff --git a/kernel/net/bridge/br_mdb.c b/kernel/net/bridge/br_mdb.c index d1f910c0d..cd8deea2d 100644 --- a/kernel/net/bridge/br_mdb.c +++ b/kernel/net/bridge/br_mdb.c @@ -85,6 +85,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, memset(&e, 0, sizeof(e)); e.ifindex = port->dev->ifindex; e.state = p->state; + e.vid = p->addr.vid; if (p->addr.proto == htons(ETH_P_IP)) e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) @@ -230,7 +231,7 @@ errout: } void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type) + struct br_ip *group, int type, u8 state) { struct br_mdb_entry entry; @@ -241,9 +242,78 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, #if IS_ENABLED(CONFIG_IPV6) entry.addr.u.ip6 = group->u.ip6; #endif + entry.state = state; + entry.vid = group->vid; __br_mdb_notify(dev, &entry, type); } +static int nlmsg_populate_rtr_fill(struct sk_buff *skb, + 
struct net_device *dev, + int ifindex, u32 pid, + u32 seq, int type, unsigned int flags) +{ + struct br_port_msg *bpm; + struct nlmsghdr *nlh; + struct nlattr *nest; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + bpm = nlmsg_data(nlh); + memset(bpm, 0, sizeof(*bpm)); + bpm->family = AF_BRIDGE; + bpm->ifindex = dev->ifindex; + nest = nla_nest_start(skb, MDBA_ROUTER); + if (!nest) + goto cancel; + + if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex)) + goto end; + + nla_nest_end(skb, nest); + nlmsg_end(skb, nlh); + return 0; + +end: + nla_nest_end(skb, nest); +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static inline size_t rtnl_rtr_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct br_port_msg)) + + nla_total_size(sizeof(__u32)); +} + +void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, + int type) +{ + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOBUFS; + int ifindex; + + ifindex = port ? 
port->dev->ifindex : 0; + skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); + if (!skb) + goto errout; + + err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_MDB, err); +} + static bool is_valid_mdb_entry(struct br_mdb_entry *entry) { if (entry->ifindex == 0) @@ -263,6 +333,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry) return false; if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) return false; + if (entry->vid >= VLAN_VID_MASK) + return false; return true; } @@ -323,6 +395,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct net_bridge_mdb_htable *mdb; + unsigned long now = jiffies; int err; mdb = mlock_dereference(br->mdb, br); @@ -347,6 +420,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); + if (state == MDB_TEMPORARY) + mod_timer(&p->timer, now + br->multicast_membership_interval); return 0; } @@ -371,6 +446,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, return -EINVAL; memset(&ip, 0, sizeof(ip)); + ip.vid = entry->vid; ip.proto = entry->addr.proto; if (ip.proto == htons(ETH_P_IP)) ip.u.ip4 = entry->addr.u.ip4; @@ -388,8 +464,11 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); + struct net_bridge_vlan_group *vg; + struct net_device *dev, *pdev; struct br_mdb_entry *entry; - struct net_device *dev; + struct net_bridge_port *p; + struct net_bridge_vlan *v; struct net_bridge *br; int err; @@ -399,9 +478,32 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) br = 
netdev_priv(dev); - err = __br_mdb_add(net, br, entry); - if (!err) - __br_mdb_notify(dev, entry, RTM_NEWMDB); + /* If vlan filtering is enabled and VLAN is not specified + * install mdb entry on all vlans configured on the port. + */ + pdev = __dev_get_by_index(net, entry->ifindex); + if (!pdev) + return -ENODEV; + + p = br_port_get_rtnl(pdev); + if (!p || p->br != br || p->state == BR_STATE_DISABLED) + return -EINVAL; + + vg = nbp_vlan_group(p); + if (br_vlan_enabled(br) && vg && entry->vid == 0) { + list_for_each_entry(v, &vg->vlan_list, vlist) { + entry->vid = v->vid; + err = __br_mdb_add(net, br, entry); + if (err) + break; + __br_mdb_notify(dev, entry, RTM_NEWMDB); + } + } else { + err = __br_mdb_add(net, br, entry); + if (!err) + __br_mdb_notify(dev, entry, RTM_NEWMDB); + } + return err; } @@ -418,20 +520,14 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) return -EINVAL; memset(&ip, 0, sizeof(ip)); + ip.vid = entry->vid; ip.proto = entry->addr.proto; - if (ip.proto == htons(ETH_P_IP)) { - if (timer_pending(&br->ip4_other_query.timer)) - return -EBUSY; - + if (ip.proto == htons(ETH_P_IP)) ip.u.ip4 = entry->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - } else { - if (timer_pending(&br->ip6_other_query.timer)) - return -EBUSY; - + else ip.u.ip6 = entry->addr.u.ip6; #endif - } spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); @@ -449,6 +545,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (p->port->state == BR_STATE_DISABLED) goto unlock; + entry->state = p->state; rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); @@ -468,8 +565,12 @@ unlock: static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct net_device *dev; + struct net *net = sock_net(skb->sk); + struct net_bridge_vlan_group *vg; + struct net_device *dev, *pdev; struct br_mdb_entry *entry; + struct net_bridge_port *p; + struct net_bridge_vlan *v; struct net_bridge *br; 
int err; @@ -479,9 +580,31 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) br = netdev_priv(dev); - err = __br_mdb_del(br, entry); - if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB); + /* If vlan filtering is enabled and VLAN is not specified + * delete mdb entry on all vlans configured on the port. + */ + pdev = __dev_get_by_index(net, entry->ifindex); + if (!pdev) + return -ENODEV; + + p = br_port_get_rtnl(pdev); + if (!p || p->br != br || p->state == BR_STATE_DISABLED) + return -EINVAL; + + vg = nbp_vlan_group(p); + if (br_vlan_enabled(br) && vg && entry->vid == 0) { + list_for_each_entry(v, &vg->vlan_list, vlist) { + entry->vid = v->vid; + err = __br_mdb_del(br, entry); + if (!err) + __br_mdb_notify(dev, entry, RTM_DELMDB); + } + } else { + err = __br_mdb_del(br, entry); + if (!err) + __br_mdb_notify(dev, entry, RTM_DELMDB); + } + return err; } diff --git a/kernel/net/bridge/br_multicast.c b/kernel/net/bridge/br_multicast.c index 9ba383f5b..03661d974 100644 --- a/kernel/net/bridge/br_multicast.c +++ b/kernel/net/bridge/br_multicast.c @@ -37,6 +37,18 @@ static void br_multicast_start_querier(struct net_bridge *br, struct bridge_mcast_own_query *query); +static void br_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port); +static void br_ip4_multicast_leave_group(struct net_bridge *br, + struct net_bridge_port *port, + __be32 group, + __u16 vid); +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_leave_group(struct net_bridge *br, + struct net_bridge_port *port, + const struct in6_addr *group, + __u16 vid); +#endif unsigned int br_mdb_rehash_seq; static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) @@ -271,6 +283,8 @@ static void br_multicast_del_pg(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); + br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, + p->state); call_rcu_bh(&p->rcu, br_multicast_free_pg); if (!mp->ports && 
!mp->mglist && @@ -692,7 +706,7 @@ static int br_multicast_add_group(struct net_bridge *br, if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB); + br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY); found: mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -752,6 +766,7 @@ static void br_multicast_router_expired(unsigned long data) goto out; hlist_del_init_rcu(&port->rlist); + br_rtr_notify(br->dev, port, RTM_DELMDB); out: spin_unlock(&br->multicast_lock); @@ -814,8 +829,8 @@ static void __br_multicast_send_query(struct net_bridge *br, if (port) { skb->dev = port->dev; - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb, - NULL, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, + dev_net(port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); } else { br_multicast_select_own_querier(br, ip, skb); @@ -912,6 +927,15 @@ void br_multicast_add_port(struct net_bridge_port *port) void br_multicast_del_port(struct net_bridge_port *port) { + struct net_bridge *br = port->br; + struct net_bridge_port_group *pg; + struct hlist_node *n; + + /* Take care of the remaining groups, only perm ones should be left */ + spin_lock_bh(&br->multicast_lock); + hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) + br_multicast_del_pg(br, pg); + spin_unlock_bh(&br->multicast_lock); del_timer_sync(&port->multicast_router_timer); } @@ -936,6 +960,8 @@ void br_multicast_enable_port(struct net_bridge_port *port) #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&port->ip6_own_query); #endif + if (port->multicast_router == 2 && hlist_unhashed(&port->rlist)) + br_multicast_add_router(br, port); out: spin_unlock(&br->multicast_lock); @@ -949,10 +975,13 @@ void br_multicast_disable_port(struct net_bridge_port *port) spin_lock(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - br_multicast_del_pg(br, pg); + if (pg->state == MDB_TEMPORARY) + br_multicast_del_pg(br, pg); - if 
(!hlist_unhashed(&port->rlist)) + if (!hlist_unhashed(&port->rlist)) { hlist_del_init_rcu(&port->rlist); + br_rtr_notify(br->dev, port, RTM_DELMDB); + } del_timer(&port->multicast_router_timer); del_timer(&port->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) @@ -975,9 +1004,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int err = 0; __be32 group; - if (!pskb_may_pull(skb, sizeof(*ih))) - return -EINVAL; - ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = skb_transport_offset(skb) + sizeof(*ih); @@ -1009,9 +1035,15 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, continue; } - err = br_ip4_multicast_add_group(br, port, group, vid); - if (err) - break; + if ((type == IGMPV3_CHANGE_TO_INCLUDE || + type == IGMPV3_MODE_IS_INCLUDE) && + ntohs(grec->grec_nsrcs) == 0) { + br_ip4_multicast_leave_group(br, port, group, vid); + } else { + err = br_ip4_multicast_add_group(br, port, group, vid); + if (err) + break; + } } return err; @@ -1070,10 +1102,17 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, continue; } - err = br_ip6_multicast_add_group(br, port, &grec->grec_mca, - vid); - if (err) - break; + if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || + grec->grec_type == MLD2_MODE_IS_INCLUDE) && + ntohs(*nsrcs) == 0) { + br_ip6_multicast_leave_group(br, port, &grec->grec_mca, + vid); + } else { + err = br_ip6_multicast_add_group(br, port, + &grec->grec_mca, vid); + if (!err) + break; + } } return err; @@ -1180,6 +1219,7 @@ static void br_multicast_add_router(struct net_bridge *br, hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); + br_rtr_notify(br->dev, port, RTM_NEWMDB); } static void br_multicast_mark_router(struct net_bridge *br, @@ -1247,25 +1287,14 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay = 10 * HZ; group = 0; } - } else { - if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) { - err = -EINVAL; - goto out; - } - + } else if 
(skb->len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out; max_delay = ih3->code ? IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; - } - - /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer - * all-systems destination addresses (224.0.0.1) for general queries - */ - if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) { - err = -EINVAL; + } else { goto out; } @@ -1328,12 +1357,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ - if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) { - err = -EINVAL; - goto out; - } - if (skb->len == sizeof(*mld)) { if (!pskb_may_pull(skb, sizeof(*mld))) { err = -EINVAL; @@ -1357,14 +1380,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, is_general_query = group && ipv6_addr_any(group); - /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer - * all-nodes destination address (ff02::1) for general queries - */ - if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) { - err = -EINVAL; - goto out; - } - if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.u.ip6 = ip6h->saddr; @@ -1417,8 +1432,7 @@ br_multicast_leave_group(struct net_bridge *br, spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED) || - timer_pending(&other_query->timer)) + (port && port->state == BR_STATE_DISABLED)) goto out; mdb = mlock_dereference(br->mdb, br); @@ -1426,6 +1440,32 @@ br_multicast_leave_group(struct net_bridge *br, if (!mp) goto out; + if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) { + struct net_bridge_port_group __rcu **pp; + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, br)) != NULL; + pp = &p->next) { + if (p->port != port) + continue; + + rcu_assign_pointer(*pp, p->next); + hlist_del_init(&p->mglist); + del_timer(&p->timer); + 
call_rcu_bh(&p->rcu, br_multicast_free_pg); + br_mdb_notify(br->dev, port, group, RTM_DELMDB, + p->state); + + if (!mp->ports && !mp->mglist && + netif_running(br->dev)) + mod_timer(&mp->timer, jiffies); + } + goto out; + } + + if (timer_pending(&other_query->timer)) + goto out; + if (br->multicast_querier) { __br_multicast_send_query(br, port, &mp->addr); @@ -1451,28 +1491,6 @@ br_multicast_leave_group(struct net_bridge *br, } } - if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) { - struct net_bridge_port_group __rcu **pp; - - for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; - pp = &p->next) { - if (p->port != port) - continue; - - rcu_assign_pointer(*pp, p->next); - hlist_del_init(&p->mglist); - del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); - br_mdb_notify(br->dev, port, group, RTM_DELMDB); - - if (!mp->ports && !mp->mglist && - netif_running(br->dev)) - mod_timer(&mp->timer, jiffies); - } - goto out; - } - now = jiffies; time = now + br->multicast_last_member_count * br->multicast_last_member_interval; @@ -1556,74 +1574,22 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2 = skb; - const struct iphdr *iph; + struct sk_buff *skb_trimmed = NULL; struct igmphdr *ih; - unsigned int len; - unsigned int offset; int err; - /* We treat OOM as packet loss for now. 
*/ - if (!pskb_may_pull(skb, sizeof(*iph))) - return -EINVAL; - - iph = ip_hdr(skb); - - if (iph->ihl < 5 || iph->version != 4) - return -EINVAL; - - if (!pskb_may_pull(skb, ip_hdrlen(skb))) - return -EINVAL; - - iph = ip_hdr(skb); + err = ip_mc_check_igmp(skb, &skb_trimmed); - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - return -EINVAL; - - if (iph->protocol != IPPROTO_IGMP) { - if (!ipv4_is_local_multicast(iph->daddr)) + if (err == -ENOMSG) { + if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; + } else if (err < 0) { + return err; } - len = ntohs(iph->tot_len); - if (skb->len < len || len < ip_hdrlen(skb)) - return -EINVAL; - - if (skb->len > len) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = pskb_trim_rcsum(skb2, len); - if (err) - goto err_out; - } - - len -= ip_hdrlen(skb2); - offset = skb_network_offset(skb2) + ip_hdrlen(skb2); - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - - err = -EINVAL; - if (!pskb_may_pull(skb2, sizeof(*ih))) - goto out; - - switch (skb2->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb2->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb2->csum = 0; - if (skb_checksum_complete(skb2)) - goto out; - } - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; - ih = igmp_hdr(skb2); + ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: @@ -1632,21 +1598,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_add_group(br, port, ih->group, vid); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb2, vid); + err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - err = br_ip4_multicast_query(br, port, skb2, vid); + err = br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid); break; } -out: - 
__skb_push(skb2, offset); -err_out: - if (skb2 != skb) - kfree_skb(skb2); + if (skb_trimmed && skb_trimmed != skb) + kfree_skb(skb_trimmed); + return err; } @@ -1656,138 +1620,42 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2; - const struct ipv6hdr *ip6h; - u8 icmp6_type; - u8 nexthdr; - __be16 frag_off; - unsigned int len; - int offset; + struct sk_buff *skb_trimmed = NULL; + struct mld_msg *mld; int err; - if (!pskb_may_pull(skb, sizeof(*ip6h))) - return -EINVAL; - - ip6h = ipv6_hdr(skb); - - /* - * We're interested in MLD messages only. - * - Version is 6 - * - MLD has always Router Alert hop-by-hop option - * - But we do not support jumbrograms. - */ - if (ip6h->version != 6) - return 0; - - /* Prevent flooding this packet if there is no listener present */ - if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) - BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - - if (ip6h->nexthdr != IPPROTO_HOPOPTS || - ip6h->payload_len == 0) - return 0; - - len = ntohs(ip6h->payload_len) + sizeof(*ip6h); - if (skb->len < len) - return -EINVAL; - - nexthdr = ip6h->nexthdr; - offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off); + err = ipv6_mc_check_mld(skb, &skb_trimmed); - if (offset < 0 || nexthdr != IPPROTO_ICMPV6) + if (err == -ENOMSG) { + if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; - - /* Okay, we found ICMPv6 header */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = -EINVAL; - if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr))) - goto out; - - len -= offset - skb_network_offset(skb2); - - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - skb_postpull_rcsum(skb2, skb_network_header(skb2), - skb_network_header_len(skb2)); - - icmp6_type = icmp6_hdr(skb2)->icmp6_type; - - switch (icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case 
ICMPV6_MLD2_REPORT: - break; - default: - err = 0; - goto out; - } - - /* Okay, we found MLD message. Check further. */ - if (skb2->len > len) { - err = pskb_trim_rcsum(skb2, len); - if (err) - goto out; - err = -EINVAL; - } - - ip6h = ipv6_hdr(skb2); - - switch (skb2->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb2->len, - IPPROTO_ICMPV6, skb2->csum)) - break; - /*FALLTHROUGH*/ - case CHECKSUM_NONE: - skb2->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, - &ip6h->daddr, - skb2->len, - IPPROTO_ICMPV6, 0)); - if (__skb_checksum_complete(skb2)) - goto out; + } else if (err < 0) { + return err; } - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; + mld = (struct mld_msg *)skb_transport_header(skb); - switch (icmp6_type) { + switch (mld->mld_type) { case ICMPV6_MGM_REPORT: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); break; - } case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb2, vid); + err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb2, vid); + err = br_ip6_multicast_query(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_REDUCTION: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); - } + break; } -out: - kfree_skb(skb2); + if (skb_trimmed && skb_trimmed != skb) + kfree_skb(skb_trimmed); + return err; } #endif @@ -1903,12 +1771,6 @@ void br_multicast_open(struct net_bridge *br) void br_multicast_stop(struct net_bridge *br) { - struct net_bridge_mdb_htable *mdb; - struct net_bridge_mdb_entry *mp; - struct hlist_node *n; - u32 ver; - 
int i; - del_timer_sync(&br->multicast_router_timer); del_timer_sync(&br->ip4_other_query.timer); del_timer_sync(&br->ip4_own_query.timer); @@ -1916,6 +1778,15 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->ip6_other_query.timer); del_timer_sync(&br->ip6_own_query.timer); #endif +} + +void br_multicast_dev_del(struct net_bridge *br) +{ + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + struct hlist_node *n; + u32 ver; + int i; spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); @@ -1949,11 +1820,9 @@ out: int br_multicast_set_router(struct net_bridge *br, unsigned long val) { - int err = -ENOENT; + int err = -EINVAL; spin_lock_bh(&br->multicast_lock); - if (!netif_running(br->dev)) - goto unlock; switch (val) { case 0: @@ -1964,13 +1833,8 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) br->multicast_router = val; err = 0; break; - - default: - err = -EINVAL; - break; } -unlock: spin_unlock_bh(&br->multicast_lock); return err; @@ -1979,11 +1843,9 @@ unlock: int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) { struct net_bridge *br = p->br; - int err = -ENOENT; + int err = -EINVAL; spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || p->state == BR_STATE_DISABLED) - goto unlock; switch (val) { case 0: @@ -1992,8 +1854,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) p->multicast_router = val; err = 0; - if (val < 2 && !hlist_unhashed(&p->rlist)) + if (val < 2 && !hlist_unhashed(&p->rlist)) { hlist_del_init_rcu(&p->rlist); + br_rtr_notify(br->dev, p, RTM_DELMDB); + } if (val == 1) break; @@ -2005,13 +1869,8 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) br_multicast_add_router(br, p); break; - - default: - err = -EINVAL; - break; } -unlock: spin_unlock(&br->multicast_lock); return err; @@ -2116,15 +1975,11 @@ unlock: int br_multicast_set_hash_max(struct net_bridge *br, 
unsigned long val) { - int err = -ENOENT; + int err = -EINVAL; u32 old; struct net_bridge_mdb_htable *mdb; spin_lock_bh(&br->multicast_lock); - if (!netif_running(br->dev)) - goto unlock; - - err = -EINVAL; if (!is_power_of_2(val)) goto unlock; diff --git a/kernel/net/bridge/br_netfilter.c b/kernel/net/bridge/br_netfilter.c deleted file mode 100644 index 60ddfbeb4..000000000 --- a/kernel/net/bridge/br_netfilter.c +++ /dev/null @@ -1,1140 +0,0 @@ -/* - * Handle firewalling - * Linux ethernet bridge - * - * Authors: - * Lennert Buytenhek - * Bart De Schuymer - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Lennert dedicates this file to Kerstin Wurdinger. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include "br_private.h" -#ifdef CONFIG_SYSCTL -#include -#endif - -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *brnf_sysctl_header; -static int brnf_call_iptables __read_mostly = 1; -static int brnf_call_ip6tables __read_mostly = 1; -static int brnf_call_arptables __read_mostly = 1; -static int brnf_filter_vlan_tagged __read_mostly = 0; -static int brnf_filter_pppoe_tagged __read_mostly = 0; -static int brnf_pass_vlan_indev __read_mostly = 0; -#else -#define brnf_call_iptables 1 -#define brnf_call_ip6tables 1 -#define brnf_call_arptables 1 -#define brnf_filter_vlan_tagged 0 -#define brnf_filter_pppoe_tagged 0 -#define brnf_pass_vlan_indev 0 -#endif - -#define IS_IP(skb) \ - (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) - -#define IS_IPV6(skb) \ - (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) - -#define IS_ARP(skb) \ - 
(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) - -static inline __be16 vlan_proto(const struct sk_buff *skb) -{ - if (skb_vlan_tag_present(skb)) - return skb->protocol; - else if (skb->protocol == htons(ETH_P_8021Q)) - return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; - else - return 0; -} - -#define IS_VLAN_IP(skb) \ - (vlan_proto(skb) == htons(ETH_P_IP) && \ - brnf_filter_vlan_tagged) - -#define IS_VLAN_IPV6(skb) \ - (vlan_proto(skb) == htons(ETH_P_IPV6) && \ - brnf_filter_vlan_tagged) - -#define IS_VLAN_ARP(skb) \ - (vlan_proto(skb) == htons(ETH_P_ARP) && \ - brnf_filter_vlan_tagged) - -static inline __be16 pppoe_proto(const struct sk_buff *skb) -{ - return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + - sizeof(struct pppoe_hdr))); -} - -#define IS_PPPOE_IP(skb) \ - (skb->protocol == htons(ETH_P_PPP_SES) && \ - pppoe_proto(skb) == htons(PPP_IP) && \ - brnf_filter_pppoe_tagged) - -#define IS_PPPOE_IPV6(skb) \ - (skb->protocol == htons(ETH_P_PPP_SES) && \ - pppoe_proto(skb) == htons(PPP_IPV6) && \ - brnf_filter_pppoe_tagged) - -/* largest possible L2 header, see br_nf_dev_queue_xmit() */ -#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) - -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) -struct brnf_frag_data { - char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; - u8 encap_size; - u8 size; -}; - -static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); -#endif - -static struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb) -{ - return skb->nf_bridge; -} - -static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) -{ - struct net_bridge_port *port; - - port = br_port_get_rcu(dev); - return port ? &port->br->fake_rtable : NULL; -} - -static inline struct net_device *bridge_parent(const struct net_device *dev) -{ - struct net_bridge_port *port; - - port = br_port_get_rcu(dev); - return port ? 
port->br->dev : NULL; -} - -static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) -{ - skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC); - if (likely(skb->nf_bridge)) - atomic_set(&(skb->nf_bridge->use), 1); - - return skb->nf_bridge; -} - -static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = skb->nf_bridge; - - if (atomic_read(&nf_bridge->use) > 1) { - struct nf_bridge_info *tmp = nf_bridge_alloc(skb); - - if (tmp) { - memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); - atomic_set(&tmp->use, 1); - } - nf_bridge_put(nf_bridge); - nf_bridge = tmp; - } - return nf_bridge; -} - -static unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) -{ - switch (skb->protocol) { - case __cpu_to_be16(ETH_P_8021Q): - return VLAN_HLEN; - case __cpu_to_be16(ETH_P_PPP_SES): - return PPPOE_SES_HLEN; - default: - return 0; - } -} - -static inline void nf_bridge_push_encap_header(struct sk_buff *skb) -{ - unsigned int len = nf_bridge_encap_header_len(skb); - - skb_push(skb, len); - skb->network_header -= len; -} - -static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) -{ - unsigned int len = nf_bridge_encap_header_len(skb); - - skb_pull(skb, len); - skb->network_header += len; -} - -static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) -{ - unsigned int len = nf_bridge_encap_header_len(skb); - - skb_pull_rcsum(skb, len); - skb->network_header += len; -} - -/* When handing a packet over to the IP layer - * check whether we have a skb that is in the - * expected format - */ - -static int br_parse_ip_options(struct sk_buff *skb) -{ - const struct iphdr *iph; - struct net_device *dev = skb->dev; - u32 len; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto inhdr_error; - - iph = ip_hdr(skb); - - /* Basic sanity checks */ - if (iph->ihl < 5 || iph->version != 4) - goto inhdr_error; - - if (!pskb_may_pull(skb, iph->ihl*4)) - goto 
inhdr_error; - - iph = ip_hdr(skb); - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - goto inhdr_error; - - len = ntohs(iph->tot_len); - if (skb->len < len) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); - goto drop; - } else if (len < (iph->ihl*4)) - goto inhdr_error; - - if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); - goto drop; - } - - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - /* We should really parse IP options here but until - * somebody who actually uses IP options complains to - * us we'll just silently ignore the options because - * we're lazy! - */ - return 0; - -inhdr_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); -drop: - return -1; -} - -static void nf_bridge_update_protocol(struct sk_buff *skb) -{ - switch (skb->nf_bridge->orig_proto) { - case BRNF_PROTO_8021Q: - skb->protocol = htons(ETH_P_8021Q); - break; - case BRNF_PROTO_PPPOE: - skb->protocol = htons(ETH_P_PPP_SES); - break; - case BRNF_PROTO_UNCHANGED: - break; - } -} - -/* PF_BRIDGE/PRE_ROUTING *********************************************/ -/* Undo the changes made for ip6tables PREROUTING and continue the - * bridge PRE_ROUTING hook. 
*/ -static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct rtable *rt; - - if (nf_bridge->pkt_otherhost) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->pkt_otherhost = false; - } - nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; - - rt = bridge_parent_rtable(nf_bridge->physindev); - if (!rt) { - kfree_skb(skb); - return 0; - } - skb_dst_set_noref(skb, &rt->dst); - - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); - - return 0; -} - -/* Obtain the correct destination MAC address, while preserving the original - * source MAC address. If we already know this address, we just copy it. If we - * don't, we use the neighbour framework to find out. In both cases, we make - * sure that br_handle_frame_finish() is called afterwards. - */ -static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) -{ - struct neighbour *neigh; - struct dst_entry *dst; - - skb->dev = bridge_parent(skb->dev); - if (!skb->dev) - goto free_skb; - dst = skb_dst(skb); - neigh = dst_neigh_lookup_skb(dst, skb); - if (neigh) { - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - int ret; - - if (neigh->hh.hh_len) { - neigh_hh_bridge(&neigh->hh, skb); - skb->dev = nf_bridge->physindev; - ret = br_handle_frame_finish(sk, skb); - } else { - /* the neighbour function below overwrites the complete - * MAC header, so we save the Ethernet source address and - * protocol number. 
- */ - skb_copy_from_linear_data_offset(skb, - -(ETH_HLEN-ETH_ALEN), - nf_bridge->neigh_header, - ETH_HLEN-ETH_ALEN); - /* tell br_dev_xmit to continue with forwarding */ - nf_bridge->mask |= BRNF_BRIDGED_DNAT; - /* FIXME Need to refragment */ - ret = neigh->output(neigh, skb); - } - neigh_release(neigh); - return ret; - } -free_skb: - kfree_skb(skb); - return 0; -} - -static bool daddr_was_changed(const struct sk_buff *skb, - const struct nf_bridge_info *nf_bridge) -{ - return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; -} - -/* This requires some explaining. If DNAT has taken place, - * we will need to fix up the destination Ethernet address. - * This is also true when SNAT takes place (for the reply direction). - * - * There are two cases to consider: - * 1. The packet was DNAT'ed to a device in the same bridge - * port group as it was received on. We can still bridge - * the packet. - * 2. The packet was DNAT'ed to a different device, either - * a non-bridged device or another bridge port group. - * The packet will need to be routed. - * - * The correct way of distinguishing between these two cases is to - * call ip_route_input() and to look at skb->dst->dev, which is - * changed to the destination device if ip_route_input() succeeds. - * - * Let's first consider the case that ip_route_input() succeeds: - * - * If the output device equals the logical bridge device the packet - * came in on, we can consider this bridging. The corresponding MAC - * address will be obtained in br_nf_pre_routing_finish_bridge. - * Otherwise, the packet is considered to be routed and we just - * change the destination MAC address so that the packet will - * later be passed up to the IP stack to be routed. For a redirected - * packet, ip_route_input() will give back the localhost as output device, - * which differs from the bridge device. 
- * - * Let's now consider the case that ip_route_input() fails: - * - * This can be because the destination address is martian, in which case - * the packet will be dropped. - * If IP forwarding is disabled, ip_route_input() will fail, while - * ip_route_output_key() can return success. The source - * address for ip_route_output_key() is set to zero, so ip_route_output_key() - * thinks we're handling a locally generated packet and won't care - * if IP forwarding is enabled. If the output device equals the logical bridge - * device, we proceed as if ip_route_input() succeeded. If it differs from the - * logical bridge port or if ip_route_output_key() fails we drop the packet. - */ -static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct iphdr *iph = ip_hdr(skb); - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct rtable *rt; - int err; - int frag_max_size; - - frag_max_size = IPCB(skb)->frag_max_size; - BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size; - - if (nf_bridge->pkt_otherhost) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->pkt_otherhost = false; - } - nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; - if (daddr_was_changed(skb, nf_bridge)) { - if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { - struct in_device *in_dev = __in_dev_get_rcu(dev); - - /* If err equals -EHOSTUNREACH the error is due to a - * martian destination or due to the fact that - * forwarding is disabled. For most martian packets, - * ip_route_output_key() will fail. It won't fail for 2 types of - * martian destinations: loopback destinations and destination - * 0.0.0.0. In both cases the packet will be dropped because the - * destination is the loopback device and not the bridge. 
*/ - if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) - goto free_skb; - - rt = ip_route_output(dev_net(dev), iph->daddr, 0, - RT_TOS(iph->tos), 0); - if (!IS_ERR(rt)) { - /* - Bridged-and-DNAT'ed traffic doesn't - * require ip_forwarding. */ - if (rt->dst.dev == dev) { - skb_dst_set(skb, &rt->dst); - goto bridged_dnat; - } - ip_rt_put(rt); - } -free_skb: - kfree_skb(skb); - return 0; - } else { - if (skb_dst(skb)->dev == dev) { -bridged_dnat: - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, - NF_BR_PRE_ROUTING, - sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); - return 0; - } - ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); - skb->pkt_type = PACKET_HOST; - } - } else { - rt = bridge_parent_rtable(nf_bridge->physindev); - if (!rt) { - kfree_skb(skb); - return 0; - } - skb_dst_set_noref(skb, &rt->dst); - } - - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); - - return 0; -} - -static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev) -{ - struct net_device *vlan, *br; - - br = bridge_parent(dev); - if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) - return br; - - vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, - skb_vlan_tag_get(skb) & VLAN_VID_MASK); - - return vlan ? 
vlan : br; -} - -/* Some common code for IPv4/IPv6 */ -static struct net_device *setup_pre_routing(struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - - if (skb->pkt_type == PACKET_OTHERHOST) { - skb->pkt_type = PACKET_HOST; - nf_bridge->pkt_otherhost = true; - } - - nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; - nf_bridge->physindev = skb->dev; - skb->dev = brnf_get_logical_dev(skb, skb->dev); - - if (skb->protocol == htons(ETH_P_8021Q)) - nf_bridge->orig_proto = BRNF_PROTO_8021Q; - else if (skb->protocol == htons(ETH_P_PPP_SES)) - nf_bridge->orig_proto = BRNF_PROTO_PPPOE; - - /* Must drop socket now because of tproxy. */ - skb_orphan(skb); - return skb->dev; -} - -/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */ -static int check_hbh_len(struct sk_buff *skb) -{ - unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); - u32 pkt_len; - const unsigned char *nh = skb_network_header(skb); - int off = raw - nh; - int len = (raw[1] + 1) << 3; - - if ((raw + len) - skb->data > skb_headlen(skb)) - goto bad; - - off += 2; - len -= 2; - - while (len > 0) { - int optlen = nh[off + 1] + 2; - - switch (nh[off]) { - case IPV6_TLV_PAD1: - optlen = 1; - break; - - case IPV6_TLV_PADN: - break; - - case IPV6_TLV_JUMBO: - if (nh[off + 1] != 4 || (off & 3) != 2) - goto bad; - pkt_len = ntohl(*(__be32 *) (nh + off + 2)); - if (pkt_len <= IPV6_MAXPLEN || - ipv6_hdr(skb)->payload_len) - goto bad; - if (pkt_len > skb->len - sizeof(struct ipv6hdr)) - goto bad; - if (pskb_trim_rcsum(skb, - pkt_len + sizeof(struct ipv6hdr))) - goto bad; - nh = skb_network_header(skb); - break; - default: - if (optlen > len) - goto bad; - break; - } - off += optlen; - len -= optlen; - } - if (len == 0) - return 0; -bad: - return -1; - -} - -/* Replicate the checks that IPv6 does on packet reception and pass the packet - * to ip6tables, which doesn't support NAT, so things are fairly simple. 
*/ -static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - const struct ipv6hdr *hdr; - u32 pkt_len; - - if (skb->len < sizeof(struct ipv6hdr)) - return NF_DROP; - - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return NF_DROP; - - hdr = ipv6_hdr(skb); - - if (hdr->version != 6) - return NF_DROP; - - pkt_len = ntohs(hdr->payload_len); - - if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - return NF_DROP; - if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) - return NF_DROP; - } - if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) - return NF_DROP; - - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) - return NF_DROP; - if (!setup_pre_routing(skb)) - return NF_DROP; - - skb->protocol = htons(ETH_P_IPV6); - NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb, - skb->dev, NULL, - br_nf_pre_routing_finish_ipv6); - - return NF_STOLEN; -} - -/* Direct IPv6 traffic to br_nf_pre_routing_ipv6. - * Replicate the checks that IPv4 does on packet reception. - * Set skb->dev to the bridge device (i.e. parent of the - * receiving device) to make netfilter happy, the REDIRECT - * target in particular. Save the original destination IP - * address to be able to detect DNAT afterwards. 
*/ -static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_bridge_info *nf_bridge; - struct net_bridge_port *p; - struct net_bridge *br; - __u32 len = nf_bridge_encap_header_len(skb); - - if (unlikely(!pskb_may_pull(skb, len))) - return NF_DROP; - - p = br_port_get_rcu(state->in); - if (p == NULL) - return NF_DROP; - br = p->br; - - if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { - if (!brnf_call_ip6tables && !br->nf_call_ip6tables) - return NF_ACCEPT; - - nf_bridge_pull_encap_header_rcsum(skb); - return br_nf_pre_routing_ipv6(ops, skb, state); - } - - if (!brnf_call_iptables && !br->nf_call_iptables) - return NF_ACCEPT; - - if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) - return NF_ACCEPT; - - nf_bridge_pull_encap_header_rcsum(skb); - - if (br_parse_ip_options(skb)) - return NF_DROP; - - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) - return NF_DROP; - if (!setup_pre_routing(skb)) - return NF_DROP; - - nf_bridge = nf_bridge_info_get(skb); - nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; - - skb->protocol = htons(ETH_P_IP); - - NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb, - skb->dev, NULL, - br_nf_pre_routing_finish); - - return NF_STOLEN; -} - - -/* PF_BRIDGE/LOCAL_IN ************************************************/ -/* The packet is locally destined, which requires a real - * dst_entry, so detach the fake one. On the way up, the - * packet would pass through PRE_ROUTING again (which already - * took place when the packet entered the bridge), but we - * register an IPv4 PRE_ROUTING 'sabotage' hook that will - * prevent this from happening. 
*/ -static unsigned int br_nf_local_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - br_drop_fake_rtable(skb); - return NF_ACCEPT; -} - -/* PF_BRIDGE/FORWARD *************************************************/ -static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct net_device *in; - - if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { - int frag_max_size; - - if (skb->protocol == htons(ETH_P_IP)) { - frag_max_size = IPCB(skb)->frag_max_size; - BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size; - } - - in = nf_bridge->physindev; - if (nf_bridge->pkt_otherhost) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->pkt_otherhost = false; - } - nf_bridge_update_protocol(skb); - } else { - in = *((struct net_device **)(skb->cb)); - } - nf_bridge_push_encap_header(skb); - - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb, - in, skb->dev, br_forward_finish, 1); - return 0; -} - - -/* This is the 'purely bridged' case. For IP, we pass the packet to - * netfilter with indev and outdev set to the bridge device, - * but we are still able to filter on the 'real' indev/outdev - * because of the physdev module. For ARP, indev and outdev are the - * bridge ports. */ -static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_bridge_info *nf_bridge; - struct net_device *parent; - u_int8_t pf; - - if (!skb->nf_bridge) - return NF_ACCEPT; - - /* Need exclusive nf_bridge_info since we might have multiple - * different physoutdevs. 
*/ - if (!nf_bridge_unshare(skb)) - return NF_DROP; - - nf_bridge = nf_bridge_info_get(skb); - if (!nf_bridge) - return NF_DROP; - - parent = bridge_parent(state->out); - if (!parent) - return NF_DROP; - - if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) - pf = NFPROTO_IPV4; - else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) - pf = NFPROTO_IPV6; - else - return NF_ACCEPT; - - nf_bridge_pull_encap_header(skb); - - if (skb->pkt_type == PACKET_OTHERHOST) { - skb->pkt_type = PACKET_HOST; - nf_bridge->pkt_otherhost = true; - } - - if (pf == NFPROTO_IPV4) { - int frag_max = BR_INPUT_SKB_CB(skb)->frag_max_size; - - if (br_parse_ip_options(skb)) - return NF_DROP; - - IPCB(skb)->frag_max_size = frag_max; - } - - nf_bridge->physoutdev = skb->dev; - if (pf == NFPROTO_IPV4) - skb->protocol = htons(ETH_P_IP); - else - skb->protocol = htons(ETH_P_IPV6); - - NF_HOOK(pf, NF_INET_FORWARD, NULL, skb, - brnf_get_logical_dev(skb, state->in), - parent, br_nf_forward_finish); - - return NF_STOLEN; -} - -static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct net_bridge_port *p; - struct net_bridge *br; - struct net_device **d = (struct net_device **)(skb->cb); - - p = br_port_get_rcu(state->out); - if (p == NULL) - return NF_ACCEPT; - br = p->br; - - if (!brnf_call_arptables && !br->nf_call_arptables) - return NF_ACCEPT; - - if (!IS_ARP(skb)) { - if (!IS_VLAN_ARP(skb)) - return NF_ACCEPT; - nf_bridge_pull_encap_header(skb); - } - - if (arp_hdr(skb)->ar_pln != 4) { - if (IS_VLAN_ARP(skb)) - nf_bridge_push_encap_header(skb); - return NF_ACCEPT; - } - *d = state->in; - NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb, - state->in, state->out, br_nf_forward_finish); - - return NF_STOLEN; -} - -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) -static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) -{ - struct brnf_frag_data *data; - int err; - - data = 
this_cpu_ptr(&brnf_frag_data_storage); - err = skb_cow_head(skb, data->size); - - if (err) { - kfree_skb(skb); - return 0; - } - - skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); - __skb_push(skb, data->encap_size); - - return br_dev_queue_push_xmit(sk, skb); -} - -static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) -{ - int ret; - int frag_max_size; - unsigned int mtu_reserved; - - if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP)) - return br_dev_queue_push_xmit(sk, skb); - - mtu_reserved = nf_bridge_mtu_reduction(skb); - /* This is wrong! We should preserve the original fragment - * boundaries by preserving frag_list rather than refragmenting. - */ - if (skb->len + mtu_reserved > skb->dev->mtu) { - struct brnf_frag_data *data; - - frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; - if (br_parse_ip_options(skb)) - /* Drop invalid packet */ - return NF_DROP; - IPCB(skb)->frag_max_size = frag_max_size; - - nf_bridge_update_protocol(skb); - - data = this_cpu_ptr(&brnf_frag_data_storage); - data->encap_size = nf_bridge_encap_header_len(skb); - data->size = ETH_HLEN + data->encap_size; - - skb_copy_from_linear_data_offset(skb, -data->size, data->mac, - data->size); - - ret = ip_fragment(sk, skb, br_nf_push_frag_xmit); - } else { - ret = br_dev_queue_push_xmit(sk, skb); - } - - return ret; -} -#else -static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) -{ - return br_dev_queue_push_xmit(sk, skb); -} -#endif - -/* PF_BRIDGE/POST_ROUTING ********************************************/ -static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - struct net_device *realoutdev = bridge_parent(skb->dev); - u_int8_t pf; - - /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in - * on a bridge, but was delivered locally and is now being routed: - * - * 
POST_ROUTING was already invoked from the ip stack. - */ - if (!nf_bridge || !nf_bridge->physoutdev) - return NF_ACCEPT; - - if (!realoutdev) - return NF_DROP; - - if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) - pf = NFPROTO_IPV4; - else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) - pf = NFPROTO_IPV6; - else - return NF_ACCEPT; - - /* We assume any code from br_dev_queue_push_xmit onwards doesn't care - * about the value of skb->pkt_type. */ - if (skb->pkt_type == PACKET_OTHERHOST) { - skb->pkt_type = PACKET_HOST; - nf_bridge->pkt_otherhost = true; - } - - nf_bridge_pull_encap_header(skb); - if (pf == NFPROTO_IPV4) - skb->protocol = htons(ETH_P_IP); - else - skb->protocol = htons(ETH_P_IPV6); - - NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb, - NULL, realoutdev, - br_nf_dev_queue_xmit); - - return NF_STOLEN; -} - -/* IP/SABOTAGE *****************************************************/ -/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING - * for the second time. */ -static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (skb->nf_bridge && - !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) { - return NF_STOP; - } - - return NF_ACCEPT; -} - -/* This is called when br_netfilter has called into iptables/netfilter, - * and DNAT has taken place on a bridge-forwarded packet. - * - * neigh->output has created a new MAC header, with local br0 MAC - * as saddr. - * - * This restores the original MAC saddr of the bridged packet - * before invoking bridge forward logic to transmit the packet. 
- */ -static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) -{ - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - - skb_pull(skb, ETH_HLEN); - nf_bridge->mask &= ~BRNF_BRIDGED_DNAT; - - BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); - - skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), - nf_bridge->neigh_header, - ETH_HLEN - ETH_ALEN); - skb->dev = nf_bridge->physindev; - br_handle_frame_finish(NULL, skb); -} - -static int br_nf_dev_xmit(struct sk_buff *skb) -{ - if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { - br_nf_pre_routing_finish_bridge_slow(skb); - return 1; - } - return 0; -} - -static const struct nf_br_ops br_ops = { - .br_dev_xmit_hook = br_nf_dev_xmit, -}; - -void br_netfilter_enable(void) -{ -} -EXPORT_SYMBOL_GPL(br_netfilter_enable); - -/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because - * br_dev_queue_push_xmit is called afterwards */ -static struct nf_hook_ops br_nf_ops[] __read_mostly = { - { - .hook = br_nf_pre_routing, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_PRE_ROUTING, - .priority = NF_BR_PRI_BRNF, - }, - { - .hook = br_nf_local_in, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_LOCAL_IN, - .priority = NF_BR_PRI_BRNF, - }, - { - .hook = br_nf_forward_ip, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_FORWARD, - .priority = NF_BR_PRI_BRNF - 1, - }, - { - .hook = br_nf_forward_arp, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_FORWARD, - .priority = NF_BR_PRI_BRNF, - }, - { - .hook = br_nf_post_routing, - .owner = THIS_MODULE, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_POST_ROUTING, - .priority = NF_BR_PRI_LAST, - }, - { - .hook = ip_sabotage_in, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_FIRST, - }, - { - .hook = ip_sabotage_in, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = 
NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_FIRST, - }, -}; - -#ifdef CONFIG_SYSCTL -static -int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); - - if (write && *(int *)(ctl->data)) - *(int *)(ctl->data) = 1; - return ret; -} - -static struct ctl_table brnf_table[] = { - { - .procname = "bridge-nf-call-arptables", - .data = &brnf_call_arptables, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-call-iptables", - .data = &brnf_call_iptables, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-call-ip6tables", - .data = &brnf_call_ip6tables, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-filter-vlan-tagged", - .data = &brnf_filter_vlan_tagged, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-filter-pppoe-tagged", - .data = &brnf_filter_pppoe_tagged, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { - .procname = "bridge-nf-pass-vlan-input-dev", - .data = &brnf_pass_vlan_indev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = brnf_sysctl_call_tables, - }, - { } -}; -#endif - -static int __init br_netfilter_init(void) -{ - int ret; - - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - if (ret < 0) - return ret; - -#ifdef CONFIG_SYSCTL - brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); - if (brnf_sysctl_header == NULL) { - printk(KERN_WARNING - "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - return -ENOMEM; - } -#endif - RCU_INIT_POINTER(nf_br_ops, &br_ops); - printk(KERN_NOTICE "Bridge firewalling registered\n"); - 
return 0; -} - -static void __exit br_netfilter_fini(void) -{ - RCU_INIT_POINTER(nf_br_ops, NULL); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); -#ifdef CONFIG_SYSCTL - unregister_net_sysctl_table(brnf_sysctl_header); -#endif -} - -module_init(br_netfilter_init); -module_exit(br_netfilter_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Lennert Buytenhek "); -MODULE_AUTHOR("Bart De Schuymer "); -MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); diff --git a/kernel/net/bridge/br_netfilter_hooks.c b/kernel/net/bridge/br_netfilter_hooks.c new file mode 100644 index 000000000..7ddbe7ec8 --- /dev/null +++ b/kernel/net/bridge/br_netfilter_hooks.c @@ -0,0 +1,1039 @@ +/* + * Handle firewalling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *brnf_sysctl_header; +static int brnf_call_iptables __read_mostly = 1; +static int brnf_call_ip6tables __read_mostly = 1; +static int brnf_call_arptables __read_mostly = 1; +static int brnf_filter_vlan_tagged __read_mostly; +static int brnf_filter_pppoe_tagged __read_mostly; +static int brnf_pass_vlan_indev __read_mostly; +#else +#define brnf_call_iptables 1 +#define brnf_call_ip6tables 1 +#define brnf_call_arptables 1 +#define brnf_filter_vlan_tagged 0 +#define brnf_filter_pppoe_tagged 0 +#define brnf_pass_vlan_indev 0 +#endif + +#define IS_IP(skb) \ + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) + +#define IS_IPV6(skb) \ + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) + +#define IS_ARP(skb) \ + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) + +static inline __be16 vlan_proto(const struct sk_buff *skb) +{ + if (skb_vlan_tag_present(skb)) + return skb->protocol; + else if (skb->protocol == htons(ETH_P_8021Q)) + return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; + else + return 0; +} + +#define IS_VLAN_IP(skb) \ + (vlan_proto(skb) == htons(ETH_P_IP) && \ + brnf_filter_vlan_tagged) + +#define IS_VLAN_IPV6(skb) \ + (vlan_proto(skb) == htons(ETH_P_IPV6) && \ + brnf_filter_vlan_tagged) + +#define IS_VLAN_ARP(skb) \ + (vlan_proto(skb) == htons(ETH_P_ARP) && \ + brnf_filter_vlan_tagged) + +static inline __be16 pppoe_proto(const struct sk_buff *skb) +{ + return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + + sizeof(struct pppoe_hdr))); +} + +#define IS_PPPOE_IP(skb) \ + (skb->protocol == htons(ETH_P_PPP_SES) && \ + pppoe_proto(skb) == htons(PPP_IP) && \ + 
brnf_filter_pppoe_tagged) + +#define IS_PPPOE_IPV6(skb) \ + (skb->protocol == htons(ETH_P_PPP_SES) && \ + pppoe_proto(skb) == htons(PPP_IPV6) && \ + brnf_filter_pppoe_tagged) + +/* largest possible L2 header, see br_nf_dev_queue_xmit() */ +#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) + +struct brnf_frag_data { + char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; + u8 encap_size; + u8 size; + u16 vlan_tci; + __be16 vlan_proto; +}; + +static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); + +static void nf_bridge_info_free(struct sk_buff *skb) +{ + if (skb->nf_bridge) { + nf_bridge_put(skb->nf_bridge); + skb->nf_bridge = NULL; + } +} + +static inline struct net_device *bridge_parent(const struct net_device *dev) +{ + struct net_bridge_port *port; + + port = br_port_get_rcu(dev); + return port ? port->br->dev : NULL; +} + +static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + + if (atomic_read(&nf_bridge->use) > 1) { + struct nf_bridge_info *tmp = nf_bridge_alloc(skb); + + if (tmp) { + memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); + atomic_set(&tmp->use, 1); + } + nf_bridge_put(nf_bridge); + nf_bridge = tmp; + } + return nf_bridge; +} + +unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) +{ + switch (skb->protocol) { + case __cpu_to_be16(ETH_P_8021Q): + return VLAN_HLEN; + case __cpu_to_be16(ETH_P_PPP_SES): + return PPPOE_SES_HLEN; + default: + return 0; + } +} + +static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_pull(skb, len); + skb->network_header += len; +} + +static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_pull_rcsum(skb, len); + skb->network_header += len; +} + +/* When handing a packet over to the IP layer + * check whether we have a skb that is in the + * 
expected format + */ + +static int br_validate_ipv4(struct net *net, struct sk_buff *skb) +{ + const struct iphdr *iph; + u32 len; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + + iph = ip_hdr(skb); + + /* Basic sanity checks */ + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto inhdr_error; + + iph = ip_hdr(skb); + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + goto inhdr_error; + + len = ntohs(iph->tot_len); + if (skb->len < len) { + IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) + goto inhdr_error; + + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); + goto drop; + } + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + /* We should really parse IP options here but until + * somebody who actually uses IP options complains to + * us we'll just silently ignore the options because + * we're lazy! + */ + return 0; + +inhdr_error: + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); +drop: + return -1; +} + +void nf_bridge_update_protocol(struct sk_buff *skb) +{ + switch (skb->nf_bridge->orig_proto) { + case BRNF_PROTO_8021Q: + skb->protocol = htons(ETH_P_8021Q); + break; + case BRNF_PROTO_PPPOE: + skb->protocol = htons(ETH_P_PPP_SES); + break; + case BRNF_PROTO_UNCHANGED: + break; + } +} + +/* Obtain the correct destination MAC address, while preserving the original + * source MAC address. If we already know this address, we just copy it. If we + * don't, we use the neighbour framework to find out. In both cases, we make + * sure that br_handle_frame_finish() is called afterwards. 
+ */ +int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct neighbour *neigh; + struct dst_entry *dst; + + skb->dev = bridge_parent(skb->dev); + if (!skb->dev) + goto free_skb; + dst = skb_dst(skb); + neigh = dst_neigh_lookup_skb(dst, skb); + if (neigh) { + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + int ret; + + if (neigh->hh.hh_len) { + neigh_hh_bridge(&neigh->hh, skb); + skb->dev = nf_bridge->physindev; + ret = br_handle_frame_finish(net, sk, skb); + } else { + /* the neighbour function below overwrites the complete + * MAC header, so we save the Ethernet source address and + * protocol number. + */ + skb_copy_from_linear_data_offset(skb, + -(ETH_HLEN-ETH_ALEN), + nf_bridge->neigh_header, + ETH_HLEN-ETH_ALEN); + /* tell br_dev_xmit to continue with forwarding */ + nf_bridge->bridged_dnat = 1; + /* FIXME Need to refragment */ + ret = neigh->output(neigh, skb); + } + neigh_release(neigh); + return ret; + } +free_skb: + kfree_skb(skb); + return 0; +} + +static inline bool +br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) +{ + return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; +} + +/* This requires some explaining. If DNAT has taken place, + * we will need to fix up the destination Ethernet address. + * This is also true when SNAT takes place (for the reply direction). + * + * There are two cases to consider: + * 1. The packet was DNAT'ed to a device in the same bridge + * port group as it was received on. We can still bridge + * the packet. + * 2. The packet was DNAT'ed to a different device, either + * a non-bridged device or another bridge port group. + * The packet will need to be routed. + * + * The correct way of distinguishing between these two cases is to + * call ip_route_input() and to look at skb->dst->dev, which is + * changed to the destination device if ip_route_input() succeeds. 
+ * + * Let's first consider the case that ip_route_input() succeeds: + * + * If the output device equals the logical bridge device the packet + * came in on, we can consider this bridging. The corresponding MAC + * address will be obtained in br_nf_pre_routing_finish_bridge. + * Otherwise, the packet is considered to be routed and we just + * change the destination MAC address so that the packet will + * later be passed up to the IP stack to be routed. For a redirected + * packet, ip_route_input() will give back the localhost as output device, + * which differs from the bridge device. + * + * Let's now consider the case that ip_route_input() fails: + * + * This can be because the destination address is martian, in which case + * the packet will be dropped. + * If IP forwarding is disabled, ip_route_input() will fail, while + * ip_route_output_key() can return success. The source + * address for ip_route_output_key() is set to zero, so ip_route_output_key() + * thinks we're handling a locally generated packet and won't care + * if IP forwarding is enabled. If the output device equals the logical bridge + * device, we proceed as if ip_route_input() succeeded. If it differs from the + * logical bridge port or if ip_route_output_key() fails we drop the packet. 
+ */ +static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct iphdr *iph = ip_hdr(skb); + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct rtable *rt; + int err; + + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; + + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + nf_bridge->in_prerouting = 0; + if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { + if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + /* If err equals -EHOSTUNREACH the error is due to a + * martian destination or due to the fact that + * forwarding is disabled. For most martian packets, + * ip_route_output_key() will fail. It won't fail for 2 types of + * martian destinations: loopback destinations and destination + * 0.0.0.0. In both cases the packet will be dropped because the + * destination is the loopback device and not the bridge. */ + if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) + goto free_skb; + + rt = ip_route_output(net, iph->daddr, 0, + RT_TOS(iph->tos), 0); + if (!IS_ERR(rt)) { + /* - Bridged-and-DNAT'ed traffic doesn't + * require ip_forwarding. 
*/ + if (rt->dst.dev == dev) { + skb_dst_set(skb, &rt->dst); + goto bridged_dnat; + } + ip_rt_put(rt); + } +free_skb: + kfree_skb(skb); + return 0; + } else { + if (skb_dst(skb)->dev == dev) { +bridged_dnat: + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, + NF_BR_PRE_ROUTING, + net, sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge, + 1); + return 0; + } + ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); + skb->pkt_type = PACKET_HOST; + } + } else { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); + } + + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, + skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + +static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev) +{ + struct net_device *vlan, *br; + + br = bridge_parent(dev); + if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) + return br; + + vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, + skb_vlan_tag_get(skb) & VLAN_VID_MASK); + + return vlan ? vlan : br; +} + +/* Some common code for IPv4/IPv6 */ +struct net_device *setup_pre_routing(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->pkt_otherhost = true; + } + + nf_bridge->in_prerouting = 1; + nf_bridge->physindev = skb->dev; + skb->dev = brnf_get_logical_dev(skb, skb->dev); + + if (skb->protocol == htons(ETH_P_8021Q)) + nf_bridge->orig_proto = BRNF_PROTO_8021Q; + else if (skb->protocol == htons(ETH_P_PPP_SES)) + nf_bridge->orig_proto = BRNF_PROTO_PPPOE; + + /* Must drop socket now because of tproxy. 
*/ + skb_orphan(skb); + return skb->dev; +} + +/* Direct IPv6 traffic to br_nf_pre_routing_ipv6. + * Replicate the checks that IPv4 does on packet reception. + * Set skb->dev to the bridge device (i.e. parent of the + * receiving device) to make netfilter happy, the REDIRECT + * target in particular. Save the original destination IP + * address to be able to detect DNAT afterwards. */ +static unsigned int br_nf_pre_routing(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge; + struct net_bridge_port *p; + struct net_bridge *br; + __u32 len = nf_bridge_encap_header_len(skb); + + if (unlikely(!pskb_may_pull(skb, len))) + return NF_DROP; + + p = br_port_get_rcu(state->in); + if (p == NULL) + return NF_DROP; + br = p->br; + + if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { + if (!brnf_call_ip6tables && !br->nf_call_ip6tables) + return NF_ACCEPT; + + nf_bridge_pull_encap_header_rcsum(skb); + return br_nf_pre_routing_ipv6(priv, skb, state); + } + + if (!brnf_call_iptables && !br->nf_call_iptables) + return NF_ACCEPT; + + if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) + return NF_ACCEPT; + + nf_bridge_pull_encap_header_rcsum(skb); + + if (br_validate_ipv4(state->net, skb)) + return NF_DROP; + + nf_bridge_put(skb->nf_bridge); + if (!nf_bridge_alloc(skb)) + return NF_DROP; + if (!setup_pre_routing(skb)) + return NF_DROP; + + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; + + skb->protocol = htons(ETH_P_IP); + + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb, + skb->dev, NULL, + br_nf_pre_routing_finish); + + return NF_STOLEN; +} + + +/* PF_BRIDGE/LOCAL_IN ************************************************/ +/* The packet is locally destined, which requires a real + * dst_entry, so detach the fake one. 
On the way up, the + * packet would pass through PRE_ROUTING again (which already + * took place when the packet entered the bridge), but we + * register an IPv4 PRE_ROUTING 'sabotage' hook that will + * prevent this from happening. */ +static unsigned int br_nf_local_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + br_drop_fake_rtable(skb); + return NF_ACCEPT; +} + +/* PF_BRIDGE/FORWARD *************************************************/ +static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct net_device *in; + + if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { + + if (skb->protocol == htons(ETH_P_IP)) + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; + + if (skb->protocol == htons(ETH_P_IPV6)) + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; + + in = nf_bridge->physindev; + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + nf_bridge_update_protocol(skb); + } else { + in = *((struct net_device **)(skb->cb)); + } + nf_bridge_push_encap_header(skb); + + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, net, sk, skb, + in, skb->dev, br_forward_finish, 1); + return 0; +} + + +/* This is the 'purely bridged' case. For IP, we pass the packet to + * netfilter with indev and outdev set to the bridge device, + * but we are still able to filter on the 'real' indev/outdev + * because of the physdev module. For ARP, indev and outdev are the + * bridge ports. */ +static unsigned int br_nf_forward_ip(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge; + struct net_device *parent; + u_int8_t pf; + + if (!skb->nf_bridge) + return NF_ACCEPT; + + /* Need exclusive nf_bridge_info since we might have multiple + * different physoutdevs. 
*/ + if (!nf_bridge_unshare(skb)) + return NF_DROP; + + nf_bridge = nf_bridge_info_get(skb); + if (!nf_bridge) + return NF_DROP; + + parent = bridge_parent(state->out); + if (!parent) + return NF_DROP; + + if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) + pf = NFPROTO_IPV4; + else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) + pf = NFPROTO_IPV6; + else + return NF_ACCEPT; + + nf_bridge_pull_encap_header(skb); + + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->pkt_otherhost = true; + } + + if (pf == NFPROTO_IPV4) { + if (br_validate_ipv4(state->net, skb)) + return NF_DROP; + IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; + } + + if (pf == NFPROTO_IPV6) { + if (br_validate_ipv6(state->net, skb)) + return NF_DROP; + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + } + + nf_bridge->physoutdev = skb->dev; + if (pf == NFPROTO_IPV4) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb, + brnf_get_logical_dev(skb, state->in), + parent, br_nf_forward_finish); + + return NF_STOLEN; +} + +static unsigned int br_nf_forward_arp(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct net_bridge_port *p; + struct net_bridge *br; + struct net_device **d = (struct net_device **)(skb->cb); + + p = br_port_get_rcu(state->out); + if (p == NULL) + return NF_ACCEPT; + br = p->br; + + if (!brnf_call_arptables && !br->nf_call_arptables) + return NF_ACCEPT; + + if (!IS_ARP(skb)) { + if (!IS_VLAN_ARP(skb)) + return NF_ACCEPT; + nf_bridge_pull_encap_header(skb); + } + + if (arp_hdr(skb)->ar_pln != 4) { + if (IS_VLAN_ARP(skb)) + nf_bridge_push_encap_header(skb); + return NF_ACCEPT; + } + *d = state->in; + NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb, + state->in, state->out, br_nf_forward_finish); + + return NF_STOLEN; +} + +static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, 
struct sk_buff *skb) +{ + struct brnf_frag_data *data; + int err; + + data = this_cpu_ptr(&brnf_frag_data_storage); + err = skb_cow_head(skb, data->size); + + if (err) { + kfree_skb(skb); + return 0; + } + + if (data->vlan_tci) { + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = data->vlan_proto; + } + + skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); + __skb_push(skb, data->encap_size); + + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(net, sk, skb); +} + +static int +br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) +{ + unsigned int mtu = ip_skb_dst_mtu(skb); + struct iphdr *iph = ip_hdr(skb); + + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + return ip_do_fragment(net, sk, skb, output); +} + +static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) +{ + if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) + return PPPOE_SES_HLEN; + return 0; +} + +static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge; + unsigned int mtu_reserved; + + mtu_reserved = nf_bridge_mtu_reduction(skb); + + if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(net, sk, skb); + } + + nf_bridge = nf_bridge_info_get(skb); + + /* This is wrong! We should preserve the original fragment + * boundaries by preserving frag_list rather than refragmenting. 
+ */ + if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) && + skb->protocol == htons(ETH_P_IP)) { + struct brnf_frag_data *data; + + if (br_validate_ipv4(net, skb)) + goto drop; + + IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; + + nf_bridge_update_protocol(skb); + + data = this_cpu_ptr(&brnf_frag_data_storage); + + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + data->encap_size = nf_bridge_encap_header_len(skb); + data->size = ETH_HLEN + data->encap_size; + + skb_copy_from_linear_data_offset(skb, -data->size, data->mac, + data->size); + + return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); + } + if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && + skb->protocol == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + struct brnf_frag_data *data; + + if (br_validate_ipv6(net, skb)) + goto drop; + + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + + nf_bridge_update_protocol(skb); + + data = this_cpu_ptr(&brnf_frag_data_storage); + data->encap_size = nf_bridge_encap_header_len(skb); + data->size = ETH_HLEN + data->encap_size; + + skb_copy_from_linear_data_offset(skb, -data->size, data->mac, + data->size); + + if (v6ops) + return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); + + kfree_skb(skb); + return -EMSGSIZE; + } + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(net, sk, skb); + drop: + kfree_skb(skb); + return 0; +} + +/* PF_BRIDGE/POST_ROUTING ********************************************/ +static unsigned int br_nf_post_routing(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct net_device *realoutdev = bridge_parent(skb->dev); + u_int8_t pf; + + /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in + * on a bridge, but was delivered locally and is now being routed: + * + * POST_ROUTING was already invoked from the ip stack. 
+ */ + if (!nf_bridge || !nf_bridge->physoutdev) + return NF_ACCEPT; + + if (!realoutdev) + return NF_DROP; + + if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) + pf = NFPROTO_IPV4; + else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) + pf = NFPROTO_IPV6; + else + return NF_ACCEPT; + + /* We assume any code from br_dev_queue_push_xmit onwards doesn't care + * about the value of skb->pkt_type. */ + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->pkt_otherhost = true; + } + + nf_bridge_pull_encap_header(skb); + if (pf == NFPROTO_IPV4) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb, + NULL, realoutdev, + br_nf_dev_queue_xmit); + + return NF_STOLEN; +} + +/* IP/SABOTAGE *****************************************************/ +/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING + * for the second time. */ +static unsigned int ip_sabotage_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) + return NF_STOP; + + return NF_ACCEPT; +} + +/* This is called when br_netfilter has called into iptables/netfilter, + * and DNAT has taken place on a bridge-forwarded packet. + * + * neigh->output has created a new MAC header, with local br0 MAC + * as saddr. + * + * This restores the original MAC saddr of the bridged packet + * before invoking bridge forward logic to transmit the packet. 
+ */ +static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + skb_pull(skb, ETH_HLEN); + nf_bridge->bridged_dnat = 0; + + BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); + + skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), + nf_bridge->neigh_header, + ETH_HLEN - ETH_ALEN); + skb->dev = nf_bridge->physindev; + + nf_bridge->physoutdev = NULL; + br_handle_frame_finish(dev_net(skb->dev), NULL, skb); +} + +static int br_nf_dev_xmit(struct sk_buff *skb) +{ + if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) { + br_nf_pre_routing_finish_bridge_slow(skb); + return 1; + } + return 0; +} + +static const struct nf_br_ops br_ops = { + .br_dev_xmit_hook = br_nf_dev_xmit, +}; + +void br_netfilter_enable(void) +{ +} +EXPORT_SYMBOL_GPL(br_netfilter_enable); + +/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because + * br_dev_queue_push_xmit is called afterwards */ +static struct nf_hook_ops br_nf_ops[] __read_mostly = { + { + .hook = br_nf_pre_routing, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_BR_PRI_BRNF, + }, + { + .hook = br_nf_local_in, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_BR_PRI_BRNF, + }, + { + .hook = br_nf_forward_ip, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_BRNF - 1, + }, + { + .hook = br_nf_forward_arp, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_BRNF, + }, + { + .hook = br_nf_post_routing, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_POST_ROUTING, + .priority = NF_BR_PRI_LAST, + }, + { + .hook = ip_sabotage_in, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_FIRST, + }, + { + .hook = ip_sabotage_in, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_FIRST, + }, +}; + +#ifdef CONFIG_SYSCTL +static +int brnf_sysctl_call_tables(struct ctl_table *ctl, int 
write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write && *(int *)(ctl->data)) + *(int *)(ctl->data) = 1; + return ret; +} + +static struct ctl_table brnf_table[] = { + { + .procname = "bridge-nf-call-arptables", + .data = &brnf_call_arptables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-call-iptables", + .data = &brnf_call_iptables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-call-ip6tables", + .data = &brnf_call_ip6tables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-filter-vlan-tagged", + .data = &brnf_filter_vlan_tagged, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-filter-pppoe-tagged", + .data = &brnf_filter_pppoe_tagged, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-pass-vlan-input-dev", + .data = &brnf_pass_vlan_indev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { } +}; +#endif + +static int __init br_netfilter_init(void) +{ + int ret; + + ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + if (ret < 0) + return ret; + +#ifdef CONFIG_SYSCTL + brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); + if (brnf_sysctl_header == NULL) { + printk(KERN_WARNING + "br_netfilter: can't register to sysctl.\n"); + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + return -ENOMEM; + } +#endif + RCU_INIT_POINTER(nf_br_ops, &br_ops); + printk(KERN_NOTICE "Bridge firewalling registered\n"); + return 0; +} + +static void __exit br_netfilter_fini(void) +{ + RCU_INIT_POINTER(nf_br_ops, NULL); + nf_unregister_hooks(br_nf_ops, 
ARRAY_SIZE(br_nf_ops)); +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(brnf_sysctl_header); +#endif +} + +module_init(br_netfilter_init); +module_exit(br_netfilter_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); diff --git a/kernel/net/bridge/br_netfilter_ipv6.c b/kernel/net/bridge/br_netfilter_ipv6.c new file mode 100644 index 000000000..d61f56efc --- /dev/null +++ b/kernel/net/bridge/br_netfilter_ipv6.c @@ -0,0 +1,244 @@ +/* + * Handle firewalling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +/* We only check the length. 
A bridge shouldn't do any hop-by-hop stuff + * anyway + */ +static int br_nf_check_hbh_len(struct sk_buff *skb) +{ + unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); + u32 pkt_len; + const unsigned char *nh = skb_network_header(skb); + int off = raw - nh; + int len = (raw[1] + 1) << 3; + + if ((raw + len) - skb->data > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = nh[off + 1] + 2; + + switch (nh[off]) { + case IPV6_TLV_PAD1: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + case IPV6_TLV_JUMBO: + if (nh[off + 1] != 4 || (off & 3) != 2) + goto bad; + pkt_len = ntohl(*(__be32 *)(nh + off + 2)); + if (pkt_len <= IPV6_MAXPLEN || + ipv6_hdr(skb)->payload_len) + goto bad; + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) + goto bad; + if (pskb_trim_rcsum(skb, + pkt_len + sizeof(struct ipv6hdr))) + goto bad; + nh = skb_network_header(skb); + break; + default: + if (optlen > len) + goto bad; + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 0; +bad: + return -1; +} + +int br_validate_ipv6(struct net *net, struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + struct inet6_dev *idev = __in6_dev_get(skb->dev); + u32 pkt_len; + u8 ip6h_len = sizeof(struct ipv6hdr); + + if (!pskb_may_pull(skb, ip6h_len)) + goto inhdr_error; + + if (skb->len < ip6h_len) + goto drop; + + hdr = ipv6_hdr(skb); + + if (hdr->version != 6) + goto inhdr_error; + + pkt_len = ntohs(hdr->payload_len); + + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + ip6h_len > skb->len) { + IP6_INC_STATS_BH(net, idev, + IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { + IP6_INC_STATS_BH(net, idev, + IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb)) + goto drop; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + /* No IP options in IPv6 header; however it should be + * checked if some next headers need special 
treatment + */ + return 0; + +inhdr_error: + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); +drop: + return -1; +} + +static inline bool +br_nf_ipv6_daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) +{ + return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr, + sizeof(ipv6_hdr(skb)->daddr)) != 0; +} + +/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables + * PREROUTING and continue the bridge PRE_ROUTING hook. See comment + * for br_nf_pre_routing_finish(), same logic is used here but + * equivalent IPv6 function ip6_route_input() called indirectly. + */ +static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct rtable *rt; + struct net_device *dev = skb->dev; + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; + + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + nf_bridge->in_prerouting = 0; + if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) { + skb_dst_drop(skb); + v6ops->route_input(skb); + + if (skb_dst(skb)->error) { + kfree_skb(skb); + return 0; + } + + if (skb_dst(skb)->dev == dev) { + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, + net, sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge, + 1); + return 0; + } + ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); + skb->pkt_type = PACKET_HOST; + } else { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); + } + + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, + skb->dev, NULL, + br_handle_frame_finish, 
1); + + return 0; +} + +/* Replicate the checks that IPv6 does on packet reception and pass the packet + * to ip6tables. + */ +unsigned int br_nf_pre_routing_ipv6(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge; + + if (br_validate_ipv6(state->net, skb)) + return NF_DROP; + + nf_bridge_put(skb->nf_bridge); + if (!nf_bridge_alloc(skb)) + return NF_DROP; + if (!setup_pre_routing(skb)) + return NF_DROP; + + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; + + skb->protocol = htons(ETH_P_IPV6); + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->net, state->sk, skb, + skb->dev, NULL, + br_nf_pre_routing_finish_ipv6); + + return NF_STOLEN; +} diff --git a/kernel/net/bridge/br_netlink.c b/kernel/net/bridge/br_netlink.c index a7559ef31..40197ff89 100644 --- a/kernel/net/bridge/br_netlink.c +++ b/kernel/net/bridge/br_netlink.c @@ -16,42 +16,40 @@ #include #include #include -#include #include #include "br_private.h" #include "br_private_stp.h" -static int br_get_num_vlan_infos(const struct net_port_vlans *pv, - u32 filter_mask) +static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, + u32 filter_mask) { - u16 vid_range_start = 0, vid_range_end = 0; - u16 vid_range_flags = 0; - u16 pvid, vid, flags; + struct net_bridge_vlan *v; + u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; + u16 flags, pvid; int num_vlans = 0; - if (filter_mask & RTEXT_FILTER_BRVLAN) - return pv->num_vlans; - if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) return 0; - /* Count number of vlan info's - */ - pvid = br_get_pvid(pv); - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { + pvid = br_get_pvid(vg); + /* Count number of vlan infos */ + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; - if (vid == pvid) + /* only a context, bridge vlan not activated */ + if (!br_vlan_should_use(v)) + continue; + if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; - if 
(test_bit(vid, pv->untagged_bitmap)) + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; - } else if ((vid - vid_range_end) == 1 && + } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { - vid_range_end = vid; + vid_range_end = v->vid; continue; } else { if ((vid_range_end - vid_range_start) > 0) @@ -60,8 +58,8 @@ static int br_get_num_vlan_infos(const struct net_port_vlans *pv, num_vlans += 1; } initvars: - vid_range_start = vid; - vid_range_end = vid; + vid_range_start = v->vid; + vid_range_end = v->vid; vid_range_flags = flags; } @@ -75,28 +73,43 @@ initvars: return num_vlans; } +static int br_get_num_vlan_infos(struct net_bridge_vlan_group *vg, + u32 filter_mask) +{ + int num_vlans; + + if (!vg) + return 0; + + if (filter_mask & RTEXT_FILTER_BRVLAN) + return vg->num_vlans; + + rcu_read_lock(); + num_vlans = __get_num_vlan_infos(vg, filter_mask); + rcu_read_unlock(); + + return num_vlans; +} + static size_t br_get_link_af_size_filtered(const struct net_device *dev, u32 filter_mask) { - struct net_port_vlans *pv; + struct net_bridge_vlan_group *vg = NULL; + struct net_bridge_port *p; + struct net_bridge *br; int num_vlan_infos; rcu_read_lock(); - if (br_port_exists(dev)) - pv = nbp_get_vlan_info(br_port_get_rcu(dev)); - else if (dev->priv_flags & IFF_EBRIDGE) - pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev)); - else - pv = NULL; - if (pv) - num_vlan_infos = br_get_num_vlan_infos(pv, filter_mask); - else - num_vlan_infos = 0; + if (br_port_exists(dev)) { + p = br_port_get_rcu(dev); + vg = nbp_vlan_group_rcu(p); + } else if (dev->priv_flags & IFF_EBRIDGE) { + br = netdev_priv(dev); + vg = br_vlan_group_rcu(br); + } + num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); - if (!num_vlan_infos) - return 0; - /* Each VLAN is returned in bridge_vlan_info along with flags */ return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); 
} @@ -114,6 +127,20 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_ID */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */ + + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */ + + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */ + + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ +#endif + 0; } @@ -135,6 +162,7 @@ static int br_port_fill_attrs(struct sk_buff *skb, const struct net_bridge_port *p) { u8 mode = !!(p->flags & BR_HAIRPIN_MODE); + u64 timerval; if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) || nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) || @@ -147,9 +175,36 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, - !!(p->flags & BR_PROXYARP_WIFI))) + !!(p->flags & BR_PROXYARP_WIFI)) || + nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id), + &p->designated_root) || + nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id), + &p->designated_bridge) || + nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) || + nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) || + 
nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) || + nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) || + nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + p->topology_change_ack) || + nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending)) + return -EMSGSIZE; + + timerval = br_timer_value(&p->message_age_timer); + if (nla_put_u64(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval)) + return -EMSGSIZE; + timerval = br_timer_value(&p->forward_delay_timer); + if (nla_put_u64(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval)) + return -EMSGSIZE; + timerval = br_timer_value(&p->hold_timer); + if (nla_put_u64(skb, IFLA_BRPORT_HOLD_TIMER, timerval)) return -EMSGSIZE; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, + p->multicast_router)) + return -EMSGSIZE; +#endif + return 0; } @@ -166,8 +221,6 @@ static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start, sizeof(vinfo), &vinfo)) goto nla_put_failure; - vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN; - vinfo.vid = vid_end; vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, @@ -188,31 +241,33 @@ nla_put_failure: } static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb, - const struct net_port_vlans *pv) + struct net_bridge_vlan_group *vg) { - u16 vid_range_start = 0, vid_range_end = 0; - u16 vid_range_flags = 0; - u16 pvid, vid, flags; + struct net_bridge_vlan *v; + u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; + u16 flags, pvid; int err = 0; /* Pack IFLA_BRIDGE_VLAN_INFO's for every vlan * and mark vlan info with begin and end flags * if vlaninfo represents a range */ - pvid = br_get_pvid(pv); - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { + pvid = br_get_pvid(vg); + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; - if (vid == pvid) + if (!br_vlan_should_use(v)) + continue; + if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; - if (test_bit(vid, pv->untagged_bitmap)) + if (v->flags & 
BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; - } else if ((vid - vid_range_end) == 1 && + } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { - vid_range_end = vid; + vid_range_end = v->vid; continue; } else { err = br_fill_ifvlaninfo_range(skb, vid_range_start, @@ -223,8 +278,8 @@ static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb, } initvars: - vid_range_start = vid; - vid_range_end = vid; + vid_range_start = v->vid; + vid_range_end = v->vid; vid_range_flags = flags; } @@ -241,19 +296,23 @@ initvars: } static int br_fill_ifvlaninfo(struct sk_buff *skb, - const struct net_port_vlans *pv) + struct net_bridge_vlan_group *vg) { struct bridge_vlan_info vinfo; - u16 pvid, vid; + struct net_bridge_vlan *v; + u16 pvid; + + pvid = br_get_pvid(vg); + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; - pvid = br_get_pvid(pv); - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - vinfo.vid = vid; + vinfo.vid = v->vid; vinfo.flags = 0; - if (vid == pvid) + if (v->vid == pvid) vinfo.flags |= BRIDGE_VLAN_INFO_PVID; - if (test_bit(vid, pv->untagged_bitmap)) + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, @@ -272,11 +331,11 @@ nla_put_failure: * Contains port and master info as well as carrier and bridge state. */ static int br_fill_ifinfo(struct sk_buff *skb, - const struct net_bridge_port *port, + struct net_bridge_port *port, u32 pid, u32 seq, int event, unsigned int flags, u32 filter_mask, const struct net_device *dev) { - const struct net_bridge *br; + struct net_bridge *br; struct ifinfomsg *hdr; struct nlmsghdr *nlh; u8 operstate = netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN; @@ -323,26 +382,31 @@ static int br_fill_ifinfo(struct sk_buff *skb, /* Check if the VID information is requested */ if ((filter_mask & RTEXT_FILTER_BRVLAN) || (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { - const struct net_port_vlans *pv; + struct net_bridge_vlan_group *vg; struct nlattr *af; int err; + /* RCU needed because of the VLAN locking rules (rcu || rtnl) */ + rcu_read_lock(); if (port) - pv = nbp_get_vlan_info(port); + vg = nbp_vlan_group_rcu(port); else - pv = br_get_vlan_info(br); + vg = br_vlan_group_rcu(br); - if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID)) + if (!vg || !vg->num_vlans) { + rcu_read_unlock(); goto done; - + } af = nla_nest_start(skb, IFLA_AF_SPEC); - if (!af) + if (!af) { + rcu_read_unlock(); goto nla_put_failure; - + } if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) - err = br_fill_ifvlaninfo_compressed(skb, pv); + err = br_fill_ifvlaninfo_compressed(skb, vg); else - err = br_fill_ifvlaninfo(skb, pv); + err = br_fill_ifvlaninfo(skb, vg); + rcu_read_unlock(); if (err) goto nla_put_failure; nla_nest_end(skb, af); @@ -416,14 +480,14 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, switch (cmd) { case RTM_SETLINK: if (p) { + /* if the MASTER flag is set this will act on the global + * per-VLAN entry as well + */ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags); if (err) break; - - if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER) - err = br_vlan_add(p->br, vinfo->vid, - vinfo->flags); } else { + vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY; err = br_vlan_add(br, vinfo->vid, vinfo->flags); } break; @@ -459,10 +523,15 @@ static int br_afspec(struct net_bridge *br, if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; vinfo = nla_data(attr); + if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK) + return -EINVAL; if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { if (vinfo_start) return -EINVAL; vinfo_start = vinfo; + /* don't allow range of pvids */ + if 
(vinfo_start->flags & BRIDGE_VLAN_INFO_PVID) + return -EINVAL; continue; } @@ -508,6 +577,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, + [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -579,6 +649,18 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } + if (tb[IFLA_BRPORT_FLUSH]) + br_fdb_delete_by_port(p->br, p, 0, 0); + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) { + u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]); + + err = br_multicast_set_port_router(p, mcast_router); + if (err) + return err; + } +#endif br_port_flags_change(p, old_flags ^ p->flags); return 0; } @@ -590,7 +672,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) struct nlattr *afspec; struct net_bridge_port *p; struct nlattr *tb[IFLA_BRPORT_MAX + 1]; - int err = 0, ret_offload = 0; + int err = 0; protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO); afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); @@ -632,16 +714,6 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) afspec, RTM_SETLINK); } - if (p && !(flags & BRIDGE_FLAGS_SELF)) { - /* set bridge attributes in hardware if supported - */ - ret_offload = netdev_switch_port_bridge_setlink(dev, nlh, - flags); - if (ret_offload && ret_offload != -EOPNOTSUPP) - br_warn(p->br, "error setting attrs on port %u(%s)\n", - (unsigned int)p->port_no, p->dev->name); - } - if (err == 0) br_ifinfo_notify(RTM_NEWLINK, p); out: @@ -653,7 +725,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct nlattr *afspec; struct net_bridge_port *p; - int err = 0, ret_offload = 0; + int err = 0; afspec = nlmsg_find_attr(nlh, 
sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (!afspec) @@ -672,16 +744,6 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) */ br_ifinfo_notify(RTM_NEWLINK, p); - if (p && !(flags & BRIDGE_FLAGS_SELF)) { - /* del bridge attributes in hardware - */ - ret_offload = netdev_switch_port_bridge_dellink(dev, nlh, - flags); - if (ret_offload && ret_offload != -EOPNOTSUPP) - br_warn(p->br, "error deleting attrs on port %u (%s)\n", - (unsigned int)p->port_no, p->dev->name); - } - return err; } static int br_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -693,6 +755,21 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return -EADDRNOTAVAIL; } + if (!data) + return 0; + +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (data[IFLA_BR_VLAN_PROTOCOL]) { + switch (nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + break; + default: + return -EPROTONOSUPPORT; + } + } +#endif + return 0; } @@ -748,6 +825,29 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, + [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 }, + [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 }, + [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY, + .len = ETH_ALEN }, + [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_QUERIER_INTVL] = { .type = 
NLA_U64 }, + [IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 }, + [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 }, + [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -778,9 +878,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], } if (data[IFLA_BR_AGEING_TIME]) { - u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]); - - br->ageing_time = clock_t_to_jiffies(ageing_time); + err = br_set_ageing_time(br, nla_get_u32(data[IFLA_BR_AGEING_TIME])); + if (err) + return err; } if (data[IFLA_BR_STP_STATE]) { @@ -795,6 +895,176 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br_stp_set_bridge_priority(br, priority); } + if (data[IFLA_BR_VLAN_FILTERING]) { + u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]); + + err = __br_vlan_filter_toggle(br, vlan_filter); + if (err) + return err; + } + +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (data[IFLA_BR_VLAN_PROTOCOL]) { + __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); + + err = __br_vlan_set_proto(br, vlan_proto); + if (err) + return err; + } + + if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { + __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); + + err = __br_vlan_set_default_pvid(br, defpvid); + if (err) + return err; + } +#endif + + if (data[IFLA_BR_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]); + + if (fwd_mask & BR_GROUPFWD_RESTRICTED) + return -EINVAL; + br->group_fwd_mask = fwd_mask; + } + + if (data[IFLA_BR_GROUP_ADDR]) { + u8 new_addr[ETH_ALEN]; + + if (nla_len(data[IFLA_BR_GROUP_ADDR]) != ETH_ALEN) + return -EINVAL; + memcpy(new_addr, nla_data(data[IFLA_BR_GROUP_ADDR]), ETH_ALEN); + if 
(!is_link_local_ether_addr(new_addr)) + return -EINVAL; + if (new_addr[5] == 1 || /* 802.3x Pause address */ + new_addr[5] == 2 || /* 802.3ad Slow protocols */ + new_addr[5] == 3) /* 802.1X PAE address */ + return -EINVAL; + spin_lock_bh(&br->lock); + memcpy(br->group_addr, new_addr, sizeof(br->group_addr)); + spin_unlock_bh(&br->lock); + br->group_addr_set = true; + br_recalculate_fwd_mask(br); + } + + if (data[IFLA_BR_FDB_FLUSH]) + br_fdb_flush(br); + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (data[IFLA_BR_MCAST_ROUTER]) { + u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]); + + err = br_multicast_set_router(br, multicast_router); + if (err) + return err; + } + + if (data[IFLA_BR_MCAST_SNOOPING]) { + u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]); + + err = br_multicast_toggle(br, mcast_snooping); + if (err) + return err; + } + + if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) { + u8 val; + + val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]); + br->multicast_query_use_ifaddr = !!val; + } + + if (data[IFLA_BR_MCAST_QUERIER]) { + u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]); + + err = br_multicast_set_querier(br, mcast_querier); + if (err) + return err; + } + + if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) { + u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]); + + br->hash_elasticity = val; + } + + if (data[IFLA_BR_MCAST_HASH_MAX]) { + u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); + + err = br_multicast_set_hash_max(br, hash_max); + if (err) + return err; + } + + if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { + u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); + + br->multicast_last_member_count = val; + } + + if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) { + u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]); + + br->multicast_startup_query_count = val; + } + + if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]); + + 
br->multicast_last_member_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]); + + br->multicast_membership_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_QUERIER_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]); + + br->multicast_querier_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_QUERY_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]); + + br->multicast_query_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]); + + br->multicast_query_response_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]); + + br->multicast_startup_query_interval = clock_t_to_jiffies(val); + } +#endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (data[IFLA_BR_NF_CALL_IPTABLES]) { + u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]); + + br->nf_call_iptables = val ? true : false; + } + + if (data[IFLA_BR_NF_CALL_IP6TABLES]) { + u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]); + + br->nf_call_ip6tables = val ? true : false; + } + + if (data[IFLA_BR_NF_CALL_ARPTABLES]) { + u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]); + + br->nf_call_arptables = val ? 
true : false; + } +#endif + return 0; } @@ -806,6 +1076,44 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ +#endif + nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ + nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ + nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */ + nla_total_size(sizeof(u64)) + /* 
IFLA_BR_MCAST_QUERIER_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ +#endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ +#endif 0; } @@ -818,46 +1126,105 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 ageing_time = jiffies_to_clock_t(br->ageing_time); u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; + u8 vlan_enabled = br_vlan_enabled(br); + u64 clockval; + + clockval = br_timer_value(&br->hello_timer); + if (nla_put_u64(skb, IFLA_BR_HELLO_TIMER, clockval)) + return -EMSGSIZE; + clockval = br_timer_value(&br->tcn_timer); + if (nla_put_u64(skb, IFLA_BR_TCN_TIMER, clockval)) + return -EMSGSIZE; + clockval = br_timer_value(&br->topology_change_timer); + if (nla_put_u64(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval)) + return -EMSGSIZE; + clockval = br_timer_value(&br->gc_timer); + if (nla_put_u64(skb, IFLA_BR_GC_TIMER, clockval)) + return -EMSGSIZE; if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || - nla_put_u16(skb, IFLA_BR_PRIORITY, priority)) + nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || + nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || + nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, br->group_fwd_mask) || + nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(struct ifla_bridge_id), + &br->bridge_id) || + nla_put(skb, IFLA_BR_ROOT_ID, sizeof(struct ifla_bridge_id), + &br->designated_root) || + nla_put_u16(skb, 
IFLA_BR_ROOT_PORT, br->root_port) || + nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) || + nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || + nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + br->topology_change_detected) || + nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr)) return -EMSGSIZE; - return 0; -} - -static size_t br_get_link_af_size(const struct net_device *dev) -{ - struct net_port_vlans *pv; - - if (br_port_exists(dev)) - pv = nbp_get_vlan_info(br_port_get_rtnl(dev)); - else if (dev->priv_flags & IFF_EBRIDGE) - pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev)); - else - return 0; +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || + nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid)) + return -EMSGSIZE; +#endif +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || + nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) || + nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, + br->multicast_query_use_ifaddr) || + nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) || + nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, + br->hash_elasticity) || + nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || + nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, + br->multicast_last_member_count) || + nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, + br->multicast_startup_query_count)) + return -EMSGSIZE; - if (!pv) - return 0; + clockval = jiffies_to_clock_t(br->multicast_last_member_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_membership_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_querier_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval)) + return -EMSGSIZE; + 
clockval = jiffies_to_clock_t(br->multicast_query_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_query_response_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_startup_query_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval)) + return -EMSGSIZE; +#endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES, + br->nf_call_iptables ? 1 : 0) || + nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES, + br->nf_call_ip6tables ? 1 : 0) || + nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES, + br->nf_call_arptables ? 1 : 0)) + return -EMSGSIZE; +#endif - /* Each VLAN is returned in bridge_vlan_info along with flags */ - return pv->num_vlans * nla_total_size(sizeof(struct bridge_vlan_info)); + return 0; } + static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, - .get_link_af_size = br_get_link_af_size, + .get_link_af_size = br_get_link_af_size_filtered, }; struct rtnl_link_ops br_link_ops __read_mostly = { .kind = "bridge", .priv_size = sizeof(struct net_bridge), .setup = br_dev_setup, - .maxtype = IFLA_BRPORT_MAX, + .maxtype = IFLA_BR_MAX, .policy = br_policy, .validate = br_validate, .newlink = br_dev_newlink, diff --git a/kernel/net/bridge/br_private.h b/kernel/net/bridge/br_private.h index 3362c2940..216018c76 100644 --- a/kernel/net/bridge/br_private.h +++ b/kernel/net/bridge/br_private.h @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #define BR_HASH_BITS 8 #define BR_HASH_SIZE (1 << BR_HASH_BITS) @@ -27,14 +29,13 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<bridge_id, &br->designated_root, 8); } +/* check if a VLAN entry is global */ +static inline bool br_vlan_is_master(const struct net_bridge_vlan *v) +{ + return v->flags & BRIDGE_VLAN_INFO_MASTER; +} + +/* check if a VLAN entry is 
used by the bridge */ +static inline bool br_vlan_is_brentry(const struct net_bridge_vlan *v) +{ + return v->flags & BRIDGE_VLAN_INFO_BRENTRY; +} + +/* check if we should use the vlan entry, returns false if it's only context */ +static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) +{ + if (br_vlan_is_master(v)) { + if (br_vlan_is_brentry(v)) + return true; + else + return false; + } + + return true; +} + /* br_device.c */ void br_dev_setup(struct net_device *dev); void br_dev_delete(struct net_device *dev, struct list_head *list); @@ -384,7 +456,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(unsigned long arg); void br_fdb_delete_by_port(struct net_bridge *br, - const struct net_bridge_port *p, int do_all); + const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, const unsigned char *addr, __u16 vid); int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); @@ -410,10 +482,10 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, /* br_forward.c */ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb); -int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb); +int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb); void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0); -int br_forward_finish(struct sock *sk, struct sk_buff *skb); +int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast); void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, struct sk_buff *skb2, bool unicast); @@ -431,7 +503,7 @@ void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); void br_manage_promisc(struct net_bridge 
*br); /* br_input.c */ -int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb); +int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); rx_handler_result_t br_handle_frame(struct sk_buff **pskb); static inline bool br_rx_handler_check_rcu(const struct net_device *dev) @@ -463,6 +535,7 @@ void br_multicast_disable_port(struct net_bridge_port *port); void br_multicast_init(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); +void br_multicast_dev_del(struct net_bridge *br); void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb); void br_multicast_forward(struct net_bridge_mdb_entry *mdst, @@ -485,7 +558,9 @@ br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, void br_mdb_init(void); void br_mdb_uninit(void); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type); + struct br_ip *group, int type, u8 state); +void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, + int type); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -562,6 +637,10 @@ static inline void br_multicast_stop(struct net_bridge *br) { } +static inline void br_multicast_dev_del(struct net_bridge *br) +{ +} + static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb) { @@ -591,39 +670,55 @@ static inline void br_mdb_uninit(void) /* br_vlan.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING -bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, - struct sk_buff *skb, u16 *vid); -bool br_allowed_egress(struct net_bridge *br, const struct net_port_vlans *v, +bool br_allowed_ingress(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, struct sk_buff *skb, + u16 *vid); +bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool 
br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); struct sk_buff *br_handle_vlan(struct net_bridge *br, - const struct net_port_vlans *v, + struct net_bridge_vlan_group *vg, struct sk_buff *skb); int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags); int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); -bool br_vlan_find(struct net_bridge *br, u16 vid); +struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); +int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); -bool nbp_vlan_find(struct net_bridge_port *port, u16 vid); int nbp_vlan_init(struct net_bridge_port *port); +int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); -static inline struct net_port_vlans *br_get_vlan_info( - const struct net_bridge *br) +static inline struct net_bridge_vlan_group *br_vlan_group( + const struct net_bridge *br) { - return rcu_dereference_rtnl(br->vlan_info); + return rtnl_dereference(br->vlgrp); } -static inline struct net_port_vlans *nbp_get_vlan_info( - const struct net_bridge_port *p) +static inline struct net_bridge_vlan_group *nbp_vlan_group( + const struct net_bridge_port *p) { - return rcu_dereference_rtnl(p->vlan_info); + return rtnl_dereference(p->vlgrp); +} + +static inline struct net_bridge_vlan_group *br_vlan_group_rcu( + const struct 
net_bridge *br) +{ + return rcu_dereference(br->vlgrp); +} + +static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( + const struct net_bridge_port *p) +{ + return rcu_dereference(p->vlgrp); } /* Since bridge now depends on 8021Q module, but the time bridge sees the @@ -633,9 +728,9 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) { int err = 0; - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK; - else { + } else { *vid = 0; err = -EINVAL; } @@ -643,13 +738,13 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) return err; } -static inline u16 br_get_pvid(const struct net_port_vlans *v) +static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) { - if (!v) + if (!vg) return 0; smp_rmb(); - return v->pvid; + return vg->pvid; } static inline int br_vlan_enabled(struct net_bridge *br) @@ -657,16 +752,15 @@ static inline int br_vlan_enabled(struct net_bridge *br) return br->vlan_enabled; } #else -static inline bool br_allowed_ingress(struct net_bridge *br, - struct net_port_vlans *v, +static inline bool br_allowed_ingress(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid) { return true; } -static inline bool br_allowed_egress(struct net_bridge *br, - const struct net_port_vlans *v, +static inline bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb) { return true; @@ -679,7 +773,7 @@ static inline bool br_should_learn(struct net_bridge_port *p, } static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, - const struct net_port_vlans *v, + struct net_bridge_vlan_group *vg, struct sk_buff *skb) { return skb; @@ -699,11 +793,6 @@ static inline void br_vlan_flush(struct net_bridge *br) { } -static inline bool br_vlan_find(struct net_bridge *br, u16 vid) -{ - return false; -} - static inline void br_recalculate_fwd_mask(struct net_bridge *br) { } @@ -727,40 
+816,68 @@ static inline void nbp_vlan_flush(struct net_bridge_port *port) { } -static inline struct net_port_vlans *br_get_vlan_info( - const struct net_bridge *br) +static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, + u16 vid) { return NULL; } -static inline struct net_port_vlans *nbp_get_vlan_info( - const struct net_bridge_port *p) + +static inline int nbp_vlan_init(struct net_bridge_port *port) { - return NULL; + return 0; } -static inline bool nbp_vlan_find(struct net_bridge_port *port, u16 vid) +static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag) { - return false; + return 0; } -static inline int nbp_vlan_init(struct net_bridge_port *port) +static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) { return 0; } -static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag) +static inline int br_vlan_enabled(struct net_bridge *br) { return 0; } -static inline u16 br_get_pvid(const struct net_port_vlans *v) + +static inline int __br_vlan_filter_toggle(struct net_bridge *br, + unsigned long val) { - return 0; + return -EOPNOTSUPP; } -static inline int br_vlan_enabled(struct net_bridge *br) +static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p, + u32 filter_mask) { return 0; } + +static inline struct net_bridge_vlan_group *br_vlan_group( + const struct net_bridge *br) +{ + return NULL; +} + +static inline struct net_bridge_vlan_group *nbp_vlan_group( + const struct net_bridge_port *p) +{ + return NULL; +} + +static inline struct net_bridge_vlan_group *br_vlan_group_rcu( + const struct net_bridge *br) +{ + return NULL; +} + +static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( + const struct net_bridge_port *p) +{ + return NULL; +} + #endif struct nf_br_ops { @@ -790,6 +907,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t); int br_set_forward_delay(struct net_bridge *br, unsigned long x); int br_set_hello_time(struct net_bridge *br, unsigned 
long x); int br_set_max_age(struct net_bridge *br, unsigned long x); +int br_set_ageing_time(struct net_bridge *br, u32 ageing_time); /* br_stp_if.c */ diff --git a/kernel/net/bridge/br_stp.c b/kernel/net/bridge/br_stp.c index fb3ebe615..5f3f64553 100644 --- a/kernel/net/bridge/br_stp.c +++ b/kernel/net/bridge/br_stp.c @@ -39,10 +39,15 @@ void br_log_state(const struct net_bridge_port *p) void br_set_state(struct net_bridge_port *p, unsigned int state) { + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, + .flags = SWITCHDEV_F_DEFER, + .u.stp_state = state, + }; int err; p->state = state; - err = netdev_switch_port_stp_update(p->dev, state); + err = switchdev_port_attr_set(p->dev, &attr); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); @@ -205,8 +210,9 @@ void br_transmit_config(struct net_bridge_port *p) br_send_config_bpdu(p, &bpdu); p->topology_change_ack = 0; p->config_pending = 0; - mod_timer(&p->hold_timer, - round_jiffies(jiffies + BR_HOLD_TIME)); + if (p->br->stp_enabled == BR_KERNEL_STP) + mod_timer(&p->hold_timer, + round_jiffies(jiffies + BR_HOLD_TIME)); } } @@ -424,7 +430,6 @@ static void br_make_forwarding(struct net_bridge_port *p) else br_set_state(p, BR_STATE_LEARNING); - br_multicast_enable_port(p); br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); @@ -458,6 +463,12 @@ void br_port_state_selection(struct net_bridge *br) } } + if (p->state != BR_STATE_BLOCKING) + br_multicast_enable_port(p); + /* Multicast is not disabled for the port when it goes in + * blocking state because the timers will expire and stop by + * themselves without sending more queries. 
+ */ if (p->state == BR_STATE_FORWARDING) ++liveports; } @@ -556,6 +567,29 @@ int br_set_max_age(struct net_bridge *br, unsigned long val) } +int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, + .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, + .u.ageing_time = ageing_time, + }; + unsigned long t = clock_t_to_jiffies(ageing_time); + int err; + + if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME) + return -ERANGE; + + err = switchdev_port_attr_set(br->dev, &attr); + if (err) + return err; + + br->ageing_time = t; + mod_timer(&br->gc_timer, jiffies); + + return 0; +} + void __br_set_forward_delay(struct net_bridge *br, unsigned long t) { br->bridge_forward_delay = t; diff --git a/kernel/net/bridge/br_stp_bpdu.c b/kernel/net/bridge/br_stp_bpdu.c index 534fc4cd2..5881fbc11 100644 --- a/kernel/net/bridge/br_stp_bpdu.c +++ b/kernel/net/bridge/br_stp_bpdu.c @@ -30,6 +30,12 @@ #define LLC_RESERVE sizeof(struct llc_pdu_un) +static int br_send_bpdu_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return dev_queue_xmit(skb); +} + static void br_send_bpdu(struct net_bridge_port *p, const unsigned char *data, int length) { @@ -54,9 +60,9 @@ static void br_send_bpdu(struct net_bridge_port *p, skb_reset_mac_header(skb); - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb, - NULL, skb->dev, - dev_queue_xmit_sk); + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, + dev_net(p->dev), NULL, skb, NULL, skb->dev, + br_send_bpdu_finish); } static inline void br_set_ticks(unsigned char *dest, int j) diff --git a/kernel/net/bridge/br_stp_if.c b/kernel/net/bridge/br_stp_if.c index 7832d07f4..8a7ada8bb 100644 --- a/kernel/net/bridge/br_stp_if.c +++ b/kernel/net/bridge/br_stp_if.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "br_private.h" #include "br_private_stp.h" @@ -35,11 +36,22 @@ static inline port_id br_make_port_id(__u8 priority, __u16 port_no) /* called under bridge 
lock */ void br_init_port(struct net_bridge_port *p) { + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, + .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER, + .u.ageing_time = jiffies_to_clock_t(p->br->ageing_time), + }; + int err; + p->port_id = br_make_port_id(p->priority, p->port_no); br_become_designated_port(p); br_set_state(p, BR_STATE_BLOCKING); p->topology_change_ack = 0; p->config_pending = 0; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + netdev_err(p->dev, "failed to set HW ageing time\n"); } /* called under bridge lock */ @@ -48,7 +60,8 @@ void br_stp_enable_bridge(struct net_bridge *br) struct net_bridge_port *p; spin_lock_bh(&br->lock); - mod_timer(&br->hello_timer, jiffies + br->hello_time); + if (br->stp_enabled == BR_KERNEL_STP) + mod_timer(&br->hello_timer, jiffies + br->hello_time); mod_timer(&br->gc_timer, jiffies + HZ/10); br_config_bpdu_generation(br); @@ -111,7 +124,7 @@ void br_stp_disable_port(struct net_bridge_port *p) del_timer(&p->forward_delay_timer); del_timer(&p->hold_timer); - br_fdb_delete_by_port(br, p, 0); + br_fdb_delete_by_port(br, p, 0, 0); br_multicast_disable_port(p); br_configuration_update(br); @@ -127,8 +140,12 @@ static void br_stp_start(struct net_bridge *br) int r; char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; char *envp[] = { NULL }; + struct net_bridge_port *p; - r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); + if (net_eq(dev_net(br->dev), &init_net)) + r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); + else + r = -ENOENT; spin_lock_bh(&br->lock); @@ -140,6 +157,10 @@ static void br_stp_start(struct net_bridge *br) if (r == 0) { br->stp_enabled = BR_USER_STP; br_debug(br, "userspace STP started\n"); + /* Stop hello and hold timers */ + del_timer(&br->hello_timer); + list_for_each_entry(p, &br->port_list, list) + del_timer(&p->hold_timer); } else { br->stp_enabled = BR_KERNEL_STP; br_debug(br, 
"using kernel STP\n"); @@ -156,12 +177,17 @@ static void br_stp_stop(struct net_bridge *br) int r; char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL }; char *envp[] = { NULL }; + struct net_bridge_port *p; if (br->stp_enabled == BR_USER_STP) { r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); br_info(br, "userspace STP stopped, return code %d\n", r); /* To start timers on any ports left in blocking */ + mod_timer(&br->hello_timer, jiffies + br->hello_time); + list_for_each_entry(p, &br->port_list, list) + mod_timer(&p->hold_timer, + round_jiffies(jiffies + BR_HOLD_TIME)); spin_lock_bh(&br->lock); br_port_state_selection(br); spin_unlock_bh(&br->lock); diff --git a/kernel/net/bridge/br_stp_timer.c b/kernel/net/bridge/br_stp_timer.c index 7caf7fae2..5f0f5af0e 100644 --- a/kernel/net/bridge/br_stp_timer.c +++ b/kernel/net/bridge/br_stp_timer.c @@ -40,7 +40,9 @@ static void br_hello_timer_expired(unsigned long arg) if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); - mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); + if (br->stp_enabled != BR_USER_STP) + mod_timer(&br->hello_timer, + round_jiffies(jiffies + br->hello_time)); } spin_unlock(&br->lock); } diff --git a/kernel/net/bridge/br_sysfs_br.c b/kernel/net/bridge/br_sysfs_br.c index 4c97fc50f..8365bd53c 100644 --- a/kernel/net/bridge/br_sysfs_br.c +++ b/kernel/net/bridge/br_sysfs_br.c @@ -102,8 +102,15 @@ static ssize_t ageing_time_show(struct device *d, static int set_ageing_time(struct net_bridge *br, unsigned long val) { - br->ageing_time = clock_t_to_jiffies(val); - return 0; + int ret; + + if (!rtnl_trylock()) + return restart_syscall(); + + ret = br_set_ageing_time(br, val); + rtnl_unlock(); + + return ret; } static ssize_t ageing_time_store(struct device *d, diff --git a/kernel/net/bridge/br_sysfs_if.c b/kernel/net/bridge/br_sysfs_if.c index 4905845a9..efe415ad8 100644 --- a/kernel/net/bridge/br_sysfs_if.c +++ b/kernel/net/bridge/br_sysfs_if.c @@ -160,7 
+160,7 @@ static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); static int store_flush(struct net_bridge_port *p, unsigned long v) { - br_fdb_delete_by_port(p->br, p, 0); // Don't delete local entry + br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry return 0; } static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); diff --git a/kernel/net/bridge/br_vlan.c b/kernel/net/bridge/br_vlan.c index 13013fe8d..1394da636 100644 --- a/kernel/net/bridge/br_vlan.c +++ b/kernel/net/bridge/br_vlan.c @@ -2,59 +2,209 @@ #include #include #include +#include #include "br_private.h" -static void __vlan_add_pvid(struct net_port_vlans *v, u16 vid) +static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) { - if (v->pvid == vid) + const struct net_bridge_vlan *vle = ptr; + u16 vid = *(u16 *)arg->key; + + return vle->vid != vid; +} + +static const struct rhashtable_params br_vlan_rht_params = { + .head_offset = offsetof(struct net_bridge_vlan, vnode), + .key_offset = offsetof(struct net_bridge_vlan, vid), + .key_len = sizeof(u16), + .nelem_hint = 3, + .locks_mul = 1, + .max_size = VLAN_N_VID, + .obj_cmpfn = br_vlan_cmp, + .automatic_shrinking = true, +}; + +static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) +{ + return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); +} + +static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid) +{ + if (vg->pvid == vid) return; smp_wmb(); - v->pvid = vid; + vg->pvid = vid; } -static void __vlan_delete_pvid(struct net_port_vlans *v, u16 vid) +static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) { - if (v->pvid != vid) + if (vg->pvid != vid) return; smp_wmb(); - v->pvid = 0; + vg->pvid = 0; } -static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags) +static void __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) { + struct net_bridge_vlan_group *vg; + + if (br_vlan_is_master(v)) + vg = br_vlan_group(v->br); + 
else + vg = nbp_vlan_group(v->port); + if (flags & BRIDGE_VLAN_INFO_PVID) - __vlan_add_pvid(v, vid); + __vlan_add_pvid(vg, v->vid); else - __vlan_delete_pvid(v, vid); + __vlan_delete_pvid(vg, v->vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) - set_bit(vid, v->untagged_bitmap); + v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; else - clear_bit(vid, v->untagged_bitmap); + v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; } -static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) +static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, + u16 vid, u16 flags) { - struct net_bridge_port *p = NULL; - struct net_bridge *br; - struct net_device *dev; + struct switchdev_obj_port_vlan v = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }; + int err; + + /* Try switchdev op first. In case it is not supported, fallback to + * 8021q add. + */ + err = switchdev_port_obj_add(dev, &v.obj); + if (err == -EOPNOTSUPP) + return vlan_vid_add(dev, br->vlan_proto, vid); + return err; +} + +static void __vlan_add_list(struct net_bridge_vlan *v) +{ + struct net_bridge_vlan_group *vg; + struct list_head *headp, *hpos; + struct net_bridge_vlan *vent; + + if (br_vlan_is_master(v)) + vg = br_vlan_group(v->br); + else + vg = nbp_vlan_group(v->port); + + headp = &vg->vlan_list; + list_for_each_prev(hpos, headp) { + vent = list_entry(hpos, struct net_bridge_vlan, vlist); + if (v->vid < vent->vid) + continue; + else + break; + } + list_add_rcu(&v->vlist, hpos); +} + +static void __vlan_del_list(struct net_bridge_vlan *v) +{ + list_del_rcu(&v->vlist); +} + +static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, + u16 vid) +{ + struct switchdev_obj_port_vlan v = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .vid_begin = vid, + .vid_end = vid, + }; int err; - if (test_bit(vid, v->vlan_bitmap)) { - __vlan_add_flags(v, vid, flags); + /* Try switchdev op first. In case it is not supported, fallback to + * 8021q del. 
+ */ + err = switchdev_port_obj_del(dev, &v.obj); + if (err == -EOPNOTSUPP) { + vlan_vid_del(dev, br->vlan_proto, vid); return 0; } + return err; +} + +/* Returns a master vlan, if it didn't exist it gets created. In all cases a + * a reference is taken to the master vlan before returning. + */ +static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *masterv; + + vg = br_vlan_group(br); + masterv = br_vlan_find(vg, vid); + if (!masterv) { + /* missing global ctx, create it now */ + if (br_vlan_add(br, vid, 0)) + return NULL; + masterv = br_vlan_find(vg, vid); + if (WARN_ON(!masterv)) + return NULL; + } + atomic_inc(&masterv->refcnt); + + return masterv; +} + +static void br_vlan_put_master(struct net_bridge_vlan *masterv) +{ + struct net_bridge_vlan_group *vg; + + if (!br_vlan_is_master(masterv)) + return; + + vg = br_vlan_group(masterv->br); + if (atomic_dec_and_test(&masterv->refcnt)) { + rhashtable_remove_fast(&vg->vlan_hash, + &masterv->vnode, br_vlan_rht_params); + __vlan_del_list(masterv); + kfree_rcu(masterv, rcu); + } +} - if (v->port_idx) { - p = v->parent.port; +/* This is the shared VLAN add function which works for both ports and bridge + * devices. There are four possible calls to this function in terms of the + * vlan entry type: + * 1. vlan is being added on a port (no master flags, global entry exists) + * 2. vlan is being added on a bridge (both master and brentry flags) + * 3. vlan is being added on a port, but a global entry didn't exist which + * is being created right now (master flag set, brentry flag unset), the + * global entry is used for global per-vlan features, but not for filtering + * 4. 
same as 3 but with both master and brentry flags set so the entry + * will be used for filtering in both the port and the bridge + */ +static int __vlan_add(struct net_bridge_vlan *v, u16 flags) +{ + struct net_bridge_vlan *masterv = NULL; + struct net_bridge_port *p = NULL; + struct net_bridge_vlan_group *vg; + struct net_device *dev; + struct net_bridge *br; + int err; + + if (br_vlan_is_master(v)) { + br = v->br; + dev = br->dev; + vg = br_vlan_group(br); + } else { + p = v->port; br = p->br; dev = p->dev; - } else { - br = v->parent.br; - dev = br->dev; + vg = nbp_vlan_group(p); } if (p) { @@ -62,83 +212,140 @@ static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ - err = vlan_vid_add(dev, br->vlan_proto, vid); + err = __vlan_vid_add(dev, br, v->vid, flags); if (err) - return err; + goto out; + + /* need to work on the master vlan too */ + if (flags & BRIDGE_VLAN_INFO_MASTER) { + err = br_vlan_add(br, v->vid, flags | + BRIDGE_VLAN_INFO_BRENTRY); + if (err) + goto out_filt; + } + + masterv = br_vlan_get_master(br, v->vid); + if (!masterv) + goto out_filt; + v->brvlan = masterv; } - err = br_fdb_insert(br, p, dev->dev_addr, vid); - if (err) { - br_err(br, "failed insert local address into bridge " - "forwarding table\n"); - goto out_filt; + /* Add the dev mac and count the vlan only if it's usable */ + if (br_vlan_should_use(v)) { + err = br_fdb_insert(br, p, dev->dev_addr, v->vid); + if (err) { + br_err(br, "failed insert local address into bridge forwarding table\n"); + goto out_filt; + } + vg->num_vlans++; } - set_bit(vid, v->vlan_bitmap); - v->num_vlans++; - __vlan_add_flags(v, vid, flags); + err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, + br_vlan_rht_params); + if (err) + goto out_fdb_insert; - return 0; + __vlan_add_list(v); + __vlan_add_flags(v, flags); +out: + return err; + +out_fdb_insert: + if 
(br_vlan_should_use(v)) { + br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); + vg->num_vlans--; + } out_filt: - if (p) - vlan_vid_del(dev, br->vlan_proto, vid); - return err; + if (p) { + __vlan_vid_del(dev, br, v->vid); + if (masterv) { + br_vlan_put_master(masterv); + v->brvlan = NULL; + } + } + + goto out; } -static int __vlan_del(struct net_port_vlans *v, u16 vid) +static int __vlan_del(struct net_bridge_vlan *v) { - if (!test_bit(vid, v->vlan_bitmap)) - return -EINVAL; + struct net_bridge_vlan *masterv = v; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; + int err = 0; + + if (br_vlan_is_master(v)) { + vg = br_vlan_group(v->br); + } else { + p = v->port; + vg = nbp_vlan_group(v->port); + masterv = v->brvlan; + } - __vlan_delete_pvid(v, vid); - clear_bit(vid, v->untagged_bitmap); + __vlan_delete_pvid(vg, v->vid); + if (p) { + err = __vlan_vid_del(p->dev, p->br, v->vid); + if (err) + goto out; + } - if (v->port_idx) { - struct net_bridge_port *p = v->parent.port; - vlan_vid_del(p->dev, p->br->vlan_proto, vid); + if (br_vlan_should_use(v)) { + v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; + vg->num_vlans--; } - clear_bit(vid, v->vlan_bitmap); - v->num_vlans--; - if (bitmap_empty(v->vlan_bitmap, VLAN_N_VID)) { - if (v->port_idx) - RCU_INIT_POINTER(v->parent.port->vlan_info, NULL); - else - RCU_INIT_POINTER(v->parent.br->vlan_info, NULL); + if (masterv != v) { + rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, + br_vlan_rht_params); + __vlan_del_list(v); kfree_rcu(v, rcu); } - return 0; + + br_vlan_put_master(masterv); +out: + return err; } -static void __vlan_flush(struct net_port_vlans *v) +static void __vlan_group_free(struct net_bridge_vlan_group *vg) { - smp_wmb(); - v->pvid = 0; - bitmap_zero(v->vlan_bitmap, VLAN_N_VID); - if (v->port_idx) - RCU_INIT_POINTER(v->parent.port->vlan_info, NULL); - else - RCU_INIT_POINTER(v->parent.br->vlan_info, NULL); - kfree_rcu(v, rcu); + WARN_ON(!list_empty(&vg->vlan_list)); + 
rhashtable_destroy(&vg->vlan_hash); + kfree(vg); +} + +static void __vlan_flush(struct net_bridge_vlan_group *vg) +{ + struct net_bridge_vlan *vlan, *tmp; + + __vlan_delete_pvid(vg, vg->pvid); + list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) + __vlan_del(vlan); } struct sk_buff *br_handle_vlan(struct net_bridge *br, - const struct net_port_vlans *pv, + struct net_bridge_vlan_group *vg, struct sk_buff *skb) { + struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) goto out; - /* Vlan filter table must be configured at this point. The + /* At this point, we know that the frame was filtered and contains + * a valid vlan id. If the vlan id has untagged flag set, + * send untagged; otherwise, send tagged. + */ + br_vlan_get_tag(skb, &vid); + v = br_vlan_find(vg, vid); + /* Vlan entry must be configured at this point. The * only exception is the bridge is set in promisc mode and the * packet is destined for the bridge device. In this case * pass the packet as is. */ - if (!pv) { + if (!v || !br_vlan_should_use(v)) { if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { goto out; } else { @@ -146,13 +353,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, return NULL; } } - - /* At this point, we know that the frame was filtered and contains - * a valid vlan id. If the vlan id is set in the untagged bitmap, - * send untagged; otherwise, send tagged. 
- */ - br_vlan_get_tag(skb, &vid); - if (test_bit(vid, pv->untagged_bitmap)) + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) skb->vlan_tci = 0; out: @@ -160,29 +361,13 @@ out: } /* Called under RCU */ -bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, - struct sk_buff *skb, u16 *vid) +static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, + struct sk_buff *skb, u16 *vid) { + const struct net_bridge_vlan *v; bool tagged; - __be16 proto; - - /* If VLAN filtering is disabled on the bridge, all packets are - * permitted. - */ - if (!br->vlan_enabled) { - BR_INPUT_SKB_CB(skb)->vlan_filtered = false; - return true; - } - - /* If there are no vlan in the permitted list, all packets are - * rejected. - */ - if (!v) - goto drop; BR_INPUT_SKB_CB(skb)->vlan_filtered = true; - proto = br->vlan_proto; - /* If vlan tx offload is disabled on bridge device and frame was * sent from vlan device on the bridge device, it does not have * HW accelerated vlan tag. @@ -217,7 +402,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, } if (!*vid) { - u16 pvid = br_get_pvid(v); + u16 pvid = br_get_pvid(vg); /* Frame had a tag with VID 0 or did not have a tag. * See if pvid is set on this port. That tells us which @@ -245,29 +430,43 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, } /* Frame had a valid vlan tag. See if vlan is allowed */ - if (test_bit(*vid, v->vlan_bitmap)) + v = br_vlan_find(vg, *vid); + if (v && br_vlan_should_use(v)) return true; drop: kfree_skb(skb); return false; } +bool br_allowed_ingress(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, struct sk_buff *skb, + u16 *vid) +{ + /* If VLAN filtering is disabled on the bridge, all packets are + * permitted. + */ + if (!br->vlan_enabled) { + BR_INPUT_SKB_CB(skb)->vlan_filtered = false; + return true; + } + + return __allowed_ingress(vg, br->vlan_proto, skb, vid); +} + /* Called under RCU. 
*/ -bool br_allowed_egress(struct net_bridge *br, - const struct net_port_vlans *v, +bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb) { + const struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) return true; - if (!v) - return false; - br_vlan_get_tag(skb, &vid); - if (test_bit(vid, v->vlan_bitmap)) + v = br_vlan_find(vg, vid); + if (v && br_vlan_should_use(v)) return true; return false; @@ -276,29 +475,29 @@ bool br_allowed_egress(struct net_bridge *br, /* Called under RCU */ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { + struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; - struct net_port_vlans *v; /* If filtering was disabled at input, let it pass. */ if (!br->vlan_enabled) return true; - v = rcu_dereference(p->vlan_info); - if (!v) + vg = nbp_vlan_group_rcu(p); + if (!vg || !vg->num_vlans) return false; if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) *vid = 0; if (!*vid) { - *vid = br_get_pvid(v); + *vid = br_get_pvid(vg); if (!*vid) return false; return true; } - if (test_bit(*vid, v->vlan_bitmap)) + if (br_vlan_find(vg, *vid)) return true; return false; @@ -309,31 +508,49 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) */ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) { - struct net_port_vlans *pv = NULL; - int err; + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + int ret; ASSERT_RTNL(); - pv = rtnl_dereference(br->vlan_info); - if (pv) - return __vlan_add(pv, vid, flags); + vg = br_vlan_group(br); + vlan = br_vlan_find(vg, vid); + if (vlan) { + if (!br_vlan_is_brentry(vlan)) { + /* Trying to change flags of non-existent bridge vlan */ + if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) + return -EINVAL; + /* It was only kept for port vlans, now make it real */ + ret = br_fdb_insert(br, NULL, br->dev->dev_addr, + 
vlan->vid); + if (ret) { + br_err(br, "failed insert local address into bridge forwarding table\n"); + return ret; + } + atomic_inc(&vlan->refcnt); + vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; + vg->num_vlans++; + } + __vlan_add_flags(vlan, flags); + return 0; + } - /* Create port vlan infomration - */ - pv = kzalloc(sizeof(*pv), GFP_KERNEL); - if (!pv) + vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); + if (!vlan) return -ENOMEM; - pv->parent.br = br; - err = __vlan_add(pv, vid, flags); - if (err) - goto out; - - rcu_assign_pointer(br->vlan_info, pv); - return 0; -out: - kfree(pv); - return err; + vlan->vid = vid; + vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; + vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; + vlan->br = br; + if (flags & BRIDGE_VLAN_INFO_BRENTRY) + atomic_set(&vlan->refcnt, 1); + ret = __vlan_add(vlan, flags); + if (ret) + kfree(vlan); + + return ret; } /* Must be protected by RTNL. @@ -341,49 +558,41 @@ out: */ int br_vlan_delete(struct net_bridge *br, u16 vid) { - struct net_port_vlans *pv; + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; ASSERT_RTNL(); - pv = rtnl_dereference(br->vlan_info); - if (!pv) - return -EINVAL; + vg = br_vlan_group(br); + v = br_vlan_find(vg, vid); + if (!v || !br_vlan_is_brentry(v)) + return -ENOENT; br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); + br_fdb_delete_by_port(br, NULL, vid, 0); - __vlan_del(pv, vid); - return 0; + return __vlan_del(v); } void br_vlan_flush(struct net_bridge *br) { - struct net_port_vlans *pv; + struct net_bridge_vlan_group *vg; ASSERT_RTNL(); - pv = rtnl_dereference(br->vlan_info); - if (!pv) - return; - __vlan_flush(pv); + vg = br_vlan_group(br); + __vlan_flush(vg); + RCU_INIT_POINTER(br->vlgrp, NULL); + synchronize_rcu(); + __vlan_group_free(vg); } -bool br_vlan_find(struct net_bridge *br, u16 vid) +struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) { - struct net_port_vlans *pv; - bool found = false; - - rcu_read_lock(); - pv = 
rcu_dereference(br->vlan_info); + if (!vg) + return NULL; - if (!pv) - goto out; - - if (test_bit(vid, pv->vlan_bitmap)) - found = true; - -out: - rcu_read_unlock(); - return found; + return br_vlan_lookup(&vg->vlan_hash, vid); } /* Must be protected by RTNL. */ @@ -413,50 +622,46 @@ void br_recalculate_fwd_mask(struct net_bridge *br) ~(1u << br->group_addr[5]); } -int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) +int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) { - if (!rtnl_trylock()) - return restart_syscall(); - if (br->vlan_enabled == val) - goto unlock; + return 0; br->vlan_enabled = val; br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); -unlock: + return 0; +} + +int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) +{ + if (!rtnl_trylock()) + return restart_syscall(); + + __br_vlan_filter_toggle(br, val); rtnl_unlock(); + return 0; } -int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) { int err = 0; struct net_bridge_port *p; - struct net_port_vlans *pv; - __be16 proto, oldproto; - u16 vid, errvid; + struct net_bridge_vlan *vlan; + struct net_bridge_vlan_group *vg; + __be16 oldproto; - if (val != ETH_P_8021Q && val != ETH_P_8021AD) - return -EPROTONOSUPPORT; - - if (!rtnl_trylock()) - return restart_syscall(); - - proto = htons(val); if (br->vlan_proto == proto) - goto unlock; + return 0; /* Add VLANs for the new proto to the device filter. 
*/ list_for_each_entry(p, &br->port_list, list) { - pv = rtnl_dereference(p->vlan_info); - if (!pv) - continue; - - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { - err = vlan_vid_add(p->dev, proto, vid); + vg = nbp_vlan_group(p); + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + err = vlan_vid_add(p->dev, proto, vlan->vid); if (err) goto err_filt; } @@ -470,38 +675,55 @@ int br_vlan_set_proto(struct net_bridge *br, unsigned long val) /* Delete VLANs for the old proto from the device filter. */ list_for_each_entry(p, &br->port_list, list) { - pv = rtnl_dereference(p->vlan_info); - if (!pv) - continue; - - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) - vlan_vid_del(p->dev, oldproto, vid); + vg = nbp_vlan_group(p); + list_for_each_entry(vlan, &vg->vlan_list, vlist) + vlan_vid_del(p->dev, oldproto, vlan->vid); } -unlock: - rtnl_unlock(); - return err; + return 0; err_filt: - errvid = vid; - for_each_set_bit(vid, pv->vlan_bitmap, errvid) - vlan_vid_del(p->dev, proto, vid); + list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) + vlan_vid_del(p->dev, proto, vlan->vid); list_for_each_entry_continue_reverse(p, &br->port_list, list) { - pv = rtnl_dereference(p->vlan_info); - if (!pv) - continue; - - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) - vlan_vid_del(p->dev, proto, vid); + vg = nbp_vlan_group(p); + list_for_each_entry(vlan, &vg->vlan_list, vlist) + vlan_vid_del(p->dev, proto, vlan->vid); } - goto unlock; + return err; +} + +int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +{ + int err; + + if (val != ETH_P_8021Q && val != ETH_P_8021AD) + return -EPROTONOSUPPORT; + + if (!rtnl_trylock()) + return restart_syscall(); + + err = __br_vlan_set_proto(br, htons(val)); + rtnl_unlock(); + + return err; } -static bool vlan_default_pvid(struct net_port_vlans *pv, u16 vid) +static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) { - return pv && vid == pv->pvid && test_bit(vid, pv->untagged_bitmap); + struct 
net_bridge_vlan *v; + + if (vid != vg->pvid) + return false; + + v = br_vlan_lookup(&vg->vlan_hash, vid); + if (v && br_vlan_should_use(v) && + (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) + return true; + + return false; } static void br_vlan_disable_default_pvid(struct net_bridge *br) @@ -512,24 +734,31 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) /* Disable default_pvid on all ports where it is still * configured. */ - if (vlan_default_pvid(br_get_vlan_info(br), pvid)) + if (vlan_default_pvid(br_vlan_group(br), pvid)) br_vlan_delete(br, pvid); list_for_each_entry(p, &br->port_list, list) { - if (vlan_default_pvid(nbp_get_vlan_info(p), pvid)) + if (vlan_default_pvid(nbp_vlan_group(p), pvid)) nbp_vlan_delete(p, pvid); } br->default_pvid = 0; } -static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) { + const struct net_bridge_vlan *pvent; + struct net_bridge_vlan_group *vg; struct net_bridge_port *p; u16 old_pvid; int err = 0; unsigned long *changed; + if (!pvid) { + br_vlan_disable_default_pvid(br); + return 0; + } + changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), GFP_KERNEL); if (!changed) @@ -540,11 +769,14 @@ static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) /* Update default_pvid config only if we do not conflict with * user configuration. 
*/ - if ((!old_pvid || vlan_default_pvid(br_get_vlan_info(br), old_pvid)) && - !br_vlan_find(br, pvid)) { + vg = br_vlan_group(br); + pvent = br_vlan_find(vg, pvid); + if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && + (!pvent || !br_vlan_should_use(pvent))) { err = br_vlan_add(br, pvid, BRIDGE_VLAN_INFO_PVID | - BRIDGE_VLAN_INFO_UNTAGGED); + BRIDGE_VLAN_INFO_UNTAGGED | + BRIDGE_VLAN_INFO_BRENTRY); if (err) goto out; br_vlan_delete(br, old_pvid); @@ -555,9 +787,10 @@ static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) /* Update default_pvid config only if we do not conflict with * user configuration. */ + vg = nbp_vlan_group(p); if ((old_pvid && - !vlan_default_pvid(nbp_get_vlan_info(p), old_pvid)) || - nbp_vlan_find(p, pvid)) + !vlan_default_pvid(vg, old_pvid)) || + br_vlan_find(vg, pvid)) continue; err = nbp_vlan_add(p, pvid, @@ -591,7 +824,8 @@ err_port: if (old_pvid) br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | - BRIDGE_VLAN_INFO_UNTAGGED); + BRIDGE_VLAN_INFO_UNTAGGED | + BRIDGE_VLAN_INFO_BRENTRY); br_vlan_delete(br, pvid); } goto out; @@ -617,12 +851,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) err = -EPERM; goto unlock; } - - if (!pvid) - br_vlan_disable_default_pvid(br); - else - err = __br_vlan_set_default_pvid(br, pvid); - + err = __br_vlan_set_default_pvid(br, pvid); unlock: rtnl_unlock(); return err; @@ -630,10 +859,68 @@ unlock: int br_vlan_init(struct net_bridge *br) { + struct net_bridge_vlan_group *vg; + int ret = -ENOMEM; + + vg = kzalloc(sizeof(*vg), GFP_KERNEL); + if (!vg) + goto out; + ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); + if (ret) + goto err_rhtbl; + INIT_LIST_HEAD(&vg->vlan_list); br->vlan_proto = htons(ETH_P_8021Q); br->default_pvid = 1; - return br_vlan_add(br, 1, - BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED); + rcu_assign_pointer(br->vlgrp, vg); + ret = br_vlan_add(br, 1, + BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | + 
BRIDGE_VLAN_INFO_BRENTRY); + if (ret) + goto err_vlan_add; + +out: + return ret; + +err_vlan_add: + rhashtable_destroy(&vg->vlan_hash); +err_rhtbl: + kfree(vg); + + goto out; +} + +int nbp_vlan_init(struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg; + int ret = -ENOMEM; + + vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); + if (!vg) + goto out; + + ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); + if (ret) + goto err_rhtbl; + INIT_LIST_HEAD(&vg->vlan_list); + rcu_assign_pointer(p->vlgrp, vg); + if (p->br->default_pvid) { + ret = nbp_vlan_add(p, p->br->default_pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + if (ret) + goto err_vlan_add; + } +out: + return ret; + +err_vlan_add: + RCU_INIT_POINTER(p->vlgrp, NULL); + synchronize_rcu(); + rhashtable_destroy(&vg->vlan_hash); +err_rhtbl: + kfree(vg); + + goto out; } /* Must be protected by RTNL. @@ -641,35 +928,28 @@ int br_vlan_init(struct net_bridge *br) */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) { - struct net_port_vlans *pv = NULL; - int err; + struct net_bridge_vlan *vlan; + int ret; ASSERT_RTNL(); - pv = rtnl_dereference(port->vlan_info); - if (pv) - return __vlan_add(pv, vid, flags); - - /* Create port vlan infomration - */ - pv = kzalloc(sizeof(*pv), GFP_KERNEL); - if (!pv) { - err = -ENOMEM; - goto clean_up; + vlan = br_vlan_find(nbp_vlan_group(port), vid); + if (vlan) { + __vlan_add_flags(vlan, flags); + return 0; } - pv->port_idx = port->port_no; - pv->parent.port = port; - err = __vlan_add(pv, vid, flags); - if (err) - goto clean_up; + vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); + if (!vlan) + return -ENOMEM; - rcu_assign_pointer(port->vlan_info, pv); - return 0; + vlan->vid = vid; + vlan->port = port; + ret = __vlan_add(vlan, flags); + if (ret) + kfree(vlan); -clean_up: - kfree(pv); - return err; + return ret; } /* Must be protected by RTNL. 
@@ -677,60 +957,28 @@ clean_up: */ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) { - struct net_port_vlans *pv; + struct net_bridge_vlan *v; ASSERT_RTNL(); - pv = rtnl_dereference(port->vlan_info); - if (!pv) - return -EINVAL; - + v = br_vlan_find(nbp_vlan_group(port), vid); + if (!v) + return -ENOENT; br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); + br_fdb_delete_by_port(port->br, port, vid, 0); - return __vlan_del(pv, vid); + return __vlan_del(v); } void nbp_vlan_flush(struct net_bridge_port *port) { - struct net_port_vlans *pv; - u16 vid; + struct net_bridge_vlan_group *vg; ASSERT_RTNL(); - pv = rtnl_dereference(port->vlan_info); - if (!pv) - return; - - for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) - vlan_vid_del(port->dev, port->br->vlan_proto, vid); - - __vlan_flush(pv); -} - -bool nbp_vlan_find(struct net_bridge_port *port, u16 vid) -{ - struct net_port_vlans *pv; - bool found = false; - - rcu_read_lock(); - pv = rcu_dereference(port->vlan_info); - - if (!pv) - goto out; - - if (test_bit(vid, pv->vlan_bitmap)) - found = true; - -out: - rcu_read_unlock(); - return found; -} - -int nbp_vlan_init(struct net_bridge_port *p) -{ - return p->br->default_pvid ? - nbp_vlan_add(p, p->br->default_pvid, - BRIDGE_VLAN_INFO_PVID | - BRIDGE_VLAN_INFO_UNTAGGED) : - 0; + vg = nbp_vlan_group(port); + __vlan_flush(vg); + RCU_INIT_POINTER(port->vlgrp, NULL); + synchronize_rcu(); + __vlan_group_free(vg); } diff --git a/kernel/net/bridge/netfilter/ebt_log.c b/kernel/net/bridge/netfilter/ebt_log.c index 17f2e4bc2..0ad639a96 100644 --- a/kernel/net/bridge/netfilter/ebt_log.c +++ b/kernel/net/bridge/netfilter/ebt_log.c @@ -180,7 +180,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_log_info *info = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? 
par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_LOG; li.u.log.level = info->loglevel; diff --git a/kernel/net/bridge/netfilter/ebt_nflog.c b/kernel/net/bridge/netfilter/ebt_nflog.c index 59ac79520..548161506 100644 --- a/kernel/net/bridge/netfilter/ebt_nflog.c +++ b/kernel/net/bridge/netfilter/ebt_nflog.c @@ -24,7 +24,7 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nflog_info *info = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; diff --git a/kernel/net/bridge/netfilter/ebt_stp.c b/kernel/net/bridge/netfilter/ebt_stp.c index 071d87214..0c4057006 100644 --- a/kernel/net/bridge/netfilter/ebt_stp.c +++ b/kernel/net/bridge/netfilter/ebt_stp.c @@ -164,8 +164,10 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par) !(info->bitmask & EBT_STP_MASK)) return -EINVAL; /* Make sure the match only receives stp frames */ - if (!ether_addr_equal(e->destmac, bridge_ula) || - !ether_addr_equal(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC)) + if (!par->nft_compat && + (!ether_addr_equal(e->destmac, bridge_ula) || + !ether_addr_equal(e->destmsk, msk) || + !(e->bitmask & EBT_DESTMAC))) return -EINVAL; return 0; diff --git a/kernel/net/bridge/netfilter/ebtable_broute.c b/kernel/net/bridge/netfilter/ebtable_broute.c index d2cdf5d6e..ec94c6f1a 100644 --- a/kernel/net/bridge/netfilter/ebtable_broute.c +++ b/kernel/net/bridge/netfilter/ebtable_broute.c @@ -50,10 +50,14 @@ static const struct ebt_table broute_table = { static int ebt_broute(struct sk_buff *skb) { + struct nf_hook_state state; int ret; - ret = ebt_do_table(NF_BR_BROUTING, skb, skb->dev, NULL, - dev_net(skb->dev)->xt.broute_table); + nf_hook_state_init(&state, NULL, NF_BR_BROUTING, INT_MIN, + NFPROTO_BRIDGE, skb->dev, NULL, NULL, + dev_net(skb->dev), NULL); + + ret = ebt_do_table(skb, &state, 
state.net->xt.broute_table); if (ret == NF_DROP) return 1; /* route it */ return 0; /* bridge it */ diff --git a/kernel/net/bridge/netfilter/ebtable_filter.c b/kernel/net/bridge/netfilter/ebtable_filter.c index 8a3f63b2e..32eccd101 100644 --- a/kernel/net/bridge/netfilter/ebtable_filter.c +++ b/kernel/net/bridge/netfilter/ebtable_filter.c @@ -57,39 +57,34 @@ static const struct ebt_table frame_filter = { }; static unsigned int -ebt_in_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ebt_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, state->in, state->out, - dev_net(state->in)->xt.frame_filter); + return ebt_do_table(skb, state, state->net->xt.frame_filter); } static unsigned int -ebt_out_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ebt_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, state->in, state->out, - dev_net(state->out)->xt.frame_filter); + return ebt_do_table(skb, state, state->net->xt.frame_filter); } static struct nf_hook_ops ebt_ops_filter[] __read_mostly = { { .hook = ebt_in_hook, - .owner = THIS_MODULE, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { .hook = ebt_in_hook, - .owner = THIS_MODULE, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { .hook = ebt_out_hook, - .owner = THIS_MODULE, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_FILTER_OTHER, diff --git a/kernel/net/bridge/netfilter/ebtable_nat.c b/kernel/net/bridge/netfilter/ebtable_nat.c index c5ef5b1ab..ec55358f0 100644 --- a/kernel/net/bridge/netfilter/ebtable_nat.c +++ b/kernel/net/bridge/netfilter/ebtable_nat.c @@ -57,39 +57,34 @@ static struct ebt_table frame_nat = { }; static unsigned int -ebt_nat_in(const struct nf_hook_ops *ops, struct sk_buff *skb, +ebt_nat_in(void *priv, struct sk_buff *skb, const struct 
nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, state->in, state->out, - dev_net(state->in)->xt.frame_nat); + return ebt_do_table(skb, state, state->net->xt.frame_nat); } static unsigned int -ebt_nat_out(const struct nf_hook_ops *ops, struct sk_buff *skb, +ebt_nat_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, state->in, state->out, - dev_net(state->out)->xt.frame_nat); + return ebt_do_table(skb, state, state->net->xt.frame_nat); } static struct nf_hook_ops ebt_ops_nat[] __read_mostly = { { .hook = ebt_nat_out, - .owner = THIS_MODULE, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_NAT_DST_OTHER, }, { .hook = ebt_nat_out, - .owner = THIS_MODULE, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_NAT_SRC, }, { .hook = ebt_nat_in, - .owner = THIS_MODULE, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_NAT_DST_BRIDGED, diff --git a/kernel/net/bridge/netfilter/ebtables.c b/kernel/net/bridge/netfilter/ebtables.c index 91180a7fc..f46ca417b 100644 --- a/kernel/net/bridge/netfilter/ebtables.c +++ b/kernel/net/bridge/netfilter/ebtables.c @@ -6,7 +6,7 @@ * * ebtables.c,v 2.0, July, 2002 * - * This code is stongly inspired on the iptables code which is + * This code is strongly inspired by the iptables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. 
Neuling * * This program is free software; you can redistribute it and/or @@ -139,7 +139,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, ethproto = h->h_proto; if (e->bitmask & EBT_802_3) { - if (FWINV2(ntohs(ethproto) >= ETH_P_802_3_MIN, EBT_IPROTO)) + if (FWINV2(eth_proto_is_802_3(ethproto), EBT_IPROTO)) return 1; } else if (!(e->bitmask & EBT_NOPROTO) && FWINV2(e->ethproto != ethproto, EBT_IPROTO)) @@ -176,17 +176,18 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, return 0; } -static inline __pure +static inline struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry) { return (void *)entry + entry->next_offset; } /* Do some firewalling */ -unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - struct ebt_table *table) +unsigned int ebt_do_table(struct sk_buff *skb, + const struct nf_hook_state *state, + struct ebt_table *table) { + unsigned int hook = state->hook; int i, nentries; struct ebt_entry *point; struct ebt_counter *counter_base, *cb_base; @@ -199,8 +200,9 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, struct xt_action_param acpar; acpar.family = NFPROTO_BRIDGE; - acpar.in = in; - acpar.out = out; + acpar.net = state->net; + acpar.in = state->in; + acpar.out = state->out; acpar.hotdrop = false; acpar.hooknum = hook; @@ -220,7 +222,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, base = private->entries; i = 0; while (i < nentries) { - if (ebt_basic_match(point, skb, in, out)) + if (ebt_basic_match(point, skb, state->in, state->out)) goto letscontinue; if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0) diff --git a/kernel/net/bridge/netfilter/nf_tables_bridge.c b/kernel/net/bridge/netfilter/nf_tables_bridge.c index a343e6244..62f6b1b19 100644 --- a/kernel/net/bridge/netfilter/nf_tables_bridge.c +++ b/kernel/net/bridge/netfilter/nf_tables_bridge.c @@ -65,31 +65,29 @@ 
int nft_bridge_ip6hdr_validate(struct sk_buff *skb) EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate); static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { if (nft_bridge_iphdr_validate(skb)) - nft_set_pktinfo_ipv4(pkt, ops, skb, state); + nft_set_pktinfo_ipv4(pkt, skb, state); else - nft_set_pktinfo(pkt, ops, skb, state); + nft_set_pktinfo(pkt, skb, state); } static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_IPV6) if (nft_bridge_ip6hdr_validate(skb) && - nft_set_pktinfo_ipv6(pkt, ops, skb, state) == 0) + nft_set_pktinfo_ipv6(pkt, skb, state) == 0) return; #endif - nft_set_pktinfo(pkt, ops, skb, state); + nft_set_pktinfo(pkt, skb, state); } static unsigned int -nft_do_chain_bridge(const struct nf_hook_ops *ops, +nft_do_chain_bridge(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -97,17 +95,17 @@ nft_do_chain_bridge(const struct nf_hook_ops *ops, switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_bridge_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_bridge_set_pktinfo_ipv4(&pkt, skb, state); break; case htons(ETH_P_IPV6): - nft_bridge_set_pktinfo_ipv6(&pkt, ops, skb, state); + nft_bridge_set_pktinfo_ipv6(&pkt, skb, state); break; default: - nft_set_pktinfo(&pkt, ops, skb, state); + nft_set_pktinfo(&pkt, skb, state); break; } - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } static struct nft_af_info nft_af_bridge __read_mostly = { diff --git a/kernel/net/bridge/netfilter/nft_reject_bridge.c b/kernel/net/bridge/netfilter/nft_reject_bridge.c index 858d84856..fdba3d9fb 100644 --- a/kernel/net/bridge/netfilter/nft_reject_bridge.c +++ b/kernel/net/bridge/netfilter/nft_reject_bridge.c @@ -261,7 +261,6 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, const struct 
nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); const unsigned char *dest = eth_hdr(pkt->skb)->h_dest; if (is_broadcast_ether_addr(dest) || @@ -273,16 +272,16 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, - pkt->ops->hooknum, + pkt->hook, priv->icmp_code); break; case NFT_REJECT_TCP_RST: nft_reject_br_send_v4_tcp_reset(pkt->skb, pkt->in, - pkt->ops->hooknum); + pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, - pkt->ops->hooknum, + pkt->hook, nft_reject_icmp_code(priv->icmp_code)); break; } @@ -290,17 +289,17 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, case htons(ETH_P_IPV6): switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in, - pkt->ops->hooknum, + nft_reject_br_send_v6_unreach(pkt->net, pkt->skb, + pkt->in, pkt->hook, priv->icmp_code); break; case NFT_REJECT_TCP_RST: - nft_reject_br_send_v6_tcp_reset(net, pkt->skb, pkt->in, - pkt->ops->hooknum); + nft_reject_br_send_v6_tcp_reset(pkt->net, pkt->skb, + pkt->in, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: - nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in, - pkt->ops->hooknum, + nft_reject_br_send_v6_unreach(pkt->net, pkt->skb, + pkt->in, pkt->hook, nft_reject_icmpv6_code(priv->icmp_code)); break; } diff --git a/kernel/net/caif/caif_dev.c b/kernel/net/caif/caif_dev.c index edbca468f..d730a0f68 100644 --- a/kernel/net/caif/caif_dev.c +++ b/kernel/net/caif/caif_dev.c @@ -177,7 +177,7 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt) skb->protocol = htons(ETH_P_CAIF); /* Check if we need to handle xoff */ - if (likely(caifd->netdev->tx_queue_len == 0)) + if (likely(caifd->netdev->priv_flags & IFF_NO_QUEUE)) goto noxoff; if (unlikely(caifd->xoff)) diff --git 
a/kernel/net/caif/caif_socket.c b/kernel/net/caif/caif_socket.c index 112ad7848..aa209b106 100644 --- a/kernel/net/caif/caif_socket.c +++ b/kernel/net/caif/caif_socket.c @@ -121,12 +121,13 @@ static void caif_flow_ctrl(struct sock *sk, int mode) * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are * not dropped, but CAIF is sending flow off instead. */ -static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static void caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + bool queued = false; if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { @@ -139,7 +140,8 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) err = sk_filter(sk, skb); if (err) - return err; + goto out; + if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); @@ -147,21 +149,16 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } skb->dev = NULL; skb_set_owner_r(skb, sk); - /* Cache the SKB length before we tack it onto the receive - * queue. Once it is added it no longer belongs to us and - * may be freed by other threads of control pulling packets - * from the queue. 
- */ spin_lock_irqsave(&list->lock, flags); - if (!sock_flag(sk, SOCK_DEAD)) + queued = !sock_flag(sk, SOCK_DEAD); + if (queued) __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); - - if (!sock_flag(sk, SOCK_DEAD)) +out: + if (queued) sk->sk_data_ready(sk); else kfree_skb(skb); - return 0; } /* Packet Receive Callback function called from CAIF Stack */ @@ -326,7 +323,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); @@ -334,7 +331,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); @@ -1055,7 +1052,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol, * is really not used at all in the net/core or socket.c but the * initialization makes sure that sock->state is not uninitialized. 
*/ - sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot); + sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern); if (!sk) return -ENOMEM; diff --git a/kernel/net/can/af_can.c b/kernel/net/can/af_can.c index 62c635f2b..166d43619 100644 --- a/kernel/net/can/af_can.c +++ b/kernel/net/can/af_can.c @@ -181,7 +181,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol, sock->ops = cp->ops; - sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot); + sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; diff --git a/kernel/net/can/bcm.c b/kernel/net/can/bcm.c index a1ba6875c..6863310d6 100644 --- a/kernel/net/can/bcm.c +++ b/kernel/net/can/bcm.c @@ -96,7 +96,7 @@ struct bcm_op { canid_t can_id; u32 flags; unsigned long frames_abs, frames_filtered; - struct timeval ival1, ival2; + struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; struct tasklet_struct tsklet, thrtsklet; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; @@ -131,6 +131,11 @@ static inline struct bcm_sock *bcm_sk(const struct sock *sk) return (struct bcm_sock *)sk; } +static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) +{ + return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); +} + #define CFSIZ sizeof(struct can_frame) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) @@ -953,8 +958,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->count = msg_head->count; op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; - op->kt_ival1 = timeval_to_ktime(msg_head->ival1); - op->kt_ival2 = timeval_to_ktime(msg_head->ival2); + op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); + op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero values? 
*/ if (!op->kt_ival1.tv64 && !op->kt_ival2.tv64) @@ -1134,8 +1139,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* set timer value */ op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; - op->kt_ival1 = timeval_to_ktime(msg_head->ival1); - op->kt_ival2 = timeval_to_ktime(msg_head->ival2); + op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); + op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero value? */ if (!op->kt_ival1.tv64) diff --git a/kernel/net/can/gw.c b/kernel/net/can/gw.c index a6f448e18..455168718 100644 --- a/kernel/net/can/gw.c +++ b/kernel/net/can/gw.c @@ -110,6 +110,7 @@ struct cf_mod { void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; + u32 uid; }; @@ -548,6 +549,11 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } + if (gwj->mod.uid) { + if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) + goto cancel; + } + if (gwj->mod.csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, &gwj->mod.csum.crc8) < 0) @@ -619,6 +625,7 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = { [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, + [CGW_MOD_UID] = { .type = NLA_U32 }, }; /* check for common and gwtype specific attributes */ @@ -761,6 +768,10 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, else mod->csumfunc.xor = cgw_csum_xor_neg; } + + if (tb[CGW_MOD_UID]) { + nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); + } } if (gwtype == CGW_TYPE_CAN_CAN) { @@ -802,6 +813,8 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) { struct rtcanmsg *r; struct cgw_job *gwj; + struct cf_mod mod; + struct can_can_gw ccgw; u8 limhops = 0; int err = 0; @@ -819,6 +832,36 @@ static int cgw_create_job(struct sk_buff *skb, struct 
nlmsghdr *nlh) if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; + err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); + if (err < 0) + return err; + + if (mod.uid) { + + ASSERT_RTNL(); + + /* check for updating an existing job with identical uid */ + hlist_for_each_entry(gwj, &cgw_list, list) { + + if (gwj->mod.uid != mod.uid) + continue; + + /* interfaces & filters must be identical */ + if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) + return -EINVAL; + + /* update modifications with disabled softirq & quit */ + local_bh_disable(); + memcpy(&gwj->mod, &mod, sizeof(mod)); + local_bh_enable(); + return 0; + } + } + + /* ifindex == 0 is not allowed for job creation */ + if (!ccgw.src_idx || !ccgw.dst_idx) + return -ENODEV; + gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) return -ENOMEM; @@ -828,18 +871,14 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; + gwj->limit_hops = limhops; - err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw, - &limhops); - if (err < 0) - goto out; + /* insert already parsed information */ + memcpy(&gwj->mod, &mod, sizeof(mod)); + memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; - /* ifindex == 0 is not allowed for job creation */ - if (!gwj->ccgw.src_idx || !gwj->ccgw.dst_idx) - goto out; - gwj->src.dev = __dev_get_by_index(&init_net, gwj->ccgw.src_idx); if (!gwj->src.dev) @@ -856,8 +895,6 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->dst.dev->type != ARPHRD_CAN) goto out; - gwj->limit_hops = limhops; - ASSERT_RTNL(); err = cgw_register_filter(gwj); @@ -931,8 +968,15 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->limit_hops != limhops) continue; - if (memcmp(&gwj->mod, &mod, sizeof(mod))) - continue; + /* we have a match when uid is enabled and identical */ + if (gwj->mod.uid || mod.uid) { + if (gwj->mod.uid != mod.uid) + continue; 
+ } else { + /* no uid => check for identical modifications */ + if (memcmp(&gwj->mod, &mod, sizeof(mod))) + continue; + } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) diff --git a/kernel/net/ceph/auth_x.c b/kernel/net/ceph/auth_x.c index ba6eb1722..10d87753e 100644 --- a/kernel/net/ceph/auth_x.c +++ b/kernel/net/ceph/auth_x.c @@ -8,6 +8,7 @@ #include #include +#include #include #include "crypto.h" @@ -279,6 +280,15 @@ bad: return -EINVAL; } +static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au) +{ + ceph_crypto_key_destroy(&au->session_key); + if (au->buf) { + ceph_buffer_put(au->buf); + au->buf = NULL; + } +} + static int ceph_x_build_authorizer(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th, struct ceph_x_authorizer *au) @@ -297,7 +307,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, ceph_crypto_key_destroy(&au->session_key); ret = ceph_crypto_key_clone(&au->session_key, &th->session_key); if (ret) - return ret; + goto out_au; maxlen = sizeof(*msg_a) + sizeof(msg_b) + ceph_x_encrypt_buflen(ticket_blob_len); @@ -309,8 +319,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, if (!au->buf) { au->buf = ceph_buffer_new(maxlen, GFP_NOFS); if (!au->buf) { - ceph_crypto_key_destroy(&au->session_key); - return -ENOMEM; + ret = -ENOMEM; + goto out_au; } } au->service = th->service; @@ -340,7 +350,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b), p, end - p); if (ret < 0) - goto out_buf; + goto out_au; p += ret; au->buf->vec.iov_len = p - au->buf->vec.iov_base; dout(" built authorizer nonce %llx len %d\n", au->nonce, @@ -348,9 +358,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, BUG_ON(au->buf->vec.iov_len > maxlen); return 0; -out_buf: - ceph_buffer_put(au->buf); - au->buf = NULL; +out_au: + ceph_x_authorizer_cleanup(au); return ret; } @@ -624,8 
+633,7 @@ static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, { struct ceph_x_authorizer *au = (void *)a; - ceph_crypto_key_destroy(&au->session_key); - ceph_buffer_put(au->buf); + ceph_x_authorizer_cleanup(au); kfree(au); } @@ -653,8 +661,7 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } - if (xi->auth_authorizer.buf) - ceph_buffer_put(xi->auth_authorizer.buf); + ceph_x_authorizer_cleanup(&xi->auth_authorizer); kfree(ac->private); ac->private = NULL; @@ -691,8 +698,10 @@ static int ceph_x_sign_message(struct ceph_auth_handshake *auth, struct ceph_msg *msg) { int ret; - if (!auth->authorizer) + + if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN)) return 0; + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, msg, &msg->footer.sig); if (ret < 0) @@ -707,8 +716,9 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, __le64 sig_check; int ret; - if (!auth->authorizer) + if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN)) return 0; + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, msg, &sig_check); if (ret < 0) diff --git a/kernel/net/ceph/ceph_common.c b/kernel/net/ceph/ceph_common.c index 3f76eb84b..bcbec33c6 100644 --- a/kernel/net/ceph/ceph_common.c +++ b/kernel/net/ceph/ceph_common.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -16,8 +17,6 @@ #include #include #include -#include -#include #include @@ -131,6 +130,13 @@ int ceph_compare_options(struct ceph_options *new_opt, int i; int ret; + /* + * Don't bother comparing options if network namespaces don't + * match. 
+ */ + if (!net_eq(current->nsproxy->net_ns, read_pnet(&client->msgr.net))) + return -1; + ret = memcmp(opt1, opt2, ofs); if (ret) return ret; @@ -239,6 +245,8 @@ enum { Opt_nocrc, Opt_cephx_require_signatures, Opt_nocephx_require_signatures, + Opt_cephx_sign_messages, + Opt_nocephx_sign_messages, Opt_tcp_nodelay, Opt_notcp_nodelay, }; @@ -261,6 +269,8 @@ static match_table_t opt_tokens = { {Opt_nocrc, "nocrc"}, {Opt_cephx_require_signatures, "cephx_require_signatures"}, {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, + {Opt_cephx_sign_messages, "cephx_sign_messages"}, + {Opt_nocephx_sign_messages, "nocephx_sign_messages"}, {Opt_tcp_nodelay, "tcp_nodelay"}, {Opt_notcp_nodelay, "notcp_nodelay"}, {-1, NULL} @@ -312,7 +322,7 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) { goto out; } - ckey = ukey->payload.data; + ckey = ukey->payload.data[0]; err = ceph_crypto_key_clone(dst, ckey); if (err) goto out_key; @@ -335,9 +345,6 @@ ceph_parse_options(char *options, const char *dev_name, int err = -ENOMEM; substring_t argstr[MAX_OPT_ARGS]; - if (current->nsproxy->net_ns != &init_net) - return ERR_PTR(-EINVAL); - opt = kzalloc(sizeof(*opt), GFP_KERNEL); if (!opt) return ERR_PTR(-ENOMEM); @@ -352,8 +359,9 @@ ceph_parse_options(char *options, const char *dev_name, /* start with defaults */ opt->flags = CEPH_OPT_DEFAULT; opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; - opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ - opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ + opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; + opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; + opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT; /* get mon ip(s) */ /* ip1[:port1][,ip2[:port2]...] 
*/ @@ -439,13 +447,32 @@ ceph_parse_options(char *options, const char *dev_name, pr_warn("ignoring deprecated osdtimeout option\n"); break; case Opt_osdkeepalivetimeout: - opt->osd_keepalive_timeout = intval; + /* 0 isn't well defined right now, reject it */ + if (intval < 1 || intval > INT_MAX / 1000) { + pr_err("osdkeepalive out of range\n"); + err = -EINVAL; + goto out; + } + opt->osd_keepalive_timeout = + msecs_to_jiffies(intval * 1000); break; case Opt_osd_idle_ttl: - opt->osd_idle_ttl = intval; + /* 0 isn't well defined right now, reject it */ + if (intval < 1 || intval > INT_MAX / 1000) { + pr_err("osd_idle_ttl out of range\n"); + err = -EINVAL; + goto out; + } + opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000); break; case Opt_mount_timeout: - opt->mount_timeout = intval; + /* 0 is "wait forever" (i.e. infinite timeout) */ + if (intval < 0 || intval > INT_MAX / 1000) { + pr_err("mount_timeout out of range\n"); + err = -EINVAL; + goto out; + } + opt->mount_timeout = msecs_to_jiffies(intval * 1000); break; case Opt_share: @@ -468,6 +495,12 @@ ceph_parse_options(char *options, const char *dev_name, case Opt_nocephx_require_signatures: opt->flags |= CEPH_OPT_NOMSGAUTH; break; + case Opt_cephx_sign_messages: + opt->flags &= ~CEPH_OPT_NOMSGSIGN; + break; + case Opt_nocephx_sign_messages: + opt->flags |= CEPH_OPT_NOMSGSIGN; + break; case Opt_tcp_nodelay: opt->flags |= CEPH_OPT_TCP_NODELAY; @@ -511,16 +544,20 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) seq_puts(m, "nocrc,"); if (opt->flags & CEPH_OPT_NOMSGAUTH) seq_puts(m, "nocephx_require_signatures,"); + if (opt->flags & CEPH_OPT_NOMSGSIGN) + seq_puts(m, "nocephx_sign_messages,"); if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) seq_puts(m, "notcp_nodelay,"); if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, "mount_timeout=%d,", opt->mount_timeout); + seq_printf(m, "mount_timeout=%d,", + jiffies_to_msecs(opt->mount_timeout) / 1000); if (opt->osd_idle_ttl != 
CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl); + seq_printf(m, "osd_idle_ttl=%d,", + jiffies_to_msecs(opt->osd_idle_ttl) / 1000); if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) seq_printf(m, "osdkeepalivetimeout=%d,", - opt->osd_keepalive_timeout); + jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000); /* drop redundant comma */ if (m->count != pos) @@ -571,11 +608,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, if (ceph_test_opt(client, MYIP)) myaddr = &client->options->my_addr; - ceph_messenger_init(&client->msgr, myaddr, - client->supported_features, - client->required_features, - ceph_test_opt(client, NOCRC), - ceph_test_opt(client, TCP_NODELAY)); + ceph_messenger_init(&client->msgr, myaddr); /* subsystems */ err = ceph_monc_init(&client->monc, client); @@ -590,6 +623,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, fail_monc: ceph_monc_stop(&client->monc); fail: + ceph_messenger_fini(&client->msgr); kfree(client); return ERR_PTR(err); } @@ -603,8 +637,8 @@ void ceph_destroy_client(struct ceph_client *client) /* unmount */ ceph_osdc_stop(&client->osdc); - ceph_monc_stop(&client->monc); + ceph_messenger_fini(&client->msgr); ceph_debugfs_client_cleanup(client); @@ -629,8 +663,8 @@ static int have_mon_and_osd_map(struct ceph_client *client) */ int __ceph_open_session(struct ceph_client *client, unsigned long started) { - int err; - unsigned long timeout = client->options->mount_timeout * HZ; + unsigned long timeout = client->options->mount_timeout; + long err; /* open session, and wait for mon and osd maps */ err = ceph_monc_open_session(&client->monc); @@ -638,16 +672,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) return err; while (!have_mon_and_osd_map(client)) { - err = -EIO; if (timeout && time_after_eq(jiffies, started + timeout)) - return err; + return -ETIMEDOUT; /* wait */ dout("mount waiting for 
mon_map\n"); err = wait_event_interruptible_timeout(client->auth_wq, have_mon_and_osd_map(client) || (client->auth_err < 0), - timeout); - if (err == -EINTR || err == -ERESTARTSYS) + ceph_timeout_jiffies(timeout)); + if (err < 0) return err; if (client->auth_err < 0) return client->auth_err; @@ -724,5 +757,5 @@ module_exit(exit_ceph_lib); MODULE_AUTHOR("Sage Weil "); MODULE_AUTHOR("Yehuda Sadeh "); MODULE_AUTHOR("Patience Warnick "); -MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_DESCRIPTION("Ceph core library"); MODULE_LICENSE("GPL"); diff --git a/kernel/net/ceph/crush/crush.c b/kernel/net/ceph/crush/crush.c index 9d84ce4ea..80d7c3a97 100644 --- a/kernel/net/ceph/crush/crush.c +++ b/kernel/net/ceph/crush/crush.c @@ -1,15 +1,11 @@ - #ifdef __KERNEL__ # include +# include #else -# include -# include -# define kfree(x) do { if (x) free(x); } while (0) -# define BUG_ON(x) assert(!(x)) +# include "crush_compat.h" +# include "crush.h" #endif -#include - const char *crush_bucket_alg_name(int alg) { switch (alg) { @@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map) kfree(map->rules); } +#ifndef __KERNEL__ + kfree(map->choose_tries); +#endif kfree(map); } diff --git a/kernel/net/ceph/crush/crush_ln_table.h b/kernel/net/ceph/crush/crush_ln_table.h index 6192c7fc9..aae534c90 100644 --- a/kernel/net/ceph/crush/crush_ln_table.h +++ b/kernel/net/ceph/crush/crush_ln_table.h @@ -10,20 +10,20 @@ * */ -#if defined(__linux__) -#include -#elif defined(__FreeBSD__) -#include -#endif - #ifndef CEPH_CRUSH_LN_H #define CEPH_CRUSH_LN_H +#ifdef __KERNEL__ +# include +#else +# include "crush_compat.h" +#endif -// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0) -// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0) - -static int64_t __RH_LH_tbl[128*2+2] = { +/* + * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0) + * RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0) + */ +static __s64 __RH_LH_tbl[128*2+2] = { 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll, 
0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all, 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll, @@ -89,11 +89,12 @@ static int64_t __RH_LH_tbl[128*2+2] = { 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll, 0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll, 0x0000800000000000ll, 0x0000ffff00000000ll, - }; - +}; - // LL_tbl[k] = 2^48*log2(1.0+k/2^15); -static int64_t __LL_tbl[256] = { +/* + * LL_tbl[k] = 2^48*log2(1.0+k/2^15) + */ +static __s64 __LL_tbl[256] = { 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull, 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull, 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull, @@ -160,7 +161,4 @@ static int64_t __LL_tbl[256] = { 0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull, }; - - - #endif diff --git a/kernel/net/ceph/crush/hash.c b/kernel/net/ceph/crush/hash.c index 5bb63e37a..ed123af49 100644 --- a/kernel/net/ceph/crush/hash.c +++ b/kernel/net/ceph/crush/hash.c @@ -1,6 +1,8 @@ - -#include -#include +#ifdef __KERNEL__ +# include +#else +# include "hash.h" +#endif /* * Robert Jenkins' function for mixing 32-bit values diff --git a/kernel/net/ceph/crush/mapper.c b/kernel/net/ceph/crush/mapper.c index 5b47736d2..393bfb22d 100644 --- a/kernel/net/ceph/crush/mapper.c +++ b/kernel/net/ceph/crush/mapper.c @@ -1,27 +1,31 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel Corporation All Rights Reserved + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ #ifdef __KERNEL__ # include # include # include # include -# ifndef dprintk -# define dprintk(args...) -# endif +# include +# include #else -# include -# include -# include -# include -# define BUG_ON(x) assert(!(x)) -# define dprintk(args...) /* printf(args) */ -# define kmalloc(x, f) malloc(x) -# define kfree(x) free(x) +# include "crush_compat.h" +# include "crush.h" +# include "hash.h" #endif - -#include -#include #include "crush_ln_table.h" +#define dprintk(args...) /* printf(args) */ + /* * Implement the core CRUSH mapping algorithm. */ @@ -139,7 +143,7 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, int i; for (i = bucket->h.size-1; i >= 0; i--) { - __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], + __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i], r, bucket->h.id); w &= 0xffff; dprintk("list_choose i=%d x=%d r=%d item %d weight %x " @@ -238,43 +242,46 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, return bucket->h.items[high]; } -// compute 2^44*log2(input+1) -uint64_t crush_ln(unsigned xin) +/* compute 2^44*log2(input+1) */ +static __u64 crush_ln(unsigned int xin) { - unsigned x=xin, x1; - int iexpon, index1, index2; - uint64_t RH, LH, LL, xl64, result; + unsigned int x = xin, x1; + int iexpon, index1, index2; + __u64 RH, LH, LL, xl64, result; - x++; + x++; - // normalize input - iexpon = 15; - while(!(x&0x18000)) { x<<=1; iexpon--; } + /* normalize input */ + iexpon = 15; + while (!(x & 0x18000)) { + x <<= 1; + iexpon--; + } - index1 = (x>>8)<<1; - // RH ~ 2^56/index1 - RH = __RH_LH_tbl[index1 - 256]; - // LH ~ 2^48 * log2(index1/256) - LH = __RH_LH_tbl[index1 + 1 - 256]; + index1 = (x >> 8) << 1; + /* RH ~ 2^56/index1 */ + RH = __RH_LH_tbl[index1 - 256]; + /* LH ~ 2^48 * log2(index1/256) */ + LH = __RH_LH_tbl[index1 + 1 - 256]; - // RH*x ~ 2^48 * (2^15 + xf), xf<2^8 - xl64 = (int64_t)x * RH; - xl64 >>= 48; - x1 = xl64; + /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */ + xl64 = (__s64)x 
* RH; + xl64 >>= 48; + x1 = xl64; - result = iexpon; - result <<= (12 + 32); + result = iexpon; + result <<= (12 + 32); - index2 = x1 & 0xff; - // LL ~ 2^48*log2(1.0+index2/2^15) - LL = __LL_tbl[index2]; + index2 = x1 & 0xff; + /* LL ~ 2^48*log2(1.0+index2/2^15) */ + LL = __LL_tbl[index2]; - LH = LH + LL; + LH = LH + LL; - LH >>= (48-12 - 32); - result += LH; + LH >>= (48 - 12 - 32); + result += LH; - return result; + return result; } @@ -290,9 +297,9 @@ uint64_t crush_ln(unsigned xin) static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, int x, int r) { - unsigned i, high = 0; - unsigned u; - unsigned w; + unsigned int i, high = 0; + unsigned int u; + unsigned int w; __s64 ln, draw, high_draw = 0; for (i = 0; i < bucket->h.size; i++) { @@ -567,6 +574,10 @@ reject: out[outpos] = item; outpos++; count--; +#ifndef __KERNEL__ + if (map->choose_tries && ftotal <= map->choose_total_tries) + map->choose_tries[ftotal]++; +#endif } dprintk("CHOOSE returns %d\n", outpos); @@ -610,6 +621,20 @@ static void crush_choose_indep(const struct crush_map *map, } for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { +#ifdef DEBUG_INDEP + if (out2 && ftotal) { + dprintk("%u %d a: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out[rep]); + } + dprintk("\n"); + dprintk("%u %d b: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out2[rep]); + } + dprintk("\n"); + } +#endif for (rep = outpos; rep < endpos; rep++) { if (out[rep] != CRUSH_ITEM_UNDEF) continue; @@ -726,6 +751,24 @@ static void crush_choose_indep(const struct crush_map *map, out2[rep] = CRUSH_ITEM_NONE; } } +#ifndef __KERNEL__ + if (map->choose_tries && ftotal <= map->choose_total_tries) + map->choose_tries[ftotal]++; +#endif +#ifdef DEBUG_INDEP + if (out2) { + dprintk("%u %d a: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out[rep]); + } + dprintk("\n"); + dprintk("%u %d b: ", ftotal, left); + for (rep = outpos; rep 
< endpos; rep++) { + dprintk(" %d", out2[rep]); + } + dprintk("\n"); + } +#endif } /** @@ -790,8 +833,15 @@ int crush_do_rule(const struct crush_map *map, switch (curstep->op) { case CRUSH_RULE_TAKE: - w[0] = curstep->arg1; - wsize = 1; + if ((curstep->arg1 >= 0 && + curstep->arg1 < map->max_devices) || + (-1-curstep->arg1 < map->max_buckets && + map->buckets[-1-curstep->arg1])) { + w[0] = curstep->arg1; + wsize = 1; + } else { + dprintk(" bad take value %d\n", curstep->arg1); + } break; case CRUSH_RULE_SET_CHOOSE_TRIES: @@ -877,7 +927,7 @@ int crush_do_rule(const struct crush_map *map, 0); } else { out_size = ((numrep < (result_max-osize)) ? - numrep : (result_max-osize)); + numrep : (result_max-osize)); crush_choose_indep( map, map->buckets[-1-w[i]], @@ -923,5 +973,3 @@ int crush_do_rule(const struct crush_map *map, } return result_len; } - - diff --git a/kernel/net/ceph/crypto.c b/kernel/net/ceph/crypto.c index 790fe89d9..42e8649c6 100644 --- a/kernel/net/ceph/crypto.c +++ b/kernel/net/ceph/crypto.c @@ -79,10 +79,6 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) return 0; } - - -#define AES_KEY_SIZE 16 - static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) { return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); @@ -541,7 +537,7 @@ static int ceph_key_preparse(struct key_preparsed_payload *prep) if (ret < 0) goto err_ckey; - prep->payload[0] = ckey; + prep->payload.data[0] = ckey; prep->quotalen = datalen; return 0; @@ -553,14 +549,14 @@ err: static void ceph_key_free_preparse(struct key_preparsed_payload *prep) { - struct ceph_crypto_key *ckey = prep->payload[0]; + struct ceph_crypto_key *ckey = prep->payload.data[0]; ceph_crypto_key_destroy(ckey); kfree(ckey); } static void ceph_key_destroy(struct key *key) { - struct ceph_crypto_key *ckey = key->payload.data; + struct ceph_crypto_key *ckey = key->payload.data[0]; ceph_crypto_key_destroy(ckey); kfree(ckey); diff --git a/kernel/net/ceph/crypto.h 
b/kernel/net/ceph/crypto.h index d1498224c..2e9cab09f 100644 --- a/kernel/net/ceph/crypto.h +++ b/kernel/net/ceph/crypto.h @@ -16,8 +16,10 @@ struct ceph_crypto_key { static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { - if (key) + if (key) { kfree(key->key); + key->key = NULL; + } } int ceph_crypto_key_clone(struct ceph_crypto_key *dst, diff --git a/kernel/net/ceph/messenger.c b/kernel/net/ceph/messenger.c index 967080a9f..63ae5dd24 100644 --- a/kernel/net/ceph/messenger.c +++ b/kernel/net/ceph/messenger.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -162,6 +163,7 @@ static struct kmem_cache *ceph_msg_data_cache; static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; @@ -175,7 +177,7 @@ static struct lock_class_key socket_class; static void queue_con(struct ceph_connection *con); static void cancel_con(struct ceph_connection *con); -static void con_work(struct work_struct *); +static void ceph_con_workfn(struct work_struct *); static void con_fault(struct ceph_connection *con); /* @@ -275,23 +277,22 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - ceph_msgr_slab_exit(); - BUG_ON(zero_page == NULL); - kunmap(zero_page); page_cache_release(zero_page); zero_page = NULL; + + ceph_msgr_slab_exit(); } int ceph_msgr_init(void) { + if (ceph_msgr_slab_init()) + return -ENOMEM; + BUG_ON(zero_page != NULL); zero_page = ZERO_PAGE(0); page_cache_get(zero_page); - if (ceph_msgr_slab_init()) - return -ENOMEM; - /* * The number of active work items is limited by the number of * connections, so leave @max_active at default. 
@@ -480,8 +481,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) int ret; BUG_ON(con->sock); - ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; @@ -508,7 +509,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } - if (con->msgr->tcp_nodelay) { + if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) { int optval = 1; ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, @@ -636,9 +637,6 @@ static int con_close_socket(struct ceph_connection *con) static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); - BUG_ON(msg->con == NULL); - msg->con->ops->put(msg->con); - msg->con = NULL; ceph_msg_put(msg); } @@ -661,20 +659,21 @@ static void reset_connection(struct ceph_connection *con) if (con->in_msg) { BUG_ON(con->in_msg->con != con); - con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->ops->put(con); } con->connect_seq = 0; con->out_seq = 0; if (con->out_msg) { + BUG_ON(con->out_msg->con != con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } con->in_seq = 0; con->in_seq_acked = 0; + + con->out_skip = 0; } /* @@ -749,7 +748,7 @@ void ceph_con_init(struct ceph_connection *con, void *private, mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); - INIT_DELAYED_WORK(&con->work, con_work); + INIT_DELAYED_WORK(&con->work, ceph_con_workfn); con->state = CON_STATE_CLOSED; } @@ -774,6 +773,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) static void con_out_kvec_reset(struct ceph_connection *con) { + BUG_ON(con->out_skip); + con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; @@ -782,9 +783,9 @@ static void con_out_kvec_reset(struct ceph_connection *con) static void con_out_kvec_add(struct 
ceph_connection *con, size_t size, void *data) { - int index; + int index = con->out_kvec_left; - index = con->out_kvec_left; + BUG_ON(con->out_skip); BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); con->out_kvec[index].iov_len = size; @@ -793,6 +794,27 @@ static void con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. + */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int off = con->out_kvec_cur - con->out_kvec; + int skip = 0; + + if (con->out_kvec_bytes > 0) { + skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; + BUG_ON(con->out_kvec_bytes < skip); + BUG_ON(!con->out_kvec_left); + con->out_kvec_bytes -= skip; + con->out_kvec_left--; + } + + return skip; +} + #ifdef CONFIG_BLOCK /* @@ -1178,6 +1200,13 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, return new_piece; } +static size_t sizeof_footer(struct ceph_connection *con) +{ + return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? + sizeof(struct ceph_msg_footer) : + sizeof(struct ceph_msg_footer_old); +} + static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { BUG_ON(!msg); @@ -1200,11 +1229,10 @@ static void prepare_write_message_footer(struct ceph_connection *con) m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); - con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) - con->ops->sign_message(con, m); + con->ops->sign_message(m); else m->footer.sig = 0; con->out_kvec[v].iov_len = sizeof(m->footer); @@ -1228,7 +1256,6 @@ static void prepare_write_message(struct ceph_connection *con) u32 crc; con_out_kvec_reset(con); - con->out_kvec_is_msg = true; con->out_msg_done = false; /* Sneak an ack in there first? 
If we can get it into the same @@ -1268,18 +1295,19 @@ static void prepare_write_message(struct ceph_connection *con) /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); - /* fill in crc (except data pages), footer */ + /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = 0; + memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); if (m->middle) { @@ -1291,6 +1319,7 @@ static void prepare_write_message(struct ceph_connection *con) dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; /* is there a data payload? 
*/ con->out_msg->footer.data_crc = 0; @@ -1351,7 +1380,16 @@ static void prepare_write_keepalive(struct ceph_connection *con) { dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); - con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); + if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { + struct timespec now = CURRENT_TIME; + + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); + ceph_encode_timespec(&con->out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), + &con->out_temp_keepalive2); + } else { + con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); + } con_flag_set(con, CON_FLAG_WRITE_PENDING); } @@ -1422,7 +1460,8 @@ static int prepare_write_connect(struct ceph_connection *con) dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = cpu_to_le64(con->msgr->supported_features); + con->out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1485,7 +1524,6 @@ static int write_partial_kvec(struct ceph_connection *con) } } con->out_kvec_left = 0; - con->out_kvec_is_msg = false; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, @@ -1517,7 +1555,7 @@ static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !con->msgr->nocrc; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); u32 crc; dout("%s %p msg %p\n", __func__, con, msg); @@ -1542,10 +1580,10 @@ static int write_partial_message_data(struct ceph_connection *con) bool need_crc; int ret; - page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, - &last_piece); + page = 
ceph_msg_data_next(cursor, &page_offset, &length, + &last_piece); ret = ceph_tcp_sendpage(con->sock, page, page_offset, - length, last_piece); + length, !last_piece); if (ret <= 0) { if (do_datacrc) msg->footer.data_crc = cpu_to_le32(crc); @@ -1554,7 +1592,7 @@ static int write_partial_message_data(struct ceph_connection *con) } if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); - need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret); + need_crc = ceph_msg_data_advance(cursor, (size_t)ret); } dout("%s %p msg %p done\n", __func__, con, msg); @@ -1577,6 +1615,7 @@ static int write_partial_skip(struct ceph_connection *con) { int ret; + dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); @@ -1625,6 +1664,12 @@ static void prepare_read_tag(struct ceph_connection *con) con->in_tag = CEPH_MSGR_TAG_READY; } +static void prepare_read_keepalive_ack(struct ceph_connection *con) +{ + dout("prepare_read_keepalive_ack %p\n", con); + con->in_base_pos = 0; +} + /* * Prepare to read a message. 
*/ @@ -1732,17 +1777,17 @@ static int verify_hello(struct ceph_connection *con) static bool addr_is_blank(struct sockaddr_storage *ss) { + struct in_addr *addr = &((struct sockaddr_in *)ss)->sin_addr; + struct in6_addr *addr6 = &((struct sockaddr_in6 *)ss)->sin6_addr; + switch (ss->ss_family) { case AF_INET: - return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0; + return addr->s_addr == htonl(INADDR_ANY); case AF_INET6: - return - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0; + return ipv6_addr_any(addr6); + default: + return true; } - return false; } static int addr_port(struct sockaddr_storage *ss) @@ -1989,8 +2034,8 @@ static int process_banner(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = con->msgr->supported_features; - u64 req_feat = con->msgr->required_features; + u64 sup_feat = from_msgr(con->msgr)->supported_features; + u64 req_feat = from_msgr(con->msgr)->required_features; u64 server_feat = ceph_sanitize_features( le64_to_cpu(con->in_reply.features)); int ret; @@ -2216,7 +2261,7 @@ static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; - const bool do_datacrc = !con->msgr->nocrc; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); struct page *page; size_t page_offset; size_t length; @@ -2230,8 +2275,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = con->in_data_crc; while (cursor->resid) { - page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, - NULL); + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) { if (do_datacrc) @@ -2242,7 +2286,7 @@ static int 
read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = ceph_crc32c_page(crc, page, page_offset, ret); - (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret); + (void) ceph_msg_data_advance(cursor, (size_t)ret); } if (do_datacrc) con->in_data_crc = crc; @@ -2262,7 +2306,7 @@ static int read_partial_message(struct ceph_connection *con) int end; int ret; unsigned int front_len, middle_len, data_len; - bool do_datacrc = !con->msgr->nocrc; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); u64 seq; u32 crc; @@ -2301,9 +2345,9 @@ static int read_partial_message(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - - sizeof(m->footer); + sizeof_footer(con); con->in_tag = CEPH_MSGR_TAG_READY; - return 0; + return 1; } else if ((s64)seq - (s64)con->in_seq > 1) { pr_err("read_partial_message bad seq %lld expected %lld\n", seq, con->in_seq + 1); @@ -2322,21 +2366,14 @@ static int read_partial_message(struct ceph_connection *con) return ret; BUG_ON(!con->in_msg ^ skip); - if (con->in_msg && data_len > con->in_msg->data_length) { - pr_warn("%s skipping long message (%u > %zd)\n", - __func__, data_len, con->in_msg->data_length); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - skip = 1; - } if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); con->in_base_pos = -front_len - middle_len - data_len - - sizeof(m->footer); + sizeof_footer(con); con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; - return 0; + return 1; } BUG_ON(!con->in_msg); @@ -2414,7 +2451,7 @@ static int read_partial_message(struct ceph_connection *con) } if (need_sign && con->ops->check_message_signature && - con->ops->check_message_signature(con, m)) { + con->ops->check_message_signature(m)) { pr_err("read_partial_message %p signature check failed\n", m); return -EBADMSG; } @@ -2429,13 +2466,10 @@ 
static int read_partial_message(struct ceph_connection *con) */ static void process_message(struct ceph_connection *con) { - struct ceph_msg *msg; + struct ceph_msg *msg = con->in_msg; BUG_ON(con->in_msg->con != con); - con->in_msg->con = NULL; - msg = con->in_msg; con->in_msg = NULL; - con->ops->put(con); /* if first message, set peer_name */ if (con->peer_name.type == 0) @@ -2457,6 +2491,17 @@ static void process_message(struct ceph_connection *con) mutex_lock(&con->mutex); } +static int read_keepalive_ack(struct ceph_connection *con) +{ + struct ceph_timespec ceph_ts; + size_t size = sizeof(ceph_ts); + int ret = read_partial(con, size, size, &ceph_ts); + if (ret <= 0) + return ret; + ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts); + prepare_read_tag(con); + return 1; +} /* * Write something to the socket. Called in a worker thread when the @@ -2493,13 +2538,13 @@ more: more_kvec: /* kvec data queued? */ - if (con->out_skip) { - ret = write_partial_skip(con); + if (con->out_kvec_left) { + ret = write_partial_kvec(con); if (ret <= 0) goto out; } - if (con->out_kvec_left) { - ret = write_partial_kvec(con); + if (con->out_skip) { + ret = write_partial_skip(con); if (ret <= 0) goto out; } @@ -2526,6 +2571,10 @@ more_kvec: do_next: if (con->state == CON_STATE_OPEN) { + if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } /* is anything else pending? */ if (!list_empty(&con->out_queue)) { prepare_write_message(con); @@ -2535,10 +2584,6 @@ do_next: prepare_write_ack(con); goto more; } - if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { - prepare_write_keepalive(con); - goto more; - } } /* Nothing to do! 
*/ @@ -2641,6 +2686,9 @@ more: case CEPH_MSGR_TAG_ACK: prepare_read_ack(con); break; + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + prepare_read_keepalive_ack(con); + break; case CEPH_MSGR_TAG_CLOSE: con_close_socket(con); con->state = CON_STATE_CLOSED; @@ -2654,7 +2702,7 @@ more: if (ret <= 0) { switch (ret) { case -EBADMSG: - con->error_msg = "bad crc"; + con->error_msg = "bad crc/signature"; /* fall through */ case -EBADE: ret = -EIO; @@ -2684,6 +2732,12 @@ more: process_ack(con); goto more; } + if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ret = read_keepalive_ack(con); + if (ret <= 0) + goto out; + goto more; + } out: dout("try_read done on %p ret %d\n", con, ret); @@ -2799,7 +2853,7 @@ static void con_fault_finish(struct ceph_connection *con) /* * Do some work on a connection. Drop a connection ref when we're done. */ -static void con_work(struct work_struct *work) +static void ceph_con_workfn(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); @@ -2889,10 +2943,8 @@ static void con_fault(struct ceph_connection *con) if (con->in_msg) { BUG_ON(con->in_msg->con != con); - con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->ops->put(con); } /* Requeue anything that hasn't been acked */ @@ -2923,15 +2975,8 @@ static void con_fault(struct ceph_connection *con) * initialize a new messenger instance */ void ceph_messenger_init(struct ceph_messenger *msgr, - struct ceph_entity_addr *myaddr, - u64 supported_features, - u64 required_features, - bool nocrc, - bool tcp_nodelay) + struct ceph_entity_addr *myaddr) { - msgr->supported_features = supported_features; - msgr->required_features = required_features; - spin_lock_init(&msgr->global_seq_lock); if (myaddr) @@ -2941,15 +2986,29 @@ void ceph_messenger_init(struct ceph_messenger *msgr, msgr->inst.addr.type = 0; get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); - msgr->nocrc = nocrc; - 
msgr->tcp_nodelay = tcp_nodelay; atomic_set(&msgr->stopping, 0); + write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } EXPORT_SYMBOL(ceph_messenger_init); +void ceph_messenger_fini(struct ceph_messenger *msgr) +{ + put_net(read_pnet(&msgr->net)); +} +EXPORT_SYMBOL(ceph_messenger_fini); + +static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) +{ + if (msg->con) + msg->con->ops->put(msg->con); + + msg->con = con ? con->ops->get(con) : NULL; + BUG_ON(msg->con != con); +} + static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ @@ -2981,9 +3040,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) return; } - BUG_ON(msg->con != NULL); - msg->con = con->ops->get(con); - BUG_ON(msg->con == NULL); + msg_con_set(msg, con); BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); @@ -3011,31 +3068,45 @@ void ceph_msg_revoke(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; - if (!con) + if (!con) { + dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ + } mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { dout("%s %p msg %p - was on queue\n", __func__, con, msg); list_del_init(&msg->list_head); - BUG_ON(msg->con == NULL); - msg->con->ops->put(msg->con); - msg->con = NULL; msg->hdr.seq = 0; ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("%s %p msg %p - was sending\n", __func__, con, msg); - con->out_msg = NULL; - if (con->out_kvec_is_msg) { - con->out_skip = con->out_kvec_bytes; - con->out_kvec_is_msg = false; + BUG_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + BUG_ON(!msg->data_length); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) + con->out_skip += sizeof(msg->footer); + else + con->out_skip += sizeof(msg->old_footer); } + /* data, middle, front */ + if (msg->data_length) + con->out_skip 
+= msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s %p msg %p - was sending, will write %d skip %d\n", + __func__, con, msg, con->out_kvec_bytes, con->out_skip); msg->hdr.seq = 0; - + con->out_msg = NULL; ceph_msg_put(msg); } + mutex_unlock(&con->mutex); } @@ -3044,16 +3115,13 @@ void ceph_msg_revoke(struct ceph_msg *msg) */ void ceph_msg_revoke_incoming(struct ceph_msg *msg) { - struct ceph_connection *con; + struct ceph_connection *con = msg->con; - BUG_ON(msg == NULL); - if (!msg->con) { + if (!con) { dout("%s msg %p null con\n", __func__, msg); - return; /* Message not in our possession */ } - con = msg->con; mutex_lock(&con->mutex); if (con->in_msg == msg) { unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); @@ -3094,6 +3162,20 @@ void ceph_con_keepalive(struct ceph_connection *con) } EXPORT_SYMBOL(ceph_con_keepalive); +bool ceph_con_keepalive_expired(struct ceph_connection *con, + unsigned long interval) +{ + if (interval > 0 && + (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { + struct timespec now = CURRENT_TIME; + struct timespec ts; + jiffies_to_timespec(interval, &ts); + ts = timespec_add(con->last_keepalive_ack, ts); + return timespec_compare(&now, &ts) >= 0; + } + return false; +} + static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) { struct ceph_msg_data *data; @@ -3285,9 +3367,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) } if (msg) { BUG_ON(*skip); + msg_con_set(msg, con); con->in_msg = msg; - con->in_msg->con = con->ops->get(con); - BUG_ON(con->in_msg->con == NULL); } else { /* * Null message pointer means either we should skip @@ -3334,6 +3415,8 @@ static void ceph_msg_release(struct kref *kref) dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); + msg_con_set(m, NULL); + /* drop middle, data, if any */ if (m->middle) { ceph_buffer_put(m->middle); diff --git 
a/kernel/net/ceph/mon_client.c b/kernel/net/ceph/mon_client.c index 2b3cf05e8..edda01626 100644 --- a/kernel/net/ceph/mon_client.c +++ b/kernel/net/ceph/mon_client.c @@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc) CEPH_ENTITY_TYPE_MON, monc->cur_mon, &monc->monmap->mon_inst[monc->cur_mon].addr); + /* send an initial keepalive to ensure our timestamp is + * valid by the time we are in an OPENED state */ + ceph_con_keepalive(&monc->con); + /* initiatiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, monc->m_auth->front.iov_base, @@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc) */ static void __schedule_delayed(struct ceph_mon_client *monc) { - unsigned int delay; + struct ceph_options *opt = monc->client->options; + unsigned long delay; - if (monc->cur_mon < 0 || __sub_expired(monc)) + if (monc->cur_mon < 0 || __sub_expired(monc)) { delay = 10 * HZ; - else + } else { delay = 20 * HZ; - dout("__schedule_delayed after %u\n", delay); - schedule_delayed_work(&monc->delayed_work, delay); + if (opt->monc_ping_timeout > 0) + delay = min(delay, opt->monc_ping_timeout / 3); + } + dout("__schedule_delayed after %lu\n", delay); + schedule_delayed_work(&monc->delayed_work, + round_jiffies_relative(delay)); } /* @@ -298,21 +307,28 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_request_next_osdmap); +/* + * Wait for an osdmap with a given epoch. 
+ * + * @epoch: epoch to wait for + * @timeout: in jiffies, 0 means "wait forever" + */ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, unsigned long timeout) { unsigned long started = jiffies; - int ret; + long ret; mutex_lock(&monc->mutex); while (monc->have_osdmap < epoch) { mutex_unlock(&monc->mutex); - if (timeout != 0 && time_after_eq(jiffies, started + timeout)) + if (timeout && time_after_eq(jiffies, started + timeout)) return -ETIMEDOUT; ret = wait_event_interruptible_timeout(monc->client->auth_wq, - monc->have_osdmap >= epoch, timeout); + monc->have_osdmap >= epoch, + ceph_timeout_jiffies(timeout)); if (ret < 0) return ret; @@ -736,11 +752,23 @@ static void delayed_work(struct work_struct *work) __close_session(monc); __open_session(monc); /* continue hunting */ } else { - ceph_con_keepalive(&monc->con); + struct ceph_options *opt = monc->client->options; + int is_auth = ceph_auth_is_authenticated(monc->auth); + if (ceph_con_keepalive_expired(&monc->con, + opt->monc_ping_timeout)) { + dout("monc keepalive timeout\n"); + is_auth = 0; + __close_session(monc); + monc->hunting = true; + __open_session(monc); + } - __validate_auth(monc); + if (!monc->hunting) { + ceph_con_keepalive(&monc->con); + __validate_auth(monc); + } - if (ceph_auth_is_authenticated(monc->auth)) + if (is_auth) __send_subscribe(monc); } __schedule_delayed(monc); diff --git a/kernel/net/ceph/osd_client.c b/kernel/net/ceph/osd_client.c index c4ec92392..a28e47ff1 100644 --- a/kernel/net/ceph/osd_client.c +++ b/kernel/net/ceph/osd_client.c @@ -120,11 +120,13 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, } #endif /* CONFIG_BLOCK */ -#define osd_req_op_data(oreq, whch, typ, fld) \ - ({ \ - BUG_ON(whch >= (oreq)->r_num_ops); \ - &(oreq)->r_ops[whch].typ.fld; \ - }) +#define osd_req_op_data(oreq, whch, typ, fld) \ +({ \ + struct ceph_osd_request *__oreq = (oreq); \ + unsigned int __whch = (whch); \ + BUG_ON(__whch >= __oreq->r_num_ops); \ + 
&__oreq->r_ops[__whch].typ.fld; \ +}) static struct ceph_osd_data * osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) @@ -285,6 +287,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, switch (op->op) { case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: ceph_osd_data_release(&op->extent.osd_data); break; case CEPH_OSD_OP_CALL: @@ -296,6 +299,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_CMPXATTR: ceph_osd_data_release(&op->xattr.osd_data); break; + case CEPH_OSD_OP_STAT: + ceph_osd_data_release(&op->raw_data_in); + break; default: break; } @@ -450,7 +456,7 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE) */ static struct ceph_osd_req_op * _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, - u16 opcode) + u16 opcode, u32 flags) { struct ceph_osd_req_op *op; @@ -460,14 +466,15 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, op = &osd_req->r_ops[which]; memset(op, 0, sizeof (*op)); op->op = opcode; + op->flags = flags; return op; } void osd_req_op_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode) + unsigned int which, u16 opcode, u32 flags) { - (void)_osd_req_op_init(osd_req, which, opcode); + (void)_osd_req_op_init(osd_req, which, opcode, flags); } EXPORT_SYMBOL(osd_req_op_init); @@ -476,17 +483,19 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && - opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE); + opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO && + opcode != CEPH_OSD_OP_TRUNCATE); op->extent.offset = offset; op->extent.length = length; 
op->extent.truncate_size = truncate_size; op->extent.truncate_seq = truncate_seq; - if (opcode == CEPH_OSD_OP_WRITE) + if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) payload_len += length; op->payload_len = payload_len; @@ -515,7 +524,8 @@ EXPORT_SYMBOL(osd_req_op_extent_update); void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; @@ -552,7 +562,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; @@ -585,7 +596,8 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); @@ -602,7 +614,8 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, u64 expected_write_size) { struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - CEPH_OSD_OP_SETALLOCHINT); + CEPH_OSD_OP_SETALLOCHINT, + 0); op->alloc_hint.expected_object_size = expected_object_size; op->alloc_hint.expected_write_size = expected_write_size; @@ -661,9 +674,11 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_TRUNCATE: - if 
(src->op == CEPH_OSD_OP_WRITE) + if (src->op == CEPH_OSD_OP_WRITE || + src->op == CEPH_OSD_OP_WRITEFULL) request_data_len = src->extent.length; dst->extent.offset = cpu_to_le64(src->extent.offset); dst->extent.length = cpu_to_le64(src->extent.length); @@ -672,7 +687,8 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); osd_data = &src->extent.osd_data; - if (src->op == CEPH_OSD_OP_WRITE) + if (src->op == CEPH_OSD_OP_WRITE || + src->op == CEPH_OSD_OP_WRITEFULL) ceph_osdc_msg_data_add(req->r_request, osd_data); else ceph_osdc_msg_data_add(req->r_reply, osd_data); @@ -786,7 +802,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { - osd_req_op_init(req, which, opcode); + osd_req_op_init(req, which, opcode, 0); } else { u32 object_size = le32_to_cpu(layout->fl_object_size); u32 object_base = off - objoff; @@ -1088,7 +1104,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc, BUG_ON(!list_empty(&osd->o_osd_lru)); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); - osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; } static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, @@ -1199,7 +1215,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) static void __schedule_osd_timeout(struct ceph_osd_client *osdc) { schedule_delayed_work(&osdc->timeout_work, - osdc->client->options->osd_keepalive_timeout * HZ); + osdc->client->options->osd_keepalive_timeout); } static void __cancel_osd_timeout(struct ceph_osd_client *osdc) @@ -1567,10 +1583,9 @@ static void handle_timeout(struct work_struct *work) { struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); + struct ceph_options *opts = osdc->client->options; struct ceph_osd_request *req; struct ceph_osd 
*osd; - unsigned long keepalive = - osdc->client->options->osd_keepalive_timeout * HZ; struct list_head slow_osds; dout("timeout\n"); down_read(&osdc->map_sem); @@ -1586,7 +1601,8 @@ static void handle_timeout(struct work_struct *work) */ INIT_LIST_HEAD(&slow_osds); list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { - if (time_before(jiffies, req->r_stamp + keepalive)) + if (time_before(jiffies, + req->r_stamp + opts->osd_keepalive_timeout)) break; osd = req->r_osd; @@ -1613,8 +1629,7 @@ static void handle_osds_timeout(struct work_struct *work) struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, osds_timeout_work.work); - unsigned long delay = - osdc->client->options->osd_idle_ttl * HZ >> 2; + unsigned long delay = osdc->client->options->osd_idle_ttl / 4; dout("osds timeout\n"); down_read(&osdc->map_sem); @@ -1737,8 +1752,7 @@ static void complete_request(struct ceph_osd_request *req) * handle osd op reply. either call the callback if it is specified, * or do the completion to wake up the waiting thread. 
*/ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, - struct ceph_connection *con) +static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) { void *p, *end; struct ceph_osd_request *req; @@ -2619,7 +2633,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->event_count = 0; schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); + round_jiffies_relative(osdc->client->options->osd_idle_ttl)); err = -ENOMEM; osdc->req_mempool = mempool_create_kmalloc_pool(10, @@ -2794,7 +2808,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_osdc_handle_map(osdc, msg); break; case CEPH_MSG_OSD_OPREPLY: - handle_reply(osdc, msg, con); + handle_reply(osdc, msg); break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); @@ -2809,8 +2823,9 @@ out: } /* - * lookup and return message for incoming reply. set up reply message - * pages. + * Lookup and return message for incoming reply. Don't try to do + * anything about a larger than preallocated data portion of the + * message at the moment - for now, just skip the message. 
*/ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, @@ -2828,23 +2843,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (!req) { - *skip = 1; + dout("%s osd%d tid %llu unknown, skipping\n", __func__, + osd->o_osd, tid); m = NULL; - dout("get_reply unknown tid %llu from osd%d\n", tid, - osd->o_osd); + *skip = 1; goto out; } - if (req->r_reply->con) - dout("%s revoking msg %p from old con %p\n", __func__, - req->r_reply, req->r_reply->con); ceph_msg_revoke_incoming(req->r_reply); if (front_len > req->r_reply->front_alloc_len) { - pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n", - front_len, req->r_reply->front_alloc_len, - (unsigned int)con->peer_name.type, - le64_to_cpu(con->peer_name.num)); + pr_warn("%s osd%d tid %llu front %d > preallocated %d\n", + __func__, osd->o_osd, req->r_tid, front_len, + req->r_reply->front_alloc_len); m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, false); if (!m) @@ -2852,37 +2863,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ceph_msg_put(req->r_reply); req->r_reply = m; } - m = ceph_msg_get(req->r_reply); - - if (data_len > 0) { - struct ceph_osd_data *osd_data; - /* - * XXX This is assuming there is only one op containing - * XXX page data. Probably OK for reads, but this - * XXX ought to be done more generally. 
- */ - osd_data = osd_req_op_extent_osd_data(req, 0); - if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - if (osd_data->pages && - unlikely(osd_data->length < data_len)) { - - pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n", - tid, data_len, osd_data->length); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; - } - } + if (data_len > req->r_reply->data_length) { + pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n", + __func__, osd->o_osd, req->r_tid, data_len, + req->r_reply->data_length); + m = NULL; + *skip = 1; + goto out; } - *skip = 0; + + m = ceph_msg_get(req->r_reply); dout("get_reply tid %lld %p\n", tid, m); out: mutex_unlock(&osdc->request_mutex); return m; - } static struct ceph_msg *alloc_msg(struct ceph_connection *con, @@ -2980,17 +2976,19 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } -static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) +static int osd_sign_message(struct ceph_msg *msg) { - struct ceph_osd *o = con->private; + struct ceph_osd *o = msg->con->private; struct ceph_auth_handshake *auth = &o->o_auth; + return ceph_auth_sign_message(auth, msg); } -static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) +static int osd_check_message_signature(struct ceph_msg *msg) { - struct ceph_osd *o = con->private; + struct ceph_osd *o = msg->con->private; struct ceph_auth_handshake *auth = &o->o_auth; + return ceph_auth_check_message_signature(auth, msg); } @@ -3002,7 +3000,7 @@ static const struct ceph_connection_operations osd_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .alloc_msg = alloc_msg, - .sign_message = sign_message, - .check_message_signature = check_message_signature, + .sign_message = osd_sign_message, + .check_message_signature = osd_check_message_signature, .fault = osd_reset, }; diff --git 
a/kernel/net/ceph/osdmap.c b/kernel/net/ceph/osdmap.c index 4a3125836..7d8f581d9 100644 --- a/kernel/net/ceph/osdmap.c +++ b/kernel/net/ceph/osdmap.c @@ -1300,7 +1300,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_addr(&addr); pr_info("osd%d up\n", osd); BUG_ON(osd >= map->max_osd); - map->osd_state[osd] |= CEPH_OSD_UP; + map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS; map->osd_addr[osd] = addr; } diff --git a/kernel/net/ceph/pagevec.c b/kernel/net/ceph/pagevec.c index 096d91447..d4f5f220a 100644 --- a/kernel/net/ceph/pagevec.c +++ b/kernel/net/ceph/pagevec.c @@ -51,10 +51,7 @@ void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty) set_page_dirty_lock(pages[i]); put_page(pages[i]); } - if (is_vmalloc_addr(pages)) - vfree(pages); - else - kfree(pages); + kvfree(pages); } EXPORT_SYMBOL(ceph_put_page_vector); diff --git a/kernel/net/core/Makefile b/kernel/net/core/Makefile index fec0856dd..086b01fbe 100644 --- a/kernel/net/core/Makefile +++ b/kernel/net/core/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o diff --git a/kernel/net/core/datagram.c b/kernel/net/core/datagram.c index 617088aee..d62af69ad 100644 --- a/kernel/net/core/datagram.c +++ b/kernel/net/core/datagram.c @@ -785,7 +785,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, if (sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/kernel/net/core/dev.c b/kernel/net/core/dev.c index f8c23dee5..0e17592ad 100644 --- a/kernel/net/core/dev.c +++ b/kernel/net/core/dev.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -135,6 +136,7 @@ 
#include #include #include +#include #include "net-sysfs.h" @@ -469,10 +471,14 @@ EXPORT_SYMBOL(dev_remove_pack); */ void dev_add_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct packet_offload *elem; spin_lock(&offload_lock); - list_add_rcu(&po->list, head); + list_for_each_entry(elem, &offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); spin_unlock(&offload_lock); } EXPORT_SYMBOL(dev_add_offload); @@ -677,6 +683,32 @@ int dev_get_iflink(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_iflink); +/** + * dev_fill_metadata_dst - Retrieve tunnel egress information. + * @dev: targeted interface + * @skb: The packet. + * + * For better visibility of tunnel traffic OVS needs to retrieve + * egress tunnel information for a packet. Following API allows + * user to get this info. + */ +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info; + + if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) + return -EINVAL; + + info = skb_tunnel_info_unclone(skb); + if (!info) + return -ENOMEM; + if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) + return -EINVAL; + + return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); +} +EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -1632,7 +1664,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS static struct static_key ingress_needed __read_mostly; void net_inc_ingress_queue(void) @@ -2347,21 +2379,52 @@ void netif_device_attach(struct net_device *dev) } EXPORT_SYMBOL(netif_device_attach); +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. 
+ */ +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, + unsigned int num_tx_queues) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; + + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; + return hash; + } + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features = 0; struct net_device *dev = skb->dev; - const char *driver = ""; + const char *name = ""; if (!net_ratelimit()) return; - if (dev && dev->dev.parent) - driver = dev_driver_string(dev->dev.parent); - + if (dev) { + if (dev->dev.parent) + name = dev_driver_string(dev->dev.parent); + else + name = netdev_name(dev); + } WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " "gso_type=%d ip_summed=%d\n", - driver, dev ? &dev->features : &null_features, + name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features, skb->len, skb->data_len, skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, skb->ip_summed); @@ -2487,6 +2550,8 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. 
*/ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) @@ -2501,6 +2566,9 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, return ERR_PTR(err); } + BUILD_BUG_ON(SKB_SGO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); SKB_GSO_CB(skb)->encap_level = 0; @@ -2823,7 +2891,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * This permits __QDISC___STATE_RUNNING owner to get the lock more * often and dequeue packets faster. */ +#ifdef CONFIG_PREEMPT_RT_FULL + contended = true; +#else contended = qdisc_is_running(q); +#endif if (unlikely(contended)) spin_lock(&q->busylock); @@ -2883,16 +2955,53 @@ static void skb_update_prio(struct sk_buff *skb) #define skb_update_prio(skb) #endif +#ifdef CONFIG_PREEMPT_RT_FULL + +static inline int xmit_rec_read(void) +{ + return current->xmit_recursion; +} + +static inline void xmit_rec_inc(void) +{ + current->xmit_recursion++; +} + +static inline void xmit_rec_dec(void) +{ + current->xmit_recursion--; +} + +#else + DEFINE_PER_CPU(int, xmit_recursion); EXPORT_SYMBOL(xmit_recursion); +static inline int xmit_rec_read(void) +{ + return __this_cpu_read(xmit_recursion); +} + +static inline void xmit_rec_inc(void) +{ + __this_cpu_inc(xmit_recursion); +} + +static inline void xmit_rec_dec(void) +{ + __this_cpu_dec(xmit_recursion); +} +#endif + #define RECURSION_LIMIT 10 /** * dev_loopback_xmit - loop back @skb + * @net: network namespace this loopback is happening in + * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ -int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb) +int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); @@ -2905,6 +3014,85 @@ int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dev_loopback_xmit); +static inline int get_xps_queue(struct 
net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[skb->sender_cpu - 1]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), + map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + int queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, skb); + if (new_index < 0) + new_index = skb_tx_hash(dev, skb); + + if (queue_index != new_index && sk && + sk_fullsock(sk) && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); + + queue_index = new_index; + } + + return queue_index; +} + +struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb, + void *accel_priv) +{ + int queue_index = 0; + +#ifdef CONFIG_XPS + if (skb->sender_cpu == 0) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + + if (dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); + else + queue_index = __netdev_pick_tx(dev, skb); + + if (!accel_priv) + queue_index = netdev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + /** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -2958,6 +3146,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); +#ifdef 
CONFIG_NET_SWITCHDEV + /* Don't forward if offload device already forwarded */ + if (skb->offload_fwd_mark && + skb->offload_fwd_mark == dev->offload_fwd_mark) { + consume_skb(skb); + rc = NET_XMIT_SUCCESS; + goto out; + } +#endif + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -2987,7 +3185,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (txq->xmit_lock_owner != cpu) { - if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + if (xmit_rec_read() > RECURSION_LIMIT) goto recursion_alert; skb = validate_xmit_skb(skb, dev); @@ -2997,9 +3195,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { - __this_cpu_inc(xmit_recursion); + xmit_rec_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); - __this_cpu_dec(xmit_recursion); + xmit_rec_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; @@ -3030,11 +3228,11 @@ out: return rc; } -int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb) +int dev_queue_xmit(struct sk_buff *skb) { return __dev_queue_xmit(skb, NULL); } -EXPORT_SYMBOL(dev_queue_xmit_sk); +EXPORT_SYMBOL(dev_queue_xmit); int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) { @@ -3549,66 +3747,55 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -#ifdef CONFIG_NET_CLS_ACT -/* TODO: Maybe we should just force sch_ingress to be compiled in - * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions - * a compare and 2 stores extra right now if we dont have it on - * but have CONFIG_NET_CLS_ACT - * NOTE: This doesn't stop any functionality; if you dont have - * the ingress scheduler, you just can't add policies on ingress. 
- * - */ -static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) -{ - struct net_device *dev = skb->dev; - u32 ttl = G_TC_RTTL(skb->tc_verd); - int result = TC_ACT_OK; - struct Qdisc *q; - - if (unlikely(MAX_RED_LOOP < ttl++)) { - net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", - skb->skb_iif, dev->ifindex); - return TC_ACT_SHOT; - } - - skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - - q = rcu_dereference(rxq->qdisc); - if (q != &noop_qdisc) { - spin_lock(qdisc_lock(q)); - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - result = qdisc_enqueue_root(skb, q); - spin_unlock(qdisc_lock(q)); - } - - return result; -} - static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); + struct tcf_result cl_res; - if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) + /* If there's at least one ingress present somewhere (so + * we get here via enabled static key), remaining devices + * that are not configured with an ingress qdisc will bail + * out here. 
+ */ + if (!cl) return skb; - if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - switch (ing_filter(skb, rxq)) { + qdisc_skb_cb(skb)->pkt_len = skb->len; + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + qdisc_bstats_cpu_update(cl->q, skb); + + switch (tc_classify(skb, cl, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; case TC_ACT_SHOT: + qdisc_qstats_cpu_drop(cl->q); case TC_ACT_STOLEN: + case TC_ACT_QUEUED: kfree_skb(skb); return NULL; + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by cls/act_bpf, so + * we can safely push the L2 header back before + * redirecting to another netdev + */ + __skb_push(skb, skb->mac_len); + skb_do_redirect(skb); + return NULL; + default: + break; } - +#endif /* CONFIG_NET_CLS_ACT */ return skb; } -#endif /** * netdev_rx_handler_register - register receive handler @@ -3681,6 +3868,22 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) } } +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ +#ifdef CONFIG_NETFILTER_INGRESS + if (nf_hook_ingress_active(skb)) { + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + return nf_hook_ingress(skb); + } +#endif /* CONFIG_NETFILTER_INGRESS */ + return 0; +} + static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; @@ -3738,13 +3941,17 @@ another_round: } skip_taps: -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS if (static_key_false(&ingress_needed)) { skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; - } + if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) + goto out; + } +#endif +#ifdef CONFIG_NET_CLS_ACT skb->tc_verd = 0; ncls: #endif @@ -3897,13 +4104,13 @@ static int netif_receive_skb_internal(struct sk_buff *skb) * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was 
dropped */ -int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb) +int netif_receive_skb(struct sk_buff *skb) { trace_netif_receive_skb_entry(skb); return netif_receive_skb_internal(skb); } -EXPORT_SYMBOL(netif_receive_skb_sk); +EXPORT_SYMBOL(netif_receive_skb); /* Network device is going away, flush any packets still pending * Called with irqs disabled. @@ -4017,6 +4224,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_metadata_dst_cmp(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); @@ -4214,10 +4422,12 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) break; case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { + skb_dst_drop(skb); kmem_cache_free(skbuff_head_cache, skb); - else + } else { __kfree_skb(skb); + } break; case GRO_HELD: @@ -4634,6 +4844,8 @@ void napi_disable(struct napi_struct *n) while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); + while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) + msleep(1); hrtimer_cancel(&n->timer); @@ -4755,7 +4967,7 @@ static void net_rx_action(struct softirq_action *h) list_splice_tail(&repoll, &list); list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); } @@ -4776,8 +4988,7 @@ struct netdev_adjacent { struct rcu_head rcu; }; -static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, - struct net_device *adj_dev, +static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; @@ -4803,7 +5014,7 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - 
return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); + return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -4916,7 +5127,7 @@ EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold either hold the * RTNL lock or its own locking that guarantees that the neighbour lower - * list will remain unchainged. + * list will remain unchanged. */ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter) @@ -4971,7 +5182,7 @@ EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower - * list will remain unchainged. + * list will remain unchanged. */ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { @@ -5065,7 +5276,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, struct netdev_adjacent *adj; int ret; - adj = __netdev_find_adj(dev, adj_dev, dev_list); + adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr++; @@ -5121,7 +5332,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, { struct netdev_adjacent *adj; - adj = __netdev_find_adj(dev, adj_dev, dev_list); + adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("tried to remove device %s from %s\n", @@ -5232,6 +5443,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *private) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; @@ -5241,15 +5453,25 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. 
*/ - if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) + if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper)) + if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = master; + changeupper_info.linking = true; + + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + return ret; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, master); if (ret) @@ -5288,7 +5510,8 @@ static int __netdev_upper_dev_link(struct net_device *dev, goto rollback_lower_mesh; } - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); return 0; rollback_lower_mesh: @@ -5383,9 +5606,17 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j; ASSERT_RTNL(); + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; + changeupper_info.linking = false; + + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + &changeupper_info.info); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); /* Here is the tricky part. 
We must remove all dev's lower @@ -5405,7 +5636,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(dev, i->dev); - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -5511,7 +5743,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev, if (!lower_dev) return NULL; - lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); + lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; @@ -6005,6 +6237,26 @@ int dev_get_phys_port_name(struct net_device *dev, } EXPORT_SYMBOL(dev_get_phys_port_name); +/** + * dev_change_proto_down - update protocol port state information + * @dev: device + * @proto_down: new value + * + * This info can be used by switch drivers to set the phys state of the + * port. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_proto_down) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_proto_down(dev, proto_down); +} +EXPORT_SYMBOL(dev_change_proto_down); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace @@ -6129,6 +6381,48 @@ static void rollback_registered(struct net_device *dev) list_del(&single); } +static netdev_features_t netdev_sync_upper_features(struct net_device *lower, + struct net_device *upper, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(&upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(upper->wanted_features & feature) + && (features & feature)) { + netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", + &feature, 
upper->name); + features &= ~feature; + } + } + + return features; +} + +static void netdev_sync_lower_features(struct net_device *upper, + struct net_device *lower, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(&upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(features & feature) && (lower->features & feature)) { + netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", + &feature, lower->name); + lower->wanted_features &= ~feature; + netdev_update_features(lower); + + if (unlikely(lower->features & feature)) + netdev_WARN(upper, "failed to disable %pNF on %s!\n", + &feature, lower->name); + } + } +} + static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { @@ -6198,8 +6492,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, int __netdev_update_features(struct net_device *dev) { + struct net_device *upper, *lower; netdev_features_t features; - int err = 0; + struct list_head *iter; + int err = -1; ASSERT_RTNL(); @@ -6211,26 +6507,42 @@ int __netdev_update_features(struct net_device *dev) /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); + /* some features can't be enabled if they're off an an upper device */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) + features = netdev_sync_upper_features(dev, upper, features); + if (dev->features == features) - return 0; + goto sync_lower; netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); + else + err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); + /* return non-0 since some features might have changed and + * it's better to fire a 
spurious notification than miss it + */ return -1; } +sync_lower: + /* some features must be disabled on lower devices when disabled + * on an upper device (think: bonding master or bridge) + */ + netdev_for_each_lower_dev(dev, lower, iter) + netdev_sync_lower_features(dev, lower, features); + if (!err) dev->features = features; - return 1; + return err < 0 ? 0 : 1; } /** @@ -6357,6 +6669,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev) return 0; } +void netif_tx_stop_all_queues(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + netif_tx_stop_queue(txq); + } +} +EXPORT_SYMBOL(netif_tx_stop_all_queues); + /** * register_netdevice - register a network device * @dev: device to register @@ -6887,6 +7210,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); + if (!dev->tx_queue_len) { + dev->priv_flags |= IFF_NO_QUEUE; + dev->tx_queue_len = 1; + } + dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) @@ -6904,6 +7232,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; + + nf_hook_ingress_init(dev); + return dev; free_all: @@ -6969,7 +7300,7 @@ EXPORT_SYMBOL(free_netdev); void synchronize_net(void) { might_sleep(); - if (rtnl_is_locked()) + if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) synchronize_rcu_expedited(); else synchronize_rcu(); @@ -7561,7 +7892,7 @@ static int __init net_dev_init(void) open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); - dst_init(); + dst_subsys_init(); rc = 0; out: return rc; diff --git a/kernel/net/core/dst.c b/kernel/net/core/dst.c index e956ce6d1..a1656e3b8 100644 --- a/kernel/net/core/dst.c +++ 
b/kernel/net/core/dst.c @@ -20,8 +20,10 @@ #include #include #include +#include #include +#include /* * Theory of operations: @@ -142,12 +144,12 @@ loop: mutex_unlock(&dst_gc_mutex); } -int dst_discard_sk(struct sock *sk, struct sk_buff *skb) +int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } -EXPORT_SYMBOL(dst_discard_sk); +EXPORT_SYMBOL(dst_discard_out); const u32 dst_default_metrics[RTAX_MAX + 1] = { /* This initializer is needed to force linker to place this variable @@ -158,19 +160,10 @@ const u32 dst_default_metrics[RTAX_MAX + 1] = { [RTAX_MAX] = 0xdeadbeef, }; - -void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, unsigned short flags) +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_ref, int initial_obsolete, + unsigned short flags) { - struct dst_entry *dst; - - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) - return NULL; - } - dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); - if (!dst) - return NULL; dst->child = NULL; dst->dev = dev; if (dev) @@ -184,7 +177,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->xfrm = NULL; #endif dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; @@ -192,6 +185,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif + dst->lwtstate = NULL; atomic_set(&dst->__refcnt, initial_ref); dst->__use = 0; dst->lastuse = jiffies; @@ -200,6 +194,25 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); +} +EXPORT_SYMBOL(dst_init); + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_ref, int initial_obsolete, unsigned short flags) +{ + struct 
dst_entry *dst; + + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc(ops)) + return NULL; + } + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) + return NULL; + + dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); + return dst; } EXPORT_SYMBOL(dst_alloc); @@ -211,7 +224,7 @@ static void ___dst_free(struct dst_entry *dst) */ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; } dst->obsolete = DST_OBSOLETE_DEAD; } @@ -248,7 +261,13 @@ again: dst->ops->destroy(dst); if (dst->dev) dev_put(dst->dev); - kmem_cache_free(dst->ops->kmem_cachep, dst); + + lwtstate_put(dst->lwtstate); + + if (dst->flags & DST_METADATA) + kfree(dst); + else + kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) { @@ -282,10 +301,13 @@ void dst_release(struct dst_entry *dst) { if (dst) { int newrefcnt; + unsigned short nocache = dst->flags & DST_NOCACHE; newrefcnt = atomic_dec_return(&dst->__refcnt); - WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) + if (unlikely(newrefcnt < 0)) + net_warn_ratelimited("%s: dst:%p refcnt:%d\n", + __func__, dst, newrefcnt); + if (!newrefcnt && unlikely(nocache)) call_rcu(&dst->rcu_head, dst_destroy_rcu); } } @@ -327,6 +349,69 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); +static struct dst_ops md_dst_ops = { + .family = AF_UNSPEC, +}; + +static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call output on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +static int dst_md_discard(struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call input on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) +{ + struct dst_entry *dst; + + dst = &md_dst->dst; 
+ dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, + DST_METADATA | DST_NOCACHE | DST_NOCOUNT); + + dst->input = dst_md_discard; + dst->output = dst_md_discard_out; + + memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +{ + struct metadata_dst *md_dst; + + md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + if (!md_dst) + return NULL; + + __metadata_dst_init(md_dst, optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc); + +struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) +{ + int cpu; + struct metadata_dst __percpu *md_dst; + + md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + __alignof__(struct metadata_dst), flags); + if (!md_dst) + return NULL; + + for_each_possible_cpu(cpu) + __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice @@ -346,7 +431,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; } else { dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); @@ -391,7 +476,7 @@ static struct notifier_block dst_dev_notifier = { .priority = -10, /* must be called after other network notifiers */ }; -void __init dst_init(void) +void __init dst_subsys_init(void) { register_netdevice_notifier(&dst_dev_notifier); } diff --git a/kernel/net/core/ethtool.c b/kernel/net/core/ethtool.c index 1d00b8922..29edf7484 100644 --- a/kernel/net/core/ethtool.c +++ b/kernel/net/core/ethtool.c @@ -98,7 +98,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", 
[NETIF_F_BUSY_POLL_BIT] = "busy-poll", - [NETIF_F_HW_SWITCH_OFFLOAD_BIT] = "hw-switch-offload", }; static const char @@ -107,6 +106,13 @@ rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { [ETH_RSS_HASH_XOR_BIT] = "xor", }; +static const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", + [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -195,6 +201,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_RSS_HASH_FUNCS) return ARRAY_SIZE(rss_hash_func_strings); + if (sset == ETH_SS_TUNABLES) + return ARRAY_SIZE(tunable_strings); + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -212,6 +221,8 @@ static void __ethtool_get_strings(struct net_device *dev, else if (stringset == ETH_SS_RSS_HASH_FUNCS) memcpy(data, rss_hash_func_strings, sizeof(rss_hash_func_strings)); + else if (stringset == ETH_SS_TUNABLES) + memcpy(data, tunable_strings, sizeof(tunable_strings)); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); @@ -1273,7 +1284,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) gstrings.len = ret; - data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER); if (!data) return -ENOMEM; diff --git a/kernel/net/core/fib_rules.c b/kernel/net/core/fib_rules.c index 0ad144fb0..365de6643 100644 --- a/kernel/net/core/fib_rules.c +++ b/kernel/net/core/fib_rules.c @@ -16,6 +16,7 @@ #include #include #include +#include int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) @@ -43,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, } EXPORT_SYMBOL(fib_default_rule_add); -u32 
fib_default_rule_pref(struct fib_rules_ops *ops) +static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; @@ -59,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops) return 0; } -EXPORT_SYMBOL(fib_default_rule_pref); static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, @@ -186,6 +186,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; + if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -295,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } rule->fr_net = net; - if (tb[FRA_PRIORITY]) - rule->pref = nla_get_u32(tb[FRA_PRIORITY]); + rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) + : fib_default_rule_pref(ops); if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -330,6 +333,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_FWMASK]) rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + if (tb[FRA_TUN_ID]) + rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); @@ -343,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) else rule->suppress_ifgroup = -1; - if (!tb[FRA_PRIORITY] && ops->default_pref) - rule->pref = ops->default_pref(ops); - err = -EINVAL; if (tb[FRA_GOTO]) { if (rule->action != FR_ACT_GOTO) @@ -407,6 +410,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (unresolved) ops->unresolved_rules++; + if (rule->tun_id) + ip_tunnel_need_metadata(); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -473,6 +479,10 @@ static int fib_nl_delrule(struct 
sk_buff *skb, struct nlmsghdr* nlh) (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) continue; + if (tb[FRA_TUN_ID] && + (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -487,6 +497,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) goto errout; } + if (rule->tun_id) + ip_tunnel_unneed_metadata(); + list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { @@ -535,7 +548,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ - + nla_total_size(4); /* FRA_FWMASK */ + + nla_total_size(4) /* FRA_FWMASK */ + + nla_total_size(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -591,7 +605,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && - nla_put_u32(skb, FRA_GOTO, rule->target))) + nla_put_u32(skb, FRA_GOTO, rule->target)) || + (rule->tun_id && + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/kernel/net/core/filter.c b/kernel/net/core/filter.c index bf831a85c..37157c4c1 100644 --- a/kernel/net/core/filter.c +++ b/kernel/net/core/filter.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -45,16 +46,20 @@ #include #include #include +#include +#include +#include +#include /** * sk_filter - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter * - * Run the filter code and then cut skb->data to correct size returned by - * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller + * Run the eBPF program and then cut skb->data to correct size returned by + * the program. If pkt_len is 0 we toss packet. 
If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to SK_RUN_FILTER. It returns 0 if the packet should + * wrapper to BPF_PROG_RUN. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -78,7 +83,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = SK_RUN_FILTER(filter, skb); + unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } @@ -144,12 +149,6 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return raw_smp_processor_id(); } -/* note that this only generates 32-bit random numbers */ -static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) -{ - return prandom_u32(); -} - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -308,7 +307,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, *insn = BPF_EMIT_CALL(__get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: - *insn = BPF_EMIT_CALL(__get_random_u32); + *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); + bpf_user_rnd_init_once(); break; } break; @@ -355,8 +355,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * for socket filters: ctx == 'struct sk_buff *', for seccomp: * ctx == 'struct seccomp_data *'. 
*/ -int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) +static int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; struct bpf_insn *new_insn; @@ -371,7 +371,8 @@ int bpf_convert_filter(struct sock_filter *prog, int len, return -EINVAL; if (new_prog) { - addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); + addrs = kcalloc(len, sizeof(*addrs), + GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } @@ -473,9 +474,9 @@ do_pass: bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; - insn->src_reg = BPF_REG_X; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); + insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ @@ -751,7 +752,8 @@ static bool chk_code_allowed(u16 code_to_probe) * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) +static int bpf_check_classic(const struct sock_filter *filter, + unsigned int flen) { bool anc_found; int pc; @@ -775,6 +777,11 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) if (ftest->k == 0) return -EINVAL; break; + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_K: + if (ftest->k >= 32) + return -EINVAL; + break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: @@ -825,7 +832,6 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) return -EINVAL; } -EXPORT_SYMBOL(bpf_check_classic); static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) @@ -839,7 +845,9 @@ static int bpf_prog_store_orig_filter(struct bpf_prog *fp, fkprog = fp->orig_prog; fkprog->len = fprog->len; - fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL); + + fkprog->filter = kmemdup(fp->insns, fsize, + GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return 
-ENOMEM; @@ -941,7 +949,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; @@ -988,12 +996,13 @@ out_err: return ERR_PTR(err); } -static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) { int err; fp->bpf_func = NULL; - fp->jited = false; + fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { @@ -1001,6 +1010,17 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) return ERR_PTR(err); } + /* There might be additional checks and transformations + * needed on classic filters, f.e. in case of seccomp. + */ + if (trans) { + err = trans(fp->insns, fp->len); + if (err) { + __bpf_prog_release(fp); + return ERR_PTR(err); + } + } + /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ @@ -1050,7 +1070,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = bpf_prepare_filter(fp); + fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -1059,6 +1079,60 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) } EXPORT_SYMBOL_GPL(bpf_prog_create); +/** + * bpf_prog_create_from_user - create an unattached filter from user buffer + * @pfp: the unattached filter that is created + * @fprog: the filter program + * @trans: post-classic verifier transformation handler + * @save_orig: save classic BPF program + * + * This function effectively does the same as bpf_prog_create(), only + * that it builds up its insns buffer from user space provided buffer. 
+ * It also allows for passing a bpf_aux_classic_check_t handler. + */ +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans, bool save_orig) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; + int err; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL) + return -EINVAL; + + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!fp) + return -ENOMEM; + + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + __bpf_prog_free(fp); + return -EFAULT; + } + + fp->len = fprog->len; + fp->orig_prog = NULL; + + if (save_orig) { + err = bpf_prog_store_orig_filter(fp, fprog); + if (err) { + __bpf_prog_free(fp); + return -ENOMEM; + } + } + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + fp = bpf_prepare_filter(fp, trans); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + *pfp = fp; + return 0; +} +EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); + void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); @@ -1135,7 +1209,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - prog = bpf_prepare_filter(prog); + prog = bpf_prepare_filter(prog, NULL); if (IS_ERR(prog)) return PTR_ERR(prog); @@ -1175,21 +1249,6 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } -/** - * bpf_skb_clone_not_writable - is the header of a clone not writable - * @skb: buffer to check - * @len: length up to which to write, can be negative - * - * Returns true if modifying the header part of the cloned buffer - * does require the data to be copied. I.e. this version works with - * negative lengths needed for eBPF case! 
- */ -static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len) -{ - return skb_header_cloned(skb) || - (int) skb_headroom(skb) + len > skb->hdr_len; -} - #define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) @@ -1212,9 +1271,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + len))) + !skb_clone_writable(skb, offset + len))) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, buf); @@ -1258,9 +1316,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely((u32) offset > 0xffff)) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) + !skb_clone_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1299,16 +1356,15 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = { static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; - u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); + bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags); int offset = (int) r2; __sum16 sum, *ptr; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) + !skb_clone_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1344,6 +1400,233 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; +#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1) + +static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, 
u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; + struct net_device *dev; + + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); + if (unlikely(!dev)) + return -EINVAL; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb2)) + return -ENOMEM; + + if (BPF_IS_REDIRECT_INGRESS(flags)) + return dev_forward_skb(dev, skb2); + + skb2->dev = dev; + skb_sender_cpu_clear(skb2); + return dev_queue_xmit(skb2); +} + +const struct bpf_func_proto bpf_clone_redirect_proto = { + .func = bpf_clone_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +struct redirect_info { + u32 ifindex; + u32 flags; +}; + +static DEFINE_PER_CPU(struct redirect_info, redirect_info); +static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + ri->ifindex = ifindex; + ri->flags = flags; + return TC_ACT_REDIRECT; +} + +int skb_do_redirect(struct sk_buff *skb) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct net_device *dev; + + dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); + ri->ifindex = 0; + if (unlikely(!dev)) { + kfree_skb(skb); + return -EINVAL; + } + + if (BPF_IS_REDIRECT_INGRESS(ri->flags)) + return dev_forward_skb(dev, skb); + + skb->dev = dev; + skb_sender_cpu_clear(skb); + return dev_queue_xmit(skb); +} + +const struct bpf_func_proto bpf_redirect_proto = { + .func = bpf_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return task_get_classid((struct sk_buff *) (unsigned long) r1); +} + +static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { + .func = bpf_get_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static u64 
bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + const struct dst_entry *dst; + + dst = skb_dst((struct sk_buff *) (unsigned long) r1); + if (dst) + return dst->tclassid; +#endif + return 0; +} + +static const struct bpf_func_proto bpf_get_route_realm_proto = { + .func = bpf_get_route_realm, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __be16 vlan_proto = (__force __be16) r2; + + if (unlikely(vlan_proto != htons(ETH_P_8021Q) && + vlan_proto != htons(ETH_P_8021AD))) + vlan_proto = htons(ETH_P_8021Q); + + return skb_vlan_push(skb, vlan_proto, vlan_tci); +} + +const struct bpf_func_proto bpf_skb_vlan_push_proto = { + .func = bpf_skb_vlan_push, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; +EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); + +static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + + return skb_vlan_pop(skb); +} + +const struct bpf_func_proto bpf_skb_vlan_pop_proto = { + .func = bpf_skb_vlan_pop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; +EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); + +bool bpf_helper_changes_skb_data(void *func) +{ + if (func == bpf_skb_vlan_push) + return true; + if (func == bpf_skb_vlan_pop) + return true; + return false; +} + +static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + + if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) + return -EINVAL; + if (ip_tunnel_info_af(info) != AF_INET) + 
return -EINVAL; + + to->tunnel_id = be64_to_cpu(info->key.tun_id); + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + + return 0; +} + +const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { + .func = bpf_skb_get_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static struct metadata_dst __percpu *md_dst; + +static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; + struct metadata_dst *md = this_cpu_ptr(md_dst); + struct ip_tunnel_info *info; + + if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags)) + return -EINVAL; + + skb_dst_drop(skb); + dst_hold((struct dst_entry *) md); + skb_dst_set(skb, (struct dst_entry *) md); + + info = &md->u.tun_info; + info->mode = IP_TUNNEL_INFO_TX; + info->key.tun_flags = TUNNEL_KEY; + info->key.tun_id = cpu_to_be64(from->tunnel_id); + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + + return 0; +} + +const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { + .func = bpf_skb_set_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +{ + if (!md_dst) { + /* race is not possible, since it's called from + * verifier that is holding verifier mutex + */ + md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + if (!md_dst) + return NULL; + } + return &bpf_skb_set_tunnel_key_proto; +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1358,6 +1641,13 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_prandom_u32_proto; case 
BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); default: return NULL; } @@ -1373,18 +1663,29 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_skb_vlan_push: + return &bpf_skb_vlan_push_proto; + case BPF_FUNC_skb_vlan_pop: + return &bpf_skb_vlan_pop_proto; + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_key_proto(); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; default: return sk_filter_func_proto(func_id); } } -static bool sk_filter_is_valid_access(int off, int size, - enum bpf_access_type type) +static bool __is_valid_access(int off, int size, enum bpf_access_type type) { - /* only read is allowed */ - if (type != BPF_READ) - return false; - /* check bounds */ if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -1400,8 +1701,50 @@ static bool sk_filter_is_valid_access(int off, int size, return true; } -static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, - struct bpf_insn *insn_buf) +static bool sk_filter_is_valid_access(int off, int size, + enum bpf_access_type type) +{ + if (off == offsetof(struct __sk_buff, tc_classid)) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, cb[0]) ... 
+ offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + return __is_valid_access(off, size, type); +} + +static bool tc_cls_act_is_valid_access(int off, int size, + enum bpf_access_type type) +{ + if (off == offsetof(struct __sk_buff, tc_classid)) + return type == BPF_WRITE ? true : false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, tc_index): + case offsetof(struct __sk_buff, priority): + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + return __is_valid_access(off, size, type); +} + +static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -1430,12 +1773,49 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, case offsetof(struct __sk_buff, priority): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, priority)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, priority)); + break; + + case offsetof(struct __sk_buff, ingress_ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, priority)); + offsetof(struct sk_buff, skb_iif)); + break; + + case offsetof(struct __sk_buff, ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + dst_reg, src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + offsetof(struct net_device, ifindex)); + break; + + case offsetof(struct __sk_buff, hash): + 
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, hash)); break; case offsetof(struct __sk_buff, mark): - return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; case offsetof(struct __sk_buff, pkt_type): return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); @@ -1450,6 +1830,47 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, case offsetof(struct __sk_buff, vlan_tci): return convert_skb_access(SKF_AD_VLAN_TAG, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + + prog->cb_access = 1; + ctx_off -= offsetof(struct __sk_buff, cb[0]); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct qdisc_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + break; + + case offsetof(struct __sk_buff, tc_classid): + ctx_off -= offsetof(struct __sk_buff, tc_classid); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); + WARN_ON(type != BPF_WRITE); + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + break; + + case offsetof(struct __sk_buff, tc_index): +#ifdef CONFIG_NET_SCHED + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, tc_index)); + else + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, tc_index)); + break; +#else + if (type == BPF_WRITE) + 
*insn++ = BPF_MOV64_REG(dst_reg, dst_reg); + else + *insn++ = BPF_MOV64_IMM(dst_reg, 0); + break; +#endif } return insn - insn_buf; @@ -1458,13 +1879,13 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, - .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { @@ -1526,9 +1947,13 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; /* We're copying the filter that has been originally attached, - * so no conversion/decode needed anymore. + * so no conversion/decode needed anymore. eBPF programs that + * have no original program cannot be dumped through this. 
*/ + ret = -EACCES; fprog = filter->prog->orig_prog; + if (!fprog) + goto out; ret = fprog->len; if (!len) diff --git a/kernel/net/core/flow_dissector.c b/kernel/net/core/flow_dissector.c index 2c35c02a9..12e700332 100644 --- a/kernel/net/core/flow_dissector.c +++ b/kernel/net/core/flow_dissector.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -12,19 +13,60 @@ #include #include #include -#include +#include +#include +#include +#include #include -/* copy saddr & daddr, possibly using 64bit load/store - * Equivalent to : flow->src = iph->saddr; - * flow->dst = iph->daddr; - */ -static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) +static bool dissector_uses_key(const struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + return flow_dissector->used_keys & (1 << key_id); +} + +static void dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + flow_dissector->used_keys |= (1 << key_id); +} + +static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id, + void *target_container) +{ + return ((char *) target_container) + flow_dissector->offset[key_id]; +} + +void skb_flow_dissector_init(struct flow_dissector *flow_dissector, + const struct flow_dissector_key *key, + unsigned int key_count) { - BUILD_BUG_ON(offsetof(typeof(*flow), dst) != - offsetof(typeof(*flow), src) + sizeof(flow->src)); - memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); + unsigned int i; + + memset(flow_dissector, 0, sizeof(*flow_dissector)); + + for (i = 0; i < key_count; i++, key++) { + /* User should make sure that every key target offset is withing + * boundaries of unsigned short. 
+ */ + BUG_ON(key->offset > USHRT_MAX); + BUG_ON(dissector_uses_key(flow_dissector, + key->key_id)); + + dissector_set_key(flow_dissector, key->key_id); + flow_dissector->offset[key->key_id] = key->offset; + } + + /* Ensure that the dissector always includes control and basic key. + * That way we are able to avoid handling lack of these in fast path. + */ + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); } +EXPORT_SYMBOL(skb_flow_dissector_init); /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -63,18 +105,33 @@ EXPORT_SYMBOL(__skb_flow_get_ports); /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @flow_dissector: list of keys to dissect + * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * - * The function will try to retrieve the struct flow_keys from either the skbuff - * or a raw buffer specified by the rest parameters + * The function will try to retrieve individual keys into target specified + * by flow_dissector from either the skbuff or a raw buffer specified by the + * rest parameters. + * + * Caller must take care of zeroing target container memory. 
*/ -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, - void *data, __be16 proto, int nhoff, int hlen) +bool __skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + void *data, __be16 proto, int nhoff, int hlen, + unsigned int flags) { - u8 ip_proto; + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_keyid *key_keyid; + u8 ip_proto = 0; + bool ret = false; if (!data) { data = skb->data; @@ -83,7 +140,30 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, hlen = skb_headlen(skb); } - memset(flow, 0, sizeof(*flow)); + /* It is ensured by skb_flow_dissector_init() that control key will + * be always present. + */ + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. + */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct ethhdr *eth = eth_hdr(skb); + struct flow_dissector_key_eth_addrs *key_eth_addrs; + + key_eth_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + target_container); + memcpy(key_eth_addrs, ð->h_dest, sizeof(*key_eth_addrs)); + } again: switch (proto) { @@ -93,57 +173,82 @@ again: ip: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) - return false; + goto out_bad; nhoff += iph->ihl * 4; ip_proto = iph->protocol; - if (ip_is_fragment(iph)) - ip_proto = 0; - /* skip the address processing if skb is NULL. 
The assumption - * here is that if there is no skb we are not looking for flow - * info but lengths and protocols. - */ - if (!skb) + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; - iph_to_flow_copy_addrs(flow, iph); + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + + if (ip_is_fragment(iph)) { + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + if (iph->frag_off & htons(IP_OFFSET)) { + goto out_good; + } else { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) + goto out_good; + } + } + + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) + goto out_good; + break; } case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; - __be32 flow_label; ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) - return false; + goto out_bad; ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - /* see comment above in IPv4 section */ - if (!skb) - break; + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; - flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); - flow_label = ip6_flowlabel(iph); - if (flow_label) { - /* Awesome, IPv6 packet has a flow label so we can - * use that to represent the ports without any - * further dissection. 
- */ - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->ports = flow_label; - flow->thoff = (u16)nhoff; + memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } - return true; + if ((dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL) || + (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && + ip6_flowlabel(iph)) { + __be32 flow_label = ip6_flowlabel(iph); + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_label); + } + if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) + goto out_good; } + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) + goto out_good; + break; } case htons(ETH_P_8021AD): @@ -153,7 +258,16 @@ ipv6: vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) - return false; + goto out_bad; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID, + target_container); + + key_tags->vlan_id = skb_vlan_tag_get_id(skb); + } proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); @@ -166,7 +280,7 @@ ipv6: } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; proto = hdr->proto; nhoff += PPPOE_SES_HLEN; switch (proto) { @@ -175,7 +289,7 @@ ipv6: case htons(PPP_IPV6): goto ipv6; default: - return false; + goto out_bad; } } case htons(ETH_P_TIPC): { @@ -185,20 +299,53 @@ ipv6: } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; - flow->src = hdr->srcnode; - flow->dst = 0; - flow->n_proto = proto; - flow->thoff = (u16)nhoff; - return true; + goto out_bad; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { + 
key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC_ADDRS, + target_container); + key_addrs->tipcaddrs.srcnode = hdr->srcnode; + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; + } + goto out_good; } + + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): { + struct mpls_label *hdr, _hdr[2]; +mpls: + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) + goto out_bad; + + if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> + MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, + target_container); + key_keyid->keyid = hdr[1].entry & + htonl(MPLS_LS_LABEL_MASK); + } + + goto out_good; + } + + goto out_good; + } + case htons(ETH_P_FCOE): - flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: - return false; + goto out_bad; } +ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: { struct gre_hdr { @@ -208,56 +355,149 @@ ipv6: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; /* * Only look inside GRE if version zero and no * routing */ - if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { - proto = hdr->proto; + if (hdr->flags & (GRE_VERSION | GRE_ROUTING)) + break; + + proto = hdr->proto; + nhoff += 4; + if (hdr->flags & GRE_CSUM) nhoff += 4; - if (hdr->flags & GRE_CSUM) - nhoff += 4; - if (hdr->flags & GRE_KEY) - nhoff += 4; - if (hdr->flags & GRE_SEQ) - nhoff += 4; - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) - return false; - proto = eth->h_proto; - nhoff += sizeof(*eth); + if (hdr->flags & GRE_KEY) { + const __be32 *keyid; + __be32 _keyid; + + keyid = 
__skb_header_pointer(skb, nhoff, sizeof(_keyid), + data, hlen, &_keyid); + + if (!keyid) + goto out_bad; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID, + target_container); + key_keyid->keyid = *keyid; } - goto again; + nhoff += 4; } - break; + if (hdr->flags & GRE_SEQ) + nhoff += 4; + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); + if (!eth) + goto out_bad; + proto = eth->h_proto; + nhoff += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + hlen = nhoff; + } + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + + goto again; + } + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 _opthdr[2], *opthdr; + + if (proto != htons(ETH_P_IPV6)) + break; + + opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), + data, hlen, &_opthdr); + if (!opthdr) + goto out_bad; + + ip_proto = opthdr[0]; + nhoff += (opthdr[1] + 1) << 3; + + goto ip_proto_again; + } + case NEXTHDR_FRAGMENT: { + struct frag_hdr _fh, *fh; + + if (proto != htons(ETH_P_IPV6)) + break; + + fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), + data, hlen, &_fh); + + if (!fh) + goto out_bad; + + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + nhoff += sizeof(_fh); + + if (!(fh->frag_off & htons(IP6_OFFSET))) { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { + ip_proto = fh->nexthdr; + goto ip_proto_again; + } + } + goto out_good; } case IPPROTO_IPIP: proto = htons(ETH_P_IP); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto ip; case IPPROTO_IPV6: 
proto = htons(ETH_P_IPV6); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto ipv6; + case IPPROTO_MPLS: + proto = htons(ETH_P_MPLS_UC); + goto mpls; default: break; } - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->thoff = (u16) nhoff; + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); + } + +out_good: + ret = true; - /* unless skb is set we don't need to record port info */ - if (skb) - flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); +out_bad: + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_control->thoff = (u16)nhoff; - return true; + return ret; } EXPORT_SYMBOL(__skb_flow_dissect); @@ -267,27 +507,112 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) +static __always_inline u32 __flow_hash_words(const u32 *words, u32 length, + u32 keyval) { - __flow_hash_secret_init(); - return jhash_3words(a, b, c, hashrnd); + return jhash2(words, length, keyval); } -static inline u32 __flow_hash_from_keys(struct flow_keys *keys) +static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow) { - u32 hash; + const void *p = flow; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys->dst < (__force u32)keys->src) || - (((__force u32)keys->dst == (__force u32)keys->src) && - ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { - swap(keys->dst, keys->src); - swap(keys->port16[0], keys->port16[1]); + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); + return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET); +} + +static inline size_t flow_keys_hash_length(const struct 
flow_keys *flow) +{ + size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != + sizeof(*flow) - sizeof(flow->addrs)); + + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + diff -= sizeof(flow->addrs.v4addrs); + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + diff -= sizeof(flow->addrs.v6addrs); + break; + case FLOW_DISSECTOR_KEY_TIPC_ADDRS: + diff -= sizeof(flow->addrs.tipcaddrs); + break; + } + return (sizeof(*flow) - diff) / sizeof(u32); +} + +__be32 flow_get_u32_src(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.src; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.src); + case FLOW_DISSECTOR_KEY_TIPC_ADDRS: + return flow->addrs.tipcaddrs.srcnode; + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_src); + +__be32 flow_get_u32_dst(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.dst; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.dst); + default: + return 0; } +} +EXPORT_SYMBOL(flow_get_u32_dst); - hash = __flow_hash_3words((__force u32)keys->dst, - (__force u32)keys->src, - (__force u32)keys->ports); +static inline void __flow_hash_consistentify(struct flow_keys *keys) +{ + int addr_diff, i; + + switch (keys->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + addr_diff = (__force u32)keys->addrs.v4addrs.dst - + (__force u32)keys->addrs.v4addrs.src; + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + swap(keys->ports.src, keys->ports.dst); + } + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + 
addr_diff = memcmp(&keys->addrs.v6addrs.dst, + &keys->addrs.v6addrs.src, + sizeof(keys->addrs.v6addrs.dst)); + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + for (i = 0; i < 4; i++) + swap(keys->addrs.v6addrs.src.s6_addr32[i], + keys->addrs.v6addrs.dst.s6_addr32[i]); + swap(keys->ports.src, keys->ports.dst); + } + break; + } +} + +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) +{ + u32 hash; + + __flow_hash_consistentify(keys); + + hash = __flow_hash_words(flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -296,12 +621,52 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys) u32 flow_hash_from_keys(struct flow_keys *keys) { - return __flow_hash_from_keys(keys); + __flow_hash_secret_init(); + return __flow_hash_from_keys(keys, hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); -/* - * __skb_get_hash: calculate a flow hash based on src/dst addresses +static inline u32 ___skb_get_hash(const struct sk_buff *skb, + struct flow_keys *keys, u32 keyval) +{ + skb_flow_dissect_flow_keys(skb, keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + return __flow_hash_from_keys(keys, keyval); +} + +struct _flow_keys_digest_data { + __be16 n_proto; + u8 ip_proto; + u8 padding; + __be32 ports; + __be32 src; + __be32 dst; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow) +{ + struct _flow_keys_digest_data *data = + (struct _flow_keys_digest_data *)digest; + + BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); + + memset(digest, 0, sizeof(*digest)); + + data->n_proto = flow->basic.n_proto; + data->ip_proto = flow->basic.ip_proto; + data->ports = flow->ports.ports; + data->src = flow->addrs.v4addrs.src; + data->dst = flow->addrs.v4addrs.dst; +} +EXPORT_SYMBOL(make_flow_keys_digest); + +/** + * __skb_get_hash: calculate a flow hash + * @skb: sk_buff to calculate flow hash from + * + * This function 
calculates a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. @@ -310,52 +675,72 @@ void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect(skb, &keys)) - return; + __flow_hash_secret_init(); - if (keys.ports) - skb->l4_hash = 1; + __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd), + flow_keys_have_l4(&keys)); +} +EXPORT_SYMBOL(__skb_get_hash); - skb->sw_hash = 1; +__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) +{ + struct flow_keys keys; - skb->hash = __flow_hash_from_keys(&keys); + return ___skb_get_hash(skb, &keys, perturb); } -EXPORT_SYMBOL(__skb_get_hash); +EXPORT_SYMBOL(skb_get_hash_perturb); -/* - * Returns a Tx hash based on the given packet descriptor a Tx queues' number - * to be used as a distribution range. - */ -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, - unsigned int num_tx_queues) +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { - u32 hash; - u16 qoffset = 0; - u16 qcount = num_tx_queues; - - if (skb_rx_queue_recorded(skb)) { - hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; - return hash; - } + struct flow_keys keys; - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; - } + memset(&keys, 0, sizeof(keys)); + + memcpy(&keys.addrs.v6addrs.src, &fl6->saddr, + sizeof(keys.addrs.v6addrs.src)); + memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys.addrs.v6addrs.dst)); + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys.ports.src = fl6->fl6_sport; + keys.ports.dst = fl6->fl6_dport; + keys.keyid.keyid = fl6->fl6_gre_key; + keys.tags.flow_label = (__force u32)fl6->flowlabel; + 
keys.basic.ip_proto = fl6->flowi6_proto; + + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); - return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; + return skb->hash; } -EXPORT_SYMBOL(__skb_tx_hash); +EXPORT_SYMBOL(__skb_get_hash_flowi6); + +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) +{ + struct flow_keys keys; + + memset(&keys, 0, sizeof(keys)); + + keys.addrs.v4addrs.src = fl4->saddr; + keys.addrs.v4addrs.dst = fl4->daddr; + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys.ports.src = fl4->fl4_sport; + keys.ports.dst = fl4->fl4_dport; + keys.keyid.keyid = fl4->fl4_gre_key; + keys.basic.ip_proto = fl4->flowi4_proto; + + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); + + return skb->hash; +} +EXPORT_SYMBOL(__skb_get_hash_flowi4); u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { - u32 poff = keys->thoff; + u32 poff = keys->control.thoff; - switch (keys->ip_proto) { + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; @@ -396,8 +781,12 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, return poff; } -/* skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically +/** + * skb_get_poff - get the offset to the payload + * @skb: sk_buff to get the payload offset from + * + * The function will get the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. 
*/ @@ -405,86 +794,111 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect(skb, &keys)) + if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) { -#ifdef CONFIG_XPS - struct xps_dev_maps *dev_maps; - struct xps_map *map; - int queue_index = -1; - - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); - if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[skb->sender_cpu - 1]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else - queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), - map->len)]; - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; - } - } - rcu_read_unlock(); - - return queue_index; -#else - return -1; -#endif + memset(keys, 0, sizeof(*keys)); + + memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, + sizeof(keys->addrs.v6addrs.src)); + memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys->addrs.v6addrs.dst)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->ports.src = fl6->fl6_sport; + keys->ports.dst = fl6->fl6_dport; + keys->keyid.keyid = fl6->fl6_gre_key; + keys->tags.flow_label = (__force u32)fl6->flowlabel; + keys->basic.ip_proto = fl6->flowi6_proto; + + return flow_hash_from_keys(keys); } +EXPORT_SYMBOL(__get_hash_from_flowi6); -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys) { - struct sock *sk = skb->sk; - int queue_index = sk_tx_queue_get(sk); - - if (queue_index < 0 || skb->ooo_okay || - queue_index >= dev->real_num_tx_queues) { - int new_index = get_xps_queue(dev, skb); - if (new_index < 0) - new_index = skb_tx_hash(dev, skb); + memset(keys, 0, sizeof(*keys)); - if (queue_index 
!= new_index && sk && - rcu_access_pointer(sk->sk_dst_cache)) - sk_tx_queue_set(sk, new_index); + keys->addrs.v4addrs.src = fl4->saddr; + keys->addrs.v4addrs.dst = fl4->daddr; + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys->ports.src = fl4->fl4_sport; + keys->ports.dst = fl4->fl4_dport; + keys->keyid.keyid = fl4->fl4_gre_key; + keys->basic.ip_proto = fl4->flowi4_proto; - queue_index = new_index; - } - - return queue_index; + return flow_hash_from_keys(keys); } - -struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb, - void *accel_priv) +EXPORT_SYMBOL(__get_hash_from_flowi4); + +static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, + .offset = offsetof(struct flow_keys, addrs.tipcaddrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, + { + .key_id = FLOW_DISSECTOR_KEY_VLANID, + .offset = offsetof(struct flow_keys, tags), + }, + { + .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, + .offset = offsetof(struct flow_keys, tags), + }, + { + .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, + .offset = offsetof(struct flow_keys, keyid), + }, +}; + +static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, +}; + +struct flow_dissector flow_keys_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_dissector); + +struct 
flow_dissector flow_keys_buf_dissector __read_mostly; + +static int __init init_default_flow_dissectors(void) { - int queue_index = 0; - -#ifdef CONFIG_XPS - if (skb->sender_cpu == 0) - skb->sender_cpu = raw_smp_processor_id() + 1; -#endif - - if (dev->real_num_tx_queues != 1) { - const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, accel_priv, - __netdev_pick_tx); - else - queue_index = __netdev_pick_tx(dev, skb); - - if (!accel_priv) - queue_index = netdev_cap_txqueue(dev, queue_index); - } - - skb_set_queue_mapping(skb, queue_index); - return netdev_get_tx_queue(dev, queue_index); + skb_flow_dissector_init(&flow_keys_dissector, + flow_keys_dissector_keys, + ARRAY_SIZE(flow_keys_dissector_keys)); + skb_flow_dissector_init(&flow_keys_buf_dissector, + flow_keys_buf_dissector_keys, + ARRAY_SIZE(flow_keys_buf_dissector_keys)); + return 0; } + +late_initcall_sync(init_default_flow_dissectors); diff --git a/kernel/net/core/gen_estimator.c b/kernel/net/core/gen_estimator.c index 9dfb88a93..92d886f4a 100644 --- a/kernel/net/core/gen_estimator.c +++ b/kernel/net/core/gen_estimator.c @@ -66,7 +66,7 @@ NOTES. - * avbps is scaled by 2^5, avpps is scaled by 2^10. + * avbps and avpps are scaled by 2^5. * both values are reported as 32 bit unsigned values. 
bps can overflow for fast links : max speed being 34360Mbit/sec * Minimal interval is HZ/4=250msec (it is the greatest common divisor @@ -85,10 +85,10 @@ struct gen_estimator struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; int ewma_log; + u32 last_packets; + unsigned long avpps; u64 last_bytes; u64 avbps; - u32 last_packets; - u32 avpps; struct rcu_head e_rcu; struct rb_node node; struct gnet_stats_basic_cpu __percpu *cpu_bstats; @@ -118,8 +118,8 @@ static void est_timer(unsigned long arg) rcu_read_lock(); list_for_each_entry_rcu(e, &elist[idx].list, list) { struct gnet_stats_basic_packed b = {0}; + unsigned long rate; u64 brate; - u32 rate; spin_lock(e->stats_lock); read_lock(&est_lock); @@ -133,10 +133,11 @@ static void est_timer(unsigned long arg) e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); e->rate_est->bps = (e->avbps+0xF)>>5; - rate = (b.packets - e->last_packets)<<(12 - idx); + rate = b.packets - e->last_packets; + rate <<= (7 - idx); e->last_packets = b.packets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); - e->rate_est->pps = (e->avpps+0x1FF)>>10; + e->rate_est->pps = (e->avpps + 0xF) >> 5; skip: read_unlock(&est_lock); spin_unlock(e->stats_lock); diff --git a/kernel/net/core/lwtunnel.c b/kernel/net/core/lwtunnel.c new file mode 100644 index 000000000..299cfc24d --- /dev/null +++ b/kernel/net/core/lwtunnel.c @@ -0,0 +1,249 @@ +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) +{ + struct lwtunnel_state *lws; + + lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); + + return lws; +} +EXPORT_SYMBOL(lwtunnel_state_alloc); + +static const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int encap_type) +{ + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[encap_type], + ops, NULL) == ops) ? 
0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, unsigned int family, + const void *cfg, struct lwtunnel_state **lws) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return ret; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state)) + ret = ops->build_state(dev, encap, family, cfg, lws); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret = -EINVAL; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + nest = nla_nest_start(skb, RTA_ENCAP); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto nla_put_failure; + nla_nest_end(skb, nest); + ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type); + if (ret) + goto nla_put_failure; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + + return (ret == -EOPNOTSUPP ? 
0 : ret); +} +EXPORT_SYMBOL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_get_encap_size); + +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!a && !b) + return 0; + + if (!a || !b) + return 1; + + if (a->type != b->type) + return 1; + + if (a->type == LWTUNNEL_ENCAP_NONE || + a->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[a->type]); + if (likely(ops && ops->cmp_encap)) + ret = ops->cmp_encap(a, b); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_cmp_encap); + +int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->output)) + ret = ops->output(net, sk, skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_output); + +int lwtunnel_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + 
lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->input)) + ret = ops->input(skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_input); diff --git a/kernel/net/core/neighbour.c b/kernel/net/core/neighbour.c index 2237c1b3c..f18ae91b6 100644 --- a/kernel/net/core/neighbour.c +++ b/kernel/net/core/neighbour.c @@ -274,8 +274,12 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { if (!neigh_forced_gc(tbl) && - entries >= tbl->gc_thresh3) + entries >= tbl->gc_thresh3) { + net_info_ratelimited("%s: neighbor table overflow!\n", + tbl->id); + NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; + } } n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); @@ -853,7 +857,7 @@ static void neigh_probe(struct neighbour *neigh) struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) - skb = skb_copy(skb, GFP_ATOMIC); + skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); @@ -913,6 +917,7 @@ static void neigh_timer_handler(unsigned long arg) neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); + notify = 1; next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } } else { @@ -1155,6 +1160,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (new != old) { neigh_del_timer(neigh); + if (new & NUD_PROBE) + atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? 
@@ -1846,6 +1853,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; ndst.ndts_forced_gc_runs += st->forced_gc_runs; + ndst.ndts_table_fulls += st->table_fulls; } if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) @@ -2207,7 +2215,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, ndm->ndm_pad2 = 0; ndm->ndm_flags = pn->flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; - ndm->ndm_ifindex = pn->dev->ifindex; + ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) @@ -2227,14 +2235,53 @@ static void neigh_update_notify(struct neighbour *neigh) __neigh_notify(neigh, RTM_NEWNEIGH, 0); } +static bool neigh_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = netdev_master_upper_dev_get(dev); + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) +{ + if (filter_idx && dev->ifindex != filter_idx) + return true; + + return false; +} + static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + const struct nlmsghdr *nlh = cb->nlh; + struct nlattr *tb[NDA_MAX + 1]; struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; + int filter_master_idx = 0, filter_idx = 0; + unsigned int flags = NLM_F_MULTI; + int err; + + err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL); + if (!err) { + if (tb[NDA_IFINDEX]) + filter_idx = nla_get_u32(tb[NDA_IFINDEX]); + + if (tb[NDA_MASTER]) + filter_master_idx = nla_get_u32(tb[NDA_MASTER]); + + if (filter_idx || filter_master_idx) + flags |= NLM_F_DUMP_FILTERED; + } 
rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); @@ -2247,12 +2294,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, n = rcu_dereference_bh(n->next)) { if (!net_eq(dev_net(n->dev), net)) continue; + if (neigh_ifindex_filtered(n->dev, filter_idx)) + continue; + if (neigh_master_filtered(n->dev, filter_master_idx)) + continue; if (idx < s_idx) goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, - NLM_F_MULTI) < 0) { + flags) < 0) { rc = -1; goto out; } @@ -2282,7 +2333,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { - if (dev_net(n->dev) != net) + if (pneigh_net(n) != net) continue; if (idx < s_idx) goto next; @@ -2714,12 +2765,12 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n"); + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx %08lx\n", + "%08lx %08lx %08lx %08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, @@ -2736,7 +2787,8 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) st->periodic_gc_runs, st->forced_gc_runs, - st->unres_discards + st->unres_discards, + st->table_fulls ); return 0; diff --git a/kernel/net/core/net-sysfs.c b/kernel/net/core/net-sysfs.c index 4238d6da5..f88a62ab0 100644 --- a/kernel/net/core/net-sysfs.c +++ b/kernel/net/core/net-sysfs.c @@ -31,7 +31,6 @@ static const char fmt_hex[] = "%#x\n"; static const char fmt_long_hex[] = "%#lx\n"; static const char 
fmt_dec[] = "%d\n"; -static const char fmt_udec[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -202,7 +201,7 @@ static ssize_t speed_show(struct device *dev, if (netif_running(netdev)) { struct ethtool_cmd cmd; if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); + ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); } rtnl_unlock(); return ret; @@ -404,6 +403,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr, NETDEVICE_SHOW(group, fmt_dec); static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); +static int change_proto_down(struct net_device *dev, unsigned long proto_down) +{ + return dev_change_proto_down(dev, (bool) proto_down); +} + +static ssize_t proto_down_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_proto_down); +} +NETDEVICE_SHOW_RW(proto_down, fmt_dec); + static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -458,11 +470,15 @@ static ssize_t phys_switch_id_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; - ret = netdev_switch_parent_id_get(netdev, &ppid); + ret = switchdev_port_attr_get(netdev, &attr); if (!ret) - ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len, + attr.u.ppid.id); } rtnl_unlock(); @@ -497,6 +513,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, + &dev_attr_proto_down.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); @@ -671,7 +688,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, struct rps_map *old_map, *map; 
cpumask_var_t mask; int err, cpu, i; - static DEFINE_SPINLOCK(rps_map_lock); + static DEFINE_MUTEX(rps_map_mutex); if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -704,18 +721,21 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, map = NULL; } - spin_lock(&rps_map_lock); + mutex_lock(&rps_map_mutex); old_map = rcu_dereference_protected(queue->rps_map, - lockdep_is_held(&rps_map_lock)); + mutex_is_locked(&rps_map_mutex)); rcu_assign_pointer(queue->rps_map, map); - spin_unlock(&rps_map_lock); if (map) static_key_slow_inc(&rps_needed); - if (old_map) { - kfree_rcu(old_map, rcu); + if (old_map) static_key_slow_dec(&rps_needed); - } + + mutex_unlock(&rps_map_mutex); + + if (old_map) + kfree_rcu(old_map, rcu); + free_cpumask_var(mask); return len; } @@ -983,15 +1003,12 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) - if (queue == &dev->_tx[i]) - break; + unsigned int i; + i = queue - dev->_tx; BUG_ON(i >= dev->num_tx_queues); return i; @@ -1460,6 +1477,15 @@ static int of_dev_node_match(struct device *dev, const void *data) return ret == 0 ? dev->of_node == data : ret; } +/* + * of_find_net_device_by_node - lookup the net device for the device node + * @np: OF device node + * + * Looks up the net_device structure corresponding with the device node. + * If successful, returns a pointer to the net_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped when done with the net_device. 
+ */ struct net_device *of_find_net_device_by_node(struct device_node *np) { struct device *dev; diff --git a/kernel/net/core/net-traces.c b/kernel/net/core/net-traces.c index ba3c01207..adef015b2 100644 --- a/kernel/net/core/net-traces.c +++ b/kernel/net/core/net-traces.c @@ -31,6 +31,7 @@ #include #include #include +#include EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/kernel/net/core/net_namespace.c b/kernel/net/core/net_namespace.c index 572af0011..2c2eb1b62 100644 --- a/kernel/net/core/net_namespace.c +++ b/kernel/net/core/net_namespace.c @@ -147,24 +147,17 @@ static void ops_free_list(const struct pernet_operations *ops, } } -static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, - int id); +/* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { - int min = 0, max = 0, id; - - ASSERT_RTNL(); + int min = 0, max = 0; if (reqid >= 0) { min = reqid; max = reqid + 1; } - id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); - if (id >= 0) - rtnl_net_notifyid(net, peer, RTM_NEWNSID, id); - - return id; + return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); } /* This function is used by idr_for_each(). If net is equal to peer, the @@ -180,11 +173,16 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -static int __peernet2id(struct net *net, struct net *peer, bool alloc) +/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc + * is set to true, thus the caller knows that the new id must be notified via + * rtnl. + */ +static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); + bool alloc_it = *alloc; - ASSERT_RTNL(); + *alloc = false; /* Magic value for id 0. 
*/ if (id == NET_ID_ZERO) @@ -192,36 +190,77 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) if (id > 0) return id; - if (alloc) - return alloc_netid(net, peer, -1); + if (alloc_it) { + id = alloc_netid(net, peer, -1); + *alloc = true; + return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + } + + return NETNSA_NSID_NOT_ASSIGNED; +} + +/* should be called with nsid_lock held */ +static int __peernet2id(struct net *net, struct net *peer) +{ + bool no = false; - return -ENOENT; + return __peernet2id_alloc(net, peer, &no); } +static void rtnl_net_notifyid(struct net *net, int cmd, int id); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ +int peernet2id_alloc(struct net *net, struct net *peer) +{ + unsigned long flags; + bool alloc; + int id; + + spin_lock_irqsave(&net->nsid_lock, flags); + alloc = atomic_read(&peer->count) == 0 ? false : true; + id = __peernet2id_alloc(net, peer, &alloc); + spin_unlock_irqrestore(&net->nsid_lock, flags); + if (alloc && id >= 0) + rtnl_net_notifyid(net, RTM_NEWNSID, id); + return id; +} +EXPORT_SYMBOL(peernet2id_alloc); + +/* This function returns, if assigned, the id of a peer netns. */ int peernet2id(struct net *net, struct net *peer) { - bool alloc = atomic_read(&peer->count) == 0 ? false : true; + unsigned long flags; int id; - id = __peernet2id(net, peer, alloc); - return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + spin_lock_irqsave(&net->nsid_lock, flags); + id = __peernet2id(net, peer); + spin_unlock_irqrestore(&net->nsid_lock, flags); + return id; +} + +/* This function returns true if the peer netns has an id assigned into the + * current netns.
+ */ +bool peernet_has_id(struct net *net, struct net *peer) +{ + return peernet2id(net, peer) >= 0; } -EXPORT_SYMBOL(peernet2id); struct net *get_net_ns_by_id(struct net *net, int id) { + unsigned long flags; struct net *peer; if (id < 0) return NULL; rcu_read_lock(); + spin_lock_irqsave(&net->nsid_lock, flags); peer = idr_find(&net->netns_ids, id); if (peer) get_net(peer); + spin_unlock_irqrestore(&net->nsid_lock, flags); rcu_read_unlock(); return peer; @@ -242,6 +281,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); + spin_lock_init(&net->nsid_lock); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -362,14 +402,19 @@ static void cleanup_net(struct work_struct *work) list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); for_each_net(tmp) { - int id = __peernet2id(tmp, net, false); + int id; - if (id >= 0) { - rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); + spin_lock_irq(&tmp->nsid_lock); + id = __peernet2id(tmp, net); + if (id >= 0) idr_remove(&tmp->netns_ids, id); - } + spin_unlock_irq(&tmp->nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id); } + spin_lock_irq(&net->nsid_lock); idr_destroy(&net->netns_ids); + spin_unlock_irq(&net->nsid_lock); } rtnl_unlock(); @@ -497,6 +542,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + unsigned long flags; struct net *peer; int nsid, err; @@ -517,14 +563,19 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); - if (__peernet2id(net, peer, false) >= 0) { + spin_lock_irqsave(&net->nsid_lock, flags); + if (__peernet2id(net, peer) >= 0) { + spin_unlock_irqrestore(&net->nsid_lock, flags); err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); - if (err > 0) + 
spin_unlock_irqrestore(&net->nsid_lock, flags); + if (err >= 0) { + rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; + } out: put_net(peer); return err; @@ -538,14 +589,10 @@ static int rtnl_net_get_size(void) } static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, struct net *peer, - int nsid) + int cmd, struct net *net, int nsid) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - int id; - - ASSERT_RTNL(); nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); if (!nlh) @@ -554,14 +601,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nsid >= 0) { - id = nsid; - } else { - id = __peernet2id(net, peer, false); - if (id < 0) - id = NETNSA_NSID_NOT_ASSIGNED; - } - if (nla_put_s32(skb, NETNSA_NSID, id)) + if (nla_put_s32(skb, NETNSA_NSID, nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -578,7 +618,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[NETNSA_MAX + 1]; struct sk_buff *msg; struct net *peer; - int err; + int err, id; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy); @@ -600,8 +640,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } + id = peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, net, peer, -1); + RTM_NEWNSID, net, id); if (err < 0) goto err_out; @@ -633,7 +674,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, peer, id); + RTM_NEWNSID, net_cb->net, id); if (ret < 0) return ret; @@ -652,17 +693,17 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) .idx = 0, .s_idx = cb->args[0], }; + unsigned long flags; - ASSERT_RTNL(); - + 
spin_lock_irqsave(&net->nsid_lock, flags); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); + spin_unlock_irqrestore(&net->nsid_lock, flags); cb->args[0] = net_cb.idx; return skb->len; } -static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, - int id) +static void rtnl_net_notifyid(struct net *net, int cmd, int id) { struct sk_buff *msg; int err = -ENOMEM; @@ -671,7 +712,7 @@ static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id); + err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); if (err < 0) goto err_out; diff --git a/kernel/net/core/netclassid_cgroup.c b/kernel/net/core/netclassid_cgroup.c index 1f2a126f4..d9ee8d08a 100644 --- a/kernel/net/core/netclassid_cgroup.c +++ b/kernel/net/core/netclassid_cgroup.c @@ -23,7 +23,8 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return css_cls_state(task_css(p, net_cls_cgrp_id)); + return css_cls_state(task_css_check(p, net_cls_cgrp_id, + rcu_read_lock_bh_held())); } EXPORT_SYMBOL_GPL(task_cls_state); @@ -55,7 +56,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) kfree(css_cls_state(css)); } -static int update_classid(const void *v, struct file *file, unsigned n) +static int update_classid_sock(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); @@ -66,18 +67,27 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void update_classid(struct cgroup_subsys_state *css, void *v) { - struct cgroup_cls_state *cs = css_cls_state(css); - void *v = (void *)(unsigned long)cs->classid; + struct css_task_iter it; struct task_struct *p; - cgroup_taskset_for_each(p, tset) { + css_task_iter_start(css, &it); + while 
((p = css_task_iter_next(&it))) { task_lock(p); - iterate_fd(p->files, 0, update_classid, v); + iterate_fd(p->files, 0, update_classid_sock, v); task_unlock(p); } + css_task_iter_end(&it); +} + +static void cgrp_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + + cgroup_taskset_first(tset, &css); + update_classid(css, + (void *)(unsigned long)css_cls_state(css)->classid); } static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) @@ -88,8 +98,11 @@ static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, u64 value) { - css_cls_state(css)->classid = (u32) value; + struct cgroup_cls_state *cs = css_cls_state(css); + + cs->classid = (u32)value; + update_classid(css, (void *)(unsigned long)cs->classid); return 0; } diff --git a/kernel/net/core/netevent.c b/kernel/net/core/netevent.c index f17ccd291..8b3bc4fac 100644 --- a/kernel/net/core/netevent.c +++ b/kernel/net/core/netevent.c @@ -31,10 +31,7 @@ static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); */ int register_netevent_notifier(struct notifier_block *nb) { - int err; - - err = atomic_notifier_chain_register(&netevent_notif_chain, nb); - return err; + return atomic_notifier_chain_register(&netevent_notif_chain, nb); } EXPORT_SYMBOL_GPL(register_netevent_notifier); diff --git a/kernel/net/core/netpoll.c b/kernel/net/core/netpoll.c index c126a878c..94acfc89a 100644 --- a/kernel/net/core/netpoll.c +++ b/kernel/net/core/netpoll.c @@ -140,36 +140,42 @@ static void queue_process(struct work_struct *work) * case. Further, we test the poll_owner to avoid recursion on UP * systems where the lock doesn't exist. 
*/ -static int poll_one_napi(struct napi_struct *napi, int budget) +static void poll_one_napi(struct napi_struct *napi) { - int work; + int work = 0; /* net_rx_action's ->poll() invocations and our's are * synchronized by this test which is only made while * holding the napi->poll_lock. */ if (!test_bit(NAPI_STATE_SCHED, &napi->state)) - return budget; + return; - set_bit(NAPI_STATE_NPSVC, &napi->state); + /* If we set this bit but see that it has already been set, + * that indicates that napi has been disabled and we need + * to abort this operation + */ + if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) + return; - work = napi->poll(napi, budget); - WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); + /* We explicilty pass the polling call a budget of 0 to + * indicate that we are clearing the Tx path only. + */ + work = napi->poll(napi, 0); + WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll); trace_napi_poll(napi); clear_bit(NAPI_STATE_NPSVC, &napi->state); - - return budget - work; } -static void poll_napi(struct net_device *dev, int budget) +static void poll_napi(struct net_device *dev) { struct napi_struct *napi; list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(napi, budget); + poll_one_napi(napi); spin_unlock(&napi->poll_lock); } } @@ -179,7 +185,6 @@ static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); - int budget = 0; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity @@ -202,7 +207,7 @@ static void netpoll_poll_dev(struct net_device *dev) /* Process pending work on NIC */ ops->ndo_poll_controller(dev); - poll_napi(dev, budget); + poll_napi(dev); up(&ni->dev_lock); @@ -380,6 +385,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int 
len) static atomic_t ip_ident; struct ipv6hdr *ip6h; + WARN_ON_ONCE(!irqs_disabled()); + udp_len = len + sizeof(*udph); if (np->ipv6) ip_len = udp_len + sizeof(*ip6h); diff --git a/kernel/net/core/netprio_cgroup.c b/kernel/net/core/netprio_cgroup.c index cbd0a199b..40fd09fe0 100644 --- a/kernel/net/core/netprio_cgroup.c +++ b/kernel/net/core/netprio_cgroup.c @@ -218,13 +218,14 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; - void *v = (void *)(unsigned long)css->cgroup->id; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); diff --git a/kernel/net/core/pktgen.c b/kernel/net/core/pktgen.c index 043ea1867..4da4d51a2 100644 --- a/kernel/net/core/pktgen.c +++ b/kernel/net/core/pktgen.c @@ -177,7 +177,7 @@ #include #include /* do_div */ -#define VERSION "2.74" +#define VERSION "2.75" #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) @@ -210,6 +210,10 @@ #define T_REMDEVALL (1<<2) /* Remove all devs */ #define T_REMDEV (1<<3) /* Remove one dev */ +/* Xmit modes */ +#define M_START_XMIT 0 /* Default normal TX */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ + /* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); #define if_unlock(t) spin_unlock(&(t->if_lock)); @@ -251,13 +255,14 @@ struct pktgen_dev { * we will do a random selection from within the range. 
*/ __u32 flags; - int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ - + int xmit_mode; int min_pkt_size; int max_pkt_size; int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread */ + struct page *page; u64 delay; /* nano-seconds */ @@ -268,7 +273,6 @@ struct pktgen_dev { /* runtime counters relating to clone_skb */ - __u64 allocated_skbs; __u32 clone_count; int last_ok; /* Was last skb sent? * Or a failed transmit of some sort? @@ -507,7 +511,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, pktgen_reset_all_threads(pn); else - pr_warn("Unknown command: %s\n", data); + return -EINVAL; return count; } @@ -567,7 +571,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) " dst_min: %s dst_max: %s\n", pkt_dev->dst_min, pkt_dev->dst_max); seq_printf(seq, - " src_min: %s src_max: %s\n", + " src_min: %s src_max: %s\n", pkt_dev->src_min, pkt_dev->src_max); } @@ -620,6 +624,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) + seq_puts(seq, " xmit_mode: netif_receive\n"); + seq_puts(seq, " Flags: "); if (pkt_dev->flags & F_IPV6) @@ -1081,7 +1088,8 @@ static ssize_t pktgen_if_write(struct file *file, if (len < 0) return len; if ((value > 0) && - (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || + !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; i += len; pkt_dev->clone_skb = value; @@ -1134,7 +1142,7 @@ static ssize_t pktgen_if_write(struct file *file, return len; i += len; - if ((value > 1) && + if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) && (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; pkt_dev->burst = value < 1 ? 
1 : value; @@ -1160,6 +1168,45 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "ERROR: node not possible"); return count; } + if (!strcmp(name, "xmit_mode")) { + char f[32]; + + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) + return len; + + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + + if (strcmp(f, "start_xmit") == 0) { + pkt_dev->xmit_mode = M_START_XMIT; + } else if (strcmp(f, "netif_receive") == 0) { + /* clone_skb set earlier, not supported in this mode */ + if (pkt_dev->clone_skb > 0) + return -ENOTSUPP; + + pkt_dev->xmit_mode = M_NETIF_RECEIVE; + + /* make sure new packet is allocated every time + * pktgen_xmit() is called + */ + pkt_dev->last_ok = 1; + + /* override clone_skb if user passed default value + * at module loading time + */ + pkt_dev->clone_skb = 0; + } else { + sprintf(pg_result, + "xmit_mode -:%s:- unknown\nAvailable modes: %s", + f, "start_xmit, netif_receive\n"); + return count; + } + sprintf(pg_result, "OK: xmit_mode=%s", f); + return count; + } if (!strcmp(name, "flag")) { char f[32]; memset(f, 0, 32); @@ -1267,6 +1314,9 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "NO_TIMESTAMP") == 0) pkt_dev->flags |= F_NO_TIMESTAMP; + else if (strcmp(f, "!NO_TIMESTAMP") == 0) + pkt_dev->flags &= ~F_NO_TIMESTAMP; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", @@ -2212,8 +2262,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; if (likely(t.task)) schedule(); @@ -2230,7 +2278,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { - pkt_dev->pkt_overhead = 0; + pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev); pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); @@ -2594,9 +2642,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; if (x) { - int ret; - __u8 *eth; + struct ethhdr *eth; struct iphdr *iph; + int ret; nhead = x->props.header_len - skb_headroom(skb); if (nhead > 0) { @@ -2616,9 +2664,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, goto err; } /* restore ll */ - eth = (__u8 *) skb_push(skb, ETH_HLEN); - memcpy(eth, pkt_dev->hh, 12); - *(u16 *) ð[12] = protocol; + eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); + memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN); + eth->h_proto = protocol; /* Update IPv4 header len as well as checksum value */ iph = ip_hdr(skb); @@ -2740,6 +2788,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } + if (likely(skb)) + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + return skb; } @@ -3317,6 +3368,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) unsigned int burst = ACCESS_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; + struct sk_buff *skb; int ret; /* If device is offline, then don't send */ @@ -3347,13 +3399,43 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) return; } pkt_dev->last_pkt_size = pkt_dev->skb->len; - 
pkt_dev->allocated_skbs++; pkt_dev->clone_count = 0; /* reset counter */ } if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) { + skb = pkt_dev->skb; + skb->protocol = eth_type_trans(skb, skb->dev); + atomic_add(burst, &skb->users); + local_bh_disable(); + do { + ret = netif_receive_skb(skb); + if (ret == NET_RX_DROP) + pkt_dev->errors++; + pkt_dev->sofar++; + pkt_dev->seq_num++; + if (atomic_read(&skb->users) != burst) { + /* skb was queued by rps/rfs or taps, + * so cannot reuse this skb + */ + atomic_sub(burst - 1, &skb->users); + /* get out of the loop and wait + * until skb is consumed + */ + break; + } + /* skb was 'freed' by stack, so clean few + * bits and reuse it + */ +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = 0; /* reset reclass/redir ttl */ +#endif + } while (--burst > 0); + goto out; /* Skips xmit_mode M_START_XMIT */ + } + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); @@ -3401,6 +3483,7 @@ xmit_more: unlock: HARD_TX_UNLOCK(odev, txq); +out: local_bh_enable(); /* If pkt_dev->count is zero, then run forever */ @@ -3432,8 +3515,6 @@ static int pktgen_thread_worker(void *arg) set_freezable(); - __set_current_state(TASK_RUNNING); - while (!kthread_should_stop()) { pkt_dev = next_to_run(t); @@ -3478,7 +3559,6 @@ static int pktgen_thread_worker(void *arg) try_to_freeze(); } - set_current_state(TASK_INTERRUPTIBLE); pr_debug("%s stopping all device\n", t->tsk->comm); pktgen_stop(t); @@ -3489,15 +3569,6 @@ static int pktgen_thread_worker(void *arg) pr_debug("%s removing thread\n", t->tsk->comm); pktgen_rem_thread(t); - /* Wait for kthread_stop */ - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; - schedule(); - } - __set_current_state(TASK_RUNNING); - return 0; } @@ -3689,6 +3760,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) } t->net = pn; + get_task_struct(p); wake_up_process(p); 
wait_for_completion(&t->start_done); @@ -3811,6 +3883,7 @@ static void __net_exit pg_net_exit(struct net *net) t = list_entry(q, struct pktgen_thread, th_list); list_del(&t->th_list); kthread_stop(t->tsk); + put_task_struct(t->tsk); kfree(t); } diff --git a/kernel/net/core/ptp_classifier.c b/kernel/net/core/ptp_classifier.c index 4eab4a94a..703cf76aa 100644 --- a/kernel/net/core/ptp_classifier.c +++ b/kernel/net/core/ptp_classifier.c @@ -58,7 +58,7 @@ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these * ldh [18] ; reload payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2 + * or #0xc0 ; PTP_CLASS_VLAN|PTP_CLASS_L2 * ret a ; return PTP class * * ; PTP over UDP over IPv4 over 802.1Q over Ethernet @@ -73,7 +73,7 @@ * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ? * ldh [x + 26] ; load payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 + * or #0x90 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 * ret a ; return PTP class * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE * @@ -86,7 +86,7 @@ * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ? 
* ldh [66] ; load payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 + * or #0xa0 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 * ret a ; return PTP class * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE * @@ -98,7 +98,7 @@ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these * ldh [14] ; reload payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x30 ; PTP_CLASS_L2 + * or #0x40 ; PTP_CLASS_L2 * ret a ; return PTP class * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE */ @@ -150,7 +150,7 @@ void __init ptp_classifier_init(void) { 0x15, 0, 35, 0x00000000 }, { 0x28, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000070 }, + { 0x44, 0, 0, 0x000000c0 }, { 0x16, 0, 0, 0x00000000 }, { 0x15, 0, 12, 0x00000800 }, { 0x30, 0, 0, 0x0000001b }, @@ -162,7 +162,7 @@ void __init ptp_classifier_init(void) { 0x15, 0, 4, 0x0000013f }, { 0x48, 0, 0, 0x0000001a }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000050 }, + { 0x44, 0, 0, 0x00000090 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, { 0x15, 0, 8, 0x000086dd }, @@ -172,7 +172,7 @@ void __init ptp_classifier_init(void) { 0x15, 0, 4, 0x0000013f }, { 0x28, 0, 0, 0x00000042 }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000060 }, + { 0x44, 0, 0, 0x000000a0 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, { 0x15, 0, 7, 0x000088f7 }, @@ -181,7 +181,7 @@ void __init ptp_classifier_init(void) { 0x15, 0, 4, 0x00000000 }, { 0x28, 0, 0, 0x0000000e }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000030 }, + { 0x44, 0, 0, 0x00000040 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, }; diff --git a/kernel/net/core/request_sock.c b/kernel/net/core/request_sock.c index b42f0e26f..5d26056b6 100644 --- a/kernel/net/core/request_sock.c +++ b/kernel/net/core/request_sock.c @@ -37,90 +37,16 @@ int sysctl_max_syn_backlog = 256; EXPORT_SYMBOL(sysctl_max_syn_backlog); -int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries) +void 
reqsk_queue_alloc(struct request_sock_queue *queue) { - size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt = NULL; + spin_lock_init(&queue->rskq_lock); - nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); - nr_table_entries = max_t(u32, nr_table_entries, 8); - nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); - lopt_size += nr_table_entries * sizeof(struct request_sock *); + spin_lock_init(&queue->fastopenq.lock); + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; - if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) - lopt = kzalloc(lopt_size, GFP_KERNEL | - __GFP_NOWARN | - __GFP_NORETRY); - if (!lopt) - lopt = vzalloc(lopt_size); - if (!lopt) - return -ENOMEM; - - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); - spin_lock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; - lopt->nr_table_entries = nr_table_entries; - lopt->max_qlen_log = ilog2(nr_table_entries); - - spin_lock_bh(&queue->syn_wait_lock); - queue->listen_opt = lopt; - spin_unlock_bh(&queue->syn_wait_lock); - - return 0; -} - -void __reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* This is an error recovery path only, no locking needed */ - kvfree(queue->listen_opt); -} - -static inline struct listen_sock *reqsk_queue_yank_listen_sk( - struct request_sock_queue *queue) -{ - struct listen_sock *lopt; - - spin_lock_bh(&queue->syn_wait_lock); - lopt = queue->listen_opt; - queue->listen_opt = NULL; - spin_unlock_bh(&queue->syn_wait_lock); - - return lopt; -} - -void reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* make all the listen_opt local to us */ - struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - - if (listen_sock_qlen(lopt) != 0) { - unsigned int i; - - for (i = 0; i < lopt->nr_table_entries; i++) { - struct request_sock *req; - - spin_lock_bh(&queue->syn_wait_lock); - while ((req = lopt->syn_table[i]) != NULL) { - 
lopt->syn_table[i] = req->dl_next; - /* Because of following del_timer_sync(), - * we must release the spinlock here - * or risk a dead lock. - */ - spin_unlock_bh(&queue->syn_wait_lock); - atomic_inc(&lopt->qlen_dec); - if (del_timer_sync(&req->rsk_timer)) - reqsk_put(req); - reqsk_put(req); - spin_lock_bh(&queue->syn_wait_lock); - } - spin_unlock_bh(&queue->syn_wait_lock); - } - } - - if (WARN_ON(listen_sock_qlen(lopt) != 0)) - pr_err("qlen %u\n", listen_sock_qlen(lopt)); - kvfree(lopt); } /* @@ -174,7 +100,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; - fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq; + fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; tcp_sk(sk)->fastopen_rsk = NULL; spin_lock_bh(&fastopenq->lock); diff --git a/kernel/net/core/rtnetlink.c b/kernel/net/core/rtnetlink.c index fe95cb704..34ba7a088 100644 --- a/kernel/net/core/rtnetlink.c +++ b/kernel/net/core/rtnetlink.c @@ -96,7 +96,7 @@ int rtnl_is_locked(void) EXPORT_SYMBOL(rtnl_is_locked); #ifdef CONFIG_PROVE_LOCKING -int lockdep_rtnl_is_held(void) +bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } @@ -497,7 +497,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops) } EXPORT_SYMBOL_GPL(rtnl_af_unregister); -static size_t rtnl_link_get_af_size(const struct net_device *dev) +static size_t rtnl_link_get_af_size(const struct net_device *dev, + u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; @@ -509,7 +510,7 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev) if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + - af_ops->get_link_af_size(dev); + af_ops->get_link_af_size(dev, ext_filter_mask); } } @@ -678,6 +679,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; + } else if (i == RTAX_FEATURES - 
1) { + u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + + BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); + if (nla_put_u32(skb, i + 1, user_features)) + goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; @@ -819,7 +826,20 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + - nla_total_size(sizeof(struct ifla_vf_rss_query_en))); + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + + /* IFLA_VF_STATS_RX_PACKETS */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_PACKETS */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_BYTES */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_BYTES */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_BROADCAST */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_MULTICAST */ + nla_total_size(sizeof(__u64)) + + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else return 0; @@ -882,9 +902,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ - + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ - + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(1); /* IFLA_PROTO_DOWN */ + } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -1004,16 +1026,163 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; - struct 
netdev_phys_item_id psid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; - err = netdev_switch_parent_id_get(dev, &psid); + err = switchdev_port_attr_get(dev, &attr); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } - if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id)) + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len, + attr.u.ppid.id)) + return -EMSGSIZE; + + return 0; +} + +static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, + struct net_device *dev) +{ + const struct rtnl_link_stats64 *stats; + struct rtnl_link_stats64 temp; + struct nlattr *attr; + + stats = dev_get_stats(dev, &temp); + + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats(nla_data(attr), stats); + + attr = nla_reserve(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats64(nla_data(attr), stats); + + return 0; +} + +static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, + struct net_device *dev, + int vfs_num, + struct nlattr *vfinfo) +{ + struct ifla_vf_rss_query_en vf_rss_query_en; + struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_spoofchk vf_spoofchk; + struct ifla_vf_tx_rate vf_tx_rate; + struct ifla_vf_stats vf_stats; + struct ifla_vf_trust vf_trust; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_rate vf_rate; + struct nlattr *vf, *vfstats; + struct ifla_vf_mac vf_mac; + struct ifla_vf_info ivi; + + /* Not all SR-IOV capable drivers support the + * spoofcheck and "RSS query enable" query. Preset to + * -1 so the user space tool can detect that the driver + * didn't report anything. 
+ */ + ivi.spoofchk = -1; + ivi.rss_query_en = -1; + ivi.trusted = -1; + memset(ivi.mac, 0, sizeof(ivi.mac)); + /* The default value for VF link state is "auto" + * IFLA_VF_LINK_STATE_AUTO which equals zero + */ + ivi.linkstate = 0; + if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) + return 0; + + vf_mac.vf = + vf_vlan.vf = + vf_rate.vf = + vf_tx_rate.vf = + vf_spoofchk.vf = + vf_linkstate.vf = + vf_rss_query_en.vf = + vf_trust.vf = ivi.vf; + + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_tx_rate.rate = ivi.max_tx_rate; + vf_rate.min_tx_rate = ivi.min_tx_rate; + vf_rate.max_tx_rate = ivi.max_tx_rate; + vf_spoofchk.setting = ivi.spoofchk; + vf_linkstate.link_state = ivi.linkstate; + vf_rss_query_en.setting = ivi.rss_query_en; + vf_trust.setting = ivi.trusted; + vf = nla_nest_start(skb, IFLA_VF_INFO); + if (!vf) { + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; + } + if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || + nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), + &vf_rate) || + nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), + &vf_tx_rate) || + nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), + &vf_spoofchk) || + nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), + &vf_linkstate) || + nla_put(skb, IFLA_VF_RSS_QUERY_EN, + sizeof(vf_rss_query_en), + &vf_rss_query_en) || + nla_put(skb, IFLA_VF_TRUST, + sizeof(vf_trust), &vf_trust)) + return -EMSGSIZE; + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, + &vf_stats); + vfstats = nla_nest_start(skb, IFLA_VF_STATS); + if (!vfstats) { + nla_nest_cancel(skb, vf); + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; + } + if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets) || + nla_put_u64(skb, 
IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast) || + nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast)) + return -EMSGSIZE; + nla_nest_end(skb, vfstats); + nla_nest_end(skb, vf); + return 0; +} + +static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) +{ + struct rtnl_link_ifmap map = { + .mem_start = dev->mem_start, + .mem_end = dev->mem_end, + .base_addr = dev->base_addr, + .irq = dev->irq, + .dma = dev->dma, + .port = dev->if_port, + }; + if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) return -EMSGSIZE; return 0; @@ -1025,9 +1194,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; - struct rtnl_link_stats64 temp; - const struct rtnl_link_stats64 *stats; - struct nlattr *attr, *af_spec; + struct nlattr *af_spec; struct rtnl_af_ops *af_ops; struct net_device *upper_dev = netdev_master_upper_dev_get(dev); @@ -1066,21 +1233,12 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, (dev->ifalias && nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, - atomic_read(&dev->carrier_changes))) + atomic_read(&dev->carrier_changes)) || + nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; - if (1) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; - if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) - goto nla_put_failure; - } + if (rtnl_fill_link_ifmap(skb, dev)) + goto nla_put_failure; if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || @@ -1097,97 +1255,27 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_switch_id_fill(skb, dev)) goto 
nla_put_failure; - attr = nla_reserve(skb, IFLA_STATS, - sizeof(struct rtnl_link_stats)); - if (attr == NULL) + if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; - stats = dev_get_stats(dev, &temp); - copy_rtnl_link_stats(nla_data(attr), stats); - - attr = nla_reserve(skb, IFLA_STATS64, - sizeof(struct rtnl_link_stats64)); - if (attr == NULL) - goto nla_put_failure; - copy_rtnl_link_stats64(nla_data(attr), stats); - if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) goto nla_put_failure; - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent - && (ext_filter_mask & RTEXT_FILTER_VF)) { + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent && + ext_filter_mask & RTEXT_FILTER_VF) { int i; - - struct nlattr *vfinfo, *vf; + struct nlattr *vfinfo; int num_vfs = dev_num_vf(dev->dev.parent); vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); if (!vfinfo) goto nla_put_failure; for (i = 0; i < num_vfs; i++) { - struct ifla_vf_info ivi; - struct ifla_vf_mac vf_mac; - struct ifla_vf_vlan vf_vlan; - struct ifla_vf_rate vf_rate; - struct ifla_vf_tx_rate vf_tx_rate; - struct ifla_vf_spoofchk vf_spoofchk; - struct ifla_vf_link_state vf_linkstate; - struct ifla_vf_rss_query_en vf_rss_query_en; - - /* - * Not all SR-IOV capable drivers support the - * spoofcheck and "RSS query enable" query. Preset to - * -1 so the user space tool can detect that the driver - * didn't report anything. 
- */ - ivi.spoofchk = -1; - ivi.rss_query_en = -1; - memset(ivi.mac, 0, sizeof(ivi.mac)); - /* The default value for VF link state is "auto" - * IFLA_VF_LINK_STATE_AUTO which equals zero - */ - ivi.linkstate = 0; - if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) - break; - vf_mac.vf = - vf_vlan.vf = - vf_rate.vf = - vf_tx_rate.vf = - vf_spoofchk.vf = - vf_linkstate.vf = - vf_rss_query_en.vf = ivi.vf; - - memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); - vf_vlan.vlan = ivi.vlan; - vf_vlan.qos = ivi.qos; - vf_tx_rate.rate = ivi.max_tx_rate; - vf_rate.min_tx_rate = ivi.min_tx_rate; - vf_rate.max_tx_rate = ivi.max_tx_rate; - vf_spoofchk.setting = ivi.spoofchk; - vf_linkstate.link_state = ivi.linkstate; - vf_rss_query_en.setting = ivi.rss_query_en; - vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - goto nla_put_failure; - } - if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || - nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || - nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), - &vf_rate) || - nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), - &vf_tx_rate) || - nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), - &vf_spoofchk) || - nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), - &vf_linkstate) || - nla_put(skb, IFLA_VF_RSS_QUERY_EN, - sizeof(vf_rss_query_en), - &vf_rss_query_en)) + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) goto nla_put_failure; - nla_nest_end(skb, vf); } + nla_nest_end(skb, vfinfo); } @@ -1204,7 +1292,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id(dev_net(dev), link_net); + int id = peernet2id_alloc(dev_net(dev), link_net); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) goto nla_put_failure; @@ -1222,7 +1310,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (!(af = nla_nest_start(skb, af_ops->family))) goto 
nla_put_failure; - err = af_ops->fill_link_af(skb, dev); + err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there @@ -1278,6 +1366,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, + [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1295,6 +1384,17 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, + [IFLA_VF_STATS] = { .type = NLA_NESTED }, + [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, +}; + +static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = { + [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 }, + [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 }, + [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 }, + [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 }, + [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 }, + [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1525,6 +1625,16 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } + if (tb[IFLA_VF_TRUST]) { + struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_trust) + err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); + if (err < 0) + return err; + } + return err; } @@ -1753,10 +1863,13 @@ static int do_setlink(const struct sk_buff *skb, goto errout; nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { - if (nla_type(attr) != IFLA_VF_PORT) - continue; - err = nla_parse_nested(port, IFLA_PORT_MAX, - attr, 
ifla_port_policy); + if (nla_type(attr) != IFLA_VF_PORT || + nla_len(attr) < NLA_HDRLEN) { + err = -EINVAL; + goto errout; + } + err = nla_parse_nested(port, IFLA_PORT_MAX, attr, + ifla_port_policy); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { @@ -1807,6 +1920,14 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; + if (tb[IFLA_PROTO_DOWN]) { + err = dev_change_proto_down(dev, + nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + errout: if (status & DO_SETLINK_MODIFIED) { if (status & DO_SETLINK_NOTIFY) @@ -1897,16 +2018,30 @@ static int rtnl_group_dellink(const struct net *net, int group) return 0; } +int rtnl_delete_link(struct net_device *dev) +{ + const struct rtnl_link_ops *ops; + LIST_HEAD(list_kill); + + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_delete_link); + static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; struct net_device *dev; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; - LIST_HEAD(list_kill); err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); if (err < 0) @@ -1928,13 +2063,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev) return -ENODEV; - ops = dev->rtnl_link_ops; - if (!ops || !ops->dellink) - return -EOPNOTSUPP; - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return 0; + return rtnl_delete_link(dev); } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) @@ -2862,7 +2991,11 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, - u32 flags, u32 mask, int nlflags) + u32 flags, u32 mask, 
int nlflags, + u32 filter_mask, + int (*vlan_fill)(struct sk_buff *skb, + struct net_device *dev, + u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; @@ -2870,6 +3003,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); + int err = 0; nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) @@ -2910,6 +3044,13 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, goto nla_put_failure; } } + if (vlan_fill) { + err = vlan_fill(skb, dev, filter_mask); + if (err) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); @@ -2943,9 +3084,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return 0; nla_put_failure: nlmsg_cancel(skb, nlh); - return -EMSGSIZE; + return err ? 
err : -EMSGSIZE; } -EXPORT_SYMBOL(ndo_dflt_bridge_getlink); +EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { @@ -2955,6 +3096,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; u32 filter_mask = 0; + int err; if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { struct nlattr *extfilt; @@ -2975,20 +3117,25 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - br_dev->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev, filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, + filter_mask, NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } if (ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - ops->ndo_bridge_getlink(skb, portid, seq, dev, - filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = ops->ndo_bridge_getlink(skb, portid, + seq, dev, + filter_mask, + NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } } @@ -3345,4 +3492,3 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); } - diff --git a/kernel/net/core/scm.c b/kernel/net/core/scm.c index 3b6899b7d..dce0acb92 100644 --- a/kernel/net/core/scm.c +++ b/kernel/net/core/scm.c @@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fplp = fpl; fpl->count = 0; fpl->max = SCM_MAX_FD; + fpl->user = NULL; } fpp = &fpl->fp[fpl->count]; @@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fpp++ = 
file; fpl->count++; } + + if (!fpl->user) + fpl->user = get_uid(current_user()); + return num; } @@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm) scm->fp = NULL; for (i=fpl->count-1; i>=0; i--) fput(fpl->fp[i]); + free_uid(fpl->user); kfree(fpl); } } @@ -305,6 +311,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) err = put_user(cmlen, &cm->cmsg_len); if (!err) { cmlen = CMSG_SPACE(i*sizeof(int)); + if (msg->msg_controllen < cmlen) + cmlen = msg->msg_controllen; msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } @@ -334,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); } return new_fpl; } diff --git a/kernel/net/core/secure_seq.c b/kernel/net/core/secure_seq.c index 51dd3193a..fd3ce461f 100644 --- a/kernel/net/core/secure_seq.c +++ b/kernel/net/core/secure_seq.c @@ -154,7 +154,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + daddr[i]; + secret[i] = net_secret[i] + (__force u32)daddr[i]; secret[4] = net_secret[4] + (((__force u16)sport << 16) + (__force u16)dport); for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) diff --git a/kernel/net/core/skbuff.c b/kernel/net/core/skbuff.c index fc09e8f3d..12780dc62 100644 --- a/kernel/net/core/skbuff.c +++ b/kernel/net/core/skbuff.c @@ -80,6 +80,8 @@ struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; +int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; +EXPORT_SYMBOL(sysctl_max_skb_frags); /** * skb_panic - private function for out-of-line support @@ -348,95 +350,20 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); -struct netdev_alloc_cache { - struct page_frag frag; - /* we maintain a pagecount bias, so that we dont dirty cache 
line - * containing page->_count every time we allocate a fragment. - */ - unsigned int pagecnt_bias; -}; -static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock); - -static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, - gfp_t gfp_mask) -{ - const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER; - struct page *page = NULL; - gfp_t gfp = gfp_mask; - - if (order) { - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC; - page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); - nc->frag.size = PAGE_SIZE << (page ? order : 0); - } - - if (unlikely(!page)) - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - - nc->frag.page = page; - - return page; -} - -static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache, - unsigned int fragsz, gfp_t gfp_mask) -{ - struct netdev_alloc_cache *nc = this_cpu_ptr(cache); - struct page *page = nc->frag.page; - unsigned int size; - int offset; - - if (unlikely(!page)) { -refill: - page = __page_frag_refill(nc, gfp_mask); - if (!page) - return NULL; - - /* if size can vary use frag.size else just use PAGE_SIZE */ - size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; - - /* Even if we own the page, we do not use atomic_set(). - * This would break get_page_unless_zero() users. - */ - atomic_add(size - 1, &page->_count); - - /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size; - nc->frag.offset = size; - } - - offset = nc->frag.offset - fragsz; - if (unlikely(offset < 0)) { - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) - goto refill; - - /* if size can vary use frag.size else just use PAGE_SIZE */ - size = NETDEV_FRAG_PAGE_MAX_ORDER ? 
nc->frag.size : PAGE_SIZE; - - /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); - - /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size; - offset = size - fragsz; - } - - nc->pagecnt_bias--; - nc->frag.offset = offset; - - return page_address(page) + offset; -} +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { + struct page_frag_cache *nc; unsigned long flags; void *data; local_lock_irqsave(netdev_alloc_lock, flags); - data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask); + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, fragsz, gfp_mask); local_unlock_irqrestore(netdev_alloc_lock, flags); return data; } @@ -456,7 +383,13 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask); + struct page_frag_cache *nc; + void *data; + + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache); + data = __alloc_page_frag(nc, fragsz, gfp_mask); + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache); + return data; } void *napi_alloc_frag(unsigned int fragsz) @@ -466,76 +399,70 @@ void *napi_alloc_frag(unsigned int fragsz) EXPORT_SYMBOL(napi_alloc_frag); /** - * __alloc_rx_skb - allocate an skbuff for rx - * @length: length to allocate + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb - * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for - * allocations in case we have to fallback to __alloc_skb() - * If SKB_ALLOC_NAPI is set, page fragment will be allocated - * from napi_cache instead of netdev_cache. * * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. 
Users should allocate + * buffer has NET_SKB_PAD headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. */ -static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask, - int flags) +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, + gfp_t gfp_mask) { - struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct page_frag_cache *nc; + unsigned long flags; + struct sk_buff *skb; + bool pfmemalloc; + void *data; - if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { - void *data; + len += NET_SKB_PAD; - if (sk_memalloc_socks()) - gfp_mask |= __GFP_MEMALLOC; + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } - data = (flags & SKB_ALLOC_NAPI) ? - __napi_alloc_frag(fragsz, gfp_mask) : - __netdev_alloc_frag(fragsz, gfp_mask); + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); - if (likely(data)) { - skb = build_skb(data, fragsz); - if (unlikely(!skb)) - put_page(virt_to_head_page(data)); - } - } else { - skb = __alloc_skb(length, gfp_mask, - SKB_ALLOC_RX, NUMA_NO_NODE); - } - return skb; -} + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; -/** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on - * @length: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has NET_SKB_PAD headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. 
The built in space is used for optimisations. - * - * %NULL is returned if there is no free memory. - */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) -{ - struct sk_buff *skb; + local_lock_irqsave(netdev_alloc_lock, flags); + + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; - length += NET_SKB_PAD; - skb = __alloc_rx_skb(length, gfp_mask, 0); + local_unlock_irqrestore(netdev_alloc_lock, flags); - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD); - skb->dev = dev; + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; } + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + +skb_success: + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + +skb_fail: return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); @@ -543,7 +470,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); /** * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for - * @length: length to allocate + * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will @@ -553,19 +480,54 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. 
*/ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, - unsigned int length, gfp_t gfp_mask) +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, + gfp_t gfp_mask) { + struct page_frag_cache *nc; struct sk_buff *skb; + void *data; + bool pfmemalloc; - length += NET_SKB_PAD + NET_IP_ALIGN; - skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI); + len += NET_SKB_PAD + NET_IP_ALIGN; - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - skb->dev = napi->dev; + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; } + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); + + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache); + data = __alloc_page_frag(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache); + + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; + } + + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + +skb_success: + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + +skb_fail: return skb; } EXPORT_SYMBOL(__napi_alloc_skb); @@ -613,10 +575,12 @@ static void skb_clone_fraglist(struct sk_buff *skb) static void skb_free_head(struct sk_buff *skb) { + unsigned char *head = skb->head; + if (skb->head_frag) - put_page(virt_to_head_page(skb->head)); + skb_free_frag(head); else - kfree(skb->head); + kfree(head); } static void skb_release_data(struct sk_buff *skb) @@ -1920,15 +1884,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return false; } +ssize_t skb_socket_splice(struct sock *sk, 
+ struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + + /* Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(sk); + ret = splice_to_pipe(pipe, spd); + lock_sock(sk); + + return ret; +} + /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. It does NOT handle frag lists within * the frag list, if such a thing exists. We'd probably need to recurse to * handle that cleanly. */ -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags) + unsigned int flags, + ssize_t (*splice_cb)(struct sock *, + struct pipe_inode_info *, + struct splice_pipe_desc *)) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; @@ -1941,7 +1929,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; - struct sock *sk = skb->sk; int ret = 0; /* @@ -1964,23 +1951,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, } done: - if (spd.nr_pages) { - /* - * Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. 
- */ - release_sock(sk); - ret = splice_to_pipe(pipe, &spd); - lock_sock(sk); - } + if (spd.nr_pages) + ret = splice_cb(sk, pipe, &spd); return ret; } +EXPORT_SYMBOL_GPL(skb_splice_bits); /** * skb_store_bits - store bits from kernel buffer to skb @@ -2965,6 +2941,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL(skb_append_datato_frags); +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size) +{ + int i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, size); + } else { + return -EMSGSIZE; + } + + return 0; +} +EXPORT_SYMBOL_GPL(skb_append_pagefrags); + /** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update @@ -2978,11 +2972,12 @@ EXPORT_SYMBOL(skb_append_datato_frags); */ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { + unsigned char *data = skb->data; + BUG_ON(len > skb->len); - skb->len -= len; - BUG_ON(skb->len < skb->data_len); - skb_postpull_rcsum(skb, skb->data, len); - return skb->data += len; + __skb_pull(skb, len); + skb_postpull_rcsum(skb, data, len); + return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); @@ -3662,7 +3657,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_info = tstype; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; - if (sk->sk_protocol == IPPROTO_TCP) + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) serr->ee.ee_data -= sk->sk_tskey; } @@ -4032,6 +4028,92 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) } EXPORT_SYMBOL(skb_checksum_setup); +/** + * skb_checksum_maybe_trim - maybe trims the given skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * + * Checks whether the given 
skb has data beyond the given transport length. + * If so, returns a cloned skb trimmed to this transport length. + * Otherwise returns the provided skb. Returns NULL in error cases + * (e.g. transport_len exceeds skb length or out-of-memory). + * + * Caller needs to set the skb transport header and free any returned skb if it + * differs from the provided skb. + */ +static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, + unsigned int transport_len) +{ + struct sk_buff *skb_chk; + unsigned int len = skb_transport_offset(skb) + transport_len; + int ret; + + if (skb->len < len) + return NULL; + else if (skb->len == len) + return skb; + + skb_chk = skb_clone(skb, GFP_ATOMIC); + if (!skb_chk) + return NULL; + + ret = pskb_trim_rcsum(skb_chk, len); + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} + +/** + * skb_checksum_trimmed - validate checksum of an skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * @skb_chkf: checksum function to use + * + * Applies the given checksum function skb_chkf to the provided skb. + * Returns a checked and maybe trimmed skb. Returns NULL on error. + * + * If the skb has data beyond the given transport length, then a + * trimmed & cloned skb is checked and returned. + * + * Caller needs to set the skb transport header and free any returned skb if it + * differs from the provided skb. 
+ */ +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)) +{ + struct sk_buff *skb_chk; + unsigned int offset = skb_transport_offset(skb); + __sum16 ret; + + skb_chk = skb_checksum_maybe_trim(skb, transport_len); + if (!skb_chk) + goto err; + + if (!pskb_may_pull(skb_chk, offset)) + goto err; + + __skb_pull(skb_chk, offset); + ret = skb_chkf(skb_chk); + __skb_push(skb_chk, offset); + + if (ret) + goto err; + + return skb_chk; + +err: + if (skb_chk && skb_chk != skb) + kfree_skb(skb_chk); + + return NULL; + +} +EXPORT_SYMBOL(skb_checksum_trimmed); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", @@ -4201,7 +4283,8 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) return NULL; } - memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); + memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN, + 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; return skb; } @@ -4385,7 +4468,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, return NULL; gfp_head = gfp_mask; - if (gfp_head & __GFP_WAIT) + if (gfp_head & __GFP_DIRECT_RECLAIM) gfp_head |= __GFP_REPEAT; *errcode = -ENOBUFS; @@ -4400,7 +4483,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, while (order) { if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_WAIT) | + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, diff --git a/kernel/net/core/sock.c b/kernel/net/core/sock.c index 6317c7149..9c3234299 100644 --- a/kernel/net/core/sock.c +++ b/kernel/net/core/sock.c @@ -131,6 +131,7 @@ #include #include #include +#include #include @@ -421,13 +422,23 @@ static void sock_warn_obsolete_bsdism(const char *name) } } -#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) 
+static bool sock_needs_netstamp(const struct sock *sk) +{ + switch (sk->sk_family) { + case AF_UNSPEC: + case AF_UNIX: + return false; + default: + return true; + } +} static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; - if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } @@ -861,7 +872,8 @@ set_rcvbuf: if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) { if (sk->sk_state != TCP_ESTABLISHED) { ret = -EINVAL; break; @@ -987,6 +999,10 @@ set_rcvbuf: sk->sk_max_pacing_rate); break; + case SO_INCOMING_CPU: + sk->sk_incoming_cpu = val; + break; + default: ret = -ENOPROTOOPT; break; @@ -1393,9 +1409,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx); * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot) + struct proto *prot, int kern) { struct sock *sk; @@ -1408,7 +1425,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); - sock_net_set(sk, get_net(net)); + sk->sk_net_refcnt = kern ? 
0 : 1; + if (likely(sk->sk_net_refcnt)) + get_net(net); + sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); @@ -1419,7 +1439,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } EXPORT_SYMBOL(sk_alloc); -static void __sk_free(struct sock *sk) +void sk_destruct(struct sock *sk) { struct sk_filter *filter; @@ -1442,10 +1462,19 @@ static void __sk_free(struct sock *sk) if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - put_net(sock_net(sk)); + if (likely(sk->sk_net_refcnt)) + put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } +static void __sk_free(struct sock *sk) +{ + if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) + sock_diag_broadcast_destroy(sk); + else + sk_destruct(sk); +} + void sk_free(struct sock *sk) { /* @@ -1458,25 +1487,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -/* - * Last sock_put should drop reference to sk->sk_net. It has already - * been dropped in sk_change_net. Taking reference to stopping namespace - * is not an option. - * Take reference to a socket to remove it from hash _alive_ and after that - * destroy it in the context of init_net. 
- */ -void sk_release_kernel(struct sock *sk) -{ - if (sk == NULL || sk->sk_socket == NULL) - return; - - sock_hold(sk); - sock_release(sk->sk_socket); - sock_net_set(sk, get_net(&init_net)); - sock_put(sk); -} -EXPORT_SYMBOL(sk_release_kernel); - static void sk_update_clone(const struct sock *sk, struct sock *newsk) { if (mem_cgroup_sockets_enabled && sk->sk_cgrp) @@ -1502,7 +1512,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); /* SANITY */ - get_net(sock_net(newsk)); + if (likely(newsk->sk_net_refcnt)) + get_net(sock_net(newsk)); sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); @@ -1518,7 +1529,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); - spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); lockdep_set_class_and_name(&newsk->sk_callback_lock, af_callback_keys + newsk->sk_family, @@ -1541,7 +1551,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ is_charged = sk_filter_charge(newsk, filter); - if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; @@ -1582,7 +1592,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); - if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + if (sock_needs_netstamp(sk) && + newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); } out: @@ -1592,7 +1603,9 @@ EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { - __sk_dst_set(sk, dst); + u32 max_segs = 1; + + sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) 
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; @@ -1603,9 +1616,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; - sk->sk_gso_max_segs = dst->dev->gso_max_segs; + max_segs = max_t(u32, dst->dev->gso_max_segs, 1); } } + sk->sk_gso_max_segs = max_segs; } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -1640,6 +1654,28 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; +#ifdef CONFIG_INET + if (unlikely(!sk_fullsock(sk))) { + skb->destructor = sock_edemux; + sock_hold(sk); + return; + } +#endif + skb->destructor = sock_wfree; + skb_set_hash_from_sk(skb, sk); + /* + * We used to take a refcount on sk, but following operation + * is enough to guarantee sk_free() wont free this sock until + * all in-flight packets are completed + */ + atomic_add(skb->truesize, &sk->sk_wmem_alloc); +} +EXPORT_SYMBOL(skb_set_owner_w); + void skb_orphan_partial(struct sk_buff *skb) { /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, @@ -1777,7 +1813,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; @@ -1823,7 +1859,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) break; - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) @@ -1853,6 +1889,32 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +int sock_cmsg_send(struct sock *sk, struct msghdr *msg, + struct sockcm_cookie *sockc) +{ + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if 
(!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL(sock_cmsg_send); + /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) @@ -1880,8 +1942,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER) { - pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP | - __GFP_NOWARN | __GFP_NORETRY, + /* Avoid direct reclaim but allow kswapd to wake */ + pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | __GFP_NOWARN | + __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; @@ -1969,21 +2033,22 @@ static void __release_sock(struct sock *sk) * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long + * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. 
*/ -int sk_wait_data(struct sock *sk, long *timeo) +int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { int rc; DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); return rc; } @@ -2078,14 +2143,15 @@ suppress_allocation: EXPORT_SYMBOL(__sk_mem_schedule); /** - * __sk_reclaim - reclaim memory_allocated + * __sk_mem_reclaim - reclaim memory_allocated * @sk: socket + * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ -void __sk_mem_reclaim(struct sock *sk) +void __sk_mem_reclaim(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); - sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + amount >>= SK_MEM_QUANTUM_SHIFT; + sk_memory_allocated_sub(sk, amount); + sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) @@ -2270,7 +2336,6 @@ static void sock_def_write_space(struct sock *sk) static void sock_def_destruct(struct sock *sk) { - kfree(sk->sk_protinfo); } void sk_send_sigurg(struct sock *sk) @@ -2321,7 +2386,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) } else sk->sk_wq = NULL; - spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, @@ -2353,6 +2417,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory * 
(Documentation/RCU/rculist_nulls.txt for details) @@ -2478,7 +2543,8 @@ void sock_enable_timestamp(struct sock *sk, int flag) * time stamping, but time stamping might have been on * already because of the other one */ - if (!(previous_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } @@ -2739,10 +2805,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; - if (rsk_prot->slab) { - kmem_cache_destroy(rsk_prot->slab); - rsk_prot->slab = NULL; - } + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) @@ -2759,7 +2823,7 @@ static int req_prot_init(const struct proto *prot) rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, - 0, NULL); + prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", @@ -2827,10 +2891,8 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); mutex_unlock(&proto_list_mutex); - if (prot->slab != NULL) { - kmem_cache_destroy(prot->slab); - prot->slab = NULL; - } + kmem_cache_destroy(prot->slab); + prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); diff --git a/kernel/net/core/sock_diag.c b/kernel/net/core/sock_diag.c index 556ecf96a..0c1d58d43 100644 --- a/kernel/net/core/sock_diag.c +++ b/kernel/net/core/sock_diag.c @@ -1,3 +1,5 @@ +/* License: GPL */ + #include #include #include @@ -5,6 +7,9 @@ #include #include #include +#include +#include +#include #include #include @@ -12,6 +17,7 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); +static struct workqueue_struct *broadcast_wq; static u64 sock_gen_cookie(struct sock *sk) { @@ -104,6 +110,62 @@ out: } EXPORT_SYMBOL(sock_diag_put_filterinfo); +struct 
broadcast_sk { + struct sock *sk; + struct work_struct work; +}; + +static size_t sock_diag_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ + + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ +} + +static void sock_diag_broadcast_destroy_work(struct work_struct *work) +{ + struct broadcast_sk *bsk = + container_of(work, struct broadcast_sk, work); + struct sock *sk = bsk->sk; + const struct sock_diag_handler *hndl; + struct sk_buff *skb; + const enum sknetlink_groups group = sock_diag_destroy_group(sk); + int err = -1; + + WARN_ON(group == SKNLGRP_NONE); + + skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out; + + mutex_lock(&sock_diag_table_mutex); + hndl = sock_diag_handlers[sk->sk_family]; + if (hndl && hndl->get_info) + err = hndl->get_info(skb, sk); + mutex_unlock(&sock_diag_table_mutex); + + if (!err) + nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, + GFP_KERNEL); + else + kfree_skb(skb); +out: + sk_destruct(sk); + kfree(bsk); +} + +void sock_diag_broadcast_destroy(struct sock *sk) +{ + /* Note, this function is often called from an interrupt context. 
*/ + struct broadcast_sk *bsk = + kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC); + if (!bsk) + return sk_destruct(sk); + bsk->sk = sk; + INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work); + queue_work(broadcast_wq, &bsk->work); +} + void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) { mutex_lock(&sock_diag_table_mutex); @@ -214,10 +276,32 @@ static void sock_diag_rcv(struct sk_buff *skb) mutex_unlock(&sock_diag_mutex); } +static int sock_diag_bind(struct net *net, int group) +{ + switch (group) { + case SKNLGRP_INET_TCP_DESTROY: + case SKNLGRP_INET_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET]) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET); + break; + case SKNLGRP_INET6_TCP_DESTROY: + case SKNLGRP_INET6_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET6]) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET); + break; + } + return 0; +} + static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { + .groups = SKNLGRP_MAX, .input = sock_diag_rcv, + .bind = sock_diag_bind, + .flags = NL_CFG_F_NONROOT_RECV, }; net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); @@ -237,15 +321,8 @@ static struct pernet_operations diag_net_ops = { static int __init sock_diag_init(void) { + broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); + BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } - -static void __exit sock_diag_exit(void) -{ - unregister_pernet_subsys(&diag_net_ops); -} - -module_init(sock_diag_init); -module_exit(sock_diag_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG); +device_initcall(sock_diag_init); diff --git a/kernel/net/core/stream.c b/kernel/net/core/stream.c index 301c05f26..b96f7a79e 100644 --- a/kernel/net/core/stream.c +++ b/kernel/net/core/stream.c @@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk) 
wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } @@ -119,23 +119,27 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) int err = 0; long vm_wait = 0; long current_timeo = *timeo_p; + bool noblock = (*timeo_p ? false : true); DEFINE_WAIT(wait); if (sk_stream_memory_free(sk)) current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; while (1) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - if (!*timeo_p) + if (!*timeo_p) { + if (noblock) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); goto do_nonblock; + } if (signal_pending(current)) goto do_interrupted; - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk_stream_memory_free(sk) && !vm_wait) break; diff --git a/kernel/net/core/sysctl_net_core.c b/kernel/net/core/sysctl_net_core.c index 95b6139d7..a6beb7b6a 100644 --- a/kernel/net/core/sysctl_net_core.c +++ b/kernel/net/core/sysctl_net_core.c @@ -26,6 +26,7 @@ static int zero = 0; static int one = 1; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; +static int max_skb_frags = MAX_SKB_FRAGS; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "max_skb_frags", + .data = &sysctl_max_skb_frags, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &max_skb_frags, + }, { } }; diff --git a/kernel/net/core/timestamping.c b/kernel/net/core/timestamping.c index 43d3dd62f..42689d5c4 100644 --- 
a/kernel/net/core/timestamping.c +++ b/kernel/net/core/timestamping.c @@ -60,11 +60,15 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) struct phy_device *phydev; unsigned int type; + if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv) + return false; + if (skb_headroom(skb) < ETH_HLEN) return false; + __skb_push(skb, ETH_HLEN); - type = classify(skb); + type = ptp_classify_raw(skb); __skb_pull(skb, ETH_HLEN); diff --git a/kernel/net/core/tso.c b/kernel/net/core/tso.c index 630b30b4f..5dca7ce8e 100644 --- a/kernel/net/core/tso.c +++ b/kernel/net/core/tso.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -14,18 +15,24 @@ EXPORT_SYMBOL(tso_count_descs); void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last) { - struct iphdr *iph; struct tcphdr *tcph; int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); int mac_hdr_len = skb_network_offset(skb); memcpy(hdr, skb->data, hdr_len); - iph = (struct iphdr *)(hdr + mac_hdr_len); - iph->id = htons(tso->ip_id); - iph->tot_len = htons(size + hdr_len - mac_hdr_len); + if (!tso->ipv6) { + struct iphdr *iph = (void *)(hdr + mac_hdr_len); + + iph->id = htons(tso->ip_id); + iph->tot_len = htons(size + hdr_len - mac_hdr_len); + tso->ip_id++; + } else { + struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len); + + iph->payload_len = htons(size + tcp_hdrlen(skb)); + } tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); put_unaligned_be32(tso->tcp_seq, &tcph->seq); - tso->ip_id++; if (!is_last) { /* Clear all special flags for not last packet */ @@ -61,6 +68,7 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso) tso->ip_id = ntohs(ip_hdr(skb)->id); tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); tso->next_frag_idx = 0; + tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6); /* Build first data */ tso->size = skb_headlen(skb) - hdr_len; diff --git a/kernel/net/core/utils.c b/kernel/net/core/utils.c index 7b803884c..3d17ca8b4 100644 --- a/kernel/net/core/utils.c 
+++ b/kernel/net/core/utils.c @@ -301,22 +301,24 @@ out: EXPORT_SYMBOL(in6_pton); void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr) + __be32 from, __be32 to, bool pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from), - to)); + csum_replace4(sum, from, to); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to); + skb->csum = ~csum_add(csum_sub(~(skb->csum), + (__force __wsum)from), + (__force __wsum)to); } else if (pseudohdr) - *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from), - to)); + *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), + (__force __wsum)from), + (__force __wsum)to)); } EXPORT_SYMBOL(inet_proto_csum_replace4); void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, - int pseudohdr) + bool pseudohdr) { __be32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], @@ -334,51 +336,15 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(inet_proto_csum_replace16); -struct __net_random_once_work { - struct work_struct work; - struct static_key *key; -}; - -static void __net_random_once_deferred(struct work_struct *w) -{ - struct __net_random_once_work *work = - container_of(w, struct __net_random_once_work, work); - BUG_ON(!static_key_enabled(work->key)); - static_key_slow_dec(work->key); - kfree(work); -} - -static void __net_random_once_disable_jump(struct static_key *key) +void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, + __wsum diff, bool pseudohdr) { - struct __net_random_once_work *w; - - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return; - - INIT_WORK(&w->work, __net_random_once_deferred); - w->key = key; - schedule_work(&w->work); -} - -bool __net_get_random_once(void *buf, int nbytes, bool *done, - struct static_key *once_key) -{ - static 
DEFINE_SPINLOCK(lock); - unsigned long flags; - - spin_lock_irqsave(&lock, flags); - if (*done) { - spin_unlock_irqrestore(&lock, flags); - return false; + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_add(diff, ~skb->csum); + } else if (pseudohdr) { + *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); } - - get_random_bytes(buf, nbytes); - *done = true; - spin_unlock_irqrestore(&lock, flags); - - __net_random_once_disable_jump(once_key); - - return true; } -EXPORT_SYMBOL(__net_get_random_once); +EXPORT_SYMBOL(inet_proto_csum_replace_by_diff); diff --git a/kernel/net/dcb/dcbnl.c b/kernel/net/dcb/dcbnl.c index 5b21f6f88..4f6c1862d 100644 --- a/kernel/net/dcb/dcbnl.c +++ b/kernel/net/dcb/dcbnl.c @@ -13,6 +13,7 @@ * You should have received a copy of the GNU General Public License along with * this program; if not, see . * + * Description: Data Center Bridging netlink interface * Author: Lucy Liu */ @@ -24,7 +25,7 @@ #include #include #include -#include +#include #include /* Data Center Bridging (DCB) is a collection of Ethernet enhancements @@ -48,10 +49,6 @@ * features for capable devices. 
*/ -MODULE_AUTHOR("Lucy Liu, "); -MODULE_DESCRIPTION("Data Center Bridging netlink interface"); -MODULE_LICENSE("GPL"); - /**************** DCB attribute policies *************************************/ /* DCB netlink attributes policy */ @@ -1935,19 +1932,6 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); -static void dcb_flushapp(void) -{ - struct dcb_app_type *app; - struct dcb_app_type *tmp; - - spin_lock_bh(&dcb_lock); - list_for_each_entry_safe(app, tmp, &dcb_app_list, list) { - list_del(&app->list); - kfree(app); - } - spin_unlock_bh(&dcb_lock); -} - static int __init dcbnl_init(void) { INIT_LIST_HEAD(&dcb_app_list); @@ -1957,12 +1941,4 @@ static int __init dcbnl_init(void) return 0; } -module_init(dcbnl_init); - -static void __exit dcbnl_exit(void) -{ - rtnl_unregister(PF_UNSPEC, RTM_GETDCB); - rtnl_unregister(PF_UNSPEC, RTM_SETDCB); - dcb_flushapp(); -} -module_exit(dcbnl_exit); +device_initcall(dcbnl_init); diff --git a/kernel/net/dccp/ackvec.c b/kernel/net/dccp/ackvec.c index bd9e718c2..3de0d0362 100644 --- a/kernel/net/dccp/ackvec.c +++ b/kernel/net/dccp/ackvec.c @@ -398,12 +398,8 @@ out_err: void dccp_ackvec_exit(void) { - if (dccp_ackvec_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; - } - if (dccp_ackvec_record_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_record_slab); - dccp_ackvec_record_slab = NULL; - } + kmem_cache_destroy(dccp_ackvec_slab); + dccp_ackvec_slab = NULL; + kmem_cache_destroy(dccp_ackvec_record_slab); + dccp_ackvec_record_slab = NULL; } diff --git a/kernel/net/dccp/ccid.c b/kernel/net/dccp/ccid.c index 834989751..90f77d08c 100644 --- a/kernel/net/dccp/ccid.c +++ b/kernel/net/dccp/ccid.c @@ -95,8 +95,7 @@ static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_f static void ccid_kmem_cache_destroy(struct kmem_cache *slab) { - if (slab != NULL) - kmem_cache_destroy(slab); + kmem_cache_destroy(slab); } static int __init 
ccid_activate(struct ccid_operations *ccid_ops) diff --git a/kernel/net/dccp/dccp.h b/kernel/net/dccp/dccp.h index bebc735f5..b0e28d24e 100644 --- a/kernel/net/dccp/dccp.h +++ b/kernel/net/dccp/dccp.h @@ -229,7 +229,7 @@ void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); int dccp_retransmit_skb(struct sock *sk); void dccp_send_ack(struct sock *sk); -void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *rsk); void dccp_send_sync(struct sock *sk, const u64 seq, @@ -270,15 +270,17 @@ int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); -struct sock *dccp_create_openreq_child(struct sock *sk, +struct sock *dccp_create_openreq_child(const struct sock *sk, const struct request_sock *req, const struct sk_buff *skb); int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); -struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst); + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req); struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -293,7 +295,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); void dccp_destroy_sock(struct sock *sk); void dccp_close(struct sock *sk, long timeout); -struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, +struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, struct request_sock *req); int dccp_connect(struct sock *sk); @@ -325,13 +327,13 @@ void dccp_send_close(struct sock *sk, const int active); int dccp_invalid_packet(struct sk_buff *skb); u32 dccp_sample_rtt(struct sock *sk, long delta); -static inline int dccp_bad_service_code(const struct sock *sk, 
+static inline bool dccp_bad_service_code(const struct sock *sk, const __be32 service) { const struct dccp_sock *dp = dccp_sk(sk); if (dp->dccps_service == service) - return 0; + return false; return !dccp_list_has_service(dp->dccps_service_list, service); } diff --git a/kernel/net/dccp/diag.c b/kernel/net/dccp/diag.c index 5a45f8de5..2d84303ea 100644 --- a/kernel/net/dccp/diag.c +++ b/kernel/net/dccp/diag.c @@ -66,6 +66,7 @@ static const struct inet_diag_handler dccp_diag_handler = { .dump_one = dccp_diag_dump_one, .idiag_get_info = dccp_diag_get_info, .idiag_type = IPPROTO_DCCP, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init dccp_diag_init(void) diff --git a/kernel/net/dccp/ipv4.c b/kernel/net/dccp/ipv4.c index ccf4c5629..902d60632 100644 --- a/kernel/net/dccp/ipv4.c +++ b/kernel/net/dccp/ipv4.c @@ -208,7 +208,6 @@ void dccp_req_err(struct sock *sk, u64 seq) if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); } else { /* * Still in RESPOND, just remove it silently. 
@@ -218,6 +217,7 @@ void dccp_req_err(struct sock *sk, u64 seq) */ inet_csk_reqsk_queue_drop(req->rsk_listener, req); } + reqsk_put(req); } EXPORT_SYMBOL(dccp_req_err); @@ -390,9 +390,12 @@ static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) * * This is the equivalent of TCP's tcp_v4_syn_recv_sock */ -struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *dccp_v4_request_recv_sock(const struct sock *sk, + struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq; struct inet_sock *newinet; @@ -425,7 +428,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - __inet_hash_nolisten(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); return newsk; @@ -443,36 +446,6 @@ put_and_exit: } EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); -static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct sock *nsk; - /* Find possible connection requests. 
*/ - struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport, - iph->saddr, iph->daddr); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, - iph->saddr, dh->dccph_sport, - iph->daddr, dh->dccph_dport, - inet_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -498,7 +471,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, return &rt->dst; } -static int dccp_v4_send_response(struct sock *sk, struct request_sock *req) +static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req) { int err = -1; struct sk_buff *skb; @@ -527,7 +500,7 @@ out: return err; } -static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) +static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) { int err; const struct iphdr *rxiph; @@ -624,7 +597,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp_request_sock_ops, sk); + req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true); if (req == NULL) goto drop; @@ -704,18 +677,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v4_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dh, skb->len)) goto reset; @@ -723,7 +684,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: 
dccp_v4_ctl_send_reset(sk, skb); -discard: kfree_skb(skb); return 0; } @@ -841,15 +801,10 @@ static int dccp_v4_rcv(struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq); } - /* Step 2: - * Look up flow ID in table and get corresponding socket */ +lookup: sk = __inet_lookup_skb(&dccp_hashinfo, skb, dh->dccph_sport, dh->dccph_dport); - /* - * Step 2: - * If no socket ... - */ - if (sk == NULL) { + if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); goto no_dccp_socket; @@ -867,6 +822,31 @@ static int dccp_v4_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk; + + sk = req->rsk_listener; + if (unlikely(sk->sk_state != DCCP_LISTEN)) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_and_relse; + } + if (nsk == sk) { + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v4_ctl_send_reset(sk, skb); + goto discard_and_relse; + } else { + sock_put(sk); + return 0; + } + } /* * RFC 4340, sec. 
9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/kernel/net/dccp/ipv6.c b/kernel/net/dccp/ipv6.c index 5165571f3..b8608b71a 100644 --- a/kernel/net/dccp/ipv6.c +++ b/kernel/net/dccp/ipv6.c @@ -181,7 +181,7 @@ out: } -static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) +static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -202,7 +202,9 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -219,7 +221,10 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; - err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + rcu_read_lock(); + err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); + rcu_read_unlock(); err = net_xmit_eval(err); } @@ -234,7 +239,7 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req) kfree_skb(inet_rsk(req)->pktopts); } -static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) +static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) { const struct ipv6hdr *rxip6h; struct sk_buff *skb; @@ -290,37 +295,6 @@ static struct request_sock_ops dccp6_request_sock_ops = { .syn_ack_timeout = dccp_syn_ack_timeout, }; -static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet6_csk_search_req(sk, dh->dccph_sport, 
&iph->saddr, - &iph->daddr, inet6_iif(skb)); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, - &iph->saddr, dh->dccph_sport, - &iph->daddr, ntohs(dh->dccph_dport), - inet6_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct request_sock *req; @@ -350,7 +324,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk); + req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true); if (req == NULL) goto drop; @@ -398,7 +372,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v6_send_response(sk, req)) goto drop_and_free; - inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); return 0; drop_and_free: @@ -408,13 +382,17 @@ drop: return -1; } -static struct sock *dccp_v6_request_recv_sock(struct sock *sk, +static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct ipv6_pinfo *newnp; + const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct inet_sock *newinet; struct dccp6_sock *newdp6; struct sock *newsk; @@ -423,7 +401,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, /* * v6 mapped */ - newsk = dccp_v4_request_recv_sock(sk, skb, req, dst); + newsk = dccp_v4_request_recv_sock(sk, skb, req, dst, + 
req_unhash, own_req); if (newsk == NULL) return NULL; @@ -462,22 +441,11 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, if (sk_acceptq_is_full(sk)) goto out_overflow; - if (dst == NULL) { - struct in6_addr *final_p, final; + if (!dst) { struct flowi6 fl6; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(&fl6, np->opt, &final); - fl6.saddr = ireq->ir_v6_loc_addr; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.fl6_dport = ireq->ir_rmt_port; - fl6.fl6_sport = htons(ireq->ir_num); - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); - if (IS_ERR(dst)) + dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP); + if (!dst) goto out; } @@ -491,7 +459,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, * comment in that function for the gory details. -acme */ - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); newdp6 = (struct dccp6_sock *)newsk; @@ -515,15 +483,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; - /* Clone pktoptions received with SYN */ newnp->pktoptions = NULL; - if (ireq->pktopts != NULL) { - newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); - consume_skb(ireq->pktopts); - ireq->pktopts = NULL; - if (newnp->pktoptions) - skb_set_owner_r(newnp->pktoptions, newsk); - } newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; @@ -534,13 +494,15 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, * Yes, keeping reference count would be much more clever, but we make * one more one thing there: reattach optmem to newsk. 
*/ - if (np->opt != NULL) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt != NULL) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; dccp_sync_mss(newsk, dst_mtu(dst)); @@ -552,7 +514,15 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, dccp_done(newsk); goto out; } - __inet_hash(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + /* Clone pktoptions received with SYN, if we own the req */ + if (*own_req && ireq->pktopts) { + newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); + consume_skb(ireq->pktopts); + ireq->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } return newsk; @@ -651,24 +621,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v6_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - if (opt_skb != NULL) - __kfree_skb(opt_skb); - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) goto reset; @@ -715,16 +667,11 @@ static int dccp_v6_rcv(struct sk_buff *skb) else DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - /* Step 2: - * Look up flow ID in table and get corresponding socket */ +lookup: sk = __inet6_lookup_skb(&dccp_hashinfo, skb, dh->dccph_sport, dh->dccph_dport, inet6_iif(skb)); - /* - * Step 2: - * If no socket ... 
- */ - if (sk == NULL) { + if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); goto no_dccp_socket; @@ -742,6 +689,31 @@ static int dccp_v6_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk; + + sk = req->rsk_listener; + if (unlikely(sk->sk_state != DCCP_LISTEN)) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_and_relse; + } + if (nsk == sk) { + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v6_ctl_send_reset(sk, skb); + goto discard_and_relse; + } else { + sock_put(sk); + return 0; + } + } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted @@ -793,6 +765,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -892,7 +865,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -909,12 +883,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; - if (np->opt != NULL) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = 
opt->opt_flen + opt->opt_nflen; inet->inet_dport = usin->sin6_port; diff --git a/kernel/net/dccp/minisocks.c b/kernel/net/dccp/minisocks.c index 30addee2d..1994f8af6 100644 --- a/kernel/net/dccp/minisocks.c +++ b/kernel/net/dccp/minisocks.c @@ -48,8 +48,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6only = sk->sk_ipv6only; } #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -60,6 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) timeo = DCCP_TIMEWAIT_LEN; inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -72,7 +72,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) dccp_done(sk); } -struct sock *dccp_create_openreq_child(struct sock *sk, +struct sock *dccp_create_openreq_child(const struct sock *sk, const struct request_sock *req, const struct sk_buff *skb) { @@ -143,6 +143,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, { struct sock *child = NULL; struct dccp_request_sock *dreq = dccp_rsk(req); + bool own_req; /* Check for retransmitted REQUEST */ if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { @@ -182,14 +183,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, if (dccp_parse_options(sk, dreq, skb)) goto drop; - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); + if (!child) goto listen_overflow; - inet_csk_reqsk_queue_drop(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); -out: - return child; + return inet_csk_complete_hashdance(sk, child, req, own_req); + listen_overflow: dccp_pr_debug("listen_overflow!\n"); DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; @@ -198,7 +198,7 @@ drop: 
req->rsk_ops->send_reset(sk, skb); inet_csk_reqsk_queue_drop(sk, req); - goto out; + return NULL; } EXPORT_SYMBOL_GPL(dccp_check_req); @@ -236,7 +236,7 @@ int dccp_child_process(struct sock *parent, struct sock *child, EXPORT_SYMBOL_GPL(dccp_child_process); -void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *rsk) { DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state"); diff --git a/kernel/net/dccp/output.c b/kernel/net/dccp/output.c index 0248e8a34..4ce912e69 100644 --- a/kernel/net/dccp/output.c +++ b/kernel/net/dccp/output.c @@ -390,7 +390,7 @@ int dccp_retransmit_skb(struct sock *sk) return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC)); } -struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, +struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, struct request_sock *req) { struct dccp_hdr *dh; @@ -398,13 +398,18 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, const u32 dccp_header_size = sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext) + sizeof(struct dccp_hdr_response); - struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, - GFP_ATOMIC); - if (skb == NULL) + struct sk_buff *skb; + + /* sk is marked const to clearly express we dont hold socket lock. + * sock_wmalloc() will atomically change sk->sk_wmem_alloc, + * it is safe to promote sk to non const. + */ + skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1, + GFP_ATOMIC); + if (!skb) return NULL; - /* Reserve space for headers. 
*/ - skb_reserve(skb, sk->sk_prot->max_header); + skb_reserve(skb, MAX_DCCP_HEADER); skb_dst_set(skb, dst_clone(dst)); diff --git a/kernel/net/dccp/probe.c b/kernel/net/dccp/probe.c index d8346d0ea..3d3fda05b 100644 --- a/kernel/net/dccp/probe.c +++ b/kernel/net/dccp/probe.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -47,20 +48,20 @@ static struct { struct kfifo fifo; spinlock_t lock; wait_queue_head_t wait; - struct timespec tstart; + struct timespec64 tstart; } dccpw; static void printl(const char *fmt, ...) { va_list args; int len; - struct timespec now; + struct timespec64 now; char tbuf[256]; va_start(args, fmt); - getnstimeofday(&now); + getnstimeofday64(&now); - now = timespec_sub(now, dccpw.tstart); + now = timespec64_sub(now, dccpw.tstart); len = sprintf(tbuf, "%lu.%06lu ", (unsigned long) now.tv_sec, @@ -110,7 +111,7 @@ static struct jprobe dccp_send_probe = { static int dccpprobe_open(struct inode *inode, struct file *file) { kfifo_reset(&dccpw.fifo); - getnstimeofday(&dccpw.tstart); + getnstimeofday64(&dccpw.tstart); return 0; } diff --git a/kernel/net/dccp/proto.c b/kernel/net/dccp/proto.c index 52a940165..41e65804d 100644 --- a/kernel/net/dccp/proto.c +++ b/kernel/net/dccp/proto.c @@ -339,8 +339,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. 
If space is freed after @@ -886,7 +885,7 @@ verify_sock_status: break; } - sk_wait_data(sk, &timeo); + sk_wait_data(sk, &timeo, NULL); continue; found_ok_skb: if (len > skb->len) diff --git a/kernel/net/decnet/af_decnet.c b/kernel/net/decnet/af_decnet.c index 754484b3c..13d6b1a6e 100644 --- a/kernel/net/decnet/af_decnet.c +++ b/kernel/net/decnet/af_decnet.c @@ -468,10 +468,10 @@ static struct proto dn_proto = { .obj_size = sizeof(struct dn_sock), }; -static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp) +static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern) { struct dn_scp *scp; - struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto); + struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern); if (!sk) goto out; @@ -678,6 +678,9 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; @@ -693,7 +696,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, } - if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL) + if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL) return -ENOBUFS; sk->sk_protocol = protocol; @@ -1096,7 +1099,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) cb = DN_SKB_CB(skb); sk->sk_ack_backlog--; - newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation); + newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0); if (newsk == NULL) { release_sock(sk); kfree_skb(skb); @@ -1747,9 +1750,9 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target)); - clear_bit(SOCK_ASYNC_WAITDATA, 
&sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); } @@ -2004,10 +2007,10 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, !dn_queue_too_long(scp, queue, flags)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); continue; } diff --git a/kernel/net/decnet/dn_neigh.c b/kernel/net/decnet/dn_neigh.c index 4507b188f..482730cd8 100644 --- a/kernel/net/decnet/dn_neigh.c +++ b/kernel/net/decnet/dn_neigh.c @@ -194,7 +194,7 @@ static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb) return err; } -static int dn_neigh_output_packet(struct sock *sk, struct sk_buff *skb) +static int dn_neigh_output_packet(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; @@ -246,8 +246,9 @@ static int dn_long_output(struct neighbour *neigh, struct sock *sk, skb_reset_network_header(skb); - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb, - NULL, neigh->dev, dn_neigh_output_packet); + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, + &init_net, sk, skb, NULL, neigh->dev, + dn_neigh_output_packet); } /* @@ -286,8 +287,9 @@ static int dn_short_output(struct neighbour *neigh, struct sock *sk, skb_reset_network_header(skb); - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb, - NULL, neigh->dev, dn_neigh_output_packet); + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, + &init_net, sk, skb, NULL, neigh->dev, + dn_neigh_output_packet); } /* @@ -327,11 +329,12 @@ static int dn_phase3_output(struct neighbour *neigh, struct sock *sk, skb_reset_network_header(skb); - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb, - NULL, 
neigh->dev, dn_neigh_output_packet); + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, + &init_net, sk, skb, NULL, neigh->dev, + dn_neigh_output_packet); } -int dn_to_neigh_output(struct sock *sk, struct sk_buff *skb) +int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *) dst; @@ -375,7 +378,7 @@ void dn_neigh_pointopoint_hello(struct sk_buff *skb) /* * Ethernet router hello message received */ -int dn_neigh_router_hello(struct sock *sk, struct sk_buff *skb) +int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data; @@ -437,7 +440,7 @@ int dn_neigh_router_hello(struct sock *sk, struct sk_buff *skb) /* * Endnode hello message received */ -int dn_neigh_endnode_hello(struct sock *sk, struct sk_buff *skb) +int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb) { struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data; struct neighbour *neigh; diff --git a/kernel/net/decnet/dn_nsp_in.c b/kernel/net/decnet/dn_nsp_in.c index a321eac9f..7ac086d5c 100644 --- a/kernel/net/decnet/dn_nsp_in.c +++ b/kernel/net/decnet/dn_nsp_in.c @@ -714,7 +714,8 @@ out: return ret; } -static int dn_nsp_rx_packet(struct sock *sk2, struct sk_buff *skb) +static int dn_nsp_rx_packet(struct net *net, struct sock *sk2, + struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); struct sock *sk = NULL; @@ -814,8 +815,8 @@ free_out: int dn_nsp_rx(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, + &init_net, NULL, skb, skb->dev, NULL, dn_nsp_rx_packet); } diff --git a/kernel/net/decnet/dn_nsp_out.c b/kernel/net/decnet/dn_nsp_out.c index 1aaa51ebb..849805e7a 100644 --- a/kernel/net/decnet/dn_nsp_out.c +++ b/kernel/net/decnet/dn_nsp_out.c @@ -85,7 
+85,7 @@ static void dn_nsp_send(struct sk_buff *skb) if (dst) { try_again: skb_dst_set(skb, dst); - dst_output(skb); + dst_output(&init_net, skb->sk, skb); return; } @@ -582,7 +582,7 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, * associations. */ skb_dst_set(skb, dst_clone(dst)); - dst_output(skb); + dst_output(&init_net, skb->sk, skb); } diff --git a/kernel/net/decnet/dn_route.c b/kernel/net/decnet/dn_route.c index 03227ffd1..607a14f20 100644 --- a/kernel/net/decnet/dn_route.c +++ b/kernel/net/decnet/dn_route.c @@ -512,7 +512,7 @@ static int dn_return_long(struct sk_buff *skb) * * Returns: result of input function if route is found, error code otherwise */ -static int dn_route_rx_packet(struct sock *sk, struct sk_buff *skb) +static int dn_route_rx_packet(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dn_skb_cb *cb; int err; @@ -573,8 +573,8 @@ static int dn_route_rx_long(struct sk_buff *skb) ptr++; cb->hops = *ptr++; /* Visit Count */ - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, + &init_net, NULL, skb, skb->dev, NULL, dn_route_rx_packet); drop_it: @@ -601,8 +601,8 @@ static int dn_route_rx_short(struct sk_buff *skb) ptr += 2; cb->hops = *ptr & 0x3f; - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, + &init_net, NULL, skb, skb->dev, NULL, dn_route_rx_packet); drop_it: @@ -610,7 +610,7 @@ drop_it: return NET_RX_DROP; } -static int dn_route_discard(struct sock *sk, struct sk_buff *skb) +static int dn_route_discard(struct net *net, struct sock *sk, struct sk_buff *skb) { /* * I know we drop the packet here, but thats considered success in @@ -620,7 +620,7 @@ static int dn_route_discard(struct sock *sk, struct sk_buff *skb) return NET_RX_SUCCESS; } -static int dn_route_ptp_hello(struct sock *sk, struct sk_buff *skb) +static int 
dn_route_ptp_hello(struct net *net, struct sock *sk, struct sk_buff *skb) { dn_dev_hello(skb); dn_neigh_pointopoint_hello(skb); @@ -706,22 +706,22 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type switch (flags & DN_RT_CNTL_MSK) { case DN_RT_PKT_HELO: return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - NULL, skb, skb->dev, NULL, + &init_net, NULL, skb, skb->dev, NULL, dn_route_ptp_hello); case DN_RT_PKT_L1RT: case DN_RT_PKT_L2RT: return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE, - NULL, skb, skb->dev, NULL, + &init_net, NULL, skb, skb->dev, NULL, dn_route_discard); case DN_RT_PKT_ERTH: return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - NULL, skb, skb->dev, NULL, + &init_net, NULL, skb, skb->dev, NULL, dn_neigh_router_hello); case DN_RT_PKT_EEDH: return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - NULL, skb, skb->dev, NULL, + &init_net, NULL, skb, skb->dev, NULL, dn_neigh_endnode_hello); } } else { @@ -744,7 +744,7 @@ out: return NET_RX_DROP; } -static int dn_output(struct sock *sk, struct sk_buff *skb) +static int dn_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; @@ -770,8 +770,8 @@ static int dn_output(struct sock *sk, struct sk_buff *skb) cb->rt_flags |= DN_RT_F_IE; cb->hops = 0; - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, sk, skb, - NULL, dev, + return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, + &init_net, sk, skb, NULL, dev, dn_to_neigh_output); error: @@ -789,9 +789,7 @@ static int dn_forward(struct sk_buff *skb) struct dn_dev *dn_db = rcu_dereference(dst->dev->dn_ptr); struct dn_route *rt; int header_len; -#ifdef CONFIG_NETFILTER struct net_device *dev = skb->dev; -#endif if (skb->pkt_type != PACKET_HOST) goto drop; @@ -819,8 +817,8 @@ static int dn_forward(struct sk_buff *skb) if (rt->rt_flags & RTCF_DOREDIRECT) cb->rt_flags |= DN_RT_F_IE; - return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, NULL, skb, - dev, skb->dev, + return 
NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, + &init_net, NULL, skb, dev, skb->dev, dn_to_neigh_output); drop: @@ -832,7 +830,7 @@ drop: * Used to catch bugs. This should never normally get * called. */ -static int dn_rt_bug_sk(struct sock *sk, struct sk_buff *skb) +static int dn_rt_bug_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); @@ -1469,7 +1467,7 @@ make_route: rt->n = neigh; rt->dst.lastuse = jiffies; - rt->dst.output = dn_rt_bug_sk; + rt->dst.output = dn_rt_bug_out; switch (res.type) { case RTN_UNICAST: rt->dst.input = dn_forward; diff --git a/kernel/net/decnet/dn_rules.c b/kernel/net/decnet/dn_rules.c index 9d66a0f72..295bbd6a5 100644 --- a/kernel/net/decnet/dn_rules.c +++ b/kernel/net/decnet/dn_rules.c @@ -229,7 +229,6 @@ static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = { .configure = dn_fib_rule_configure, .compare = dn_fib_rule_compare, .fill = dn_fib_rule_fill, - .default_pref = fib_default_rule_pref, .flush_cache = dn_fib_rule_flush_cache, .nlgroup = RTNLGRP_DECnet_RULE, .policy = dn_fib_rule_policy, diff --git a/kernel/net/decnet/netfilter/dn_rtmsg.c b/kernel/net/decnet/netfilter/dn_rtmsg.c index af34fc9bd..85f2fdc36 100644 --- a/kernel/net/decnet/netfilter/dn_rtmsg.c +++ b/kernel/net/decnet/netfilter/dn_rtmsg.c @@ -87,7 +87,7 @@ static void dnrmg_send_peer(struct sk_buff *skb) } -static unsigned int dnrmg_hook(const struct nf_hook_ops *ops, +static unsigned int dnrmg_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { diff --git a/kernel/net/dns_resolver/dns_key.c b/kernel/net/dns_resolver/dns_key.c index 31cd4fd75..c79b85eb4 100644 --- a/kernel/net/dns_resolver/dns_key.c +++ b/kernel/net/dns_resolver/dns_key.c @@ -122,7 +122,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep) goto bad_option_value; kdebug("dns error no. 
= %lu", derrno); - prep->type_data[0] = ERR_PTR(-derrno); + prep->payload.data[dns_key_error] = ERR_PTR(-derrno); continue; } @@ -137,8 +137,8 @@ dns_resolver_preparse(struct key_preparsed_payload *prep) /* don't cache the result if we're caching an error saying there's no * result */ - if (prep->type_data[0]) { - kleave(" = 0 [h_error %ld]", PTR_ERR(prep->type_data[0])); + if (prep->payload.data[dns_key_error]) { + kleave(" = 0 [h_error %ld]", PTR_ERR(prep->payload.data[dns_key_error])); return 0; } @@ -155,7 +155,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep) memcpy(upayload->data, data, result_len); upayload->data[result_len] = '\0'; - prep->payload[0] = upayload; + prep->payload.data[dns_key_data] = upayload; kleave(" = 0"); return 0; } @@ -167,7 +167,7 @@ static void dns_resolver_free_preparse(struct key_preparsed_payload *prep) { pr_devel("==>%s()\n", __func__); - kfree(prep->payload[0]); + kfree(prep->payload.data[dns_key_data]); } /* @@ -223,10 +223,10 @@ static int dns_resolver_match_preparse(struct key_match_data *match_data) */ static void dns_resolver_describe(const struct key *key, struct seq_file *m) { - int err = key->type_data.x[0]; - seq_puts(m, key->description); if (key_is_instantiated(key)) { + int err = PTR_ERR(key->payload.data[dns_key_error]); + if (err) seq_printf(m, ": %d", err); else @@ -241,8 +241,10 @@ static void dns_resolver_describe(const struct key *key, struct seq_file *m) static long dns_resolver_read(const struct key *key, char __user *buffer, size_t buflen) { - if (key->type_data.x[0]) - return key->type_data.x[0]; + int err = PTR_ERR(key->payload.data[dns_key_error]); + + if (err) + return err; return user_read(key, buffer, buflen); } diff --git a/kernel/net/dns_resolver/dns_query.c b/kernel/net/dns_resolver/dns_query.c index 39d2c39bd..ecc28cff0 100644 --- a/kernel/net/dns_resolver/dns_query.c +++ b/kernel/net/dns_resolver/dns_query.c @@ -67,10 +67,10 @@ * Returns the size of the result on success, -ve error 
code otherwise. */ int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry) + const char *options, char **_result, time64_t *_expiry) { struct key *rkey; - struct user_key_payload *upayload; + const struct user_key_payload *upayload; const struct cred *saved_cred; size_t typelen, desclen; char *desc, *cp; @@ -137,12 +137,11 @@ int dns_query(const char *type, const char *name, size_t namelen, goto put; /* If the DNS server gave an error, return that to the caller */ - ret = rkey->type_data.x[0]; + ret = PTR_ERR(rkey->payload.data[dns_key_error]); if (ret) goto put; - upayload = rcu_dereference_protected(rkey->payload.data, - lockdep_is_held(&rkey->sem)); + upayload = user_key_payload(rkey); len = upayload->datalen; ret = -ENOMEM; diff --git a/kernel/net/dns_resolver/internal.h b/kernel/net/dns_resolver/internal.h index 7af1ed39c..0c570d40e 100644 --- a/kernel/net/dns_resolver/internal.h +++ b/kernel/net/dns_resolver/internal.h @@ -22,6 +22,14 @@ #include #include +/* + * Layout of key payload words. 
+ */ +enum { + dns_key_data, + dns_key_error, +}; + /* * dns_key.c */ diff --git a/kernel/net/dsa/dsa.c b/kernel/net/dsa/dsa.c index 392e29a02..1eba07feb 100644 --- a/kernel/net/dsa/dsa.c +++ b/kernel/net/dsa/dsa.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "dsa_priv.h" char dsa_driver_version[] = "0.1"; @@ -176,6 +177,41 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); #endif /* CONFIG_NET_DSA_HWMON */ /* basic switch operations **************************************************/ +static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) +{ + struct dsa_chip_data *cd = ds->pd; + struct device_node *port_dn; + struct phy_device *phydev; + int ret, port, mode; + + for (port = 0; port < DSA_MAX_PORTS; port++) { + if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + continue; + + port_dn = cd->port_dn[port]; + if (of_phy_is_fixed_link(port_dn)) { + ret = of_phy_register_fixed_link(port_dn); + if (ret) { + netdev_err(master, + "failed to register fixed PHY\n"); + return ret; + } + phydev = of_phy_find_device(port_dn); + + mode = of_get_phy_mode(port_dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + phydev->interface = mode; + + genphy_config_init(phydev); + genphy_read_status(phydev); + if (ds->drv->adjust_link) + ds->drv->adjust_link(ds, port, phydev); + } + } + return 0; +} + static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; @@ -270,7 +306,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) if (ret < 0) goto out; - ds->slave_mii_bus = mdiobus_alloc(); + ds->slave_mii_bus = devm_mdiobus_alloc(parent); if (ds->slave_mii_bus == NULL) { ret = -ENOMEM; goto out; @@ -279,7 +315,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) ret = mdiobus_register(ds->slave_mii_bus); if (ret < 0) - goto out_free; + goto out; /* @@ -291,12 +327,20 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct 
device *parent) ret = dsa_slave_create(ds, parent, i, pd->port_names[i]); if (ret < 0) { - netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s)\n", - index, i, pd->port_names[i]); + netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", + index, i, pd->port_names[i], ret); ret = 0; } } + /* Perform configuration of the CPU and DSA ports */ + ret = dsa_cpu_dsa_setup(ds, dst->master_netdev); + if (ret < 0) { + netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", + index); + ret = 0; + } + #ifdef CONFIG_NET_DSA_HWMON /* If the switch provides a temperature sensor, * register with hardware monitoring subsystem. @@ -324,10 +368,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) return ret; -out_free: - mdiobus_free(ds->slave_mii_bus); out: - kfree(ds); return ret; } @@ -357,7 +398,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Allocate and initialise switch state. 
*/ - ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL); + ds = devm_kzalloc(parent, sizeof(*ds) + drv->priv_size, GFP_KERNEL); if (ds == NULL) return ERR_PTR(-ENOMEM); @@ -377,10 +418,47 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, static void dsa_switch_destroy(struct dsa_switch *ds) { + struct device_node *port_dn; + struct phy_device *phydev; + struct dsa_chip_data *cd = ds->pd; + int port; + #ifdef CONFIG_NET_DSA_HWMON if (ds->hwmon_dev) hwmon_device_unregister(ds->hwmon_dev); #endif + + /* Disable configuration of the CPU and DSA ports */ + for (port = 0; port < DSA_MAX_PORTS; port++) { + if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + continue; + + port_dn = cd->port_dn[port]; + if (of_phy_is_fixed_link(port_dn)) { + phydev = of_phy_find_device(port_dn); + if (phydev) { + int addr = phydev->addr; + + phy_device_free(phydev); + of_node_put(port_dn); + fixed_phy_del(addr); + } + } + } + + /* Destroy network devices for physical switch ports. */ + for (port = 0; port < DSA_MAX_PORTS; port++) { + if (!(ds->phys_port_mask & (1 << port))) + continue; + + if (!ds->ports[port]) + continue; + + unregister_netdev(ds->ports[port]); + free_netdev(ds->ports[port]); + } + + mdiobus_unregister(ds->slave_mii_bus); } #ifdef CONFIG_PM_SLEEP @@ -554,6 +632,31 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, return 0; } +static int dsa_of_probe_links(struct dsa_platform_data *pd, + struct dsa_chip_data *cd, + int chip_index, int port_index, + struct device_node *port, + const char *port_name) +{ + struct device_node *link; + int link_index; + int ret; + + for (link_index = 0;; link_index++) { + link = of_parse_phandle(port, "link", link_index); + if (!link) + break; + + if (!strcmp(port_name, "dsa") && pd->nr_chips > 1) { + ret = dsa_of_setup_routing_table(pd, cd, chip_index, + port_index, link); + if (ret) + return ret; + } + } + return 0; +} + static void dsa_of_free_platform_data(struct dsa_platform_data *pd) { int i; 
@@ -566,6 +669,10 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) port_index++; } kfree(pd->chip[i].rtable); + + /* Drop our reference to the MDIO bus device */ + if (pd->chip[i].host_dev) + put_device(pd->chip[i].host_dev); } kfree(pd->chip); } @@ -573,8 +680,8 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) static int dsa_of_probe(struct device *dev) { struct device_node *np = dev->of_node; - struct device_node *child, *mdio, *ethernet, *port, *link; - struct mii_bus *mdio_bus; + struct device_node *child, *mdio, *ethernet, *port; + struct mii_bus *mdio_bus, *mdio_bus_switch; struct net_device *ethernet_dev; struct dsa_platform_data *pd; struct dsa_chip_data *cd; @@ -593,16 +700,22 @@ static int dsa_of_probe(struct device *dev) return -EPROBE_DEFER; ethernet = of_parse_phandle(np, "dsa,ethernet", 0); - if (!ethernet) - return -EINVAL; + if (!ethernet) { + ret = -EINVAL; + goto out_put_mdio; + } ethernet_dev = of_find_net_device_by_node(ethernet); - if (!ethernet_dev) - return -EPROBE_DEFER; + if (!ethernet_dev) { + ret = -EPROBE_DEFER; + goto out_put_mdio; + } pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return -ENOMEM; + if (!pd) { + ret = -ENOMEM; + goto out_put_ethernet; + } dev->platform_data = pd; pd->of_netdev = ethernet_dev; @@ -623,25 +736,45 @@ static int dsa_of_probe(struct device *dev) cd = &pd->chip[chip_index]; cd->of_node = child; - cd->host_dev = &mdio_bus->dev; + + /* When assigning the host device, increment its refcount */ + cd->host_dev = get_device(&mdio_bus->dev); sw_addr = of_get_property(child, "reg", NULL); if (!sw_addr) continue; cd->sw_addr = be32_to_cpup(sw_addr); - if (cd->sw_addr > PHY_MAX_ADDR) + if (cd->sw_addr >= PHY_MAX_ADDR) continue; if (!of_property_read_u32(child, "eeprom-length", &eeprom_len)) cd->eeprom_len = eeprom_len; + mdio = of_parse_phandle(child, "mii-bus", 0); + if (mdio) { + mdio_bus_switch = of_mdio_find_bus(mdio); + if (!mdio_bus_switch) { + ret = -EPROBE_DEFER; 
+ goto out_free_chip; + } + + /* Drop the mdio_bus device ref, replacing the host + * device with the mdio_bus_switch device, keeping + * the refcount from of_mdio_find_bus() above. + */ + put_device(cd->host_dev); + cd->host_dev = &mdio_bus_switch->dev; + } + for_each_available_child_of_node(child, port) { port_reg = of_get_property(port, "reg", NULL); if (!port_reg) continue; port_index = be32_to_cpup(port_reg); + if (port_index >= DSA_MAX_PORTS) + break; port_name = of_get_property(port, "label", NULL); if (!port_name) @@ -656,21 +789,18 @@ static int dsa_of_probe(struct device *dev) goto out_free_chip; } - link = of_parse_phandle(port, "link", 0); - - if (!strcmp(port_name, "dsa") && link && - pd->nr_chips > 1) { - ret = dsa_of_setup_routing_table(pd, cd, - chip_index, port_index, link); - if (ret) - goto out_free_chip; - } + ret = dsa_of_probe_links(pd, cd, chip_index, + port_index, port, port_name); + if (ret) + goto out_free_chip; - if (port_index == DSA_MAX_PORTS) - break; } } + /* The individual chips hold their own refcount on the mdio bus, + * so drop ours */ + put_device(&mdio_bus->dev); + return 0; out_free_chip: @@ -678,6 +808,10 @@ out_free_chip: out_free: kfree(pd); dev->platform_data = NULL; +out_put_ethernet: + put_device(&ethernet_dev->dev); +out_put_mdio: + put_device(&mdio_bus->dev); return ret; } @@ -689,6 +823,7 @@ static void dsa_of_remove(struct device *dev) return; dsa_of_free_platform_data(pd); + put_device(&pd->of_netdev->dev); kfree(pd); } #else @@ -702,10 +837,11 @@ static inline void dsa_of_remove(struct device *dev) } #endif -static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, - struct device *parent, struct dsa_platform_data *pd) +static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, + struct device *parent, struct dsa_platform_data *pd) { int i; + unsigned configured = 0; dst->pd = pd; dst->master_netdev = dev; @@ -725,8 +861,16 @@ static void dsa_setup_dst(struct dsa_switch_tree
*dst, struct net_device *dev, dst->ds[i] = ds; if (ds->drv->poll_link != NULL) dst->link_poll_needed = 1; + + ++configured; } + /* + * If no switch was found, exit cleanly + */ + if (!configured) + return -EPROBE_DEFER; + /* * If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get @@ -743,6 +887,8 @@ static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, dst->link_poll_timer.expires = round_jiffies(jiffies + HZ); add_timer(&dst->link_poll_timer); } + + return 0; } static int dsa_probe(struct platform_device *pdev) @@ -783,7 +929,7 @@ static int dsa_probe(struct platform_device *pdev) goto out; } - dst = kzalloc(sizeof(*dst), GFP_KERNEL); + dst = devm_kzalloc(&pdev->dev, sizeof(*dst), GFP_KERNEL); if (dst == NULL) { dev_put(dev); ret = -ENOMEM; @@ -792,7 +938,9 @@ static int dsa_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dst); - dsa_setup_dst(dst, dev, &pdev->dev, pd); + ret = dsa_setup_dst(dst, dev, &pdev->dev, pd); + if (ret) + goto out; return 0; @@ -814,7 +962,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; - if (ds != NULL) + if (ds) dsa_switch_destroy(ds); } } diff --git a/kernel/net/dsa/dsa_priv.h b/kernel/net/dsa/dsa_priv.h index d5f1f9b86..311796c80 100644 --- a/kernel/net/dsa/dsa_priv.h +++ b/kernel/net/dsa/dsa_priv.h @@ -13,9 +13,10 @@ #include #include +#include struct dsa_device_ops { - netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); int (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); }; @@ -26,7 +27,7 @@ struct dsa_slave_priv { * switch port. 
*/ struct net_device *dev; - netdev_tx_t (*xmit)(struct sk_buff *skb, + struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); /* @@ -47,6 +48,9 @@ struct dsa_slave_priv { int old_duplex; struct net_device *bridge_dev; +#ifdef CONFIG_NET_POLL_CONTROLLER + struct netpoll *netpoll; +#endif }; /* dsa.c */ diff --git a/kernel/net/dsa/slave.c b/kernel/net/dsa/slave.c index 57978c5b2..7bc787b09 100644 --- a/kernel/net/dsa/slave.c +++ b/kernel/net/dsa/slave.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "dsa_priv.h" /* slave mii_bus handling ***************************************************/ @@ -112,7 +113,7 @@ static int dsa_slave_open(struct net_device *dev) clear_promisc: if (dev->flags & IFF_PROMISC) - dev_set_promiscuity(master, 0); + dev_set_promiscuity(master, -1); clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(master, -1); @@ -199,103 +200,178 @@ out: return 0; } -static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid, u16 nlm_flags) +static int dsa_bridge_check_vlan_range(struct dsa_switch *ds, + const struct net_device *bridge, + u16 vid_begin, u16 vid_end) +{ + struct dsa_slave_priv *p; + struct net_device *dev, *vlan_br; + DECLARE_BITMAP(members, DSA_MAX_PORTS); + DECLARE_BITMAP(untagged, DSA_MAX_PORTS); + u16 vid; + int member, err; + + if (!ds->drv->vlan_getnext || !vid_begin) + return -EOPNOTSUPP; + + vid = vid_begin - 1; + + do { + err = ds->drv->vlan_getnext(ds, &vid, members, untagged); + if (err) + break; + + if (vid > vid_end) + break; + + member = find_first_bit(members, DSA_MAX_PORTS); + if (member == DSA_MAX_PORTS) + continue; + + dev = ds->ports[member]; + p = netdev_priv(dev); + vlan_br = p->bridge_dev; + if (vlan_br == bridge) + continue; + + netdev_dbg(vlan_br, "hardware VLAN %d already in use\n", vid); + return -EOPNOTSUPP; + } while (vid < vid_end); + + return err == -ENOENT ? 
0 : err; +} + +static int dsa_slave_port_vlan_add(struct net_device *dev, + const struct switchdev_obj_port_vlan *vlan, + struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - int ret = -EOPNOTSUPP; + int err; - if (ds->drv->fdb_add) - ret = ds->drv->fdb_add(ds, p->port, addr, vid); + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add) + return -EOPNOTSUPP; - return ret; + /* If the requested port doesn't belong to the same bridge as + * the VLAN members, fallback to software VLAN (hopefully). + */ + err = dsa_bridge_check_vlan_range(ds, p->bridge_dev, + vlan->vid_begin, + vlan->vid_end); + if (err) + return err; + + err = ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); + if (err) + return err; + } else { + err = ds->drv->port_vlan_add(ds, p->port, vlan, trans); + if (err) + return err; + } + + return 0; } -static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) +static int dsa_slave_port_vlan_del(struct net_device *dev, + const struct switchdev_obj_port_vlan *vlan) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - int ret = -EOPNOTSUPP; - if (ds->drv->fdb_del) - ret = ds->drv->fdb_del(ds, p->port, addr, vid); + if (!ds->drv->port_vlan_del) + return -EOPNOTSUPP; - return ret; + return ds->drv->port_vlan_del(ds, p->port, vlan); } -static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb, - const unsigned char *addr, u16 vid, - bool is_static, - u32 portid, u32 seq, int type, - unsigned int flags) +static int dsa_slave_port_vlan_dump(struct net_device *dev, + struct switchdev_obj_port_vlan *vlan, + switchdev_obj_dump_cb_t *cb) { - struct nlmsghdr *nlh; - struct ndmsg *ndm; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + DECLARE_BITMAP(members, DSA_MAX_PORTS); + DECLARE_BITMAP(untagged, 
DSA_MAX_PORTS); + u16 pvid, vid = 0; + int err; + + if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get) + return -EOPNOTSUPP; + + err = ds->drv->port_pvid_get(ds, p->port, &pvid); + if (err) + return err; + + for (;;) { + err = ds->drv->vlan_getnext(ds, &vid, members, untagged); + if (err) + break; - nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); - if (!nlh) - return -EMSGSIZE; + if (!test_bit(p->port, members)) + continue; - ndm = nlmsg_data(nlh); - ndm->ndm_family = AF_BRIDGE; - ndm->ndm_pad1 = 0; - ndm->ndm_pad2 = 0; - ndm->ndm_flags = NTF_EXT_LEARNED; - ndm->ndm_type = 0; - ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; + memset(vlan, 0, sizeof(*vlan)); + vlan->vid_begin = vlan->vid_end = vid; - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) - goto nla_put_failure; + if (vid == pvid) + vlan->flags |= BRIDGE_VLAN_INFO_PVID; - if (vid && nla_put_u16(skb, NDA_VLAN, vid)) - goto nla_put_failure; + if (test_bit(p->port, untagged)) + vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED; - nlmsg_end(skb, nlh); - return 0; + err = cb(&vlan->obj); + if (err) + break; + } -nla_put_failure: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; + return err == -ENOENT ? 
0 : err; } -/* Dump information about entries, in response to GETNEIGH */ -static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev, - struct net_device *filter_dev, int idx) +static int dsa_slave_port_fdb_add(struct net_device *dev, + const struct switchdev_obj_port_fdb *fdb, + struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - unsigned char addr[ETH_ALEN] = { 0 }; int ret; - if (!ds->drv->fdb_getnext) + if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add) return -EOPNOTSUPP; - for (; ; idx++) { - bool is_static; + if (switchdev_trans_ph_prepare(trans)) + ret = ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); + else + ret = ds->drv->port_fdb_add(ds, p->port, fdb, trans); - ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static); - if (ret < 0) - break; + return ret; +} - if (idx < cb->args[0]) - continue; +static int dsa_slave_port_fdb_del(struct net_device *dev, + const struct switchdev_obj_port_fdb *fdb) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; - ret = dsa_slave_fill_info(dev, skb, addr, 0, - is_static, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, NLM_F_MULTI); - if (ret < 0) - break; - } + if (ds->drv->port_fdb_del) + ret = ds->drv->port_fdb_del(ds, p->port, fdb); - return idx; + return ret; +} + +static int dsa_slave_port_fdb_dump(struct net_device *dev, + struct switchdev_obj_port_fdb *fdb, + switchdev_obj_dump_cb_t *cb) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->port_fdb_dump) + return ds->drv->port_fdb_dump(ds, p->port, fdb, cb); + + return -EOPNOTSUPP; } static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) @@ -345,6 +421,107 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state) return ret; } +static int dsa_slave_port_attr_set(struct net_device 
*dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret; + + switch (attr->id) { + case SWITCHDEV_ATTR_ID_PORT_STP_STATE: + if (switchdev_trans_ph_prepare(trans)) + ret = ds->drv->port_stp_update ? 0 : -EOPNOTSUPP; + else + ret = ds->drv->port_stp_update(ds, p->port, + attr->u.stp_state); + break; + default: + ret = -EOPNOTSUPP; + break; + } + + return ret; +} + +static int dsa_slave_port_obj_add(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans) +{ + int err; + + /* For the prepare phase, ensure the full set of changes is feasable in + * one go in order to signal a failure properly. If an operation is not + * supported, return -EOPNOTSUPP. + */ + + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_FDB: + err = dsa_slave_port_fdb_add(dev, + SWITCHDEV_OBJ_PORT_FDB(obj), + trans); + break; + case SWITCHDEV_OBJ_ID_PORT_VLAN: + err = dsa_slave_port_vlan_add(dev, + SWITCHDEV_OBJ_PORT_VLAN(obj), + trans); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static int dsa_slave_port_obj_del(struct net_device *dev, + const struct switchdev_obj *obj) +{ + int err; + + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_FDB: + err = dsa_slave_port_fdb_del(dev, + SWITCHDEV_OBJ_PORT_FDB(obj)); + break; + case SWITCHDEV_OBJ_ID_PORT_VLAN: + err = dsa_slave_port_vlan_del(dev, + SWITCHDEV_OBJ_PORT_VLAN(obj)); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static int dsa_slave_port_obj_dump(struct net_device *dev, + struct switchdev_obj *obj, + switchdev_obj_dump_cb_t *cb) +{ + int err; + + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_FDB: + err = dsa_slave_port_fdb_dump(dev, + SWITCHDEV_OBJ_PORT_FDB(obj), + cb); + break; + case SWITCHDEV_OBJ_ID_PORT_VLAN: + err = dsa_slave_port_vlan_dump(dev, + SWITCHDEV_OBJ_PORT_VLAN(obj), + cb); + break; + default: + err = -EOPNOTSUPP; + 
break; + } + + return err; +} + static int dsa_slave_bridge_port_join(struct net_device *dev, struct net_device *br) { @@ -382,36 +559,71 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev) return ret; } -static int dsa_slave_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +static int dsa_slave_port_attr_get(struct net_device *dev, + struct switchdev_attr *attr) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - psid->id_len = sizeof(ds->index); - memcpy(&psid->id, &ds->index, psid->id_len); + switch (attr->id) { + case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: + attr->u.ppid.id_len = sizeof(ds->index); + memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len); + break; + default: + return -EOPNOTSUPP; + } return 0; } -static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) +static inline netdev_tx_t dsa_netpoll_send_skb(struct dsa_slave_priv *p, + struct sk_buff *skb) { - struct dsa_slave_priv *p = netdev_priv(dev); - - return p->xmit(skb, dev); +#ifdef CONFIG_NET_POLL_CONTROLLER + if (p->netpoll) + netpoll_send_skb(p->netpoll, skb); +#else + BUG(); +#endif + return NETDEV_TX_OK; } -static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb, - struct net_device *dev) +static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); + struct sk_buff *nskb; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + /* Transmit function may have to reallocate the original SKB */ + nskb = p->xmit(skb, dev); + if (!nskb) + return NETDEV_TX_OK; + + /* SKB for netpoll still need to be mangled with the protocol-specific + * tag to be successfully transmitted + */ + if (unlikely(netpoll_tx_running(dev))) + return dsa_netpoll_send_skb(p, nskb); - skb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(skb); + /* Queue the SKB for transmission on the parent interface, but + * do not modify its EtherType + */ + 
nskb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(nskb); return NETDEV_TX_OK; } +static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Just return the original SKB */ + return skb; +} + /* ethtool operations *******************************************************/ static int @@ -641,6 +853,49 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) return ret; } +#ifdef CONFIG_NET_POLL_CONTROLLER +static int dsa_slave_netpoll_setup(struct net_device *dev, + struct netpoll_info *ni) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + struct net_device *master = ds->dst->master_netdev; + struct netpoll *netpoll; + int err = 0; + + netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + if (!netpoll) + return -ENOMEM; + + err = __netpoll_setup(netpoll, master); + if (err) { + kfree(netpoll); + goto out; + } + + p->netpoll = netpoll; +out: + return err; +} + +static void dsa_slave_netpoll_cleanup(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct netpoll *netpoll = p->netpoll; + + if (!netpoll) + return; + + p->netpoll = NULL; + + __netpoll_free_async(netpoll); +} + +static void dsa_slave_poll_controller(struct net_device *dev) +{ +} +#endif + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, @@ -668,16 +923,31 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_fdb_add = dsa_slave_fdb_add, - .ndo_fdb_del = dsa_slave_fdb_del, - .ndo_fdb_dump = dsa_slave_fdb_dump, + .ndo_fdb_add = switchdev_port_fdb_add, + .ndo_fdb_del = switchdev_port_fdb_del, + .ndo_fdb_dump = switchdev_port_fdb_dump, .ndo_do_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, +#ifdef 
CONFIG_NET_POLL_CONTROLLER + .ndo_netpoll_setup = dsa_slave_netpoll_setup, + .ndo_netpoll_cleanup = dsa_slave_netpoll_cleanup, + .ndo_poll_controller = dsa_slave_poll_controller, +#endif + .ndo_bridge_getlink = switchdev_port_bridge_getlink, + .ndo_bridge_setlink = switchdev_port_bridge_setlink, + .ndo_bridge_dellink = switchdev_port_bridge_dellink, +}; + +static const struct switchdev_ops dsa_slave_switchdev_ops = { + .switchdev_port_attr_get = dsa_slave_port_attr_get, + .switchdev_port_attr_set = dsa_slave_port_attr_set, + .switchdev_port_obj_add = dsa_slave_port_obj_add, + .switchdev_port_obj_del = dsa_slave_port_obj_del, + .switchdev_port_obj_dump = dsa_slave_port_obj_dump, }; -static const struct swdev_ops dsa_slave_swdev_ops = { - .swdev_parent_id_get = dsa_slave_parent_id_get, - .swdev_port_stp_update = dsa_slave_stp_update, +static struct device_type dsa_type = { + .name = "dsa", }; static void dsa_slave_adjust_link(struct net_device *dev) @@ -728,8 +998,10 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, struct dsa_switch *ds = p->parent; p->phy = ds->slave_mii_bus->phy_map[addr]; - if (!p->phy) + if (!p->phy) { + netdev_err(slave_dev, "no phy at %d\n", addr); return -ENODEV; + } /* Use already configured phy mode */ if (p->phy_interface == PHY_INTERFACE_MODE_NA) @@ -763,7 +1035,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, */ ret = of_phy_register_fixed_link(port_dn); if (ret) { - netdev_err(slave_dev, "failed to register fixed PHY\n"); + netdev_err(slave_dev, "failed to register fixed PHY: %d\n", ret); return ret; } phy_is_fixed = true; @@ -774,17 +1046,20 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_flags = ds->drv->get_phy_flags(ds, p->port); if (phy_dn) { - ret = of_mdio_parse_addr(&slave_dev->dev, phy_dn); + int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); + /* If this PHY address is part of phys_mii_mask, which means * that we need to divert reads and writes to/from it, then we * want to bind 
this device using the slave MII bus created by * DSA to make that happen. */ - if (!phy_is_fixed && ret >= 0 && - (ds->phys_mii_mask & (1 << ret))) { - ret = dsa_slave_phy_connect(p, slave_dev, ret); - if (ret) + if (!phy_is_fixed && phy_id >= 0 && + (ds->phys_mii_mask & (1 << phy_id))) { + ret = dsa_slave_phy_connect(p, slave_dev, phy_id); + if (ret) { + netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); return ret; + } } else { p->phy = of_phy_connect(slave_dev, phy_dn, dsa_slave_adjust_link, @@ -801,8 +1076,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, */ if (!p->phy) { ret = dsa_slave_phy_connect(p, slave_dev, p->port); - if (ret) + if (ret) { + netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret); return ret; + } } else { netdev_info(slave_dev, "attached PHY at address %d [%s]\n", p->phy->addr, p->phy->drv->name); @@ -811,12 +1088,19 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, return 0; } +static struct lock_class_key dsa_slave_netdev_xmit_lock_key; +static void dsa_slave_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &dsa_slave_netdev_xmit_lock_key); +} + int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); - netif_device_detach(slave_dev); - if (p->phy) { phy_stop(p->phy); p->old_pause = -1; @@ -858,9 +1142,13 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->features = master->vlan_features; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); - slave_dev->tx_queue_len = 0; + slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; - slave_dev->swdev_ops = &dsa_slave_swdev_ops; + slave_dev->switchdev_ops = &dsa_slave_switchdev_ops; + SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); + + netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, + 
NULL); SET_NETDEV_DEV(slave_dev, parent); slave_dev->dev.of_node = ds->pd->port_dn[port]; @@ -903,6 +1191,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { + netdev_err(master, "error %d setting up slave phy\n", ret); free_netdev(slave_dev); return ret; } @@ -956,7 +1245,7 @@ int dsa_slave_netdevice_event(struct notifier_block *unused, goto out; err = dsa_slave_master_changed(dev); - if (err) + if (err && err != -EOPNOTSUPP) netdev_warn(dev, "failed to reflect master change\n"); break; diff --git a/kernel/net/dsa/tag_brcm.c b/kernel/net/dsa/tag_brcm.c index 83d3572cd..e2aadb731 100644 --- a/kernel/net/dsa/tag_brcm.c +++ b/kernel/net/dsa/tag_brcm.c @@ -58,14 +58,11 @@ #define BRCM_EG_TC_MASK 0x7 #define BRCM_EG_PID_MASK 0x1f -static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *brcm_tag; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - if (skb_cow_head(skb, BRCM_TAG_LEN) < 0) goto out_free; @@ -87,17 +84,11 @@ static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK; - /* Queue the SKB for transmission on the parent interface, but - * do not modify its EtherType - */ - skb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(skb); - - return NETDEV_TX_OK; + return skb; out_free: kfree_skb(skb); - return NETDEV_TX_OK; + return NULL; } static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/kernel/net/dsa/tag_dsa.c b/kernel/net/dsa/tag_dsa.c index 2dab27063..aa780e4ac 100644 --- a/kernel/net/dsa/tag_dsa.c +++ b/kernel/net/dsa/tag_dsa.c @@ -15,14 +15,11 @@ #define DSA_HLEN 4 -static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *dsa_xmit(struct 
sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *dsa_header; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - /* * Convert the outermost 802.1q tag to a DSA tag for tagged * packets, or insert a DSA tag between the addresses and @@ -63,14 +60,11 @@ static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) dsa_header[3] = 0x00; } - skb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(skb); - - return NETDEV_TX_OK; + return skb; out_free: kfree_skb(skb); - return NETDEV_TX_OK; + return NULL; } static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/kernel/net/dsa/tag_edsa.c b/kernel/net/dsa/tag_edsa.c index 9aeda596f..2288c8098 100644 --- a/kernel/net/dsa/tag_edsa.c +++ b/kernel/net/dsa/tag_edsa.c @@ -16,14 +16,11 @@ #define DSA_HLEN 4 #define EDSA_HLEN 8 -static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *edsa_header; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - /* * Convert the outermost 802.1q tag to a DSA tag and prepend * a DSA ethertype field is the packet is tagged, or insert @@ -76,14 +73,11 @@ static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[7] = 0x00; } - skb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(skb); - - return NETDEV_TX_OK; + return skb; out_free: kfree_skb(skb); - return NETDEV_TX_OK; + return NULL; } static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/kernel/net/dsa/tag_trailer.c b/kernel/net/dsa/tag_trailer.c index e268f9db8..b6ca0890d 100644 --- a/kernel/net/dsa/tag_trailer.c +++ b/kernel/net/dsa/tag_trailer.c @@ -13,16 +13,13 @@ #include #include "dsa_priv.h" -static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct 
net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct sk_buff *nskb; int padlen; u8 *trailer; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - /* * We have to make sure that the trailer ends up as the very * last 4 bytes of the packet. This means that we have to pad @@ -36,7 +33,7 @@ static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC); if (nskb == NULL) { kfree_skb(skb); - return NETDEV_TX_OK; + return NULL; } skb_reserve(nskb, NET_IP_ALIGN); @@ -57,10 +54,7 @@ static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) trailer[2] = 0x10; trailer[3] = 0x00; - nskb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(nskb); - - return NETDEV_TX_OK; + return nskb; } static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, @@ -84,7 +78,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, trailer = skb_tail_pointer(skb) - 4; if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 || - (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00) + (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00) goto out_drop; source_port = trailer[1] & 7; diff --git a/kernel/net/ethernet/eth.c b/kernel/net/ethernet/eth.c index f3bad41d7..9e63f252a 100644 --- a/kernel/net/ethernet/eth.c +++ b/kernel/net/ethernet/eth.c @@ -58,6 +58,7 @@ #include #include #include +#include #include __setup("ether=", netdev_boot_setup); @@ -113,7 +114,7 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, EXPORT_SYMBOL(eth_header); /** - * eth_get_headlen - determine the the length of header for an ethernet frame + * eth_get_headlen - determine the length of header for an ethernet frame * @data: pointer to start of frame * @len: total length of frame * @@ -126,13 +127,13 @@ u32 eth_get_headlen(void *data, unsigned int len) struct flow_keys keys; /* this should never happen, but better safe than sorry */ - if (len < sizeof(*eth)) + if 
(unlikely(len < sizeof(*eth))) return len; /* parse any remaining L2/L3 headers, check for L4 */ - if (!__skb_flow_dissect(NULL, &keys, data, - eth->h_proto, sizeof(*eth), len)) - return max_t(u32, keys.thoff, sizeof(*eth)); + if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, + sizeof(*eth), len, 0)) + return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); @@ -156,10 +157,11 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; skb_reset_mac_header(skb); + + eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); - eth = eth_hdr(skb); - if (unlikely(is_multicast_ether_addr(eth->h_dest))) { + if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else @@ -178,7 +180,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); - if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN)) + if (likely(eth_proto_is_802_3(eth->h_proto))) return eth->h_proto; /* @@ -468,6 +470,7 @@ EXPORT_SYMBOL(eth_gro_complete); static struct packet_offload eth_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_TEB), + .priority = 10, .callbacks = { .gro_receive = eth_gro_receive, .gro_complete = eth_gro_complete, diff --git a/kernel/net/hsr/hsr_device.c b/kernel/net/hsr/hsr_device.c index 44d27469a..c7d1adca3 100644 --- a/kernel/net/hsr/hsr_device.c +++ b/kernel/net/hsr/hsr_device.c @@ -312,7 +312,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, u8 type) return; out: - WARN_ON_ONCE("HSR: Could not send supervision frame\n"); + WARN_ONCE(1, "HSR: Could not send supervision frame\n"); kfree_skb(skb); } @@ -392,7 +392,7 @@ void hsr_dev_setup(struct net_device *dev) dev->header_ops = &hsr_header_ops; dev->netdev_ops = &hsr_device_ops; 
SET_NETDEV_DEVTYPE(dev, &hsr_type); - dev->tx_queue_len = 0; + dev->priv_flags |= IFF_NO_QUEUE; dev->destructor = hsr_dev_destroy; diff --git a/kernel/net/ieee802154/6lowpan/6lowpan_i.h b/kernel/net/ieee802154/6lowpan/6lowpan_i.h index e50f69da7..b4e17a7c0 100644 --- a/kernel/net/ieee802154/6lowpan/6lowpan_i.h +++ b/kernel/net/ieee802154/6lowpan/6lowpan_i.h @@ -5,6 +5,16 @@ #include #include +#include + +typedef unsigned __bitwise__ lowpan_rx_result; +#define RX_CONTINUE ((__force lowpan_rx_result) 0u) +#define RX_DROP_UNUSABLE ((__force lowpan_rx_result) 1u) +#define RX_DROP ((__force lowpan_rx_result) 2u) +#define RX_QUEUED ((__force lowpan_rx_result) 3u) + +#define LOWPAN_DISPATCH_FRAG1 0xc0 +#define LOWPAN_DISPATCH_FRAGN 0xe0 struct lowpan_create_arg { u16 tag; @@ -37,26 +47,18 @@ static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) } } -struct lowpan_dev_record { - struct net_device *ldev; - struct list_head list; -}; - /* private device info */ struct lowpan_dev_info { - struct net_device *real_dev; /* real WPAN device ptr */ - struct mutex dev_list_mtx; /* mutex for list ops */ + struct net_device *wdev; /* wpan device ptr */ u16 fragment_tag; }; static inline struct lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) { - return netdev_priv(dev); + return (struct lowpan_dev_info *)lowpan_priv(dev)->priv; } -extern struct list_head lowpan_devices; - int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); void lowpan_net_frag_exit(void); int lowpan_net_frag_init(void); @@ -69,4 +71,7 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, const void *_saddr, unsigned int len); netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev); +int lowpan_iphc_decompress(struct sk_buff *skb); +lowpan_rx_result lowpan_rx_h_ipv6(struct sk_buff *skb); + #endif /* __IEEE802154_6LOWPAN_I_H__ */ diff --git a/kernel/net/ieee802154/6lowpan/core.c b/kernel/net/ieee802154/6lowpan/core.c index 0ae5822ef..20c49c724 
100644 --- a/kernel/net/ieee802154/6lowpan/core.c +++ b/kernel/net/ieee802154/6lowpan/core.c @@ -52,29 +52,7 @@ #include "6lowpan_i.h" -LIST_HEAD(lowpan_devices); -static int lowpan_open_count; - -static __le16 lowpan_get_pan_id(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - - return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev); -} - -static __le16 lowpan_get_short_addr(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - - return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); -} - -static u8 lowpan_get_dsn(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - - return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev); -} +static int open_count; static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, @@ -83,7 +61,7 @@ static struct header_ops lowpan_header_ops = { static struct lock_class_key lowpan_tx_busylock; static struct lock_class_key lowpan_netdev_xmit_lock_key; -static void lowpan_set_lockdep_class_one(struct net_device *dev, +static void lowpan_set_lockdep_class_one(struct net_device *ldev, struct netdev_queue *txq, void *_unused) { @@ -91,42 +69,47 @@ static void lowpan_set_lockdep_class_one(struct net_device *dev, &lowpan_netdev_xmit_lock_key); } -static int lowpan_dev_init(struct net_device *dev) +static int lowpan_dev_init(struct net_device *ldev) +{ + netdev_for_each_tx_queue(ldev, lowpan_set_lockdep_class_one, NULL); + ldev->qdisc_tx_busylock = &lowpan_tx_busylock; + return 0; +} + +static int lowpan_open(struct net_device *dev) { - netdev_for_each_tx_queue(dev, lowpan_set_lockdep_class_one, NULL); - dev->qdisc_tx_busylock = &lowpan_tx_busylock; + if (!open_count) + lowpan_rx_init(); + open_count++; + return 0; +} + +static int lowpan_stop(struct net_device *dev) +{ + open_count--; + if (!open_count) + lowpan_rx_exit(); return 0; } static const struct net_device_ops 
lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, + .ndo_open = lowpan_open, + .ndo_stop = lowpan_stop, }; -static struct ieee802154_mlme_ops lowpan_mlme = { - .get_pan_id = lowpan_get_pan_id, - .get_short_addr = lowpan_get_short_addr, - .get_dsn = lowpan_get_dsn, -}; - -static void lowpan_setup(struct net_device *dev) +static void lowpan_setup(struct net_device *ldev) { - dev->addr_len = IEEE802154_ADDR_LEN; - memset(dev->broadcast, 0xff, IEEE802154_ADDR_LEN); - dev->type = ARPHRD_6LOWPAN; - /* Frame Control + Sequence Number + Address fields + Security Header */ - dev->hard_header_len = 2 + 1 + 20 + 14; - dev->needed_tailroom = 2; /* FCS */ - dev->mtu = IPV6_MIN_MTU; - dev->tx_queue_len = 0; - dev->flags = IFF_BROADCAST | IFF_MULTICAST; - dev->watchdog_timeo = 0; - - dev->netdev_ops = &lowpan_netdev_ops; - dev->header_ops = &lowpan_header_ops; - dev->ml_priv = &lowpan_mlme; - dev->destructor = free_netdev; - dev->features |= NETIF_F_NETNS_LOCAL; + memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN); + /* We need an ipv6hdr as minimum len when calling xmit */ + ldev->hard_header_len = sizeof(struct ipv6hdr); + ldev->flags = IFF_BROADCAST | IFF_MULTICAST; + + ldev->netdev_ops = &lowpan_netdev_ops; + ldev->header_ops = &lowpan_header_ops; + ldev->destructor = free_netdev; + ldev->features |= NETIF_F_NETNS_LOCAL; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -138,11 +121,10 @@ static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } -static int lowpan_newlink(struct net *src_net, struct net_device *dev, +static int lowpan_newlink(struct net *src_net, struct net_device *ldev, struct nlattr *tb[], struct nlattr *data[]) { - struct net_device *real_dev; - struct lowpan_dev_record *entry; + struct net_device *wdev; int ret; ASSERT_RTNL(); @@ -150,78 +132,61 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, pr_debug("adding new link\n"); if (!tb[IFLA_LINK] || - 
!net_eq(dev_net(dev), &init_net)) + !net_eq(dev_net(ldev), &init_net)) return -EINVAL; - /* find and hold real wpan device */ - real_dev = dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK])); - if (!real_dev) + /* find and hold wpan device */ + wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); + if (!wdev) return -ENODEV; - if (real_dev->type != ARPHRD_IEEE802154) { - dev_put(real_dev); + if (wdev->type != ARPHRD_IEEE802154) { + dev_put(wdev); return -EINVAL; } - lowpan_dev_info(dev)->real_dev = real_dev; - mutex_init(&lowpan_dev_info(dev)->dev_list_mtx); - - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { - dev_put(real_dev); - lowpan_dev_info(dev)->real_dev = NULL; - return -ENOMEM; + if (wdev->ieee802154_ptr->lowpan_dev) { + dev_put(wdev); + return -EBUSY; } - entry->ldev = dev; - + lowpan_dev_info(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ - memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN); - - mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); - INIT_LIST_HEAD(&entry->list); - list_add_tail(&entry->list, &lowpan_devices); - mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); - - ret = register_netdevice(dev); - if (ret >= 0) { - if (!lowpan_open_count) - lowpan_rx_init(); - lowpan_open_count++; + memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN); + /* We need headroom for possible wpan_dev_hard_header call and tailroom + * for encryption/fcs handling. The lowpan interface will replace + * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN + * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6 + * header. 
+ */ + ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN + + wdev->needed_headroom; + ldev->needed_tailroom = wdev->needed_tailroom; + + lowpan_netdev_setup(ldev, LOWPAN_LLTYPE_IEEE802154); + + ret = register_netdevice(ldev); + if (ret < 0) { + dev_put(wdev); + return ret; } - return ret; + wdev->ieee802154_ptr->lowpan_dev = ldev; + return 0; } -static void lowpan_dellink(struct net_device *dev, struct list_head *head) +static void lowpan_dellink(struct net_device *ldev, struct list_head *head) { - struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev); - struct net_device *real_dev = lowpan_dev->real_dev; - struct lowpan_dev_record *entry, *tmp; + struct net_device *wdev = lowpan_dev_info(ldev)->wdev; ASSERT_RTNL(); - lowpan_open_count--; - if (!lowpan_open_count) - lowpan_rx_exit(); - - mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); - list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { - if (entry->ldev == dev) { - list_del(&entry->list); - kfree(entry); - } - } - mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); - - mutex_destroy(&lowpan_dev_info(dev)->dev_list_mtx); - - unregister_netdevice_queue(dev, head); - - dev_put(real_dev); + wdev->ieee802154_ptr->lowpan_dev = NULL; + unregister_netdevice(ldev); + dev_put(wdev); } static struct rtnl_link_ops lowpan_link_ops __read_mostly = { .kind = "lowpan", - .priv_size = sizeof(struct lowpan_dev_info), + .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev_info)), .setup = lowpan_setup, .newlink = lowpan_newlink, .dellink = lowpan_dellink, @@ -241,20 +206,22 @@ static inline void lowpan_netlink_fini(void) static int lowpan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - LIST_HEAD(del_list); - struct lowpan_dev_record *entry, *tmp; + struct net_device *wdev = netdev_notifier_info_to_dev(ptr); - if (dev->type != ARPHRD_IEEE802154) + if (wdev->type != ARPHRD_IEEE802154) goto out; - if (event == 
NETDEV_UNREGISTER) { - list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { - if (lowpan_dev_info(entry->ldev)->real_dev == dev) - lowpan_dellink(entry->ldev, &del_list); - } - - unregister_netdevice_many(&del_list); + switch (event) { + case NETDEV_UNREGISTER: + /* Check if wpan interface is unregistered that we + * also delete possible lowpan interfaces which belongs + * to the wpan interface. + */ + if (wdev->ieee802154_ptr->lowpan_dev) + lowpan_dellink(wdev->ieee802154_ptr->lowpan_dev, NULL); + break; + default: + break; } out: diff --git a/kernel/net/ieee802154/6lowpan/reassembly.c b/kernel/net/ieee802154/6lowpan/reassembly.c index f46e4d130..6b437e876 100644 --- a/kernel/net/ieee802154/6lowpan/reassembly.c +++ b/kernel/net/ieee802154/6lowpan/reassembly.c @@ -32,21 +32,10 @@ static const char lowpan_frags_cache_name[] = "lowpan-frags"; -struct lowpan_frag_info { - u16 d_tag; - u16 d_size; - u8 d_offset; -}; - -static struct lowpan_frag_info *lowpan_cb(struct sk_buff *skb) -{ - return (struct lowpan_frag_info *)skb->cb; -} - static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, - struct sk_buff *prev, struct net_device *dev); + struct sk_buff *prev, struct net_device *ldev); static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, const struct ieee802154_addr *saddr, @@ -111,7 +100,7 @@ out: } static inline struct lowpan_frag_queue * -fq_find(struct net *net, const struct lowpan_frag_info *frag_info, +fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { @@ -121,12 +110,12 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info, struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); - arg.tag = frag_info->d_tag; - arg.d_size = frag_info->d_size; + arg.tag = cb->d_tag; + arg.d_size = cb->d_size; arg.src = src; arg.dst = dst; - hash = lowpan_hash_frag(frag_info->d_tag, frag_info->d_size, src, 
dst); + hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); q = inet_frag_find(&ieee802154_lowpan->frags, &lowpan_frags, &arg, hash); @@ -138,17 +127,17 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info, } static int lowpan_frag_queue(struct lowpan_frag_queue *fq, - struct sk_buff *skb, const u8 frag_type) + struct sk_buff *skb, u8 frag_type) { struct sk_buff *prev, *next; - struct net_device *dev; + struct net_device *ldev; int end, offset; if (fq->q.flags & INET_FRAG_COMPLETE) goto err; - offset = lowpan_cb(skb)->d_offset << 3; - end = lowpan_cb(skb)->d_size; + offset = lowpan_802154_cb(skb)->d_offset << 3; + end = lowpan_802154_cb(skb)->d_size; /* Is this the final fragment? */ if (offset + skb->len == end) { @@ -174,13 +163,16 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, * this fragment, right? */ prev = fq->q.fragments_tail; - if (!prev || lowpan_cb(prev)->d_offset < lowpan_cb(skb)->d_offset) { + if (!prev || + lowpan_802154_cb(prev)->d_offset < + lowpan_802154_cb(skb)->d_offset) { next = NULL; goto found; } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { - if (lowpan_cb(next)->d_offset >= lowpan_cb(skb)->d_offset) + if (lowpan_802154_cb(next)->d_offset >= + lowpan_802154_cb(skb)->d_offset) break; /* bingo! */ prev = next; } @@ -195,19 +187,16 @@ found: else fq->q.fragments = skb; - dev = skb->dev; - if (dev) + ldev = skb->dev; + if (ldev) skb->dev = NULL; fq->q.stamp = skb->tstamp; - if (frag_type == LOWPAN_DISPATCH_FRAG1) { - /* Calculate uncomp. 
6lowpan header to estimate full size */ - fq->q.meat += lowpan_uncompress_size(skb, NULL); + if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; - } else { - fq->q.meat += skb->len; - } - add_frag_mem_limit(&fq->q, skb->truesize); + + fq->q.meat += skb->len; + add_frag_mem_limit(fq->q.net, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { @@ -215,7 +204,7 @@ found: unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - res = lowpan_frag_reasm(fq, prev, dev); + res = lowpan_frag_reasm(fq, prev, ldev); skb->_skb_refdst = orefdst; return res; } @@ -235,7 +224,7 @@ err: * the last and the first frames arrived and all the bits are here. */ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, - struct net_device *dev) + struct net_device *ldev) { struct sk_buff *fp, *head = fq->q.fragments; int sum_truesize; @@ -287,7 +276,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, clone->data_len = clone->len; head->data_len -= clone->len; head->len -= clone->len; - add_frag_mem_limit(&fq->q, clone->truesize); + add_frag_mem_limit(fq->q.net, clone->truesize); } WARN_ON(head == NULL); @@ -310,10 +299,10 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, } fp = next; } - sub_frag_mem_limit(&fq->q, sum_truesize); + sub_frag_mem_limit(fq->q.net, sum_truesize); head->next = NULL; - head->dev = dev; + head->dev = ldev; head->tstamp = fq->q.stamp; fq->q.fragments = NULL; @@ -325,24 +314,87 @@ out_oom: return -1; } -static int lowpan_get_frag_info(struct sk_buff *skb, const u8 frag_type, - struct lowpan_frag_info *frag_info) +static int lowpan_frag_rx_handlers_result(struct sk_buff *skb, + lowpan_rx_result res) +{ + switch (res) { + case RX_QUEUED: + return NET_RX_SUCCESS; + case RX_CONTINUE: + /* nobody cared about this packet */ + net_warn_ratelimited("%s: received unknown dispatch\n", + __func__); + 
+ /* fall-through */ + default: + /* all others failure */ + return NET_RX_DROP; + } +} + +static lowpan_rx_result lowpan_frag_rx_h_iphc(struct sk_buff *skb) +{ + int ret; + + if (!lowpan_is_iphc(*skb_network_header(skb))) + return RX_CONTINUE; + + ret = lowpan_iphc_decompress(skb); + if (ret < 0) + return RX_DROP; + + return RX_QUEUED; +} + +static int lowpan_invoke_frag_rx_handlers(struct sk_buff *skb) +{ + lowpan_rx_result res; + +#define CALL_RXH(rxh) \ + do { \ + res = rxh(skb); \ + if (res != RX_CONTINUE) \ + goto rxh_next; \ + } while (0) + + /* likely at first */ + CALL_RXH(lowpan_frag_rx_h_iphc); + CALL_RXH(lowpan_rx_h_ipv6); + +rxh_next: + return lowpan_frag_rx_handlers_result(skb, res); +#undef CALL_RXH +} + +#define LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK 0x07 +#define LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT 8 + +static int lowpan_get_cb(struct sk_buff *skb, u8 frag_type, + struct lowpan_802154_cb *cb) { bool fail; - u8 pattern = 0, low = 0; + u8 high = 0, low = 0; __be16 d_tag = 0; - fail = lowpan_fetch_skb(skb, &pattern, 1); + fail = lowpan_fetch_skb(skb, &high, 1); fail |= lowpan_fetch_skb(skb, &low, 1); - frag_info->d_size = (pattern & 7) << 8 | low; + /* remove the dispatch value and use first three bits as high value + * for the datagram size + */ + cb->d_size = (high & LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK) << + LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT | low; fail |= lowpan_fetch_skb(skb, &d_tag, 2); - frag_info->d_tag = ntohs(d_tag); + cb->d_tag = ntohs(d_tag); if (frag_type == LOWPAN_DISPATCH_FRAGN) { - fail |= lowpan_fetch_skb(skb, &frag_info->d_offset, 1); + fail |= lowpan_fetch_skb(skb, &cb->d_offset, 1); } else { skb_reset_network_header(skb); - frag_info->d_offset = 0; + cb->d_offset = 0; + /* check if datagram_size has ipv6hdr on FRAG1 */ + fail |= cb->d_size < sizeof(struct ipv6hdr); + /* check if we can dereference the dispatch value */ + fail |= !skb->len; } if (unlikely(fail)) @@ -351,27 +403,33 @@ static int lowpan_get_frag_info(struct sk_buff *skb, const u8 
frag_type, return 0; } -int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) +int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) { struct lowpan_frag_queue *fq; struct net *net = dev_net(skb->dev); - struct lowpan_frag_info *frag_info = lowpan_cb(skb); - struct ieee802154_addr source, dest; + struct lowpan_802154_cb *cb = lowpan_802154_cb(skb); + struct ieee802154_hdr hdr; int err; - source = mac_cb(skb)->source; - dest = mac_cb(skb)->dest; + if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) + goto err; - err = lowpan_get_frag_info(skb, frag_type, frag_info); + err = lowpan_get_cb(skb, frag_type, cb); if (err < 0) goto err; - if (frag_info->d_size > IPV6_MIN_MTU) { + if (frag_type == LOWPAN_DISPATCH_FRAG1) { + err = lowpan_invoke_frag_rx_handlers(skb); + if (err == NET_RX_DROP) + goto err; + } + + if (cb->d_size > IPV6_MIN_MTU) { net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n"); goto err; } - fq = fq_find(net, frag_info, &source, &dest); + fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { int ret; @@ -387,7 +445,6 @@ err: kfree_skb(skb); return -1; } -EXPORT_SYMBOL(lowpan_frag_rcv); #ifdef CONFIG_SYSCTL static int zero; @@ -523,14 +580,19 @@ static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + int res; ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - inet_frags_init_net(&ieee802154_lowpan->frags); - - return lowpan_frags_ns_sysctl_register(net); + res = inet_frags_init_net(&ieee802154_lowpan->frags); + if (res) + return res; + res = lowpan_frags_ns_sysctl_register(net); + if (res) + inet_frags_uninit_net(&ieee802154_lowpan->frags); + return res; } static void __net_exit lowpan_frags_exit_net(struct net *net) diff --git a/kernel/net/ieee802154/6lowpan/rx.c b/kernel/net/ieee802154/6lowpan/rx.c index 
4be1d289a..ef185dd41 100644 --- a/kernel/net/ieee802154/6lowpan/rx.c +++ b/kernel/net/ieee802154/6lowpan/rx.c @@ -11,147 +11,307 @@ #include #include +#include #include #include "6lowpan_i.h" -static int lowpan_give_skb_to_devices(struct sk_buff *skb, - struct net_device *dev) -{ - struct lowpan_dev_record *entry; - struct sk_buff *skb_cp; - int stat = NET_RX_SUCCESS; +#define LOWPAN_DISPATCH_FIRST 0xc0 +#define LOWPAN_DISPATCH_FRAG_MASK 0xf8 + +#define LOWPAN_DISPATCH_NALP 0x00 +#define LOWPAN_DISPATCH_ESC 0x40 +#define LOWPAN_DISPATCH_HC1 0x42 +#define LOWPAN_DISPATCH_DFF 0x43 +#define LOWPAN_DISPATCH_BC0 0x50 +#define LOWPAN_DISPATCH_MESH 0x80 +static int lowpan_give_skb_to_device(struct sk_buff *skb) +{ skb->protocol = htons(ETH_P_IPV6); - skb->pkt_type = PACKET_HOST; + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; - rcu_read_lock(); - list_for_each_entry_rcu(entry, &lowpan_devices, list) - if (lowpan_dev_info(entry->ldev)->real_dev == skb->dev) { - skb_cp = skb_copy(skb, GFP_ATOMIC); - if (!skb_cp) { - kfree_skb(skb); - rcu_read_unlock(); - return NET_RX_DROP; - } + return netif_rx(skb); +} - skb_cp->dev = entry->ldev; - stat = netif_rx(skb_cp); - if (stat == NET_RX_DROP) - break; - } - rcu_read_unlock(); +static int lowpan_rx_handlers_result(struct sk_buff *skb, lowpan_rx_result res) +{ + switch (res) { + case RX_CONTINUE: + /* nobody cared about this packet */ + net_warn_ratelimited("%s: received unknown dispatch\n", + __func__); + + /* fall-through */ + case RX_DROP_UNUSABLE: + kfree_skb(skb); - consume_skb(skb); + /* fall-through */ + case RX_DROP: + return NET_RX_DROP; + case RX_QUEUED: + return lowpan_give_skb_to_device(skb); + default: + break; + } - return stat; + return NET_RX_DROP; } -static int -iphc_decompress(struct sk_buff *skb, const struct ieee802154_hdr *hdr) +static inline bool lowpan_is_frag1(u8 dispatch) { - u8 iphc0, iphc1; - struct ieee802154_addr_sa sa, da; - void *sap, *dap; + return (dispatch & 
LOWPAN_DISPATCH_FRAG_MASK) == LOWPAN_DISPATCH_FRAG1; +} - raw_dump_table(__func__, "raw skb data dump", skb->data, skb->len); - /* at least two bytes will be used for the encoding */ - if (skb->len < 2) - return -EINVAL; +static inline bool lowpan_is_fragn(u8 dispatch) +{ + return (dispatch & LOWPAN_DISPATCH_FRAG_MASK) == LOWPAN_DISPATCH_FRAGN; +} - if (lowpan_fetch_skb_u8(skb, &iphc0)) - return -EINVAL; +static lowpan_rx_result lowpan_rx_h_frag(struct sk_buff *skb) +{ + int ret; - if (lowpan_fetch_skb_u8(skb, &iphc1)) - return -EINVAL; + if (!(lowpan_is_frag1(*skb_network_header(skb)) || + lowpan_is_fragn(*skb_network_header(skb)))) + return RX_CONTINUE; - ieee802154_addr_to_sa(&sa, &hdr->source); - ieee802154_addr_to_sa(&da, &hdr->dest); + ret = lowpan_frag_rcv(skb, *skb_network_header(skb) & + LOWPAN_DISPATCH_FRAG_MASK); + if (ret == 1) + return RX_QUEUED; - if (sa.addr_type == IEEE802154_ADDR_SHORT) - sap = &sa.short_addr; - else - sap = &sa.hwaddr; + /* Packet is freed by lowpan_frag_rcv on error or put into the frag + * bucket. 
+ */ + return RX_DROP; +} - if (da.addr_type == IEEE802154_ADDR_SHORT) - dap = &da.short_addr; - else - dap = &da.hwaddr; +int lowpan_iphc_decompress(struct sk_buff *skb) +{ + struct ieee802154_hdr hdr; - return lowpan_header_decompress(skb, skb->dev, sap, sa.addr_type, - IEEE802154_ADDR_LEN, dap, da.addr_type, - IEEE802154_ADDR_LEN, iphc0, iphc1); + if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) + return -EINVAL; + + return lowpan_header_decompress(skb, skb->dev, &hdr.dest, &hdr.source); } -static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) +static lowpan_rx_result lowpan_rx_h_iphc(struct sk_buff *skb) { - struct ieee802154_hdr hdr; int ret; - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - goto drop; + if (!lowpan_is_iphc(*skb_network_header(skb))) + return RX_CONTINUE; + + /* Setting datagram_offset to zero indicates non frag handling + * while doing lowpan_header_decompress. + */ + lowpan_802154_cb(skb)->d_size = 0; - if (!netif_running(dev)) - goto drop_skb; + ret = lowpan_iphc_decompress(skb); + if (ret < 0) + return RX_DROP_UNUSABLE; - if (skb->pkt_type == PACKET_OTHERHOST) - goto drop_skb; + return RX_QUEUED; +} - if (dev->type != ARPHRD_IEEE802154) - goto drop_skb; +lowpan_rx_result lowpan_rx_h_ipv6(struct sk_buff *skb) +{ + if (!lowpan_is_ipv6(*skb_network_header(skb))) + return RX_CONTINUE; - if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) - goto drop_skb; - - /* check that it's our buffer */ - if (skb->data[0] == LOWPAN_DISPATCH_IPV6) { - /* Pull off the 1-byte of 6lowpan header. 
*/ - skb_pull(skb, 1); - return lowpan_give_skb_to_devices(skb, NULL); - } else { - switch (skb->data[0] & 0xe0) { - case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ - ret = iphc_decompress(skb, &hdr); - if (ret < 0) - goto drop_skb; - - return lowpan_give_skb_to_devices(skb, NULL); - case LOWPAN_DISPATCH_FRAG1: /* first fragment header */ - ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAG1); - if (ret == 1) { - ret = iphc_decompress(skb, &hdr); - if (ret < 0) - goto drop_skb; - - return lowpan_give_skb_to_devices(skb, NULL); - } else if (ret == -1) { - return NET_RX_DROP; - } else { - return NET_RX_SUCCESS; - } - case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */ - ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAGN); - if (ret == 1) { - ret = iphc_decompress(skb, &hdr); - if (ret < 0) - goto drop_skb; - - return lowpan_give_skb_to_devices(skb, NULL); - } else if (ret == -1) { - return NET_RX_DROP; - } else { - return NET_RX_SUCCESS; - } - default: - break; - } + /* Pull off the 1-byte of 6lowpan header. 
*/ + skb_pull(skb, 1); + return RX_QUEUED; +} + +static inline bool lowpan_is_esc(u8 dispatch) +{ + return dispatch == LOWPAN_DISPATCH_ESC; +} + +static lowpan_rx_result lowpan_rx_h_esc(struct sk_buff *skb) +{ + if (!lowpan_is_esc(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN ESC not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static inline bool lowpan_is_hc1(u8 dispatch) +{ + return dispatch == LOWPAN_DISPATCH_HC1; +} + +static lowpan_rx_result lowpan_rx_h_hc1(struct sk_buff *skb) +{ + if (!lowpan_is_hc1(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN HC1 not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static inline bool lowpan_is_dff(u8 dispatch) +{ + return dispatch == LOWPAN_DISPATCH_DFF; +} + +static lowpan_rx_result lowpan_rx_h_dff(struct sk_buff *skb) +{ + if (!lowpan_is_dff(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN DFF not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static inline bool lowpan_is_bc0(u8 dispatch) +{ + return dispatch == LOWPAN_DISPATCH_BC0; +} + +static lowpan_rx_result lowpan_rx_h_bc0(struct sk_buff *skb) +{ + if (!lowpan_is_bc0(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN BC0 not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static inline bool lowpan_is_mesh(u8 dispatch) +{ + return (dispatch & LOWPAN_DISPATCH_FIRST) == LOWPAN_DISPATCH_MESH; +} + +static lowpan_rx_result lowpan_rx_h_mesh(struct sk_buff *skb) +{ + if (!lowpan_is_mesh(*skb_network_header(skb))) + return RX_CONTINUE; + + net_warn_ratelimited("%s: %s\n", skb->dev->name, + "6LoWPAN MESH not supported\n"); + + return RX_DROP_UNUSABLE; +} + +static int lowpan_invoke_rx_handlers(struct sk_buff *skb) +{ + lowpan_rx_result res; + +#define CALL_RXH(rxh) \ + do { \ + res = rxh(skb); \ + if (res != 
RX_CONTINUE) \ + goto rxh_next; \ + } while (0) + + /* likely at first */ + CALL_RXH(lowpan_rx_h_iphc); + CALL_RXH(lowpan_rx_h_frag); + CALL_RXH(lowpan_rx_h_ipv6); + CALL_RXH(lowpan_rx_h_esc); + CALL_RXH(lowpan_rx_h_hc1); + CALL_RXH(lowpan_rx_h_dff); + CALL_RXH(lowpan_rx_h_bc0); + CALL_RXH(lowpan_rx_h_mesh); + +rxh_next: + return lowpan_rx_handlers_result(skb, res); +#undef CALL_RXH +} + +static inline bool lowpan_is_nalp(u8 dispatch) +{ + return (dispatch & LOWPAN_DISPATCH_FIRST) == LOWPAN_DISPATCH_NALP; +} + +/* Lookup for reserved dispatch values at: + * https://www.iana.org/assignments/_6lowpan-parameters/_6lowpan-parameters.xhtml#_6lowpan-parameters-1 + * + * Last Updated: 2015-01-22 + */ +static inline bool lowpan_is_reserved(u8 dispatch) +{ + return ((dispatch >= 0x44 && dispatch <= 0x4F) || + (dispatch >= 0x51 && dispatch <= 0x5F) || + (dispatch >= 0xc8 && dispatch <= 0xdf) || + (dispatch >= 0xe8 && dispatch <= 0xff)); +} + +/* lowpan_rx_h_check checks on generic 6LoWPAN requirements + * in MAC and 6LoWPAN header. + * + * Don't manipulate the skb here, it could be shared buffer. 
+ */ +static inline bool lowpan_rx_h_check(struct sk_buff *skb) +{ + __le16 fc = ieee802154_get_fc_from_skb(skb); + + /* check on ieee802154 conform 6LoWPAN header */ + if (!ieee802154_is_data(fc) || + !ieee802154_is_intra_pan(fc)) + return false; + + /* check if we can dereference the dispatch */ + if (unlikely(!skb->len)) + return false; + + if (lowpan_is_nalp(*skb_network_header(skb)) || + lowpan_is_reserved(*skb_network_header(skb))) + return false; + + return true; +} + +static int lowpan_rcv(struct sk_buff *skb, struct net_device *wdev, + struct packet_type *pt, struct net_device *orig_wdev) +{ + struct net_device *ldev; + + if (wdev->type != ARPHRD_IEEE802154 || + skb->pkt_type == PACKET_OTHERHOST || + !lowpan_rx_h_check(skb)) + goto drop; + + ldev = wdev->ieee802154_ptr->lowpan_dev; + if (!ldev || !netif_running(ldev)) + goto drop; + + /* Replacing skb->dev and followed rx handlers will manipulate skb. */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto out; + skb->dev = ldev; + + /* When receive frag1 it's likely that we manipulate the buffer. + * When recevie iphc we manipulate the data buffer. So we need + * to unshare the buffer. 
+ */ + if (lowpan_is_frag1(*skb_network_header(skb)) || + lowpan_is_iphc(*skb_network_header(skb))) { + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; } -drop_skb: - kfree_skb(skb); + return lowpan_invoke_rx_handlers(skb); + drop: + kfree_skb(skb); +out: return NET_RX_DROP; } diff --git a/kernel/net/ieee802154/6lowpan/tx.c b/kernel/net/ieee802154/6lowpan/tx.c index 2349070bd..d4353face 100644 --- a/kernel/net/ieee802154/6lowpan/tx.c +++ b/kernel/net/ieee802154/6lowpan/tx.c @@ -10,9 +10,13 @@ #include #include +#include #include "6lowpan_i.h" +#define LOWPAN_FRAG1_HEAD_SIZE 0x4 +#define LOWPAN_FRAGN_HEAD_SIZE 0x5 + /* don't save pan id, it's intra pan */ struct lowpan_addr { u8 mode; @@ -36,7 +40,14 @@ lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb) sizeof(struct lowpan_addr_info)); } -int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, +/* This callback will be called from AF_PACKET and IPv6 stack, the AF_PACKET + * sockets gives an 8 byte array for addresses only! + * + * TODO I think AF_PACKET DGRAM (sending/receiving) RAW (sending) makes no + * sense here. We should disable it, the right use-case would be AF_INET6 + * RAW/DGRAM sockets. 
+ */ +int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { @@ -51,7 +62,7 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, return 0; if (!saddr) - saddr = dev->dev_addr; + saddr = ldev->dev_addr; raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8); raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8); @@ -71,28 +82,33 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, static struct sk_buff* lowpan_alloc_frag(struct sk_buff *skb, int size, - const struct ieee802154_hdr *master_hdr) + const struct ieee802154_hdr *master_hdr, bool frag1) { - struct net_device *real_dev = lowpan_dev_info(skb->dev)->real_dev; + struct net_device *wdev = lowpan_dev_info(skb->dev)->wdev; struct sk_buff *frag; int rc; - frag = alloc_skb(real_dev->hard_header_len + - real_dev->needed_tailroom + size, + frag = alloc_skb(wdev->needed_headroom + wdev->needed_tailroom + size, GFP_ATOMIC); if (likely(frag)) { - frag->dev = real_dev; + frag->dev = wdev; frag->priority = skb->priority; - skb_reserve(frag, real_dev->hard_header_len); + skb_reserve(frag, wdev->needed_headroom); skb_reset_network_header(frag); *mac_cb(frag) = *mac_cb(skb); - rc = dev_hard_header(frag, real_dev, 0, &master_hdr->dest, - &master_hdr->source, size); - if (rc < 0) { - kfree_skb(frag); - return ERR_PTR(rc); + if (frag1) { + memcpy(skb_put(frag, skb->mac_len), + skb_mac_header(skb), skb->mac_len); + } else { + rc = wpan_dev_hard_header(frag, wdev, + &master_hdr->dest, + &master_hdr->source, size); + if (rc < 0) { + kfree_skb(frag); + return ERR_PTR(rc); + } } } else { frag = ERR_PTR(-ENOMEM); @@ -104,15 +120,15 @@ lowpan_alloc_frag(struct sk_buff *skb, int size, static int lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr, u8 *frag_hdr, int frag_hdrlen, - int offset, int len) + int offset, int len, bool frag1) { struct sk_buff *frag; 
raw_dump_inline(__func__, " fragment header", frag_hdr, frag_hdrlen); - frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr); + frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr, frag1); if (IS_ERR(frag)) - return -PTR_ERR(frag); + return PTR_ERR(frag); memcpy(skb_put(frag, frag_hdrlen), frag_hdr, frag_hdrlen); memcpy(skb_put(frag, len), skb_network_header(skb) + offset, len); @@ -123,19 +139,17 @@ lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr, } static int -lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev, - const struct ieee802154_hdr *wpan_hdr) +lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev, + const struct ieee802154_hdr *wpan_hdr, u16 dgram_size, + u16 dgram_offset) { - u16 dgram_size, dgram_offset; __be16 frag_tag; u8 frag_hdr[5]; int frag_cap, frag_len, payload_cap, rc; int skb_unprocessed, skb_offset; - dgram_size = lowpan_uncompress_size(skb, &dgram_offset) - - skb->mac_len; - frag_tag = htons(lowpan_dev_info(dev)->fragment_tag); - lowpan_dev_info(dev)->fragment_tag++; + frag_tag = htons(lowpan_dev_info(ldev)->fragment_tag); + lowpan_dev_info(ldev)->fragment_tag++; frag_hdr[0] = LOWPAN_DISPATCH_FRAG1 | ((dgram_size >> 8) & 0x07); frag_hdr[1] = dgram_size & 0xff; @@ -151,7 +165,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev, rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr, LOWPAN_FRAG1_HEAD_SIZE, 0, - frag_len + skb_network_header_len(skb)); + frag_len + skb_network_header_len(skb), + true); if (rc) { pr_debug("%s unable to send FRAG1 packet (tag: %d)", __func__, ntohs(frag_tag)); @@ -172,7 +187,7 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev, rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr, LOWPAN_FRAGN_HEAD_SIZE, skb_offset, - frag_len); + frag_len, false); if (rc) { pr_debug("%s unable to send a FRAGN packet. 
(tag: %d, offset: %d)\n", __func__, ntohs(frag_tag), skb_offset); @@ -180,6 +195,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev, } } while (skb_unprocessed > frag_cap); + ldev->stats.tx_packets++; + ldev->stats.tx_bytes += dgram_size; consume_skb(skb); return NET_XMIT_SUCCESS; @@ -188,8 +205,10 @@ err: return rc; } -static int lowpan_header(struct sk_buff *skb, struct net_device *dev) +static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, + u16 *dgram_size, u16 *dgram_offset) { + struct wpan_dev *wpan_dev = lowpan_dev_info(ldev)->wdev->ieee802154_ptr; struct ieee802154_addr sa, da; struct ieee802154_mac_cb *cb = mac_cb_init(skb); struct lowpan_addr_info info; @@ -201,13 +220,16 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) daddr = &info.daddr.u.extended_addr; saddr = &info.saddr.u.extended_addr; - lowpan_header_compress(skb, dev, ETH_P_IPV6, daddr, saddr, skb->len); + *dgram_size = skb->len; + lowpan_header_compress(skb, ldev, daddr, saddr); + /* dgram_offset = (saved bytes after compression) + lowpan header len */ + *dgram_offset = (*dgram_size - skb->len) + skb_network_header_len(skb); cb->type = IEEE802154_FC_TYPE_DATA; /* prepare wpan address data */ sa.mode = IEEE802154_ADDR_LONG; - sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + sa.pan_id = wpan_dev->pan_id; sa.extended_addr = ieee802154_devaddr_from_raw(saddr); /* intra-PAN communications */ @@ -216,27 +238,30 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) /* if the destination address is the broadcast address, use the * corresponding short address */ - if (lowpan_is_addr_broadcast((const u8 *)daddr)) { + if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) { da.mode = IEEE802154_ADDR_SHORT; da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); cb->ackreq = false; } else { da.mode = IEEE802154_ADDR_LONG; da.extended_addr = ieee802154_devaddr_from_raw(daddr); - cb->ackreq = true; + cb->ackreq = 
wpan_dev->ackreq; } - return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, - ETH_P_IPV6, (void *)&da, (void *)&sa, 0); + return wpan_dev_hard_header(skb, lowpan_dev_info(ldev)->wdev, &da, &sa, + 0); } -netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) +netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) { struct ieee802154_hdr wpan_hdr; int max_single, ret; + u16 dgram_size, dgram_offset; pr_debug("package xmit\n"); + WARN_ON_ONCE(skb->len > IPV6_MIN_MTU); + /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ @@ -244,7 +269,7 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) if (!skb) return NET_XMIT_DROP; - ret = lowpan_header(skb, dev); + ret = lowpan_header(skb, ldev, &dgram_size, &dgram_offset); if (ret < 0) { kfree_skb(skb); return NET_XMIT_DROP; @@ -258,13 +283,16 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) max_single = ieee802154_max_payload(&wpan_hdr); if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) { - skb->dev = lowpan_dev_info(dev)->real_dev; + skb->dev = lowpan_dev_info(ldev)->wdev; + ldev->stats.tx_packets++; + ldev->stats.tx_bytes += dgram_size; return dev_queue_xmit(skb); } else { netdev_tx_t rc; pr_debug("frame is too big, fragmentation is needed\n"); - rc = lowpan_xmit_fragmented(skb, dev, &wpan_hdr); + rc = lowpan_xmit_fragmented(skb, ldev, &wpan_hdr, dgram_size, + dgram_offset); return rc < 0 ? NET_XMIT_DROP : rc; } diff --git a/kernel/net/ieee802154/Kconfig b/kernel/net/ieee802154/Kconfig index 1370d5b00..188135bcb 100644 --- a/kernel/net/ieee802154/Kconfig +++ b/kernel/net/ieee802154/Kconfig @@ -12,6 +12,11 @@ menuconfig IEEE802154 if IEEE802154 +config IEEE802154_NL802154_EXPERIMENTAL + bool "IEEE 802.15.4 experimental netlink support" + ---help--- + Adds experimental netlink support for nl802154. 
+ config IEEE802154_SOCKET tristate "IEEE 802.15.4 socket interface" default y diff --git a/kernel/net/ieee802154/core.c b/kernel/net/ieee802154/core.c index 2ee00e8a0..c35fdfa6d 100644 --- a/kernel/net/ieee802154/core.c +++ b/kernel/net/ieee802154/core.c @@ -95,6 +95,18 @@ cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx) return result; } +struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx) +{ + struct cfg802154_registered_device *rdev; + + ASSERT_RTNL(); + + rdev = cfg802154_rdev_by_wpan_phy_idx(wpan_phy_idx); + if (!rdev) + return NULL; + return &rdev->wpan_phy; +} + struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) { @@ -121,8 +133,6 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) /* atomic_inc_return makes it start at 1, make it start at 0 */ rdev->wpan_phy_idx--; - mutex_init(&rdev->wpan_phy.pib_lock); - INIT_LIST_HEAD(&rdev->wpan_dev_list); device_initialize(&rdev->wpan_phy.dev); dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx); diff --git a/kernel/net/ieee802154/core.h b/kernel/net/ieee802154/core.h index f3e95580c..231fade95 100644 --- a/kernel/net/ieee802154/core.h +++ b/kernel/net/ieee802154/core.h @@ -42,5 +42,6 @@ extern int cfg802154_rdev_list_generation; void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx); +struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx); #endif /* __IEEE802154_CORE_H */ diff --git a/kernel/net/ieee802154/header_ops.c b/kernel/net/ieee802154/header_ops.c index a051b6993..c7439f0fb 100644 --- a/kernel/net/ieee802154/header_ops.c +++ b/kernel/net/ieee802154/header_ops.c @@ -83,35 +83,35 @@ ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr) } int -ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr) +ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr) { - u8 
buf[MAC802154_FRAME_HARD_HEADER_LEN]; + u8 buf[IEEE802154_MAX_HEADER_LEN]; int pos = 2; int rc; - struct ieee802154_hdr_fc fc = hdr->fc; + struct ieee802154_hdr_fc *fc = &hdr->fc; buf[pos++] = hdr->seq; - fc.dest_addr_mode = hdr->dest.mode; + fc->dest_addr_mode = hdr->dest.mode; rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false); if (rc < 0) return -EINVAL; pos += rc; - fc.source_addr_mode = hdr->source.mode; + fc->source_addr_mode = hdr->source.mode; if (hdr->source.pan_id == hdr->dest.pan_id && hdr->dest.mode != IEEE802154_ADDR_NONE) - fc.intra_pan = true; + fc->intra_pan = true; - rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc.intra_pan); + rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc->intra_pan); if (rc < 0) return -EINVAL; pos += rc; - if (fc.security_enabled) { - fc.version = 1; + if (fc->security_enabled) { + fc->version = 1; rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec); if (rc < 0) @@ -120,7 +120,7 @@ ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr) pos += rc; } - memcpy(buf, &fc, 2); + memcpy(buf, fc, 2); memcpy(skb_push(skb, pos), buf, pos); diff --git a/kernel/net/ieee802154/nl-mac.c b/kernel/net/ieee802154/nl-mac.c index 2b4955d7a..3503c3895 100644 --- a/kernel/net/ieee802154/nl-mac.c +++ b/kernel/net/ieee802154/nl-mac.c @@ -97,8 +97,10 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, BUG_ON(!phy); get_device(&phy->dev); - short_addr = ops->get_short_addr(dev); - pan_id = ops->get_pan_id(dev); + rtnl_lock(); + short_addr = dev->ieee802154_ptr->short_addr; + pan_id = dev->ieee802154_ptr->pan_id; + rtnl_unlock(); if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || @@ -117,12 +119,12 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, rtnl_unlock(); if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER, - params.transmit_power) || + params.transmit_power / 100) || 
nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) || nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE, params.cca.mode) || nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL, - params.cca_ed_level) || + params.cca_ed_level / 100) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES, params.csma_retries) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE, @@ -166,10 +168,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) if (!dev) return NULL; - /* Check on mtu is currently a hacked solution because lowpan - * and wpan have the same ARPHRD type. - */ - if (dev->type != ARPHRD_IEEE802154 || dev->mtu != IEEE802154_MTU) { + if (dev->type != ARPHRD_IEEE802154) { dev_put(dev); return NULL; } @@ -244,7 +243,9 @@ int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info) addr.mode = IEEE802154_ADDR_LONG; addr.extended_addr = nla_get_hwaddr( info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]); - addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + rtnl_lock(); + addr.pan_id = dev->ieee802154_ptr->pan_id; + rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr, nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), @@ -281,7 +282,9 @@ int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info) addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]); } - addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + rtnl_lock(); + addr.pan_id = dev->ieee802154_ptr->pan_id; + rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr, nla_get_u8(info->attrs[IEEE802154_ATTR_REASON])); @@ -449,11 +452,7 @@ int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; for_each_netdev(net, dev) { - /* Check on mtu is currently a hacked solution because lowpan - * and wpan have the same ARPHRD type. 
- */ - if (idx < s_idx || dev->type != ARPHRD_IEEE802154 || - dev->mtu != IEEE802154_MTU) + if (idx < s_idx || dev->type != ARPHRD_IEEE802154) goto cont; if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid, @@ -510,7 +509,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info) ops->get_mac_params(dev, ¶ms); if (info->attrs[IEEE802154_ATTR_TXPOWER]) - params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]); + params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]) * 100; if (info->attrs[IEEE802154_ATTR_LBT_ENABLED]) params.lbt = nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]); @@ -519,7 +518,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info) params.cca.mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]); if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) - params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]); + params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) * 100; if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES]) params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]); @@ -783,11 +782,7 @@ ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb, int rc; for_each_netdev(net, dev) { - /* Check on mtu is currently a hacked solution because lowpan - * and wpan have the same ARPHRD type. 
- */ - if (idx < first_dev || dev->type != ARPHRD_IEEE802154 || - dev->mtu != IEEE802154_MTU) + if (idx < first_dev || dev->type != ARPHRD_IEEE802154) goto skip; data.ops = ieee802154_mlme_ops(dev); diff --git a/kernel/net/ieee802154/nl-phy.c b/kernel/net/ieee802154/nl-phy.c index 346c6665d..77d73014b 100644 --- a/kernel/net/ieee802154/nl-phy.c +++ b/kernel/net/ieee802154/nl-phy.c @@ -50,26 +50,26 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, if (!hdr) goto out; - mutex_lock(&phy->pib_lock); + rtnl_lock(); if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) || nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel)) goto nla_put_failure; for (i = 0; i < 32; i++) { - if (phy->channels_supported[i]) - buf[pages++] = phy->channels_supported[i] | (i << 27); + if (phy->supported.channels[i]) + buf[pages++] = phy->supported.channels[i] | (i << 27); } if (pages && nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST, pages * sizeof(uint32_t), buf)) goto nla_put_failure; - mutex_unlock(&phy->pib_lock); + rtnl_unlock(); kfree(buf); genlmsg_end(msg, hdr); return 0; nla_put_failure: - mutex_unlock(&phy->pib_lock); + rtnl_unlock(); genlmsg_cancel(msg, hdr); out: kfree(buf); diff --git a/kernel/net/ieee802154/nl802154.c b/kernel/net/ieee802154/nl802154.c index f3c12f6a4..16ef0d9f5 100644 --- a/kernel/net/ieee802154/nl802154.c +++ b/kernel/net/ieee802154/nl802154.c @@ -207,10 +207,11 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_PAGE] = { .type = NLA_U8, }, [NL802154_ATTR_CHANNEL] = { .type = NLA_U8, }, - [NL802154_ATTR_TX_POWER] = { .type = NLA_S8, }, + [NL802154_ATTR_TX_POWER] = { .type = NLA_S32, }, [NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, }, [NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, }, + [NL802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, }, [NL802154_ATTR_SUPPORTED_CHANNEL] = { .type = NLA_U32, }, @@ -225,8 +226,92 @@ 
static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_MAX_FRAME_RETRIES] = { .type = NLA_S8, }, [NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, }, + + [NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED }, + + [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED }, + + [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 }, + +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, + [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, + [NL802154_ATTR_SEC_OUT_KEY_ID] = { .type = NLA_NESTED, }, + [NL802154_ATTR_SEC_FRAME_COUNTER] = { .type = NLA_U32 }, + + [NL802154_ATTR_SEC_LEVEL] = { .type = NLA_NESTED }, + [NL802154_ATTR_SEC_DEVICE] = { .type = NLA_NESTED }, + [NL802154_ATTR_SEC_DEVKEY] = { .type = NLA_NESTED }, + [NL802154_ATTR_SEC_KEY] = { .type = NLA_NESTED }, +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +static int +nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, + struct netlink_callback *cb, + struct cfg802154_registered_device **rdev, + struct wpan_dev **wpan_dev) +{ + int err; + + rtnl_lock(); + + if (!cb->args[0]) { + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, + nl802154_fam.attrbuf, nl802154_fam.maxattr, + nl802154_policy); + if (err) + goto out_unlock; + + *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), + nl802154_fam.attrbuf); + if (IS_ERR(*wpan_dev)) { + err = PTR_ERR(*wpan_dev); + goto out_unlock; + } + *rdev = wpan_phy_to_rdev((*wpan_dev)->wpan_phy); + /* 0 is the first index - add 1 to parse only once */ + cb->args[0] = (*rdev)->wpan_phy_idx + 1; + cb->args[1] = (*wpan_dev)->identifier; + } else { + /* subtract the 1 again here */ + struct wpan_phy *wpan_phy = wpan_phy_idx_to_wpan_phy(cb->args[0] - 1); + struct wpan_dev *tmp; + + if (!wpan_phy) { + err = -ENODEV; + goto out_unlock; + } + *rdev = wpan_phy_to_rdev(wpan_phy); + *wpan_dev = NULL; + + list_for_each_entry(tmp, 
&(*rdev)->wpan_dev_list, list) { + if (tmp->identifier == cb->args[1]) { + *wpan_dev = tmp; + break; + } + } + + if (!*wpan_dev) { + err = -ENODEV; + goto out_unlock; + } + } + + return 0; + out_unlock: + rtnl_unlock(); + return err; +} + +static void +nl802154_finish_wpan_dev_dump(struct cfg802154_registered_device *rdev) +{ + rtnl_unlock(); +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + /* message building helper */ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) @@ -235,6 +320,28 @@ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, return genlmsg_put(skb, portid, seq, &nl802154_fam, flags, cmd); } +static int +nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask) +{ + struct nlattr *nl_flags = nla_nest_start(msg, attr); + int i; + + if (!nl_flags) + return -ENOBUFS; + + i = 0; + while (mask) { + if ((mask & 1) && nla_put_flag(msg, i)) + return -ENOBUFS; + + mask >>= 1; + i++; + } + + nla_nest_end(msg, nl_flags); + return 0; +} + static int nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, struct sk_buff *msg) @@ -248,7 +355,7 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, for (page = 0; page <= IEEE802154_MAX_PAGE; page++) { if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL, - rdev->wpan_phy.channels_supported[page])) + rdev->wpan_phy.supported.channels[page])) return -ENOBUFS; } nla_nest_end(msg, nl_page); @@ -256,12 +363,100 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, return 0; } +static int +nl802154_put_capabilities(struct sk_buff *msg, + struct cfg802154_registered_device *rdev) +{ + const struct wpan_phy_supported *caps = &rdev->wpan_phy.supported; + struct nlattr *nl_caps, *nl_channels; + int i; + + nl_caps = nla_nest_start(msg, NL802154_ATTR_WPAN_PHY_CAPS); + if (!nl_caps) + return -ENOBUFS; + + nl_channels = nla_nest_start(msg, NL802154_CAP_ATTR_CHANNELS); + if 
(!nl_channels) + return -ENOBUFS; + + for (i = 0; i <= IEEE802154_MAX_PAGE; i++) { + if (caps->channels[i]) { + if (nl802154_put_flags(msg, i, caps->channels[i])) + return -ENOBUFS; + } + } + + nla_nest_end(msg, nl_channels); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { + struct nlattr *nl_ed_lvls; + + nl_ed_lvls = nla_nest_start(msg, + NL802154_CAP_ATTR_CCA_ED_LEVELS); + if (!nl_ed_lvls) + return -ENOBUFS; + + for (i = 0; i < caps->cca_ed_levels_size; i++) { + if (nla_put_s32(msg, i, caps->cca_ed_levels[i])) + return -ENOBUFS; + } + + nla_nest_end(msg, nl_ed_lvls); + } + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { + struct nlattr *nl_tx_pwrs; + + nl_tx_pwrs = nla_nest_start(msg, NL802154_CAP_ATTR_TX_POWERS); + if (!nl_tx_pwrs) + return -ENOBUFS; + + for (i = 0; i < caps->tx_powers_size; i++) { + if (nla_put_s32(msg, i, caps->tx_powers[i])) + return -ENOBUFS; + } + + nla_nest_end(msg, nl_tx_pwrs); + } + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { + if (nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_MODES, + caps->cca_modes) || + nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_OPTS, + caps->cca_opts)) + return -ENOBUFS; + } + + if (nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MINBE, caps->min_minbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MINBE, caps->max_minbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MAXBE, caps->min_maxbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MAXBE, caps->max_maxbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS, + caps->min_csma_backoffs) || + nla_put_u8(msg, NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS, + caps->max_csma_backoffs) || + nla_put_s8(msg, NL802154_CAP_ATTR_MIN_FRAME_RETRIES, + caps->min_frame_retries) || + nla_put_s8(msg, NL802154_CAP_ATTR_MAX_FRAME_RETRIES, + caps->max_frame_retries) || + nl802154_put_flags(msg, NL802154_CAP_ATTR_IFTYPES, + caps->iftypes) || + nla_put_u32(msg, NL802154_CAP_ATTR_LBT, caps->lbt)) + return -ENOBUFS; + + nla_nest_end(msg, nl_caps); + + return 0; +} + static int 
nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, enum nl802154_commands cmd, struct sk_buff *msg, u32 portid, u32 seq, int flags) { + struct nlattr *nl_cmds; void *hdr; + int i; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) @@ -286,25 +481,77 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, rdev->wpan_phy.current_channel)) goto nla_put_failure; - /* supported channels array */ + /* TODO remove this behaviour, we still keep support it for a while + * so users can change the behaviour to the new one. + */ if (nl802154_send_wpan_phy_channels(rdev, msg)) goto nla_put_failure; /* cca mode */ - if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE, - rdev->wpan_phy.cca.mode)) - goto nla_put_failure; + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { + if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE, + rdev->wpan_phy.cca.mode)) + goto nla_put_failure; + + if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) { + if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT, + rdev->wpan_phy.cca.opt)) + goto nla_put_failure; + } + } - if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) { - if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT, - rdev->wpan_phy.cca.opt)) + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { + if (nla_put_s32(msg, NL802154_ATTR_TX_POWER, + rdev->wpan_phy.transmit_power)) goto nla_put_failure; } - if (nla_put_s8(msg, NL802154_ATTR_TX_POWER, - rdev->wpan_phy.transmit_power)) + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { + if (nla_put_s32(msg, NL802154_ATTR_CCA_ED_LEVEL, + rdev->wpan_phy.cca_ed_level)) + goto nla_put_failure; + } + + if (nl802154_put_capabilities(msg, rdev)) + goto nla_put_failure; + + nl_cmds = nla_nest_start(msg, NL802154_ATTR_SUPPORTED_COMMANDS); + if (!nl_cmds) goto nla_put_failure; + i = 0; +#define CMD(op, n) \ + do { \ + if (rdev->ops->op) { \ + i++; \ + if (nla_put_u32(msg, i, NL802154_CMD_ ## n)) \ + goto nla_put_failure; \ + } \ + } while (0) + + CMD(add_virtual_intf, 
NEW_INTERFACE); + CMD(del_virtual_intf, DEL_INTERFACE); + CMD(set_channel, SET_CHANNEL); + CMD(set_pan_id, SET_PAN_ID); + CMD(set_short_addr, SET_SHORT_ADDR); + CMD(set_backoff_exponent, SET_BACKOFF_EXPONENT); + CMD(set_max_csma_backoffs, SET_MAX_CSMA_BACKOFFS); + CMD(set_max_frame_retries, SET_MAX_FRAME_RETRIES); + CMD(set_lbt_mode, SET_LBT_MODE); + CMD(set_ackreq_default, SET_ACKREQ_DEFAULT); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) + CMD(set_tx_power, SET_TX_POWER); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) + CMD(set_cca_ed_level, SET_CCA_ED_LEVEL); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) + CMD(set_cca_mode, SET_CCA_MODE); + +#undef CMD + nla_nest_end(msg, nl_cmds); + finish: genlmsg_end(msg, hdr); return 0; @@ -443,6 +690,107 @@ static inline u64 wpan_dev_id(struct wpan_dev *wpan_dev) ((u64)wpan_phy_to_rdev(wpan_dev->wpan_phy)->wpan_phy_idx << 32); } +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +#include + +static int +ieee802154_llsec_send_key_id(struct sk_buff *msg, + const struct ieee802154_llsec_key_id *desc) +{ + struct nlattr *nl_dev_addr; + + if (nla_put_u32(msg, NL802154_KEY_ID_ATTR_MODE, desc->mode)) + return -ENOBUFS; + + switch (desc->mode) { + case NL802154_KEY_ID_MODE_IMPLICIT: + nl_dev_addr = nla_nest_start(msg, NL802154_KEY_ID_ATTR_IMPLICIT); + if (!nl_dev_addr) + return -ENOBUFS; + + if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_PAN_ID, + desc->device_addr.pan_id) || + nla_put_u32(msg, NL802154_DEV_ADDR_ATTR_MODE, + desc->device_addr.mode)) + return -ENOBUFS; + + switch (desc->device_addr.mode) { + case NL802154_DEV_ADDR_SHORT: + if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_SHORT, + desc->device_addr.short_addr)) + return -ENOBUFS; + break; + case NL802154_DEV_ADDR_EXTENDED: + if (nla_put_le64(msg, NL802154_DEV_ADDR_ATTR_EXTENDED, + desc->device_addr.extended_addr)) + return -ENOBUFS; + break; + default: + /* userspace should handle unknown */ + break; + } + + nla_nest_end(msg, nl_dev_addr); + 
break; + case NL802154_KEY_ID_MODE_INDEX: + break; + case NL802154_KEY_ID_MODE_INDEX_SHORT: + /* TODO renmae short_source? */ + if (nla_put_le32(msg, NL802154_KEY_ID_ATTR_SOURCE_SHORT, + desc->short_source)) + return -ENOBUFS; + break; + case NL802154_KEY_ID_MODE_INDEX_EXTENDED: + if (nla_put_le64(msg, NL802154_KEY_ID_ATTR_SOURCE_EXTENDED, + desc->extended_source)) + return -ENOBUFS; + break; + default: + /* userspace should handle unknown */ + break; + } + + /* TODO key_id to key_idx ? Check naming */ + if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { + if (nla_put_u8(msg, NL802154_KEY_ID_ATTR_INDEX, desc->id)) + return -ENOBUFS; + } + + return 0; +} + +static int nl802154_get_llsec_params(struct sk_buff *msg, + struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev) +{ + struct nlattr *nl_key_id; + struct ieee802154_llsec_params params; + int ret; + + ret = rdev_get_llsec_params(rdev, wpan_dev, ¶ms); + if (ret < 0) + return ret; + + if (nla_put_u8(msg, NL802154_ATTR_SEC_ENABLED, params.enabled) || + nla_put_u32(msg, NL802154_ATTR_SEC_OUT_LEVEL, params.out_level) || + nla_put_be32(msg, NL802154_ATTR_SEC_FRAME_COUNTER, + params.frame_counter)) + return -ENOBUFS; + + nl_key_id = nla_nest_start(msg, NL802154_ATTR_SEC_OUT_KEY_ID); + if (!nl_key_id) + return -ENOBUFS; + + ret = ieee802154_llsec_send_key_id(msg, ¶ms.out_key); + if (ret < 0) + return ret; + + nla_nest_end(msg, nl_key_id); + + return 0; +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + static int nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, @@ -490,6 +838,15 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt)) goto nla_put_failure; + /* ackreq default behaviour */ + if (nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, wpan_dev->ackreq)) + goto nla_put_failure; + +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + if 
(nl802154_get_llsec_params(msg, rdev, wpan_dev) < 0) + goto nla_put_failure; +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + genlmsg_end(msg, hdr); return 0; @@ -575,14 +932,13 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL802154_ATTR_IFTYPE]) { type = nla_get_u32(info->attrs[NL802154_ATTR_IFTYPE]); - if (type > NL802154_IFTYPE_MAX) + if (type > NL802154_IFTYPE_MAX || + !(rdev->wpan_phy.supported.iftypes & BIT(type))) return -EINVAL; } - /* TODO add nla_get_le64 to netlink */ if (info->attrs[NL802154_ATTR_EXTENDED_ADDR]) - extended_addr = (__force __le64)nla_get_u64( - info->attrs[NL802154_ATTR_EXTENDED_ADDR]); + extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]); if (!rdev->ops->add_virtual_intf) return -EOPNOTSUPP; @@ -625,7 +981,8 @@ static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info) channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]); /* check 802.15.4 constraints */ - if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL) + if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL || + !(rdev->wpan_phy.supported.channels[page] & BIT(channel))) return -EINVAL; return rdev_set_channel(rdev, page, channel); @@ -636,12 +993,17 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_phy_cca cca; + if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE)) + return -EOPNOTSUPP; + if (!info->attrs[NL802154_ATTR_CCA_MODE]) return -EINVAL; cca.mode = nla_get_u32(info->attrs[NL802154_ATTR_CCA_MODE]); /* checking 802.15.4 constraints */ - if (cca.mode < NL802154_CCA_ENERGY || cca.mode > NL802154_CCA_ATTR_MAX) + if (cca.mode < NL802154_CCA_ENERGY || + cca.mode > NL802154_CCA_ATTR_MAX || + !(rdev->wpan_phy.supported.cca_modes & BIT(cca.mode))) return -EINVAL; if (cca.mode == NL802154_CCA_ENERGY_CARRIER) { @@ -649,13 +1011,58 @@ static int 
nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) return -EINVAL; cca.opt = nla_get_u32(info->attrs[NL802154_ATTR_CCA_OPT]); - if (cca.opt > NL802154_CCA_OPT_ATTR_MAX) + if (cca.opt > NL802154_CCA_OPT_ATTR_MAX || + !(rdev->wpan_phy.supported.cca_opts & BIT(cca.opt))) return -EINVAL; } return rdev_set_cca_mode(rdev, &cca); } +static int nl802154_set_cca_ed_level(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + s32 ed_level; + int i; + + if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL)) + return -EOPNOTSUPP; + + if (!info->attrs[NL802154_ATTR_CCA_ED_LEVEL]) + return -EINVAL; + + ed_level = nla_get_s32(info->attrs[NL802154_ATTR_CCA_ED_LEVEL]); + + for (i = 0; i < rdev->wpan_phy.supported.cca_ed_levels_size; i++) { + if (ed_level == rdev->wpan_phy.supported.cca_ed_levels[i]) + return rdev_set_cca_ed_level(rdev, ed_level); + } + + return -EINVAL; +} + +static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + s32 power; + int i; + + if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER)) + return -EOPNOTSUPP; + + if (!info->attrs[NL802154_ATTR_TX_POWER]) + return -EINVAL; + + power = nla_get_s32(info->attrs[NL802154_ATTR_TX_POWER]); + + for (i = 0; i < rdev->wpan_phy.supported.tx_powers_size; i++) { + if (power == rdev->wpan_phy.supported.tx_powers[i]) + return rdev_set_tx_power(rdev, power); + } + + return -EINVAL; +} + static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; @@ -668,14 +1075,22 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) return -EBUSY; /* don't change address fields on monitor */ - if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) - return -EINVAL; - - if (!info->attrs[NL802154_ATTR_PAN_ID]) + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || + 
!info->attrs[NL802154_ATTR_PAN_ID]) return -EINVAL; pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]); + /* TODO + * I am not sure about to check here on broadcast pan_id. + * Broadcast is a valid setting, comment from 802.15.4: + * If this value is 0xffff, the device is not associated. + * + * This could useful to simple deassociate an device. + */ + if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) + return -EINVAL; + return rdev_set_pan_id(rdev, wpan_dev, pan_id); } @@ -691,14 +1106,27 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info) return -EBUSY; /* don't change address fields on monitor */ - if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) - return -EINVAL; - - if (!info->attrs[NL802154_ATTR_SHORT_ADDR]) + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || + !info->attrs[NL802154_ATTR_SHORT_ADDR]) return -EINVAL; short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]); + /* TODO + * I am not sure about to check here on broadcast short_addr. + * Broadcast is a valid setting, comment from 802.15.4: + * A value of 0xfffe indicates that the device has + * associated but has not been allocated an address. A + * value of 0xffff indicates that the device does not + * have a short address. + * + * I think we should allow to set these settings but + * don't allow to allow socket communication with it. 
+ */ + if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) || + short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST)) + return -EINVAL; + return rdev_set_short_addr(rdev, wpan_dev, short_addr); } @@ -722,7 +1150,11 @@ nl802154_set_backoff_exponent(struct sk_buff *skb, struct genl_info *info) max_be = nla_get_u8(info->attrs[NL802154_ATTR_MAX_BE]); /* check 802.15.4 constraints */ - if (max_be < 3 || max_be > 8 || min_be > max_be) + if (min_be < rdev->wpan_phy.supported.min_minbe || + min_be > rdev->wpan_phy.supported.max_minbe || + max_be < rdev->wpan_phy.supported.min_maxbe || + max_be > rdev->wpan_phy.supported.max_maxbe || + min_be > max_be) return -EINVAL; return rdev_set_backoff_exponent(rdev, wpan_dev, min_be, max_be); @@ -747,7 +1179,8 @@ nl802154_set_max_csma_backoffs(struct sk_buff *skb, struct genl_info *info) info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]); /* check 802.15.4 constraints */ - if (max_csma_backoffs > 5) + if (max_csma_backoffs < rdev->wpan_phy.supported.min_csma_backoffs || + max_csma_backoffs > rdev->wpan_phy.supported.max_csma_backoffs) return -EINVAL; return rdev_set_max_csma_backoffs(rdev, wpan_dev, max_csma_backoffs); @@ -771,7 +1204,8 @@ nl802154_set_max_frame_retries(struct sk_buff *skb, struct genl_info *info) info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]); /* check 802.15.4 constraints */ - if (max_frame_retries < -1 || max_frame_retries > 7) + if (max_frame_retries < rdev->wpan_phy.supported.min_frame_retries || + max_frame_retries > rdev->wpan_phy.supported.max_frame_retries) return -EINVAL; return rdev_set_max_frame_retries(rdev, wpan_dev, max_frame_retries); @@ -782,7 +1216,7 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; - bool mode; + int mode; if (netif_running(dev)) return -EBUSY; @@ -790,88 +1224,949 @@ static int 
nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL802154_ATTR_LBT_MODE]) return -EINVAL; - mode = !!nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); + mode = nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); + + if (mode != 0 && mode != 1) + return -EINVAL; + + if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt)) + return -EINVAL; + return rdev_set_lbt_mode(rdev, wpan_dev, mode); } -#define NL802154_FLAG_NEED_WPAN_PHY 0x01 -#define NL802154_FLAG_NEED_NETDEV 0x02 -#define NL802154_FLAG_NEED_RTNL 0x04 -#define NL802154_FLAG_CHECK_NETDEV_UP 0x08 -#define NL802154_FLAG_NEED_NETDEV_UP (NL802154_FLAG_NEED_NETDEV |\ - NL802154_FLAG_CHECK_NETDEV_UP) -#define NL802154_FLAG_NEED_WPAN_DEV 0x10 -#define NL802154_FLAG_NEED_WPAN_DEV_UP (NL802154_FLAG_NEED_WPAN_DEV |\ - NL802154_FLAG_CHECK_NETDEV_UP) - -static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info) +static int +nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) { - struct cfg802154_registered_device *rdev; - struct wpan_dev *wpan_dev; - struct net_device *dev; - bool rtnl = ops->internal_flags & NL802154_FLAG_NEED_RTNL; + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + int ackreq; - if (rtnl) - rtnl_lock(); + if (netif_running(dev)) + return -EBUSY; - if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_PHY) { - rdev = cfg802154_get_dev_from_info(genl_info_net(info), info); - if (IS_ERR(rdev)) { - if (rtnl) - rtnl_unlock(); - return PTR_ERR(rdev); - } - info->user_ptr[0] = rdev; - } else if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV || - ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { - ASSERT_RTNL(); - wpan_dev = __cfg802154_wpan_dev_from_attrs(genl_info_net(info), - info->attrs); - if (IS_ERR(wpan_dev)) { - if (rtnl) - rtnl_unlock(); - return PTR_ERR(wpan_dev); - } + if 
(!info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]) + return -EINVAL; - dev = wpan_dev->netdev; - rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy); + ackreq = nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]); - if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV) { - if (!dev) { - if (rtnl) - rtnl_unlock(); - return -EINVAL; - } + if (ackreq != 0 && ackreq != 1) + return -EINVAL; - info->user_ptr[1] = dev; - } else { - info->user_ptr[1] = wpan_dev; - } + return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); +} - if (dev) { - if (ops->internal_flags & NL802154_FLAG_CHECK_NETDEV_UP && - !netif_running(dev)) { - if (rtnl) - rtnl_unlock(); - return -ENETDOWN; - } +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { + [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, + [NL802154_DEV_ADDR_ATTR_MODE] = { .type = NLA_U32 }, + [NL802154_DEV_ADDR_ATTR_SHORT] = { .type = NLA_U16 }, + [NL802154_DEV_ADDR_ATTR_EXTENDED] = { .type = NLA_U64 }, +}; - dev_hold(dev); - } +static int +ieee802154_llsec_parse_dev_addr(struct nlattr *nla, + struct ieee802154_addr *addr) +{ + struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1]; - info->user_ptr[0] = rdev; + if (!nla || nla_parse_nested(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, + nl802154_dev_addr_policy)) + return -EINVAL; + + if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] && + !attrs[NL802154_DEV_ADDR_ATTR_MODE] && + !(attrs[NL802154_DEV_ADDR_ATTR_SHORT] || + attrs[NL802154_DEV_ADDR_ATTR_EXTENDED])) + return -EINVAL; + + addr->pan_id = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_PAN_ID]); + addr->mode = nla_get_u32(attrs[NL802154_DEV_ADDR_ATTR_MODE]); + switch (addr->mode) { + case NL802154_DEV_ADDR_SHORT: + addr->short_addr = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_SHORT]); + break; + case NL802154_DEV_ADDR_EXTENDED: + addr->extended_addr = nla_get_le64(attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]); + break; + default: + return -EINVAL; } return 0; } -static 
void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info) +static const struct nla_policy nl802154_key_id_policy[NL802154_KEY_ID_ATTR_MAX + 1] = { + [NL802154_KEY_ID_ATTR_MODE] = { .type = NLA_U32 }, + [NL802154_KEY_ID_ATTR_INDEX] = { .type = NLA_U8 }, + [NL802154_KEY_ID_ATTR_IMPLICIT] = { .type = NLA_NESTED }, + [NL802154_KEY_ID_ATTR_SOURCE_SHORT] = { .type = NLA_U32 }, + [NL802154_KEY_ID_ATTR_SOURCE_EXTENDED] = { .type = NLA_U64 }, +}; + +static int +ieee802154_llsec_parse_key_id(struct nlattr *nla, + struct ieee802154_llsec_key_id *desc) { - if (info->user_ptr[1]) { - if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { - struct wpan_dev *wpan_dev = info->user_ptr[1]; + struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1]; + + if (!nla || nla_parse_nested(attrs, NL802154_KEY_ID_ATTR_MAX, nla, + nl802154_key_id_policy)) + return -EINVAL; + + if (!attrs[NL802154_KEY_ID_ATTR_MODE]) + return -EINVAL; + + desc->mode = nla_get_u32(attrs[NL802154_KEY_ID_ATTR_MODE]); + switch (desc->mode) { + case NL802154_KEY_ID_MODE_IMPLICIT: + if (!attrs[NL802154_KEY_ID_ATTR_IMPLICIT]) + return -EINVAL; + + if (ieee802154_llsec_parse_dev_addr(attrs[NL802154_KEY_ID_ATTR_IMPLICIT], + &desc->device_addr) < 0) + return -EINVAL; + break; + case NL802154_KEY_ID_MODE_INDEX: + break; + case NL802154_KEY_ID_MODE_INDEX_SHORT: + if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]) + return -EINVAL; + + desc->short_source = nla_get_le32(attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]); + break; + case NL802154_KEY_ID_MODE_INDEX_EXTENDED: + if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]) + return -EINVAL; + + desc->extended_source = nla_get_le64(attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]); + break; + default: + return -EINVAL; + } + + if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { + if (!attrs[NL802154_KEY_ID_ATTR_INDEX]) + return -EINVAL; + + /* TODO change id to idx */ + desc->id = nla_get_u8(attrs[NL802154_KEY_ID_ATTR_INDEX]); + } + + return 0; +} + +static 
int nl802154_set_llsec_params(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_params params; + u32 changed = 0; + int ret; + + if (info->attrs[NL802154_ATTR_SEC_ENABLED]) { + u8 enabled; + + enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); + if (enabled != 0 && enabled != 1) + return -EINVAL; + + params.enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); + changed |= IEEE802154_LLSEC_PARAM_ENABLED; + } + + if (info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID]) { + ret = ieee802154_llsec_parse_key_id(info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID], + ¶ms.out_key); + if (ret < 0) + return ret; + + changed |= IEEE802154_LLSEC_PARAM_OUT_KEY; + } + + if (info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]) { + params.out_level = nla_get_u32(info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]); + if (params.out_level > NL802154_SECLEVEL_MAX) + return -EINVAL; + + changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL; + } + + if (info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]) { + params.frame_counter = nla_get_be32(info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]); + changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER; + } + + return rdev_set_llsec_params(rdev, wpan_dev, ¶ms, changed); +} + +static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, + const struct ieee802154_llsec_key_entry *key) +{ + void *hdr; + u32 commands[NL802154_CMD_FRAME_NR_IDS / 32]; + struct nlattr *nl_key, *nl_key_id; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_key = nla_nest_start(msg, NL802154_ATTR_SEC_KEY); + if (!nl_key) + goto nla_put_failure; + + nl_key_id = nla_nest_start(msg, NL802154_KEY_ATTR_ID); + 
if (!nl_key_id) + goto nla_put_failure; + + if (ieee802154_llsec_send_key_id(msg, &key->id) < 0) + goto nla_put_failure; + + nla_nest_end(msg, nl_key_id); + + if (nla_put_u8(msg, NL802154_KEY_ATTR_USAGE_FRAMES, + key->key->frame_types)) + goto nla_put_failure; + + if (key->key->frame_types & BIT(NL802154_FRAME_CMD)) { + /* TODO for each nested */ + memset(commands, 0, sizeof(commands)); + commands[7] = key->key->cmd_frame_ids; + if (nla_put(msg, NL802154_KEY_ATTR_USAGE_CMDS, + sizeof(commands), commands)) + goto nla_put_failure; + } + + if (nla_put(msg, NL802154_KEY_ATTR_BYTES, NL802154_KEY_SIZE, + key->key->key)) + goto nla_put_failure; + + nla_nest_end(msg, nl_key); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_key(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_key_entry *key; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + list_for_each_entry(key, &table->keys, list) { + if (nl802154_send_key(skb, NL802154_CMD_NEW_SEC_KEY, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wpan_dev->netdev, key) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_key_policy[NL802154_KEY_ATTR_MAX + 1] = { + [NL802154_KEY_ATTR_ID] = { NLA_NESTED }, + /* TODO handle it as 
for_each_nested and NLA_FLAG? */ + [NL802154_KEY_ATTR_USAGE_FRAMES] = { NLA_U8 }, + /* TODO handle it as for_each_nested, not static array? */ + [NL802154_KEY_ATTR_USAGE_CMDS] = { .len = NL802154_CMD_FRAME_NR_IDS / 8 }, + [NL802154_KEY_ATTR_BYTES] = { .len = NL802154_KEY_SIZE }, +}; + +static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; + struct ieee802154_llsec_key key = { }; + struct ieee802154_llsec_key_id id = { }; + u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { }; + + if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_KEY], + nl802154_key_policy)) + return -EINVAL; + + if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] || + !attrs[NL802154_KEY_ATTR_BYTES]) + return -EINVAL; + + if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) + return -ENOBUFS; + + key.frame_types = nla_get_u8(attrs[NL802154_KEY_ATTR_USAGE_FRAMES]); + if (key.frame_types > BIT(NL802154_FRAME_MAX) || + ((key.frame_types & BIT(NL802154_FRAME_CMD)) && + !attrs[NL802154_KEY_ATTR_USAGE_CMDS])) + return -EINVAL; + + if (attrs[NL802154_KEY_ATTR_USAGE_CMDS]) { + /* TODO for each nested */ + nla_memcpy(commands, attrs[NL802154_KEY_ATTR_USAGE_CMDS], + NL802154_CMD_FRAME_NR_IDS / 8); + + /* TODO understand the -EINVAL logic here? 
last condition */ + if (commands[0] || commands[1] || commands[2] || commands[3] || + commands[4] || commands[5] || commands[6] || + commands[7] > BIT(NL802154_CMD_FRAME_MAX)) + return -EINVAL; + + key.cmd_frame_ids = commands[7]; + } else { + key.cmd_frame_ids = 0; + } + + nla_memcpy(key.key, attrs[NL802154_KEY_ATTR_BYTES], NL802154_KEY_SIZE); + + if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) + return -ENOBUFS; + + return rdev_add_llsec_key(rdev, wpan_dev, &id, &key); +} + +static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; + struct ieee802154_llsec_key_id id; + + if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_KEY], + nl802154_key_policy)) + return -EINVAL; + + if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) + return -ENOBUFS; + + return rdev_del_llsec_key(rdev, wpan_dev, &id); +} + +static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, + const struct ieee802154_llsec_device *dev_desc) +{ + void *hdr; + struct nlattr *nl_device; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_device = nla_nest_start(msg, NL802154_ATTR_SEC_DEVICE); + if (!nl_device) + goto nla_put_failure; + + if (nla_put_u32(msg, NL802154_DEV_ATTR_FRAME_COUNTER, + dev_desc->frame_counter) || + nla_put_le16(msg, NL802154_DEV_ATTR_PAN_ID, dev_desc->pan_id) || + nla_put_le16(msg, NL802154_DEV_ATTR_SHORT_ADDR, + dev_desc->short_addr) || + nla_put_le64(msg, NL802154_DEV_ATTR_EXTENDED_ADDR, + dev_desc->hwaddr) || + nla_put_u8(msg, 
NL802154_DEV_ATTR_SECLEVEL_EXEMPT, + dev_desc->seclevel_exempt) || + nla_put_u32(msg, NL802154_DEV_ATTR_KEY_MODE, dev_desc->key_mode)) + goto nla_put_failure; + + nla_nest_end(msg, nl_device); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_dev(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_device *dev; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + list_for_each_entry(dev, &table->devices, list) { + if (nl802154_send_device(skb, NL802154_CMD_NEW_SEC_LEVEL, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wpan_dev->netdev, dev) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_dev_policy[NL802154_DEV_ATTR_MAX + 1] = { + [NL802154_DEV_ATTR_FRAME_COUNTER] = { NLA_U32 }, + [NL802154_DEV_ATTR_PAN_ID] = { .type = NLA_U16 }, + [NL802154_DEV_ATTR_SHORT_ADDR] = { .type = NLA_U16 }, + [NL802154_DEV_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 }, + [NL802154_DEV_ATTR_SECLEVEL_EXEMPT] = { NLA_U8 }, + [NL802154_DEV_ATTR_KEY_MODE] = { NLA_U32 }, +}; + +static int +ieee802154_llsec_parse_device(struct nlattr *nla, + struct ieee802154_llsec_device *dev) +{ + struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; + + if (!nla || nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, nla, + 
nl802154_dev_policy)) + return -EINVAL; + + memset(dev, 0, sizeof(*dev)); + + if (!attrs[NL802154_DEV_ATTR_FRAME_COUNTER] || + !attrs[NL802154_DEV_ATTR_PAN_ID] || + !attrs[NL802154_DEV_ATTR_SHORT_ADDR] || + !attrs[NL802154_DEV_ATTR_EXTENDED_ADDR] || + !attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT] || + !attrs[NL802154_DEV_ATTR_KEY_MODE]) + return -EINVAL; + + /* TODO be32 */ + dev->frame_counter = nla_get_u32(attrs[NL802154_DEV_ATTR_FRAME_COUNTER]); + dev->pan_id = nla_get_le16(attrs[NL802154_DEV_ATTR_PAN_ID]); + dev->short_addr = nla_get_le16(attrs[NL802154_DEV_ATTR_SHORT_ADDR]); + /* TODO rename hwaddr to extended_addr */ + dev->hwaddr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); + dev->seclevel_exempt = nla_get_u8(attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT]); + dev->key_mode = nla_get_u32(attrs[NL802154_DEV_ATTR_KEY_MODE]); + + if (dev->key_mode > NL802154_DEVKEY_MAX || + (dev->seclevel_exempt != 0 && dev->seclevel_exempt != 1)) + return -EINVAL; + + return 0; +} + +static int nl802154_add_llsec_dev(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_device dev_desc; + + if (ieee802154_llsec_parse_device(info->attrs[NL802154_ATTR_SEC_DEVICE], + &dev_desc) < 0) + return -EINVAL; + + return rdev_add_device(rdev, wpan_dev, &dev_desc); +} + +static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; + __le64 extended_addr; + + if (nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_DEVICE], + nl802154_dev_policy)) + return -EINVAL; + + if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + extended_addr = 
nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); + return rdev_del_device(rdev, wpan_dev, extended_addr); +} + +static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, __le64 extended_addr, + const struct ieee802154_llsec_device_key *devkey) +{ + void *hdr; + struct nlattr *nl_devkey, *nl_key_id; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_devkey = nla_nest_start(msg, NL802154_ATTR_SEC_DEVKEY); + if (!nl_devkey) + goto nla_put_failure; + + if (nla_put_le64(msg, NL802154_DEVKEY_ATTR_EXTENDED_ADDR, + extended_addr) || + nla_put_u32(msg, NL802154_DEVKEY_ATTR_FRAME_COUNTER, + devkey->frame_counter)) + goto nla_put_failure; + + nl_key_id = nla_nest_start(msg, NL802154_DEVKEY_ATTR_ID); + if (!nl_key_id) + goto nla_put_failure; + + if (ieee802154_llsec_send_key_id(msg, &devkey->key_id) < 0) + goto nla_put_failure; + + nla_nest_end(msg, nl_key_id); + nla_nest_end(msg, nl_devkey); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_devkey(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_device_key *kpos; + struct ieee802154_llsec_device *dpos; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + /* TODO look if remove devkey and do some nested attribute */ + list_for_each_entry(dpos, &table->devices, list) { + 
list_for_each_entry(kpos, &dpos->keys, list) { + if (nl802154_send_devkey(skb, + NL802154_CMD_NEW_SEC_LEVEL, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, rdev, + wpan_dev->netdev, + dpos->hwaddr, + kpos) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_devkey_policy[NL802154_DEVKEY_ATTR_MAX + 1] = { + [NL802154_DEVKEY_ATTR_FRAME_COUNTER] = { NLA_U32 }, + [NL802154_DEVKEY_ATTR_EXTENDED_ADDR] = { NLA_U64 }, + [NL802154_DEVKEY_ATTR_ID] = { NLA_NESTED }, +}; + +static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; + struct ieee802154_llsec_device_key key; + __le64 extended_addr; + + if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || + nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_DEVKEY], + nl802154_devkey_policy) < 0) + return -EINVAL; + + if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] || + !attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + /* TODO change key.id ? 
*/ + if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], + &key.key_id) < 0) + return -ENOBUFS; + + /* TODO be32 */ + key.frame_counter = nla_get_u32(attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER]); + /* TODO change naming hwaddr -> extended_addr + * check unique identifier short+pan OR extended_addr + */ + extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); + return rdev_add_devkey(rdev, wpan_dev, extended_addr, &key); +} + +static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; + struct ieee802154_llsec_device_key key; + __le64 extended_addr; + + if (nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_DEVKEY], + nl802154_devkey_policy)) + return -EINVAL; + + if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + /* TODO change key.id ? 
*/ + if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], + &key.key_id) < 0) + return -ENOBUFS; + + /* TODO change naming hwaddr -> extended_addr + * check unique identifier short+pan OR extended_addr + */ + extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); + return rdev_del_devkey(rdev, wpan_dev, extended_addr, &key); +} + +static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, + const struct ieee802154_llsec_seclevel *sl) +{ + void *hdr; + struct nlattr *nl_seclevel; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_seclevel = nla_nest_start(msg, NL802154_ATTR_SEC_LEVEL); + if (!nl_seclevel) + goto nla_put_failure; + + if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_FRAME, sl->frame_type) || + nla_put_u32(msg, NL802154_SECLEVEL_ATTR_LEVELS, sl->sec_levels) || + nla_put_u8(msg, NL802154_SECLEVEL_ATTR_DEV_OVERRIDE, + sl->device_override)) + goto nla_put_failure; + + if (sl->frame_type == NL802154_FRAME_CMD) { + if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_CMD_FRAME, + sl->cmd_frame_id)) + goto nla_put_failure; + } + + nla_nest_end(msg, nl_seclevel); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_seclevel(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_seclevel *sl; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station 
dump */ + if (cb->args[2]) + goto out; + + list_for_each_entry(sl, &table->security_levels, list) { + if (nl802154_send_seclevel(skb, NL802154_CMD_NEW_SEC_LEVEL, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wpan_dev->netdev, sl) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_seclevel_policy[NL802154_SECLEVEL_ATTR_MAX + 1] = { + [NL802154_SECLEVEL_ATTR_LEVELS] = { .type = NLA_U8 }, + [NL802154_SECLEVEL_ATTR_FRAME] = { .type = NLA_U32 }, + [NL802154_SECLEVEL_ATTR_CMD_FRAME] = { .type = NLA_U32 }, + [NL802154_SECLEVEL_ATTR_DEV_OVERRIDE] = { .type = NLA_U8 }, +}; + +static int +llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl) +{ + struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1]; + + if (!nla || nla_parse_nested(attrs, NL802154_SECLEVEL_ATTR_MAX, nla, + nl802154_seclevel_policy)) + return -EINVAL; + + memset(sl, 0, sizeof(*sl)); + + if (!attrs[NL802154_SECLEVEL_ATTR_LEVELS] || + !attrs[NL802154_SECLEVEL_ATTR_FRAME] || + !attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]) + return -EINVAL; + + sl->sec_levels = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_LEVELS]); + sl->frame_type = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_FRAME]); + sl->device_override = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]); + if (sl->frame_type > NL802154_FRAME_MAX || + (sl->device_override != 0 && sl->device_override != 1)) + return -EINVAL; + + if (sl->frame_type == NL802154_FRAME_CMD) { + if (!attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]) + return -EINVAL; + + sl->cmd_frame_id = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]); + if (sl->cmd_frame_id > NL802154_CMD_FRAME_MAX) + return -EINVAL; + } + + return 0; +} + +static int nl802154_add_llsec_seclevel(struct sk_buff *skb, + 
struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_seclevel sl; + + if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], + &sl) < 0) + return -EINVAL; + + return rdev_add_seclevel(rdev, wpan_dev, &sl); +} + +static int nl802154_del_llsec_seclevel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_seclevel sl; + + if (!info->attrs[NL802154_ATTR_SEC_LEVEL] || + llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], + &sl) < 0) + return -EINVAL; + + return rdev_del_seclevel(rdev, wpan_dev, &sl); +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + +#define NL802154_FLAG_NEED_WPAN_PHY 0x01 +#define NL802154_FLAG_NEED_NETDEV 0x02 +#define NL802154_FLAG_NEED_RTNL 0x04 +#define NL802154_FLAG_CHECK_NETDEV_UP 0x08 +#define NL802154_FLAG_NEED_NETDEV_UP (NL802154_FLAG_NEED_NETDEV |\ + NL802154_FLAG_CHECK_NETDEV_UP) +#define NL802154_FLAG_NEED_WPAN_DEV 0x10 +#define NL802154_FLAG_NEED_WPAN_DEV_UP (NL802154_FLAG_NEED_WPAN_DEV |\ + NL802154_FLAG_CHECK_NETDEV_UP) + +static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg802154_registered_device *rdev; + struct wpan_dev *wpan_dev; + struct net_device *dev; + bool rtnl = ops->internal_flags & NL802154_FLAG_NEED_RTNL; + + if (rtnl) + rtnl_lock(); + + if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_PHY) { + rdev = cfg802154_get_dev_from_info(genl_info_net(info), info); + if (IS_ERR(rdev)) { + if (rtnl) + rtnl_unlock(); + return PTR_ERR(rdev); + } + info->user_ptr[0] = rdev; + } else if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV || + ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { + ASSERT_RTNL(); 
+ wpan_dev = __cfg802154_wpan_dev_from_attrs(genl_info_net(info), + info->attrs); + if (IS_ERR(wpan_dev)) { + if (rtnl) + rtnl_unlock(); + return PTR_ERR(wpan_dev); + } + + dev = wpan_dev->netdev; + rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy); + + if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV) { + if (!dev) { + if (rtnl) + rtnl_unlock(); + return -EINVAL; + } + + info->user_ptr[1] = dev; + } else { + info->user_ptr[1] = wpan_dev; + } + + if (dev) { + if (ops->internal_flags & NL802154_FLAG_CHECK_NETDEV_UP && + !netif_running(dev)) { + if (rtnl) + rtnl_unlock(); + return -ENETDOWN; + } + + dev_hold(dev); + } + + info->user_ptr[0] = rdev; + } + + return 0; +} + +static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + if (info->user_ptr[1]) { + if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { + struct wpan_dev *wpan_dev = info->user_ptr[1]; if (wpan_dev->netdev) dev_put(wpan_dev->netdev); @@ -936,6 +2231,22 @@ static const struct genl_ops nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, + { + .cmd = NL802154_CMD_SET_CCA_ED_LEVEL, + .doit = nl802154_set_cca_ed_level, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_SET_TX_POWER, + .doit = nl802154_set_tx_power, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, { .cmd = NL802154_CMD_SET_PAN_ID, .doit = nl802154_set_pan_id, @@ -984,6 +2295,127 @@ static const struct genl_ops nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, + { + .cmd = NL802154_CMD_SET_ACKREQ_DEFAULT, + .doit = nl802154_set_ackreq_default, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, +#ifdef 
CONFIG_IEEE802154_NL802154_EXPERIMENTAL + { + .cmd = NL802154_CMD_SET_SEC_PARAMS, + .doit = nl802154_set_llsec_params, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_GET_SEC_KEY, + /* TODO .doit by matching key id? */ + .dumpit = nl802154_dump_llsec_key, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_KEY, + .doit = nl802154_add_llsec_key, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_KEY, + .doit = nl802154_del_llsec_key, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + /* TODO unique identifier must short+pan OR extended_addr */ + { + .cmd = NL802154_CMD_GET_SEC_DEV, + /* TODO .doit by matching extended_addr? */ + .dumpit = nl802154_dump_llsec_dev, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_DEV, + .doit = nl802154_add_llsec_dev, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_DEV, + .doit = nl802154_del_llsec_dev, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + /* TODO remove complete devkey, put it as nested? */ + { + .cmd = NL802154_CMD_GET_SEC_DEVKEY, + /* TODO doit by matching ??? 
*/ + .dumpit = nl802154_dump_llsec_devkey, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_DEVKEY, + .doit = nl802154_add_llsec_devkey, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_DEVKEY, + .doit = nl802154_del_llsec_devkey, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_GET_SEC_LEVEL, + /* TODO .doit by matching frame_type? */ + .dumpit = nl802154_dump_llsec_seclevel, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_LEVEL, + .doit = nl802154_add_llsec_seclevel, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_LEVEL, + /* TODO match frame_type only? 
*/ + .doit = nl802154_del_llsec_seclevel, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; /* initialisation/exit functions */ diff --git a/kernel/net/ieee802154/rdev-ops.h b/kernel/net/ieee802154/rdev-ops.h index 7b5a9dd94..4441c63b3 100644 --- a/kernel/net/ieee802154/rdev-ops.h +++ b/kernel/net/ieee802154/rdev-ops.h @@ -23,6 +23,26 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, rdev->ops->del_virtual_intf_deprecated(&rdev->wpan_phy, dev); } +static inline int +rdev_suspend(struct cfg802154_registered_device *rdev) +{ + int ret; + trace_802154_rdev_suspend(&rdev->wpan_phy); + ret = rdev->ops->suspend(&rdev->wpan_phy); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + +static inline int +rdev_resume(struct cfg802154_registered_device *rdev) +{ + int ret; + trace_802154_rdev_resume(&rdev->wpan_phy); + ret = rdev->ops->resume(&rdev->wpan_phy); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + static inline int rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name, unsigned char name_assign_type, @@ -74,6 +94,29 @@ rdev_set_cca_mode(struct cfg802154_registered_device *rdev, return ret; } +static inline int +rdev_set_cca_ed_level(struct cfg802154_registered_device *rdev, s32 ed_level) +{ + int ret; + + trace_802154_rdev_set_cca_ed_level(&rdev->wpan_phy, ed_level); + ret = rdev->ops->set_cca_ed_level(&rdev->wpan_phy, ed_level); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + +static inline int +rdev_set_tx_power(struct cfg802154_registered_device *rdev, + s32 power) +{ + int ret; + + trace_802154_rdev_set_tx_power(&rdev->wpan_phy, power); + ret = rdev->ops->set_tx_power(&rdev->wpan_phy, power); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + static inline int rdev_set_pan_id(struct 
cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 pan_id) @@ -152,4 +195,126 @@ rdev_set_lbt_mode(struct cfg802154_registered_device *rdev, return ret; } +static inline int +rdev_set_ackreq_default(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, bool ackreq) +{ + int ret; + + trace_802154_rdev_set_ackreq_default(&rdev->wpan_phy, wpan_dev, + ackreq); + ret = rdev->ops->set_ackreq_default(&rdev->wpan_phy, wpan_dev, ackreq); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +/* TODO this is already a nl802154, so move into ieee802154 */ +static inline void +rdev_get_llsec_table(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_table **table) +{ + rdev->ops->get_llsec_table(&rdev->wpan_phy, wpan_dev, table); +} + +static inline void +rdev_lock_llsec_table(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev) +{ + rdev->ops->lock_llsec_table(&rdev->wpan_phy, wpan_dev); +} + +static inline void +rdev_unlock_llsec_table(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev) +{ + rdev->ops->unlock_llsec_table(&rdev->wpan_phy, wpan_dev); +} + +static inline int +rdev_get_llsec_params(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_params *params) +{ + return rdev->ops->get_llsec_params(&rdev->wpan_phy, wpan_dev, params); +} + +static inline int +rdev_set_llsec_params(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_params *params, + u32 changed) +{ + return rdev->ops->set_llsec_params(&rdev->wpan_phy, wpan_dev, params, + changed); +} + +static inline int +rdev_add_llsec_key(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key) +{ + return rdev->ops->add_llsec_key(&rdev->wpan_phy, 
wpan_dev, id, key); +} + +static inline int +rdev_del_llsec_key(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id) +{ + return rdev->ops->del_llsec_key(&rdev->wpan_phy, wpan_dev, id); +} + +static inline int +rdev_add_seclevel(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + return rdev->ops->add_seclevel(&rdev->wpan_phy, wpan_dev, sl); +} + +static inline int +rdev_del_seclevel(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + return rdev->ops->del_seclevel(&rdev->wpan_phy, wpan_dev, sl); +} + +static inline int +rdev_add_device(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_device *dev_desc) +{ + return rdev->ops->add_device(&rdev->wpan_phy, wpan_dev, dev_desc); +} + +static inline int +rdev_del_device(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, __le64 extended_addr) +{ + return rdev->ops->del_device(&rdev->wpan_phy, wpan_dev, extended_addr); +} + +static inline int +rdev_add_devkey(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, __le64 extended_addr, + const struct ieee802154_llsec_device_key *devkey) +{ + return rdev->ops->add_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, + devkey); +} + +static inline int +rdev_del_devkey(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, __le64 extended_addr, + const struct ieee802154_llsec_device_key *devkey) +{ + return rdev->ops->del_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, + devkey); +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + #endif /* __CFG802154_RDEV_OPS */ diff --git a/kernel/net/ieee802154/socket.c b/kernel/net/ieee802154/socket.c index 627a25376..a548be247 100644 --- a/kernel/net/ieee802154/socket.c +++ b/kernel/net/ieee802154/socket.c @@ -64,10 +64,8 @@ 
ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) if (tmp->type != ARPHRD_IEEE802154) continue; - pan_id = ieee802154_mlme_ops(tmp)->get_pan_id(tmp); - short_addr = - ieee802154_mlme_ops(tmp)->get_short_addr(tmp); - + pan_id = tmp->ieee802154_ptr->pan_id; + short_addr = tmp->ieee802154_ptr->short_addr; if (pan_id == addr->pan_id && short_addr == addr->short_addr) { dev = tmp; @@ -228,15 +226,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) goto out; } - if (dev->type != ARPHRD_IEEE802154) { - err = -ENODEV; - goto out_put; - } - sk->sk_bound_dev_if = dev->ifindex; sk_dst_reset(sk); -out_put: dev_put(dev); out: release_sock(sk); @@ -281,12 +273,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto out; } - mtu = dev->mtu; + mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { pr_debug("size = %Zu, mtu = %u\n", size, mtu); - err = -EINVAL; + err = -EMSGSIZE; goto out_dev; } @@ -645,7 +637,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) err = -ENXIO; goto out; } - mtu = dev->mtu; + mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { @@ -684,8 +676,8 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) cb->seclevel = ro->seclevel; cb->seclevel_override = ro->seclevel_override; - err = dev_hard_header(skb, dev, ETH_P_IEEE802154, &dst_addr, - ro->bound ? &ro->src_addr : NULL, size); + err = wpan_dev_hard_header(skb, dev, &dst_addr, + ro->bound ? 
&ro->src_addr : NULL, size); if (err < 0) goto out_skb; @@ -803,9 +795,9 @@ static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) /* Data frame processing */ BUG_ON(dev->type != ARPHRD_IEEE802154); - pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev); - hw_addr = ieee802154_devaddr_from_raw(dev->dev_addr); + pan_id = dev->ieee802154_ptr->pan_id; + short_addr = dev->ieee802154_ptr->short_addr; + hw_addr = dev->ieee802154_ptr->extended_addr; read_lock(&dgram_lock); sk_for_each(sk, &dgram_head) { @@ -1020,7 +1012,7 @@ static int ieee802154_create(struct net *net, struct socket *sock, } rc = -ENOMEM; - sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto); + sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern); if (!sk) goto out; rc = 0; diff --git a/kernel/net/ieee802154/sysfs.c b/kernel/net/ieee802154/sysfs.c index 133b42806..bd88525b0 100644 --- a/kernel/net/ieee802154/sysfs.c +++ b/kernel/net/ieee802154/sysfs.c @@ -14,11 +14,13 @@ */ #include +#include #include #include "core.h" #include "sysfs.h" +#include "rdev-ops.h" static inline struct cfg802154_registered_device * dev_to_rdev(struct device *dev) @@ -62,10 +64,46 @@ static struct attribute *pmib_attrs[] = { }; ATTRIBUTE_GROUPS(pmib); +#ifdef CONFIG_PM_SLEEP +static int wpan_phy_suspend(struct device *dev) +{ + struct cfg802154_registered_device *rdev = dev_to_rdev(dev); + int ret = 0; + + if (rdev->ops->suspend) { + rtnl_lock(); + ret = rdev_suspend(rdev); + rtnl_unlock(); + } + + return ret; +} + +static int wpan_phy_resume(struct device *dev) +{ + struct cfg802154_registered_device *rdev = dev_to_rdev(dev); + int ret = 0; + + if (rdev->ops->resume) { + rtnl_lock(); + ret = rdev_resume(rdev); + rtnl_unlock(); + } + + return ret; +} + +static SIMPLE_DEV_PM_OPS(wpan_phy_pm_ops, wpan_phy_suspend, wpan_phy_resume); +#define WPAN_PHY_PM_OPS (&wpan_phy_pm_ops) +#else +#define WPAN_PHY_PM_OPS NULL +#endif + struct class 
wpan_phy_class = { .name = "ieee802154", .dev_release = wpan_phy_release, .dev_groups = pmib_groups, + .pm = WPAN_PHY_PM_OPS, }; int wpan_phy_sysfs_init(void) diff --git a/kernel/net/ieee802154/trace.h b/kernel/net/ieee802154/trace.h index 5ac25eb6e..9a471e41e 100644 --- a/kernel/net/ieee802154/trace.h +++ b/kernel/net/ieee802154/trace.h @@ -1,4 +1,4 @@ -/* Based on net/wireless/tracing.h */ +/* Based on net/wireless/trace.h */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cfg802154 @@ -40,6 +40,28 @@ * rdev->ops traces * *************************************************************/ +DECLARE_EVENT_CLASS(wpan_phy_only_evt, + TP_PROTO(struct wpan_phy *wpan_phy), + TP_ARGS(wpan_phy), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + ), + TP_printk(WPAN_PHY_PR_FMT, WPAN_PHY_PR_ARG) +); + +DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_suspend, + TP_PROTO(struct wpan_phy *wpan_phy), + TP_ARGS(wpan_phy) +); + +DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_resume, + TP_PROTO(struct wpan_phy *wpan_phy), + TP_ARGS(wpan_phy) +); + TRACE_EVENT(802154_rdev_add_virtual_intf, TP_PROTO(struct wpan_phy *wpan_phy, char *name, enum nl802154_iftype type, __le64 extended_addr), @@ -56,7 +78,7 @@ TRACE_EVENT(802154_rdev_add_virtual_intf, __entry->type = type; __entry->extended_addr = extended_addr; ), - TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, ea %llx", + TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, extended addr: 0x%llx", WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type, __le64_to_cpu(__entry->extended_addr)) ); @@ -93,6 +115,21 @@ TRACE_EVENT(802154_rdev_set_channel, __entry->page, __entry->channel) ); +TRACE_EVENT(802154_rdev_set_tx_power, + TP_PROTO(struct wpan_phy *wpan_phy, s32 power), + TP_ARGS(wpan_phy, power), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(s32, power) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->power = power; + ), + TP_printk(WPAN_PHY_PR_FMT ", mbm: %d", WPAN_PHY_PR_ARG, + 
__entry->power) +); + TRACE_EVENT(802154_rdev_set_cca_mode, TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca), TP_ARGS(wpan_phy, cca), @@ -108,6 +145,21 @@ TRACE_EVENT(802154_rdev_set_cca_mode, WPAN_CCA_PR_ARG) ); +TRACE_EVENT(802154_rdev_set_cca_ed_level, + TP_PROTO(struct wpan_phy *wpan_phy, s32 ed_level), + TP_ARGS(wpan_phy, ed_level), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(s32, ed_level) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->ed_level = ed_level; + ), + TP_printk(WPAN_PHY_PR_FMT ", ed level: %d", WPAN_PHY_PR_ARG, + __entry->ed_level) +); + DECLARE_EVENT_CLASS(802154_le16_template, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 le16arg), @@ -137,7 +189,7 @@ DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 le16arg), TP_ARGS(wpan_phy, wpan_dev, le16arg), - TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", sa: 0x%04x", + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", short addr: 0x%04x", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __le16_to_cpu(__entry->le16arg)) ); @@ -160,7 +212,7 @@ TRACE_EVENT(802154_rdev_set_backoff_exponent, ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT - ", min be: %d, max_be: %d", WPAN_PHY_PR_ARG, + ", min be: %d, max be: %d", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be) ); @@ -223,6 +275,25 @@ TRACE_EVENT(802154_rdev_set_lbt_mode, WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode)) ); +TRACE_EVENT(802154_rdev_set_ackreq_default, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + bool ackreq), + TP_ARGS(wpan_phy, wpan_dev, ackreq), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(bool, ackreq) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->ackreq = ackreq; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", ackreq default: %s", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->ackreq)) +); + 
TRACE_EVENT(802154_rdev_return_int, TP_PROTO(struct wpan_phy *wpan_phy, int ret), TP_ARGS(wpan_phy, ret), diff --git a/kernel/net/ipv4/Kconfig b/kernel/net/ipv4/Kconfig index bd2901604..416dfa004 100644 --- a/kernel/net/ipv4/Kconfig +++ b/kernel/net/ipv4/Kconfig @@ -331,20 +331,6 @@ config NET_FOU_IP_TUNNELS When this option is enabled IP tunnels can be configured to use FOU or GUE encapsulation. -config GENEVE - tristate "Generic Network Virtualization Encapsulation (Geneve)" - depends on INET - select NET_UDP_TUNNEL - ---help--- - This allows one to create Geneve virtual interfaces that provide - Layer 2 Networks over Layer 3 Networks. Geneve is often used - to tunnel virtual network infrastructure in virtualized environments. - For more information see: - http://tools.ietf.org/html/draft-gross-geneve-01 - - To compile this driver as a module, choose M here: the module - - config INET_AH tristate "IP: AH transformation" select XFRM_ALGO @@ -615,6 +601,22 @@ config TCP_CONG_DCTCP For further details see: http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf +config TCP_CONG_CDG + tristate "CAIA Delay-Gradient (CDG)" + default n + ---help--- + CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies + the TCP sender in order to: + + o Use the delay gradient as a congestion signal. + o Back off with an average probability that is independent of the RTT. + o Coexist with flows that use loss-based congestion control. + o Tolerate packet loss unrelated to congestion. + + For further details see: + D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + delay gradients." In Networking 2011. 
Preprint: http://goo.gl/No3vdg + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -646,6 +648,9 @@ choice config DEFAULT_DCTCP bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_CDG + bool "CDG" if TCP_CONG_CDG=y + config DEFAULT_RENO bool "Reno" endchoice @@ -668,6 +673,7 @@ config DEFAULT_TCP_CONG default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO default "dctcp" if DEFAULT_DCTCP + default "cdg" if DEFAULT_CDG default "cubic" config TCP_MD5SIG diff --git a/kernel/net/ipv4/Makefile b/kernel/net/ipv4/Makefile index 518c04ed6..c29809f76 100644 --- a/kernel/net/ipv4/Makefile +++ b/kernel/net/ipv4/Makefile @@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ + tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ @@ -42,6 +43,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o @@ -56,7 +58,6 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o -obj-$(CONFIG_GENEVE) += geneve.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/kernel/net/ipv4/af_inet.c b/kernel/net/ipv4/af_inet.c index a5aa54ea6..5c5db6636 100644 --- a/kernel/net/ipv4/af_inet.c +++ b/kernel/net/ipv4/af_inet.c @@ -112,12 +112,14 @@ #include #include #include +#include #include #include #include #ifdef CONFIG_IP_MROUTE 
#include #endif +#include /* The inetsw table contains everything that inet_create needs to @@ -217,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog) * shutdown() (rather than close()). */ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && - !inet_csk(sk)->icsk_accept_queue.fastopenq) { + !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) - err = fastopen_init_queue(sk, backlog); + fastopen_queue_tune(sk, backlog); else if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2) != 0) - err = fastopen_init_queue(sk, + fastopen_queue_tune(sk, ((uint)sysctl_tcp_fastopen) >> 16); - else - err = 0; - if (err) - goto out; tcp_fastopen_init_key_once(true); } @@ -259,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ @@ -319,7 +320,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; @@ -426,6 +427,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; + u32 tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -447,7 +449,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; + chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too * many applications when removed. 
It is unfortunate since @@ -490,7 +493,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if (sk->sk_prot->get_port(sk, snum)) { + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; @@ -1038,22 +1042,16 @@ void inet_register_protosw(struct inet_protosw *p) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ - answer = NULL; last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); - /* Check only the non-wild match. */ - if (INET_PROTOSW_PERMANENT & answer->flags) { - if (protocol == answer->protocol) - break; - last_perm = lh; - } - - answer = NULL; + if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) + break; + if (protocol == answer->protocol) + goto out_permanent; + last_perm = lh; } - if (answer) - goto out_permanent; /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with @@ -1432,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, struct net *net) { struct socket *sock; - int rc = sock_create_kern(family, type, protocol, &sock); + int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; @@ -1442,45 +1440,56 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, * we do not wish this socket to see incoming packets. 
*/ (*sk)->sk_prot->unhash(*sk); - - sk_change_net(*sk, net); } return rc; } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); +u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) +{ + return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); +} +EXPORT_SYMBOL_GPL(snmp_get_cpu_field); + unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; int i; for_each_possible_cpu(i) - res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt); + res += snmp_get_cpu_field(mib, i, offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 +u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, + size_t syncp_offset) +{ + void *bhptr; + struct u64_stats_sync *syncp; + u64 v; + unsigned int start; + + bhptr = per_cpu_ptr(mib, cpu); + syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); + do { + start = u64_stats_fetch_begin_irq(syncp); + v = *(((u64 *)bhptr) + offt); + } while (u64_stats_fetch_retry_irq(syncp, start)); + + return v; +} +EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); + u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; for_each_possible_cpu(cpu) { - void *bhptr; - struct u64_stats_sync *syncp; - u64 v; - unsigned int start; - - bhptr = per_cpu_ptr(mib, cpu); - syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); - do { - start = u64_stats_fetch_begin_irq(syncp); - v = *(((u64 *) bhptr) + offt); - } while (u64_stats_fetch_retry_irq(syncp, start)); - - res += v; + res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); } return res; } @@ -1599,7 +1608,7 @@ static __net_init int inet_init_net(struct net *net) */ seqlock_init(&net->ipv4.ip_local_ports.lock); net->ipv4.ip_local_ports.range[0] = 32768; - net->ipv4.ip_local_ports.range[1] = 61000; + net->ipv4.ip_local_ports.range[1] = 60999; seqlock_init(&net->ipv4.ping_group_range.lock); /* @@ -1781,6 +1790,8 @@ static int __init inet_init(void) dev_add_pack(&ip_packet_type); + ip_tunnel_core_init(); + rc = 0; 
out: return rc; diff --git a/kernel/net/ipv4/ah4.c b/kernel/net/ipv4/ah4.c index ac9a32ec3..f2a71025a 100644 --- a/kernel/net/ipv4/ah4.c +++ b/kernel/net/ipv4/ah4.c @@ -360,8 +360,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + ahp->icv_trunc_len + seqhi_len); - if (!work_iph) + if (!work_iph) { + err = -ENOMEM; goto out; + } seqhi = (__be32 *)((char *)work_iph + ihl); auth_data = ah_tmp_auth(seqhi, seqhi_len); diff --git a/kernel/net/ipv4/arp.c b/kernel/net/ipv4/arp.c index 933a92820..59b3e0e8f 100644 --- a/kernel/net/ipv4/arp.c +++ b/kernel/net/ipv4/arp.c @@ -113,6 +113,8 @@ #include #include #include +#include +#include #include @@ -233,7 +235,7 @@ static int arp_constructor(struct neighbour *neigh) return -EINVAL; } - neigh->type = inet_addr_type(dev_net(dev), addr); + neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); @@ -291,6 +293,39 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) kfree_skb(skb); } +/* Create and send an arp packet. */ +static void arp_send_dst(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw, + struct dst_entry *dst) +{ + struct sk_buff *skb; + + /* arp on this interface. 
*/ + if (dev->flags & IFF_NOARP) + return; + + skb = arp_create(type, ptype, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (!skb) + return; + + skb_dst_set(skb, dst_clone(dst)); + arp_xmit(skb); +} + +void arp_send(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, const unsigned char *src_hw, + const unsigned char *target_hw) +{ + arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, + target_hw, NULL); +} +EXPORT_SYMBOL(arp_send); + static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; @@ -299,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; + struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -309,7 +345,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(dev_net(dev), + if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; @@ -317,7 +353,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!skb) break; saddr = ip_hdr(skb)->saddr; - if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) { + if (inet_addr_type_dev_table(dev_net(dev), dev, + saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; @@ -346,8 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } - arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL); + if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + dst = skb_dst(skb); + arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct 
in_device *in_dev, __be32 sip, __be32 tip) @@ -585,48 +624,28 @@ out: } EXPORT_SYMBOL(arp_create); +static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + return dev_queue_xmit(skb); +} + /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ - NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb, - NULL, skb->dev, dev_queue_xmit_sk); + NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, + dev_net(skb->dev), NULL, skb, NULL, skb->dev, + arp_xmit_finish); } EXPORT_SYMBOL(arp_xmit); -/* - * Create and send an arp packet. - */ -void arp_send(int type, int ptype, __be32 dest_ip, - struct net_device *dev, __be32 src_ip, - const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw) -{ - struct sk_buff *skb; - - /* - * No arp on this interface. - */ - - if (dev->flags&IFF_NOARP) - return; - - skb = arp_create(type, ptype, dest_ip, dev, src_ip, - dest_hw, src_hw, target_hw); - if (!skb) - return; - - arp_xmit(skb); -} -EXPORT_SYMBOL(arp_send); - /* * Process an arp request. */ -static int arp_process(struct sock *sk, struct sk_buff *skb) +static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -638,7 +657,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) u16 dev_type = dev->type; int addr_type; struct neighbour *n; - struct net *net = dev_net(dev); + struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device @@ -739,13 +758,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) * cache. 
*/ + if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) + reply_dst = (struct dst_entry *) + iptunnel_metadata_reply(skb_metadata_dst(skb), + GFP_ATOMIC); + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && - inet_addr_type(net, tip) == RTN_LOCAL && + inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, - dev->dev_addr, sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, + sha, dev->dev_addr, sha, reply_dst); goto out; } @@ -764,9 +788,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); neigh_release(n); } } @@ -784,13 +809,14 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); - return 0; + goto out_free_dst; } goto out; } @@ -802,16 +828,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); if (IN_DEV_ARP_ACCEPT(in_dev)) { + unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip); + /* Unsolicited ARP is not accepted by default. 
It is possible, that this option should be enabled for some devices (strip is candidate) */ is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && - inet_addr_type(net, sip) == RTN_UNICAST; + addr_type == RTN_UNICAST; if (!n && ((arp->ar_op == htons(ARPOP_REPLY) && - inet_addr_type(net, sip) == RTN_UNICAST) || is_garp)) + addr_type == RTN_UNICAST) || is_garp)) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -842,12 +870,14 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) out: consume_skb(skb); +out_free_dst: + dst_release(reply_dst); return 0; } static void parp_redo(struct sk_buff *skb) { - arp_process(NULL, skb); + arp_process(dev_net(skb->dev), NULL, skb); } @@ -880,8 +910,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb, - dev, NULL, arp_process); + return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, + dev_net(dev), NULL, skb, dev, NULL, + arp_process); consumeskb: consume_skb(skb); @@ -1017,14 +1048,16 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { - read_lock_bh(&neigh->lock); - memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); - r->arp_flags = arp_state_to_flags(neigh); - read_unlock_bh(&neigh->lock); - r->arp_ha.sa_family = dev->type; - strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); + if (!(neigh->nud_state & NUD_NOARP)) { + read_lock_bh(&neigh->lock); + memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); + r->arp_ha.sa_family = dev->type; + strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); + err = 0; + } neigh_release(neigh); - err = 0; } return err; } diff --git a/kernel/net/ipv4/datagram.c b/kernel/net/ipv4/datagram.c index 574fad9cc..f915abff1 100644 --- a/kernel/net/ipv4/datagram.c +++ b/kernel/net/ipv4/datagram.c @@ -74,7 +74,7 @@ int 
__ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; - inet_set_txhash(sk); + sk_set_txhash(sk); inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); diff --git a/kernel/net/ipv4/devinet.c b/kernel/net/ipv4/devinet.c index 419d23c53..f6303b175 100644 --- a/kernel/net/ipv4/devinet.c +++ b/kernel/net/ipv4/devinet.c @@ -882,7 +882,6 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); - blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); } return 0; } @@ -1645,7 +1644,8 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } -static size_t inet_get_link_af_size(const struct net_device *dev) +static size_t inet_get_link_af_size(const struct net_device *dev, + u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); @@ -1655,7 +1655,8 @@ static size_t inet_get_link_af_size(const struct net_device *dev) return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } -static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, + u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; @@ -1740,6 +1741,8 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (type == -1 || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + size += nla_total_size(4); return size; } @@ -1780,6 +1783,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; + if ((type == -1 || type == 
NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -1819,6 +1826,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, + [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet_netconf_get_devconf(struct sk_buff *in_skb, @@ -1839,7 +1847,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; @@ -2048,6 +2056,12 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, ifindex, cnf); } + if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && + new_value != old_value) { + ifindex = devinet_conf_ifindex(net, cnf); + inet_netconf_notify_devconf(net, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + ifindex, cnf); + } } return ret; @@ -2169,6 +2183,8 @@ static struct devinet_sysctl_table { "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), + DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, + "ignore_routes_with_linkdown"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -2383,4 +2399,3 @@ void __init devinet_init(void) rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, inet_netconf_dump_devconf, NULL); } - diff --git a/kernel/net/ipv4/esp4.c b/kernel/net/ipv4/esp4.c index 30b544f02..477937465 100644 --- a/kernel/net/ipv4/esp4.c +++ b/kernel/net/ipv4/esp4.c @@ -49,7 +49,7 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen) len = ALIGN(len, 
crypto_tfm_ctx_alignment()); } - len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead); + len += sizeof(struct aead_request) + crypto_aead_reqsize(aead); len = ALIGN(len, __alignof__(struct scatterlist)); len += sizeof(struct scatterlist) * nfrags; @@ -68,17 +68,6 @@ static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; } -static inline struct aead_givcrypt_request *esp_tmp_givreq( - struct crypto_aead *aead, u8 *iv) -{ - struct aead_givcrypt_request *req; - - req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), - crypto_tfm_ctx_alignment()); - aead_givcrypt_set_tfm(req, aead); - return req; -} - static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv) { struct aead_request *req; @@ -97,14 +86,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static inline struct scatterlist *esp_givreq_sg( - struct crypto_aead *aead, struct aead_givcrypt_request *req) -{ - return (void *)ALIGN((unsigned long)(req + 1) + - crypto_aead_reqsize(aead), - __alignof__(struct scatterlist)); -} - static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -113,14 +94,37 @@ static void esp_output_done(struct crypto_async_request *base, int err) xfrm_output_resume(skb, err); } +/* Move ESP header back into place. 
*/ +static void esp_restore_header(struct sk_buff *skb, unsigned int offset) +{ + struct ip_esp_hdr *esph = (void *)(skb->data + offset); + void *tmp = ESP_SKB_CB(skb)->tmp; + __be32 *seqhi = esp_tmp_seqhi(tmp); + + esph->seq_no = esph->spi; + esph->spi = *seqhi; +} + +static void esp_output_restore_header(struct sk_buff *skb) +{ + esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); +} + +static void esp_output_done_esn(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + esp_output_restore_header(skb); + esp_output_done(base, err); +} + static int esp_output(struct xfrm_state *x, struct sk_buff *skb) { int err; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct aead_givcrypt_request *req; + struct aead_request *req; struct scatterlist *sg; - struct scatterlist *asg; struct sk_buff *trailer; void *tmp; u8 *iv; @@ -129,17 +133,19 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) int clen; int alen; int plen; + int ivlen; int tfclen; int nfrags; int assoclen; - int sglists; int seqhilen; __be32 *seqhi; + __be64 seqno; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); + ivlen = crypto_aead_ivsize(aead); tfclen = 0; if (x->tfcpad) { @@ -160,16 +166,14 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) nfrags = err; assoclen = sizeof(*esph); - sglists = 1; seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { - sglists += 2; seqhilen += sizeof(__be32); assoclen += seqhilen; } - tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) { err = -ENOMEM; goto error; @@ -177,9 +181,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); - req = esp_tmp_givreq(aead, iv); - asg = esp_givreq_sg(aead, req); - sg = asg + sglists; + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); /* Fill padding... 
*/ tail = skb_tail_pointer(trailer); @@ -235,37 +238,53 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) *skb_mac_header(skb) = IPPROTO_UDP; } - esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + aead_request_set_callback(req, 0, esp_output_done, skb); + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * encryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); + *seqhi = esph->spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + aead_request_set_callback(req, 0, esp_output_done_esn, skb); + } + + esph->spi = x->id.spi; + sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, - esph->enc_data + crypto_aead_ivsize(aead) - skb->data, - clen + alen); + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); - if ((x->props.flags & XFRM_STATE_ESN)) { - sg_init_table(asg, 3); - sg_set_buf(asg, &esph->spi, sizeof(__be32)); - *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); - sg_set_buf(asg + 1, seqhi, seqhilen); - sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); - } else - sg_init_one(asg, esph, sizeof(*esph)); - - aead_givcrypt_set_callback(req, 0, esp_output_done, skb); - aead_givcrypt_set_crypt(req, sg, sg, clen, iv); - aead_givcrypt_set_assoc(req, asg, assoclen); - aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output.low + - ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); + aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); + aead_request_set_ad(req, assoclen); + + seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); + + memset(iv, 0, ivlen); + memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8), + min(ivlen, 8)); ESP_SKB_CB(skb)->tmp = tmp; - err = crypto_aead_givencrypt(req); - if (err == -EINPROGRESS) + err = crypto_aead_encrypt(req); + + switch (err) { + case -EINPROGRESS: goto 
error; - if (err == -EBUSY) + case -EBUSY: err = NET_XMIT_DROP; + break; + + case 0: + if ((x->props.flags & XFRM_STATE_ESN)) + esp_output_restore_header(skb); + } kfree(tmp); @@ -364,6 +383,20 @@ static void esp_input_done(struct crypto_async_request *base, int err) xfrm_input_resume(skb, esp_input_done2(skb, err)); } +static void esp_input_restore_header(struct sk_buff *skb) +{ + esp_restore_header(skb, 0); + __skb_pull(skb, 4); +} + +static void esp_input_done_esn(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + esp_input_restore_header(skb); + esp_input_done(base, err); +} + /* * Note: detecting truncated vs. non-truncated authentication data is very * expensive, so we only support truncated data, which is the recommended @@ -375,19 +408,18 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; - int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); + int ivlen = crypto_aead_ivsize(aead); + int elen = skb->len - sizeof(*esph) - ivlen; int nfrags; int assoclen; - int sglists; int seqhilen; __be32 *seqhi; void *tmp; u8 *iv; struct scatterlist *sg; - struct scatterlist *asg; int err = -EINVAL; - if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) + if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) goto out; if (elen <= 0) @@ -400,17 +432,15 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) nfrags = err; assoclen = sizeof(*esph); - sglists = 1; seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { - sglists += 2; seqhilen += sizeof(__be32); assoclen += seqhilen; } err = -ENOMEM; - tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) goto out; @@ -418,36 +448,39 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); - asg = 
esp_req_sg(aead, req); - sg = asg + sglists; + sg = esp_req_sg(aead, req); skb->ip_summed = CHECKSUM_NONE; esph = (struct ip_esp_hdr *)skb->data; - /* Get ivec. This can be wrong, check against another impls. */ - iv = esph->enc_data; - - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); + aead_request_set_callback(req, 0, esp_input_done, skb); + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * decryption. + */ if ((x->props.flags & XFRM_STATE_ESN)) { - sg_init_table(asg, 3); - sg_set_buf(asg, &esph->spi, sizeof(__be32)); - *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; - sg_set_buf(asg + 1, seqhi, seqhilen); - sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); - } else - sg_init_one(asg, esph, sizeof(*esph)); + esph = (void *)skb_push(skb, 4); + *seqhi = esph->spi; + esph->spi = esph->seq_no; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi); + aead_request_set_callback(req, 0, esp_input_done_esn, skb); + } - aead_request_set_callback(req, 0, esp_input_done, skb); - aead_request_set_crypt(req, sg, sg, elen, iv); - aead_request_set_assoc(req, asg, assoclen); + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); + aead_request_set_ad(req, assoclen); err = crypto_aead_decrypt(req); if (err == -EINPROGRESS) goto out; + if ((x->props.flags & XFRM_STATE_ESN)) + esp_input_restore_header(skb); + err = esp_input_done2(skb, err); out: @@ -519,10 +552,16 @@ static void esp_destroy(struct xfrm_state *x) static int esp_init_aead(struct xfrm_state *x) { + char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - aead = crypto_alloc_aead(x->aead->alg_name, 0, 0); + err = -ENAMETOOLONG; + if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", + x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if 
(IS_ERR(aead)) goto error; @@ -561,15 +600,19 @@ static int esp_init_authenc(struct xfrm_state *x) if ((x->props.flags & XFRM_STATE_ESN)) { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, - "authencesn(%s,%s)", + "%s%sauthencesn(%s,%s)%s", + x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + x->ealg->alg_name, + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) goto error; } else { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, - "authenc(%s,%s)", + "%s%sauthenc(%s,%s)%s", + x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + x->ealg->alg_name, + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) goto error; } diff --git a/kernel/net/ipv4/fib_frontend.c b/kernel/net/ipv4/fib_frontend.c index 872494e6e..473447593 100644 --- a/kernel/net/ipv4/fib_frontend.c +++ b/kernel/net/ipv4/fib_frontend.c @@ -45,6 +45,8 @@ #include #include #include +#include +#include #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -211,12 +213,12 @@ void fib_flush_external(struct net *net) */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, - __be32 addr) + __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; - struct fib_table *local_table; + struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; @@ -225,10 +227,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, rcu_read_lock(); - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (local_table) { + table = fib_get_table(net, tb_id); + if (table) { ret = RTN_UNICAST; - if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { + if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; } @@ -238,19 +240,40 @@ static inline unsigned int __inet_dev_addr_type(struct net 
*net, return ret; } +unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) +{ + return __inet_dev_addr_type(net, NULL, addr, tb_id); +} +EXPORT_SYMBOL(inet_addr_type_table); + unsigned int inet_addr_type(struct net *net, __be32 addr) { - return __inet_dev_addr_type(net, NULL, addr); + return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - return __inet_dev_addr_type(net, dev, addr); + u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; + + return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); +/* inet_addr_type with dev == NULL but using the table from a dev + * if one is associated + */ +unsigned int inet_addr_type_dev_table(struct net *net, + const struct net_device *dev, + __be32 addr) +{ + u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; + + return __inet_dev_addr_type(net, NULL, addr, rt_table); +} +EXPORT_SYMBOL(inet_addr_type_dev_table); + __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -280,7 +303,8 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_scope = scope; fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; - if (!fib_lookup(net, &fl4, &res)) + fl4.flowi4_tun_key.tun_id = 0; + if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); } else { scope = RT_SCOPE_LINK; @@ -308,18 +332,24 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; + fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev); + if (!fl4.flowi4_iif) + fl4.flowi4_iif = oif ? 
: LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_flags = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; + trace_fib_validate_source(dev, &fl4); + net = dev_net(dev); - if (fib_lookup(net, &fl4, &res)) + if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) @@ -337,6 +367,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (nh->nh_dev == dev) { dev_match = true; break; + } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + dev_match = true; + break; } } #else @@ -354,7 +387,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_oif = dev->ifindex; ret = 0; - if (fib_lookup(net, &fl4, &res) == 0) { + if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } @@ -494,9 +527,12 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { + unsigned int addr_type; + cfg->fc_gw = addr; + addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && - inet_addr_type(net, addr) == RTN_UNICAST) + addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } @@ -591,6 +627,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, @@ -656,6 +694,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_TABLE: 
cfg->fc_table = nla_get_u32(attr); break; + case RTA_ENCAP: + cfg->fc_encap = attr; + break; + case RTA_ENCAP_TYPE: + cfg->fc_encap_type = nla_get_u16(attr); + break; } } @@ -760,6 +804,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); + u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -774,11 +819,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad }, }; - if (type == RTN_UNICAST) - tb = fib_new_table(net, RT_TABLE_MAIN); - else - tb = fib_new_table(net, RT_TABLE_LOCAL); + if (!tb_id) + tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; + tb = fib_new_table(net, tb_id); if (!tb) return; @@ -823,9 +867,10 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { - fib_magic(RTM_NEWROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - prefix, ifa->ifa_prefixlen, prim); + if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) + fib_magic(RTM_NEWROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + prefix, ifa->ifa_prefixlen, prim); /* Add network specific broadcasts, when it takes a sense */ if (ifa->ifa_prefixlen < 31) { @@ -870,9 +915,10 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { - fib_magic(RTM_DELROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - any, ifa->ifa_prefixlen, prim); + if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? 
RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); subnet = 1; } @@ -960,11 +1006,14 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); } if (!(ok & LOCAL_OK)) { + unsigned int addr_type; + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (gone && - inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { + addr_type = inet_addr_type_dev_table(dev_net(dev), dev, + ifa->ifa_local); + if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * @@ -1063,9 +1112,10 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force) +static void fib_disable_ip(struct net_device *dev, unsigned long event, + bool force) { - if (fib_sync_down_dev(dev, force)) + if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); rt_cache_flush(dev_net(dev)); arp_ifdown(dev); @@ -1081,7 +1131,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev)); @@ -1093,7 +1143,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. 
*/ - fib_disable_ip(dev, 1); + fib_disable_ip(dev, event, true); } else { rt_cache_flush(dev_net(dev)); } @@ -1105,11 +1155,13 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_changeupper_info *info; struct in_device *in_dev; struct net *net = dev_net(dev); + unsigned int flags; if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2); + fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1124,18 +1176,32 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_add_ifaddr(ifa); } endfor_ifa(in_dev); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0); + fib_disable_ip(dev, event, false); break; - case NETDEV_CHANGEMTU: case NETDEV_CHANGE: + flags = dev_get_flags(dev); + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + fib_sync_up(dev, RTNH_F_LINKDOWN); + else + fib_sync_down_dev(dev, event, false); + /* fall through */ + case NETDEV_CHANGEMTU: rt_cache_flush(net); break; + case NETDEV_CHANGEUPPER: + info = ptr; + /* flush all routes if dev is linked to or unlinked from + * an L3 master device (e.g., VRF) + */ + if (info->upper_dev && netif_is_l3_master(info->upper_dev)) + fib_disable_ip(dev, NETDEV_DOWN, true); + break; } return NOTIFY_DONE; } diff --git a/kernel/net/ipv4/fib_lookup.h b/kernel/net/ipv4/fib_lookup.h index c6211ed60..9c0292072 100644 --- a/kernel/net/ipv4/fib_lookup.h +++ b/kernel/net/ipv4/fib_lookup.h @@ -13,6 +13,7 @@ struct fib_alias { u8 fa_state; u8 fa_slen; u32 tb_id; + s16 fa_default; struct rcu_head rcu; }; diff --git a/kernel/net/ipv4/fib_rules.c b/kernel/net/ipv4/fib_rules.c index 56151982f..f2bda9e89 100644 --- 
a/kernel/net/ipv4/fib_rules.c +++ b/kernel/net/ipv4/fib_rules.c @@ -47,11 +47,12 @@ struct fib4_rule { #endif }; -int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res, unsigned int flags) { struct fib_lookup_arg arg = { .result = res, - .flags = FIB_LOOKUP_NOREF, + .flags = flags, }; int err; @@ -317,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, - .default_pref = fib_default_rule_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, diff --git a/kernel/net/ipv4/fib_semantics.c b/kernel/net/ipv4/fib_semantics.c index 8d695b665..d97268e8f 100644 --- a/kernel/net/ipv4/fib_semantics.c +++ b/kernel/net/ipv4/fib_semantics.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "fib_lookup.h" @@ -56,8 +57,7 @@ static unsigned int fib_info_cnt; static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; #ifdef CONFIG_IP_ROUTE_MULTIPATH - -static DEFINE_SPINLOCK(fib_multipath_lock); +u32 fib_multipath_secret __read_mostly; #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ @@ -208,6 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); + lwtstate_put(nexthop_nh->nh_lwtstate); free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); @@ -266,7 +267,8 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) + lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) || + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; } 
endfor_nexthops(fi); @@ -318,7 +320,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_type == fi->fib_type && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && - ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && + !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -366,6 +368,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->fib_nhs) { + size_t nh_encapsize = 0; /* Also handles the special case fib_nhs == 1 */ /* each nexthop is packed in an attribute */ @@ -374,8 +377,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); + /* grab encap info */ + for_nexthops(fi) { + if (nh->nh_lwtstate) { + /* RTA_ENCAP_TYPE */ + nh_encapsize += lwtunnel_get_encap_size( + nh->nh_lwtstate); + /* RTA_ENCAP */ + nh_encapsize += nla_total_size(2); + } + } endfor_nexthops(fi); + /* all nexthops are packed in a nested attribute */ - payload += nla_total_size(fi->fib_nhs * nhsize); + payload += nla_total_size((fi->fib_nhs * nhsize) + + nh_encapsize); + } return payload; @@ -421,13 +437,15 @@ static int fib_detect_death(struct fib_info *fi, int order, if (n) { state = n->nud_state; neigh_release(n); + } else { + return 0; } if (state == NUD_REACHABLE) return 0; if ((state & NUD_VALID) && order != dflt) return 0; if ((state & NUD_VALID) || - (*last_idx < 0 && order > dflt)) { + (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) { *last_resort = fi; *last_idx = order; } @@ -452,6 +470,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; + int ret; + change_nexthops(fi) { int attrlen; @@ -475,18 +496,130 @@ static int 
fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (nexthop_nh->nh_tclassid) fi->fib_net->ipv4.fib_num_tclassid_users++; #endif + nla = nla_find(attrs, attrlen, RTA_ENCAP); + if (nla) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + struct nlattr *nla_entype; + + nla_entype = nla_find(attrs, attrlen, + RTA_ENCAP_TYPE); + if (!nla_entype) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + ret = lwtunnel_build_state(dev, nla_get_u16( + nla_entype), + nla, AF_INET, cfg, + &lwtstate); + if (ret) + goto errout; + nexthop_nh->nh_lwtstate = + lwtstate_get(lwtstate); + } } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); return 0; + +err_inval: + ret = -EINVAL; + +errout: + return ret; } -#endif +static void fib_rebalance(struct fib_info *fi) +{ + int total; + int w; + struct in_device *in_dev; + + if (fi->fib_nhs < 2) + return; + + total = 0; + for_nexthops(fi) { + if (nh->nh_flags & RTNH_F_DEAD) + continue; + + in_dev = __in_dev_get_rtnl(nh->nh_dev); + + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN) + continue; + + total += nh->nh_weight; + } endfor_nexthops(fi); + + w = 0; + change_nexthops(fi) { + int upper_bound; + + in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev); + + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + upper_bound = -1; + } else if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) { + upper_bound = -1; + } else { + w += nexthop_nh->nh_weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, + total) - 1; + } + + atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); + } endfor_nexthops(fi); + + net_get_random_once(&fib_multipath_secret, + sizeof(fib_multipath_secret)); +} + +static inline void fib_add_weight(struct fib_info *fi, + const struct fib_nh *nh) +{ + fi->fib_weight += nh->nh_weight; +} + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define fib_rebalance(fi) do { } while (0) 
+#define fib_add_weight(fi, nh) do { } while (0) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + +static int fib_encap_match(struct net *net, u16 encap_type, + struct nlattr *encap, + int oif, const struct fib_nh *nh, + const struct fib_config *cfg) +{ + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + int ret, result = 0; + + if (encap_type == LWTUNNEL_ENCAP_NONE) + return 0; + + if (oif) + dev = __dev_get_by_index(net, oif); + ret = lwtunnel_build_state(dev, encap_type, encap, + AF_INET, cfg, &lwtstate); + if (!ret) { + result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + lwtstate_free(lwtstate); + } + + return result; +} int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) { + struct net *net = cfg->fc_nlinfo.nl_net; #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; @@ -496,6 +629,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) return 1; if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_encap) { + if (fib_encap_match(net, cfg->fc_encap_type, + cfg->fc_encap, cfg->fc_oif, + fi->fib_nh, cfg)) + return 1; + } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) return 0; @@ -585,7 +724,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh) { - int err; + int err = 0; struct net *net; struct net_device *dev; @@ -594,16 +733,20 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_result res; if (nh->nh_flags & RTNH_F_ONLINK) { + unsigned int addr_type; if (cfg->fc_scope >= RT_SCOPE_LINK) return -EINVAL; - if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) - return -EINVAL; dev = __dev_get_by_index(net, nh->nh_oif); if (!dev) return -ENODEV; if (!(dev->flags & IFF_UP)) return -ENETDOWN; + addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw); + if (addr_type != RTN_UNICAST) + return -EINVAL; + if 
(!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; dev_hold(dev); nh->nh_scope = RT_SCOPE_LINK; @@ -611,6 +754,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } rcu_read_lock(); { + struct fib_table *tbl = NULL; struct flowi4 fl4 = { .daddr = nh->nh_gw, .flowi4_scope = cfg->fc_scope + 1, @@ -621,7 +765,24 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl4, &res); + + if (cfg->fc_table) + tbl = fib_get_table(net, cfg->fc_table); + + if (tbl) + err = fib_table_lookup(tbl, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE | + FIB_LOOKUP_NOREF); + + /* on error or if no table given do full lookup. This + * is needed for example when nexthops are in the local + * table rather than the given table + */ + if (!tbl || err) { + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); + } + if (err) { rcu_read_unlock(); return err; @@ -636,6 +797,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (!dev) goto out; dev_hold(dev); + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 
0 : -ENETDOWN; } else { struct in_device *in_dev; @@ -654,6 +817,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->nh_dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = 0; } out: @@ -713,8 +878,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *dest; unsigned int new_hash; - hlist_del(&fi->fib_hash); - new_hash = fib_info_hashfn(fi); dest = &new_info_hash[new_hash]; hlist_add_head(&fi->fib_hash, dest); @@ -731,8 +894,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *ldest; unsigned int new_hash; - hlist_del(&fi->fib_lhash); - new_hash = fib_laddr_hashfn(fi->fib_prefsrc); ldest = &new_laddrhash[new_hash]; hlist_add_head(&fi->fib_lhash, ldest); @@ -757,6 +918,74 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) return nh->nh_saddr; } +static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) +{ + if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || + fib_prefsrc != cfg->fc_dst) { + u32 tb_id = cfg->fc_table; + int rc; + + if (tb_id == RT_TABLE_MAIN) + tb_id = RT_TABLE_LOCAL; + + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, tb_id); + + if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) { + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, RT_TABLE_LOCAL); + } + + if (rc != RTN_LOCAL) + return false; + } + return true; +} + +static int +fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) +{ + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + + if (!cfg->fc_mx) + return 0; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp, 
&ecn_ca); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; + fi->fib_metrics[type - 1] = val; + } + + if (ecn_ca) + fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + + return 0; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -829,36 +1058,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } endfor_nexthops(fi) - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - u32 val; - - if (type > RTAX_MAX) - goto err_inval; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err_inval; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - fi->fib_metrics[type - 1] = val; - } - } - } + err = fib_convert_metrics(fi, cfg); + if (err) + goto failure; if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -879,6 +1081,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } else { struct fib_nh *nh = fi->fib_nh; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, AF_INET, cfg, + &lwtstate); + if (err) + goto failure; + + nh->nh_lwtstate = lwtstate_get(lwtstate); + } nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; @@ -924,24 +1142,29 @@ struct 
fib_info *fib_create_info(struct fib_config *cfg) if (!nh->nh_dev) goto failure; } else { + int linkdown = 0; + change_nexthops(fi) { err = fib_check_nh(cfg, fi, nexthop_nh); if (err != 0) goto failure; + if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + linkdown++; } endfor_nexthops(fi) + if (linkdown == fi->fib_nhs) + fi->fib_flags |= RTNH_F_LINKDOWN; } - if (fi->fib_prefsrc) { - if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || - fi->fib_prefsrc != cfg->fc_dst) - if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL) - goto err_inval; - } + if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) + goto err_inval; change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) + fib_rebalance(fi); + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -1027,17 +1250,27 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { + struct in_device *in_dev; + if (fi->fib_nh->nh_gw && nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) goto nla_put_failure; if (fi->fib_nh->nh_oif && nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) goto nla_put_failure; + if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtm->rtm_flags |= RTNH_F_DEAD; + } #ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid && nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) goto nla_put_failure; #endif + if (fi->fib_nh->nh_lwtstate) + lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate); } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { @@ -1049,11 +1282,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; for_nexthops(fi) { + struct in_device *in_dev; + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_flags = nh->nh_flags 
& 0xFF; + if (nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rtnl(nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtnh->rtnh_flags |= RTNH_F_DEAD; + } rtnh->rtnh_hops = nh->nh_weight - 1; rtnh->rtnh_ifindex = nh->nh_oif; @@ -1065,6 +1306,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif + if (nh->nh_lwtstate) + lwtunnel_fill_encap(skb, nh->nh_lwtstate); /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; } endfor_nexthops(fi); @@ -1107,7 +1350,13 @@ int fib_sync_down_addr(struct net *net, __be32 local) return ret; } -int fib_sync_down_dev(struct net_device *dev, int force) +/* Event force Flags Description + * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host + * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host + * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed + * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed + */ +int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { int ret = 0; int scope = RT_SCOPE_NOWHERE; @@ -1133,49 +1382,79 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { - nexthop_nh->nh_flags |= RTNH_F_DEAD; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nexthop_nh->nh_power; - nexthop_nh->nh_power = 0; - spin_unlock_bh(&fib_multipath_lock); -#endif + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_nh->nh_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; + break; + } dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nexthop_nh->nh_dev == dev) { + if (event == NETDEV_UNREGISTER && + nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { - 
fi->fib_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + fi->fib_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + fi->fib_flags |= RTNH_F_LINKDOWN; + break; + } ret++; } + + fib_rebalance(fi); } return ret; } /* Must be invoked inside of an RCU protected region. */ -void fib_select_default(struct fib_result *res) +void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; struct hlist_head *fa_head = res->fa_head; struct fib_table *tb = res->table; + u8 slen = 32 - res->prefixlen; int order = -1, last_idx = -1; - struct fib_alias *fa; + struct fib_alias *fa, *fa1 = NULL; + u32 last_prio = res->fi->fib_priority; + u8 last_tos = 0; hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; + if (fa->fa_slen != slen) + continue; + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fa->tb_id != tb->tb_id) + continue; + if (next_fi->fib_priority > last_prio && + fa->fa_tos == last_tos) { + if (last_tos) + continue; + break; + } + if (next_fi->fib_flags & RTNH_F_DEAD) + continue; + last_tos = fa->fa_tos; + last_prio = next_fi->fib_priority; + if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; @@ -1185,10 +1464,11 @@ void fib_select_default(struct fib_result *res) if (!fi) { if (next_fi != res->fi) break; + fa1 = fa; } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { + &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); - tb->tb_default = order; + fa1->fa_default = order; goto out; } fi = next_fi; @@ -1196,31 +1476,30 @@ void fib_select_default(struct fib_result *res) } if (order <= 0 || !fi) { - tb->tb_default = -1; + if (fa1) + fa1->fa_default = -1; goto out; } if 
(!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { + fa1->fa_default)) { fib_result_assign(res, fi); - tb->tb_default = order; + fa1->fa_default = order; goto out; } if (last_idx >= 0) fib_result_assign(res, last_resort); - tb->tb_default = last_idx; + fa1->fa_default = last_idx; out: return; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev) +int fib_sync_up(struct net_device *dev, unsigned int nh_flags) { struct fib_info *prev_fi; unsigned int hash; @@ -1231,6 +1510,13 @@ int fib_sync_up(struct net_device *dev) if (!(dev->flags & IFF_UP)) return 0; + if (nh_flags & RTNH_F_DEAD) { + unsigned int flags = dev_get_flags(dev); + + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + nh_flags |= RTNH_F_LINKDOWN; + } + prev_fi = NULL; hash = fib_devindex_hashfn(dev->ifindex); head = &fib_info_devhash[hash]; @@ -1247,7 +1533,7 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & nh_flags)) { alive++; continue; } @@ -1258,71 +1544,57 @@ int fib_sync_up(struct net_device *dev) !__in_dev_get_rtnl(dev)) continue; alive++; - spin_lock_bh(&fib_multipath_lock); - nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~RTNH_F_DEAD; - spin_unlock_bh(&fib_multipath_lock); + nexthop_nh->nh_flags &= ~nh_flags; } endfor_nexthops(fi) if (alive > 0) { - fi->fib_flags &= ~RTNH_F_DEAD; + fi->fib_flags &= ~nh_flags; ret++; } + + fib_rebalance(fi); } return ret; } -/* - * The algorithm is suboptimal, but it provides really - * fair weighted route distribution. 
- */ -void fib_select_multipath(struct fib_result *res) +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +void fib_select_multipath(struct fib_result *res, int hash) { struct fib_info *fi = res->fi; - int w; - spin_lock_bh(&fib_multipath_lock); - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } - } endfor_nexthops(fi); - fi->fib_power = power; - if (power <= 0) { - spin_unlock_bh(&fib_multipath_lock); - /* Race condition: route has just become dead. */ - res->nh_sel = 0; - return; - } - } - - - /* w should be random number [0..fi->fib_power-1], - * it is pretty bad approximation. - */ - - w = jiffies % fi->fib_power; + for_nexthops(fi) { + if (hash > atomic_read(&nh->nh_upper_bound)) + continue; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && - nexthop_nh->nh_power) { - w -= nexthop_nh->nh_power; - if (w <= 0) { - nexthop_nh->nh_power--; - fi->fib_power--; - res->nh_sel = nhsel; - spin_unlock_bh(&fib_multipath_lock); - return; - } - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); /* Race condition: route has just become dead. 
*/ res->nh_sel = 0; - spin_unlock_bh(&fib_multipath_lock); } #endif + +void fib_select_path(struct net *net, struct fib_result *res, + struct flowi4 *fl4, int mp_hash) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (mp_hash < 0) + mp_hash = get_hash_from_flowi4(fl4) >> 1; + + fib_select_multipath(res, mp_hash); + } + else +#endif + if (!res->prefixlen && + res->table->tb_num_default > 1 && + res->type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(fl4, res); + + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, *res); +} +EXPORT_SYMBOL_GPL(fib_select_path); diff --git a/kernel/net/ipv4/fib_trie.c b/kernel/net/ipv4/fib_trie.c index 0ca933db1..744e5936c 100644 --- a/kernel/net/ipv4/fib_trie.c +++ b/kernel/net/ipv4/fib_trie.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,7 @@ #include #include #include +#include #include "fib_lookup.h" #define MAX_STAT_DEPTH 32 @@ -324,13 +326,15 @@ static inline void empty_child_dec(struct key_vector *n) static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { - struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); - struct key_vector *l = kv->kv; + struct key_vector *l; + struct tnode *kv; + kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ + l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; @@ -345,24 +349,26 @@ static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) static struct key_vector *tnode_new(t_key key, int pos, int bits) { - struct tnode *tnode = tnode_alloc(bits); unsigned int shift = pos + bits; - struct key_vector *tn = tnode->kv; + struct key_vector *tn; + struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); - pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), - sizeof(struct key_vector *) << bits); - + tnode = tnode_alloc(bits); if (!tnode) return 
NULL; + pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), + sizeof(struct key_vector *) << bits); + if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; + tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; @@ -1077,6 +1083,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; + unsigned int nlflags = 0; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; @@ -1165,14 +1172,15 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; + new_fa->fa_default = -1; - err = netdev_switch_fib_ipv4_add(key, plen, fi, - new_fa->fa_tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, + new_fa->fa_tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + switchdev_fib_ipv4_abort(fi); kmem_cache_free(fn_alias_kmem, new_fa); goto out; } @@ -1196,7 +1204,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa_match) goto out; - if (!(cfg->fc_nlflags & NLM_F_APPEND)) + if (cfg->fc_nlflags & NLM_F_APPEND) + nlflags = NLM_F_APPEND; + else fa = fa_first; } err = -ENOENT; @@ -1214,14 +1224,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_state = 0; new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; + new_fa->fa_default = -1; /* (Optionally) offload fib entry to switch hardware. 
*/ - err = netdev_switch_fib_ipv4_add(key, plen, fi, tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type, + cfg->fc_nlflags, tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + switchdev_fib_ipv4_abort(fi); goto out_free_new_fa; } @@ -1235,12 +1244,12 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, - &cfg->fc_nlinfo, 0); + &cfg->fc_nlinfo, nlflags); succeeded: return 0; out_sw_fib_del: - netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1270,6 +1279,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, unsigned long index; t_key cindex; + trace_fib_table_lookup(tb->tb_id, flp); + pn = t->kv; cindex = 0; @@ -1406,11 +1417,20 @@ found: continue; for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { const struct fib_nh *nh = &fi->fib_nh[nhsel]; + struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev); if (nh->nh_flags & RTNH_F_DEAD) continue; - if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN && + !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; + if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { + if (flp->flowi4_oif && + flp->flowi4_oif != nh->nh_oif) + continue; + } if (!(fib_flags & FIB_LOOKUP_NOREF)) atomic_inc(&fi->fib_clntref); @@ -1425,6 +1445,8 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif + trace_fib_table_lookup_nh(nh); + return err; } } @@ -1518,8 +1540,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; - netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, - 
cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, + cfg->fc_type, tb->tb_id); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1547,7 +1569,7 @@ static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) do { /* record parent and next child index */ pn = n; - cindex = key ? get_index(key, pn) : 0; + cindex = (key > pn->key) ? get_index(key, pn) : 0; if (cindex >> pn->bits) break; @@ -1768,10 +1790,9 @@ void fib_table_flush_external(struct fib_table *tb) if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD)) continue; - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); } /* update leaf slen */ @@ -1834,10 +1855,9 @@ int fib_table_flush(struct fib_table *tb) continue; } - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -1976,7 +1996,6 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) return NULL; tb->tb_id = id; - tb->tb_default = -1; tb->tb_num_default = 0; tb->tb_data = (alias ? 
alias->__data : tb->__data); @@ -2053,11 +2072,12 @@ static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct key_vector *n, *pn = t->kv; + struct key_vector *n, *pn; if (!t) return NULL; + pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; diff --git a/kernel/net/ipv4/fou.c b/kernel/net/ipv4/fou.c index 34968cd5c..bd903fe0f 100644 --- a/kernel/net/ipv4/fou.c +++ b/kernel/net/ipv4/fou.c @@ -24,6 +24,7 @@ struct fou { u16 type; struct udp_offload udp_offloads; struct list_head list; + struct rcu_head rcu; }; #define FOU_F_REMCSUM_NOPARTIAL BIT(0) @@ -79,7 +80,11 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + size_t plen = sizeof(struct udphdr) + hdrlen + + max_t(size_t, offset + sizeof(u16), start); + + if (skb->remcsum_offload) + return guehdr; if (!pskb_may_pull(skb, plen)) return NULL; @@ -221,29 +226,21 @@ out_unlock: static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, - size_t hdrlen, u8 ipproto, - struct gro_remcsum *grc, bool nopartial) + size_t hdrlen, struct gro_remcsum *grc, + bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); if (skb->remcsum_offload) - return NULL; + return guehdr; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; - /* Pull checksum that will be written */ - if (skb_gro_header_hard(skb, off + plen)) { - guehdr = skb_gro_header_slow(skb, off + plen, off); - if (!guehdr) - return NULL; - } - - skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, - start, offset, grc, nopartial); + guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen, + start, 
offset, grc, nopartial); skb->remcsum_offload = 1; @@ -307,10 +304,10 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, - data + doffset, hdrlen, - guehdr->proto_ctype, &grc, + data + doffset, hdrlen, &grc, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); + if (!guehdr) goto out; @@ -351,7 +348,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[guehdr->proto_ctype]); - if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); @@ -421,7 +418,7 @@ static void fou_release(struct fou *fou) list_del(&fou->list); udp_tunnel_sock_release(sock); - kfree(fou); + kfree_rcu(fou, rcu); } static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) @@ -570,7 +567,7 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_AF]) { u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); - if (family != AF_INET && family != AF_INET6) + if (family != AF_INET) return -EINVAL; cfg->udp_config.family = family; diff --git a/kernel/net/ipv4/geneve.c b/kernel/net/ipv4/geneve.c deleted file mode 100644 index 8986e63f3..000000000 --- a/kernel/net/ipv4/geneve.c +++ /dev/null @@ -1,453 +0,0 @@ -/* - * Geneve: Generic Network Virtualization Encapsulation - * - * Copyright (c) 2014 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if IS_ENABLED(CONFIG_IPV6) -#include -#include -#include -#include -#endif - -/* Protects sock_list and refcounts. */ -static DEFINE_MUTEX(geneve_mutex); - -/* per-network namespace private data for this module */ -struct geneve_net { - struct list_head sock_list; -}; - -static int geneve_net_id; - -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - -static struct geneve_sock *geneve_find_sock(struct net *net, - sa_family_t family, __be16 port) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - - list_for_each_entry(gs, &gn->sock_list, list) { - if (inet_sk(gs->sock->sk)->inet_sport == port && - inet_sk(gs->sock->sk)->sk.sk_family == family) - return gs; - } - - return NULL; -} - -static void geneve_build_header(struct genevehdr *geneveh, - __be16 tun_flags, u8 vni[3], - u8 options_len, u8 *options) -{ - geneveh->ver = GENEVE_VER; - geneveh->opt_len = options_len / 4; - geneveh->oam = !!(tun_flags & TUNNEL_OAM); - geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); - geneveh->rsvd1 = 0; - memcpy(geneveh->vni, vni, 3); - geneveh->proto_type = htons(ETH_P_TEB); - geneveh->rsvd2 = 0; - - memcpy(geneveh->options, options, options_len); -} - -/* Transmit a fully formatted Geneve frame. - * - * When calling this function. The skb->data should point - * to the geneve header which is fully formed. - * - * This function will add other UDP tunnel headers. 
- */ -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) -{ - struct genevehdr *gnvh; - int min_headroom; - int err; - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) - return -ENOMEM; - - skb = udp_tunnel_handle_offloads(skb, csum); - if (IS_ERR(skb)) - return PTR_ERR(skb); - - gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); - - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - - return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, - tos, ttl, df, src_port, dst_port, xnet, - !csum); -} -EXPORT_SYMBOL_GPL(geneve_xmit_skb); - -static int geneve_hlen(struct genevehdr *gh) -{ - return sizeof(*gh) + gh->opt_len * 4; -} - -static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) -{ - struct sk_buff *p, **pp = NULL; - struct genevehdr *gh, *gh2; - unsigned int hlen, gh_len, off_gnv; - const struct packet_offload *ptype; - __be16 type; - int flush = 1; - - off_gnv = skb_gro_offset(skb); - hlen = off_gnv + sizeof(*gh); - gh = skb_gro_header_fast(skb, off_gnv); - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - if (gh->ver != GENEVE_VER || gh->oam) - goto out; - gh_len = geneve_hlen(gh); - - hlen = off_gnv + gh_len; - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - flush = 0; - - for (p = *head; 
p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - gh2 = (struct genevehdr *)(p->data + off_gnv); - if (gh->opt_len != gh2->opt_len || - memcmp(gh, gh2, gh_len)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - } - - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, gh_len); - skb_gro_postpull_rcsum(skb, gh, gh_len); - pp = ptype->callbacks.gro_receive(head, skb); - -out_unlock: - rcu_read_unlock(); -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -static int geneve_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) -{ - struct genevehdr *gh; - struct packet_offload *ptype; - __be16 type; - int gh_len; - int err = -ENOSYS; - - udp_tunnel_gro_complete(skb, nhoff); - - gh = (struct genevehdr *)(skb->data + nhoff); - gh_len = geneve_hlen(gh); - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); - - rcu_read_unlock(); - return err; -} - -static void geneve_notify_add_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - int err; - - if (sa_family == AF_INET) { - err = udp_add_offload(&gs->udp_offloads); - if (err) - pr_warn("geneve: udp_add_offload failed with status %d\n", - err); - } -} - -static void geneve_notify_del_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - - if (sa_family == AF_INET) - udp_del_offload(&gs->udp_offloads); -} - -/* Callback from net/ipv4/udp.c to receive packets */ -static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) -{ - struct genevehdr *geneveh; - struct geneve_sock *gs; - int opts_len; - - /* Need Geneve and inner Ethernet header to be present */ - if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) - goto error; - - /* Return 
packets with reserved bits set */ - geneveh = geneve_hdr(skb); - - if (unlikely(geneveh->ver != GENEVE_VER)) - goto error; - - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) - goto error; - - opts_len = geneveh->opt_len * 4; - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB))) - goto drop; - - gs = rcu_dereference_sk_user_data(sk); - if (!gs) - goto drop; - - gs->rcv(gs, skb); - return 0; - -drop: - /* Consume bad packet */ - kfree_skb(skb); - return 0; - -error: - /* Let the UDP layer deal with the skb */ - return 1; -} - -static struct socket *geneve_create_sock(struct net *net, bool ipv6, - __be16 port) -{ - struct socket *sock; - struct udp_port_cfg udp_conf; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - if (ipv6) { - udp_conf.family = AF_INET6; - } else { - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - } - - udp_conf.local_udp_port = port; - - /* Open UDP socket */ - err = udp_sock_create(net, &udp_conf, &sock); - if (err < 0) - return ERR_PTR(err); - - return sock; -} - -/* Create new listen socket if needed */ -static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool ipv6) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - struct socket *sock; - struct udp_tunnel_sock_cfg tunnel_cfg; - - gs = kzalloc(sizeof(*gs), GFP_KERNEL); - if (!gs) - return ERR_PTR(-ENOMEM); - - sock = geneve_create_sock(net, ipv6, port); - if (IS_ERR(sock)) { - kfree(gs); - return ERR_CAST(sock); - } - - gs->sock = sock; - gs->refcnt = 1; - gs->rcv = rcv; - gs->rcv_data = data; - - /* Initialize the geneve udp offloads structure */ - gs->udp_offloads.port = port; - gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; - gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; - geneve_notify_add_rx_port(gs); - - /* Mark socket as an encapsulation socket */ - tunnel_cfg.sk_user_data = gs; - 
tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = geneve_udp_encap_recv; - tunnel_cfg.encap_destroy = NULL; - setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - - list_add(&gs->list, &gn->sock_list); - - return gs; -} - -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6) -{ - struct geneve_sock *gs; - - mutex_lock(&geneve_mutex); - - gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); - if (gs) { - if (!no_share && gs->rcv == rcv) - gs->refcnt++; - else - gs = ERR_PTR(-EBUSY); - } else { - gs = geneve_socket_create(net, port, rcv, data, ipv6); - } - - mutex_unlock(&geneve_mutex); - - return gs; -} -EXPORT_SYMBOL_GPL(geneve_sock_add); - -void geneve_sock_release(struct geneve_sock *gs) -{ - mutex_lock(&geneve_mutex); - - if (--gs->refcnt) - goto unlock; - - list_del(&gs->list); - geneve_notify_del_rx_port(gs); - udp_tunnel_sock_release(gs->sock); - kfree_rcu(gs, rcu); - -unlock: - mutex_unlock(&geneve_mutex); -} -EXPORT_SYMBOL_GPL(geneve_sock_release); - -static __net_init int geneve_init_net(struct net *net) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - - INIT_LIST_HEAD(&gn->sock_list); - - return 0; -} - -static struct pernet_operations geneve_net_ops = { - .init = geneve_init_net, - .id = &geneve_net_id, - .size = sizeof(struct geneve_net), -}; - -static int __init geneve_init_module(void) -{ - int rc; - - rc = register_pernet_subsys(&geneve_net_ops); - if (rc) - return rc; - - pr_info("Geneve driver\n"); - - return 0; -} -module_init(geneve_init_module); - -static void __exit geneve_cleanup_module(void) -{ - unregister_pernet_subsys(&geneve_net_ops); -} -module_exit(geneve_cleanup_module); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jesse Gross "); -MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); -MODULE_ALIAS_RTNL_LINK("geneve"); diff --git a/kernel/net/ipv4/gre_demux.c b/kernel/net/ipv4/gre_demux.c index 4a7b5b2a1..d9c552a72 100644 --- 
a/kernel/net/ipv4/gre_demux.c +++ b/kernel/net/ipv4/gre_demux.c @@ -31,7 +31,6 @@ #include static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; -static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX]; int gre_add_protocol(const struct gre_protocol *proto, u8 version) { @@ -61,197 +60,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); -void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len) -{ - struct gre_base_hdr *greh; - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - greh = (struct gre_base_hdr *)skb->data; - greh->flags = tnl_flags_to_gre_flags(tpi->flags); - greh->protocol = tpi->proto; - - if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - - if (tpi->flags&TUNNEL_SEQ) { - *ptr = tpi->seq; - ptr--; - } - if (tpi->flags&TUNNEL_KEY) { - *ptr = tpi->key; - ptr--; - } - if (tpi->flags&TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & - (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { - *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); - } - } -} -EXPORT_SYMBOL_GPL(gre_build_header); - -static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, - bool *csum_err) -{ - const struct gre_base_hdr *greh; - __be32 *options; - int hdr_len; - - if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) - return -EINVAL; - - tpi->flags = gre_flags_to_tnl_flags(greh->flags); - hdr_len = ip_gre_calc_hlen(tpi->flags); - - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - tpi->proto = greh->protocol; - - options = (__be32 *)(greh + 1); - if (greh->flags & GRE_CSUM) { - if (skb_checksum_simple_validate(skb)) { - *csum_err = true; 
- return -EINVAL; - } - - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, - null_compute_pseudo); - - options++; - } - - if (greh->flags & GRE_KEY) { - tpi->key = *options; - options++; - } else - tpi->key = 0; - - if (unlikely(greh->flags & GRE_SEQ)) { - tpi->seq = *options; - options++; - } else - tpi->seq = 0; - - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IP - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { - tpi->proto = htons(ETH_P_IP); - if ((*(u8 *)options & 0xF0) != 0x40) { - hdr_len += 4; - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - } - } - - return iptunnel_pull_header(skb, hdr_len, tpi->proto); -} - -static int gre_cisco_rcv(struct sk_buff *skb) -{ - struct tnl_ptk_info tpi; - int i; - bool csum_err = false; - -#ifdef CONFIG_NET_IPGRE_BROADCAST - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { - /* Looped back packet, drop it! */ - if (rt_is_output_route(skb_rtable(skb))) - goto drop; - } -#endif - - if (parse_gre_header(skb, &tpi, &csum_err) < 0) - goto drop; - - rcu_read_lock(); - for (i = 0; i < GRE_IP_PROTO_MAX; i++) { - struct gre_cisco_protocol *proto; - int ret; - - proto = rcu_dereference(gre_cisco_proto_list[i]); - if (!proto) - continue; - ret = proto->handler(skb, &tpi); - if (ret == PACKET_RCVD) { - rcu_read_unlock(); - return 0; - } - } - rcu_read_unlock(); - - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); -drop: - kfree_skb(skb); - return 0; -} - -static void gre_cisco_err(struct sk_buff *skb, u32 info) -{ - /* All the routers (except for Linux) return only - * 8 bytes of packet payload. It means, that precise relaying of - * ICMP in the real Internet is absolutely infeasible. - * - * Moreover, Cisco "wise men" put GRE key to the third word - * in GRE header. It makes impossible maintaining even soft - * state for keyed - * GRE tunnels with enabled checksum. Tell them "thank you". 
- * - * Well, I wonder, rfc1812 was written by Cisco employee, - * what the hell these idiots break standards established - * by themselves??? - */ - - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - struct tnl_ptk_info tpi; - bool csum_err = false; - int i; - - if (parse_gre_header(skb, &tpi, &csum_err)) { - if (!csum_err) /* ignore csum errors. */ - return; - } - - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - ipv4_update_pmtu(skb, dev_net(skb->dev), info, - skb->dev->ifindex, 0, IPPROTO_GRE, 0); - return; - } - if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, - IPPROTO_GRE, 0); - return; - } - - rcu_read_lock(); - for (i = 0; i < GRE_IP_PROTO_MAX; i++) { - struct gre_cisco_protocol *proto; - - proto = rcu_dereference(gre_cisco_proto_list[i]); - if (!proto) - continue; - - if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD) - goto out; - - } -out: - rcu_read_unlock(); -} - static int gre_rcv(struct sk_buff *skb) { const struct gre_protocol *proto; @@ -302,60 +110,19 @@ static const struct net_protocol net_gre_protocol = { .netns_ok = 1, }; -static const struct gre_protocol ipgre_protocol = { - .handler = gre_cisco_rcv, - .err_handler = gre_cisco_err, -}; - -int gre_cisco_register(struct gre_cisco_protocol *newp) -{ - struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) - &gre_cisco_proto_list[newp->priority]; - - return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY; -} -EXPORT_SYMBOL_GPL(gre_cisco_register); - -int gre_cisco_unregister(struct gre_cisco_protocol *del_proto) -{ - struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) - &gre_cisco_proto_list[del_proto->priority]; - int ret; - - ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 
0 : -EINVAL; - - if (ret) - return ret; - - synchronize_net(); - return 0; -} -EXPORT_SYMBOL_GPL(gre_cisco_unregister); - static int __init gre_init(void) { pr_info("GRE over IPv4 demultiplexor driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); - goto err; - } - - if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) { - pr_info("%s: can't add ipgre handler\n", __func__); - goto err_gre; + return -EAGAIN; } - return 0; -err_gre: - inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); -err: - return -EAGAIN; } static void __exit gre_exit(void) { - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } diff --git a/kernel/net/ipv4/gre_offload.c b/kernel/net/ipv4/gre_offload.c index 5aa46d4b4..5a8ee3282 100644 --- a/kernel/net/ipv4/gre_offload.c +++ b/kernel/net/ipv4/gre_offload.c @@ -36,7 +36,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP))) + SKB_GSO_IPIP | + SKB_GSO_SIT))) goto out; if (!skb->encapsulation) diff --git a/kernel/net/ipv4/icmp.c b/kernel/net/ipv4/icmp.c index be5fd9b81..74314d95d 100644 --- a/kernel/net/ipv4/icmp.c +++ b/kernel/net/ipv4/icmp.c @@ -97,6 +97,7 @@ #include #include #include +#include /* * Build xmit assembly blocks @@ -309,9 +310,10 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, rc = false; if (icmp_global_allow()) { + int vif = l3mdev_master_ifindex(dst->dev); struct inet_peer *peer; - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); if (peer) @@ -426,6 +428,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; + fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); 
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -438,6 +441,22 @@ out_unlock: icmp_xmit_unlock(sk); } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* Source and destination is swapped. See ip_multipath_icmp_hash */ +static int icmp_multipath_hash_skb(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + return fib_multipath_hash(iph->daddr, iph->saddr); +} + +#else + +#define icmp_multipath_hash_skb(skb) (-1) + +#endif + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -459,8 +478,11 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; + fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev); + security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key_hash(net, fl4, + icmp_multipath_hash_skb(skb_in)); if (IS_ERR(rt)) return rt; @@ -481,7 +503,8 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { + if (inet_addr_type_dev_table(net, skb_in->dev, + fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) err = PTR_ERR(rt2); @@ -497,6 +520,7 @@ static struct rtable *icmp_route_lookup(struct net *net, } /* Ugh! 
*/ orefdst = skb_in->_skb_refdst; /* save old refdst */ + skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, RT_TOS(tos), rt2->dst.dev); @@ -829,7 +853,7 @@ static bool icmp_unreach(struct sk_buff *skb) */ if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && - inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { + inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) { net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", &ip_hdr(skb)->saddr, icmph->type, icmph->code, diff --git a/kernel/net/ipv4/igmp.c b/kernel/net/ipv4/igmp.c index a3a697f5f..05e4cba14 100644 --- a/kernel/net/ipv4/igmp.c +++ b/kernel/net/ipv4/igmp.c @@ -110,6 +110,9 @@ #define IP_MAX_MEMBERSHIPS 20 #define IP_MAX_MSF 10 +/* IGMP reports for link-local multicast groups are enabled by default */ +int sysctl_igmp_llm_reports __read_mostly = 1; + #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -394,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb) pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return ip_local_out(skb); + return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -437,6 +440,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; + if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) + return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; @@ -545,6 +550,9 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(pmc->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; @@ -678,7 +686,11 
@@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - else if (type == IGMP_HOST_LEAVE_MESSAGE) + + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return 0; + + if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; @@ -727,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - return ip_local_out(skb); + return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(unsigned long data) @@ -851,6 +863,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) if (group == IGMP_ALL_HOSTS) return false; + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { @@ -957,6 +971,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; @@ -1181,6 +1198,8 @@ static void igmp_group_dropped(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; reporter = im->reporter; igmp_stop_timer(im); @@ -1213,6 +1232,8 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; if (in_dev->dead) return; @@ -1339,6 +1360,171 @@ out: } EXPORT_SYMBOL(ip_mc_inc_group); +static int ip_mc_check_iphdr(struct sk_buff *skb) +{ + const struct iphdr *iph; + unsigned int len; + unsigned int offset = 
skb_network_offset(skb) + sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) + return -EINVAL; + + offset += ip_hdrlen(skb) - sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + return -EINVAL; + + len = skb_network_offset(skb) + ntohs(iph->tot_len); + if (skb->len < len || len < offset) + return -EINVAL; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmpv3_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ip_mc_check_igmp_query(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmphdr); + if (skb->len < len) + return -EINVAL; + + /* IGMPv{1,2}? */ + if (skb->len != len) { + /* or IGMPv3? 
*/ + len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer + * all-systems destination addresses (224.0.0.1) for general queries + */ + if (!igmp_hdr(skb)->group && + ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) + return -EINVAL; + + return 0; +} + +static int ip_mc_check_igmp_msg(struct sk_buff *skb) +{ + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + /* fall through */ + return 0; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return ip_mc_check_igmp_reportv3(skb); + case IGMP_HOST_MEMBERSHIP_QUERY: + return ip_mc_check_igmp_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_simple_validate(skb); +} + +static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); + int ret = -EINVAL; + + transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + + skb_chk = skb_checksum_trimmed(skb, transport_len, + ip_mc_validate_checksum); + if (!skb_chk) + goto err; + + if (!pskb_may_pull(skb_chk, len)) + goto err; + + ret = ip_mc_check_igmp_msg(skb_chk); + if (ret) + goto err; + + if (skb_trimmed) + *skb_trimmed = skb_chk; + /* free now unneeded clone */ + else if (skb_chk != skb) + kfree_skb(skb_chk); + + ret = 0; + +err: + if (ret && skb_chk && skb_chk != skb) + kfree_skb(skb_chk); + + return ret; +} + +/** + * ip_mc_check_igmp - checks whether this is a sane IGMP packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) + * + * Checks whether an IPv4 packet is a valid IGMP packet. 
If so sets + * skb transport header accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an IGMP packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * Caller needs to set the skb network header and free any returned skb if it + * differs from the provided skb. + */ +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret = ip_mc_check_iphdr(skb); + + if (ret < 0) + return ret; + + if (ip_hdr(skb)->protocol != IPPROTO_IGMP) + return -ENOMSG; + + return __ip_mc_check_igmp(skb, skb_trimmed); +} +EXPORT_SYMBOL(ip_mc_check_igmp); + /* * Resend IGMP JOIN report; used by netdev notifier. 
*/ @@ -1353,6 +1539,9 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; /* a failover is happening and switches * must be notified immediately @@ -1937,7 +2126,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); - if (!in_dev) { + if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) { ret = -ENODEV; goto out; } @@ -1958,7 +2147,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; - ip_mc_dec_group(in_dev, group); + if (in_dev) + ip_mc_dec_group(in_dev, group); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); @@ -2203,11 +2393,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct ip_sf_socklist *psl; struct net *net = sock_net(sk); + ASSERT_RTNL(); + if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); - imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; @@ -2228,7 +2418,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, goto done; msf->imsf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); - rtnl_unlock(); if (!psl) { len = 0; count = 0; @@ -2247,7 +2436,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, return -EFAULT; return 0; done: - rtnl_unlock(); return err; } @@ -2261,6 +2449,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; + ASSERT_RTNL(); + psin = (struct sockaddr_in *)&gsf->gf_group; if (psin->sin_family != AF_INET) return -EINVAL; @@ -2268,8 +2458,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); - err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) 
{ @@ -2281,7 +2469,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, goto done; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); - rtnl_unlock(); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; @@ -2301,7 +2488,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, } return 0; done: - rtnl_unlock(); return err; } @@ -2380,7 +2566,7 @@ void ip_mc_drop_socket(struct sock *sk) } /* called with rcu_read_lock() */ -int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) +int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto) { struct ip_mc_list *im; struct ip_mc_list __rcu **mc_hash; diff --git a/kernel/net/ipv4/inet_connection_sock.c b/kernel/net/ipv4/inet_connection_sock.c index b27fc401c..641489148 100644 --- a/kernel/net/ipv4/inet_connection_sock.c +++ b/kernel/net/ipv4/inet_connection_sock.c @@ -99,6 +99,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct net *net = sock_net(sk); int smallest_size = -1, smallest_rover; kuid_t uid = sock_i_uid(sk); + int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 
1 : 0; local_bh_disable(); if (!snum) { @@ -106,6 +107,14 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) again: inet_get_local_port_range(net, &low, &high); + if (attempt_half) { + int half = low + ((high - low) >> 1); + + if (attempt_half == 1) + high = half; + else + low = half; + } remaining = (high - low) + 1; smallest_rover = rover = prandom_u32() % remaining + low; @@ -127,11 +136,6 @@ again: (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; smallest_rover = rover; - if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && - !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { - snum = smallest_rover; - goto tb_found; - } } if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { snum = rover; @@ -159,6 +163,11 @@ again: snum = smallest_rover; goto have_snum; } + if (attempt_half == 1) { + /* OK we now try the upper half of the range */ + attempt_half = 2; + goto again; + } goto fail; } /* OK, here is the one we will use. HEAD is @@ -321,14 +330,12 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) if (error) goto out_err; } - req = reqsk_queue_remove(queue); + req = reqsk_queue_remove(queue, sk); newsk = req->sk; - sk_acceptq_removed(sk); if (sk->sk_protocol == IPPROTO_TCP && - tcp_rsk(req)->tfo_listener && - queue->fastopenq) { - spin_lock_bh(&queue->fastopenq->lock); + tcp_rsk(req)->tfo_listener) { + spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. 
Instead, we set req->sk to @@ -339,7 +346,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) req->sk = NULL; req = NULL; } - spin_unlock_bh(&queue->fastopenq->lock); + spin_unlock_bh(&queue->fastopenq.lock); } out: release_sock(sk); @@ -399,7 +406,7 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) } EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); -struct dst_entry *inet_csk_route_req(struct sock *sk, +struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { @@ -430,7 +437,7 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_req); -struct dst_entry *inet_csk_route_child_sock(struct sock *sk, +struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req) { @@ -469,65 +476,12 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); -} - #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else #define AF_INET_FAMILY(fam) true #endif -/* Note: this is temporary : - * req sock will no longer be in listener hash table -*/ -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - ireq->ir_rmt_addr == raddr && - ireq->ir_loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) 
{ - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet_csk_search_req); - -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk, timeout); -} -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); - /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; @@ -554,7 +508,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, req->num_timeout >= rskq_defer_accept - 1; } -int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) +int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { int err = req->rsk_ops->rtx_syn_ack(parent, req); @@ -564,27 +518,21 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -/* return true if req was found in the syn_table[] */ +/* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { - struct listen_sock *lopt = queue->listen_opt; - struct request_sock **prev; + struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; bool found = false; - spin_lock(&queue->syn_wait_lock); + if (sk_hashed(req_to_sk(req))) { + spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); - for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; - prev = &(*prev)->dl_next) { - if (*prev == req) { - *prev = req->dl_next; - found = true; - break; - } + spin_lock(lock); + found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); 
+ spin_unlock(lock); } - - spin_unlock(&queue->syn_wait_lock); - if (del_timer_sync(&req->rsk_timer)) + if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; } @@ -598,21 +546,25 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); +void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) +{ + inet_csk_reqsk_queue_drop(sk, req); + reqsk_put(req); +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); + static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct listen_sock *lopt = queue->listen_opt; int qlen, expire = 0, resend = 0; int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN || !lopt) { - reqsk_put(req); - return; - } + if (sk_state_load(sk_listener) != TCP_LISTEN) + goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; thresh = max_retries; @@ -633,9 +585,9 @@ static void reqsk_timer_handler(unsigned long data) * embrions; and abort old ones without pity, if old * ones are about to clog our table. 
*/ - qlen = listen_sock_qlen(lopt); - if (qlen >> (lopt->max_qlen_log - 1)) { - int young = listen_sock_young(lopt) << 1; + qlen = reqsk_queue_len(queue); + if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { + int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { if (qlen < young) @@ -657,41 +609,40 @@ static void reqsk_timer_handler(unsigned long data) unsigned long timeo; if (req->num_timeout++ == 0) - atomic_inc(&lopt->young_dec); + atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer_pinned(&req->rsk_timer, jiffies + timeo); return; } - inet_csk_reqsk_queue_drop(sk_listener, req); - reqsk_put(req); +drop: + inet_csk_reqsk_queue_drop_and_put(sk_listener, req); } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout) +static void reqsk_queue_hash_req(struct request_sock *req, + unsigned long timeout) { - struct listen_sock *lopt = queue->listen_opt; - req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; + setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); + mod_timer_pinned(&req->rsk_timer, jiffies + timeout); + + inet_ehash_insert(req_to_sk(req), NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. 
*/ smp_wmb(); - atomic_set(&req->rsk_refcnt, 2); - setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); - req->rsk_hash = hash; - - spin_lock(&queue->syn_wait_lock); - req->dl_next = lopt->syn_table[hash]; - lopt->syn_table[hash] = req; - spin_unlock(&queue->syn_wait_lock); + atomic_set(&req->rsk_refcnt, 2 + 1); +} - mod_timer_pinned(&req->rsk_timer, jiffies + timeout); +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + reqsk_queue_hash_req(req, timeout); + inet_csk_reqsk_queue_added(sk); } -EXPORT_SYMBOL(reqsk_queue_hash_req); +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone @@ -782,16 +733,14 @@ void inet_csk_prepare_forced_close(struct sock *sk) } EXPORT_SYMBOL(inet_csk_prepare_forced_close); -int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) +int inet_csk_listen_start(struct sock *sk, int backlog) { - struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + struct inet_sock *inet = inet_sk(sk); - if (rc != 0) - return rc; + reqsk_queue_alloc(&icsk->icsk_accept_queue); - sk->sk_max_ack_backlog = 0; + sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); @@ -800,7 +749,7 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) * It is OK, because this socket enters to hash table only * after validation is complete. 
*/ - sk->sk_state = TCP_LISTEN; + sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); @@ -811,11 +760,76 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) } sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); +static void inet_child_forget(struct sock *sk, struct request_sock *req, + struct sock *child) +{ + sk->sk_prot->disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + percpu_counter_inc(sk->sk_prot->orphan_count); + + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { + BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(sk != req->rsk_listener); + + /* Paranoid, to prevent race condition if + * an inbound pkt destined for child is + * blocked by sock lock in tcp_v4_rcv(). + * Also to satisfy an assertion in + * tcp_v4_destroy_sock(). + */ + tcp_sk(child)->fastopen_rsk = NULL; + } + inet_csk_destroy_sock(child); + reqsk_put(req); +} + +struct sock *inet_csk_reqsk_queue_add(struct sock *sk, + struct request_sock *req, + struct sock *child) +{ + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + + spin_lock(&queue->rskq_lock); + if (unlikely(sk->sk_state != TCP_LISTEN)) { + inet_child_forget(sk, req, child); + child = NULL; + } else { + req->sk = child; + req->dl_next = NULL; + if (queue->rskq_accept_head == NULL) + queue->rskq_accept_head = req; + else + queue->rskq_accept_tail->dl_next = req; + queue->rskq_accept_tail = req; + sk_acceptq_added(sk); + } + spin_unlock(&queue->rskq_lock); + return child; +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_add); + +struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, + struct request_sock *req, bool own_req) +{ + if (own_req) { + inet_csk_reqsk_queue_drop(sk, req); + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + if (inet_csk_reqsk_queue_add(sk, req, child)) + 
return child; + } + /* Too bad, another child took ownership of the request, undo. */ + bh_unlock_sock(child); + sock_put(child); + return NULL; +} +EXPORT_SYMBOL(inet_csk_complete_hashdance); + /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. @@ -824,11 +838,7 @@ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct request_sock *acc_req; - struct request_sock *req; - - /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(queue); + struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -838,57 +848,34 @@ void inet_csk_listen_stop(struct sock *sk) * To be honest, we are not able to make either * of the variants now. --ANK */ - reqsk_queue_destroy(queue); - - while ((req = acc_req) != NULL) { + while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk; - acc_req = req->dl_next; - local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); - sk->sk_prot->disconnect(child, O_NONBLOCK); - - sock_orphan(child); - - percpu_counter_inc(sk->sk_prot->orphan_count); - - if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { - BUG_ON(tcp_sk(child)->fastopen_rsk != req); - BUG_ON(sk != req->rsk_listener); - - /* Paranoid, to prevent race condition if - * an inbound pkt destined for child is - * blocked by sock lock in tcp_v4_rcv(). - * Also to satisfy an assertion in - * tcp_v4_destroy_sock(). 
- */ - tcp_sk(child)->fastopen_rsk = NULL; - } - inet_csk_destroy_sock(child); - + inet_child_forget(sk, req, child); bh_unlock_sock(child); local_bh_enable(); sock_put(child); - sk_acceptq_removed(sk); - reqsk_put(req); + cond_resched(); } - if (queue->fastopenq) { + if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ - spin_lock_bh(&queue->fastopenq->lock); - acc_req = queue->fastopenq->rskq_rst_head; - queue->fastopenq->rskq_rst_head = NULL; - spin_unlock_bh(&queue->fastopenq->lock); - while ((req = acc_req) != NULL) { - acc_req = req->dl_next; + spin_lock_bh(&queue->fastopenq.lock); + req = queue->fastopenq.rskq_rst_head; + queue->fastopenq.rskq_rst_head = NULL; + spin_unlock_bh(&queue->fastopenq.lock); + while (req != NULL) { + next = req->dl_next; reqsk_put(req); + req = next; } } - WARN_ON(sk->sk_ack_backlog); + WARN_ON_ONCE(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); diff --git a/kernel/net/ipv4/inet_diag.c b/kernel/net/ipv4/inet_diag.c index 4d32262c7..ab9f8a666 100644 --- a/kernel/net/ipv4/inet_diag.c +++ b/kernel/net/ipv4/inet_diag.c @@ -151,6 +151,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (nla_put_u8(skb, INET_DIAG_TCLASS, inet6_sk(sk)->tclass) < 0) goto errout; + + if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && + nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk))) + goto errout; } #endif @@ -200,9 +204,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #undef EXPIRES_IN_MS - if (ext & (1 << (INET_DIAG_INFO - 1))) { + if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { attr = nla_reserve(skb, INET_DIAG_INFO, - sizeof(struct tcp_info)); + handler->idiag_info_size); if (!attr) goto errout; @@ -726,91 +730,21 @@ static void twsk_build_assert(void) #endif } -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const 
struct nlattr *bc) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_entry entry; - int j, s_j, reqnum, s_reqnum; - struct listen_sock *lopt; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !listen_sock_qlen(lopt)) - goto out; - - if (bc) { - entry.sport = inet->inet_num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.idiag_dport != ireq->ir_rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - /* Note: entry.sport and entry.userlocks are already set */ - entry_fill_addrs(&entry, req_to_sk(req)); - entry.dport = ntohs(ireq->ir_rmt_port); - - if (!inet_diag_bc_run(bc, &entry)) - continue; - } - - err = inet_req_diag_fill(req_to_sk(req), skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; + u32 idiag_states = r->idiag_states; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & TCPF_LISTEN)) goto skip_listen_ht; for (i = s_i; i 
< INET_LHTABLE_SIZE; i++) { @@ -840,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || + if (r->id.idiag_dport || cb->args[3] > 0) - goto syn_recv; - - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { - spin_unlock_bh(&ilb->lock); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -875,7 +799,7 @@ skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { @@ -902,7 +826,7 @@ skip_listen_ht: goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_substate : sk->sk_state; - if (!(r->idiag_states & (1 << state))) + if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) @@ -1078,14 +1002,62 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) return inet_diag_get_exact(skb, h, nlmsg_data(h)); } +static +int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) +{ + const struct inet_diag_handler *handler; + struct nlmsghdr *nlh; + struct nlattr *attr; + struct inet_diag_msg *r; + void *info = NULL; + int err = 0; + + nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0); + if (!nlh) + return -ENOMEM; + + r = nlmsg_data(nlh); + memset(r, 0, sizeof(*r)); + inet_diag_msg_common_fill(r, sk); + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM) + r->id.idiag_sport = inet_sk(sk)->inet_sport; + r->idiag_state = sk->sk_state; + + if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { + nlmsg_cancel(skb, nlh); + return err; + } + + handler = 
inet_diag_lock_handler(sk->sk_protocol); + if (IS_ERR(handler)) { + inet_diag_unlock_handler(handler); + nlmsg_cancel(skb, nlh); + return PTR_ERR(handler); + } + + attr = handler->idiag_info_size + ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size) + : NULL; + if (attr) + info = nla_data(attr); + + handler->idiag_get_info(sk, r, info); + inet_diag_unlock_handler(handler); + + nlmsg_end(skb, nlh); + return 0; +} + static const struct sock_diag_handler inet_diag_handler = { .family = AF_INET, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; static const struct sock_diag_handler inet6_diag_handler = { .family = AF_INET6, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; int inet_diag_register(const struct inet_diag_handler *h) diff --git a/kernel/net/ipv4/inet_fragment.c b/kernel/net/ipv4/inet_fragment.c index 5e346a082..fe144dae7 100644 --- a/kernel/net/ipv4/inet_fragment.c +++ b/kernel/net/ipv4/inet_fragment.c @@ -131,34 +131,22 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) unsigned int evicted = 0; HLIST_HEAD(expired); -evict_again: spin_lock(&hb->chain_lock); hlist_for_each_entry_safe(fq, n, &hb->chain, list) { if (!inet_fragq_should_evict(fq)) continue; - if (!del_timer(&fq->timer)) { - /* q expiring right now thus increment its refcount so - * it won't be freed under us and wait until the timer - * has finished executing then destroy it - */ - atomic_inc(&fq->refcnt); - spin_unlock(&hb->chain_lock); - del_timer_sync(&fq->timer); - inet_frag_put(fq, f); - goto evict_again; - } + if (!del_timer(&fq->timer)) + continue; - fq->flags |= INET_FRAG_EVICTED; - hlist_del(&fq->list); - hlist_add_head(&fq->list, &expired); + hlist_add_head(&fq->list_evictor, &expired); ++evicted; } spin_unlock(&hb->chain_lock); - hlist_for_each_entry_safe(fq, n, &expired, list) + hlist_for_each_entry_safe(fq, n, &expired, list_evictor) f->frag_expire((unsigned long) fq); return evicted; @@ -221,12 
+209,6 @@ int inet_frags_init(struct inet_frags *f) } EXPORT_SYMBOL(inet_frags_init); -void inet_frags_init_net(struct netns_frags *nf) -{ - init_frag_mem_limit(nf); -} -EXPORT_SYMBOL(inet_frags_init_net); - void inet_frags_fini(struct inet_frags *f) { cancel_work_sync(&f->frags_work); @@ -240,18 +222,20 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) int i; nf->low_thresh = 0; - local_bh_disable(); evict_again: + local_bh_disable(); seq = read_seqbegin(&f->rnd_seqlock); for (i = 0; i < INETFRAGS_HASHSZ ; i++) inet_evict_bucket(f, &f->hash[i]); - if (read_seqretry(&f->rnd_seqlock, seq)) - goto evict_again; - local_bh_enable(); + cond_resched(); + + if (read_seqretry(&f->rnd_seqlock, seq) || + percpu_counter_sum(&nf->mem)) + goto evict_again; percpu_counter_destroy(&nf->mem); } @@ -284,8 +268,8 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) struct inet_frag_bucket *hb; hb = get_frag_bucket_locked(fq, f); - if (!(fq->flags & INET_FRAG_EVICTED)) - hlist_del(&fq->list); + hlist_del(&fq->list); + fq->flags |= INET_FRAG_COMPLETE; spin_unlock(&hb->chain_lock); } @@ -297,7 +281,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) if (!(fq->flags & INET_FRAG_COMPLETE)) { fq_unlink(fq, f); atomic_dec(&fq->refcnt); - fq->flags |= INET_FRAG_COMPLETE; } } EXPORT_SYMBOL(inet_frag_kill); @@ -330,11 +313,12 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) fp = xp; } sum = sum_truesize + f->qsize; - sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); kmem_cache_free(f->frags_cachep, q); + + sub_frag_mem_limit(nf, sum); } EXPORT_SYMBOL(inet_frag_destroy); @@ -390,7 +374,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, q->net = nf; f->constructor(q, arg); - add_frag_mem_limit(q, f->qsize); + add_frag_mem_limit(nf, f->qsize); setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); diff --git 
a/kernel/net/ipv4/inet_hashtables.c b/kernel/net/ipv4/inet_hashtables.c index c6fb80bd5..ccc598079 100644 --- a/kernel/net/ipv4/inet_hashtables.c +++ b/kernel/net/ipv4/inet_hashtables.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -90,10 +91,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - - atomic_inc(&hashinfo->bsockets); - inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); tb->num_owners++; @@ -111,8 +108,6 @@ static void __inet_put_port(struct sock *sk) struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; - atomic_dec(&hashinfo->bsockets); - spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); @@ -131,7 +126,7 @@ void inet_put_port(struct sock *sk) } EXPORT_SYMBOL(inet_put_port); -int __inet_inherit_port(struct sock *sk, struct sock *child) +int __inet_inherit_port(const struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; unsigned short port = inet_sk(child)->inet_num; @@ -142,6 +137,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; + if (unlikely(!tb)) { + spin_unlock(&head->lock); + return -ENOENT; + } if (tb->port != port) { /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption @@ -190,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; } return score; } @@ -348,7 +349,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; - int twrefcnt = 0; 
spin_lock(lock); @@ -376,21 +376,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { - twrefcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); - if (twrefcnt) - inet_twsk_put(tw); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw); - - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); } return 0; @@ -399,23 +395,27 @@ not_unique: return -EADDRNOTAVAIL; } -static inline u32 inet_sk_port_offset(const struct sock *sk) +static u32 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); } -int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) +/* insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT + */ +bool inet_ehash_insert(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; - int twrefcnt = 0; + bool ret = true; - WARN_ON(!sk_unhashed(sk)); + WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); @@ -423,25 +423,41 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock(lock); - __sk_nulls_add_node_rcu(sk, list); - if (tw) { - WARN_ON(sk->sk_hash != tw->tw_hash); - twrefcnt = inet_twsk_unhash(tw); + if (osk) { + WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); + ret = sk_nulls_del_node_init_rcu(osk); } + if (ret) + __sk_nulls_add_node_rcu(sk, list); spin_unlock(lock); - 
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - return twrefcnt; + return ret; } -EXPORT_SYMBOL_GPL(__inet_hash_nolisten); -int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) +{ + bool ok = inet_ehash_insert(sk, osk); + + if (ok) { + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + } else { + percpu_counter_inc(sk->sk_prot->orphan_count); + sk->sk_state = TCP_CLOSE; + sock_set_flag(sk, SOCK_DEAD); + inet_csk_destroy_sock(sk); + } + return ok; +} +EXPORT_SYMBOL_GPL(inet_ehash_nolisten); + +void __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; - if (sk->sk_state != TCP_LISTEN) - return __inet_hash_nolisten(sk, tw); - + if (sk->sk_state != TCP_LISTEN) { + inet_ehash_nolisten(sk, osk); + return; + } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -449,7 +465,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); spin_unlock(&ilb->lock); - return 0; } EXPORT_SYMBOL(__inet_hash); @@ -496,7 +511,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct inet_bind_bucket *tb; int ret; struct net *net = sock_net(sk); - int twrefcnt = 1; if (!snum) { int i, remaining, low, high, port; @@ -507,8 +521,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; + /* By starting with offset being an even number, + * we tend to leave about 50% of ports for other uses, + * like bind(0). 
+ */ + offset &= ~1; + local_bh_disable(); - for (i = 1; i <= remaining; i++) { + for (i = 0; i < remaining; i++) { port = low + (i + offset) % remaining; if (inet_is_local_reserved_port(net, port)) continue; @@ -552,25 +572,20 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, return -EADDRNOTAVAIL; ok: - hint += i; + hint += (i + 2) & ~1; /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - twrefcnt += __inet_hash_nolisten(sk, tw); + inet_ehash_nolisten(sk, (struct sock *)tw); } if (tw) - twrefcnt += inet_twsk_bind_unhash(tw, hinfo); + inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head->lock); - if (tw) { - inet_twsk_deschedule(tw); - while (twrefcnt) { - twrefcnt--; - inet_twsk_put(tw); - } - } + if (tw) + inet_twsk_deschedule_put(tw); ret = 0; goto out; @@ -580,7 +595,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL); spin_unlock_bh(&head->lock); return 0; } else { @@ -599,7 +614,11 @@ out: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet_sk_port_offset(sk); + return __inet_hash_connect(death_row, sk, port_offset, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); @@ -608,7 +627,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h) { int i; - atomic_set(&h->bsockets, 0); for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, @@ -616,3 +634,32 @@ void inet_hashinfo_init(struct inet_hashinfo *h) } } EXPORT_SYMBOL_GPL(inet_hashinfo_init); + +int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int locksz = 
sizeof(spinlock_t); + unsigned int i, nblocks = 1; + + if (locksz != 0) { + /* allocate 2 cache lines or at least one spinlock per cpu */ + nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); + nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + + /* no more locks than number of hash buckets */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); + + hashinfo->ehash_locks = kmalloc_array(nblocks, locksz, + GFP_KERNEL | __GFP_NOWARN); + if (!hashinfo->ehash_locks) + hashinfo->ehash_locks = vmalloc(nblocks * locksz); + + if (!hashinfo->ehash_locks) + return -ENOMEM; + + for (i = 0; i < nblocks; i++) + spin_lock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = nblocks - 1; + return 0; +} +EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); diff --git a/kernel/net/ipv4/inet_timewait_sock.c b/kernel/net/ipv4/inet_timewait_sock.c index 00ec8d5d7..c67f9bd76 100644 --- a/kernel/net/ipv4/inet_timewait_sock.c +++ b/kernel/net/ipv4/inet_timewait_sock.c @@ -17,28 +17,6 @@ #include -/** - * inet_twsk_unhash - unhash a timewait socket from established hash - * @tw: timewait socket - * - * unhash a timewait socket from established hash, if hashed. - * ehash lock must be held by caller. - * Returns 1 if caller should call inet_twsk_put() after lock release. - */ -int inet_twsk_unhash(struct inet_timewait_sock *tw) -{ - if (hlist_nulls_unhashed(&tw->tw_node)) - return 0; - - hlist_nulls_del_rcu(&tw->tw_node); - sk_nulls_node_init(&tw->tw_node); - /* - * We cannot call inet_twsk_put() ourself under lock, - * caller must call it for us. - */ - return 1; -} - /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash * @tw: timewait socket @@ -48,35 +26,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw) * bind hash lock must be held by caller. * Returns 1 if caller should call inet_twsk_put() after lock release. 
*/ -int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, +void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) - return 0; + return; __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); - /* - * We cannot call inet_twsk_put() ourself under lock, - * caller must call it for us. - */ - return 1; + __sock_put((struct sock *)tw); } /* Must be called with locally disabled BHs. */ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; - struct inet_bind_hashbucket *bhead; - int refcnt; - /* Unlink from established hashes. */ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); + struct inet_bind_hashbucket *bhead; spin_lock(lock); - refcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); spin_unlock(lock); /* Disassociate with bind bucket. */ @@ -84,11 +56,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) hashinfo->bhash_size)]; spin_lock(&bhead->lock); - refcnt += inet_twsk_bind_unhash(tw, hashinfo); + inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); - BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); - atomic_sub(refcnt, &tw->tw_refcnt); atomic_dec(&tw->tw_dr->tw_count); inet_twsk_put(tw); } @@ -153,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* * Step 2: Hash TW into tcp ehash chain. * Notes : - * - tw_refcnt is set to 3 because : + * - tw_refcnt is set to 4 because : * - We have one reference from bhash chain. * - We have one reference from ehash chain. + * - We have one reference from timer. + * - One reference for ourself (our caller will release it). * We can use atomic_set() because prior spin_lock()/spin_unlock() * committed into memory all tw fields. 
*/ - atomic_set(&tw->tw_refcnt, 1 + 1 + 1); + atomic_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -170,7 +142,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -void tw_timer_handler(unsigned long data) +static void tw_timer_handler(unsigned long data) { struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; @@ -235,15 +207,19 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc); * tcp_input.c to verify this. */ -/* This is for handling early-kills of TIME_WAIT sockets. */ -void inet_twsk_deschedule(struct inet_timewait_sock *tw) +/* This is for handling early-kills of TIME_WAIT sockets. + * Warning : consume reference. + * Caller should not access tw anymore. + */ +void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { if (del_timer_sync(&tw->tw_timer)) inet_twsk_kill(tw); + inet_twsk_put(tw); } -EXPORT_SYMBOL(inet_twsk_deschedule); +EXPORT_SYMBOL(inet_twsk_deschedule_put); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * @@ -271,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) */ tw->tw_kill = timeo <= 4*HZ; - if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { - atomic_inc(&tw->tw_refcnt); + if (!rearm) { + BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); + } else { + mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(inet_twsk_schedule); +EXPORT_SYMBOL_GPL(__inet_twsk_schedule); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) @@ -311,9 +289,8 @@ restart: rcu_read_unlock(); local_bh_disable(); - inet_twsk_deschedule(tw); + inet_twsk_deschedule_put(tw); local_bh_enable(); - inet_twsk_put(tw); goto restart_rcu; } /* If the nulls 
value we got at the end of this lookup is diff --git a/kernel/net/ipv4/inetpeer.c b/kernel/net/ipv4/inetpeer.c index 241afd743..86fa45809 100644 --- a/kernel/net/ipv4/inetpeer.c +++ b/kernel/net/ipv4/inetpeer.c @@ -157,22 +157,6 @@ void __init inet_initpeers(void) INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); } -static int addr_compare(const struct inetpeer_addr *a, - const struct inetpeer_addr *b) -{ - int i, n = (a->family == AF_INET ? 1 : 4); - - for (i = 0; i < n; i++) { - if (a->addr.a6[i] == b->addr.a6[i]) - continue; - if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) - return -1; - return 1; - } - - return 0; -} - #define rcu_deref_locked(X, BASE) \ rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) @@ -188,7 +172,7 @@ static int addr_compare(const struct inetpeer_addr *a, *stackptr++ = &_base->root; \ for (u = rcu_deref_locked(_base->root, _base); \ u != peer_avl_empty;) { \ - int cmp = addr_compare(_daddr, &u->daddr); \ + int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \ if (cmp == 0) \ break; \ if (cmp == -1) \ @@ -215,7 +199,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, int count = 0; while (u != peer_avl_empty) { - int cmp = addr_compare(daddr, &u->daddr); + int cmp = inetpeer_addr_cmp(daddr, &u->daddr); if (cmp == 0) { /* Before taking a reference, check if this entry was * deleted (refcnt=-1) diff --git a/kernel/net/ipv4/ip_forward.c b/kernel/net/ipv4/ip_forward.c index 367448494..da0d7ce85 100644 --- a/kernel/net/ipv4/ip_forward.c +++ b/kernel/net/ipv4/ip_forward.c @@ -39,17 +39,21 @@ #include #include -static bool ip_may_fragment(const struct sk_buff *skb) -{ - return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || - skb->ignore_df; -} - static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; + if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)) + return false; + + /* original fragment exceeds mtu and DF is set */ + if 
(unlikely(IPCB(skb)->frag_max_size > mtu)) + return true; + + if (skb->ignore_df) + return false; + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) return false; @@ -57,18 +61,18 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) } -static int ip_forward_finish(struct sock *sk, struct sk_buff *skb) +static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); skb_sender_cpu_clear(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } int ip_forward(struct sk_buff *skb) @@ -77,6 +81,7 @@ int ip_forward(struct sk_buff *skb) struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); + struct net *net; /* that should never happen */ if (skb->pkt_type != PACKET_HOST) @@ -95,6 +100,7 @@ int ip_forward(struct sk_buff *skb) return NET_RX_SUCCESS; skb_forward_csum(skb); + net = dev_net(skb->dev); /* * According to the RFC, we must first decrease the TTL field. 
If @@ -114,8 +120,8 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { - IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); + if (ip_exceeds_mtu(skb, mtu)) { + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto drop; @@ -139,8 +145,9 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); - return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, - skb->dev, rt->dst.dev, ip_forward_finish); + return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, + net, NULL, skb, skb->dev, rt->dst.dev, + ip_forward_finish); sr_failed: /* @@ -151,7 +158,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/kernel/net/ipv4/ip_fragment.c b/kernel/net/ipv4/ip_fragment.c index cae22a1a8..b8a0607da 100644 --- a/kernel/net/ipv4/ip_fragment.c +++ b/kernel/net/ipv4/ip_fragment.c @@ -48,6 +48,7 @@ #include #include #include +#include /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. 
If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -75,7 +76,9 @@ struct ipq { __be16 id; u8 protocol; u8 ecn; /* RFC3168 support */ + u16 max_df_size; /* largest frag with DF set seen */ int iif; + int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -98,6 +101,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct ip4_create_arg { struct iphdr *iph; u32 user; + int vif; }; static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) @@ -126,7 +130,8 @@ static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) qp->saddr == arg->iph->saddr && qp->daddr == arg->iph->daddr && qp->protocol == arg->iph->protocol && - qp->user == arg->user; + qp->user == arg->user && + qp->vif == arg->vif; } static void ip4_frag_init(struct inet_frag_queue *q, const void *a) @@ -143,9 +148,11 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->ecn = ip4_frag_ecn(arg->iph->tos); qp->saddr = arg->iph->saddr; qp->daddr = arg->iph->daddr; + qp->vif = arg->vif; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + NULL; } static void ip4_frag_free(struct inet_frag_queue *q) @@ -173,6 +180,15 @@ static void ipq_kill(struct ipq *ipq) inet_frag_kill(&ipq->q, &ip4_frags); } +static bool frag_expire_skip_icmp(u32 user) +{ + return user == IP_DEFRAG_AF_PACKET || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, + __IP_DEFRAG_CONNTRACK_IN_END) || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP_DEFRAG_CONNTRACK_BRIDGE_IN); +} + /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. 
*/ @@ -192,7 +208,7 @@ static void ip_expire(unsigned long arg) ipq_kill(qp); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - if (!(qp->q.flags & INET_FRAG_EVICTED)) { + if (!inet_frag_evicting(&qp->q)) { struct sk_buff *head = qp->q.fragments; const struct iphdr *iph; int err; @@ -217,10 +233,8 @@ static void ip_expire(unsigned long arg) /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_AF_PACKET || - ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && - (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && - (skb_rtable(head)->rt_type != RTN_LOCAL))) + if (frag_expire_skip_icmp(qp->user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out_rcu_unlock; /* Send an ICMP "Fragment Reassembly Timeout" message. */ @@ -236,7 +250,8 @@ out: /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) +static struct ipq *ip_find(struct net *net, struct iphdr *iph, + u32 user, int vif) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -244,6 +259,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) arg.iph = iph; arg.user = user; + arg.vif = vif; hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); @@ -301,7 +317,7 @@ static int ip_frag_reinit(struct ipq *qp) kfree_skb(fp); fp = xp; } while (fp); - sub_frag_mem_limit(&qp->q, sum_truesize); + sub_frag_mem_limit(qp->q.net, sum_truesize); qp->q.flags = 0; qp->q.len = 0; @@ -319,6 +335,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct sk_buff *prev, *next; struct net_device *dev; + unsigned int fragsize; int flags, offset; int ihl, end; int err = -ENOENT; @@ -446,7 +463,7 @@ found: qp->q.fragments = next; qp->q.meat -= free_it->len; - sub_frag_mem_limit(&qp->q, free_it->truesize); + sub_frag_mem_limit(qp->q.net, free_it->truesize); kfree_skb(free_it); } } 
@@ -470,13 +487,18 @@ found: qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - add_frag_mem_limit(&qp->q, skb->truesize); + add_frag_mem_limit(qp->q.net, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; + fragsize = skb->len + ihl; + + if (fragsize > qp->q.max_size) + qp->q.max_size = fragsize; + if (ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len + ihl > qp->q.max_size) - qp->q.max_size = skb->len + ihl; + fragsize > qp->max_df_size) + qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { @@ -508,7 +530,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, int len; int ihlen; int err; - int sum_truesize; u8 ecn; ipq_kill(qp); @@ -573,47 +594,47 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - add_frag_mem_limit(&qp->q, clone->truesize); + add_frag_mem_limit(qp->q.net, clone->truesize); } + skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - sum_truesize = head->truesize; - for (fp = head->next; fp;) { - bool headstolen; - int delta; - struct sk_buff *next = fp->next; - - sum_truesize += fp->truesize; + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - - if (skb_try_coalesce(head, fp, &headstolen, &delta)) { - kfree_skb_partial(fp, headstolen); - } else { - if (!skb_shinfo(head)->frag_list) - skb_shinfo(head)->frag_list = fp; - head->data_len += fp->len; - head->len += fp->len; - head->truesize += fp->truesize; - } - fp = next; + head->truesize += fp->truesize; } - sub_frag_mem_limit(&qp->q, sum_truesize); + sub_frag_mem_limit(qp->q.net, head->truesize); head->next = NULL; head->dev = dev; head->tstamp 
= qp->q.stamp; - IPCB(head)->frag_max_size = qp->q.max_size; + IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head); - /* max_size != 0 implies at least one fragment had IP_DF set */ - iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); iph->tos |= ecn; + /* When we set IP_DF on a refragmented skb we must also force a + * call to ip_fragment to avoid forwarding a DF-skb of size s while + * original sender only sent fragments of size f (where f < s). + * + * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest + * frag seen to avoid sending tiny DF-fragments in case skb was built + * from one very small df-fragment and one large non-df frag. + */ + if (qp->max_df_size == qp->q.max_size) { + IPCB(head)->flags |= IPSKB_FRAG_PMTU; + iph->frag_off = htons(IP_DF); + } else { + iph->frag_off = 0; + } + ip_send_check(iph); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); @@ -633,16 +654,17 @@ out_fail: } /* Process an incoming IP datagram fragment. */ -int ip_defrag(struct sk_buff *skb, u32 user) +int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { + struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; + int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; - struct net *net; - net = skb->dev ? 
dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); + skb_orphan(skb); /* Lookup (or create) queue header */ - qp = ip_find(net, ip_hdr(skb), user); + qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret; @@ -661,7 +683,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_defrag); -struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; @@ -690,7 +712,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) if (pskb_trim_rcsum(skb, netoff + len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - if (ip_defrag(skb, user)) + if (ip_defrag(net, skb, user)) return NULL; skb_clear_hash(skb); } @@ -818,6 +840,8 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { + int res; + /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -841,9 +865,13 @@ static int __net_init ipv4_frags_init_net(struct net *net) */ net->ipv4.frags.timeout = IP_FRAG_TIME; - inet_frags_init_net(&net->ipv4.frags); - - return ip4_frags_ns_ctl_register(net); + res = inet_frags_init_net(&net->ipv4.frags); + if (res) + return res; + res = ip4_frags_ns_ctl_register(net); + if (res) + inet_frags_uninit_net(&net->ipv4.frags); + return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) diff --git a/kernel/net/ipv4/ip_gre.c b/kernel/net/ipv4/ip_gre.c index 5fd706473..614521437 100644 --- a/kernel/net/ipv4/ip_gre.c +++ b/kernel/net/ipv4/ip_gre.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -121,8 +123,127 @@ static int ipgre_tunnel_init(struct net_device *dev); static int ipgre_net_id __read_mostly; static int gre_tap_net_id __read_mostly; -static int 
ipgre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) +static int ip_gre_calc_hlen(__be16 o_flags) +{ + int addend = 4; + + if (o_flags & TUNNEL_CSUM) + addend += 4; + if (o_flags & TUNNEL_KEY) + addend += 4; + if (o_flags & TUNNEL_SEQ) + addend += 4; + return addend; +} + +static __be16 gre_flags_to_tnl_flags(__be16 flags) +{ + __be16 tflags = 0; + + if (flags & GRE_CSUM) + tflags |= TUNNEL_CSUM; + if (flags & GRE_ROUTING) + tflags |= TUNNEL_ROUTING; + if (flags & GRE_KEY) + tflags |= TUNNEL_KEY; + if (flags & GRE_SEQ) + tflags |= TUNNEL_SEQ; + if (flags & GRE_STRICT) + tflags |= TUNNEL_STRICT; + if (flags & GRE_REC) + tflags |= TUNNEL_REC; + if (flags & GRE_VERSION) + tflags |= TUNNEL_VERSION; + + return tflags; +} + +static __be16 tnl_flags_to_gre_flags(__be16 tflags) +{ + __be16 flags = 0; + + if (tflags & TUNNEL_CSUM) + flags |= GRE_CSUM; + if (tflags & TUNNEL_ROUTING) + flags |= GRE_ROUTING; + if (tflags & TUNNEL_KEY) + flags |= GRE_KEY; + if (tflags & TUNNEL_SEQ) + flags |= GRE_SEQ; + if (tflags & TUNNEL_STRICT) + flags |= GRE_STRICT; + if (tflags & TUNNEL_REC) + flags |= GRE_REC; + if (tflags & TUNNEL_VERSION) + flags |= GRE_VERSION; + + return flags; +} + +static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, + bool *csum_err) +{ + const struct gre_base_hdr *greh; + __be32 *options; + int hdr_len; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) + return -EINVAL; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) + return -EINVAL; + + tpi->flags = gre_flags_to_tnl_flags(greh->flags); + hdr_len = ip_gre_calc_hlen(tpi->flags); + + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + tpi->proto = greh->protocol; + + options = (__be32 *)(greh + 1); + if (greh->flags & GRE_CSUM) { + if (skb_checksum_simple_validate(skb)) { + *csum_err = true; + return -EINVAL; + } + + 
skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + options++; + } + + if (greh->flags & GRE_KEY) { + tpi->key = *options; + options++; + } else { + tpi->key = 0; + } + if (unlikely(greh->flags & GRE_SEQ)) { + tpi->seq = *options; + options++; + } else { + tpi->seq = 0; + } + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + tpi->proto = htons(ETH_P_IP); + if ((*(u8 *)options & 0xF0) != 0x40) { + hdr_len += 4; + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + } + } + return iptunnel_pull_header(skb, hdr_len, tpi->proto); +} + +static void ipgre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only @@ -148,14 +269,14 @@ static int ipgre_err(struct sk_buff *skb, u32 info, switch (type) { default: case ICMP_PARAMETERPROB: - return PACKET_RCVD; + return; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ - return PACKET_RCVD; + return; default: /* All others are translated to HOST_UNREACH. 
rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -164,9 +285,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info, break; } break; + case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return PACKET_RCVD; + return; break; case ICMP_REDIRECT: @@ -183,26 +305,85 @@ static int ipgre_err(struct sk_buff *skb, u32 info, iph->daddr, iph->saddr, tpi->key); if (!t) - return PACKET_REJECT; + return; if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - return PACKET_RCVD; + return; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - return PACKET_RCVD; + return; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; - return PACKET_RCVD; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + * + * Moreover, Cisco "wise men" put GRE key to the third word + * in GRE header. It makes impossible maintaining even soft + * state for keyed + * GRE tunnels with enabled checksum. Tell them "thank you". + * + * Well, I wonder, rfc1812 was written by Cisco employee, + * what the hell these idiots break standards established + * by themselves??? + */ + + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct tnl_ptk_info tpi; + bool csum_err = false; + + if (parse_gre_header(skb, &tpi, &csum_err)) { + if (!csum_err) /* ignore csum errors. 
*/ + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + skb->dev->ifindex, 0, IPPROTO_GRE, 0); + return; + } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, + IPPROTO_GRE, 0); + return; + } + + ipgre_err(skb, info, &tpi); +} + +static __be64 key_to_tunnel_id(__be32 key) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)((__force u32)key); +#else + return (__force __be64)((__force u64)key << 32); +#endif +} + +/* Returns the least-significant 32 bits of a __be64. */ +static __be32 tunnel_id_to_key(__be64 x) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)x; +#else + return (__force __be32)((__force u64)x >> 32); +#endif } static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { struct net *net = dev_net(skb->dev); + struct metadata_dst *tun_dst = NULL; struct ip_tunnel_net *itn; const struct iphdr *iph; struct ip_tunnel *tunnel; @@ -218,40 +399,211 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) if (tunnel) { skb_pop_mac_header(skb); - ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); + if (tunnel->collect_md) { + __be16 flags; + __be64 tun_id; + + flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); + tun_id = key_to_tunnel_id(tpi->key); + tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); + if (!tun_dst) + return PACKET_REJECT; + } + + ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_REJECT; } +static int gre_rcv(struct sk_buff *skb) +{ + struct tnl_ptk_info tpi; + bool csum_err = false; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { + /* Looped back packet, drop it! 
*/ + if (rt_is_output_route(skb_rtable(skb))) + goto drop; + } +#endif + + if (parse_gre_header(skb, &tpi, &csum_err) < 0) + goto drop; + + if (ipgre_rcv(skb, &tpi) == PACKET_RCVD) + return 0; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: + kfree_skb(skb); + return 0; +} + +static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, + __be16 proto, __be32 key, __be32 seq) +{ + struct gre_base_hdr *greh; + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + greh = (struct gre_base_hdr *)skb->data; + greh->flags = tnl_flags_to_gre_flags(flags); + greh->protocol = proto; + + if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); + + if (flags & TUNNEL_SEQ) { + *ptr = seq; + ptr--; + } + if (flags & TUNNEL_KEY) { + *ptr = key; + ptr--; + } + if (flags & TUNNEL_CSUM && + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { + *ptr = 0; + *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, + skb->len, 0)); + } + } +} + static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct tnl_ptk_info tpi; - tpi.flags = tunnel->parms.o_flags; - tpi.proto = proto; - tpi.key = tunnel->parms.o_key; if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; - tpi.seq = htonl(tunnel->o_seqno); /* Push GRE header. */ - gre_build_header(skb, &tpi, tunnel->tun_hlen); - - skb_set_inner_protocol(skb, tpi.proto); + build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + proto, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + skb_set_inner_protocol(skb, proto); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } +static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, + bool csum) +{ + return iptunnel_handle_offloads(skb, csum, + csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE); +} + +static struct rtable *gre_get_rt(struct sk_buff *skb, + struct net_device *dev, + struct flowi4 *fl, + const struct ip_tunnel_key *key) +{ + struct net *net = dev_net(dev); + + memset(fl, 0, sizeof(*fl)); + fl->daddr = key->u.ipv4.dst; + fl->saddr = key->u.ipv4.src; + fl->flowi4_tos = RT_TOS(key->tos); + fl->flowi4_mark = skb->mark; + fl->flowi4_proto = IPPROTO_GRE; + + return ip_route_output_key(net, fl); +} + +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct flowi4 fl; + struct rtable *rt; + int min_headroom; + int tunnel_hlen; + __be16 df, flags; + int err; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + rt = gre_get_rt(skb, dev, &fl, key); + if (IS_ERR(rt)) + goto err_free_skb; + + tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + tunnel_hlen + sizeof(struct iphdr); + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto err_free_rt; + } + + /* Push Tunnel header. */ + skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)); + if (IS_ERR(skb)) { + skb = NULL; + goto err_free_rt; + } + + flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), + tunnel_id_to_key(tun_info->key.tun_id), 0); + + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, + key->u.ipv4.dst, IPPROTO_GRE, + key->tos, key->ttl, df, false); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + return; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; +} + +static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info = skb_tunnel_info(skb); + struct rtable *rt; + struct flowi4 fl4; + + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; + + rt = gre_get_rt(skb, dev, &fl4, &info->key); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + ip_rt_put(rt); + info->key.u.ipv4.src = fl4.saddr; + return 0; +} + static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev); + return NETDEV_TX_OK; + } + if (dev->header_ops) { /* Need space for new headers */ if (skb_cow_head(skb, dev->needed_headroom - @@ -277,7 +629,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, goto out; __gre_xmit(skb, dev, tnl_params, skb->protocol); - return NETDEV_TX_OK; free_skb: @@ -292,6 +643,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev); + return NETDEV_TX_OK; + } + skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); if (IS_ERR(skb)) goto out; @@ -300,7 +656,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, goto free_skb; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); - return NETDEV_TX_OK; free_skb: @@ -530,10 +885,9 @@ static int ipgre_tunnel_init(struct net_device *dev) return ip_tunnel_init(dev); } -static struct gre_cisco_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, - .priority = 0, +static const struct gre_protocol ipgre_protocol = { + .handler = gre_rcv, + .err_handler = 
gre_err, }; static int __net_init ipgre_init_net(struct net *net) @@ -596,8 +950,10 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms) +static void ipgre_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm *parms) { memset(parms, 0, sizeof(*parms)); @@ -635,6 +991,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); + + if (data[IFLA_GRE_COLLECT_METADATA]) { + struct ip_tunnel *t = netdev_priv(dev); + + t->collect_md = true; + } } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -688,6 +1050,7 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; static void ipgre_tap_setup(struct net_device *dev) @@ -712,7 +1075,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, return err; } - ipgre_netlink_parms(data, tb, &p); + ipgre_netlink_parms(dev, data, tb, &p); return ip_tunnel_newlink(dev, tb, &p); } @@ -730,7 +1093,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], return err; } - ipgre_netlink_parms(data, tb, &p); + ipgre_netlink_parms(dev, data, tb, &p); return ip_tunnel_changelink(dev, tb, &p); } @@ -765,6 +1128,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -796,6 +1161,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (t->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) 
+ goto nla_put_failure; + } + return 0; nla_put_failure: @@ -817,6 +1187,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { @@ -849,9 +1220,38 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .get_link_net = ip_tunnel_get_link_net, }; +struct net_device *gretap_fb_dev_create(struct net *net, const char *name, + u8 name_assign_type) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ip_tunnel *t; + int err; + + memset(&tb, 0, sizeof(tb)); + + dev = rtnl_create_link(net, name, name_assign_type, + &ipgre_tap_ops, tb); + if (IS_ERR(dev)) + return dev; + + /* Configure flow based GRE device. */ + t = netdev_priv(dev); + t->collect_md = true; + + err = ipgre_newlink(net, dev, tb, NULL); + if (err < 0) + goto out; + return dev; +out: + free_netdev(dev); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(gretap_fb_dev_create); + static int __net_init ipgre_tap_init_net(struct net *net) { - return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); + return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } static void __net_exit ipgre_tap_exit_net(struct net *net) @@ -881,7 +1281,7 @@ static int __init ipgre_init(void) if (err < 0) goto pnet_tap_faied; - err = gre_cisco_register(&ipgre_protocol); + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); goto add_proto_failed; @@ -900,7 +1300,7 @@ static int __init ipgre_init(void) tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - gre_cisco_unregister(&ipgre_protocol); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&ipgre_tap_net_ops); pnet_tap_faied: @@ 
-912,7 +1312,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - gre_cisco_unregister(&ipgre_protocol); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); } diff --git a/kernel/net/ipv4/ip_input.c b/kernel/net/ipv4/ip_input.c index 2db4c8773..b1209b633 100644 --- a/kernel/net/ipv4/ip_input.c +++ b/kernel/net/ipv4/ip_input.c @@ -146,6 +146,7 @@ #include #include #include +#include /* * Process Router Attention IP option (RFC 2113) @@ -156,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; @@ -166,9 +168,9 @@ bool ip_call_ra_chain(struct sk_buff *skb) if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), dev_net(dev))) { + net_eq(sock_net(sk), net)) { if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) + if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { @@ -187,10 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb) +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); - __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); @@ -247,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb) /* * Reassemble IP fragments. 
*/ + struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) + if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } - return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, + net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } @@ -310,7 +311,7 @@ drop: int sysctl_ip_early_demux __read_mostly = 1; EXPORT_SYMBOL(sysctl_ip_early_demux); -static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; @@ -331,13 +332,12 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (!skb_dst(skb)) { + if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) - NET_INC_STATS_BH(dev_net(skb->dev), - LINUX_MIB_IPRPFILTER); + NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER); goto drop; } } @@ -358,11 +358,9 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, - skb->len); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, - skb->len); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); return dst_input(skb); @@ -377,6 +375,7 @@ drop: int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct iphdr *iph; + struct net *net; u32 len; /* When the interface is in promisc. 
mode, drop all the crap @@ -386,11 +385,12 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); + net = dev_net(dev); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); goto out; } @@ -416,7 +416,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); - IP_ADD_STATS_BH(dev_net(dev), + IP_ADD_STATS_BH(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); @@ -430,7 +430,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -440,7 +440,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -452,14 +452,14 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. 
*/ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, - dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, ip_rcv_finish); csum_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: diff --git a/kernel/net/ipv4/ip_output.c b/kernel/net/ipv4/ip_output.c index c65b93a7b..49f028563 100644 --- a/kernel/net/ipv4/ip_output.c +++ b/kernel/net/ipv4/ip_output.c @@ -83,6 +83,11 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); +static int +ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct net *, struct sock *, struct sk_buff *)); + /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) { @@ -91,32 +96,28 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL, - skb_dst(skb)->dev, dst_output_sk); -} - -int __ip_local_out(struct sk_buff *skb) -{ - return __ip_local_out_sk(skb->sk, skb); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, skb_dst(skb)->dev, + dst_output); } -int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; - err = __ip_local_out(skb); + err = __ip_local_out(net, sk, skb); if (likely(err == 1)) - err = dst_output_sk(sk, skb); + err = dst_output(net, sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out_sk); +EXPORT_SYMBOL_GPL(ip_local_out); 
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -131,11 +132,12 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * Add an ip header to a skbuff and send it out. * */ -int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, +int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct iphdr *iph; /* Build the IP header. */ @@ -145,15 +147,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; - if (ip_dont_fragment(sk, &rt->dst)) - iph->frag_off = htons(IP_DF); - else - iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(sock_net(sk), skb, sk); + if (ip_dont_fragment(sk, &rt->dst)) { + iph->frag_off = htons(IP_DF); + iph->id = 0; + } else { + iph->frag_off = 0; + __ip_select_ident(net, iph, 1); + } if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -164,11 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, skb->mark = sk->sk_mark; /* Send it out. 
*/ - return ip_local_out(skb); + return ip_local_out(net, skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; @@ -178,9 +182,9 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -216,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output_gso(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -224,8 +229,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || - skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) - return ip_finish_output2(sk, skb); + skb_gso_network_seglen(skb) <= mtu) + return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. * @@ -235,6 +240,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) * from host network stack. 
*/ features = netif_skb_features(skb); + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); @@ -248,7 +254,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) int err; segs->next = NULL; - err = ip_fragment(sk, segs, ip_finish_output2); + err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -258,25 +264,28 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) return ret; } -static int ip_finish_output(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { + unsigned int mtu; + #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif + mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(sk, skb); + return ip_finish_output_gso(net, sk, skb, mtu); - if (skb->len > ip_skb_dst_mtu(skb)) - return ip_fragment(sk, skb, ip_finish_output2); + if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) + return ip_fragment(net, sk, skb, mtu, ip_finish_output2); - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); } -int ip_mc_output(struct sock *sk, struct sk_buff *skb) +int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -284,7 +293,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. 
*/ - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -312,7 +321,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, - sk, newskb, NULL, newskb->dev, + net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); } @@ -327,26 +336,28 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb, - NULL, newskb->dev, dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, newskb, NULL, newskb->dev, + dev_loopback_xmit); } - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL, - skb->dev, ip_finish_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb->dev, + ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sock *sk, struct sk_buff *skb) +int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, - NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -369,6 +380,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; @@ -399,7 +411,7 @@ int ip_queue_xmit(struct sock *sk, 
struct sk_buff *skb, struct flowi *fl) * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(sock_net(sk), fl4, sk, + rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, @@ -436,20 +448,20 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_segs(sock_net(sk), skb, sk, + ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - res = ip_local_out(skb); + res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; no_route: rcu_read_unlock(); - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -478,6 +490,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct net *, struct sock *, struct sk_buff *)) +{ + struct iphdr *iph = ip_hdr(skb); + + if ((iph->frag_off & htons(IP_DF)) == 0) + return ip_do_fragment(net, sk, skb, output); + + if (unlikely(!skb->ignore_df || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + kfree_skb(skb); + return -EMSGSIZE; + } + + return ip_do_fragment(net, sk, skb, output); +} + /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus @@ -485,8 +519,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. 
*/ -int ip_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -500,6 +534,11 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, dev = rt->dst.dev; + /* for offloaded checksums cleanup checksum before fragmentation */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto fail; + /* * Point into the IP datagram header. */ @@ -507,15 +546,8 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || - (IPCB(skb)->frag_max_size && - IPCB(skb)->frag_max_size > mtu))) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - kfree_skb(skb); - return -EMSGSIZE; - } + if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) + mtu = IPCB(skb)->frag_max_size; /* * Setup starting values. @@ -523,10 +555,6 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge) - mtu -= nf_bridge_mtu_reduction(skb); -#endif IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. 
First, check its validity: @@ -599,10 +627,10 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, ip_send_check(iph); } - err = output(sk, skb); + err = output(net, sk, skb); if (!err) - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -612,7 +640,7 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, } if (err == 0) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return 0; } @@ -621,7 +649,7 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, kfree_skb(frag); frag = skb; } - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: @@ -635,9 +663,6 @@ slow_path_clean: } slow_path: - /* for offloaded checksums cleanup checksum before fragmentation */ - if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) - goto fail; iph = ip_hdr(skb); left = skb->len - hlen; /* Space per frame */ @@ -711,6 +736,9 @@ slow_path: iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); + if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) + iph->frag_off |= htons(IP_DF); + /* ANK: dirty, but effective trick. 
Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE @@ -736,22 +764,22 @@ slow_path: ip_send_check(iph); - err = output(sk, skb2); + err = output(net, sk, skb2); if (err) goto fail; - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); } consume_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; } -EXPORT_SYMBOL(ip_fragment); +EXPORT_SYMBOL(ip_do_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) @@ -886,6 +914,7 @@ static int __ip_append_data(struct sock *sk, if (transhdrlen && length + fragheaderlen <= mtu && rt->dst.dev->features & NETIF_F_V4_CSUM && + !(flags & MSG_MORE) && !exthdrlen) csummode = CHECKSUM_PARTIAL; @@ -893,7 +922,7 @@ static int __ip_append_data(struct sock *sk, if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && - (sk->sk_type == SOCK_DGRAM)) { + (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, maxfraglen, flags); @@ -1217,11 +1246,9 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, } while (size > 0) { - int i; - - if (skb_is_gso(skb)) + if (skb_is_gso(skb)) { len = size; - else { + } else { /* Check if the remaining data fits into current packet. 
*/ len = mtu - skb->len; @@ -1273,15 +1300,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, continue; } - i = skb_shinfo(skb)->nr_frags; if (len > size) len = size; - if (skb_can_coalesce(skb, i, page, offset)) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); - } else if (i < MAX_SKB_FRAGS) { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, len); - } else { + + if (skb_append_pagefrags(skb, page, offset, len)) { err = -EMSGSIZE; goto error; } @@ -1416,7 +1438,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; - err = ip_local_out(skb); + err = ip_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); @@ -1524,6 +1546,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, struct net *net = sock_net(sk); struct sk_buff *nskb; int err; + int oif; if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; @@ -1541,7 +1564,11 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, daddr = replyopts.opt.opt.faddr; } - flowi4_init_output(&fl4, arg->bound_dev_if, + oif = arg->bound_dev_if; + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) + oif = skb->skb_iif; + + flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark), RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, @@ -1573,7 +1600,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: diff --git a/kernel/net/ipv4/ip_sockglue.c b/kernel/net/ipv4/ip_sockglue.c index 6ddde8999..a50124260 100644 --- a/kernel/net/ipv4/ip_sockglue.c +++ b/kernel/net/ipv4/ip_sockglue.c @@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct 
cmsghdr)); + + /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); if (err) @@ -591,6 +593,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_TRANSPARENT: case IP_MINTTL: case IP_NODEFRAG: + case IP_BIND_ADDRESS_NO_PORT: case IP_UNICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_ALL: @@ -741,6 +744,9 @@ static int do_ip_setsockopt(struct sock *sk, int level, } inet->nodefrag = val ? 1 : 0; break; + case IP_BIND_ADDRESS_NO_PORT: + inet->bind_address_no_port = val ? 1 : 0; + break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; @@ -1247,11 +1253,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt); * the _received_ ones. The set sets the _sent_ ones. */ +static bool getsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IP_MSFILTER: + case MCAST_MSFILTER: + return true; + } + return false; +} + static int do_ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); - int val; + bool needs_rtnl = getsockopt_needs_rtnl(optname); + int val, err = 0; int len; if (level != SOL_IP) @@ -1265,6 +1282,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (len < 0) return -EINVAL; + if (needs_rtnl) + rtnl_lock(); lock_sock(sk); switch (optname) { @@ -1333,6 +1352,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_NODEFRAG: val = inet->nodefrag; break; + case IP_BIND_ADDRESS_NO_PORT: + val = inet->bind_address_no_port; + break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; @@ -1379,39 +1401,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_MSFILTER: { struct ip_msfilter msf; - int err; if (len < IP_MSFILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; + err = -EINVAL; + goto out; } if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { - 
release_sock(sk); - return -EFAULT; + err = -EFAULT; + goto out; } err = ip_mc_msfget(sk, &msf, (struct ip_msfilter __user *)optval, optlen); - release_sock(sk); - return err; + goto out; } case MCAST_MSFILTER: { struct group_filter gsf; - int err; if (len < GROUP_FILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; + err = -EINVAL; + goto out; } if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { - release_sock(sk); - return -EFAULT; + err = -EFAULT; + goto out; } err = ip_mc_gsfget(sk, &gsf, (struct group_filter __user *)optval, optlen); - release_sock(sk); - return err; + goto out; } case IP_MULTICAST_ALL: val = inet->mc_all; @@ -1478,6 +1496,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; } return 0; + +out: + release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); + return err; } int ip_getsockopt(struct sock *sk, int level, diff --git a/kernel/net/ipv4/ip_tunnel.c b/kernel/net/ipv4/ip_tunnel.c index 626d9e56a..cbb51f3fa 100644 --- a/kernel/net/ipv4/ip_tunnel.c +++ b/kernel/net/ipv4/ip_tunnel.c @@ -230,10 +230,13 @@ skip_key_lookup: if (cand) return cand; + t = rcu_dereference(itn->collect_md_tun); + if (t) + return t; + if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) return netdev_priv(itn->fb_tunnel_dev); - return NULL; } EXPORT_SYMBOL_GPL(ip_tunnel_lookup); @@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) { struct hlist_head *head = ip_bucket(itn, &t->parms); + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, t); hlist_add_head_rcu(&t->hash_node, head); } -static void ip_tunnel_del(struct ip_tunnel *t) +static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) { + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, NULL); hlist_del_init_rcu(&t->hash_node); } @@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, } int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, - const 
struct tnl_ptk_info *tpi, bool log_ecn_error) + const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, + bool log_ecn_error) { struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); @@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, skb->dev = tunnel->dev; } + if (tun_dst) + skb_dst_set(skb, (struct dst_entry *)tun_dst); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel_parm *p, bool set_mtu) { - ip_tunnel_del(t); + ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; t->parms.i_key = p->i_key; @@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { - ip_tunnel_del(netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); unregister_netdevice_queue(dev, head); } } @@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], nt = netdev_priv(dev); itn = net_generic(net, nt->ip_tnl_net_id); - if (ip_tunnel_find(itn, p, dev->type)) - return -EEXIST; + if (nt->collect_md) { + if (rtnl_dereference(itn->collect_md_tun)) + return -EEXIST; + } else { + if (ip_tunnel_find(itn, p, dev->type)) + return -EEXIST; + } nt->net = net; nt->parms = *p; @@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], dev->mtu = mtu; ip_tunnel_add(itn, nt); - out: return err; } @@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; + if (tunnel->collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); @@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev) itn = net_generic(net, tunnel->ip_tnl_net_id); /* fb_tunnel_dev will be unregisted in net-exit call. 
*/ if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); ip_tunnel_dst_reset_all(tunnel); } diff --git a/kernel/net/ipv4/ip_tunnel_core.c b/kernel/net/ipv4/ip_tunnel_core.c index ce63ab21b..6cb9009c3 100644 --- a/kernel/net/ipv4/ip_tunnel_core.c +++ b/kernel/net/ipv4/ip_tunnel_core.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -45,12 +46,14 @@ #include #include #include +#include int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { - int pkt_len = skb->len; + int pkt_len = skb->len - skb_inner_network_offset(skb); + struct net *net = dev_net(rt->dst.dev); struct iphdr *iph; int err; @@ -74,10 +77,9 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(dev_net(rt->dst.dev), iph, - skb_shinfo(skb)->gso_segs ?: 1); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); - err = ip_local_out_sk(sk, skb); + err = ip_local_out(net, sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; @@ -98,7 +100,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) return -ENOMEM; eh = (struct ethhdr *)skb->data; - if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) + if (likely(eth_proto_is_802_3(eh->h_proto))) skb->protocol = eh->h_proto; else skb->protocol = htons(ETH_P_802_2); @@ -118,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) } EXPORT_SYMBOL_GPL(iptunnel_pull_header); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags) +{ + struct metadata_dst *res; + struct ip_tunnel_info *dst, *src; + + if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) + return NULL; + + res = metadata_dst_alloc(0, flags); + if (!res) + return NULL; + + dst = &res->u.tun_info; + src = 
&md->u.tun_info; + dst->key.tun_id = src->key.tun_id; + if (src->mode & IP_TUNNEL_INFO_IPV6) + memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, + sizeof(struct in6_addr)); + else + dst->key.u.ipv4.dst = src->key.u.ipv4.src; + dst->mode = src->mode | IP_TUNNEL_INFO_TX; + + return res; +} +EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); + struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool csum_help, int gso_type_mask) @@ -165,6 +194,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, { int i; + netdev_stats_to_stats64(tot, &dev->stats); + for_each_possible_cpu(i) { const struct pcpu_sw_netstats *tstats = per_cpu_ptr(dev->tstats, i); @@ -185,22 +216,211 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, tot->tx_bytes += tx_bytes; } - tot->multicast = dev->stats.multicast; + return tot; +} +EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->rx_errors = dev->stats.rx_errors; +static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { + [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, + [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, + [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, + [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, + [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, + [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; + int err; - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = 
dev->stats.tx_errors; + err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy); + if (err < 0) + return err; - tot->collisions = dev->stats.collisions; + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; - return tot; + new_state->type = LWTUNNEL_ENCAP_IP; + + tun_info = lwt_tun_info(new_state); + + if (tb[LWTUNNEL_IP_ID]) + tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]); + + if (tb[LWTUNNEL_IP_DST]) + tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); + + if (tb[LWTUNNEL_IP_SRC]) + tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); + + if (tb[LWTUNNEL_IP_TTL]) + tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); + + if (tb[LWTUNNEL_IP_TOS]) + tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); + + if (tb[LWTUNNEL_IP_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; } -EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); + +static int ip_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || + nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || + nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || + nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || + nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* LWTUNNEL_IP_ID */ + + nla_total_size(4) /* LWTUNNEL_IP_DST */ + + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + + nla_total_size(1) /* LWTUNNEL_IP_TTL */ + + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ +} + +static int ip_tun_cmp_encap(struct lwtunnel_state *a, 
struct lwtunnel_state *b) +{ + return memcmp(lwt_tun_info(a), lwt_tun_info(b), + sizeof(struct ip_tunnel_info)); +} + +static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { + .build_state = ip_tun_build_state, + .fill_encap = ip_tun_fill_encap_info, + .get_encap_size = ip_tun_encap_nlsize, + .cmp_encap = ip_tun_cmp_encap, +}; + +static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { + [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, + [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP6; + + tun_info = lwt_tun_info(new_state); + + if (tb[LWTUNNEL_IP6_ID]) + tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]); + + if (tb[LWTUNNEL_IP6_DST]) + tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]); + + if (tb[LWTUNNEL_IP6_SRC]) + tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]); + + if (tb[LWTUNNEL_IP6_HOPLIMIT]) + tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]); + + if (tb[LWTUNNEL_IP6_TC]) + tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); + + if (tb[LWTUNNEL_IP6_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip6_tun_fill_encap_info(struct 
sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || + nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || + nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || + nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* LWTUNNEL_IP6_ID */ + + nla_total_size(16) /* LWTUNNEL_IP6_DST */ + + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + + nla_total_size(1) /* LWTUNNEL_IP6_TC */ + + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ +} + +static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { + .build_state = ip6_tun_build_state, + .fill_encap = ip6_tun_fill_encap_info, + .get_encap_size = ip6_tun_encap_nlsize, + .cmp_encap = ip_tun_cmp_encap, +}; + +void __init ip_tunnel_core_init(void) +{ + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); + lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); +} + +struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(ip_tunnel_metadata_cnt); + +void ip_tunnel_need_metadata(void) +{ + static_key_slow_inc(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); + +void ip_tunnel_unneed_metadata(void) +{ + static_key_slow_dec(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); diff --git a/kernel/net/ipv4/ip_vti.c b/kernel/net/ipv4/ip_vti.c index 0c152087c..4d8f0b698 100644 --- a/kernel/net/ipv4/ip_vti.c +++ b/kernel/net/ipv4/ip_vti.c @@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; - err =
dst_output(skb); + err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = skb->len; iptunnel_xmit_stats(err, &dev->stats, dev->tstats); diff --git a/kernel/net/ipv4/ipconfig.c b/kernel/net/ipv4/ipconfig.c index 8e7328c6a..0bc7412d9 100644 --- a/kernel/net/ipv4/ipconfig.c +++ b/kernel/net/ipv4/ipconfig.c @@ -94,7 +94,7 @@ /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ #define CONF_SEND_RETRIES 6 /* Send six requests per open */ -#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */ +#define CONF_INTER_TIMEOUT (HZ) /* Inter-device timeout: 1 second */ #define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */ #define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ #define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */ @@ -146,6 +146,10 @@ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ /* vendor class identifier */ static char vendor_class_identifier[253] __initdata; +#if defined(CONFIG_IP_PNP_DHCP) +static char dhcp_client_identifier[253] __initdata; +#endif + /* Persistent data: */ static int ic_proto_used; /* Protocol used, if any */ @@ -728,6 +732,16 @@ ic_dhcp_init_options(u8 *options) memcpy(e, vendor_class_identifier, len); e += len; } + len = strlen(dhcp_client_identifier + 1); + /* the minimum length of identifier is 2, include 1 byte type, + * and can not be larger than the length of options + */ + if (len >= 1 && len < 312 - (e - options) - 1) { + *e++ = 61; + *e++ = len + 1; + memcpy(e, dhcp_client_identifier, len + 1); + e += len + 1; + } } *e++ = 255; /* End of the list */ @@ -1557,8 +1571,24 @@ static int __init ic_proto_name(char *name) return 0; } #ifdef CONFIG_IP_PNP_DHCP - else if (!strcmp(name, "dhcp")) { + else if (!strncmp(name, "dhcp", 4)) { + char *client_id; + ic_proto_enabled &= ~IC_RARP; + client_id = strstr(name, "dhcp,"); + if (client_id) { + char *v; + + client_id = client_id + 
5; + v = strchr(client_id, ','); + if (!v) + return 1; + *v = 0; + if (kstrtou8(client_id, 0, dhcp_client_identifier)) + DBG("DHCP: Invalid client identifier type\n"); + strncpy(dhcp_client_identifier + 1, v + 1, 251); + *v = ','; + } return 1; } #endif diff --git a/kernel/net/ipv4/ipip.c b/kernel/net/ipv4/ipip.c index ff96396eb..a09fb0dec 100644 --- a/kernel/net/ipv4/ipip.c +++ b/kernel/net/ipv4/ipip.c @@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb) goto drop; if (iptunnel_pull_header(skb, 0, tpi.proto)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } return -1; @@ -251,10 +251,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EINVAL; } - p.i_key = p.o_key = p.i_flags = p.o_flags = 0; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - + p.i_key = p.o_key = 0; + p.i_flags = p.o_flags = 0; err = ip_tunnel_ioctl(dev, &p, cmd); if (err) return err; diff --git a/kernel/net/ipv4/ipmr.c b/kernel/net/ipv4/ipmr.c index 3a2c0162c..c3a38353f 100644 --- a/kernel/net/ipv4/ipmr.c +++ b/kernel/net/ipv4/ipmr.c @@ -134,7 +134,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); -static void mroute_clean_tables(struct mr_table *mrt); +static void mroute_clean_tables(struct mr_table *mrt, bool all); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .match = ipmr_rule_match, .configure = ipmr_rule_configure, .compare = ipmr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ipmr_rule_fill, .nlgroup = RTNLGRP_IPV4_RULE, .policy = ipmr_rule_policy, @@ -351,7 +350,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void 
ipmr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, true); kfree(mrt); } @@ -442,10 +441,6 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } @@ -541,10 +536,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } @@ -1209,7 +1200,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr_table *mrt) +static void mroute_clean_tables(struct mr_table *mrt, bool all) { int i; LIST_HEAD(list); @@ -1218,8 +1209,9 @@ static void mroute_clean_tables(struct mr_table *mrt) /* Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif_table[i].flags & VIFF_STATIC)) - vif_delete(mrt, i, 0, &list); + if (!all && (mrt->vif_table[i].flags & VIFF_STATIC)) + continue; + vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); @@ -1227,7 +1219,7 @@ static void mroute_clean_tables(struct mr_table *mrt) for (i = 0; i < MFC_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { - if (c->mfc_flags & MFC_STATIC) + if (!all && (c->mfc_flags & MFC_STATIC)) continue; list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); @@ -1262,7 +1254,7 @@ static void mrtsock_destruct(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, false); } } rtnl_unlock(); @@ -1679,17 +1671,18 @@ static void ip_encap(struct net *net, struct sk_buff *skb, nf_reset(skb); } 
-static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb) +static inline int ipmr_forward_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); + IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } /* @@ -1746,7 +1739,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * to blackhole. */ - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } @@ -1788,8 +1781,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. 
*/ - NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, - skb->dev, dev, + NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, + net, NULL, skb, skb->dev, dev, ipmr_forward_finish); return; diff --git a/kernel/net/ipv4/netfilter.c b/kernel/net/ipv4/netfilter.c index 65de0684e..c3776ff67 100644 --- a/kernel/net/ipv4/netfilter.c +++ b/kernel/net/ipv4/netfilter.c @@ -17,9 +17,8 @@ #include /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) +int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type) { - struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; @@ -104,7 +103,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb, } } -static int nf_ip_reroute(struct sk_buff *skb, +static int nf_ip_reroute(struct net *net, struct sk_buff *skb, const struct nf_queue_entry *entry) { const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -116,7 +115,7 @@ static int nf_ip_reroute(struct sk_buff *skb, skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(skb, RTN_UNSPEC); + return ip_route_me_harder(net, skb, RTN_UNSPEC); } return 0; } @@ -197,11 +196,4 @@ static int __init ipv4_netfilter_init(void) { return nf_register_afinfo(&nf_ip_afinfo); } - -static void __exit ipv4_netfilter_fini(void) -{ - nf_unregister_afinfo(&nf_ip_afinfo); -} - -module_init(ipv4_netfilter_init); -module_exit(ipv4_netfilter_fini); +subsys_initcall(ipv4_netfilter_init); diff --git a/kernel/net/ipv4/netfilter/Kconfig b/kernel/net/ipv4/netfilter/Kconfig index fb20f3631..c187c60e3 100644 --- a/kernel/net/ipv4/netfilter/Kconfig +++ b/kernel/net/ipv4/netfilter/Kconfig @@ -58,6 +58,13 @@ config NFT_REJECT_IPV4 default NFT_REJECT tristate +config NFT_DUP_IPV4 + tristate "IPv4 nf_tables packet duplication support" + depends on !NF_CONNTRACK || 
NF_CONNTRACK + select NF_DUP_IPV4 + help + This module enables IPv4 packet duplication support for nf_tables. + endif # NF_TABLES_IPV4 config NF_TABLES_ARP @@ -67,6 +74,13 @@ config NF_TABLES_ARP endif # NF_TABLES +config NF_DUP_IPV4 + tristate "Netfilter IPv4 packet duplication to alternate destination" + depends on !NF_CONNTRACK || NF_CONNTRACK + help + This option enables the nf_dup_ipv4 core, which duplicates an IPv4 + packet to be rerouted to another destination. + config NF_LOG_ARP tristate "ARP packet logging" default m if NETFILTER_ADVANCED=n @@ -195,7 +209,8 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE || IP_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/kernel/net/ipv4/netfilter/Makefile b/kernel/net/ipv4/netfilter/Makefile index 7fe6c7035..87b073da1 100644 --- a/kernel/net/ipv4/netfilter/Makefile +++ b/kernel/net/ipv4/netfilter/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o +obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables @@ -70,3 +71,5 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o # just filtering instance of ARP tables for now obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o + +obj-$(CONFIG_NF_DUP_IPV4) += nf_dup_ipv4.o diff --git a/kernel/net/ipv4/netfilter/arp_tables.c b/kernel/net/ipv4/netfilter/arp_tables.c index a61200754..11dccba47 100644 --- a/kernel/net/ipv4/netfilter/arp_tables.c +++ b/kernel/net/ipv4/netfilter/arp_tables.c @@ -186,7 +186,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(ret != 
0, ARPT_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", indev, arpinfo->iniface, - arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":""); + arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : ""); return 0; } @@ -195,7 +195,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", outdev, arpinfo->outiface, - arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":""); + arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : ""); return 0; } @@ -240,23 +240,24 @@ get_entry(const void *base, unsigned int offset) return (struct arpt_entry *)(base + offset); } -static inline __pure +static inline struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) { return (void *)entry + entry->next_offset; } unsigned int arpt_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; const struct arphdr *arp; - struct arpt_entry *e, *back; + struct arpt_entry *e, **jumpstack; const char *indev, *outdev; - void *table_base; + const void *table_base; + unsigned int cpu, stackidx = 0; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; @@ -270,16 +271,21 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; + cpu = smp_processor_id(); /* * Ensure we load private-> members after we've fetched the base * pointer. */ smp_read_barrier_depends(); - table_base = private->entries[smp_processor_id()]; + table_base = private->entries; + jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; + /* No TEE support for arptables, so no need to switch to alternate + * stack. All targets that reenter must return absolute verdicts. 
+ */ e = get_entry(table_base, private->hook_entry[hook]); - back = get_entry(table_base, private->underflow[hook]); + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.hooknum = hook; @@ -289,13 +295,15 @@ unsigned int arpt_do_table(struct sk_buff *skb, arp = arp_hdr(skb); do { const struct xt_entry_target *t; + struct xt_counters *counter; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); continue; } - ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1); t = arpt_get_target_c(e); @@ -310,27 +318,24 @@ unsigned int arpt_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - e = back; - back = get_entry(table_base, back->comefrom); + if (stackidx == 0) { + e = get_entry(table_base, + private->underflow[hook]); + } else { + e = jumpstack[--stackidx]; + e = arpt_next_entry(e); + } continue; } if (table_base + v != arpt_next_entry(e)) { - /* Save old back ptr in next entry */ - struct arpt_entry *next = arpt_next_entry(e); - next->comefrom = (void *)back - table_base; - - /* set back pointer to next entry */ - back = next; + jumpstack[stackidx++] = e; } e = get_entry(table_base, v); continue; } - /* Targets which reenter must return - * abs. 
verdicts - */ acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); @@ -463,7 +468,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } - next: +next: duprintf("Finished chain %u\n", hook); } return 1; @@ -521,6 +526,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); @@ -538,6 +547,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) err: module_put(t->u.kernel.target->me); out: + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -614,13 +625,14 @@ static inline void cleanup_entry(struct arpt_entry *e) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in * newinfo). 
*/ static int translate_table(struct xt_table_info *newinfo, void *entry0, - const struct arpt_replace *repl) + const struct arpt_replace *repl) { struct arpt_entry *iter; unsigned int i; @@ -702,12 +714,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -722,14 +728,16 @@ static void get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -774,7 +782,7 @@ static int copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; /* ... then copy entire thing ... 
*/ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; @@ -863,16 +871,16 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct arpt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -884,7 +892,7 @@ static int compat_table_info(const struct xt_table_info *info, #endif static int get_info(struct net *net, void __user *user, - const int *len, int compat) + const int *len, int compat) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -1037,7 +1045,7 @@ static int __do_replace(struct net *net, const char *name, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + loc_cpu_old_entry = oldinfo->entries; xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) cleanup_entry(iter); @@ -1061,7 +1069,7 @@ static int __do_replace(struct net *net, const char *name, } static int do_replace(struct net *net, const void __user *user, - unsigned int len) + unsigned int len) { int ret; struct arpt_replace tmp; @@ -1084,8 +1092,7 @@ static int do_replace(struct net *net, const void __user *user, if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1115,7 +1122,7 @@ static int do_replace(struct net *net, 
const void __user *user, static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1125,7 +1132,6 @@ static int do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct arpt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1181,12 +1187,13 @@ static int do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; + addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + xt_entry_foreach(iter, private->entries, private->size) { + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1396,7 +1403,7 @@ static int translate_compat_table(const char *name, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1416,9 +1423,17 @@ static int translate_compat_table(const char *name, i = 0; xt_entry_foreach(iter1, entry1, newinfo->size) { + iter1->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(iter1->counters.pcnt)) { + ret = -ENOMEM; + break; + } + ret = check_target(iter1, name); - if (ret != 0) + if (ret != 0) { + xt_percpu_counter_free(iter1->counters.pcnt); break; + } ++i; if (strcmp(arpt_get_target(iter1)->u.user.name, XT_ERROR_TARGET) == 0) @@ -1448,11 +1463,6 @@ static int translate_compat_table(const char *name, return ret; } - /* And one copy 
for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1511,8 +1521,7 @@ static int compat_do_replace(struct net *net, void __user *user, if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1609,7 +1618,6 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *pos; unsigned int size; int ret = 0; - void *loc_cpu_entry; unsigned int i = 0; struct arpt_entry *iter; @@ -1617,11 +1625,9 @@ static int compat_copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy on our node/cpu */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -1790,8 +1796,7 @@ struct xt_table *arpt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(newinfo, loc_cpu_entry, repl); @@ -1822,7 +1827,7 @@ void arpt_unregister_table(struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter); if (private->number > private->initial_entries) diff --git 
a/kernel/net/ipv4/netfilter/arptable_filter.c b/kernel/net/ipv4/netfilter/arptable_filter.c index 93876d031..1897ee160 100644 --- a/kernel/net/ipv4/netfilter/arptable_filter.c +++ b/kernel/net/ipv4/netfilter/arptable_filter.c @@ -27,13 +27,10 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c */ static unsigned int -arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +arptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? state->in : state->out); - - return arpt_do_table(skb, ops->hooknum, state, - net->ipv4.arptable_filter); + return arpt_do_table(skb, state, state->net->ipv4.arptable_filter); } static struct nf_hook_ops *arpfilter_ops __read_mostly; diff --git a/kernel/net/ipv4/netfilter/ip_tables.c b/kernel/net/ipv4/netfilter/ip_tables.c index 2d0e265fe..b99affad6 100644 --- a/kernel/net/ipv4/netfilter/ip_tables.c +++ b/kernel/net/ipv4/netfilter/ip_tables.c @@ -102,7 +102,7 @@ ip_packet_match(const struct iphdr *ip, if (FWINV(ret != 0, IPT_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", indev, ipinfo->iniface, - ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":""); + ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : ""); return false; } @@ -111,7 +111,7 @@ ip_packet_match(const struct iphdr *ip, if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", outdev, ipinfo->outiface, - ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":""); + ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : ""); return false; } @@ -120,7 +120,7 @@ ip_packet_match(const struct iphdr *ip, FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { dprintf("Packet protocol %hi does not match %hi.%s\n", ip->protocol, ipinfo->proto, - ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); + ipinfo->invflags & IPT_INV_PROTO ? 
" (INV)" : ""); return false; } @@ -246,7 +246,8 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, return 0; } -static void trace_packet(const struct sk_buff *skb, +static void trace_packet(struct net *net, + const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, @@ -254,15 +255,12 @@ static void trace_packet(const struct sk_buff *skb, const struct xt_table_info *private, const struct ipt_entry *e) { - const void *table_base; const struct ipt_entry *root; const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; - struct net *net = dev_net(in ? in : out); - table_base = private->entries[smp_processor_id()]; - root = get_entry(table_base, private->hook_entry[hook]); + root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP_TRACE_COMMENT_RULE]; @@ -278,7 +276,7 @@ static void trace_packet(const struct sk_buff *skb, } #endif -static inline __pure +static inline struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) { return (void *)entry + entry->next_offset; @@ -287,10 +285,10 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ipt_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; /* Initializing verdict to NF_DROP keeps gcc happy. 
*/ @@ -298,12 +296,13 @@ ipt_do_table(struct sk_buff *skb, const char *indev, *outdev; const void *table_base; struct ipt_entry *e, **jumpstack; - unsigned int *stackptr, origptr, cpu; + unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ + stackidx = 0; ip = ip_hdr(skb); indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; @@ -316,6 +315,7 @@ ipt_do_table(struct sk_buff *skb, acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; acpar.thoff = ip_hdrlen(skb); acpar.hotdrop = false; + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.family = NFPROTO_IPV4; @@ -331,20 +331,29 @@ ipt_do_table(struct sk_buff *skb, * pointer. */ smp_read_barrier_depends(); - table_base = private->entries[cpu]; + table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; - stackptr = per_cpu_ptr(private->stackptr, cpu); - origptr = *stackptr; + + /* Switch to alternate jumpstack if we're being invoked via TEE. + * TEE issues XT_CONTINUE verdict on original skb so we must not + * clobber the jumpstack. + * + * For recursion via REJECT or SYNPROXY the stack will be clobbered + * but it is no problem since absolute verdict is issued by these. 
+ */ + if (static_key_false(&xt_tee_enabled)) + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); - pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", - table->name, hook, origptr, + pr_debug("Entering %s(hook %u), UF %p\n", + table->name, hook, get_entry(table_base, private->underflow[hook])); do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; + struct xt_counters *counter; IP_NF_ASSERT(e); if (!ip_packet_match(ip, indev, outdev, @@ -361,7 +370,8 @@ ipt_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, skb->len, 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, skb->len, 1); t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); @@ -369,8 +379,8 @@ ipt_do_table(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) - trace_packet(skb, hook, state->in, state->out, - table->name, private, e); + trace_packet(state->net, skb, hook, state->in, + state->out, table->name, private, e); #endif /* Standard target? 
*/ if (!t->u.kernel.target->target) { @@ -383,28 +393,24 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - if (*stackptr <= origptr) { + if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " "to %p\n", e); } else { - e = jumpstack[--*stackptr]; + e = jumpstack[--stackidx]; pr_debug("Pulled %p out from pos %u\n", - e, *stackptr); + e, stackidx); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && !(e->ip.flags & IPT_F_GOTO)) { - if (*stackptr >= private->stacksize) { - verdict = NF_DROP; - break; - } - jumpstack[(*stackptr)++] = e; + jumpstack[stackidx++] = e; pr_debug("Pushed %p into pos %u\n", - e, *stackptr - 1); + e, stackidx - 1); } e = get_entry(table_base, v); @@ -423,11 +429,10 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - pr_debug("Exiting %s; resetting sp from %u to %u\n", - __func__, *stackptr, origptr); - *stackptr = origptr; - xt_write_recseq_end(addend); - local_bh_enable(); + pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); + + xt_write_recseq_end(addend); + local_bh_enable(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -479,7 +484,7 @@ mark_source_chains(const struct xt_table_info *newinfo, unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", @@ -544,7 +549,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } - next: +next: duprintf("Finished chain %u\n", hook); } return 1; @@ -665,6 +670,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -691,6 +700,7 @@ find_check_entry(struct ipt_entry *e, 
struct net *net, const char *name, ret = check_target(e, net, name); if (ret) goto err; + return 0; err: module_put(t->u.kernel.target->me); @@ -700,6 +710,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -784,13 +797,14 @@ cleanup_entry(struct ipt_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, - const struct ipt_replace *repl) + const struct ipt_replace *repl) { struct ipt_entry *iter; unsigned int i; @@ -866,12 +880,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -887,14 +895,16 @@ get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -939,11 +949,7 @@ copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... 
- * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1051,16 +1057,16 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ipt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1072,7 +1078,7 @@ static int compat_table_info(const struct xt_table_info *info, #endif static int get_info(struct net *net, void __user *user, - const int *len, int compat) + const int *len, int compat) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -1181,7 +1187,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - void *loc_cpu_old_entry; struct ipt_entry *iter; ret = 0; @@ -1224,8 +1229,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); @@ -1271,8 +1275,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (!newinfo) return 
-ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1301,9 +1304,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len) static int do_add_counters(struct net *net, const void __user *user, - unsigned int len, int compat) + unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1313,7 +1316,6 @@ do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct ipt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1369,12 +1371,12 @@ do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + xt_entry_foreach(iter, private->entries, private->size) { + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1444,7 +1446,6 @@ static int compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ipt_ip *ip, - unsigned int hookmask, int *size) { struct xt_match *match; @@ -1513,8 +1514,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { - ret = compat_find_calc_match(ematch, name, - &e->ip, e->comefrom, &off); + ret = compat_find_calc_match(ematch, name, &e->ip, &off); if (ret != 0) goto release_matches; ++j; @@ -1610,6 +1610,10 @@ compat_check_entry(struct 
ipt_entry *e, struct net *net, const char *name) unsigned int j; int ret = 0; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -1634,6 +1638,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -1718,7 +1725,7 @@ translate_compat_table(struct net *net, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1770,11 +1777,6 @@ translate_compat_table(struct net *net, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1821,8 +1823,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1893,7 +1894,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *pos; unsigned int size; int ret = 0; - const void *loc_cpu_entry; unsigned int i = 0; struct ipt_entry *iter; @@ -1901,14 +1901,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... 
- * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -2083,8 +2078,7 @@ struct xt_table *ipt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu, but dont care about preemption */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); @@ -2115,7 +2109,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) diff --git a/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c b/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c index 771ab3d01..4a9e6db9d 100644 --- a/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -367,6 +367,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) struct clusterip_config *config; int ret; + if (par->nft_compat) { + pr_err("cannot use CLUSTERIP target from nftables compat\n"); + return -EOPNOTSUPP; + } + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { @@ -487,14 +492,14 @@ static void arp_print(struct arp_payload *payload) { #define HBUFFERLEN 30 char hbuffer[HBUFFERLEN]; - int j,k; + int j, k; - for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) { + for (k = 0, 
j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) { hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); - hbuffer[k++]=':'; + hbuffer[k++] = ':'; } - hbuffer[--k]='\0'; + hbuffer[--k] = '\0'; pr_debug("src %pI4@%s, dst %pI4\n", &payload->src_ip, hbuffer, &payload->dst_ip); @@ -502,14 +507,14 @@ static void arp_print(struct arp_payload *payload) #endif static unsigned int -arp_mangle(const struct nf_hook_ops *ops, +arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; - struct net *net = dev_net(state->in ? state->in : state->out); + struct net *net = state->net; /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || diff --git a/kernel/net/ipv4/netfilter/ipt_ECN.c b/kernel/net/ipv4/netfilter/ipt_ECN.c index 4bf3dc49a..270765236 100644 --- a/kernel/net/ipv4/netfilter/ipt_ECN.c +++ b/kernel/net/ipv4/netfilter/ipt_ECN.c @@ -72,7 +72,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) tcph->cwr = einfo->proto.tcp.cwr; inet_proto_csum_replace2(&tcph->check, skb, - oldval, ((__be16 *)tcph)[6], 0); + oldval, ((__be16 *)tcph)[6], false); return true; } diff --git a/kernel/net/ipv4/netfilter/ipt_REJECT.c b/kernel/net/ipv4/netfilter/ipt_REJECT.c index 87907d4bd..1d16c0f28 100644 --- a/kernel/net/ipv4/netfilter/ipt_REJECT.c +++ b/kernel/net/ipv4/netfilter/ipt_REJECT.c @@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(skb, hook); + nf_send_reset(par->net, skb, hook); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. 
*/ break; diff --git a/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c b/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c index e9e677930..5fdc55651 100644 --- a/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -18,7 +18,7 @@ #include static struct iphdr * -synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; @@ -39,11 +39,14 @@ synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) } static void -synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, +synproxy_send_tcp(const struct synproxy_net *snet, + const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct iphdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { + struct net *net = nf_ct_net(snet->tmpl); + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; @@ -51,7 +54,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, skb_dst_set_noref(nskb, skb_dst(skb)); nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) goto free_nskb; if (nfct) { @@ -60,7 +63,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, nf_conntrack_get(nfct); } - ip_local_out(nskb); + ip_local_out(net, nskb->sk, nskb); return; free_nskb: @@ -68,7 +71,8 @@ free_nskb: } static void -synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, +synproxy_send_client_synack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; @@ -104,7 +108,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, skb->nfct, 
IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } @@ -148,7 +152,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } @@ -188,7 +192,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void @@ -220,13 +224,14 @@ synproxy_send_client_ack(const struct synproxy_net *snet, nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; - nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); } static bool @@ -257,7 +262,7 @@ static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_net *snet = synproxy_pernet(par->net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -286,7 +291,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(skb, th, &opts); + synproxy_send_client_synack(snet, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { @@ -298,11 +303,11 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned 
int ipv4_synproxy_hook(const struct nf_hook_ops *ops, +static unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out)); + struct synproxy_net *snet = synproxy_pernet(nhs->net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; @@ -432,14 +437,12 @@ static struct xt_target synproxy_tg4_reg __read_mostly = { static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { { .hook = ipv4_synproxy_hook, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, { .hook = ipv4_synproxy_hook, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, diff --git a/kernel/net/ipv4/netfilter/ipt_ah.c b/kernel/net/ipv4/netfilter/ipt_ah.c index 14a2aa8b8..a787d07f6 100644 --- a/kernel/net/ipv4/netfilter/ipt_ah.c +++ b/kernel/net/ipv4/netfilter/ipt_ah.c @@ -25,7 +25,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) bool r; pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, spi, max); - r=(spi >= min && spi <= max) ^ invert; + r = (spi >= min && spi <= max) ^ invert; pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); return r; } diff --git a/kernel/net/ipv4/netfilter/ipt_rpfilter.c b/kernel/net/ipv4/netfilter/ipt_rpfilter.c index 4bfaedf9b..78cc64edd 100644 --- a/kernel/net/ipv4/netfilter/ipt_rpfilter.c +++ b/kernel/net/ipv4/netfilter/ipt_rpfilter.c @@ -32,15 +32,14 @@ static __be32 rpfilter_get_saddr(__be32 addr) return addr; } -static bool rpfilter_lookup_reverse(struct flowi4 *fl4, +static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, const struct net_device *dev, u8 flags) { struct fib_result res; bool dev_match; - struct net *net = dev_net(dev); int ret __maybe_unused; - if (fib_lookup(net, fl4, &res)) + if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (res.type != RTN_UNICAST) { @@ -61,9 +60,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, if (FIB_RES_DEV(res) == dev) dev_match = true; #endif - if (dev_match || flags & XT_RPFILTER_LOOSE) - return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST; - return dev_match; + return dev_match || flags & XT_RPFILTER_LOOSE; } static bool rpfilter_is_local(const struct sk_buff *skb) @@ -98,7 +95,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_tos = RT_TOS(iph->tos); flow.flowi4_scope = RT_SCOPE_UNIVERSE; - return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert; + return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) diff --git a/kernel/net/ipv4/netfilter/iptable_filter.c b/kernel/net/ipv4/netfilter/iptable_filter.c index a0f3beca5..397ef2dd1 100644 --- a/kernel/net/ipv4/netfilter/iptable_filter.c +++ b/kernel/net/ipv4/netfilter/iptable_filter.c @@ -33,19 +33,16 @@ static const struct xt_table packet_filter = { }; static unsigned int -iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net 
*net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter); + return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/kernel/net/ipv4/netfilter/iptable_mangle.c b/kernel/net/ipv4/netfilter/iptable_mangle.c index 62cbb8c5f..ba5d392a1 100644 --- a/kernel/net/ipv4/netfilter/iptable_mangle.c +++ b/kernel/net/ipv4/netfilter/iptable_mangle.c @@ -39,7 +39,6 @@ static const struct xt_table packet_mangler = { static unsigned int ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) { - struct net_device *out = state->out; unsigned int ret; const struct iphdr *iph; u_int8_t tos; @@ -59,8 +58,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state, - dev_net(out)->ipv4.iptable_mangle); + ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); @@ -69,7 +67,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { - err = ip_route_me_harder(skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -80,18 +78,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* The work comes in here from netfilter.c. 
*/ static unsigned int -iptable_mangle_hook(const struct nf_hook_ops *ops, +iptable_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (ops->hooknum == NF_INET_LOCAL_OUT) + if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, state); - if (ops->hooknum == NF_INET_POST_ROUTING) - return ipt_do_table(skb, ops->hooknum, state, - dev_net(state->out)->ipv4.iptable_mangle); + if (state->hook == NF_INET_POST_ROUTING) + return ipt_do_table(skb, state, + state->net->ipv4.iptable_mangle); /* PREROUTING/INPUT/FORWARD: */ - return ipt_do_table(skb, ops->hooknum, state, - dev_net(state->in)->ipv4.iptable_mangle); + return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/kernel/net/ipv4/netfilter/iptable_nat.c b/kernel/net/ipv4/netfilter/iptable_nat.c index 0d4d9cdf9..ae2cd2752 100644 --- a/kernel/net/ipv4/netfilter/iptable_nat.c +++ b/kernel/net/ipv4/netfilter/iptable_nat.c @@ -28,49 +28,46 @@ static const struct xt_table nf_nat_ipv4_table = { .af = NFPROTO_IPV4, }; -static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table); + return ipt_do_table(skb, state, state->net->ipv4.nat_table); } -static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, 
skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = iptable_nat_ipv4_in, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, @@ -78,7 +75,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* After packet filtering, change source */ { .hook = iptable_nat_ipv4_out, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, @@ -86,7 +82,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = iptable_nat_ipv4_local_fn, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, @@ -94,7 +89,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* After packet filtering, change source */ { .hook = iptable_nat_ipv4_fn, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, diff --git a/kernel/net/ipv4/netfilter/iptable_raw.c b/kernel/net/ipv4/netfilter/iptable_raw.c index 0356e6da4..1ba02811a 100644 --- a/kernel/net/ipv4/netfilter/iptable_raw.c +++ 
b/kernel/net/ipv4/netfilter/iptable_raw.c @@ -20,19 +20,16 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int -iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw); + return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/kernel/net/ipv4/netfilter/iptable_security.c b/kernel/net/ipv4/netfilter/iptable_security.c index 4bce3980c..c2e23d5e9 100644 --- a/kernel/net/ipv4/netfilter/iptable_security.c +++ b/kernel/net/ipv4/netfilter/iptable_security.c @@ -37,20 +37,16 @@ static const struct xt_table security_table = { }; static unsigned int -iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* Somebody is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? 
state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, - net->ipv4.iptable_security); + return ipt_do_table(skb, state, state->net->ipv4.iptable_security); } static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -83,7 +79,7 @@ static int __init iptable_security_init(void) int ret; ret = register_pernet_subsys(&iptable_security_net_ops); - if (ret < 0) + if (ret < 0) return ret; sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); diff --git a/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 30ad9554b..461ca926f 100644 --- a/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, return NF_ACCEPT; } -static unsigned int ipv4_helper(const struct nf_hook_ops *ops, +static unsigned int ipv4_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -119,7 +119,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops, ct, ctinfo); } -static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, +static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -143,14 +143,14 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } -static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -158,7 +158,7 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, if (skb->len < sizeof(struct iphdr) || 
ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } /* Connection tracking may drop packets, but never alters them, so @@ -166,42 +166,36 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { { .hook = ipv4_conntrack_in, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_local, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_helper, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { .hook = ipv4_helper, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, @@ -280,7 +274,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; } - h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (h) { struct sockaddr_in sin; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 80d5554b9..c567e1b5d 100644 --- a/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -30,7 +30,7 @@ static inline struct nf_icmp_net *icmp_pernet(struct net *net) } static bool icmp_pkt_to_tuple(const struct 
sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; @@ -134,15 +134,17 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + const struct nf_conntrack_zone *zone; + struct nf_conntrack_zone tmp; NF_CT_ASSERT(skb->nfct == NULL); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb) + ip_hdrlen(skb) + sizeof(struct icmphdr), - PF_INET, &origtuple)) { + PF_INET, net, &origtuple)) { pr_debug("icmp_error_message: failed to get tuple\n"); return -NF_ACCEPT; } diff --git a/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c b/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c index c88b7d434..a04dee536 100644 --- a/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,14 +22,13 @@ #endif #include -static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, + u_int32_t user) { int err; - skb_orphan(skb); - local_bh_disable(); - err = ip_defrag(skb, user); + err = ip_defrag(net, skb, user); local_bh_enable(); if (!err) { @@ -43,33 +42,32 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, struct sk_buff *skb) { - u16 zone = NF_CT_DEFAULT_ZONE; - + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - if (skb->nfct) - zone = nf_ct_zone((struct nf_conn *)skb->nfct); -#endif + if (skb->nfct) { + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - 
if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) - return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + } #endif + if (nf_bridge_in_prerouting(skb)) + return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; + if (hooknum == NF_INET_PRE_ROUTING) - return IP_DEFRAG_CONNTRACK_IN + zone; + return IP_DEFRAG_CONNTRACK_IN + zone_id; else - return IP_DEFRAG_CONNTRACK_OUT + zone; + return IP_DEFRAG_CONNTRACK_OUT + zone_id; } -static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(skb->sk); - if (sk && (sk->sk_family == PF_INET) && - inet->nodefrag) + if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) && + inet_sk(sk)->nodefrag) return NF_ACCEPT; #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -83,9 +81,9 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, /* Gather fragments. 
*/ if (ip_is_fragment(ip_hdr(skb))) { enum ip_defrag_users user = - nf_ct_defrag_user(ops->hooknum, skb); + nf_ct_defrag_user(state->hook, skb); - if (nf_ct_ipv4_gather_frags(skb, user)) + if (nf_ct_ipv4_gather_frags(state->net, skb, user)) return NF_STOLEN; } return NF_ACCEPT; @@ -94,14 +92,12 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, static struct nf_hook_ops ipv4_defrag_ops[] = { { .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, diff --git a/kernel/net/ipv4/netfilter/nf_dup_ipv4.c b/kernel/net/ipv4/netfilter/nf_dup_ipv4.c new file mode 100644 index 000000000..ceb187308 --- /dev/null +++ b/kernel/net/ipv4/netfilter/nf_dup_ipv4.c @@ -0,0 +1,106 @@ +/* + * (C) 2007 by Sebastian Claßen + * (C) 2007-2010 by Jan Engelhardt + * + * Extracted from xt_TEE.c + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 or later, as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#endif + +static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, + const struct in_addr *gw, int oif) +{ + const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt; + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + if (oif != -1) + fl4.flowi4_oif = oif; + + fl4.daddr = gw->s_addr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return false; + + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + skb->dev = rt->dst.dev; + skb->protocol = htons(ETH_P_IP); + + return true; +} + +void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, + const struct in_addr *gw, int oif) +{ + struct iphdr *iph; + + if (this_cpu_read(nf_skb_duplicated)) + return; + /* + * Copy the skb, and route the copy. Will later return %XT_CONTINUE for + * the original skb, which should continue on its way as if nothing has + * happened. The copy should be independently delivered to the gateway. + */ + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Avoid counting cloned packets towards the original connection. */ + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + /* + * If we are in PREROUTING/INPUT, the checksum must be recalculated + * since the length could have changed as a result of defragmentation. + * + * We also decrease the TTL to mitigate potential loops between two + * hosts. + * + * Set %IP_DF so that the original source is notified of a potentially + * decreased MTU on the clone route. IPv6 does this too. 
+ */ + iph = ip_hdr(skb); + iph->frag_off |= htons(IP_DF); + if (hooknum == NF_INET_PRE_ROUTING || + hooknum == NF_INET_LOCAL_IN) + --iph->ttl; + ip_send_check(iph); + + if (nf_dup_ipv4_route(net, skb, gw, oif)) { + __this_cpu_write(nf_skb_duplicated, true); + ip_local_out(net, skb->sk, skb); + __this_cpu_write(nf_skb_duplicated, false); + } else { + kfree_skb(skb); + } +} +EXPORT_SYMBOL_GPL(nf_dup_ipv4); + +MODULE_AUTHOR("Sebastian Claßen "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("nf_dup_ipv4: Duplicate IPv4 packet"); +MODULE_LICENSE("GPL"); diff --git a/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index e59cc05c0..5075b7ecd 100644 --- a/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -120,7 +120,7 @@ static void nf_nat_ipv4_csum_update(struct sk_buff *skb, oldip = iph->daddr; newip = t->dst.u3.ip; } - inet_proto_csum_replace4(check, skb, oldip, newip, 1); + inet_proto_csum_replace4(check, skb, oldip, newip, true); } static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, @@ -151,7 +151,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, } } else inet_proto_csum_replace2(check, skb, - htons(oldlen), htons(datalen), 1); + htons(oldlen), htons(datalen), true); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -255,9 +255,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); unsigned int -nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -266,7 +266,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. 
*/ - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); /* We never see fragments: conntrack defrags on pre-routing * and local-out, and nf_nat_out protects post-routing. @@ -295,7 +295,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, case IP_CT_RELATED_REPLY: if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) + state->hook)) return NF_DROP; else return NF_ACCEPT; @@ -308,21 +308,21 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(ops, skb, state, ct); + ret = do_chain(priv, skb, state, ct); if (ret != NF_ACCEPT) return ret; - if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) break; - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } @@ -332,11 +332,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); @@ -345,9 +345,9 @@ oif_changed: EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); unsigned int -nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -355,7 +355,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); @@ -365,9 +365,9 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); unsigned int -nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -384,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - 
ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -396,7 +396,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET); + err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } @@ -407,9 +407,9 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); unsigned int -nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -424,14 +424,14 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -440,7 +440,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET); + err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = 
NF_DROP_ERR(err); } diff --git a/kernel/net/ipv4/netfilter/nf_nat_pptp.c b/kernel/net/ipv4/netfilter/nf_nat_pptp.c index 657d2307f..b3ca21b2b 100644 --- a/kernel/net/ipv4/netfilter/nf_nat_pptp.c +++ b/kernel/net/ipv4/netfilter/nf_nat_pptp.c @@ -45,7 +45,7 @@ static void pptp_nat_expected(struct nf_conn *ct, struct net *net = nf_ct_net(ct); const struct nf_conn *master = ct->master; struct nf_conntrack_expect *other_exp; - struct nf_conntrack_tuple t; + struct nf_conntrack_tuple t = {}; const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; struct nf_nat_range range; diff --git a/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c b/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c index 4557b4ab8..7b98baa13 100644 --- a/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -67,7 +67,7 @@ icmp_manip_pkt(struct sk_buff *skb, hdr = (struct icmphdr *)(skb->data + hdroff); inet_proto_csum_replace2(&hdr->checksum, skb, - hdr->un.echo.id, tuple->src.u.icmp.id, 0); + hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; } diff --git a/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c b/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c index 7c6766713..ddb894ac1 100644 --- a/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -1156,7 +1156,7 @@ static int snmp_parse_mangle(unsigned char *msg, } if (obj->type == SNMP_IPADDR) - mangle_address(ctx.begin, ctx.pointer - 4 , map, check); + mangle_address(ctx.begin, ctx.pointer - 4, map, check); kfree(obj->id); kfree(obj); diff --git a/kernel/net/ipv4/netfilter/nf_reject_ipv4.c b/kernel/net/ipv4/netfilter/nf_reject_ipv4.c index 3262e41ff..c747b2d9e 100644 --- a/kernel/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/kernel/net/ipv4/netfilter/nf_reject_ipv4.c @@ -99,7 +99,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, 
EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); /* Send RST reply */ -void nf_send_reset(struct sk_buff *oldskb, int hook) +void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; const struct iphdr *oiph; @@ -129,7 +129,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) goto free_nskb; /* "Never happens" */ @@ -157,7 +157,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip_local_out(nskb); + ip_local_out(net, nskb->sk, nskb); return; diff --git a/kernel/net/ipv4/netfilter/nf_tables_arp.c b/kernel/net/ipv4/netfilter/nf_tables_arp.c index 8412268bb..9d09d4f59 100644 --- a/kernel/net/ipv4/netfilter/nf_tables_arp.c +++ b/kernel/net/ipv4/netfilter/nf_tables_arp.c @@ -15,15 +15,15 @@ #include static unsigned int -nft_do_chain_arp(const struct nf_hook_ops *ops, +nft_do_chain_arp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo(&pkt, ops, skb, state); + nft_set_pktinfo(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } static struct nft_af_info nft_af_arp __read_mostly = { diff --git a/kernel/net/ipv4/netfilter/nf_tables_ipv4.c b/kernel/net/ipv4/netfilter/nf_tables_ipv4.c index aa180d3a6..ca9dc3c46 100644 --- a/kernel/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/kernel/net/ipv4/netfilter/nf_tables_ipv4.c @@ -18,18 +18,18 @@ #include #include -static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, +static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int 
nft_ipv4_output(const struct nf_hook_ops *ops, +static unsigned int nft_ipv4_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -41,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, return NF_ACCEPT; } - return nft_do_chain_ipv4(ops, skb, state); + return nft_do_chain_ipv4(priv, skb, state); } struct nft_af_info nft_af_ipv4 __read_mostly = { diff --git a/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c index bf5c30ae1..f5c66a7a4 100644 --- a/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -26,44 +26,44 @@ #include #include -static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain); } -static unsigned int 
nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv4 = { diff --git a/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c b/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c index e335b0afd..2375b0a8b 100644 --- a/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -21,7 +21,7 @@ #include #include -static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, +static unsigned int nf_route_table_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -37,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); mark = skb->mark; iph = ip_hdr(skb); @@ -45,7 +45,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, daddr = iph->daddr; tos = iph->tos; - ret = nft_do_chain(&pkt, ops); + ret = nft_do_chain(&pkt, priv); if (ret != NF_DROP && ret != NF_QUEUE) { iph = ip_hdr(skb); @@ -53,7 +53,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, iph->daddr != daddr || skb->mark != mark || iph->tos != tos) - if (ip_route_me_harder(skb, RTN_UNSPEC)) + if (ip_route_me_harder(state->net, skb, RTN_UNSPEC)) ret = NF_DROP; } return ret; diff --git a/kernel/net/ipv4/netfilter/nft_dup_ipv4.c b/kernel/net/ipv4/netfilter/nft_dup_ipv4.c new file mode 100644 index 000000000..bf855e64f --- /dev/null +++ b/kernel/net/ipv4/netfilter/nft_dup_ipv4.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify it + * under the 
terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_dup_ipv4 { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +static void nft_dup_ipv4_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + struct in_addr gw = { + .s_addr = (__force __be32)regs->data[priv->sreg_addr], + }; + int oif = regs->data[priv->sreg_dev]; + + nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); +} + +static int nft_dup_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_DUP_SREG_ADDR] == NULL) + return -EINVAL; + + priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); + err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr)); + if (err < 0) + return err; + + if (tb[NFTA_DUP_SREG_DEV] != NULL) { + priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + } + return 0; +} + +static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dup_ipv4_type; +static const struct nft_expr_ops nft_dup_ipv4_ops = { + .type = &nft_dup_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)), + .eval = nft_dup_ipv4_eval, + .init = nft_dup_ipv4_init, + .dump = nft_dup_ipv4_dump, +}; + +static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = { + [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 }, + 
[NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, +}; + +static struct nft_expr_type nft_dup_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "dup", + .ops = &nft_dup_ipv4_ops, + .policy = nft_dup_ipv4_policy, + .maxattr = NFTA_DUP_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_dup_ipv4_module_init(void) +{ + return nft_register_expr(&nft_dup_ipv4_type); +} + +static void __exit nft_dup_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_dup_ipv4_type); +} + +module_init(nft_dup_ipv4_module_init); +module_exit(nft_dup_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup"); diff --git a/kernel/net/ipv4/netfilter/nft_masq_ipv4.c b/kernel/net/ipv4/netfilter/nft_masq_ipv4.c index 40e414c4c..b72ffc58e 100644 --- a/kernel/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/kernel/net/ipv4/netfilter/nft_masq_ipv4.c @@ -26,7 +26,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, + regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, &range, pkt->out); } diff --git a/kernel/net/ipv4/netfilter/nft_redir_ipv4.c b/kernel/net/ipv4/netfilter/nft_redir_ipv4.c index d8d795df9..c09d43814 100644 --- a/kernel/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/kernel/net/ipv4/netfilter/nft_redir_ipv4.c @@ -36,7 +36,7 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, mr.range[0].flags |= priv->flags; regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, - pkt->ops->hooknum); + pkt->hook); } static struct nft_expr_type nft_redir_ipv4_type; diff --git a/kernel/net/ipv4/netfilter/nft_reject_ipv4.c b/kernel/net/ipv4/netfilter/nft_reject_ipv4.c index b07e58b51..c24f41c81 100644 --- a/kernel/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/kernel/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,11 +27,10 @@ static void nft_reject_ipv4_eval(const struct 
nft_expr *expr, switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach(pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset(pkt->skb, pkt->ops->hooknum); + nf_send_reset(pkt->net, pkt->skb, pkt->hook); break; default: break; diff --git a/kernel/net/ipv4/ping.c b/kernel/net/ipv4/ping.c index 05ff44b75..aa67e0e64 100644 --- a/kernel/net/ipv4/ping.c +++ b/kernel/net/ipv4/ping.c @@ -363,7 +363,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(isk->freebind || isk->transparent || has_addr || + if (!(net->ipv6.sysctl.ip_nonlocal_bind || + isk->freebind || isk->transparent || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; @@ -745,8 +746,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; } diff --git a/kernel/net/ipv4/proc.c b/kernel/net/ipv4/proc.c index e1f3b911d..3abd9d7a3 100644 --- a/kernel/net/ipv4/proc.c +++ b/kernel/net/ipv4/proc.c @@ -298,6 +298,10 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), + SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), + SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), + SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), + SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_SENTINEL }; diff --git a/kernel/net/ipv4/raw.c b/kernel/net/ipv4/raw.c index 561cd4b8f..7113bae4e 100644 --- a/kernel/net/ipv4/raw.c +++ b/kernel/net/ipv4/raw.c @@ -406,13 +406,16 @@ static int raw_send_hdrinc(struct sock *sk, 
struct flowi4 *fl4, ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + skb->transport_header += iphlen; + if (iph->protocol == IPPROTO_ICMP && + length >= iphlen + sizeof(struct icmphdr)) + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); } - if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(net, ((struct icmphdr *) - skb_transport_header(skb))->type); - err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, - NULL, rt->dst.dev, dst_output_sk); + err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, rt->dst.dev, + dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -483,6 +486,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; @@ -542,9 +546,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); - if (err) + err = ip_cmsg_send(net, msg, &ipc, false); + if (unlikely(err)) { + kfree(ipc.opt); goto out; + } if (ipc.opt) free = 1; } @@ -597,6 +603,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? 
FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); + if (!saddr && ipc.oif) { + err = l3mdev_get_saddr(net, ipc.oif, &fl4); + if (err < 0) + goto done; + } + if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -607,7 +619,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; diff --git a/kernel/net/ipv4/route.c b/kernel/net/ipv4/route.c index f45f2a12f..02c62299d 100644 --- a/kernel/net/ipv4/route.c +++ b/kernel/net/ipv4/route.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_SYSCTL @@ -109,6 +111,8 @@ #include #endif #include +#include +#include #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -125,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. */ @@ -457,12 +462,9 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, } #define IP_IDENTS_SZ 2048u -struct ip_ident_bucket { - atomic_t id; - u32 stamp32; -}; -static struct ip_ident_bucket *ip_idents __read_mostly; +static atomic_t *ip_idents __read_mostly; +static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. 
This makes hard for an attacker @@ -470,15 +472,16 @@ static struct ip_ident_bucket *ip_idents __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(bucket->stamp32); + u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; + atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 delta = 0; - if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, &bucket->id) - segs; + return atomic_add_return(segs + delta, p_id) - segs; } EXPORT_SYMBOL(ip_idents_reserve); @@ -749,11 +752,11 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { - if (fib_lookup(net, fl4, &res) == 0) { + if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, - 0, 0); + 0, jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -836,6 +839,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) struct inet_peer *peer; struct net *net; int log_martians; + int vif; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); @@ -844,10 +848,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); + vif = l3mdev_master_ifindex_rcu(rt->dst.dev); rcu_read_unlock(); net = dev_net(rt->dst.dev); - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); @@ -936,7 +941,8 @@ static int ip_error(struct sk_buff *skb) break; } - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + peer = 
inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, + l3mdev_master_ifindex(skb->dev), 1); send = true; if (peer) { @@ -977,7 +983,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { + if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, @@ -1147,7 +1153,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) +static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1188,7 +1194,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) fl4.flowi4_mark = skb->mark; rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, @@ -1405,6 +1411,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif + rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) @@ -1432,12 +1439,34 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, } static struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, bool nopolicy, bool noxfrm, bool will_cache) { - return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | - (nopolicy ? DST_NOPOLICY : 0) | - (noxfrm ? DST_NOXFRM : 0)); + struct rtable *rt; + + rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (nopolicy ? 
DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); + + if (rt) { + rt->rt_genid = rt_genid_ipv4(dev_net(dev)); + rt->rt_flags = flags; + rt->rt_type = type; + rt->rt_is_input = 0; + rt->rt_iif = 0; + rt->rt_pmtu = 0; + rt->rt_gateway = 0; + rt->rt_uses_gateway = 0; + rt->rt_table_id = 0; + INIT_LIST_HEAD(&rt->rt_uncached); + + rt->dst.output = ip_output; + if (flags & RTCF_LOCAL) + rt->dst.input = ip_local_deliver; + } + + return rt; } /* called in rcu_read_lock() section */ @@ -1446,6 +1475,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct rtable *rth; struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; u32 itag = 0; int err; @@ -1458,9 +1488,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) - if (ipv4_is_loopback(saddr)) - goto e_inval; + if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) + goto e_inval; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) @@ -1471,7 +1500,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (err < 0) goto e_err; } - rth = rt_dst_alloc(dev_net(dev)->loopback_dev, + if (our) + flags |= RTCF_LOCAL; + + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; @@ -1480,20 +1512,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; - - rth->rt_genid = rt_genid_ipv4(dev_net(dev)); - rth->rt_flags = RTCF_MULTICAST; - rth->rt_type = RTN_MULTICAST; rth->rt_is_input= 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); - if (our) { - rth->dst.input= ip_local_deliver; - rth->rt_flags |= RTCF_LOCAL; - } #ifdef CONFIG_IP_MROUTE if 
(!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) @@ -1538,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } +static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe, __rcu **fnhe_p; + u32 hval = fnhe_hashfun(daddr); + + spin_lock_bh(&fnhe_lock); + + hash = rcu_dereference_protected(nh->nh_exceptions, + lockdep_is_held(&fnhe_lock)); + hash += hval; + + fnhe_p = &hash->chain; + fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); + while (fnhe) { + if (fnhe->fnhe_daddr == daddr) { + rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( + fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); + fnhe_flush_routes(fnhe); + kfree_rcu(fnhe, rcu); + break; + } + fnhe_p = &fnhe->fnhe_next; + fnhe = rcu_dereference_protected(fnhe->fnhe_next, + lockdep_is_held(&fnhe_lock)); + } + + spin_unlock_bh(&fnhe_lock); +} + /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1548,7 +1597,6 @@ static int __mkroute_input(struct sk_buff *skb, struct rtable *rth; int err; struct in_device *out_dev; - unsigned int flags = 0; bool do_cache; u32 itag = 0; @@ -1592,18 +1640,27 @@ static int __mkroute_input(struct sk_buff *skb, fnhe = find_exception(&FIB_RES_NH(*res), daddr); if (do_cache) { - if (fnhe) + if (fnhe) { rth = rcu_dereference(fnhe->fnhe_rth_input); - else - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(&FIB_RES_NH(*res), daddr); + fnhe = NULL; + } else { + goto rt_cache; + } + } + + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); +rt_cache: if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; } } - rth = rt_dst_alloc(out_dev->dev, + rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) 
{ @@ -1611,21 +1668,22 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); - rth->rt_flags = flags; - rth->rt_type = res->type; rth->rt_is_input = 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res->table) + rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; - rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + if (lwtunnel_output_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_output = rth->dst.output; + rth->dst.output = lwtunnel_output; + } + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } skb_dst_set(skb, &rth->dst); out: err = 0; @@ -1633,6 +1691,48 @@ out: return err; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* To make ICMP packets follow the right flow, the multipath hash is + * calculated from the inner IP addresses in reverse order. 
+ */ +static int ip_multipath_icmp_hash(struct sk_buff *skb) +{ + const struct iphdr *outer_iph = ip_hdr(skb); + struct icmphdr _icmph; + const struct icmphdr *icmph; + struct iphdr _inner_iph; + const struct iphdr *inner_iph; + + if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) + goto standard_hash; + + icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), + &_icmph); + if (!icmph) + goto standard_hash; + + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_REDIRECT && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB) { + goto standard_hash; + } + + inner_iph = skb_header_pointer(skb, + outer_iph->ihl * 4 + sizeof(_icmph), + sizeof(_inner_iph), &_inner_iph); + if (!inner_iph) + goto standard_hash; + + return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr); + +standard_hash: + return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr); +} + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, const struct flowi4 *fl4, @@ -1640,8 +1740,15 @@ static int ip_mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) - fib_select_multipath(res); + if (res->fi && res->fi->fib_nhs > 1) { + int h; + + if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP)) + h = ip_multipath_icmp_hash(skb); + else + h = fib_multipath_hash(saddr, daddr); + fib_select_multipath(res, h); + } #endif /* create a routing cache entry */ @@ -1664,6 +1771,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); + struct ip_tunnel_info *tun_info; struct flowi4 fl4; unsigned int flags = 0; u32 itag = 0; @@ -1681,10 +1789,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. 
*/ + tun_info = skb_tunnel_info(skb); + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) + fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; + else + fl4.flowi4_tun_key.tun_id = 0; + skb_dst_drop(skb); + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; res.fi = NULL; + res.table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -1712,13 +1828,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev); fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; - err = fib_lookup(net, &fl4, &res); + err = fib_lookup(net, &fl4, &res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; @@ -1732,7 +1849,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &itag); if (err < 0) - goto martian_source_keep_err; + goto martian_source; goto local_input; } @@ -1754,7 +1871,7 @@ brd_input: err = fib_validate_source(skb, saddr, 0, tos, 0, dev, in_dev, &itag); if (err < 0) - goto martian_source_keep_err; + goto martian_source; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; @@ -1774,26 +1891,19 @@ local_input: } } - rth = rt_dst_alloc(net->loopback_dev, + rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; - rth->dst.input= ip_local_deliver; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif - - rth->rt_genid = rt_genid_ipv4(net); - rth->rt_flags = flags|RTCF_LOCAL; - rth->rt_type = res.type; rth->rt_is_input = 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - 
INIT_LIST_HEAD(&rth->rt_uncached); + if (res.table) + rth->rt_table_id = res.table->tb_id; + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1814,6 +1924,7 @@ no_route: RT_CACHE_STAT_INC(in_no_route); res.type = RTN_UNREACHABLE; res.fi = NULL; + res.table = NULL; goto local_input; /* @@ -1836,8 +1947,6 @@ e_nobufs: goto out; martian_source: - err = -EINVAL; -martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } @@ -1945,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res, struct fib_nh *nh = &FIB_RES_NH(*res); fnhe = find_exception(nh, fl4->daddr); - if (fnhe) + if (fnhe) { prth = &fnhe->fnhe_rth_output; - else { - if (unlikely(fl4->flowi4_flags & - FLOWI_FLAG_KNOWN_NH && - !(nh->nh_gw && - nh->nh_scope == RT_SCOPE_LINK))) { - do_cache = false; - goto add; + rth = rcu_dereference(*prth); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(nh, fl4->daddr); + fnhe = NULL; + } else { + goto rt_cache; } - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); } + + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); rth = rcu_dereference(*prth); + +rt_cache: if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; @@ -1965,29 +2084,19 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } add: - rth = rt_dst_alloc(dev_out, + rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), do_cache); if (!rth) return ERR_PTR(-ENOBUFS); - rth->dst.output = ip_output; - - rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); - rth->rt_flags = flags; - rth->rt_type = type; - rth->rt_is_input = 0; rth->rt_iif = orig_oif ? 
: 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res->table) + rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) - rth->dst.input = ip_local_deliver; if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { @@ -2006,6 +2115,8 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); + if (lwtunnel_output_redirect(rth->dst.lwtstate)) + rth->dst.output = lwtunnel_output; return rth; } @@ -2014,7 +2125,8 @@ add: * Major route resolver routine. */ -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, + int mp_hash) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2022,6 +2134,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) struct fib_result res; struct rtable *rth; int orig_oif; + int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; @@ -2097,7 +2210,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto out; } if (ipv4_is_local_multicast(fl4->daddr) || - ipv4_is_lbcast(fl4->daddr)) { + ipv4_is_lbcast(fl4->daddr) || + fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); @@ -2111,6 +2225,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } + + rth = l3mdev_get_rtable(dev_out, fl4); + if (rth) + goto out; } if (!fl4->daddr) { @@ -2124,10 +2242,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res)) { + err = fib_lookup(net, fl4, &res, 0); + if (err) { res.fi = NULL; res.table = NULL; - if (fl4->flowi4_oif) { + if (fl4->flowi4_oif && + !netif_index_is_l3_master(net, fl4->flowi4_oif)) { /* 
Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2152,7 +2272,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) res.type = RTN_UNICAST; goto make_route; } - rth = ERR_PTR(-ENETUNREACH); + rth = ERR_PTR(err); goto out; } @@ -2169,18 +2289,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) - fib_select_multipath(&res); - else -#endif - if (!res.prefixlen && - res.table->tb_num_default > 1 && - res.type == RTN_UNICAST && !fl4->flowi4_oif) - fib_select_default(&res); - - if (!fl4->saddr) - fl4->saddr = FIB_RES_PREFSRC(net, res); + fib_select_path(net, &res, fl4, mp_hash); dev_out = FIB_RES_DEV(res); fl4->flowi4_oif = dev_out->ifindex; @@ -2193,7 +2302,7 @@ out: rcu_read_unlock(); return rth; } -EXPORT_SYMBOL_GPL(__ip_route_output_key); +EXPORT_SYMBOL_GPL(__ip_route_output_key_hash); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { @@ -2245,7 +2354,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard_sk; + new->output = dst_discard_out; new->dev = ort->dst.dev; if (new->dev) @@ -2262,7 +2371,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - dst_free(new); } @@ -2272,7 +2380,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, - struct sock *sk) + const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); @@ -2288,7 +2396,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct net *net, __be32 dst, __be32 
src, +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, int event, int nowait, unsigned int flags) { @@ -2308,8 +2416,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = fl4->flowi4_tos; - r->rtm_table = RT_TABLE_MAIN; - if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) + r->rtm_table = table_id; + if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; @@ -2414,6 +2522,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int err; int mark; struct sk_buff *skb; + u32 table_id = RT_TABLE_MAIN; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2449,6 +2558,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + if (netif_index_is_l3_master(net, fl4.flowi4_oif)) + fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; + if (iif) { struct net_device *dev; @@ -2483,7 +2595,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(net, dst, src, &fl4, skb, + if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) + table_id = rt->rt_table_id; + + err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err < 0) @@ -2504,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev) } #ifdef CONFIG_SYSCTL -static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; @@ -2742,6 +2856,10 @@ int __init ip_rt_init(void) prandom_bytes(ip_idents, 
IP_IDENTS_SZ * sizeof(*ip_idents)); + ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); + if (!ip_tstamps) + panic("IP: failed to allocate ip_tstamps\n"); + for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); diff --git a/kernel/net/ipv4/syncookies.c b/kernel/net/ipv4/syncookies.c index df849e5a1..4cbe9f0a4 100644 --- a/kernel/net/ipv4/syncookies.c +++ b/kernel/net/ipv4/syncookies.c @@ -192,15 +192,11 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); -__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, - __u16 *mssp) +__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - tcp_synq_overflow(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return __cookie_v4_init_sequence(iph, th, mssp); } @@ -219,23 +215,26 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_check); -static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; + bool own_req; - child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + NULL, &own_req); if (child) { atomic_set(&req->rsk_refcnt, 1); + sock_rps_save_rxhash(child, skb); inet_csk_reqsk_queue_add(sk, req, child); } else { reqsk_free(req); } return child; } - +EXPORT_SYMBOL(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored @@ -288,6 +287,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +/* On 
input, sk is a listener. + * Output is listener if incoming packet would not create a child + * NULL if memory could not be allocated. + */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; @@ -326,7 +329,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ if (!req) goto out; @@ -345,7 +348,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; + treq->snt_synack.v64 = 0; treq->tfo_listener = false; ireq->ir_iif = sk->sk_bound_dev_if; @@ -381,17 +384,17 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? 
:dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); - ret = get_cookie_sock(sk, skb, req, &rt->dst); + ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ diff --git a/kernel/net/ipv4/sysctl_net_ipv4.c b/kernel/net/ipv4/sysctl_net_ipv4.c index 143f5f380..1866f9102 100644 --- a/kernel/net/ipv4/sysctl_net_ipv4.c +++ b/kernel/net/ipv4/sysctl_net_ipv4.c @@ -29,6 +29,7 @@ static int zero; static int one = 1; static int four = 4; +static int thousand = 1000; static int gso_max_segs = GSO_MAX_SEGS; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; @@ -45,10 +46,16 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { - write_seqlock(&net->ipv4.ip_local_ports.lock); + bool same_parity = !((range[0] ^ range[1]) & 1); + + write_seqlock_bh(&net->ipv4.ip_local_ports.lock); + if (same_parity && !net->ipv4.ip_local_ports.warned) { + net->ipv4.ip_local_ports.warned = true; + pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); + } net->ipv4.ip_local_ports.range[0] = range[0]; net->ipv4.ip_local_ports.range[1] = range[1]; - write_sequnlock(&net->ipv4.ip_local_ports.lock); + write_sequnlock_bh(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. 
*/ @@ -488,6 +495,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_recovery", + .data = &sysctl_tcp_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "tcp_reordering", .data = &sysctl_tcp_reordering, @@ -569,6 +583,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_min_rtt_wlen", + .data = &sysctl_tcp_min_rtt_wlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, @@ -702,9 +723,27 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = &one, .extra2 = &gso_max_segs, }, + { + .procname = "tcp_pacing_ss_ratio", + .data = &sysctl_tcp_pacing_ss_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { + .procname = "tcp_pacing_ca_ratio", + .data = &sysctl_tcp_pacing_ca_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, { .procname = "tcp_autocorking", .data = &sysctl_tcp_autocorking, @@ -827,6 +866,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_ecn_fallback", + .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), @@ -904,6 +950,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "igmp_link_local_mcast_reports", + .data = &sysctl_igmp_llm_reports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git 
a/kernel/net/ipv4/tcp.c b/kernel/net/ipv4/tcp.c index bb2ce74f6..036a76ba2 100644 --- a/kernel/net/ipv4/tcp.c +++ b/kernel/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include #include +#include #include int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; @@ -388,6 +389,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + tp->rtt_min[0].rtt = ~0U; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -450,11 +452,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + int state; sock_rps_record_flow(sk); sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == TCP_LISTEN) + + state = sk_state_load(sk); + if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events @@ -491,14 +496,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLIN | POLLRDNORM | POLLRDHUP; /* Connected or passive Fast Open socket? */ - if (sk->sk_state != TCP_SYN_SENT && - (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) { + if (state != TCP_SYN_SENT && + (state != TCP_SYN_RECV || tp->fastopen_rsk)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -506,9 +511,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - /* Potential race condition. 
If read of tp below will - * escape above sk->sk_state, we can be illegally awaken - * in SYN_* states. */ if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; @@ -516,8 +518,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after @@ -627,6 +628,8 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; + + tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) @@ -695,8 +698,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), - tss->flags); + ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, + min(rd_desc->count, len), tss->flags, + skb_socket_splice); if (ret > 0) rd_desc->count -= ret; return ret; @@ -779,7 +783,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, ret = -EAGAIN; break; } - sk_wait_data(sk, &timeo); + sk_wait_data(sk, &timeo, NULL); if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; @@ -809,16 +813,28 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_SYMBOL(tcp_splice_read); -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule) { struct sk_buff *skb; /* The TCP header must be at least 32-bit aligned. 
*/ size = ALIGN(size, 4); + if (unlikely(tcp_under_memory_pressure(sk))) + sk_mem_reclaim_partial(sk); + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (likely(skb)) { + bool mem_scheduled; + + if (force_schedule) { + mem_scheduled = true; + sk_forced_mem_schedule(sk, skb->truesize); + } else { + mem_scheduled = sk_wmem_schedule(sk, skb->truesize); + } + if (likely(mem_scheduled)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes @@ -885,11 +901,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto out_err; } - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; @@ -908,7 +925,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -921,7 +939,7 @@ new_segment: i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= MAX_SKB_FRAGS) { + if (!can_coalesce && i >= sysctl_max_skb_frags) { tcp_mark_push(tp, skb); goto new_segment; } @@ -951,7 +969,8 @@ new_segment: copied += copy; offset += copy; - if (!(size -= copy)) { + size -= copy; + if (!size) { tcp_tx_timestamp(sk, skb); goto out; } @@ -972,7 +991,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, 
&size_goal, flags); @@ -987,6 +1007,9 @@ do_error: if (copied) goto out; out_err: + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); return sk_stream_error(sk, flags, err); } @@ -1092,7 +1115,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto do_error; } @@ -1110,7 +1134,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } /* This should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1144,7 +1168,8 @@ new_segment: skb = sk_stream_alloc_skb(sk, select_size(sk, sg), - sk->sk_allocation); + sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -1187,7 +1212,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i == MAX_SKB_FRAGS || !sg) { + if (i == sysctl_max_skb_frags || !sg) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1247,7 +1272,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1275,6 +1301,9 @@ do_error: goto out; out_err: err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); release_sock(sk); return err; } @@ -1554,7 +1583,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int target; /* Read at least this many bytes */ 
long timeo; struct task_struct *user_recv = NULL; - struct sk_buff *skb; + struct sk_buff *skb, *last; u32 urg_hole = 0; if (unlikely(flags & MSG_ERRQUEUE)) @@ -1614,7 +1643,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* Next get a buffer. */ + last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { + last = skb; /* Now that we have two receive queues this * shouldn't happen. */ @@ -1733,15 +1764,17 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); - } else - sk_wait_data(sk, &timeo); + } else { + sk_wait_data(sk, &timeo, last); + } if (user_recv) { int chunk; /* __ Restore normal policy in scheduler __ */ - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; @@ -1752,7 +1785,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, do_prequeue: tcp_prequeue_process(sk); - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; @@ -1900,7 +1934,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. 
*/ - sk->sk_state = state; + sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@ -2204,7 +2238,8 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; - if ((tp->write_seq += tp->max_window + 2) == 0) + tp->write_seq += tp->max_window + 2; + if (tp->write_seq == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; tp->snd_cwnd = 2; @@ -2227,13 +2262,6 @@ int tcp_disconnect(struct sock *sk, int flags) } EXPORT_SYMBOL(tcp_disconnect); -void tcp_sock_destruct(struct sock *sk) -{ - inet_sock_destruct(sk); - - kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); -} - static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && @@ -2483,6 +2511,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, icsk->icsk_syn_retries = val; break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->save_syn = val; + break; + case TCP_LINGER2: if (val < 0) tp->linger2 = -1; @@ -2548,7 +2583,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, TCPF_LISTEN))) { tcp_fastopen_init_key_once(true); - err = fastopen_init_queue(sk, val); + fastopen_queue_tune(sk, val); } else { err = -EINVAL; } @@ -2599,15 +2634,19 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); /* Return information about state of tcp endpoint in API format. 
*/ void tcp_get_info(struct sock *sk, struct tcp_info *info) { - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; + u64 rate64; u32 rate; memset(info, 0, sizeof(*info)); + if (sk->sk_type != SOCK_STREAM) + return; + + info->tcpi_state = sk_state_load(sk); - info->tcpi_state = sk->sk_state; info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2635,7 +2674,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - if (sk->sk_state == TCP_LISTEN) { + if (info->tcpi_state == TCP_LISTEN) { info->tcpi_unacked = sk->sk_ack_backlog; info->tcpi_sacked = sk->sk_max_ack_backlog; } else { @@ -2665,16 +2704,20 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; rate = READ_ONCE(sk->sk_pacing_rate); - info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL; + rate64 = rate != ~0U ? rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_pacing_rate); rate = READ_ONCE(sk->sk_max_pacing_rate); - info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; + rate64 = rate != ~0U ? 
rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_max_pacing_rate); do { start = u64_stats_fetch_begin_irq(&tp->syncp); - info->tcpi_bytes_acked = tp->bytes_acked; - info->tcpi_bytes_received = tp->bytes_received; + put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked); + put_unaligned(tp->bytes_received, &info->tcpi_bytes_received); } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); + info->tcpi_segs_out = tp->segs_out; + info->tcpi_segs_in = tp->segs_in; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2812,10 +2855,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - if (icsk->icsk_accept_queue.fastopenq) - val = icsk->icsk_accept_queue.fastopenq->max_qlen; - else - val = 0; + val = icsk->icsk_accept_queue.fastopenq.max_qlen; break; case TCP_TIMESTAMP: @@ -2824,6 +2864,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; + case TCP_SAVE_SYN: + val = tp->save_syn; + break; + case TCP_SAVED_SYN: { + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (tp->saved_syn) { + if (len < tp->saved_syn[0]) { + if (put_user(tp->saved_syn[0], optlen)) { + release_sock(sk); + return -EFAULT; + } + release_sock(sk); + return -EINVAL; + } + len = tp->saved_syn[0]; + if (put_user(len, optlen)) { + release_sock(sk); + return -EFAULT; + } + if (copy_to_user(optval, tp->saved_syn + 1, len)) { + release_sock(sk); + return -EFAULT; + } + tcp_saved_syn_free(tp); + release_sock(sk); + } else { + release_sock(sk); + len = 0; + if (put_user(len, optlen)) + return -EFAULT; + } + return 0; + } default: return -ENOPROTOOPT; } @@ -3028,11 +3104,12 @@ __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { - unsigned long limit = nr_free_buffer_pages() / 8; + unsigned long limit = nr_free_buffer_pages() / 16; + limit = max(limit, 128UL); - sysctl_tcp_mem[0] = limit / 4 * 3; - sysctl_tcp_mem[1] = limit; - sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + 
sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ + sysctl_tcp_mem[1] = limit; /* 6.25 % */ + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void) diff --git a/kernel/net/ipv4/tcp_bic.c b/kernel/net/ipv4/tcp_bic.c index c037644ea..fd1405d37 100644 --- a/kernel/net/ipv4/tcp_bic.c +++ b/kernel/net/ipv4/tcp_bic.c @@ -146,7 +146,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { bictcp_update(ca, tp->snd_cwnd); diff --git a/kernel/net/ipv4/tcp_cdg.c b/kernel/net/ipv4/tcp_cdg.c new file mode 100644 index 000000000..167b6a3e1 --- /dev/null +++ b/kernel/net/ipv4/tcp_cdg.c @@ -0,0 +1,433 @@ +/* + * CAIA Delay-Gradient (CDG) congestion control + * + * This implementation is based on the paper: + * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011. + * + * Scavenger traffic (Less-than-Best-Effort) should disable coexistence + * heuristics using parameters use_shadow=0 and use_ineff=0. + * + * Parameters window, backoff_beta, and backoff_factor are crucial for + * throughput and delay. Future work is needed to determine better defaults, + * and to provide guidelines for use in different environments/contexts. + * + * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/. + * Parameter window is only configurable when loading tcp_cdg as a module. + * + * Notable differences from paper/FreeBSD: + * o Using Hybrid Slow start and Proportional Rate Reduction. + * o Add toggle for shadow window mechanism. Suggested by David Hayes. + * o Add toggle for non-congestion loss tolerance. + * o Scaling parameter G is changed to a backoff factor; + * conversion is given by: backoff_factor = 1000/(G * window). + * o Limit shadow window to 2 * cwnd, or to cwnd when application limited. 
+ * o More accurate e^-x. + */ +#include +#include +#include +#include + +#define HYSTART_ACK_TRAIN 1 +#define HYSTART_DELAY 2 + +static int window __read_mostly = 8; +static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */ +static unsigned int backoff_factor __read_mostly = 42; +static unsigned int hystart_detect __read_mostly = 3; +static unsigned int use_ineff __read_mostly = 5; +static bool use_shadow __read_mostly = true; +static bool use_tolerance __read_mostly; + +module_param(window, int, 0444); +MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)"); +module_param(backoff_beta, uint, 0644); +MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)"); +module_param(backoff_factor, uint, 0644); +MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor"); +module_param(hystart_detect, uint, 0644); +MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start " + "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)"); +module_param(use_ineff, uint, 0644); +MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)"); +module_param(use_shadow, bool, 0644); +MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); +module_param(use_tolerance, bool, 0644); +MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); + +struct minmax { + union { + struct { + s32 min; + s32 max; + }; + u64 v64; + }; +}; + +enum cdg_state { + CDG_UNKNOWN = 0, + CDG_NONFULL = 1, + CDG_FULL = 2, + CDG_BACKOFF = 3, +}; + +struct cdg { + struct minmax rtt; + struct minmax rtt_prev; + struct minmax *gradients; + struct minmax gsum; + bool gfilled; + u8 tail; + u8 state; + u8 delack; + u32 rtt_seq; + u32 undo_cwnd; + u32 shadow_wnd; + u16 backoff_cnt; + u16 sample_cnt; + s32 delay_min; + u32 last_ack; + u32 round_start; +}; + +/** + * nexp_u32 - negative base-e exponential + * @ux: x in units of micro + * + * Returns exp(ux * -1e-6) * U32_MAX. 
+ */ +static u32 __pure nexp_u32(u32 ux) +{ + static const u16 v[] = { + /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */ + 65535, + 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422, + 61378, 57484, 50423, 38795, 22965, 8047, 987, 14, + }; + u32 msb = ux >> 8; + u32 res; + int i; + + /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */ + if (msb > U16_MAX) + return 0; + + /* Scale first eight bits linearly: */ + res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000); + + /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */ + for (i = 1; msb; i++, msb >>= 1) { + u32 y = v[i & -(msb & 1)] + U32_C(1); + + res = ((u64)res * y) >> 16; + } + + return res; +} + +/* Based on the HyStart algorithm (by Ha et al.) that is implemented in + * tcp_cubic. Differences/experimental changes: + * o Using Hayes' delayed ACK filter. + * o Using a usec clock for the ACK train. + * o Reset ACK train when application limited. + * o Invoked at any cwnd (i.e. also when cwnd < 16). + * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh). 
+ */ +static void tcp_cdg_hystart_update(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min); + if (ca->delay_min == 0) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { + u32 now_us = div_u64(local_clock(), NSEC_PER_USEC); + + if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { + ca->last_ack = now_us; + ca->round_start = now_us; + } else if (before(now_us, ca->last_ack + 3000)) { + u32 base_owd = max(ca->delay_min / 2U, 125U); + + ca->last_ack = now_us; + if (after(now_us, ca->round_start + base_owd)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + return; + } + } + } + + if (hystart_detect & HYSTART_DELAY) { + if (ca->sample_cnt < 8) { + ca->sample_cnt++; + } else { + s32 thresh = max(ca->delay_min + ca->delay_min / 8U, + 125U); + + if (ca->rtt.min > thresh) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } + } + } +} + +static s32 tcp_cdg_grad(struct cdg *ca) +{ + s32 gmin = ca->rtt.min - ca->rtt_prev.min; + s32 gmax = ca->rtt.max - ca->rtt_prev.max; + s32 grad; + + if (ca->gradients) { + ca->gsum.min += gmin - ca->gradients[ca->tail].min; + ca->gsum.max += gmax - ca->gradients[ca->tail].max; + ca->gradients[ca->tail].min = gmin; + ca->gradients[ca->tail].max = gmax; + ca->tail = (ca->tail + 1) & (window - 1); + gmin = ca->gsum.min; + gmax = ca->gsum.max; + } + + /* We keep sums to ignore gradients during cwnd reductions; + * the paper's smoothed gradients otherwise simplify to: + * (rtt_latest - rtt_oldest) / window. + * + * We also drop division by window here. + */ + grad = gmin > 0 ? 
gmin : gmax; + + /* Extrapolate missing values in gradient window: */ + if (!ca->gfilled) { + if (!ca->gradients && window > 1) + grad *= window; /* Memory allocation failed. */ + else if (ca->tail == 0) + ca->gfilled = true; + else + grad = (grad * window) / (int)ca->tail; + } + + /* Backoff was effectual: */ + if (gmin <= -32 || gmax <= -32) + ca->backoff_cnt = 0; + + if (use_tolerance) { + /* Reduce small variations to zero: */ + gmin = DIV_ROUND_CLOSEST(gmin, 64); + gmax = DIV_ROUND_CLOSEST(gmax, 64); + + if (gmin > 0 && gmax <= 0) + ca->state = CDG_FULL; + else if ((gmin > 0 && gmax > 0) || gmax < 0) + ca->state = CDG_NONFULL; + } + return grad; +} + +static bool tcp_cdg_backoff(struct sock *sk, u32 grad) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (prandom_u32() <= nexp_u32(grad * backoff_factor)) + return false; + + if (use_ineff) { + ca->backoff_cnt++; + if (ca->backoff_cnt > use_ineff) + return false; + } + + ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd); + ca->state = CDG_BACKOFF; + tcp_enter_cwr(sk); + return true; +} + +/* Not called in CWR or Recovery state. 
*/ +static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_snd_cwnd; + u32 incr; + + if (tcp_in_slow_start(tp) && hystart_detect) + tcp_cdg_hystart_update(sk); + + if (after(ack, ca->rtt_seq) && ca->rtt.v64) { + s32 grad = 0; + + if (ca->rtt_prev.v64) + grad = tcp_cdg_grad(ca); + ca->rtt_seq = tp->snd_nxt; + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + ca->last_ack = 0; + ca->sample_cnt = 0; + + if (grad > 0 && tcp_cdg_backoff(sk, grad)) + return; + } + + if (!tcp_is_cwnd_limited(sk)) { + ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd); + return; + } + + prior_snd_cwnd = tp->snd_cwnd; + tcp_reno_cong_avoid(sk, ack, acked); + + incr = tp->snd_cwnd - prior_snd_cwnd; + ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); +} + +static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (rtt_us <= 0) + return; + + /* A heuristic for filtering delayed ACKs, adapted from: + * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support + * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. + */ + if (tp->sacked_out == 0) { + if (num_acked == 1 && ca->delack) { + /* A delayed ACK is only used for the minimum if it is + * provenly lower than an existing non-zero minimum. 
+ */ + ca->rtt.min = min(ca->rtt.min, rtt_us); + ca->delack--; + return; + } else if (num_acked > 1 && ca->delack < 5) { + ca->delack++; + } + } + + ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us); + ca->rtt.max = max(ca->rtt.max, rtt_us); +} + +static u32 tcp_cdg_ssthresh(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->undo_cwnd = tp->snd_cwnd; + + if (ca->state == CDG_BACKOFF) + return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); + + if (ca->state == CDG_NONFULL && use_tolerance) + return tp->snd_cwnd; + + ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd); + if (use_shadow) + return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1); + return max(2U, tp->snd_cwnd >> 1); +} + +static u32 tcp_cdg_undo_cwnd(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd); +} + +static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct minmax *gradients; + + switch (ev) { + case CA_EVENT_CWND_RESTART: + gradients = ca->gradients; + if (gradients) + memset(gradients, 0, window * sizeof(gradients[0])); + memset(ca, 0, sizeof(*ca)); + + ca->gradients = gradients; + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; + break; + case CA_EVENT_COMPLETE_CWR: + ca->state = CDG_UNKNOWN; + ca->rtt_seq = tp->snd_nxt; + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + break; + default: + break; + } +} + +static void tcp_cdg_init(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* We silently fall back to window = 1 if allocation fails. 
*/ + if (window > 1) + ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), + GFP_NOWAIT | __GFP_NOWARN); + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; +} + +static void tcp_cdg_release(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + kfree(ca->gradients); +} + +struct tcp_congestion_ops tcp_cdg __read_mostly = { + .cong_avoid = tcp_cdg_cong_avoid, + .cwnd_event = tcp_cdg_cwnd_event, + .pkts_acked = tcp_cdg_acked, + .undo_cwnd = tcp_cdg_undo_cwnd, + .ssthresh = tcp_cdg_ssthresh, + .release = tcp_cdg_release, + .init = tcp_cdg_init, + .owner = THIS_MODULE, + .name = "cdg", +}; + +static int __init tcp_cdg_register(void) +{ + if (backoff_beta > 1024 || window < 1 || window > 256) + return -ERANGE; + if (!is_power_of_2(window)) + return -EINVAL; + + BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_cdg); + return 0; +} + +static void __exit tcp_cdg_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_cdg); +} + +module_init(tcp_cdg_register); +module_exit(tcp_cdg_unregister); +MODULE_AUTHOR("Kenneth Klette Jonassen"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP CDG"); diff --git a/kernel/net/ipv4/tcp_cong.c b/kernel/net/ipv4/tcp_cong.c index 84be008c9..882caa4e7 100644 --- a/kernel/net/ipv4/tcp_cong.c +++ b/kernel/net/ipv4/tcp_cong.c @@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name) +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; - u32 key; + u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = __tcp_ca_find_autoload(name); - key = ca ? 
ca->key : TCP_CA_UNSPEC; + if (ca) { + key = ca->key; + *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; + } rcu_read_unlock(); return key; @@ -170,6 +173,10 @@ out: */ if (ca->get_info) memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) @@ -178,6 +185,10 @@ void tcp_init_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } static void tcp_reinit_congestion_control(struct sock *sk, @@ -189,8 +200,8 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; - if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) - icsk->icsk_ca_ops->init(sk); + if (sk->sk_state != TCP_CLOSE) + tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ @@ -365,10 +376,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) */ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { - u32 cwnd = tp->snd_cwnd + acked; + u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); - if (cwnd > tp->snd_ssthresh) - cwnd = tp->snd_ssthresh + 1; acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); @@ -413,7 +422,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In "safe" area, increase. 
*/ - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; diff --git a/kernel/net/ipv4/tcp_cubic.c b/kernel/net/ipv4/tcp_cubic.c index 06d3d665a..448c2615f 100644 --- a/kernel/net/ipv4/tcp_cubic.c +++ b/kernel/net/ipv4/tcp_cubic.c @@ -151,6 +151,27 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } +static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_TX_START) { + struct bictcp *ca = inet_csk_ca(sk); + u32 now = tcp_time_stamp; + s32 delta; + + delta = now - tcp_sk(sk)->lsndtime; + + /* We were application limited (idle) for a while. + * Shift epoch_start to keep cwnd growth to cubic curve. + */ + if (ca->epoch_start && delta > 0) { + ca->epoch_start += delta; + if (after(ca->epoch_start, now)) + ca->epoch_start = now; + } + return; + } +} + /* calculate the cubic root of x using a table lookup followed by one * Newton-Raphson iteration. 
* Avg err ~= 0.195% @@ -320,7 +341,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { if (hystart && after(ack, ca->end_seq)) bictcp_hystart_reset(sk); acked = tcp_slow_start(tp, acked); @@ -439,7 +460,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ - if (hystart && tp->snd_cwnd <= tp->snd_ssthresh && + if (hystart && tcp_in_slow_start(tp) && tp->snd_cwnd >= hystart_low_window) hystart_update(sk, delay); } @@ -450,6 +471,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, + .cwnd_event = bictcp_cwnd_event, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "cubic", diff --git a/kernel/net/ipv4/tcp_dctcp.c b/kernel/net/ipv4/tcp_dctcp.c index 4c41c1287..7e538f71f 100644 --- a/kernel/net/ipv4/tcp_dctcp.c +++ b/kernel/net/ipv4/tcp_dctcp.c @@ -204,20 +204,26 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { - /* For avoiding denominator == 1. */ - if (ca->acked_bytes_total == 0) - ca->acked_bytes_total = 1; + u64 bytes_ecn = ca->acked_bytes_ecn; + u32 alpha = ca->dctcp_alpha; /* alpha = (1 - g) * alpha + g * F */ - ca->dctcp_alpha = ca->dctcp_alpha - - (ca->dctcp_alpha >> dctcp_shift_g) + - (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / - ca->acked_bytes_total; - if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) - /* Clamp dctcp_alpha to max. */ - ca->dctcp_alpha = DCTCP_MAX_ALPHA; + alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); + if (bytes_ecn) { + /* If dctcp_shift_g == 1, a 32bit value would overflow + * after 8 Mbytes. 
+ */ + bytes_ecn <<= (10 - dctcp_shift_g); + do_div(bytes_ecn, max(1U, ca->acked_bytes_total)); + alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA); + } + /* dctcp_alpha can be read from dctcp_get_info() without + * synchro, so we ask compiler to not use dctcp_alpha + * as a temporary variable in prior operations. + */ + WRITE_ONCE(ca->dctcp_alpha, alpha); dctcp_reset(tp, ca); } } diff --git a/kernel/net/ipv4/tcp_diag.c b/kernel/net/ipv4/tcp_diag.c index 79b34a0f4..b31604086 100644 --- a/kernel/net/ipv4/tcp_diag.c +++ b/kernel/net/ipv4/tcp_diag.c @@ -19,13 +19,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { - const struct tcp_sock *tp = tcp_sk(sk); struct tcp_info *info = _info; - if (sk->sk_state == TCP_LISTEN) { + if (sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; - } else { + } else if (sk->sk_type == SOCK_STREAM) { + const struct tcp_sock *tp = tcp_sk(sk); + r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } @@ -50,6 +51,7 @@ static const struct inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_type = IPPROTO_TCP, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init tcp_diag_init(void) diff --git a/kernel/net/ipv4/tcp_fastopen.c b/kernel/net/ipv4/tcp_fastopen.c index f9c0fb84e..55be6ac70 100644 --- a/kernel/net/ipv4/tcp_fastopen.c +++ b/kernel/net/ipv4/tcp_fastopen.c @@ -124,27 +124,29 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, return false; } -static bool tcp_fastopen_create_child(struct sock *sk, - struct sk_buff *skb, - struct dst_entry *dst, - struct request_sock *req) +static struct sock *tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct dst_entry *dst, + struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = 
&inet_csk(sk)->icsk_accept_queue; struct sock *child; u32 end_seq; + bool own_req; req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + NULL, &own_req); if (!child) - return false; + return NULL; - spin_lock(&queue->fastopenq->lock); - queue->fastopenq->qlen++; - spin_unlock(&queue->fastopenq->lock); + spin_lock(&queue->fastopenq.lock); + queue->fastopenq.qlen++; + spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created @@ -161,15 +163,13 @@ static bool tcp_fastopen_create_child(struct sock *sk, tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent + * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); - atomic_set(&req->rsk_refcnt, 1); - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); + atomic_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ inet_csk(child)->icsk_af_ops->rebuild_header(child); @@ -178,12 +178,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * Note that IPv6 might also have used skb_get() trick - * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts) - * So we need to eventually get a clone of the packet, - * before inserting it in sk_receive_queue. + /* Queue the data carried in the SYN packet. + * We used to play tricky games with skb_get(). 
+ * With lockless listener, it is a dead end. + * Do not think about it. * * XXX (TFO) - we honor a zero-payload TFO request for now, * (any reason not to?) but no need to queue the skb since @@ -191,12 +189,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, */ end_seq = TCP_SKB_CB(skb)->end_seq; if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2; - - if (unlikely(skb_shared(skb))) - skb2 = skb_clone(skb, GFP_ATOMIC); - else - skb2 = skb_get(skb); + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (likely(skb2)) { skb_dst_drop(skb2); @@ -214,11 +207,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, } } tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; - sk->sk_data_ready(sk); - bh_unlock_sock(child); - sock_put(child); - WARN_ON(!req->sk); - return true; + /* tcp_conn_request() is sending the SYNACK, + * and queues the child into listener accept queue. + */ + return child; } static bool tcp_fastopen_queue_check(struct sock *sk) @@ -235,8 +227,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ - fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - if (!fastopenq || fastopenq->max_qlen == 0) + fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq->max_qlen == 0) return false; if (fastopenq->qlen >= fastopenq->max_qlen) { @@ -261,13 +253,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). 
*/ -bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst) +struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst) { struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); @@ -276,7 +269,7 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; - return false; + return NULL; } if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) @@ -296,11 +289,12 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, * data in SYN_RECV state. */ fastopen: - if (tcp_fastopen_create_child(sk, skb, dst, req)) { + child = tcp_fastopen_create_child(sk, skb, dst, req); + if (child) { foc->len = -1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - return true; + return child; } NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (foc->len > 0) /* Client presents an invalid cookie */ @@ -308,6 +302,5 @@ fastopen: valid_foc.exp = foc->exp; *foc = valid_foc; - return false; + return NULL; } -EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/kernel/net/ipv4/tcp_highspeed.c b/kernel/net/ipv4/tcp_highspeed.c index 882c08aae..db7842495 100644 --- a/kernel/net/ipv4/tcp_highspeed.c +++ b/kernel/net/ipv4/tcp_highspeed.c @@ -116,7 +116,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { /* Update AIMD parameters. 
diff --git a/kernel/net/ipv4/tcp_htcp.c b/kernel/net/ipv4/tcp_htcp.c index 58469fff6..82f0d9ed6 100644 --- a/kernel/net/ipv4/tcp_htcp.c +++ b/kernel/net/ipv4/tcp_htcp.c @@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { /* In dangerous area, increase slowly. diff --git a/kernel/net/ipv4/tcp_hybla.c b/kernel/net/ipv4/tcp_hybla.c index f963b274f..083831e35 100644 --- a/kernel/net/ipv4/tcp_hybla.c +++ b/kernel/net/ipv4/tcp_hybla.c @@ -112,7 +112,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) rho_fractions = ca->rho_3ls - (ca->rho << 3); - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { /* * slow start * INC = 2^RHO - 1 diff --git a/kernel/net/ipv4/tcp_illinois.c b/kernel/net/ipv4/tcp_illinois.c index f71002e4d..2ab9bbb6f 100644 --- a/kernel/net/ipv4/tcp_illinois.c +++ b/kernel/net/ipv4/tcp_illinois.c @@ -268,7 +268,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In slow start */ - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { diff --git a/kernel/net/ipv4/tcp_input.c b/kernel/net/ipv4/tcp_input.c index c9ab96418..d4c511584 100644 --- a/kernel/net/ipv4/tcp_input.c +++ b/kernel/net/ipv4/tcp_input.c @@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; +int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_thin_dupack __read_mostly; @@ -109,6 +110,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ #define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ @@ -196,11 +198,13 @@ static void tcp_enter_quickack_mode(struct sock *sk) * and the session is not interactive. */ -static inline bool tcp_in_quickack_mode(const struct sock *sk) +static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + const struct dst_entry *dst = __sk_dst_get(sk); - return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; + return (dst && dst_metric(dst, RTAX_QUICKACK)) || + (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) @@ -359,7 +363,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !sk_under_memory_pressure(sk)) { + !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. 
Increase window, if skb with such overhead @@ -446,7 +450,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !sk_under_memory_pressure(sk) && + !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -750,13 +754,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) * TCP pacing, to smooth the burst on large writes when packets * in flight is significantly lower than cwnd (or rwin) */ +int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; +int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; + static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ - rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); + rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); + + /* current rate is (cwnd * mss) / srtt + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. + * In Congestion Avoidance phase, set it to 120 % the current rate. + * + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching + * end of slow start and should slow down. 
+ */ + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate *= sysctl_tcp_pacing_ss_ratio; + else + rate *= sysctl_tcp_pacing_ca_ratio; rate *= max(tp->snd_cwnd, tp->packets_out); @@ -861,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, if (metric > 0) tcp_disable_early_retrans(tp); + tp->rack.reord = 1; } /* This must be called before lost_out is incremented */ @@ -886,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) } } -static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, - struct sk_buff *skb) +void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); @@ -1028,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, return !before(start_seq, end_seq - tp->max_window); } -/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". - * Event "B". Later note: FACK people cheated me again 8), we have to account - * for reordering! Ugly, but should help. - * - * Search retransmitted skbs from write_queue that were sent when snd_nxt was - * less than what is now known to be received by the other end (derived from - * highest SACK block). Also calculate the lowest snd_nxt among the remaining - * retransmitted skbs to avoid some costly processing per ACKs. 
- */ -static void tcp_mark_lost_retrans(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int cnt = 0; - u32 new_low_seq = tp->snd_nxt; - u32 received_upto = tcp_highest_sack_seq(tp); - - if (!tcp_is_fack(tp) || !tp->retrans_out || - !after(received_upto, tp->lost_retrans_low) || - icsk->icsk_ca_state != TCP_CA_Recovery) - return; - - tcp_for_write_queue(skb, sk) { - u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; - - if (skb == tcp_send_head(sk)) - break; - if (cnt == tp->retrans_out) - break; - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) - continue; - - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) - continue; - - /* TODO: We would like to get rid of tcp_is_fack(tp) only - * constraint here (see above) but figuring out that at - * least tp->reordering SACK blocks reside between ack_seq - * and received_upto is not easy task to do cheaply with - * the available datastructures. - * - * Whether FACK should check here for tp->reordering segs - * in-between one could argue for either way (it would be - * rather simple to implement as we could count fack_count - * during the walk and do tp->fackets_out - fack_count). 
- */ - if (after(received_upto, ack_seq)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - - tcp_skb_mark_lost_uncond_verify(tp, skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); - } else { - if (before(ack_seq, new_low_seq)) - new_low_seq = ack_seq; - cnt += tcp_skb_pcount(skb); - } - } - - if (tp->retrans_out) - tp->lost_retrans_low = new_low_seq; -} - static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1130,7 +1086,12 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sacktag_state { int reord; int fack_count; - long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + /* Timestamps for earliest and latest never-retransmitted segment + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ + struct skb_mstamp first_sackt; + struct skb_mstamp last_sackt; int flag; }; @@ -1212,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { + tcp_rack_advance(tp, xmit_time, sacked); + if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, * we do not clear RETRANS, believing @@ -1233,14 +1196,9 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; - /* Pick the earliest sequence sacked for RTT */ - if (state->rtt_us < 0) { - struct skb_mstamp now; - - skb_mstamp_get(&now); - state->rtt_us = skb_mstamp_us_delta(&now, - xmit_time); - } + if (state->first_sackt.v64 == 0) + state->first_sackt = *xmit_time; + state->last_sackt = *xmit_time; } if (sacked & TCPCB_LOST) { @@ -1316,16 +1274,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * code can come after this skb later on it's better to keep * setting 
gso_size to something. */ - if (!skb_shinfo(prev)->gso_size) { - skb_shinfo(prev)->gso_size = mss; - skb_shinfo(prev)->gso_type = sk->sk_gso_type; - } + if (!TCP_SKB_CB(prev)->tcp_gso_size) + TCP_SKB_CB(prev)->tcp_gso_size = mss; /* CHECKME: To clear or not to clear? Mimics normal skb currently */ - if (tcp_skb_pcount(skb) <= 1) { - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - } + if (tcp_skb_pcount(skb) <= 1) + TCP_SKB_CB(skb)->tcp_gso_size = 0; /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1634,7 +1588,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una, long *sack_rtt_us) + u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1642,7 +1596,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; - struct tcp_sacktag_state state; struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; @@ -1650,9 +1603,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int i, j; int first_sack_index; - state.flag = 0; - state.reord = tp->packets_out; - state.rtt_us = -1L; + state->flag = 0; + state->reord = tp->packets_out; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1663,7 +1615,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) - state.flag |= FLAG_DSACKING_ACK; + state->flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * 
account more or less fresh ones, they can @@ -1728,7 +1680,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } skb = tcp_write_queue_head(sk); - state.fack_count = 0; + state->fack_count = 0; i = 0; if (!tp->sacked_out) { @@ -1762,10 +1714,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, /* Head todo? */ if (before(start_seq, cache->start_seq)) { - skb = tcp_sacktag_skip(skb, sk, &state, + skb = tcp_sacktag_skip(skb, sk, state, start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, - &state, + state, start_seq, cache->start_seq, dup_sack); @@ -1776,7 +1728,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, goto advance_sp; skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, - &state, + state, cache->end_seq); /* ...tail remains todo... */ @@ -1785,12 +1737,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; cache++; goto walk; } - skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq); + skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); /* Check overlap against next cached too (past this one already) */ cache++; continue; @@ -1800,12 +1752,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; } - skb = tcp_sacktag_skip(skb, sk, &state, start_seq); + skb = tcp_sacktag_skip(skb, sk, state, start_seq); walk: - skb = tcp_sacktag_walk(skb, sk, next_dup, &state, + skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, end_seq, dup_sack); advance_sp: @@ -1820,11 +1772,10 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state.reord < tp->fackets_out) && + if ((state->reord < tp->fackets_out) && ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || 
tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); + tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); - tcp_mark_lost_retrans(sk); tcp_verify_left_out(tp); out: @@ -1834,8 +1785,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - *sack_rtt_us = state.rtt_us; - return state.flag; + return state->flag; } /* Limits sacked_out so that sum with lost_out isn't ever larger than @@ -1924,14 +1874,13 @@ void tcp_enter_loss(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - bool new_recovery = false; + bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { - new_recovery = true; tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); @@ -2255,7 +2204,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) (oldcnt >= packets)) break; - mss = skb_shinfo(skb)->gso_size; + mss = tcp_skb_mss(skb); err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss, GFP_ATOMIC); if (err < 0) @@ -2303,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } +static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) +{ + return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + before(tp->rx_opt.rcv_tsecr, when); +} + +/* skb is spurious retransmitted if the returned timestamp echo + * reply is prior to the skb transmission time + */ +static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, + const struct sk_buff *skb) +{ + return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && + 
tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb)); +} + /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { return !tp->retrans_stamp || - (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp)); + tcp_tsopt_ecr_before(tp, tp->retrans_stamp); } /* Undo procedures. */ @@ -2482,15 +2446,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) return false; } -/* The cwnd reduction in CWR and Recovery use the PRR algorithm - * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/ +/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937. * It computes the number of packets to send (sndcnt) based on packets newly * delivered: * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. - * 2) If packets in flight is lower than ssthresh (such as due to excess - * losses and/or application stalls), do not perform any further cwnd - * reductions, but instead slow start up to ssthresh. + * 2) Otherwise PRR uses packet conservation to send as much as delivered. + * But when the retransmits are acked without further losses, PRR + * slow starts cwnd up to ssthresh to speed up the recovery. 
*/ static void tcp_init_cwnd_reduction(struct sock *sk) { @@ -2507,7 +2470,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) } static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, - int fast_rexmit) + int fast_rexmit, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2515,17 +2478,22 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, int newly_acked_sacked = prior_unsacked - (tp->packets_out - tp->sacked_out); + if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) + return; + tp->prr_delivered += newly_acked_sacked; - if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { + if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else { + } else if ((flag & FLAG_RETRANS_DATA_ACKED) && + !(flag & FLAG_LOST_RETRANS)) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); + } else { + sndcnt = min(delta, newly_acked_sacked); } - sndcnt = max(sndcnt, (fast_rexmit ? 
1 : 0)); tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } @@ -2555,6 +2523,7 @@ void tcp_enter_cwr(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_CWR); } } +EXPORT_SYMBOL(tcp_enter_cwr); static void tcp_try_keep_open(struct sock *sk) { @@ -2585,7 +2554,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); } else { - tcp_cwnd_reduction(sk, prior_unsacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); } } @@ -2595,6 +2564,7 @@ static void tcp_mtup_probe_failed(struct sock *sk) icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); } static void tcp_mtup_probe_success(struct sock *sk) @@ -2614,6 +2584,7 @@ static void tcp_mtup_probe_success(struct sock *sk) icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } /* Do a simple retransmit without using the backoff mechanisms in @@ -2682,7 +2653,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tp->prior_ssthresh = 0; tcp_init_undo(tp); - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (!tcp_in_cwnd_reduction(sk)) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); tcp_init_cwnd_reduction(sk); @@ -2742,7 +2713,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) /* Undo during fast recovery after partial ACK. */ static bool tcp_try_undo_partial(struct sock *sk, const int acked, - const int prior_unsacked) + const int prior_unsacked, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2758,7 +2729,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * mark more packets lost or retransmit more. 
*/ if (tp->retrans_out) { - tcp_cwnd_reduction(sk, prior_unsacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); return true; } @@ -2838,6 +2809,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } } + /* Use RACK to detect loss */ + if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && + tcp_rack_mark_lost(sk)) + flag |= FLAG_LOST_RETRANS; + /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: @@ -2845,7 +2821,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked, prior_unsacked)) + if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || @@ -2858,9 +2834,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); - if (icsk->icsk_ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open && + !(flag & FLAG_LOST_RETRANS)) return; - /* Fall through to processing in Open state. */ + /* Change state if cwnd is undone or retransmits are lost */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -2895,12 +2872,73 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); + tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag); tcp_xmit_retransmit_queue(sk); } +/* Kathleen Nichols' algorithm for tracking the minimum value of + * a data stream over some fixed time interval. (E.g., the minimum + * RTT over the past five minutes.) It uses constant space and constant + * time per update yet almost always delivers the same minimum as an + * implementation that has to keep all the data in the window. 
+ * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of the + * n'th best >= n-1'th best. It also makes sure that the three values + * are widely separated in the time window since that bounds the worse + * case error when that data is monotonically increasing over the window. + * + * Upon getting a new min, we can forget everything earlier because it + * has no value - the new min is <= everything else in the window by + * definition and it's the most recent. So we restart fresh on every new min + * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd + * best. + */ +static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) +{ + const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; + struct rtt_meas *m = tcp_sk(sk)->rtt_min; + struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; + u32 elapsed; + + /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ + if (unlikely(rttm.rtt <= m[0].rtt)) + m[0] = m[1] = m[2] = rttm; + else if (rttm.rtt <= m[1].rtt) + m[1] = m[2] = rttm; + else if (rttm.rtt <= m[2].rtt) + m[2] = rttm; + + elapsed = now - m[0].ts; + if (unlikely(elapsed > wlen)) { + /* Passed entire window without a new min so make 2nd choice + * the new min & 3rd choice the new 2nd. So forth and so on. + */ + m[0] = m[1]; + m[1] = m[2]; + m[2] = rttm; + if (now - m[0].ts > wlen) { + m[0] = m[1]; + m[1] = rttm; + if (now - m[0].ts > wlen) + m[0] = rttm; + } + } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) { + /* Passed a quarter of the window without a new min so + * take 2nd choice from the 2nd quarter of the window. + */ + m[2] = m[1] = rttm; + } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) { + /* Passed half the window without a new min so take the 3rd + * choice from the last half of the window. 
+ */ + m[2] = rttm; + } +} + static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - long seq_rtt_us, long sack_rtt_us) + long seq_rtt_us, long sack_rtt_us, + long ca_rtt_us) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2909,9 +2947,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * Karn's algorithm forbids taking RTT if some retransmitted data * is acked (RFC6298). */ - if (flag & FLAG_RETRANS_DATA_ACKED) - seq_rtt_us = -1L; - if (seq_rtt_us < 0) seq_rtt_us = sack_rtt_us; @@ -2923,11 +2958,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); - + seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp - + tp->rx_opt.rcv_tsecr); if (seq_rtt_us < 0) return false; + /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is + * always taken together with ACK, SACK, or TS-opts. Any negative + * values will be skipped with the seq_rtt_us < 0 check above. + */ + tcp_update_rtt_min(sk, ca_rtt_us); tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); @@ -2937,21 +2977,21 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. 
*/ -static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) +void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); - long seq_rtt_us = -1L; + long rtt_us = -1L; - if (synack_stamp && !tp->total_retrans) - seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); + if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) { + struct skb_mstamp now; - /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets - * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() - */ - if (!tp->srtt_us) - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); + skb_mstamp_get(&now); + rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); + } + + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); } + static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3055,7 +3095,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * arrived at the other end. 
*/ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, long sack_rtt_us) + u32 prior_snd_una, + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt, now; @@ -3063,8 +3104,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; - long ca_seq_rtt_us = -1L; + long sack_rtt_us = -1L; long seq_rtt_us = -1L; + long ca_rtt_us = -1L; struct sk_buff *skb; u32 pkts_acked = 0; bool rtt_update; @@ -3113,6 +3155,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= acked_pcount; + else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb)) + tcp_rack_advance(tp, &skb->skb_mstamp, sacked); if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3151,17 +3195,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_SACK_RENEGING; skb_mstamp_get(&now); - if (likely(first_ackt.v64)) { + if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + if (sack->first_sackt.v64) { + sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); } - rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, + ca_rtt_us); if (flag & FLAG_ACKED) { - const struct tcp_congestion_ops *ca_ops - = inet_csk(sk)->icsk_ca_ops; - tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { @@ -3184,11 +3230,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, 
tp->fackets_out); - if (ca_ops->pkts_acked) { - long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3198,6 +3239,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_rearm_rto(sk); } + if (icsk->icsk_ca_ops->pkts_acked) + icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); + #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); WARN_ON((int)tp->lost_out < 0); @@ -3238,7 +3282,7 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { - unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); @@ -3331,6 +3375,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); + if (tcp_send_head(sk)) + tcp_slow_start_after_idle_check(sk); + if (nwin > tp->max_window) { tp->max_window = nwin; tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); @@ -3466,6 +3513,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sacktag_state sack_state; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -3474,7 +3522,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - long sack_rtt_us = -1L; + + sack_state.first_sackt.v64 = 0; /* We very likely will need to access write queue head. 
*/ prefetchw(sk->sk_write_queue.next); @@ -3538,7 +3587,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; @@ -3563,13 +3612,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, - sack_rtt_us); + &sack_state); acked -= tp->packets_out; - /* Advance cwnd if state allows */ - if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked); - if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, prior_unsacked, @@ -3578,6 +3623,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); + /* Advance cwnd if state allows */ + if (tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, acked); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) @@ -3615,7 +3664,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -3951,7 +4000,6 @@ void tcp_reset(struct sock *sk) static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - const struct dst_entry *dst; inet_csk_schedule_ack(sk); @@ -3963,9 +4011,7 @@ static void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - dst = __sk_dst_get(sk); - if (!dst || !dst_metric(dst, RTAX_QUICKACK)) - inet_csk(sk)->icsk_ack.pingpong = 1; + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: @@ -4438,19 
+4484,34 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_buff *skb; + int err = -ENOMEM; + int data_len = 0; bool fragstolen; if (size == 0) return 0; - skb = alloc_skb(size, sk->sk_allocation); + if (size > PAGE_SIZE) { + int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS); + + data_len = npages << PAGE_SHIFT; + size = data_len + (size & ~PAGE_MASK); + } + skb = alloc_skb_with_frags(size - data_len, data_len, + PAGE_ALLOC_COSTLY_ORDER, + &err, sk->sk_allocation); if (!skb) goto err; + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - if (memcpy_from_msg(skb_put(skb, size), msg, size)) + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); + if (err) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; @@ -4466,7 +4527,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) err_free: kfree_skb(skb); err: - return -ENOMEM; + return err; + } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) @@ -4514,10 +4576,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: - if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - + if (eaten < 0) { + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); @@ -4788,7 +4852,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (sk_under_memory_pressure(sk)) + else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4832,7 +4896,7 
@@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we are under global TCP memory pressure, do not expand. */ - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) return false; /* If we are under soft global TCP memory pressure, do not expand. */ @@ -5451,7 +5515,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, } static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -5622,6 +5686,7 @@ discard: } tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->copied_seq = tp->rcv_nxt; tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -5677,15 +5742,14 @@ reset_and_undo: * address independent. */ -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; bool acceptable; - u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -5729,7 +5793,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5764,15 +5828,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (!acceptable) return 1; + if (!tp->srtt_us) + tcp_synack_rtt_meas(sk, req); + /* Once we leave TCP_SYN_RECV, we no longer need req * so release it. 
*/ if (req) { - synack_stamp = tcp_rsk(req)->snt_synack; tp->total_retrans = req->num_retrans; reqsk_fastopen_remove(sk, req, false); } else { - synack_stamp = tp->lsndtime; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); tcp_init_congestion_control(sk); @@ -5795,7 +5860,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_synack_rtt_meas(sk, synack_stamp); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -5982,14 +6046,17 @@ static void tcp_ecn_create_request(struct request_sock *req, const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; bool ect, ecn_ok; + u32 ecn_ok_dst; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); + ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); + ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk)) + if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + (ecn_ok_dst & DST_FEATURE_ECN_CA)) inet_rsk(req)->ecn_ok = 1; } @@ -5999,11 +6066,11 @@ static void tcp_openreq_init(struct request_sock *req, { struct inet_request_sock *ireq = inet_rsk(req); - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tcp_rsk(req)->snt_synack = tcp_time_stamp; + skb_mstamp_get(&tcp_rsk(req)->snt_synack); tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? 
rx_opt->rcv_tsval : 0; @@ -6019,9 +6086,11 @@ static void tcp_openreq_init(struct request_sock *req, } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk_listener) + struct sock *sk_listener, + bool attach_listener) { - struct request_sock *req = reqsk_alloc(ops, sk_listener); + struct request_sock *req = reqsk_alloc(ops, sk_listener, + attach_listener); if (req) { struct inet_request_sock *ireq = inet_rsk(req); @@ -6041,13 +6110,13 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(struct sock *sk, +static bool tcp_syn_flood_action(const struct sock *sk, const struct sk_buff *skb, const char *proto) { + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; bool want_cookie = false; - struct listen_sock *lopt; #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { @@ -6058,29 +6127,45 @@ static bool tcp_syn_flood_action(struct sock *sk, #endif NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { - lopt->synflood_warned = 1; + if (!queue->synflood_warned && + sysctl_tcp_syncookies != 2 && + xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. 
Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); - } + return want_cookie; } +static void tcp_reqsk_record_syn(const struct sock *sk, + struct request_sock *req, + const struct sk_buff *skb) +{ + if (tcp_sk(sk)->save_syn) { + u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); + u32 *copy; + + copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); + if (copy) { + copy[0] = len; + memcpy(&copy[1], skb_network_header(skb), len); + req->saved_syn = copy; + } + } +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { + struct tcp_fastopen_cookie foc = { .len = -1 }; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; - struct request_sock *req; struct tcp_sock *tp = tcp_sk(sk); + struct sock *fastopen_sk = NULL; struct dst_entry *dst = NULL; - __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; - bool want_cookie = false, fastopen; + struct request_sock *req; + bool want_cookie = false; struct flowi fl; - struct tcp_fastopen_cookie foc = { .len = -1 }; - int err; - /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is @@ -6104,7 +6189,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; } - req = inet_reqsk_alloc(rsk_ops, sk); + req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie); if (!req) goto drop; @@ -6187,19 +6272,30 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = af_ops->send_synack(sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - + if (!want_cookie) { + tcp_reqsk_record_syn(sk, req, skb); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); + } + if (fastopen_sk) { + af_ops->send_synack(fastopen_sk, dst, 
&fl, req, + &foc, false); + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, fastopen_sk); + sk->sk_data_ready(sk); + bh_unlock_sock(fastopen_sk); + sock_put(fastopen_sk); + } else { tcp_rsk(req)->tfo_listener = false; - af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + if (!want_cookie) + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + af_ops->send_synack(sk, dst, &fl, req, + &foc, !want_cookie); + if (want_cookie) + goto drop_and_free; } - + reqsk_put(req); return 0; drop_and_release: diff --git a/kernel/net/ipv4/tcp_ipv4.c b/kernel/net/ipv4/tcp_ipv4.c index 441ca6f38..8c7e63163 100644 --- a/kernel/net/ipv4/tcp_ipv4.c +++ b/kernel/net/ipv4/tcp_ipv4.c @@ -222,7 +222,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; - inet_set_txhash(sk); + sk_set_txhash(sk); rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); @@ -312,7 +312,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ -void tcp_req_err(struct sock *sk, u32 seq) +void tcp_req_err(struct sock *sk, u32 seq, bool abort) { struct request_sock *req = inet_reqsk(sk); struct net *net = sock_net(sk); @@ -324,17 +324,17 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); - } else { + } else if (abort) { /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly * created socket, and POSIX does not want network * errors returned from accept(). 
*/ - NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); inet_csk_reqsk_queue_drop(req->rsk_listener, req); + NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); } + reqsk_put(req); } EXPORT_SYMBOL(tcp_req_err); @@ -384,7 +384,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } seq = ntohl(th->seq); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, + type == ICMP_PARAMETERPROB || + type == ICMP_TIME_EXCEEDED || + (type == ICMP_DEST_UNREACH && + (code == ICMP_NET_UNREACH || + code == ICMP_HOST_UNREACH))); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -576,7 +581,7 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -705,7 +710,8 @@ release_sk1: outside socket context is ugly, certainly. What can I do? 
*/ -static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, +static void tcp_v4_send_ack(struct net *net, + struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int reply_flags, u8 tos) @@ -720,7 +726,6 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb_dst(skb)->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -782,7 +787,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v4_send_ack(sock_net(sk), skb, + tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, @@ -795,15 +801,17 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) inet_twsk_put(tw); } -static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? - tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : + tcp_sk(sk)->snd_nxt; + + tcp_v4_send_ack(sock_net(sk), skb, seq, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, 0, @@ -818,11 +826,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. 
*/ -static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, +static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -833,12 +841,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); - skb_set_queue_mapping(skb, queue_mapping); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->opt); @@ -865,7 +872,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) */ /* Find the Key structure for an address. */ -struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, const union tcp_md5_addr *addr, int family) { @@ -877,7 +884,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, sock_owned_by_user(sk) || - lockdep_is_held(&sk->sk_lock.slock)); + lockdep_is_held((spinlock_t *)&sk->sk_lock.slock)); if (!md5sig) return NULL; #if IS_ENABLED(CONFIG_IPV6) @@ -894,7 +901,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, } EXPORT_SYMBOL(tcp_md5_do_lookup); -struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; @@ -922,7 +929,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, } md5sig = rcu_dereference_protected(tp->md5sig_info, - sock_owned_by_user(sk)); + 
sock_owned_by_user(sk) || + lockdep_is_held(&sk->sk_lock.slock)); if (!md5sig) { md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) @@ -1112,10 +1120,13 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +#endif + /* Called with rcu_read_lock() */ -static bool tcp_v4_inbound_md5_hash(struct sock *sk, +static bool tcp_v4_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG /* * This gets called for each TCP segment that arrives * so we want to be efficient. @@ -1165,10 +1176,12 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, return true; } return false; -} #endif + return false; +} -static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, +static void tcp_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); @@ -1179,7 +1192,8 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, ireq->opt = tcp_v4_save_options(skb); } -static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, +static struct dst_entry *tcp_v4_route_req(const struct sock *sk, + struct flowi *fl, const struct request_sock *req, bool *strict) { @@ -1218,7 +1232,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_sequence, .send_synack = tcp_v4_send_synack, - .queue_hash_add = inet_csk_reqsk_queue_hash_add, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1241,9 +1254,11 @@ EXPORT_SYMBOL(tcp_v4_conn_request); * The three way handshake has completed - we got a valid synack - * now create the new socket. 
*/ -struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq; struct inet_sock *newinet; @@ -1277,7 +1292,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; - inet_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; @@ -1320,7 +1334,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - __inet_hash_nolisten(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + if (*own_req) + tcp_move_syn(newtp, req); return newsk; @@ -1338,34 +1354,11 @@ put_and_exit: } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct request_sock *req; - struct sock *nsk; - req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - - nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v4_check(sk, skb); #endif @@ -1373,7 +1366,7 @@ static struct sock *tcp_v4_hnd_req(struct 
sock *sk, struct sk_buff *skb) } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1400,17 +1393,17 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_cookie_check(sk, skb); + if (!nsk) goto discard; - if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1420,7 +1413,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { + if (tcp_rcv_state_process(sk, skb)) { rsk = sk; goto reset; } @@ -1508,7 +1501,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) if (likely(sk->sk_rx_dst)) skb_dst_drop(skb); else - skb_dst_force(skb); + skb_dst_force_safe(skb); __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; @@ -1590,6 +1583,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; +lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; @@ -1598,6 +1592,35 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk; + + sk = req->rsk_listener; + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + reqsk_put(req); + goto discard_it; + } + if (unlikely(sk->sk_state != TCP_LISTEN)) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sock_hold(sk); + nsk = 
tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_and_relse; + } + if (nsk == sk) { + reqsk_put(req); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v4_send_reset(nsk, skb); + goto discard_and_relse; + } else { + sock_put(sk); + return 0; + } + } if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1606,26 +1629,25 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; -#ifdef CONFIG_TCP_MD5SIG - /* - * We really want to reject the packet as early as possible - * if: - * o We're expecting an MD5'd packet and this is no MD5 tcp option - * o There is an MD5 option and we're not expecting one - */ if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif nf_reset(skb); if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v4_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); + tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) @@ -1638,6 +1660,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret; @@ -1646,7 +1669,7 @@ no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; - if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { + if (tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: @@ -1670,10 +1693,6 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff << 2)) { - inet_twsk_put(inet_twsk(sk)); - goto bad_packet; - } if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; @@ -1686,8 +1705,7 @@ do_time_wait: iph->daddr, th->dest, inet_iif(skb)); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + 
inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; goto process; } @@ -1713,8 +1731,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { - dst_hold(dst); + if (dst && dst_hold_safe(dst)) { sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } @@ -1802,6 +1819,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); sock_release_memcg(sk); @@ -1836,35 +1854,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - if (st->state == TCP_SEQ_STATE_OPENREQ) { - struct request_sock *req = cur; - - icsk = inet_csk(st->syn_wait_sk); - req = req->dl_next; - while (1) { - while (req) { - if (req->rsk_ops->family == st->family) { - cur = req; - goto out; - } - req = req->dl_next; - } - if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) - break; -get_req: - req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; - } - sk = sk_nulls_next(st->syn_wait_sk); - st->state = TCP_SEQ_STATE_LISTENING; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } else { - icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) - goto start_req; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - sk = sk_nulls_next(sk); - } + sk = sk_nulls_next(sk); get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) @@ -1874,16 +1864,6 @@ get_sk: goto out; } icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) { -start_req: - st->uid = sock_i_uid(sk); - st->syn_wait_sk = sk; - st->state = TCP_SEQ_STATE_OPENREQ; - st->sbucket = 0; - goto get_req; - } - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ 
-2015,7 +1995,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq) void *rc = NULL; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: if (st->bucket >= INET_LHTABLE_SIZE) break; @@ -2074,7 +2053,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { @@ -2099,11 +2077,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) struct tcp_iter_state *st = seq->private; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: - if (v) { - struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); @@ -2157,7 +2130,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct request_sock *req, - struct seq_file *f, int i, kuid_t uid) + struct seq_file *f, int i) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->rsk_timer.expires - jiffies; @@ -2174,7 +2147,8 @@ static void get_openreq4(const struct request_sock *req, 1, /* timers active (only the expire timer) */ jiffies_delta_to_clock_t(delta), req->num_timeout, - from_kuid_munged(seq_user_ns(f), uid), + from_kuid_munged(seq_user_ns(f), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, @@ -2188,12 +2162,13 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); - struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; + const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; __be32 dest = inet->inet_daddr; 
__be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); int rx_queue; + int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || @@ -2211,17 +2186,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_expires = jiffies; } - if (sk->sk_state == TCP_LISTEN) + state = sk_state_load(sk); + if (state == TCP_LISTEN) rx_queue = sk->sk_ack_backlog; else - /* - * because we dont lock socket, we might find a transient negative value + /* Because we don't lock the socket, + * we might find a transient negative value. */ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", - i, src, srcp, dest, destp, sk->sk_state, + i, src, srcp, dest, destp, state, tp->write_seq - tp->snd_una, rx_queue, timer_active, @@ -2235,8 +2211,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - sk->sk_state == TCP_LISTEN ? - (fastopenq ? fastopenq->max_qlen : 0) : + state == TCP_LISTEN ? + fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); } @@ -2275,18 +2251,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait4_sock(v, seq, st->num); - else - get_tcp4_sock(v, seq, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: - get_openreq4(v, seq, st->num, st->uid); - break; - } + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait4_sock(v, seq, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + get_openreq4(v, seq, st->num); + else + get_tcp4_sock(v, seq, st->num); out: seq_pad(seq, '\n'); return 0; @@ -2410,12 +2380,15 @@ static int __net_init tcp_sk_init(struct net *net) goto fail; *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } + net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn_fallback = 1; + net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; - return 0; + return 0; fail: tcp_sk_exit(net); diff --git a/kernel/net/ipv4/tcp_metrics.c b/kernel/net/ipv4/tcp_metrics.c index a51d63a43..c8cbc2b4b 100644 --- a/kernel/net/ipv4/tcp_metrics.c +++ b/kernel/net/ipv4/tcp_metrics.c @@ -81,11 +81,7 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - if (a->family != b->family) - return false; - if (a->family == AF_INET) - return a->addr.a4 == b->addr.a4; - return ipv6_addr_equal(&a->addr.in6, &b->addr.in6); + return inetpeer_addr_cmp(a, b) == 0; } struct tcpm_hash_bucket { @@ -247,14 +243,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, daddr.family = req->rsk_ops->family; switch (daddr.family) { case AF_INET: - saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; - daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, 
inet_rsk(req)->ir_loc_addr); + inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr); + hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr; - daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr; + inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr); + inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr); hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif @@ -285,25 +281,19 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock struct net *net; if (tw->tw_family == AF_INET) { - saddr.family = AF_INET; - saddr.addr.a4 = tw->tw_rcv_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); + inetpeer_set_addr_v4(&daddr, tw->tw_daddr); + hash = ipv4_addr_hash(tw->tw_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (tw->tw_family == AF_INET6) { if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { - saddr.family = AF_INET; - saddr.addr.a4 = tw->tw_rcv_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); + inetpeer_set_addr_v4(&daddr, tw->tw_daddr); + hash = ipv4_addr_hash(tw->tw_daddr); } else { - saddr.family = AF_INET6; - saddr.addr.in6 = tw->tw_v6_rcv_saddr; - daddr.family = AF_INET6; - daddr.addr.in6 = tw->tw_v6_daddr; + inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr); + inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr); hash = ipv6_addr_hash(&tw->tw_v6_daddr); } } @@ -335,25 +325,19 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct net *net; if (sk->sk_family == AF_INET) { - saddr.family = AF_INET; - saddr.addr.a4 = inet_sk(sk)->inet_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, 
inet_sk(sk)->inet_saddr); + inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); + hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { - saddr.family = AF_INET; - saddr.addr.a4 = inet_sk(sk)->inet_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); + inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); + hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } else { - saddr.family = AF_INET6; - saddr.addr.in6 = sk->sk_v6_rcv_saddr; - daddr.family = AF_INET6; - daddr.addr.in6 = sk->sk_v6_daddr; + inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr); + inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr); hash = ipv6_addr_hash(&sk->sk_v6_daddr); } } @@ -461,7 +445,7 @@ void tcp_update_metrics(struct sock *sk) tcp_metric_set(tm, TCP_METRIC_CWND, tp->snd_cwnd); } - } else if (tp->snd_cwnd > tp->snd_ssthresh && + } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. 
*/ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) @@ -796,18 +780,18 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4, - tm->tcpm_daddr.addr.a4) < 0) + inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4, - tm->tcpm_saddr.addr.a4) < 0) + inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6, - &tm->tcpm_daddr.addr.in6) < 0) + inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6, - &tm->tcpm_saddr.addr.in6) < 0) + inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; default: @@ -956,20 +940,21 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, a = info->attrs[v4]; if (a) { - addr->family = AF_INET; - addr->addr.a4 = nla_get_in_addr(a); + inetpeer_set_addr_v4(addr, nla_get_in_addr(a)); if (hash) - *hash = (__force unsigned int) addr->addr.a4; + *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr)); return 0; } a = info->attrs[v6]; if (a) { + struct in6_addr in6; + if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; - addr->family = AF_INET6; - addr->addr.in6 = nla_get_in6_addr(a); + in6 = nla_get_in6_addr(a); + inetpeer_set_addr_v6(addr, &in6); if (hash) - *hash = ipv6_addr_hash(&addr->addr.in6); + *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr)); return 0; } return optional ? 
1 : -EAFNOSUPPORT; diff --git a/kernel/net/ipv4/tcp_minisocks.c b/kernel/net/ipv4/tcp_minisocks.c index 17e7339ee..ac6b1961f 100644 --- a/kernel/net/ipv4/tcp_minisocks.c +++ b/kernel/net/ipv4/tcp_minisocks.c @@ -147,8 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); return TCP_TW_RST; } @@ -163,9 +162,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, tw->tw_timeout); + inet_twsk_reschedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -198,12 +197,11 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -253,7 +251,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -324,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } while (0); #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; @@ -340,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } inet_twsk_schedule(tw, timeo); + /* Linkage updates. 
*/ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -364,30 +361,38 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +/* Warning : This function is called without sk_listener being locked. + * Be sure to read socket fields once, as their value could change under us. + */ void tcp_openreq_init_rwin(struct request_sock *req, - struct sock *sk, struct dst_entry *dst) + const struct sock *sk_listener, + const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_sock *tp = tcp_sk(sk); - __u8 rcv_wscale; + const struct tcp_sock *tp = tcp_sk(sk_listener); + u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); + int full_space = tcp_full_space(sk_listener); int mss = dst_metric_advmss(dst); + u32 window_clamp; + __u8 rcv_wscale; - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) - mss = tp->rx_opt.user_mss; + if (user_mss && user_mss < mss) + mss = user_mss; + window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ - if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) - req->window_clamp = tcp_full_space(sk); + if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? 
TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); @@ -436,7 +441,9 @@ EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ -struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) +struct sock *tcp_create_openreq_child(const struct sock *sk, + struct request_sock *req, + struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); @@ -451,6 +458,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 0; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; @@ -462,6 +470,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + newtp->rtt_min[0].rtt = ~0U; newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; @@ -471,7 +480,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; - newtp->lsndtime = treq->snt_synack; + newtp->lsndtime = treq->snt_synack.stamp_jiffies; + newsk->sk_txhash = treq->txhash; newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; @@ -503,9 +513,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (sysctl_tcp_fack) tcp_enable_fack(newtp); } - newtp->window_clamp = req->window_clamp; - newtp->rcv_ssthresh = req->rcv_wnd; - newtp->rcv_wnd = req->rcv_wnd; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; 
newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; @@ -538,6 +548,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_ecn_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; + newtp->rack.mstamp.v64 = 0; + newtp->rack.advanced = 0; TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } @@ -565,8 +577,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; - - BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); + bool own_req; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { @@ -697,7 +708,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); @@ -754,16 +765,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); if (!child) goto listen_overflow; - inet_csk_reqsk_queue_drop(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); - /* Warning: caller must not call reqsk_put(req); - * child stole last reference on it. 
- */ - return child; + sock_rps_save_rxhash(child, skb); + tcp_synack_rtt_meas(child, req); + return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: if (!sysctl_tcp_abort_on_overflow) { @@ -810,8 +819,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int state = child->sk_state; if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), - skb->len); + ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); diff --git a/kernel/net/ipv4/tcp_offload.c b/kernel/net/ipv4/tcp_offload.c index 3f7c2fca5..9864a2dba 100644 --- a/kernel/net/ipv4/tcp_offload.c +++ b/kernel/net/ipv4/tcp_offload.c @@ -77,7 +77,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, oldlen = (u16)~skb->len; __skb_pull(skb, thlen); - mss = tcp_skb_mss(skb); + mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; @@ -242,7 +242,7 @@ found: flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); - mss = tcp_skb_mss(p); + mss = skb_shinfo(p)->gso_size; flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); diff --git a/kernel/net/ipv4/tcp_output.c b/kernel/net/ipv4/tcp_output.c index 986440b24..9bfc39ff2 100644 --- a/kernel/net/ipv4/tcp_output.c +++ b/kernel/net/ipv4/tcp_output.c @@ -50,8 +50,8 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; -/* Default TSQ limit of two TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 131072; +/* Default TSQ limit of four TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 262144; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames @@ -137,12 +137,12 @@ static __u16 tcp_advertise_mss(struct sock *sk) } /* RFC2861. 
Reset CWND after idle period longer RTO to "restart window". - * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) + * This is the first part of cwnd validation mechanism. + */ +void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); - s32 delta = tcp_time_stamp - tp->lsndtime; - u32 restart_cwnd = tcp_init_cwnd(tp, dst); + u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tp->snd_cwnd; tcp_ca_event(sk, CA_EVENT_CWND_RESTART); @@ -163,20 +163,17 @@ static void tcp_event_data_sent(struct tcp_sock *tp, { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - const struct dst_entry *dst = __sk_dst_get(sk); - if (sysctl_tcp_slow_start_after_idle && - (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) - tcp_cwnd_restart(sk, __sk_dst_get(sk)); + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && - (!dst || !dst_metric(dst, RTAX_QUICKACK))) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ @@ -350,15 +347,20 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) } } +static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback) + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. 
+ */ + TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); +} + static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, - struct sock *sk) +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { - if (inet_rsk(req)->ecn_ok) { + if (inet_rsk(req)->ecn_ok) th->ece = 1; - if (tcp_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); - } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to @@ -393,8 +395,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; @@ -402,8 +402,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->sacked = 0; tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; - shinfo->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -610,12 +608,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct sock *sk, - struct request_sock *req, - unsigned int mss, struct sk_buff *skb, - struct tcp_out_options *opts, - const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) +static unsigned int tcp_synack_options(struct request_sock *req, + unsigned int mss, struct sk_buff *skb, + struct tcp_out_options *opts, + const struct tcp_md5sig_key *md5, + struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -941,9 +938,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - /* if no packet is in qdisc/device queue, then allow XPS to select * another queue. 
We can be called from tcp_tsq_handler() * which holds one reference to sk_wmem_alloc. @@ -994,6 +988,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } tcp_options_write((__be32 *)(th + 1), tp, &opts); + skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) tcp_ecn_send(sk, skb, tcp_header_size); @@ -1018,8 +1013,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + tp->segs_out += tcp_skb_pcount(skb); + /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; @@ -1056,25 +1053,17 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - - /* Make sure we own this skb before messing gso_size/gso_segs */ - WARN_ON_ONCE(skb_cloned(skb)); - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. */ tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; - shinfo->gso_type = 0; + TCP_SKB_CB(skb)->tcp_gso_size = 0; } else { tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - shinfo->gso_size = mss_now; - shinfo->gso_type = sk->sk_gso_type; + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; } } @@ -1163,7 +1152,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, return -ENOMEM; /* Get a new skb... force flag on. 
*/ - buff = sk_stream_alloc_skb(sk, nsize, gfp); + buff = sk_stream_alloc_skb(sk, nsize, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ @@ -1206,8 +1195,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* If this packet has been sent out already, we must * adjust the various packet counters. @@ -1287,7 +1276,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); + tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } @@ -1619,13 +1608,12 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, * This must be invoked the first time we consider transmitting * SKB onto the wire. 
*/ -static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); tso_segs = tcp_skb_pcount(skb); } return tso_segs; @@ -1680,7 +1668,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, const struct tcp_sock *tp = tcp_sk(sk); unsigned int cwnd_quota; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) return 0; @@ -1722,7 +1710,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now, gfp); - buff = sk_stream_alloc_skb(sk, 0, gfp); + buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) return -ENOMEM; @@ -1749,8 +1737,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); @@ -1777,7 +1765,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto send_now; - if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR))) + if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer @@ -1834,7 +1822,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, /* Ok, it looks like it is advisable to defer. 
*/ - if (cong_win < send_win && cong_win < skb->len) + if (cong_win < send_win && cong_win <= skb->len) *is_cwnd_limited = true; return true; @@ -1941,7 +1929,7 @@ static int tcp_mtu_probe(struct sock *sk) } /* We're allowed to probe. Build it now. */ - nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC); + nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; sk->sk_wmem_queued += nskb->truesize; @@ -1984,7 +1972,7 @@ static int tcp_mtu_probe(struct sock *sk) skb->len, 0); } else { __pskb_trim_head(skb, copy); - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); } TCP_SKB_CB(skb)->seq += copy; } @@ -1994,7 +1982,7 @@ static int tcp_mtu_probe(struct sock *sk) if (len >= probe_size) break; } - tcp_init_tso_segs(sk, nskb, nskb->len); + tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). @@ -2056,7 +2044,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); + tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { @@ -2067,7 +2055,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { - is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -2078,7 +2065,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; - if (tso_segs == 1 || !max_segs) { + if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? 
nonagle : TCP_NAGLE_PUSH)))) @@ -2091,7 +2078,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } limit = mss_now; - if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp)) + if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, @@ -2149,10 +2136,11 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); + return !tp->packets_out && tcp_send_head(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2172,7 +2160,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (sk->sk_state == TCP_SYN_RECV) + if (tp->fastopen_rsk) return false; /* TLP is only scheduled when next timer event is RTO. */ @@ -2182,7 +2170,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; @@ -2191,9 +2179,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) return false; /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account - * for delayed ack when there's one outstanding packet. + * for delayed ack when there's one outstanding packet. If no RTT + * sample is available then probe after TCP_TIMEOUT_INIT. */ - timeout = rtt << 1; + timeout = rtt << 1 ? 
: TCP_TIMEOUT_INIT; if (tp->packets_out == 1) timeout = max_t(u32, timeout, (rtt + (rtt >> 1) + TCP_DELACK_MAX)); @@ -2229,7 +2218,7 @@ static bool skb_still_in_host_queue(const struct sock *sk, return false; } -/* When probe timeout (PTO) fires, send a new segment if one exists, else +/* When probe timeout (PTO) fires, try send a new segment if possible, else * retransmit the last segment. */ void tcp_send_loss_probe(struct sock *sk) @@ -2238,11 +2227,19 @@ void tcp_send_loss_probe(struct sock *sk) struct sk_buff *skb; int pcount; int mss = tcp_current_mss(sk); - int err = -1; - if (tcp_send_head(sk)) { - err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - goto rearm_timer; + skb = tcp_send_head(sk); + if (skb) { + if (tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; + } + skb = tcp_write_queue_prev(sk, skb); + } else { + skb = tcp_write_queue_tail(sk); } /* At most one outstanding TLP retransmission. */ @@ -2250,7 +2247,6 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; /* Retransmit last segment. */ - skb = tcp_write_queue_tail(sk); if (WARN_ON(!skb)) goto rearm_timer; @@ -2265,26 +2261,24 @@ void tcp_send_loss_probe(struct sock *sk) if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_tail(sk); + skb = tcp_write_queue_next(sk, skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; - err = __tcp_retransmit_skb(sk, skb); + if (__tcp_retransmit_skb(sk, skb)) + goto rearm_timer; /* Record snd_nxt for loss detection. */ - if (likely(!err)) - tp->tlp_high_seq = tp->snd_nxt; + tp->tlp_high_seq = tp->snd_nxt; +probe_sent: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); + /* Reset s.t. 
tcp_rearm_rto will restart timer from now */ + inet_csk(sk)->icsk_pending = 0; rearm_timer: - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - - if (likely(!err)) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPLOSSPROBES); + tcp_rearm_rto(sk); } /* Push out any pending frames which were held back due to @@ -2392,7 +2386,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -2610,11 +2604,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (unlikely(oldpcount > 1)) { if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } } + /* RFC3168, section 6.1.1.1. ECN fallback */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + tcp_retrans_try_collapse(sk, skb, cur_mss); /* Make a copy, if the first transmission SKB clone we made @@ -2657,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) net_dbg_ratelimited("retrans_out leaked\n"); } #endif - if (!tp->retrans_out) - tp->lost_retrans_low = tp->snd_nxt; TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); @@ -2666,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, - * see tcp_input.c tcp_sacktag_write_queue(). - */ - TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } @@ -2816,8 +2808,10 @@ begin_fwd: * connection tear down and (memory) recovery. 
* Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. + * In general, we want to allow one skb per socket to avoid hangs + * with edge trigger epoll() */ -static void sk_forced_wmem_schedule(struct sock *sk, int size) +void sk_forced_mem_schedule(struct sock *sk, int size) { int amt, status; @@ -2841,7 +2835,7 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) { + if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; @@ -2864,7 +2858,7 @@ coalesce: return; } skb_reserve(skb, MAX_TCP_HEADER); - sk_forced_wmem_schedule(sk, skb->truesize); + sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); @@ -2945,20 +2939,22 @@ int tcp_send_synack(struct sock *sk) * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. 
*/ -struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, +struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { - struct tcp_out_options opts; struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th; - struct sk_buff *skb; + const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *md5 = NULL; + struct tcp_out_options opts; + struct sk_buff *skb; int tcp_header_size; + struct tcphdr *th; + u16 user_mss; int mss; - skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2966,11 +2962,21 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); + if (attach_req) { + skb_set_owner_w(skb, req_to_sk(req)); + } else { + /* sk is a const pointer, because we want to express multiple + * cpu might call us concurrently. + * sk->sk_wmem_alloc in an atomic, we can promote to rw. 
+ */ + skb_set_owner_w(skb, (struct sock *)sk); + } skb_dst_set(skb, dst); mss = dst_metric_advmss(dst); - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) - mss = tp->rx_opt.user_mss; + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && user_mss < mss) + mss = user_mss; memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES @@ -2984,8 +2990,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, rcu_read_lock(); md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif - tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc) + sizeof(*th); + skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2994,7 +3001,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th, sk); + tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is @@ -3008,8 +3015,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. 
*/ - th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts); + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); + tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); @@ -3143,7 +3150,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, err = 0, copied; + int syn_loss = 0, space, err = 0; unsigned long last_syn_loss = 0; struct sk_buff *syn_data; @@ -3176,22 +3183,23 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false); if (!syn_data) goto fallback; syn_data->ip_summed = CHECKSUM_PARTIAL; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); - copied = copy_from_iter(skb_put(syn_data, space), space, - &fo->data->msg_iter); - if (unlikely(!copied)) { - kfree_skb(syn_data); - goto fallback; - } - if (copied != space) { - skb_trim(syn_data, copied); - space = copied; + if (space) { + int copied = copy_from_iter(skb_put(syn_data, space), space, + &fo->data->msg_iter); + if (unlikely(!copied)) { + kfree_skb(syn_data); + goto fallback; + } + if (copied != space) { + skb_trim(syn_data, copied); + space = copied; + } } - /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) fo->data = NULL; @@ -3242,7 +3250,7 @@ int tcp_connect(struct sock *sk) return 0; } - buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; @@ -3383,7 +3391,7 @@ EXPORT_SYMBOL_GPL(tcp_send_ack); * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 
to probe window. */ -static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3401,6 +3409,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); + NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3408,12 +3417,12 @@ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; - tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. */ -int tcp_write_wakeup(struct sock *sk) +int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3441,7 +3450,7 @@ int tcp_write_wakeup(struct sock *sk) if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) - tcp_set_skb_tso_segs(sk, skb, mss); + tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); @@ -3450,8 +3459,8 @@ int tcp_write_wakeup(struct sock *sk) return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) - tcp_xmit_probe_skb(sk, 1); - return tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 1, mib); + return tcp_xmit_probe_skb(sk, 0, mib); } } @@ -3465,7 +3474,7 @@ void tcp_send_probe0(struct sock *sk) unsigned long probe_max; int err; - err = tcp_write_wakeup(sk); + err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || !tcp_send_head(sk)) { /* Cancel probe timer, if it is not required. 
*/ @@ -3491,17 +3500,18 @@ void tcp_send_probe0(struct sock *sk) probe_max = TCP_RESOURCE_PROBE_INTERVAL; } inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - inet_csk_rto_backoff(icsk, probe_max), + tcp_probe0_when(sk, probe_max), TCP_RTO_MAX); } -int tcp_rtx_synack(struct sock *sk, struct request_sock *req) +int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + tcp_rsk(req)->txhash = net_tx_rndhash(); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/kernel/net/ipv4/tcp_recovery.c b/kernel/net/ipv4/tcp_recovery.c new file mode 100644 index 000000000..5353085fd --- /dev/null +++ b/kernel/net/ipv4/tcp_recovery.c @@ -0,0 +1,109 @@ +#include +#include + +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; + +/* Marks a packet lost, if some packet sent later has been (s)acked. + * The underlying idea is similar to the traditional dupthresh and FACK + * but they look at different metrics: + * + * dupthresh: 3 OOO packets delivered (packet count) + * FACK: sequence delta to highest sacked sequence (sequence space) + * RACK: sent time delta to the latest delivered packet (time domain) + * + * The advantage of RACK is it applies to both original and retransmitted + * packet and therefore is robust against tail losses. Another advantage + * is being more resilient to reordering by simply allowing some + * "settling delay", instead of tweaking the dupthresh. + * + * The current version is only used after recovery starts but can be + * easily extended to detect the first loss. 
+ */ +int tcp_rack_mark_lost(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + u32 reo_wnd, prior_retrans = tp->retrans_out; + + if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) + return 0; + + /* Reset the advanced flag to avoid unnecessary queue scanning */ + tp->rack.advanced = 0; + + /* To be more reordering resilient, allow min_rtt/4 settling delay + * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed + * RTT because reordering is often a path property and less related + * to queuing or delayed ACKs. + * + * TODO: measure and adapt to the observed reordering delay, and + * use a timer to retransmit like the delayed early retransmit. + */ + reo_wnd = 1000; + if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + + tcp_for_write_queue(skb, sk) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + + if (skb == tcp_send_head(sk)) + break; + + /* Skip ones already (s)acked */ + if (!after(scb->end_seq, tp->snd_una) || + scb->sacked & TCPCB_SACKED_ACKED) + continue; + + if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { + + if (skb_mstamp_us_delta(&tp->rack.mstamp, + &skb->skb_mstamp) <= reo_wnd) + continue; + + /* skb is lost if packet sent later is sacked */ + tcp_skb_mark_lost_uncond_verify(tp, skb); + if (scb->sacked & TCPCB_SACKED_RETRANS) { + scb->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSTRETRANSMIT); + } + } else if (!(scb->sacked & TCPCB_RETRANS)) { + /* Original data are sent sequentially so stop early + * b/c the rest are all sent after rack_sent + */ + break; + } + } + return prior_retrans - tp->retrans_out; +} + +/* Record the most recently (re)sent time among the (s)acked packets */ +void tcp_rack_advance(struct tcp_sock *tp, + const struct skb_mstamp *xmit_time, u8 sacked) +{ + if (tp->rack.mstamp.v64 && + !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) + 
return; + + if (sacked & TCPCB_RETRANS) { + struct skb_mstamp now; + + /* If the sacked packet was retransmitted, it's ambiguous + * whether the retransmission or the original (or the prior + * retransmission) was sacked. + * + * If the original is lost, there is no ambiguity. Otherwise + * we assume the original can be delayed up to aRTT + min_rtt. + * the aRTT term is bounded by the fast recovery or timeout, + * so it's at least one RTT (i.e., retransmission is at least + * an RTT later). + */ + skb_mstamp_get(&now); + if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp)) + return; + } + + tp->rack.mstamp = *xmit_time; + tp->rack.advanced = 1; +} diff --git a/kernel/net/ipv4/tcp_scalable.c b/kernel/net/ipv4/tcp_scalable.c index 333bcb241..bf5ea9e9b 100644 --- a/kernel/net/ipv4/tcp_scalable.c +++ b/kernel/net/ipv4/tcp_scalable.c @@ -22,7 +22,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), diff --git a/kernel/net/ipv4/tcp_timer.c b/kernel/net/ipv4/tcp_timer.c index 8c65dc147..193ba1fa8 100644 --- a/kernel/net/ipv4/tcp_timer.c +++ b/kernel/net/ipv4/tcp_timer.c @@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) } /* Calculate maximal number or retries on an orphaned socket. */ -static int tcp_orphan_retries(struct sock *sk, int alive) +static int tcp_orphan_retries(struct sock *sk, bool alive) { int retries = sysctl_tcp_orphan_retries; /* May be zero. 
*/ @@ -168,7 +168,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (tp->syn_data) + if (tp->syn_data && icsk->icsk_retransmits == 1) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } @@ -176,6 +176,18 @@ static int tcp_write_timeout(struct sock *sk) syn_set = true; } else { if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { + /* Some middle-boxes may black-hole Fast Open _after_ + * the handshake. Therefore we conservatively disable + * Fast Open on this path on recurring timeouts with + * few or zero bytes acked after Fast Open. + */ + if (tp->syn_data_acked && + tp->bytes_acked <= tp->rx_opt.mss_clamp) { + tcp_fastopen_cache_set(sk, 0, NULL, true, 0); + if (icsk->icsk_retransmits == sysctl_tcp_retries1) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -184,7 +196,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = icsk->icsk_rto < TCP_RTO_MAX; + const bool alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -247,7 +259,7 @@ void tcp_delack_timer_handler(struct sock *sk) } out: - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) sk_mem_reclaim(sk); } @@ -298,7 +310,7 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) @@ -616,7 +628,7 @@ static void tcp_keepalive_timer (unsigned long data) tcp_write_err(sk); goto out; } - if (tcp_write_wakeup(sk) <= 0) { + if (tcp_write_wakeup(sk, 
LINUX_MIB_TCPKEEPALIVE) <= 0) { icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { @@ -649,4 +661,3 @@ void tcp_init_xmit_timers(struct sock *sk) inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); } -EXPORT_SYMBOL(tcp_init_xmit_timers); diff --git a/kernel/net/ipv4/tcp_vegas.c b/kernel/net/ipv4/tcp_vegas.c index a6cea1d5e..13951c408 100644 --- a/kernel/net/ipv4/tcp_vegas.c +++ b/kernel/net/ipv4/tcp_vegas.c @@ -225,7 +225,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) */ diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; - if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { + if (diff > gamma && tcp_in_slow_start(tp)) { /* Going too fast. Time to slow down * and switch to congestion avoidance. */ @@ -240,7 +240,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); - } else if (tp->snd_cwnd <= tp->snd_ssthresh) { + } else if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { @@ -281,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) vegas->minRTT = 0x7fffffff; } /* Use normal slow start */ - else if (tp->snd_cwnd <= tp->snd_ssthresh) + else if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); } diff --git a/kernel/net/ipv4/tcp_veno.c b/kernel/net/ipv4/tcp_veno.c index 112151eee..0d094b995 100644 --- a/kernel/net/ipv4/tcp_veno.c +++ b/kernel/net/ipv4/tcp_veno.c @@ -150,7 +150,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { /* Slow start. 
*/ tcp_slow_start(tp, acked); } else { diff --git a/kernel/net/ipv4/tcp_yeah.c b/kernel/net/ipv4/tcp_yeah.c index 17d356629..3e6a472e6 100644 --- a/kernel/net/ipv4/tcp_yeah.c +++ b/kernel/net/ipv4/tcp_yeah.c @@ -219,7 +219,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) yeah->fast_count = 0; yeah->reno_count = max(yeah->reno_count>>1, 2U); - return tp->snd_cwnd - reduction; + return max_t(int, tp->snd_cwnd - reduction, 2); } static struct tcp_congestion_ops tcp_yeah __read_mostly = { diff --git a/kernel/net/ipv4/udp.c b/kernel/net/ipv4/udp.c index 1b8c5ba7d..7f8ab46ad 100644 --- a/kernel/net/ipv4/udp.c +++ b/kernel/net/ipv4/udp.c @@ -100,7 +100,6 @@ #include #include #include -#include #include #include #include @@ -375,7 +374,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; return score; } @@ -419,6 +419,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } @@ -963,8 +966,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, sk->sk_family == AF_INET6); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; connected = 0; @@ -1013,13 +1018,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!rt) { struct net *net = sock_net(sk); + __u8 flow_flags = inet_sk_flowi_flags(sk); fl4 = &fl4_stack; + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), + flow_flags, faddr, saddr, dport, inet->inet_sport); + if (!saddr && ipc.oif) { + err = l3mdev_get_saddr(net, ipc.oif, fl4); + if (err < 0) + goto out; + } + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { diff --git 
a/kernel/net/ipv4/udp_diag.c b/kernel/net/ipv4/udp_diag.c index b763c39ae..6116604bf 100644 --- a/kernel/net/ipv4/udp_diag.c +++ b/kernel/net/ipv4/udp_diag.c @@ -170,6 +170,7 @@ static const struct inet_diag_handler udp_diag_handler = { .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, + .idiag_info_size = 0, }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -190,6 +191,7 @@ static const struct inet_diag_handler udplite_diag_handler = { .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, + .idiag_info_size = 0, }; static int __init udp_diag_init(void) diff --git a/kernel/net/ipv4/udp_tunnel.c b/kernel/net/ipv4/udp_tunnel.c index 6bb98cc19..aba428626 100644 --- a/kernel/net/ipv4/udp_tunnel.c +++ b/kernel/net/ipv4/udp_tunnel.c @@ -4,9 +4,10 @@ #include #include #include +#include +#include #include #include -#include int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) @@ -15,12 +16,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket *sock = NULL; struct sockaddr_in udp_addr; - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; @@ -47,7 +46,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; @@ -101,8 +100,30 @@ void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); +struct metadata_dst *udp_tun_rx_dst(struct sk_buff 
*skb, unsigned short family, + __be16 flags, __be64 tunnel_id, int md_size) +{ + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + if (family == AF_INET) + tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); + else + tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + if (udp_hdr(skb)->check) + info->key.tun_flags |= TUNNEL_CSUM; + return tun_dst; +} +EXPORT_SYMBOL_GPL(udp_tun_rx_dst); + MODULE_LICENSE("GPL"); diff --git a/kernel/net/ipv4/xfrm4_input.c b/kernel/net/ipv4/xfrm4_input.c index 60b032f58..62e1e72db 100644 --- a/kernel/net/ipv4/xfrm4_input.c +++ b/kernel/net/ipv4/xfrm4_input.c @@ -22,7 +22,8 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm4_extract_header(skb); } -static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb) +static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); @@ -52,8 +53,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) iph->tot_len = htons(skb->len); ip_send_check(iph); - NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } diff --git a/kernel/net/ipv4/xfrm4_output.c b/kernel/net/ipv4/xfrm4_output.c index 2878dbfff..7ee6518af 100644 --- a/kernel/net/ipv4/xfrm4_output.c +++ b/kernel/net/ipv4/xfrm4_output.c @@ -30,6 +30,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) mtu = dst_mtu(skb_dst(skb)); if (skb->len > mtu) { + skb->protocol = htons(ETH_P_IP); + if (skb->sk) xfrm_local_error(skb, mtu); else @@ -80,24 +82,25 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) return xfrm_output(sk, skb); } -static int 
__xfrm4_output(struct sock *sk, struct sk_buff *skb) +static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; #ifdef CONFIG_NETFILTER if (!x) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif return x->outer_mode->afinfo->output_finish(sk, skb); } -int xfrm4_output(struct sock *sk, struct sk_buff *skb) +int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, - NULL, skb_dst(skb)->dev, __xfrm4_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb_dst(skb)->dev, + __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/kernel/net/ipv4/xfrm4_policy.c b/kernel/net/ipv4/xfrm4_policy.c index bff69746e..7b0edb37a 100644 --- a/kernel/net/ipv4/xfrm4_policy.c +++ b/kernel/net/ipv4/xfrm4_policy.c @@ -15,11 +15,12 @@ #include #include #include +#include static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, - int tos, + int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { @@ -28,9 +29,12 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; + fl4->flowi4_oif = oif; if (saddr) fl4->saddr = saddr->a4; + fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; + rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; @@ -38,22 +42,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, return ERR_CAST(rt); } -static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, +static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { struct flowi4 fl4; - return __xfrm4_dst_lookup(net, &fl4, 
tos, saddr, daddr); + return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr); } -static int xfrm4_get_saddr(struct net *net, +static int xfrm4_get_saddr(struct net *net, int oif, xfrm_address_t *saddr, xfrm_address_t *daddr) { struct dst_entry *dst; struct flowi4 fl4; - dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr); if (IS_ERR(dst)) return -EHOSTUNREACH; @@ -93,6 +97,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; + xdst->u.rt.rt_table_id = rt->rt_table_id; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); return 0; @@ -107,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) int oif = 0; if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; + oif = l3mdev_fib_oif(skb_dst(skb)->dev); memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; @@ -122,7 +127,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) case IPPROTO_DCCP: if (xprth + 4 < skb->data || pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ports = (__be16 *)xprth; + __be16 *ports; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ports = (__be16 *)xprth; fl4->fl4_sport = ports[!!reverse]; fl4->fl4_dport = ports[!reverse]; @@ -130,8 +138,12 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) break; case IPPROTO_ICMP: - if (pskb_may_pull(skb, xprth + 2 - skb->data)) { - u8 *icmp = xprth; + if (xprth + 2 < skb->data || + pskb_may_pull(skb, xprth + 2 - skb->data)) { + u8 *icmp; + + xprth = skb_network_header(skb) + iph->ihl * 4; + icmp = xprth; fl4->fl4_icmp_type = icmp[0]; fl4->fl4_icmp_code = icmp[1]; @@ -139,33 +151,50 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) break; case IPPROTO_ESP: - if (pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be32 *ehdr = (__be32 *)xprth; + 
if (xprth + 4 < skb->data || + pskb_may_pull(skb, xprth + 4 - skb->data)) { + __be32 *ehdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ehdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ehdr[0]; } break; case IPPROTO_AH: - if (pskb_may_pull(skb, xprth + 8 - skb->data)) { - __be32 *ah_hdr = (__be32 *)xprth; + if (xprth + 8 < skb->data || + pskb_may_pull(skb, xprth + 8 - skb->data)) { + __be32 *ah_hdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ah_hdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ah_hdr[1]; } break; case IPPROTO_COMP: - if (pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ipcomp_hdr = (__be16 *)xprth; + if (xprth + 4 < skb->data || + pskb_may_pull(skb, xprth + 4 - skb->data)) { + __be16 *ipcomp_hdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ipcomp_hdr = (__be16 *)xprth; fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); } break; case IPPROTO_GRE: - if (pskb_may_pull(skb, xprth + 12 - skb->data)) { - __be16 *greflags = (__be16 *)xprth; - __be32 *gre_hdr = (__be32 *)xprth; + if (xprth + 12 < skb->data || + pskb_may_pull(skb, xprth + 12 - skb->data)) { + __be16 *greflags; + __be32 *gre_hdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + greflags = (__be16 *)xprth; + gre_hdr = (__be32 *)xprth; if (greflags[0] & GRE_KEY) { if (greflags[0] & GRE_CSUM) @@ -230,7 +259,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xfrm_dst_ifdown(dst, dev); } -static struct dst_ops xfrm4_dst_ops = { +static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, @@ -239,12 +268,12 @@ static struct dst_ops xfrm4_dst_ops = { .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = 32768, + .gc_thresh = INT_MAX, }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .family = AF_INET, - .dst_ops = &xfrm4_dst_ops, + .dst_ops = &xfrm4_dst_ops_template, .dst_lookup = xfrm4_dst_lookup, .get_saddr = 
xfrm4_get_saddr, .decode_session = _decode_session4, @@ -266,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = { { } }; -static int __net_init xfrm4_net_init(struct net *net) +static int __net_init xfrm4_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -294,7 +323,7 @@ err_alloc: return -ENOMEM; } -static void __net_exit xfrm4_net_exit(struct net *net) +static void __net_exit xfrm4_net_sysctl_exit(struct net *net) { struct ctl_table *table; @@ -306,12 +335,44 @@ static void __net_exit xfrm4_net_exit(struct net *net) if (!net_eq(net, &init_net)) kfree(table); } +#else /* CONFIG_SYSCTL */ +static int inline xfrm4_net_sysctl_init(struct net *net) +{ + return 0; +} + +static void inline xfrm4_net_sysctl_exit(struct net *net) +{ +} +#endif + +static int __net_init xfrm4_net_init(struct net *net) +{ + int ret; + + memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template, + sizeof(xfrm4_dst_ops_template)); + ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops); + if (ret) + return ret; + + ret = xfrm4_net_sysctl_init(net); + if (ret) + dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); + + return ret; +} + +static void __net_exit xfrm4_net_exit(struct net *net) +{ + xfrm4_net_sysctl_exit(net); + dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); +} static struct pernet_operations __net_initdata xfrm4_net_ops = { .init = xfrm4_net_init, .exit = xfrm4_net_exit, }; -#endif static void __init xfrm4_policy_init(void) { @@ -320,13 +381,9 @@ static void __init xfrm4_policy_init(void) void __init xfrm4_init(void) { - dst_entries_init(&xfrm4_dst_ops); - xfrm4_state_init(); xfrm4_policy_init(); xfrm4_protocol_init(); -#ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm4_net_ops); -#endif } diff --git a/kernel/net/ipv6/Kconfig b/kernel/net/ipv6/Kconfig index 438a73aa7..983bb9997 100644 --- a/kernel/net/ipv6/Kconfig +++ b/kernel/net/ipv6/Kconfig @@ -5,16 +5,15 @@ # IPv6 as module will cause a CRASH if you try to unload it menuconfig IPV6 
tristate "The IPv6 protocol" - default m + default y ---help--- - This is complemental support for the IP version 6. - You will still be able to do traditional IPv4 networking as well. + Support for IP version 6 (IPv6). For general information about IPv6, see . - For Linux IPv6 development information, see . - For specific information about IPv6 under Linux, read the HOWTO at - . + For specific information about IPv6 under Linux, see + Documentation/networking/ipv6.txt and read the HOWTO at + To compile this protocol support as a module, choose M here: the module will be called ipv6. @@ -93,6 +92,25 @@ config IPV6_MIP6 If unsure, say N. +config IPV6_ILA + tristate "IPv6: Identifier Locator Addressing (ILA)" + select LWTUNNEL + ---help--- + Support for IPv6 Identifier Locator Addressing (ILA). + + ILA is a mechanism to do network virtualization without + encapsulation. The basic concept of ILA is that we split an + IPv6 address into a 64 bit locator and 64 bit identifier. The + identifier is the identity of an entity in communication + ("who") and the locator expresses the location of the + entity ("where"). + + ILA can be configured using the "encap ila" option with + "ip -6 route" command. ILA is described in + https://tools.ietf.org/html/draft-herbert-nvo3-ila-00. + + If unsure, say N. 
+ config INET6_XFRM_TUNNEL tristate select INET6_TUNNEL diff --git a/kernel/net/ipv6/Makefile b/kernel/net/ipv6/Makefile index 2e8c06108..2c900c7b7 100644 --- a/kernel/net/ipv6/Makefile +++ b/kernel/net/ipv6/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o +obj-$(CONFIG_IPV6_ILA) += ila.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o @@ -48,4 +49,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o ifneq ($(CONFIG_IPV6),) obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o +obj-y += mcast_snoop.o endif diff --git a/kernel/net/ipv6/addrconf.c b/kernel/net/ipv6/addrconf.c index 37b70e82b..e8d3da081 100644 --- a/kernel/net/ipv6/addrconf.c +++ b/kernel/net/ipv6/addrconf.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -195,6 +196,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_from_local = 0, + .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -211,7 +213,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_mtu = 1, .stable_secret = { .initialized = false, - } + }, + .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -236,6 +240,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_from_local = 0, + .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -253,6 +258,8 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .stable_secret = { .initialized = false, }, + .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown 
= 0, }; /* Check if a valid qdisc is available */ @@ -343,6 +350,12 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) setup_timer(&ndev->rs_timer, addrconf_rs_timer, (unsigned long)ndev); memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); + + if (ndev->cnf.stable_secret.initialized) + ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + else + ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64; + ndev->cnf.mtu6 = dev->mtu; ndev->cnf.sysctl = NULL; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); @@ -411,6 +424,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) if (err) { ipv6_mc_destroy_dev(ndev); del_timer(&ndev->regen_timer); + snmp6_unregister_dev(ndev); goto err_release; } /* protected by rtnl_lock */ @@ -468,6 +482,9 @@ static int inet6_netconf_msgsize_devconf(int type) if (type == -1 || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + size += nla_total_size(4); + return size; } @@ -504,6 +521,11 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) goto nla_put_failure; + if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + devconf->ignore_routes_with_linkdown) < 0) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -540,6 +562,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, + [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet6_netconf_get_devconf(struct sk_buff *in_skb, @@ -560,7 +583,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; @@ -762,6 +785,63 @@ 
static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) rt6_purge_dflt_routers(net); return 1; } + +static void addrconf_linkdown_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); + + idev->cnf.ignore_routes_with_linkdown = newf; + if (changed) + inet6_netconf_notify_devconf(dev_net(dev), + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + dev->ifindex, + &idev->cnf); + } + } +} + +static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) +{ + struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); + + net = (struct net *)table->extra2; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) { + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + rtnl_unlock(); + return 0; + } + + if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) { + net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf; + addrconf_linkdown_change(net, newf); + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + } + rtnl_unlock(); + + return 1; +} + #endif /* Nobody refers to this ifaddr, destroy it */ @@ -1358,15 +1438,96 @@ out: return ret; } +static int __ipv6_dev_get_saddr(struct net *net, + struct ipv6_saddr_dst *dst, + struct inet6_dev *idev, + struct ipv6_saddr_score *scores, + int hiscore_idx) +{ + struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx]; + + read_lock_bh(&idev->lock); + list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + int i; + + /* + * - Tentative Address (RFC2462 section 5.4) + * - A tentative address is not considered + * "assigned to 
an interface" in the traditional + * sense, unless it is also flagged as optimistic. + * - Candidate Source Address (section 4) + * - In any case, anycast addresses, multicast + * addresses, and the unspecified address MUST + * NOT be included in a candidate set. + */ + if ((score->ifa->flags & IFA_F_TENTATIVE) && + (!(score->ifa->flags & IFA_F_OPTIMISTIC))) + continue; + + score->addr_type = __ipv6_addr_type(&score->ifa->addr); + + if (unlikely(score->addr_type == IPV6_ADDR_ANY || + score->addr_type & IPV6_ADDR_MULTICAST)) { + net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s", + idev->dev->name); + continue; + } + + score->rule = -1; + bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); + + for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { + int minihiscore, miniscore; + + minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i); + miniscore = ipv6_get_saddr_eval(net, score, dst, i); + + if (minihiscore > miniscore) { + if (i == IPV6_SADDR_RULE_SCOPE && + score->scopedist > 0) { + /* + * special case: + * each remaining entry + * has too small (not enough) + * scope, because ifa entries + * are sorted by their scope + * values. 
+ */ + goto out; + } + break; + } else if (minihiscore < miniscore) { + if (hiscore->ifa) + in6_ifa_put(hiscore->ifa); + + in6_ifa_hold(score->ifa); + + swap(hiscore, score); + hiscore_idx = 1 - hiscore_idx; + + /* restore our iterator */ + score->ifa = hiscore->ifa; + + break; + } + } + } +out: + read_unlock_bh(&idev->lock); + return hiscore_idx; +} + int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { - struct ipv6_saddr_score scores[2], - *score = &scores[0], *hiscore = &scores[1]; + struct ipv6_saddr_score scores[2], *hiscore; struct ipv6_saddr_dst dst; + struct inet6_dev *idev; struct net_device *dev; int dst_type; + bool use_oif_addr = false; + int hiscore_idx = 0; dst_type = __ipv6_addr_type(daddr); dst.addr = daddr; @@ -1375,105 +1536,50 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex); dst.prefs = prefs; - hiscore->rule = -1; - hiscore->ifa = NULL; + scores[hiscore_idx].rule = -1; + scores[hiscore_idx].ifa = NULL; rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - struct inet6_dev *idev; - - /* Candidate Source Address (section 4) - * - multicast and link-local destination address, - * the set of candidate source address MUST only - * include addresses assigned to interfaces - * belonging to the same link as the outgoing - * interface. - * (- For site-local destination addresses, the - * set of candidate source addresses MUST only - * include addresses assigned to interfaces - * belonging to the same site as the outgoing - * interface.) 
- */ - if (((dst_type & IPV6_ADDR_MULTICAST) || - dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && - dst.ifindex && dev->ifindex != dst.ifindex) - continue; - - idev = __in6_dev_get(dev); - if (!idev) - continue; - - read_lock_bh(&idev->lock); - list_for_each_entry(score->ifa, &idev->addr_list, if_list) { - int i; - - /* - * - Tentative Address (RFC2462 section 5.4) - * - A tentative address is not considered - * "assigned to an interface" in the traditional - * sense, unless it is also flagged as optimistic. - * - Candidate Source Address (section 4) - * - In any case, anycast addresses, multicast - * addresses, and the unspecified address MUST - * NOT be included in a candidate set. - */ - if ((score->ifa->flags & IFA_F_TENTATIVE) && - (!(score->ifa->flags & IFA_F_OPTIMISTIC))) - continue; - - score->addr_type = __ipv6_addr_type(&score->ifa->addr); + /* Candidate Source Address (section 4) + * - multicast and link-local destination address, + * the set of candidate source address MUST only + * include addresses assigned to interfaces + * belonging to the same link as the outgoing + * interface. + * (- For site-local destination addresses, the + * set of candidate source addresses MUST only + * include addresses assigned to interfaces + * belonging to the same site as the outgoing + * interface.) + * - "It is RECOMMENDED that the candidate source addresses + * be the set of unicast addresses assigned to the + * interface that will be used to send to the destination + * (the 'outgoing' interface)." 
(RFC 6724) + */ + if (dst_dev) { + idev = __in6_dev_get(dst_dev); + if ((dst_type & IPV6_ADDR_MULTICAST) || + dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL || + (idev && idev->cnf.use_oif_addrs_only)) { + use_oif_addr = true; + } + } - if (unlikely(score->addr_type == IPV6_ADDR_ANY || - score->addr_type & IPV6_ADDR_MULTICAST)) { - net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s", - dev->name); + if (use_oif_addr) { + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); + } else { + for_each_netdev_rcu(net, dev) { + idev = __in6_dev_get(dev); + if (!idev) continue; - } - - score->rule = -1; - bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); - - for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { - int minihiscore, miniscore; - - minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i); - miniscore = ipv6_get_saddr_eval(net, score, &dst, i); - - if (minihiscore > miniscore) { - if (i == IPV6_SADDR_RULE_SCOPE && - score->scopedist > 0) { - /* - * special case: - * each remaining entry - * has too small (not enough) - * scope, because ifa entries - * are sorted by their scope - * values. 
- */ - goto try_nextdev; - } - break; - } else if (minihiscore < miniscore) { - if (hiscore->ifa) - in6_ifa_put(hiscore->ifa); - - in6_ifa_hold(score->ifa); - - swap(hiscore, score); - - /* restore our iterator */ - score->ifa = hiscore->ifa; - - break; - } - } + hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } -try_nextdev: - read_unlock_bh(&idev->lock); } rcu_read_unlock(); + hiscore = &scores[hiscore_idx]; if (!hiscore->ifa) return -EADDRNOTAVAIL; @@ -1845,37 +1951,6 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) __ipv6_dev_ac_dec(ifp->idev, &addr); } -static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) -{ - if (dev->addr_len != ETH_ALEN) - return -1; - memcpy(eui, dev->dev_addr, 3); - memcpy(eui + 5, dev->dev_addr + 3, 3); - - /* - * The zSeries OSA network cards can be shared among various - * OS instances, but the OSA cards have only one MAC address. - * This leads to duplicate address conflicts in conjunction - * with IPv6 if more than one instance uses the same card. - * - * The driver for these cards can deliver a unique 16-bit - * identifier for each instance sharing the same card. It is - * placed instead of 0xFFFE in the interface identifier. The - * "u" bit of the interface identifier is not inverted in this - * case. Hence the resulting interface identifier has local - * scope according to RFC2373. - */ - if (dev->dev_id) { - eui[3] = (dev->dev_id >> 8) & 0xFF; - eui[4] = dev->dev_id & 0xFF; - } else { - eui[3] = 0xFF; - eui[4] = 0xFE; - eui[0] ^= 2; - } - return 0; -} - static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev) { if (dev->addr_len != IEEE802154_ADDR_LEN) @@ -2079,7 +2154,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, unsigned long expires, u32 flags) { struct fib6_config cfg = { - .fc_table = RT6_TABLE_PREFIX, + .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_PREFIX, .fc_metric = IP6_RT_PRIO_ADDRCONF, .fc_ifindex = dev->ifindex, .fc_expires = expires, @@ -2112,8 +2187,9 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, struct fib6_node *fn; struct rt6_info *rt = NULL; struct fib6_table *table; + u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX; - table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX); + table = fib6_get_table(dev_net(dev), tb_id); if (!table) return NULL; @@ -2121,6 +2197,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); if (!fn) goto out; + + noflags |= RTF_CACHE; for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (rt->dst.dev->ifindex != dev->ifindex) continue; @@ -2142,7 +2220,7 @@ out: static void addrconf_add_mroute(struct net_device *dev) { struct fib6_config cfg = { - .fc_table = RT6_TABLE_LOCAL, + .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_LOCAL, .fc_metric = IP6_RT_PRIO_ADDRCONF, .fc_ifindex = dev->ifindex, .fc_dst_len = 8, @@ -2383,7 +2461,7 @@ ok: #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (in6_dev->cnf.optimistic_dad && !net->ipv6.devconf_all->forwarding && sllao) - addr_flags = IFA_F_OPTIMISTIC; + addr_flags |= IFA_F_OPTIMISTIC; #endif /* Do not allow to create too much of autoconfigured @@ -2960,6 +3038,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) { struct in6_addr addr; + /* no link local addresses on L3 master devices */ + if (netif_is_l3_master(idev->dev)) + return; + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { @@ -3050,6 +3132,8 @@ static void addrconf_gre_config(struct net_device *dev) } addrconf_addr_gen(idev, true); + if (dev->flags & IFF_POINTOPOINT) + addrconf_add_mroute(dev); } #endif @@ -3070,6 +3154,32 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } break; + case NETDEV_CHANGEMTU: + /* if MTU under 
IPV6_MIN_MTU stop IPv6 on this interface. */ + if (dev->mtu < IPV6_MIN_MTU) { + addrconf_ifdown(dev, 1); + break; + } + + if (idev) { + rt6_mtu_change(dev, dev->mtu); + idev->cnf.mtu6 = dev->mtu; + break; + } + + /* allocate new idev */ + idev = ipv6_add_dev(dev); + if (IS_ERR(idev)) + break; + + /* device is still not ready */ + if (!(idev->if_flags & IF_READY)) + break; + + run_pending = 1; + + /* fall through */ + case NETDEV_UP: case NETDEV_CHANGE: if (dev->flags & IFF_SLAVE) @@ -3093,7 +3203,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, idev->if_flags |= IF_READY; run_pending = 1; } - } else { + } else if (event == NETDEV_CHANGE) { if (!addrconf_qdisc_ok(dev)) { /* device is still not ready. */ break; @@ -3158,24 +3268,6 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } break; - case NETDEV_CHANGEMTU: - if (idev && dev->mtu >= IPV6_MIN_MTU) { - rt6_mtu_change(dev, dev->mtu); - idev->cnf.mtu6 = dev->mtu; - break; - } - - if (!idev && dev->mtu >= IPV6_MIN_MTU) { - idev = ipv6_add_dev(dev); - if (!IS_ERR(idev)) - break; - } - - /* - * if MTU under IPV6_MIN_MTU. - * Stop IPv6 on this interface. - */ - case NETDEV_DOWN: case NETDEV_UNREGISTER: /* @@ -3414,6 +3506,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; + bool notify = false; addrconf_join_solict(dev, &ifp->addr); @@ -3459,7 +3552,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. 
*/ - ipv6_ifa_notify(RTM_NEWADDR, ifp); + notify = true; } } @@ -3467,6 +3560,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) out: spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); + if (notify) + ipv6_ifa_notify(RTM_NEWADDR, ifp); } static void addrconf_dad_start(struct inet6_ifaddr *ifp) @@ -3556,7 +3651,7 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); + ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); rtnl_unlock(); @@ -4558,6 +4653,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; + array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit; array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; @@ -4583,7 +4679,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; + array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; /* we omit DEVCONF_STABLE_SECRET for now */ + array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; } static inline size_t inet6_ifla6_size(void) @@ -4603,6 +4701,7 @@ static inline size_t inet6_if_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */ } @@ -4622,18 +4721,24 @@ static inline void __snmp6_fill_statsdev(u64 
*stats, atomic_long_t *mib, } static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, - int items, int bytes, size_t syncpoff) + int bytes, size_t syncpoff) { - int i; - int pad = bytes - sizeof(u64) * items; + int i, c; + u64 buff[IPSTATS_MIB_MAX]; + int pad = bytes - sizeof(u64) * IPSTATS_MIB_MAX; + BUG_ON(pad < 0); - /* Use put_unaligned() because stats may not be aligned for u64. */ - put_unaligned(items, &stats[0]); - for (i = 1; i < items; i++) - put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]); + memset(buff, 0, sizeof(buff)); + buff[0] = IPSTATS_MIB_MAX; - memset(&stats[items], 0, pad); + for_each_possible_cpu(c) { + for (i = 1; i < IPSTATS_MIB_MAX; i++) + buff[i] += snmp_get_cpu_field64(mib, c, i, syncpoff); + } + + memcpy(stats, buff, IPSTATS_MIB_MAX * sizeof(u64)); + memset(&stats[IPSTATS_MIB_MAX], 0, pad); } static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, @@ -4641,8 +4746,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, { switch (attrtype) { case IFLA_INET6_STATS: - __snmp6_fill_stats64(stats, idev->stats.ipv6, - IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); + __snmp6_fill_stats64(stats, idev->stats.ipv6, bytes, + offsetof(struct ipstats_mib, syncp)); break; case IFLA_INET6_ICMP6STATS: __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); @@ -4650,7 +4755,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, } } -static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) +static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, + u32 ext_filter_mask) { struct nlattr *nla; struct ifla_cacheinfo ci; @@ -4670,6 +4776,9 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) /* XXX - MC not implemented */ + if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS) + return 0; + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * 
sizeof(u64)); if (!nla) goto nla_put_failure; @@ -4697,7 +4806,8 @@ nla_put_failure: return -EMSGSIZE; } -static size_t inet6_get_link_af_size(const struct net_device *dev) +static size_t inet6_get_link_af_size(const struct net_device *dev, + u32 ext_filter_mask) { if (!__in6_dev_get(dev)) return 0; @@ -4705,14 +4815,15 @@ static size_t inet6_get_link_af_size(const struct net_device *dev) return inet6_ifla6_size(); } -static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev, + u32 ext_filter_mask) { struct inet6_dev *idev = __in6_dev_get(dev); if (!idev) return -ENODATA; - if (inet6_fill_ifla6_attrs(skb, idev) < 0) + if (inet6_fill_ifla6_attrs(skb, idev, ext_filter_mask) < 0) return -EMSGSIZE; return 0; @@ -4859,13 +4970,15 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || + nla_put_u8(skb, IFLA_OPERSTATE, + netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN)) goto nla_put_failure; protoinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protoinfo) goto nla_put_failure; - if (inet6_fill_ifla6_attrs(skb, idev) < 0) + if (inet6_fill_ifla6_attrs(skb, idev, 0) < 0) goto nla_put_failure; nla_nest_end(skb, protoinfo); @@ -5046,13 +5159,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); - if (rt && ip6_del_rt(rt)) - dst_free(&rt->dst); + if (rt) + ip6_del_rt(rt); } dst_hold(&ifp->rt->dst); - if (ip6_del_rt(ifp->rt)) - dst_free(&ifp->rt->dst); + ip6_del_rt(ifp->rt); rt_genid_bump_ipv6(net); break; @@ -5260,13 +5372,10 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, goto out; } - if (!write) { - err = snprintf(str, sizeof(str), "%pI6", - &secret->secret); - if (err >= sizeof(str)) { - err = -EIO; - goto out; - } + err = snprintf(str, sizeof(str), "%pI6", &secret->secret); + if (err >= sizeof(str)) { + err = -EIO; + goto out; } err = proc_dostring(&lctl, write, buffer, lenp, ppos); @@ -5304,6 +5413,34 @@ out: return err; } +static +int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; + int ret; + + /* ctl->data points to idev->cnf.ignore_routes_when_linkdown + * we should not modify it until we get the rtnl lock. 
+ */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + + if (write) + ret = addrconf_fixup_linkdown(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; @@ -5453,6 +5590,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "accept_ra_min_hop_limit", + .data = &ipv6_devconf.accept_ra_min_hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_pinfo", .data = &ipv6_devconf.accept_ra_pinfo, @@ -5582,6 +5726,20 @@ static struct addrconf_sysctl_table .mode = 0600, .proc_handler = addrconf_sysctl_stable_secret, }, + { + .procname = "use_oif_addrs_only", + .data = &ipv6_devconf.use_oif_addrs_only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ignore_routes_with_linkdown", + .data = &ipv6_devconf.ignore_routes_with_linkdown, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, + }, { /* sentinel */ } diff --git a/kernel/net/ipv6/addrconf_core.c b/kernel/net/ipv6/addrconf_core.c index ca09bf49a..bfa941fc1 100644 --- a/kernel/net/ipv6/addrconf_core.c +++ b/kernel/net/ipv6/addrconf_core.c @@ -107,7 +107,16 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v) } EXPORT_SYMBOL(inet6addr_notifier_call_chain); -const struct ipv6_stub *ipv6_stub __read_mostly; +static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1, + struct dst_entry **u2, + struct flowi6 *u3) +{ + return -EAFNOSUPPORT; +} + +const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { + .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, +}; EXPORT_SYMBOL_GPL(ipv6_stub); /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ diff --git a/kernel/net/ipv6/addrlabel.c b/kernel/net/ipv6/addrlabel.c index 
882124ebb..a8f6986dc 100644 --- a/kernel/net/ipv6/addrlabel.c +++ b/kernel/net/ipv6/addrlabel.c @@ -552,7 +552,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh) rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - if (p && ip6addrlbl_hold(p)) + if (p && !ip6addrlbl_hold(p)) p = NULL; lseq = ip6addrlbl_table.seq; rcu_read_unlock(); diff --git a/kernel/net/ipv6/af_inet6.c b/kernel/net/ipv6/af_inet6.c index eef63b394..9f5137cd6 100644 --- a/kernel/net/ipv6/af_inet6.c +++ b/kernel/net/ipv6/af_inet6.c @@ -109,6 +109,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; @@ -167,7 +170,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot); + sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; @@ -197,6 +200,7 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; + np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk)); sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets @@ -342,7 +346,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!(inet->freebind || inet->transparent) && + if (!net->ipv6.sysctl.ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; @@ -362,7 +367,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) np->saddr = addr->sin6_addr; /* Make sure we are allowed to bind here. 
*/ - if (sk->sk_prot->get_port(sk, snum)) { + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { inet_reset_saddr(sk); err = -EADDRINUSE; goto out; @@ -425,9 +431,11 @@ void inet6_destroy_sock(struct sock *sk) /* Free tx options */ - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } } EXPORT_SYMBOL_GPL(inet6_destroy_sock); @@ -656,7 +664,10 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), + &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -665,7 +676,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return 0; @@ -678,8 +689,8 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct ipv6_pinfo *np = inet6_sk(sk); if (np->rxopt.all) { - if ((opt->hop && (np->rxopt.bits.hopopts || - np->rxopt.bits.ohopopts)) || + if (((opt->flags & IP6SKB_HOPBYHOP) && + (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) && np->rxopt.bits.rxflow) || (opt->srcrt && (np->rxopt.bits.srcrt || @@ -765,9 +776,10 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; - net->ipv6.sysctl.auto_flowlabels = 0; + net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; + net->ipv6.sysctl.flowlabel_state_ranges = 0; 
atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); diff --git a/kernel/net/ipv6/ah6.c b/kernel/net/ipv6/ah6.c index ed7d4e3f9..0630a4d5d 100644 --- a/kernel/net/ipv6/ah6.c +++ b/kernel/net/ipv6/ah6.c @@ -577,8 +577,10 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len + ahp->icv_trunc_len + seqhi_len); - if (!work_iph) + if (!work_iph) { + err = -ENOMEM; goto out; + } auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len); seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); diff --git a/kernel/net/ipv6/datagram.c b/kernel/net/ipv6/datagram.c index b10a88986..428162155 100644 --- a/kernel/net/ipv6/datagram.c +++ b/kernel/net/ipv6/datagram.c @@ -162,13 +162,18 @@ ipv4_connected: fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; + if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; + if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) fl6.flowi6_oif = np->mcast_oif; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - opt = flowlabel ? flowlabel->opt : np->opt; + rcu_read_lock(); + opt = flowlabel ? 
flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); err = 0; @@ -199,7 +204,7 @@ ipv4_connected: NULL); sk->sk_state = TCP_ESTABLISHED; - ip6_set_txhash(sk); + sk_set_txhash(sk); out: fl6_sock_release(flowlabel); return err; @@ -263,7 +268,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct ipv6hdr *iph; struct sk_buff *skb; @@ -568,8 +573,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, } /* HbH is allowed only once */ - if (np->rxopt.bits.hopopts && opt->hop) { - u8 *ptr = nh + opt->hop; + if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { + u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); } @@ -630,8 +635,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } - if (np->rxopt.bits.ohopopts && opt->hop) { - u8 *ptr = nh + opt->hop; + if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { + u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { diff --git a/kernel/net/ipv6/esp6.c b/kernel/net/ipv6/esp6.c index 7c07ce36a..060a60b2f 100644 --- a/kernel/net/ipv6/esp6.c +++ b/kernel/net/ipv6/esp6.c @@ -76,7 +76,7 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) len = ALIGN(len, crypto_tfm_ctx_alignment()); } - len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead); + len += sizeof(struct aead_request) + crypto_aead_reqsize(aead); len = ALIGN(len, __alignof__(struct scatterlist)); len 
+= sizeof(struct scatterlist) * nfrags; @@ -96,17 +96,6 @@ static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; } -static inline struct aead_givcrypt_request *esp_tmp_givreq( - struct crypto_aead *aead, u8 *iv) -{ - struct aead_givcrypt_request *req; - - req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), - crypto_tfm_ctx_alignment()); - aead_givcrypt_set_tfm(req, aead); - return req; -} - static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv) { struct aead_request *req; @@ -125,14 +114,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static inline struct scatterlist *esp_givreq_sg( - struct crypto_aead *aead, struct aead_givcrypt_request *req) -{ - return (void *)ALIGN((unsigned long)(req + 1) + - crypto_aead_reqsize(aead), - __alignof__(struct scatterlist)); -} - static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -141,32 +122,57 @@ static void esp_output_done(struct crypto_async_request *base, int err) xfrm_output_resume(skb, err); } +/* Move ESP header back into place. 
*/ +static void esp_restore_header(struct sk_buff *skb, unsigned int offset) +{ + struct ip_esp_hdr *esph = (void *)(skb->data + offset); + void *tmp = ESP_SKB_CB(skb)->tmp; + __be32 *seqhi = esp_tmp_seqhi(tmp); + + esph->seq_no = esph->spi; + esph->spi = *seqhi; +} + +static void esp_output_restore_header(struct sk_buff *skb) +{ + esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); +} + +static void esp_output_done_esn(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + esp_output_restore_header(skb); + esp_output_done(base, err); +} + static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) { int err; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct aead_givcrypt_request *req; + struct aead_request *req; struct scatterlist *sg; - struct scatterlist *asg; struct sk_buff *trailer; void *tmp; int blksize; int clen; int alen; int plen; + int ivlen; int tfclen; int nfrags; int assoclen; - int sglists; int seqhilen; u8 *iv; u8 *tail; __be32 *seqhi; + __be64 seqno; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); + ivlen = crypto_aead_ivsize(aead); tfclen = 0; if (x->tfcpad) { @@ -187,16 +193,14 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) nfrags = err; assoclen = sizeof(*esph); - sglists = 1; seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { - sglists += 2; seqhilen += sizeof(__be32); assoclen += seqhilen; } - tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) { err = -ENOMEM; goto error; @@ -204,9 +208,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); - req = esp_tmp_givreq(aead, iv); - asg = esp_givreq_sg(aead, req); - sg = asg + sglists; + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); /* Fill padding... 
*/ tail = skb_tail_pointer(trailer); @@ -227,37 +230,53 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; - esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + aead_request_set_callback(req, 0, esp_output_done, skb); + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * encryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); + *seqhi = esph->spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + aead_request_set_callback(req, 0, esp_output_done_esn, skb); + } + + esph->spi = x->id.spi; + sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, - esph->enc_data + crypto_aead_ivsize(aead) - skb->data, - clen + alen); + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); - if ((x->props.flags & XFRM_STATE_ESN)) { - sg_init_table(asg, 3); - sg_set_buf(asg, &esph->spi, sizeof(__be32)); - *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); - sg_set_buf(asg + 1, seqhi, seqhilen); - sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); - } else - sg_init_one(asg, esph, sizeof(*esph)); - - aead_givcrypt_set_callback(req, 0, esp_output_done, skb); - aead_givcrypt_set_crypt(req, sg, sg, clen, iv); - aead_givcrypt_set_assoc(req, asg, assoclen); - aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output.low + - ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); + aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); + aead_request_set_ad(req, assoclen); + + seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); + + memset(iv, 0, ivlen); + memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8), + min(ivlen, 8)); ESP_SKB_CB(skb)->tmp = tmp; - err = crypto_aead_givencrypt(req); - if (err == -EINPROGRESS) + err = crypto_aead_encrypt(req); + + switch (err) { + 
case -EINPROGRESS: goto error; - if (err == -EBUSY) + case -EBUSY: err = NET_XMIT_DROP; + break; + + case 0: + if ((x->props.flags & XFRM_STATE_ESN)) + esp_output_restore_header(skb); + } kfree(tmp); @@ -318,25 +337,38 @@ static void esp_input_done(struct crypto_async_request *base, int err) xfrm_input_resume(skb, esp_input_done2(skb, err)); } +static void esp_input_restore_header(struct sk_buff *skb) +{ + esp_restore_header(skb, 0); + __skb_pull(skb, 4); +} + +static void esp_input_done_esn(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + esp_input_restore_header(skb); + esp_input_done(base, err); +} + static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; - int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); + int ivlen = crypto_aead_ivsize(aead); + int elen = skb->len - sizeof(*esph) - ivlen; int nfrags; int assoclen; - int sglists; int seqhilen; int ret = 0; void *tmp; __be32 *seqhi; u8 *iv; struct scatterlist *sg; - struct scatterlist *asg; - if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) { + if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) { ret = -EINVAL; goto out; } @@ -355,16 +387,14 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) ret = -ENOMEM; assoclen = sizeof(*esph); - sglists = 1; seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { - sglists += 2; seqhilen += sizeof(__be32); assoclen += seqhilen; } - tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) goto out; @@ -372,36 +402,39 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); - asg = esp_req_sg(aead, req); - sg = asg + sglists; + sg = esp_req_sg(aead, req); skb->ip_summed = CHECKSUM_NONE; esph = (struct ip_esp_hdr 
*)skb->data; - /* Get ivec. This can be wrong, check against another impls. */ - iv = esph->enc_data; - - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); + aead_request_set_callback(req, 0, esp_input_done, skb); + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * decryption. + */ if ((x->props.flags & XFRM_STATE_ESN)) { - sg_init_table(asg, 3); - sg_set_buf(asg, &esph->spi, sizeof(__be32)); - *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; - sg_set_buf(asg + 1, seqhi, seqhilen); - sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); - } else - sg_init_one(asg, esph, sizeof(*esph)); + esph = (void *)skb_push(skb, 4); + *seqhi = esph->spi; + esph->spi = esph->seq_no; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi); + aead_request_set_callback(req, 0, esp_input_done_esn, skb); + } - aead_request_set_callback(req, 0, esp_input_done, skb); - aead_request_set_crypt(req, sg, sg, elen, iv); - aead_request_set_assoc(req, asg, assoclen); + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); + aead_request_set_ad(req, assoclen); ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) goto out; + if ((x->props.flags & XFRM_STATE_ESN)) + esp_input_restore_header(skb); + ret = esp_input_done2(skb, ret); out: @@ -461,10 +494,16 @@ static void esp6_destroy(struct xfrm_state *x) static int esp_init_aead(struct xfrm_state *x) { + char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - aead = crypto_alloc_aead(x->aead->alg_name, 0, 0); + err = -ENAMETOOLONG; + if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", + x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -503,15 +542,19 @@ static int esp_init_authenc(struct xfrm_state *x) if ((x->props.flags & 
XFRM_STATE_ESN)) { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, - "authencesn(%s,%s)", + "%s%sauthencesn(%s,%s)%s", + x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + x->ealg->alg_name, + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) goto error; } else { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, - "authenc(%s,%s)", + "%s%sauthenc(%s,%s)%s", + x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + x->ealg->alg_name, + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) goto error; } diff --git a/kernel/net/ipv6/exthdrs.c b/kernel/net/ipv6/exthdrs.c index a7bbbe455..ea7c4d64a 100644 --- a/kernel/net/ipv6/exthdrs.c +++ b/kernel/net/ipv6/exthdrs.c @@ -632,7 +632,7 @@ int ipv6_parse_hopopts(struct sk_buff *skb) return -1; } - opt->hop = sizeof(struct ipv6hdr); + opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; opt = IP6CB(skb); @@ -727,6 +727,7 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; + atomic_set(&opt2->refcnt, 1); } return opt2; } @@ -790,7 +791,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); - + atomic_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); diff --git a/kernel/net/ipv6/fib6_rules.c b/kernel/net/ipv6/fib6_rules.c index 2367a16ea..ed33abf57 100644 --- a/kernel/net/ipv6/fib6_rules.c +++ b/kernel/net/ipv6/fib6_rules.c @@ -32,6 +32,7 @@ struct fib6_rule { struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { + struct rt6_info *rt; struct fib_lookup_arg arg = { .lookup_ptr = lookup, .flags = FIB_LOOKUP_NOREF, @@ -40,11 +41,21 @@ struct dst_entry 
*fib6_rule_lookup(struct net *net, struct flowi6 *fl6, fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); - if (arg.result) - return arg.result; + rt = arg.result; - dst_hold(&net->ipv6.ip6_null_entry->dst); - return &net->ipv6.ip6_null_entry->dst; + if (!rt) { + dst_hold(&net->ipv6.ip6_null_entry->dst); + return &net->ipv6.ip6_null_entry->dst; + } + + if (rt->rt6i_flags & RTF_REJECT && + rt->dst.error == -EAGAIN) { + ip6_rt_put(rt); + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } + + return &rt->dst; } static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, @@ -258,11 +269,6 @@ nla_put_failure: return -ENOBUFS; } -static u32 fib6_rule_default_pref(struct fib_rules_ops *ops) -{ - return 0x3FFF; -} - static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ @@ -279,7 +285,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, - .default_pref = fib6_rule_default_pref, .nlmsg_payload = fib6_rule_nlmsg_payload, .nlgroup = RTNLGRP_IPV6_RULE, .policy = fib6_rule_policy, diff --git a/kernel/net/ipv6/icmp.c b/kernel/net/ipv6/icmp.c index 2c2b5d51f..0a37ddc7a 100644 --- a/kernel/net/ipv6/icmp.c +++ b/kernel/net/ipv6/icmp.c @@ -68,6 +68,7 @@ #include #include #include +#include #include @@ -207,7 +208,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct inet_peer *peer; peer = inet_getpeer_v6(net->ipv6.peers, - &rt->rt6i_dst.addr, 1); + &fl6->daddr, 1); res = inet_peer_xrlim_allow(peer, tmo); if (peer) inet_putpeer(peer); @@ -329,7 +330,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, struct flowi6 fl2; int err; - err = ip6_dst_lookup(sk, &dst, fl6); + err = ip6_dst_lookup(net, sk, &dst, fl6); if (err) return ERR_PTR(err); @@ -337,7 +338,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, * We won't send icmp if the 
destination is known * anycast. */ - if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + if (ipv6_anycast_destination(dst, &fl6->daddr)) { net_dbg_ratelimited("icmp6_send: acast source\n"); dst_release(dst); return ERR_PTR(-EINVAL); @@ -361,7 +362,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, if (err) goto relookup_failed; - err = ip6_dst_lookup(sk, &dst2, &fl2); + err = ip6_dst_lookup(net, sk, &dst2, &fl2); if (err) goto relookup_failed; @@ -452,7 +453,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { - net_dbg_ratelimited("icmp6_send: addr_any/mcast source\n"); + net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", + &hdr->saddr, &hdr->daddr); return; } @@ -460,7 +462,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) * Never answer to a ICMP packet. */ if (is_ineligible(skb)) { - net_dbg_ratelimited("icmp6_send: no reply to icmp error\n"); + net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", + &hdr->saddr, &hdr->daddr); return; } @@ -496,6 +499,9 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; + if (!fl6.flowi6_oif) + fl6.flowi6_oif = l3mdev_master_ifindex(skb->dev); + dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out; @@ -509,7 +515,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) len = skb->len - msg.offset; len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)); if (len < 0) { - net_dbg_ratelimited("icmp: len problem\n"); + net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n", + &hdr->saddr, &hdr->daddr); goto out_dst_release; } @@ -564,7 +571,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if 
(!ipv6_unicast_destination(skb) && !(net->ipv6.sysctl.anycast_src_echo_reply && - ipv6_anycast_destination(skb))) + ipv6_anycast_destination(skb_dst(skb), saddr))) saddr = NULL; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); @@ -575,7 +582,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; - fl6.flowi6_oif = skb->dev->ifindex; + fl6.flowi6_oif = l3mdev_fib_oif(skb->dev); fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); @@ -591,7 +598,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(sk, &dst, &fl6); + err = ip6_dst_lookup(net, sk, &dst, &fl6); if (err) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); @@ -781,7 +788,8 @@ static int icmpv6_rcv(struct sk_buff *skb) if (type & ICMPV6_INFOMSG_MASK) break; - net_dbg_ratelimited("icmpv6: msg of unknown type\n"); + net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n", + saddr, daddr); /* * error of unknown type. @@ -826,11 +834,6 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); } -/* - * Special lock-class for __icmpv6_sk: - */ -static struct lock_class_key icmpv6_socket_sk_dst_lock_key; - static int __net_init icmpv6_sk_init(struct net *net) { struct sock *sk; @@ -852,15 +855,6 @@ static int __net_init icmpv6_sk_init(struct net *net) net->ipv6.icmp_sk[i] = sk; - /* - * Split off their lock-class, because sk->sk_dst_lock - * gets used from softirqs, which is safe for - * __icmpv6_sk (because those never get directly used - * via userspace syscalls), but unsafe for normal sockets. - */ - lockdep_set_class(&sk->sk_dst_lock, - &icmpv6_socket_sk_dst_lock_key); - /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. 
*/ diff --git a/kernel/net/ipv6/ila.c b/kernel/net/ipv6/ila.c new file mode 100644 index 000000000..1a6852e1a --- /dev/null +++ b/kernel/net/ipv6/ila.c @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ila_params { + __be64 locator; + __be64 locator_match; + __wsum csum_diff; +}; + +static inline struct ila_params *ila_params_lwtunnel( + struct lwtunnel_state *lwstate) +{ + return (struct ila_params *)lwstate->data; +} + +static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) +{ + __be32 diff[] = { + ~from[0], ~from[1], to[0], to[1], + }; + + return csum_partial(diff, sizeof(diff), 0); +} + +static inline __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +{ + if (*(__be64 *)&ip6h->daddr == p->locator_match) + return p->csum_diff; + else + return compute_csum_diff8((__be32 *)&ip6h->daddr, + (__be32 *)&p->locator); +} + +static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +{ + __wsum diff; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + size_t nhoff = sizeof(struct ipv6hdr); + + /* First update checksum */ + switch (ip6h->nexthdr) { + case NEXTHDR_TCP: + if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) { + struct tcphdr *th = (struct tcphdr *) + (skb_network_header(skb) + nhoff); + + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&th->check, skb, + diff, true); + } + break; + case NEXTHDR_UDP: + if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) { + struct udphdr *uh = (struct udphdr *) + (skb_network_header(skb) + nhoff); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&uh->check, skb, + diff, true); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + break; + case NEXTHDR_ICMP: + if (likely(pskb_may_pull(skb, + nhoff + sizeof(struct icmp6hdr)))) { + struct icmp6hdr *ih = (struct 
icmp6hdr *) + (skb_network_header(skb) + nhoff); + + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, + diff, true); + } + break; + } + + /* Now change destination address */ + *(__be64 *)&ip6h->daddr = p->locator; +} + +static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + + return dst->lwtstate->orig_output(net, sk, skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int ila_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + + return dst->lwtstate->orig_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { + [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, +}; + +static int ila_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct ila_params *p; + struct nlattr *tb[ILA_ATTR_MAX + 1]; + size_t encap_len = sizeof(*p); + struct lwtunnel_state *newts; + const struct fib6_config *cfg6 = cfg; + int ret; + + if (family != AF_INET6) + return -EINVAL; + + ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, + ila_nl_policy); + if (ret < 0) + return ret; + + if (!tb[ILA_ATTR_LOCATOR]) + return -EINVAL; + + newts = lwtunnel_state_alloc(encap_len); + if (!newts) + return -ENOMEM; + + newts->len = encap_len; + p = ila_params_lwtunnel(newts); + + p->locator = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); + + if (cfg6->fc_dst_len > sizeof(__be64)) { + /* Precompute checksum difference for translation since we + * know both the old locator and the new one. 
+ */ + p->locator_match = *(__be64 *)&cfg6->fc_dst; + p->csum_diff = compute_csum_diff8( + (__be32 *)&p->locator_match, (__be32 *)&p->locator); + } + + newts->type = LWTUNNEL_ENCAP_ILA; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | + LWTUNNEL_STATE_INPUT_REDIRECT; + + *ts = newts; + + return 0; +} + +static int ila_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ila_params *p = ila_params_lwtunnel(lwtstate); + + if (nla_put_u64(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + /* No encapsulation overhead */ + return 0; +} + +static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct ila_params *a_p = ila_params_lwtunnel(a); + struct ila_params *b_p = ila_params_lwtunnel(b); + + return (a_p->locator != b_p->locator); +} + +static const struct lwtunnel_encap_ops ila_encap_ops = { + .build_state = ila_build_state, + .output = ila_output, + .input = ila_input, + .fill_encap = ila_fill_encap_info, + .get_encap_size = ila_encap_nlsize, + .cmp_encap = ila_encap_cmp, +}; + +static int __init ila_init(void) +{ + return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); +} + +static void __exit ila_fini(void) +{ + lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); +} + +module_init(ila_init); +module_exit(ila_fini); +MODULE_AUTHOR("Tom Herbert "); +MODULE_LICENSE("GPL"); diff --git a/kernel/net/ipv6/inet6_connection_sock.c b/kernel/net/ipv6/inet6_connection_sock.c index 6927f3fb5..a7ca2cde2 100644 --- a/kernel/net/ipv6/inet6_connection_sock.c +++ b/kernel/net/ipv6/inet6_connection_sock.c @@ -65,19 +65,22 @@ int inet6_csk_bind_conflict(const struct sock *sk, } EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); -struct dst_entry *inet6_csk_route_req(struct sock *sk, +struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, - 
const struct request_sock *req) + const struct request_sock *req, + u8 proto) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *final_p, final; struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); - fl6->flowi6_proto = IPPROTO_TCP; + fl6->flowi6_proto = proto; fl6->daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; fl6->flowi6_mark = ireq->ir_mark; @@ -91,73 +94,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, return dst; } - -/* - * request_sock (formerly open request) hash tables. - */ -static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - u32 c; - - c = jhash_3words((__force u32)raddr->s6_addr32[0], - (__force u32)raddr->s6_addr32[1], - (__force u32)raddr->s6_addr32[2], - rnd); - - c = jhash_2words((__force u32)raddr->s6_addr32[3], - (__force u32)rport, - c); - - return c & (synq_hsize - 1); -} - -struct request_sock *inet6_csk_search_req(struct sock *sk, - const __be16 rport, - const struct in6_addr *raddr, - const struct in6_addr *laddr, - const int iif) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) && - ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) && - (!ireq->ir_iif || ireq->ir_iif == iif)) { - 
atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk != NULL); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet6_csk_search_req); - -void inet6_csk_reqsk_queue_hash_add(struct sock *sk, - struct request_sock *req, - const unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk, timeout); -} -EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); +EXPORT_SYMBOL(inet6_csk_route_req); void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { @@ -173,14 +110,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) } EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); -static inline -void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) -{ - __ip6_dst_store(sk, dst, daddr, saddr); -} - static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { @@ -207,14 +136,16 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->fl6_dport = inet->inet_dport; security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!IS_ERR(dst)) - __inet6_csk_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return dst; } @@ -240,7 +171,8 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused /* Restore final destination back after routing done */ fl6.daddr = sk->sk_v6_daddr; 
- res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); rcu_read_unlock(); return res; } diff --git a/kernel/net/ipv6/inet6_hashtables.c b/kernel/net/ipv6/inet6_hashtables.c index 871641bc1..21ace5a2b 100644 --- a/kernel/net/ipv6/inet6_hashtables.c +++ b/kernel/net/ipv6/inet6_hashtables.c @@ -114,6 +114,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; } return score; } @@ -207,7 +209,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; - int twrefcnt = 0; spin_lock(lock); @@ -234,21 +235,17 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { - twrefcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); - if (twrefcnt) - inet_twsk_put(tw); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... 
*/ - inet_twsk_deschedule(tw); - - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); } return 0; @@ -257,7 +254,7 @@ not_unique: return -EADDRNOTAVAIL; } -static inline u32 inet6_sk_port_offset(const struct sock *sk) +static u32 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); @@ -269,7 +266,11 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk) int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk), + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet6_sk_port_offset(sk); + return __inet_hash_connect(death_row, sk, port_offset, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/kernel/net/ipv6/ip6_fib.c b/kernel/net/ipv6/ip6_fib.c index bde57b113..0c7e276c2 100644 --- a/kernel/net/ipv6/ip6_fib.c +++ b/kernel/net/ipv6/ip6_fib.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -154,10 +155,39 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } +static void rt6_rcu_free(struct rt6_info *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + +static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) +{ + int cpu; + + if (!non_pcpu_rt->rt6i_pcpu) + return; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + rt6_rcu_free(pcpu_rt); + *ppcpu_rt = NULL; + } + } + + non_pcpu_rt->rt6i_pcpu = NULL; +} + static void rt6_release(struct rt6_info *rt) { - if (atomic_dec_and_test(&rt->rt6i_ref)) - dst_free(&rt->dst); + if (atomic_dec_and_test(&rt->rt6i_ref)) { + rt6_free_pcpu(rt); + rt6_rcu_free(rt); + } } static void fib6_link_table(struct net *net, struct fib6_table *tb) @@ -234,6 +264,7 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) return NULL; } 
+EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { @@ -255,7 +286,17 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { - return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + struct rt6_info *rt; + + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + if (rt->rt6i_flags & RTF_REJECT && + rt->dst.error == -EAGAIN) { + ip6_rt_put(rt); + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } + + return &rt->dst; } static void __net_init fib6_tables_init(struct net *net) @@ -738,6 +779,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, rt6_clean_expires(iter); else rt6_set_expires(iter, rt->dst.expires); + iter->rt6i_pmtu = rt->rt6i_pmtu; return -EEXIST; } /* If we have the same destination and the same metric, @@ -820,7 +862,7 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { @@ -846,7 +888,7 @@ add: rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; @@ -907,6 +949,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); + if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && + !atomic_read(&rt->dst.__refcnt))) + return -EINVAL; + if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; @@ -999,6 +1045,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fib6_start_gc(info->nl_net, rt); if (!(rt->rt6i_flags & 
RTF_CACHE)) fib6_prune_clones(info->nl_net, pn); + rt->dst.flags &= ~DST_NOCACHE; } out: @@ -1023,7 +1070,8 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); } return err; @@ -1034,7 +1082,8 @@ out: st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); return err; #endif } @@ -1384,7 +1433,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fib6_purge_rt(rt, fn, net); - inet6_rt_notify(RTM_DELROUTE, rt, info); + inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); } diff --git a/kernel/net/ipv6/ip6_flowlabel.c b/kernel/net/ipv6/ip6_flowlabel.c index d49112501..dc2db4f7b 100644 --- a/kernel/net/ipv6/ip6_flowlabel.c +++ b/kernel/net/ipv6/ip6_flowlabel.c @@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; - (sfl = rcu_dereference(*sflp)) != NULL; + (sfl = rcu_dereference_protected(*sflp, + lockdep_is_held(&ip6_sk_fl_lock))) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = rcu_dereference(sfl->next); + *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); @@ -595,6 +596,10 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; + if (net->ipv6.sysctl.flowlabel_state_ranges && + (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) + return -ERANGE; + fl = fl_create(net, sk, &freq, optval, optlen, &err); if (!fl) return err; diff --git a/kernel/net/ipv6/ip6_gre.c b/kernel/net/ipv6/ip6_gre.c index 69f4f689f..e5ea177d3 100644 --- a/kernel/net/ipv6/ip6_gre.c +++ b/kernel/net/ipv6/ip6_gre.c 
@@ -404,13 +404,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); break; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); } break; case ICMPV6_PARAMPROB: @@ -421,12 +421,12 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (teli && teli == be32_to_cpu(info) - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: @@ -634,20 +634,20 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(tunnel); + dst = ip6_tnl_dst_get(tunnel); if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -702,12 +702,9 @@ static netdev_tx_t 
ip6gre_xmit2(struct sk_buff *skb, skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(tunnel, ndst); + skb_dst_set(skb, dst); proto = NEXTHDR_GRE; if (encap_limit >= 0) { @@ -729,7 +726,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, */ ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), - ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); + ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = tunnel->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -762,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb_set_inner_protocol(skb, protocol); ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_store(tunnel, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1183,7 +1178,8 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, false)); + t->fl.u.ip6.flowlabel, true, + &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; @@ -1222,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = { static void ip6gre_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -1244,9 +1243,10 @@ static void ip6gre_tunnel_setup(struct net_device *dev) netif_keep_dst(dev); } -static int ip6gre_tunnel_init(struct net_device *dev) +static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; tunnel = netdev_priv(dev); @@ -1254,16 +1254,37 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel->net = dev_net(dev); 
strcpy(tunnel->parms.name, dev->name); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + ret = ip6_tnl_dst_init(tunnel); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + + return 0; +} + +static int ip6gre_tunnel_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int ret; + + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; + + tunnel = netdev_priv(dev); + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } @@ -1459,19 +1480,16 @@ static void ip6gre_netlink_parms(struct nlattr *data[], static int ip6gre_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; - tunnel = netdev_priv(dev); + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + tunnel = netdev_priv(dev); ip6gre_tnl_link_config(tunnel, 1); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } @@ -1553,13 +1571,11 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], return -EEXIST; } else { t = nt; - - ip6gre_tunnel_unlink(ign, t); - ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); - ip6gre_tunnel_link(ign, t); - netdev_state_change(dev); } + ip6gre_tunnel_unlink(ign, t); + ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); + ip6gre_tunnel_link(ign, t); return 0; } diff --git a/kernel/net/ipv6/ip6_input.c b/kernel/net/ipv6/ip6_input.c index 57990c929..9075acf08 100644 --- a/kernel/net/ipv6/ip6_input.c +++ b/kernel/net/ipv6/ip6_input.c @@ -45,8 +45,9 @@ #include #include #include +#include -int ip6_rcv_finish(struct sock *sk, 
struct sk_buff *skb) +int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; @@ -55,7 +56,7 @@ int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb) if (ipprot && ipprot->early_demux) ipprot->early_demux(skb); } - if (!skb_dst(skb)) + if (!skb_valid_dst(skb)) ip6_route_input(skb); return dst_input(skb); @@ -98,7 +99,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt * arrived via the sending interface (ethX), because of the * nature of scoping architecture. --yoshfuji */ - IP6CB(skb)->iif = skb_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; + IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; @@ -108,7 +109,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; - IP6_ADD_STATS_BH(dev_net(dev), idev, + IP6_ADD_STATS_BH(net, idev, IPSTATS_MIB_NOECTPKTS + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); @@ -182,8 +183,8 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* Must drop socket now because of tproxy. 
*/ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb, - dev, NULL, + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, ip6_rcv_finish); err: IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); @@ -198,9 +199,8 @@ drop: */ -static int ip6_input_finish(struct sock *sk, struct sk_buff *skb) +static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb_dst(skb)->dev); const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; @@ -277,8 +277,8 @@ discard: int ip6_input(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, ip6_input_finish); } diff --git a/kernel/net/ipv6/ip6_offload.c b/kernel/net/ipv6/ip6_offload.c index 08b62047c..eeca943f1 100644 --- a/kernel/net/ipv6/ip6_offload.c +++ b/kernel/net/ipv6/ip6_offload.c @@ -264,6 +264,9 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); int err = -ENOSYS; + if (skb->encapsulation) + skb_set_inner_network_header(skb, nhoff); + iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); rcu_read_lock(); @@ -280,6 +283,13 @@ out_unlock: return err; } +static int sit_gro_complete(struct sk_buff *skb, int nhoff) +{ + skb->encapsulation = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_SIT; + return ipv6_gro_complete(skb, nhoff); +} + static struct packet_offload ipv6_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { @@ -292,6 +302,8 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { static const struct net_offload sit_offload = { .callbacks = { .gso_segment = ipv6_gso_segment, + .gro_receive = ipv6_gro_receive, + .gro_complete = sit_gro_complete, }, }; diff --git a/kernel/net/ipv6/ip6_output.c b/kernel/net/ipv6/ip6_output.c index 
bc09cb97b..31144c486 100644 --- a/kernel/net/ipv6/ip6_output.c +++ b/kernel/net/ipv6/ip6_output.c @@ -55,8 +55,9 @@ #include #include #include +#include -static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; @@ -71,7 +72,7 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && - ((mroute6_socket(dev_net(dev), skb) && + ((mroute6_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr))) { @@ -82,19 +83,18 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) */ if (newskb) NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, - sk, newskb, NULL, newskb->dev, + net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); if (ipv6_hdr(skb)->hop_limit == 0) { - IP6_INC_STATS(dev_net(dev), idev, + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } } - IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, - skb->len); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && @@ -105,7 +105,7 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop((struct rt6_info *)dst); + nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); @@ -116,48 +116,49 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) } rcu_read_unlock_bh(); - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, ip6_dst_idev(dst), 
IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } -static int ip6_finish_output(struct sock *sk, struct sk_buff *skb) +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) - return ip6_fragment(sk, skb, ip6_finish_output2); + return ip6_fragment(net, sk, skb, ip6_finish_output2); else - return ip6_finish_output2(sk, skb); + return ip6_finish_output2(net, sk, skb); } -int ip6_output(struct sock *sk, struct sk_buff *skb) +int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + if (unlikely(idev->cnf.disable_ipv6)) { - IP6_INC_STATS(dev_net(dev), idev, - IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb, - NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, + net, sk, skb, NULL, dev, ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* - * xmit an sk_buff (used by TCP, SCTP and DCCP) + * xmit an sk_buff (used by TCP, SCTP and DCCP) + * Note : socket lock is not held for SYNACK packets, but might be modified + * by calls to skb_set_owner_w() and ipv6_local_error(), + * which are using proper atomic operations or spinlocks. 
*/ - -int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, +int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, struct ipv6_txoptions *opt, int tclass) { struct net *net = sock_net(sk); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr; @@ -186,7 +187,10 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, } consume_skb(skb); skb = skb2; - skb_set_owner_w(skb, sk); + /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically, + * it is safe to call in our context (socket lock not held) + */ + skb_set_owner_w(skb, (struct sock *)sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); @@ -207,7 +211,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel)); + np->autoflowlabel, fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -224,12 +228,20 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, dst->dev, dst_output_sk); + /* hooks should never assume socket lock is held. 
+ * we promote our socket to non const + */ + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, (struct sock *)sk, skb, NULL, dst->dev, + dst_output); } skb->dev = dst->dev; - ipv6_local_error(sk, EMSGSIZE, fl6, mtu); + /* ipv6_local_error() does not require socket lock, + * we promote our socket to non const + */ + ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; @@ -317,10 +329,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) return 0; } -static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb) +static inline int ip6_forward_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { skb_sender_cpu_clear(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) @@ -376,6 +389,9 @@ int ip6_forward(struct sk_buff *skb) if (skb->pkt_type != PACKET_HOST) goto drop; + if (unlikely(skb->sk)) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; @@ -459,7 +475,7 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) @@ -512,8 +528,8 @@ int ip6_forward(struct sk_buff *skb) IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); - return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb, - skb->dev, dst->dev, + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, + net, NULL, skb, skb->dev, dst->dev, ip6_forward_finish); error: @@ -540,8 +556,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -int ip6_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock 
*, struct sk_buff *)) +int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); @@ -551,10 +567,9 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, struct frag_hdr *fh; unsigned int mtu, hlen, left, len; int hroom, troom; - __be32 frag_id = 0; + __be32 frag_id; int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; - struct net *net = dev_net(skb_dst(skb)->dev); hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; @@ -564,40 +579,50 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, /* We must not fragment if the socket is set to force MTU discovery * or if the skb it not generated by a local socket. */ - if (unlikely(!skb->ignore_df && skb->len > mtu) || - (IP6CB(skb)->frag_max_size && - IP6CB(skb)->frag_max_size > mtu)) { - if (skb->sk && dst_allfrag(skb_dst(skb))) - sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + if (unlikely(!skb->ignore_df && skb->len > mtu)) + goto fail_toobig; - skb->dev = skb_dst(skb)->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); - kfree_skb(skb); - return -EMSGSIZE; + if (IP6CB(skb)->frag_max_size) { + if (IP6CB(skb)->frag_max_size > mtu) + goto fail_toobig; + + /* don't send fragments larger than what we received */ + mtu = IP6CB(skb)->frag_max_size; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; } if (np && np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } + if (mtu < hlen + sizeof(struct frag_hdr) + 8) + goto fail_toobig; mtu -= hlen + sizeof(struct frag_hdr); + frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto fail; + + hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); struct sk_buff *frag2; 
if (first_len - hlen > mtu || ((first_len - hlen) & 7) || - skb_cloned(skb)) + skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) + skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path_clean; /* Partially cloned skb? */ @@ -614,8 +639,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, err = 0; offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; @@ -623,8 +646,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, if (!tmp_hdr) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); - return -ENOMEM; + err = -ENOMEM; + goto fail; } + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); __skb_pull(skb, hlen); fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); @@ -632,11 +658,10 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, skb_reset_network_header(skb); memcpy(skb_network_header(skb), tmp_hdr, hlen); - ipv6_select_ident(net, fh, rt); fh->nexthdr = nexthdr; fh->reserved = 0; fh->frag_off = htons(IP6_MF); - frag_id = fh->identification; + fh->identification = frag_id; first_len = skb_pagelen(skb); skb->data_len = first_len - skb_headlen(skb); @@ -670,7 +695,7 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, ip6_copy_metadata(frag, skb); } - err = output(sk, skb); + err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); @@ -710,10 +735,6 @@ slow_path_clean: } slow_path: - if ((skb->ip_summed == CHECKSUM_PARTIAL) && - skb_checksum_help(skb)) - goto fail; - left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ @@ -722,7 +743,6 @@ slow_path: */ *prevhdr = NEXTHDR_FRAGMENT; - hroom = LL_RESERVED_SPACE(rt->dst.dev); troom = 
rt->dst.dev->needed_tailroom; /* @@ -778,11 +798,7 @@ slow_path: */ fh->nexthdr = nexthdr; fh->reserved = 0; - if (!frag_id) { - ipv6_select_ident(net, fh, rt); - frag_id = fh->identification; - } else - fh->identification = frag_id; + fh->identification = frag_id; /* * Copy a block of the IP datagram. @@ -803,7 +819,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - err = output(sk, frag); + err = output(net, sk, frag); if (err) goto fail; @@ -815,6 +831,14 @@ slow_path: consume_skb(skb); return err; +fail_toobig: + if (skb->sk && dst_allfrag(skb_dst(skb))) + sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + + skb->dev = skb_dst(skb)->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + err = -EMSGSIZE; + fail: IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); @@ -867,7 +891,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif - (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { + (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) && + (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) { dst_release(dst); dst = NULL; } @@ -876,15 +901,15 @@ out: return dst; } -static int ip6_dst_lookup_tail(struct sock *sk, +static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { - struct net *net = sock_net(sk); #ifdef CONFIG_IPV6_OPTIMISTIC_DAD struct neighbour *n; struct rt6_info *rt; #endif int err; + int flags = 0; /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, @@ -916,10 +941,13 @@ static int ip6_dst_lookup_tail(struct sock *sk, dst_release(*dst); *dst = NULL; } + + if (fl6->flowi6_oif) + flags |= RT6_LOOKUP_F_IFACE; } if (!*dst) - *dst = ip6_route_output(net, sk, fl6); + *dst = ip6_route_output_flags(net, sk, fl6, flags); err = (*dst)->error; if (err) @@ -936,7 +964,8 @@ static int 
ip6_dst_lookup_tail(struct sock *sk, */ rt = (struct rt6_info *) *dst; rcu_read_lock_bh(); - n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt)); + n = __ipv6_neigh_lookup_noref(rt->dst.dev, + rt6_nexthop(rt, &fl6->daddr)); err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; rcu_read_unlock_bh(); @@ -988,10 +1017,11 @@ out_err_release: * * It returns zero on success, or a standard errno code on error. */ -int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) +int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, + struct flowi6 *fl6) { *dst = NULL; - return ip6_dst_lookup_tail(sk, dst, fl6); + return ip6_dst_lookup_tail(net, sk, dst, fl6); } EXPORT_SYMBOL_GPL(ip6_dst_lookup); @@ -1006,17 +1036,19 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); * It returns a valid dst pointer on success, or a pointer encoded * error code. */ -struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, +struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { struct dst_entry *dst = NULL; int err; - err = ip6_dst_lookup_tail(sk, &dst, fl6); + err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; + if (!fl6->flowi6_oif) + fl6->flowi6_oif = l3mdev_fib_oif(dst->dev); return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } @@ -1044,7 +1076,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, dst = ip6_sk_dst_check(sk, dst, fl6); - err = ip6_dst_lookup_tail(sk, &dst, fl6); + err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) @@ -1060,11 +1092,10 @@ static inline int ip6_ufo_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, int transhdrlen, int mtu, unsigned int flags, - struct rt6_info *rt) + const struct flowi6 *fl6) { struct 
sk_buff *skb; - struct frag_hdr fhdr; int err; /* There is support for UDP large send offload by network @@ -1106,8 +1137,9 @@ static inline int ip6_ufo_append_data(struct sock *sk, skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - sizeof(struct frag_hdr)) & ~7; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - ipv6_select_ident(sock_net(sk), &fhdr, rt); - skb_shinfo(skb)->ip6_frag_id = fhdr.identification; + skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk), + &fl6->daddr, + &fl6->saddr); append: return skb_append_datato_frags(sk, skb, getfrag, from, @@ -1242,6 +1274,7 @@ static int __ip6_append_data(struct sock *sk, struct rt6_info *rt = (struct rt6_info *)cork->dst; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; + unsigned int maxnonfragsize, headersize; skb = skb_peek_tail(queue); if (!skb) { @@ -1259,38 +1292,43 @@ static int __ip6_append_data(struct sock *sk, maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); - if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { - unsigned int maxnonfragsize, headersize; - - headersize = sizeof(struct ipv6hdr) + - (opt ? opt->opt_flen + opt->opt_nflen : 0) + - (dst_allfrag(&rt->dst) ? - sizeof(struct frag_hdr) : 0) + - rt->rt6i_nfheader_len; - - if (ip6_sk_ignore_df(sk)) - maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; - else - maxnonfragsize = mtu; + headersize = sizeof(struct ipv6hdr) + + (opt ? opt->opt_flen + opt->opt_nflen : 0) + + (dst_allfrag(&rt->dst) ? 
+ sizeof(struct frag_hdr) : 0) + + rt->rt6i_nfheader_len; + + if (cork->length + length > mtu - headersize && dontfrag && + (sk->sk_protocol == IPPROTO_UDP || + sk->sk_protocol == IPPROTO_RAW)) { + ipv6_local_rxpmtu(sk, fl6, mtu - headersize + + sizeof(struct ipv6hdr)); + goto emsgsize; + } - /* dontfrag active */ - if ((cork->length + length > mtu - headersize) && dontfrag && - (sk->sk_protocol == IPPROTO_UDP || - sk->sk_protocol == IPPROTO_RAW)) { - ipv6_local_rxpmtu(sk, fl6, mtu - headersize + - sizeof(struct ipv6hdr)); - goto emsgsize; - } + if (ip6_sk_ignore_df(sk)) + maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; + else + maxnonfragsize = mtu; - if (cork->length + length > maxnonfragsize - headersize) { + if (cork->length + length > maxnonfragsize - headersize) { emsgsize: - ipv6_local_error(sk, EMSGSIZE, fl6, - mtu - headersize + - sizeof(struct ipv6hdr)); - return -EMSGSIZE; - } + ipv6_local_error(sk, EMSGSIZE, fl6, + mtu - headersize + + sizeof(struct ipv6hdr)); + return -EMSGSIZE; } + /* CHECKSUM_PARTIAL only with no extension headers and when + * we are not going to fragment + */ + if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && + headersize == sizeof(struct ipv6hdr) && + length < mtu - headersize && + !(flags & MSG_MORE) && + rt->dst.dev->features & NETIF_F_V6_CSUM) + csummode = CHECKSUM_PARTIAL; + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { sock_tx_timestamp(sk, &tx_flags); if (tx_flags & SKBTX_ANY_SW_TSTAMP && @@ -1298,16 +1336,6 @@ emsgsize: tskey = sk->sk_tskey++; } - /* If this is the first and only packet and device - * supports checksum offloading, let's use it. - * Use transhdrlen, same as IPv4, because partial - * sums only work when transhdrlen is set. - */ - if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && - length + fragheaderlen < mtu && - rt->dst.dev->features & NETIF_F_V6_CSUM && - !exthdrlen) - csummode = CHECKSUM_PARTIAL; /* * Let's try using as much space as possible. 
* Use MTU if total length of the message fits into the MTU. @@ -1329,10 +1357,10 @@ emsgsize: (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && - (sk->sk_type == SOCK_DGRAM)) { + (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, - transhdrlen, mtu, flags, rt); + transhdrlen, mtu, flags, fl6); if (err) goto error; return 0; @@ -1641,7 +1669,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel)); + np->autoflowlabel, fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; @@ -1670,7 +1698,7 @@ int ip6_send_skb(struct sk_buff *skb) struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); int err; - err = ip6_local_out(skb); + err = ip6_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); diff --git a/kernel/net/ipv6/ip6_tunnel.c b/kernel/net/ipv6/ip6_tunnel.c index 5cafd92c2..137fca42a 100644 --- a/kernel/net/ipv6/ip6_tunnel.c +++ b/kernel/net/ipv6/ip6_tunnel.c @@ -126,36 +126,92 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) { - struct dst_entry *dst = t->dst_cache; + write_seqlock_bh(&idst->lock); + dst_release(rcu_dereference_protected( + idst->dst, + lockdep_is_held(&idst->lock.lock))); + if (dst) { + dst_hold(dst); + idst->cookie = rt6_get_cookie((struct rt6_info *)dst); + } else { + idst->cookie = 0; + } + rcu_assign_pointer(idst->dst, dst); + write_sequnlock_bh(&idst->lock); +} + +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) +{ + struct ip6_tnl_dst *idst; + struct dst_entry *dst; + unsigned int seq; + u32 cookie; - if (dst && 
dst->obsolete && - !dst->ops->check(dst, t->dst_cookie)) { - t->dst_cache = NULL; + idst = raw_cpu_ptr(t->dst_cache); + + rcu_read_lock(); + do { + seq = read_seqbegin(&idst->lock); + dst = rcu_dereference(idst->dst); + cookie = idst->cookie; + } while (read_seqretry(&idst->lock, seq)); + + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; + rcu_read_unlock(); + + if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { + ip6_tnl_per_cpu_dst_set(idst, NULL); dst_release(dst); - return NULL; + dst = NULL; } - return dst; } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_check); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); void ip6_tnl_dst_reset(struct ip6_tnl *t) { - dst_release(t->dst_cache); - t->dst_cache = NULL; + int i; + + for_each_possible_cpu(i) + ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) +{ + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); + +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); + +void ip6_tnl_dst_destroy(struct ip6_tnl *t) { - struct rt6_info *rt = (struct rt6_info *) dst; - t->dst_cookie = rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; - dst_release(t->dst_cache); - t->dst_cache = dst; + if (!t->dst_cache) + return; + + ip6_tnl_dst_reset(t); + free_percpu(t->dst_cache); } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_store); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); + +int ip6_tnl_dst_init(struct ip6_tnl *t) +{ + int i; + + t->dst_cache = alloc_percpu(struct ip6_tnl_dst); + if (!t->dst_cache) + return -ENOMEM; + + for_each_possible_cpu(i) + seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses @@ -271,6 +327,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) static void ip6_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -510,14 +569,14 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } break; @@ -529,13 +588,13 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } } else { - net_warn_ratelimited("%s: Recipient unable to parse 
tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: @@ -1010,23 +1069,23 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(t); + dst = ip6_tnl_dst_get(t); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -1072,12 +1131,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, consume_skb(skb); skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(t, ndst); + skb_dst_set(skb, dst); + skb->transport_header = skb->network_header; proto = fl6->flowi6_proto; @@ -1095,20 +1153,18 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), - ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); + ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_store(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1573,12 +1629,21 @@ static 
inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + int ret; t->dev = dev; t->net = dev_net(dev); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; + + ret = ip6_tnl_dst_init(t); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + return 0; } diff --git a/kernel/net/ipv6/ip6_udp_tunnel.c b/kernel/net/ipv6/ip6_udp_tunnel.c index bba8903e8..14dacf1df 100644 --- a/kernel/net/ipv6/ip6_udp_tunnel.c +++ b/kernel/net/ipv6/ip6_udp_tunnel.c @@ -19,11 +19,18 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, int err; struct socket *sock = NULL; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); + if (cfg->ipv6_v6only) { + int val = 1; + + err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, + (char *) &val, sizeof(val)); + if (err < 0) + goto error; + } udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, @@ -55,7 +62,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; diff --git a/kernel/net/ipv6/ip6_vti.c b/kernel/net/ipv6/ip6_vti.c index 0224c032d..0a8610b33 100644 --- a/kernel/net/ipv6/ip6_vti.c +++ b/kernel/net/ipv6/ip6_vti.c @@ -482,7 +482,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) return -EMSGSIZE; } - err = dst_output(skb); + err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); diff --git a/kernel/net/ipv6/ip6mr.c b/kernel/net/ipv6/ip6mr.c index 5f36266b1..a10e77103 100644 --- a/kernel/net/ipv6/ip6mr.c +++ b/kernel/net/ipv6/ip6mr.c @@ -118,7 +118,7 @@ static void mr6_netlink_event(struct mr6_table *mrt, 
struct mfc6_cache *mfc, int cmd); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); -static void mroute_clean_tables(struct mr6_table *mrt); +static void mroute_clean_tables(struct mr6_table *mrt, bool all); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES @@ -217,7 +217,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .policy = ip6mr_rule_policy, @@ -335,7 +334,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr6_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, true); kfree(mrt); } @@ -766,10 +765,6 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) return dev; failure: - /* allow the register to be completed before unregistering. 
*/ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } @@ -1543,7 +1538,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr6_table *mrt) +static void mroute_clean_tables(struct mr6_table *mrt, bool all) { int i; LIST_HEAD(list); @@ -1553,8 +1548,9 @@ static void mroute_clean_tables(struct mr6_table *mrt) * Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif6_table[i].flags & VIFF_STATIC)) - mif6_delete(mrt, i, &list); + if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC)) + continue; + mif6_delete(mrt, i, &list); } unregister_netdevice_many(&list); @@ -1563,7 +1559,7 @@ static void mroute_clean_tables(struct mr6_table *mrt) */ for (i = 0; i < MFC6_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) { - if (c->mfc_flags & MFC_STATIC) + if (!all && (c->mfc_flags & MFC_STATIC)) continue; write_lock_bh(&mrt_lock); list_del(&c->list); @@ -1626,7 +1622,7 @@ int ip6mr_sk_done(struct sock *sk) net->ipv6.devconf_all); write_unlock_bh(&mrt_lock); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, false); err = 0; break; } @@ -1986,13 +1982,13 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) } #endif -static inline int ip6mr_forward2_finish(struct sock *sk, struct sk_buff *skb) +static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP6_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTOCTETS, skb->len); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } /* @@ -2064,8 +2060,8 @@ static int ip6mr_forward2(struct net *net, struct 
mr6_table *mrt, IP6CB(skb)->flags |= IP6SKB_FORWARDED; - return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb, - skb->dev, dev, + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, + net, NULL, skb, skb->dev, dev, ip6mr_forward2_finish); out_free: diff --git a/kernel/net/ipv6/ipv6_sockglue.c b/kernel/net/ipv6/ipv6_sockglue.c index 63e695691..4449ad1f8 100644 --- a/kernel/net/ipv6/ipv6_sockglue.c +++ b/kernel/net/ipv6/ipv6_sockglue.c @@ -111,7 +111,8 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } - opt = xchg(&inet6_sk(sk)->opt, opt); + opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, + opt); sk_dst_reset(sk); return opt; @@ -231,9 +232,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, + NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } pktopt = xchg(&np->pktoptions, NULL); kfree_skb(pktopt); @@ -403,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) break; - opt = ipv6_renew_options(sk, np->opt, optname, + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = ipv6_renew_options(sk, opt, optname, (struct ipv6_opt_hdr __user *)optval, optlen); if (IS_ERR(opt)) { @@ -432,8 +437,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; opt = ipv6_update_options(sk, opt); sticky_done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } @@ -486,6 +493,7 @@ sticky_done: break; memset(opt, 0, sizeof(*opt)); + atomic_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; 
if (copy_from_user(opt+1, optval, optlen)) @@ -502,8 +510,10 @@ update: retv = 0; opt = ipv6_update_options(sk, opt); done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } case IPV6_UNICAST_HOPS: @@ -1110,10 +1120,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_RTHDR: case IPV6_DSTOPTS: { + struct ipv6_txoptions *opt; lock_sock(sk); - len = ipv6_getsockopt_sticky(sk, np->opt, - optname, optval, len); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) diff --git a/kernel/net/ipv6/mcast.c b/kernel/net/ipv6/mcast.c index 083b2927f..5ee56d0a8 100644 --- a/kernel/net/ipv6/mcast.c +++ b/kernel/net/ipv6/mcast.c @@ -1645,13 +1645,12 @@ static void mld_sendpack(struct sk_buff *skb) payload_len = skb->len; err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net->ipv6.igmp_sk, skb, NULL, skb->dev, - dst_output_sk); + net, net->ipv6.igmp_sk, skb, NULL, skb->dev, + dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); } else { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); } @@ -2008,13 +2007,13 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) } skb_dst_set(skb, dst); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, skb->dev, dst_output_sk); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, skb->dev, + dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len); } else IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); diff --git 
a/kernel/net/ipv6/mcast_snoop.c b/kernel/net/ipv6/mcast_snoop.c new file mode 100644 index 000000000..9405b04ee --- /dev/null +++ b/kernel/net/ipv6/mcast_snoop.c @@ -0,0 +1,216 @@ +/* Copyright (C) 2010: YOSHIFUJI Hideaki + * Copyright (C) 2015: Linus Lüssing + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * + * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki. + */ + +#include +#include +#include +#include +#include + +static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + ip6h = ipv6_hdr(skb); + + if (ip6h->version != 6) + return -EINVAL; + + len = offset + ntohs(ip6h->payload_len); + if (skb->len < len || len <= offset) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_exthdrs(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + int offset; + u8 nexthdr; + __be16 frag_off; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_HOPOPTS) + return -ENOMSG; + + nexthdr = ip6h->nexthdr; + offset = skb_network_offset(skb) + sizeof(*ip6h); + offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); + + if (offset < 0) + return -EINVAL; + + if (nexthdr != IPPROTO_ICMPV6) + return -ENOMSG; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) +{ + unsigned int len = 
skb_transport_offset(skb); + + len += sizeof(struct mld2_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ipv6_mc_check_mld_query(struct sk_buff *skb) +{ + struct mld_msg *mld; + unsigned int len = skb_transport_offset(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + len += sizeof(struct mld_msg); + if (skb->len < len) + return -EINVAL; + + /* MLDv1? */ + if (skb->len != len) { + /* or MLDv2? */ + len += sizeof(struct mld2_query) - sizeof(struct mld_msg); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + mld = (struct mld_msg *)skb_transport_header(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer + * all-nodes destination address (ff02::1) for general queries + */ + if (ipv6_addr_any(&mld->mld_mca) && + !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_mld_msg(struct sk_buff *skb) +{ + struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb); + + switch (mld->mld_type) { + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MGM_REPORT: + /* fall through */ + return 0; + case ICMPV6_MLD2_REPORT: + return ipv6_mc_check_mld_reportv2(skb); + case ICMPV6_MGM_QUERY: + return ipv6_mc_check_mld_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); +} + +static int __ipv6_mc_check_mld(struct sk_buff *skb, + struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk = NULL; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); + int ret = -EINVAL; + + transport_len = ntohs(ipv6_hdr(skb)->payload_len); + transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr); + + skb_chk = skb_checksum_trimmed(skb, 
transport_len, + ipv6_mc_validate_checksum); + if (!skb_chk) + goto err; + + if (!pskb_may_pull(skb_chk, len)) + goto err; + + ret = ipv6_mc_check_mld_msg(skb_chk); + if (ret) + goto err; + + if (skb_trimmed) + *skb_trimmed = skb_chk; + /* free now unneeded clone */ + else if (skb_chk != skb) + kfree_skb(skb_chk); + + ret = 0; + +err: + if (ret && skb_chk && skb_chk != skb) + kfree_skb(skb_chk); + + return ret; +} + +/** + * ipv6_mc_check_mld - checks whether this is a sane MLD packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional) + * + * Checks whether an IPv6 packet is a valid MLD packet. If so sets + * skb transport header accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an MLD packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an MLD packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * Caller needs to set the skb network header and free any returned skb if it + * differs from the provided skb. 
+ */ +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret; + + ret = ipv6_mc_check_ip6hdr(skb); + if (ret < 0) + return ret; + + ret = ipv6_mc_check_exthdrs(skb); + if (ret < 0) + return ret; + + return __ipv6_mc_check_mld(skb, skb_trimmed); +} +EXPORT_SYMBOL(ipv6_mc_check_mld); diff --git a/kernel/net/ipv6/mip6.c b/kernel/net/ipv6/mip6.c index b9779d441..60c79a08e 100644 --- a/kernel/net/ipv6/mip6.c +++ b/kernel/net/ipv6/mip6.c @@ -118,7 +118,7 @@ static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) struct mip6_report_rate_limiter { spinlock_t lock; - struct timeval stamp; + ktime_t stamp; int iif; struct in6_addr src; struct in6_addr dst; @@ -184,20 +184,18 @@ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -static inline int mip6_report_rl_allow(struct timeval *stamp, +static inline int mip6_report_rl_allow(ktime_t stamp, const struct in6_addr *dst, const struct in6_addr *src, int iif) { int allow = 0; spin_lock_bh(&mip6_report_rl.lock); - if (mip6_report_rl.stamp.tv_sec != stamp->tv_sec || - mip6_report_rl.stamp.tv_usec != stamp->tv_usec || + if (!ktime_equal(mip6_report_rl.stamp, stamp) || mip6_report_rl.iif != iif || !ipv6_addr_equal(&mip6_report_rl.src, src) || !ipv6_addr_equal(&mip6_report_rl.dst, dst)) { - mip6_report_rl.stamp.tv_sec = stamp->tv_sec; - mip6_report_rl.stamp.tv_usec = stamp->tv_usec; + mip6_report_rl.stamp = stamp; mip6_report_rl.iif = iif; mip6_report_rl.src = *src; mip6_report_rl.dst = *dst; @@ -216,7 +214,7 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct ipv6_destopt_hao *hao = NULL; struct xfrm_selector sel; int offset; - struct timeval stamp; + ktime_t stamp; int err = 0; if (unlikely(fl6->flowi6_proto == IPPROTO_MH && @@ -230,9 +228,9 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, (skb_network_header(skb) + offset); } - skb_get_timestamp(skb, &stamp); + stamp = skb_get_ktime(skb); - if 
(!mip6_report_rl_allow(&stamp, &ipv6_hdr(skb)->daddr, + if (!mip6_report_rl_allow(stamp, &ipv6_hdr(skb)->daddr, hao ? &hao->addr : &ipv6_hdr(skb)->saddr, opt->iif)) goto out; diff --git a/kernel/net/ipv6/ndisc.c b/kernel/net/ipv6/ndisc.c index 96f153c08..84afb9a77 100644 --- a/kernel/net/ipv6/ndisc.c +++ b/kernel/net/ipv6/ndisc.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -147,6 +148,7 @@ struct neigh_table nd_tbl = { .gc_thresh2 = 512, .gc_thresh3 = 1024, }; +EXPORT_SYMBOL_GPL(nd_tbl); static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) { @@ -441,8 +443,11 @@ static void ndisc_send_skb(struct sk_buff *skb, if (!dst) { struct flowi6 fl6; + int oif = l3mdev_fib_oif(skb->dev); - icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex); + icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); + if (oif != skb->dev->ifindex) + fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); @@ -463,9 +468,9 @@ static void ndisc_send_skb(struct sk_buff *skb, idev = __in6_dev_get(dst->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, dst->dev, - dst_output_sk); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, dst->dev, + dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); @@ -474,8 +479,7 @@ static void ndisc_send_skb(struct sk_buff *skb, rcu_read_unlock(); } -void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, - const struct in6_addr *daddr, +void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt) { @@ -541,7 +545,7 @@ static void ndisc_send_unsol_na(struct net_device *dev) read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { - 
ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &ifa->addr, + ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, /*inc_opt=*/ true); @@ -551,8 +555,7 @@ static void ndisc_send_unsol_na(struct net_device *dev) in6_dev_put(idev); } -void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, - const struct in6_addr *solicit, +void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct sk_buff *skb; @@ -675,12 +678,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, neigh, target, target, saddr); + ndisc_send_ns(dev, target, target, saddr); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); + ndisc_send_ns(dev, target, &mcaddr, saddr); } } @@ -764,7 +767,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { - +have_ifp: if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { /* @@ -790,6 +793,18 @@ static void ndisc_recv_ns(struct sk_buff *skb) } else { struct net *net = dev_net(dev); + /* perhaps an address on the master device */ + if (netif_is_l3_slave(dev)) { + struct net_device *mdev; + + mdev = netdev_master_upper_dev_get_rcu(dev); + if (mdev) { + ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1); + if (ifp) + goto have_ifp; + } + } + idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? 
*/ @@ -824,7 +839,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) is_router = idev->cnf.forwarding; if (dad) { - ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target, + ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target, !!is_router, false, (ifp != NULL), true); goto out; } @@ -845,8 +860,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE); if (neigh || !dev->header_ops) { - ndisc_send_na(dev, neigh, saddr, &msg->target, - !!is_router, + ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); @@ -1074,6 +1088,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct ndisc_options ndopts; int optlen; unsigned int pref = 0; + __u32 old_if_flags; + bool send_ifinfo_notify = false; __u8 *opt = (__u8 *)(ra_msg + 1); @@ -1144,6 +1160,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) * Remember the managed/otherconf flags from most recently * received RA message (RFC 2462) -- yoshfuji */ + old_if_flags = in6_dev->if_flags; in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | IF_RA_OTHERCONF)) | (ra_msg->icmph.icmp6_addrconf_managed ? @@ -1151,6 +1168,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) (ra_msg->icmph.icmp6_addrconf_other ? 
IF_RA_OTHERCONF : 0); + if (old_if_flags != in6_dev->if_flags) + send_ifinfo_notify = true; + if (!in6_dev->cnf.accept_ra_defrtr) { ND_PRINTK(2, info, "RA: %s, defrtr is false for dev: %s\n", @@ -1163,7 +1183,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) */ if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); @@ -1225,18 +1245,16 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt) rt6_set_expires(rt, jiffies + (HZ * lifetime)); - if (ra_msg->icmph.icmp6_hop_limit) { - /* Only set hop_limit on the interface if it is higher than - * the current hop_limit. - */ - if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) { + if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && + ra_msg->icmph.icmp6_hop_limit) { + if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (rt) + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } else { - ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n"); + ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } - if (rt) - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, - ra_msg->icmph.icmp6_hop_limit); } skip_defrtr: @@ -1254,7 +1272,7 @@ skip_defrtr: rtime = HZ/10; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + send_ifinfo_notify = true; } rtime = ntohl(ra_msg->reachable_time); @@ -1271,11 +1289,17 @@ skip_defrtr: GC_STALETIME, 3 * rtime); in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); in6_dev->tstamp = jiffies; - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + send_ifinfo_notify = true; } } } + /* + * Send a notify if RA changed managed/otherconf flags or 
timer settings + */ + if (send_ifinfo_notify) + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + skip_linkparms: /* @@ -1313,7 +1337,7 @@ skip_linkparms: #ifdef CONFIG_IPV6_ROUTE_INFO if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: router info ignored.\n", skb->dev->name); @@ -1472,6 +1496,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) struct flowi6 fl6; int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + int oif = l3mdev_fib_oif(dev); bool ret; if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { @@ -1488,7 +1513,10 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, - &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); + &saddr_buf, &ipv6_hdr(skb)->saddr, oif); + + if (oif != skb->dev->ifindex) + fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { @@ -1506,7 +1534,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) "Redirect: destination is not a neighbour\n"); goto release; } - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); ret = inet_peer_xrlim_allow(peer, 1*HZ); if (peer) inet_putpeer(peer); @@ -1650,6 +1678,7 @@ int ndisc_rcv(struct sk_buff *skb) static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_change_info *change_info; struct net *net = dev_net(dev); struct inet6_dev *idev; @@ -1664,6 +1693,11 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, ndisc_send_unsol_na(dev); in6_dev_put(idev); break; + case NETDEV_CHANGE: + change_info = ptr; + if (change_info->flags_changed & IFF_NOARP) + 
neigh_changeaddr(&nd_tbl, dev); + break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); fib6_run_gc(0, net, false); diff --git a/kernel/net/ipv6/netfilter.c b/kernel/net/ipv6/netfilter.c index d958718b5..d11c46833 100644 --- a/kernel/net/ipv6/netfilter.c +++ b/kernel/net/ipv6/netfilter.c @@ -18,9 +18,8 @@ #include #include -int ip6_route_me_harder(struct sk_buff *skb) +int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { - struct net *net = dev_net(skb_dst(skb)->dev); const struct ipv6hdr *iph = ipv6_hdr(skb); unsigned int hh_len; struct dst_entry *dst; @@ -93,7 +92,7 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, } } -static int nf_ip6_reroute(struct sk_buff *skb, +static int nf_ip6_reroute(struct net *net, struct sk_buff *skb, const struct nf_queue_entry *entry) { struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -103,7 +102,7 @@ static int nf_ip6_reroute(struct sk_buff *skb, if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || skb->mark != rt_info->mark) - return ip6_route_me_harder(skb); + return ip6_route_me_harder(net, skb); } return 0; } @@ -191,6 +190,8 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, static const struct nf_ipv6_ops ipv6ops = { .chk_addr = ipv6_chk_addr, + .route_input = ip6_route_input, + .fragment = ip6_fragment }; static const struct nf_afinfo nf_ip6_afinfo = { diff --git a/kernel/net/ipv6/netfilter/Kconfig b/kernel/net/ipv6/netfilter/Kconfig index ca6998345..e10a04c9c 100644 --- a/kernel/net/ipv6/netfilter/Kconfig +++ b/kernel/net/ipv6/netfilter/Kconfig @@ -47,9 +47,23 @@ config NFT_REJECT_IPV6 default NFT_REJECT tristate +config NFT_DUP_IPV6 + tristate "IPv6 nf_tables packet duplication support" + depends on !NF_CONNTRACK || NF_CONNTRACK + select NF_DUP_IPV6 + help + This module enables IPv6 packet duplication support for nf_tables. 
+ endif # NF_TABLES_IPV6 endif # NF_TABLES +config NF_DUP_IPV6 + tristate "Netfilter IPv6 packet duplication to alternate destination" + depends on !NF_CONNTRACK || NF_CONNTRACK + help + This option enables the nf_dup_ipv6 core, which duplicates an IPv6 + packet to be rerouted to another destination. + config NF_REJECT_IPV6 tristate "IPv6 packet rejection" default m if NETFILTER_ADVANCED=n @@ -186,7 +200,8 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP6_NF_MANGLE || IP6_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/kernel/net/ipv6/netfilter/Makefile b/kernel/net/ipv6/netfilter/Makefile index c36e0a549..b4f7d0b4e 100644 --- a/kernel/net/ipv6/netfilter/Makefile +++ b/kernel/net/ipv6/netfilter/Makefile @@ -30,6 +30,8 @@ obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o # reject obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o +obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o + # nf_tables obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o @@ -37,6 +39,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o +obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/kernel/net/ipv6/netfilter/ip6_tables.c b/kernel/net/ipv6/netfilter/ip6_tables.c index 62f5b0d0b..99425cf28 100644 --- a/kernel/net/ipv6/netfilter/ip6_tables.c +++ b/kernel/net/ipv6/netfilter/ip6_tables.c @@ -117,7 +117,7 @@ ip6_packet_match(const struct sk_buff *skb, if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", indev, ip6info->iniface, - 
ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":""); + ip6info->invflags & IP6T_INV_VIA_IN ? " (INV)" : ""); return false; } @@ -126,14 +126,14 @@ ip6_packet_match(const struct sk_buff *skb, if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", outdev, ip6info->outiface, - ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":""); + ip6info->invflags & IP6T_INV_VIA_OUT ? " (INV)" : ""); return false; } /* ... might want to do something with class and flowlabel here ... */ /* look for the desired protocol header */ - if((ip6info->flags & IP6T_F_PROTO)) { + if (ip6info->flags & IP6T_F_PROTO) { int protohdr; unsigned short _frag_off; @@ -151,9 +151,9 @@ ip6_packet_match(const struct sk_buff *skb, ip6info->proto); if (ip6info->proto == protohdr) { - if(ip6info->invflags & IP6T_INV_PROTO) { + if (ip6info->invflags & IP6T_INV_PROTO) return false; - } + return true; } @@ -275,7 +275,8 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, return 0; } -static void trace_packet(const struct sk_buff *skb, +static void trace_packet(struct net *net, + const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, @@ -283,15 +284,12 @@ static void trace_packet(const struct sk_buff *skb, const struct xt_table_info *private, const struct ip6t_entry *e) { - const void *table_base; const struct ip6t_entry *root; const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; - struct net *net = dev_net(in ? 
in : out); - table_base = private->entries[smp_processor_id()]; - root = get_entry(table_base, private->hook_entry[hook]); + root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP6_TRACE_COMMENT_RULE]; @@ -307,7 +305,7 @@ static void trace_packet(const struct sk_buff *skb, } #endif -static inline __pure struct ip6t_entry * +static inline struct ip6t_entry * ip6t_next_entry(const struct ip6t_entry *entry) { return (void *)entry + entry->next_offset; @@ -316,22 +314,23 @@ ip6t_next_entry(const struct ip6t_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ip6t_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; const void *table_base; struct ip6t_entry *e, **jumpstack; - unsigned int *stackptr, origptr, cpu; + unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ + stackidx = 0; indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as @@ -341,6 +340,7 @@ ip6t_do_table(struct sk_buff *skb, * rule is also a fragment-specific rule, non-fragments won't * match it. 
*/ acpar.hotdrop = false; + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.family = NFPROTO_IPV6; @@ -357,16 +357,25 @@ ip6t_do_table(struct sk_buff *skb, */ smp_read_barrier_depends(); cpu = smp_processor_id(); - table_base = private->entries[cpu]; + table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; - stackptr = per_cpu_ptr(private->stackptr, cpu); - origptr = *stackptr; + + /* Switch to alternate jumpstack if we're being invoked via TEE. + * TEE issues XT_CONTINUE verdict on original skb so we must not + * clobber the jumpstack. + * + * For recursion via REJECT or SYNPROXY the stack will be clobbered + * but it is no problem since absolute verdict is issued by these. + */ + if (static_key_false(&xt_tee_enabled)) + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; + struct xt_counters *counter; IP_NF_ASSERT(e); acpar.thoff = 0; @@ -384,7 +393,8 @@ ip6t_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, skb->len, 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); IP_NF_ASSERT(t->u.kernel.target); @@ -392,8 +402,8 @@ ip6t_do_table(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) - trace_packet(skb, hook, state->in, state->out, - table->name, private, e); + trace_packet(state->net, skb, hook, state->in, + state->out, table->name, private, e); #endif /* Standard target? 
*/ if (!t->u.kernel.target->target) { @@ -406,20 +416,16 @@ ip6t_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - if (*stackptr <= origptr) + if (stackidx == 0) e = get_entry(table_base, private->underflow[hook]); else - e = ip6t_next_entry(jumpstack[--*stackptr]); + e = ip6t_next_entry(jumpstack[--stackidx]); continue; } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { - if (*stackptr >= private->stacksize) { - verdict = NF_DROP; - break; - } - jumpstack[(*stackptr)++] = e; + jumpstack[stackidx++] = e; } e = get_entry(table_base, v); @@ -437,10 +443,8 @@ ip6t_do_table(struct sk_buff *skb, break; } while (!acpar.hotdrop); - *stackptr = origptr; - - xt_write_recseq_end(addend); - local_bh_enable(); + xt_write_recseq_end(addend); + local_bh_enable(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -557,7 +561,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } - next: +next: duprintf("Finished chain %u\n", hook); } return 1; @@ -679,6 +683,10 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -714,6 +722,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -797,13 +808,15 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, - const struct ip6t_replace *repl) + const struct ip6t_replace *repl) { struct ip6t_entry *iter; unsigned int i; @@ -879,12 +892,6 @@ 
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -900,14 +907,16 @@ get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -952,11 +961,7 @@ copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... 
- * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1064,16 +1069,16 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ip6t_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(AF_INET6, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1085,7 +1090,7 @@ static int compat_table_info(const struct xt_table_info *info, #endif static int get_info(struct net *net, void __user *user, - const int *len, int compat) + const int *len, int compat) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -1147,7 +1152,7 @@ static int get_info(struct net *net, void __user *user, static int get_entries(struct net *net, struct ip6t_get_entries __user *uptr, - const int *len) + const int *len) { int ret; struct ip6t_get_entries get; @@ -1194,7 +1199,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - const void *loc_cpu_old_entry; struct ip6t_entry *iter; ret = 0; @@ -1237,8 +1241,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - xt_entry_foreach(iter, loc_cpu_old_entry, 
oldinfo->size) + xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); @@ -1284,8 +1287,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1316,7 +1318,7 @@ static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1326,7 +1328,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - const void *loc_cpu_entry; struct ip6t_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1374,7 +1375,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, goto free; } - local_bh_disable(); private = t->private; if (private->number != num_counters) { @@ -1383,16 +1383,15 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); addend = xt_write_recseq_begin(); - loc_cpu_entry = private->entries[curcpu]; - xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + xt_entry_foreach(iter, private->entries, private->size) { + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); - unlock_up_free: local_bh_enable(); xt_table_unlock(t); @@ -1459,7 +1458,6 @@ static int compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ip6t_ip6 *ipv6, - unsigned int 
hookmask, int *size) { struct xt_match *match; @@ -1528,8 +1526,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { - ret = compat_find_calc_match(ematch, name, - &e->ipv6, e->comefrom, &off); + ret = compat_find_calc_match(ematch, name, &e->ipv6, &off); if (ret != 0) goto release_matches; ++j; @@ -1623,6 +1620,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; j = 0; mtpar.net = net; mtpar.table = name; @@ -1647,6 +1647,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -1731,7 +1734,7 @@ translate_compat_table(struct net *net, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1783,11 +1786,6 @@ translate_compat_table(struct net *net, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1834,8 +1832,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1906,7 +1903,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *pos; unsigned 
int size; int ret = 0; - const void *loc_cpu_entry; unsigned int i = 0; struct ip6t_entry *iter; @@ -1914,14 +1910,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -2096,8 +2087,7 @@ struct xt_table *ip6t_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu, but dont care about preemption */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); @@ -2127,7 +2117,7 @@ void ip6t_unregister_table(struct net *net, struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) diff --git a/kernel/net/ipv6/netfilter/ip6t_REJECT.c b/kernel/net/ipv6/netfilter/ip6t_REJECT.c index 12331efd4..db29bbf41 100644 --- a/kernel/net/ipv6/netfilter/ip6t_REJECT.c +++ b/kernel/net/ipv6/netfilter/ip6t_REJECT.c @@ -35,14 +35,12 @@ MODULE_AUTHOR("Yasuyuki KOZAKAI "); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); - static unsigned int reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_reject_info *reject = par->targinfo; - struct net *net = dev_net((par->in != 
NULL) ? par->in : par->out); + struct net *net = par->net; - pr_debug("%s: medium point\n", __func__); switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum); @@ -65,8 +63,11 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) case IP6T_TCP_RESET: nf_send_reset6(net, skb, par->hooknum); break; - default: - net_info_ratelimited("case %u not handled yet\n", reject->with); + case IP6T_ICMP6_POLICY_FAIL: + nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, par->hooknum); + break; + case IP6T_ICMP6_REJECT_ROUTE: + nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, par->hooknum); break; } diff --git a/kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c b/kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c index 6edb7b106..3deed5860 100644 --- a/kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -37,12 +37,13 @@ synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr, } static void -synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, +synproxy_send_tcp(const struct synproxy_net *snet, + const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct ipv6hdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { - struct net *net = nf_ct_net((struct nf_conn *)nfct); + struct net *net = nf_ct_net(snet->tmpl); struct dst_entry *dst; struct flowi6 fl6; @@ -75,7 +76,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, nf_conntrack_get(nfct); } - ip6_local_out(nskb); + ip6_local_out(net, nskb->sk, nskb); return; free_nskb: @@ -83,7 +84,8 @@ free_nskb: } static void -synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, +synproxy_send_client_synack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; @@ -119,7 +121,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const 
struct tcphdr *th, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } @@ -163,7 +165,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } @@ -203,7 +205,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void @@ -235,13 +237,14 @@ synproxy_send_client_ack(const struct synproxy_net *snet, nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; - nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); } static bool @@ -272,7 +275,7 @@ static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_net *snet = synproxy_pernet(par->net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -301,7 +304,7 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(skb, th, &opts); + synproxy_send_client_synack(snet, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { @@ -313,11 +316,11 @@ 
synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops, +static unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out)); + struct synproxy_net *snet = synproxy_pernet(nhs->net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; @@ -455,14 +458,12 @@ static struct xt_target synproxy_tg6_reg __read_mostly = { static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = { { .hook = ipv6_synproxy_hook, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, { .hook = ipv6_synproxy_hook, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, diff --git a/kernel/net/ipv6/netfilter/ip6t_rpfilter.c b/kernel/net/ipv6/netfilter/ip6t_rpfilter.c index 790e0c6b1..1ee1b25df 100644 --- a/kernel/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/kernel/net/ipv6/netfilter/ip6t_rpfilter.c @@ -26,7 +26,7 @@ static bool rpfilter_addr_unicast(const struct in6_addr *addr) return addr_type & IPV6_ADDR_UNICAST; } -static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, +static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, const struct net_device *dev, u8 flags) { struct rt6_info *rt; @@ -53,7 +53,7 @@ static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, lookup_flags |= RT6_LOOKUP_F_IFACE; } - rt = (void *) ip6_route_lookup(dev_net(dev), &fl6, lookup_flags); + rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags); if (rt->dst.error) goto out; @@ -93,7 +93,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) if (unlikely(saddrtype == IPV6_ADDR_ANY)) return true ^ invert; /* not routable: forward path will drop it */ - return 
rpfilter_lookup_reverse6(skb, par->in, info->flags) ^ invert; + return rpfilter_lookup_reverse6(par->net, skb, par->in, info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) diff --git a/kernel/net/ipv6/netfilter/ip6table_filter.c b/kernel/net/ipv6/netfilter/ip6table_filter.c index 5c33d8abc..8b277b983 100644 --- a/kernel/net/ipv6/netfilter/ip6table_filter.c +++ b/kernel/net/ipv6/netfilter/ip6table_filter.c @@ -32,12 +32,10 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c. */ static unsigned int -ip6table_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip6table_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? state->in : state->out); - - return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_filter); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/kernel/net/ipv6/netfilter/ip6table_mangle.c b/kernel/net/ipv6/netfilter/ip6table_mangle.c index b551f5b79..abe278b07 100644 --- a/kernel/net/ipv6/netfilter/ip6table_mangle.c +++ b/kernel/net/ipv6/netfilter/ip6table_mangle.c @@ -57,8 +57,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, state, - dev_net(state->out)->ipv6.ip6table_mangle); + ret = ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); if (ret != NF_DROP && ret != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || @@ -66,7 +65,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { - err = ip6_route_me_harder(skb); + err = ip6_route_me_harder(state->net, 
skb); if (err < 0) ret = NF_DROP_ERR(err); } @@ -76,17 +75,16 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* The work comes in here from netfilter.c. */ static unsigned int -ip6table_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip6table_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (ops->hooknum == NF_INET_LOCAL_OUT) + if (state->hook == NF_INET_LOCAL_OUT) return ip6t_mangle_out(skb, state); - if (ops->hooknum == NF_INET_POST_ROUTING) - return ip6t_do_table(skb, ops->hooknum, state, - dev_net(state->out)->ipv6.ip6table_mangle); + if (state->hook == NF_INET_POST_ROUTING) + return ip6t_do_table(skb, state, + state->net->ipv6.ip6table_mangle); /* INPUT/FORWARD */ - return ip6t_do_table(skb, ops->hooknum, state, - dev_net(state->in)->ipv6.ip6table_mangle); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/kernel/net/ipv6/netfilter/ip6table_nat.c b/kernel/net/ipv6/netfilter/ip6table_nat.c index c3a7f7af0..de2a10a56 100644 --- a/kernel/net/ipv6/netfilter/ip6table_nat.c +++ b/kernel/net/ipv6/netfilter/ip6table_nat.c @@ -30,49 +30,46 @@ static const struct xt_table nf_nat_ipv6_table = { .af = NFPROTO_IPV6, }; -static unsigned int ip6table_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - - return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_nat); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat); } -static unsigned int ip6table_nat_fn(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_fn(ops, skb, state, ip6table_nat_do_chain); + return nf_nat_ipv6_fn(priv, skb, state, ip6table_nat_do_chain); } 
-static unsigned int ip6table_nat_in(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_in(ops, skb, state, ip6table_nat_do_chain); + return nf_nat_ipv6_in(priv, skb, state, ip6table_nat_do_chain); } -static unsigned int ip6table_nat_out(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_out(ops, skb, state, ip6table_nat_do_chain); + return nf_nat_ipv6_out(priv, skb, state, ip6table_nat_do_chain); } -static unsigned int ip6table_nat_local_fn(const struct nf_hook_ops *ops, +static unsigned int ip6table_nat_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_local_fn(ops, skb, state, ip6table_nat_do_chain); + return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = ip6table_nat_in, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, @@ -80,7 +77,6 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { /* After packet filtering, change source */ { .hook = ip6table_nat_out, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, @@ -88,7 +84,6 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = ip6table_nat_local_fn, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, @@ -96,7 +91,6 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { /* After packet filtering, change source */ { .hook = ip6table_nat_fn, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, diff --git 
a/kernel/net/ipv6/netfilter/ip6table_raw.c b/kernel/net/ipv6/netfilter/ip6table_raw.c index 0b33caad2..902196356 100644 --- a/kernel/net/ipv6/netfilter/ip6table_raw.c +++ b/kernel/net/ipv6/netfilter/ip6table_raw.c @@ -19,12 +19,10 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int -ip6table_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip6table_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? state->in : state->out); - - return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_raw); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/kernel/net/ipv6/netfilter/ip6table_security.c b/kernel/net/ipv6/netfilter/ip6table_security.c index fcef83c25..0d856fedf 100644 --- a/kernel/net/ipv6/netfilter/ip6table_security.c +++ b/kernel/net/ipv6/netfilter/ip6table_security.c @@ -36,13 +36,10 @@ static const struct xt_table security_table = { }; static unsigned int -ip6table_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip6table_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? 
state->in : state->out); - - return ip6t_do_table(skb, ops->hooknum, state, - net->ipv6.ip6table_security); + return ip6t_do_table(skb, state, state->net->ipv6.ip6table_security); } static struct nf_hook_ops *sectbl_ops __read_mostly; diff --git a/kernel/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/kernel/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 4ba0c34c6..1aa584876 100644 --- a/kernel/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/kernel/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -95,7 +95,7 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, return NF_ACCEPT; } -static unsigned int ipv6_helper(const struct nf_hook_ops *ops, +static unsigned int ipv6_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -131,7 +131,7 @@ static unsigned int ipv6_helper(const struct nf_hook_ops *ops, return helper->help(skb, protoff, ct, ctinfo); } -static unsigned int ipv6_confirm(const struct nf_hook_ops *ops, +static unsigned int ipv6_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -165,14 +165,14 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops, +static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(dev_net(state->in), PF_INET6, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); } -static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, +static unsigned int ipv6_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -181,48 +181,42 @@ static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); return NF_ACCEPT; } - return nf_conntrack_in(dev_net(state->out), PF_INET6, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); } static 
struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { { .hook = ipv6_conntrack_in, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK, }, { .hook = ipv6_conntrack_local, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK, }, { .hook = ipv6_helper, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_CONNTRACK_HELPER, }, { .hook = ipv6_confirm, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { .hook = ipv6_helper, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_CONNTRACK_HELPER, }, { .hook = ipv6_confirm, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_LAST-1, @@ -251,7 +245,7 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) if (*len < 0 || (unsigned int) *len < sizeof(sin6)) return -EINVAL; - h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (!h) { pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), diff --git a/kernel/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/kernel/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 90388d606..660bc10c7 100644 --- a/kernel/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/kernel/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -36,6 +36,7 @@ static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmp6hdr *hp; @@ -56,12 +57,12 @@ static const u_int8_t invmap[] = { [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, 
[ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, - [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY +1 + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 }; static const u_int8_t noct_valid_new[] = { [ICMPV6_MGM_QUERY - 130] = 1, - [ICMPV6_MGM_REPORT -130] = 1, + [ICMPV6_MGM_REPORT - 130] = 1, [ICMPV6_MGM_REDUCTION - 130] = 1, [NDISC_ROUTER_SOLICITATION - 130] = 1, [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, @@ -150,7 +151,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_l4proto *inproto; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + struct nf_conntrack_zone tmp; NF_CT_ASSERT(skb->nfct == NULL); @@ -159,7 +160,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, skb_network_offset(skb) + sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr), - PF_INET6, &origtuple)) { + PF_INET6, net, &origtuple)) { pr_debug("icmpv6_error: Can't get tuple\n"); return -NF_ACCEPT; } @@ -177,7 +178,8 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(net, zone, &intuple); + h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), + &intuple); if (!h) { pr_debug("icmpv6_error: no match\n"); return -NF_ACCEPT; diff --git a/kernel/net/ipv6/netfilter/nf_conntrack_reasm.c b/kernel/net/ipv6/netfilter/nf_conntrack_reasm.c index 6f187c8d8..bab4441ed 100644 --- a/kernel/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/kernel/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -59,7 +59,7 @@ struct nf_ct_frag6_skb_cb struct sk_buff *orig; }; -#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) +#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb *)((skb)->cb)) static struct inet_frags nf_frags; @@ -190,7 +190,7 @@ static void nf_ct_frag6_expire(unsigned long data) /* Creation primitives. 
*/ static inline struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, struct in6_addr *src, - struct in6_addr *dst, u8 ecn) + struct in6_addr *dst, int iif, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -200,6 +200,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, arg.user = user; arg.src = src; arg.dst = dst; + arg.iif = iif; arg.ecn = ecn; local_bh_disable(); @@ -348,7 +349,7 @@ found: fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - add_frag_mem_limit(&fq->q, skb->truesize); + add_frag_mem_limit(fq->q.net, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -430,7 +431,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - add_frag_mem_limit(&fq->q, clone->truesize); + add_frag_mem_limit(fq->q.net, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -445,7 +446,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - for (fp=head->next; fp; fp = fp->next) { + for (fp = head->next; fp; fp = fp->next) { head->data_len += fp->len; head->len += fp->len; if (head->ip_summed != fp->ip_summed) @@ -454,7 +455,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; } - sub_frag_mem_limit(&fq->q, head->truesize); + sub_frag_mem_limit(fq->q.net, head->truesize); head->ignore_df = 1; head->next = NULL; @@ -563,12 +564,10 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) return 0; } -struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) +struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { struct sk_buff *clone; struct net_device *dev = skb->dev; - 
struct net *net = skb_dst(skb) ? dev_net(skb_dst(skb)->dev) - : dev_net(skb->dev); struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; @@ -603,7 +602,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) fhdr = (struct frag_hdr *)skb_transport_header(clone); fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - ip6_frag_ecn(hdr)); + skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); goto ret_orig; @@ -633,6 +632,7 @@ ret_orig: kfree_skb(clone); return skb; } +EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); void nf_ct_frag6_consume_orig(struct sk_buff *skb) { @@ -645,15 +645,22 @@ void nf_ct_frag6_consume_orig(struct sk_buff *skb) s = s2; } } +EXPORT_SYMBOL_GPL(nf_ct_frag6_consume_orig); static int nf_ct_net_init(struct net *net) { + int res; + net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - inet_frags_init_net(&net->nf_frag.frags); - - return nf_ct_frag6_sysctl_register(net); + res = inet_frags_init_net(&net->nf_frag.frags); + if (res) + return res; + res = nf_ct_frag6_sysctl_register(net); + if (res) + inet_frags_uninit_net(&net->nf_frag.frags); + return res; } static void nf_ct_net_exit(struct net *net) diff --git a/kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index a45db0b47..4fdbed5eb 100644 --- a/kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -33,26 +33,25 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, struct sk_buff *skb) { - u16 zone = NF_CT_DEFAULT_ZONE; - + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - if (skb->nfct) - zone = nf_ct_zone((struct nf_conn *)skb->nfct); -#endif + if (skb->nfct) { + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct = 
nf_ct_get(skb, &ctinfo); -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) - return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + } #endif + if (nf_bridge_in_prerouting(skb)) + return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; + if (hooknum == NF_INET_PRE_ROUTING) - return IP6_DEFRAG_CONNTRACK_IN + zone; + return IP6_DEFRAG_CONNTRACK_IN + zone_id; else - return IP6_DEFRAG_CONNTRACK_OUT + zone; - + return IP6_DEFRAG_CONNTRACK_OUT + zone_id; } -static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, +static unsigned int ipv6_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -64,7 +63,8 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, return NF_ACCEPT; #endif - reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(ops->hooknum, skb)); + reasm = nf_ct_frag6_gather(state->net, skb, + nf_ct6_defrag_user(state->hook, skb)); /* queued */ if (reasm == NULL) return NF_STOLEN; @@ -75,7 +75,7 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, nf_ct_frag6_consume_orig(reasm); - NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, state->sk, reasm, + NF_HOOK_THRESH(NFPROTO_IPV6, state->hook, state->net, state->sk, reasm, state->in, state->out, state->okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1); @@ -85,14 +85,12 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, static struct nf_hook_ops ipv6_defrag_ops[] = { { .hook = ipv6_defrag, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv6_defrag, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, diff --git a/kernel/net/ipv6/netfilter/nf_dup_ipv6.c b/kernel/net/ipv6/netfilter/nf_dup_ipv6.c new file mode 100644 index 000000000..6989c70ae --- /dev/null +++ b/kernel/net/ipv6/netfilter/nf_dup_ipv6.c @@ -0,0 
+1,82 @@ +/* + * (C) 2007 by Sebastian Claßen + * (C) 2007-2010 by Jan Engelhardt + * + * Extracted from xt_TEE.c + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 or later, as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#endif + +static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, + const struct in6_addr *gw, int oif) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + if (oif != -1) + fl6.flowi6_oif = oif; + + fl6.daddr = *gw; + fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) | + (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]); + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); + return false; + } + skb_dst_drop(skb); + skb_dst_set(skb, dst); + skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); + + return true; +} + +void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, + const struct in6_addr *gw, int oif) +{ + if (this_cpu_read(nf_skb_duplicated)) + return; + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + if (hooknum == NF_INET_PRE_ROUTING || + hooknum == NF_INET_LOCAL_IN) { + struct ipv6hdr *iph = ipv6_hdr(skb); + --iph->hop_limit; + } + if (nf_dup_ipv6_route(net, skb, gw, oif)) { + __this_cpu_write(nf_skb_duplicated, true); + ip6_local_out(net, skb->sk, skb); + __this_cpu_write(nf_skb_duplicated, false); + } else { + kfree_skb(skb); + } +} +EXPORT_SYMBOL_GPL(nf_dup_ipv6); + +MODULE_AUTHOR("Sebastian Claßen "); +MODULE_AUTHOR("Jan Engelhardt "); 
+MODULE_DESCRIPTION("nf_dup_ipv6: IPv6 packet duplication"); +MODULE_LICENSE("GPL"); diff --git a/kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index e76900e0a..238e70c3f 100644 --- a/kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -124,7 +124,7 @@ static void nf_nat_ipv6_csum_update(struct sk_buff *skb, newip = &t->dst.u3.in6; } inet_proto_csum_replace16(check, skb, oldip->s6_addr32, - newip->s6_addr32, 1); + newip->s6_addr32, true); } static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, @@ -155,7 +155,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, } } else inet_proto_csum_replace2(check, skb, - htons(oldlen), htons(datalen), 1); + htons(oldlen), htons(datalen), true); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -262,9 +262,9 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); unsigned int -nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -272,7 +272,7 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); __be16 frag_off; int hdrlen; u8 nexthdr; @@ -303,7 +303,7 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, - ops->hooknum, + state->hook, hdrlen)) return NF_DROP; else @@ -317,21 +317,21 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if 
(!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(ops, skb, state, ct); + ret = do_chain(priv, skb, state, ct); if (ret != NF_ACCEPT) return ret; - if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) break; - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; @@ -340,11 +340,11 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); @@ -353,9 +353,9 @@ oif_changed: EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn); unsigned int -nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -363,7 +363,7 @@ nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, unsigned int ret; struct in6_addr daddr = ipv6_hdr(skb)->daddr; - ret = nf_nat_ipv6_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); @@ -373,9 +373,9 @@ nf_nat_ipv6_in(const struct 
nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv6_in); unsigned int -nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -391,7 +391,7 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, if (skb->len < sizeof(struct ipv6hdr)) return NF_ACCEPT; - ret = nf_nat_ipv6_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && @@ -403,7 +403,7 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET6); + err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } @@ -414,9 +414,9 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv6_out); unsigned int -nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -430,14 +430,14 @@ nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (skb->len < sizeof(struct ipv6hdr)) return NF_ACCEPT; - ret = nf_nat_ipv6_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if 
(!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { - err = ip6_route_me_harder(skb); + err = ip6_route_me_harder(state->net, skb); if (err < 0) ret = NF_DROP_ERR(err); } @@ -446,7 +446,7 @@ nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET6); + err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 774560966..31ba7ca19 100644 --- a/kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -34,7 +34,7 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY)); - if (ipv6_dev_get_saddr(dev_net(out), out, + if (ipv6_dev_get_saddr(nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &src) < 0) return NF_DROP; diff --git a/kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c index 2205e8eee..57593b00c 100644 --- a/kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c +++ b/kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c @@ -73,7 +73,7 @@ icmpv6_manip_pkt(struct sk_buff *skb, hdr->icmp6_type == ICMPV6_ECHO_REPLY) { inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, hdr->icmp6_identifier, - tuple->src.u.icmp.id, 0); + tuple->src.u.icmp.id, false); hdr->icmp6_identifier = tuple->src.u.icmp.id; } return true; diff --git a/kernel/net/ipv6/netfilter/nf_reject_ipv6.c b/kernel/net/ipv6/netfilter/nf_reject_ipv6.c index 94b4c6dfb..e0f922b77 100644 --- a/kernel/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/kernel/net/ipv6/netfilter/nf_reject_ipv6.c @@ -26,7 +26,7 @@ const struct tcphdr 
*nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, int tcphoff; proto = oip6h->nexthdr; - tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), + tcphoff = ipv6_skip_exthdr(oldskb, ((u8 *)(oip6h + 1) - oldskb->data), &proto, &frag_off); if ((tcphoff < 0) || (tcphoff > oldskb->len)) { @@ -206,7 +206,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip6_local_out(nskb); + ip6_local_out(net, nskb->sk, nskb); } EXPORT_SYMBOL_GPL(nf_send_reset6); @@ -224,7 +224,7 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook) return true; proto = ip6h->nexthdr; - thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); + thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo); if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) return false; diff --git a/kernel/net/ipv6/netfilter/nf_tables_ipv6.c b/kernel/net/ipv6/netfilter/nf_tables_ipv6.c index c8148ba76..120ea9131 100644 --- a/kernel/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/kernel/net/ipv6/netfilter/nf_tables_ipv6.c @@ -16,20 +16,20 @@ #include #include -static unsigned int nft_do_chain_ipv6(const struct nf_hook_ops *ops, +static unsigned int nft_do_chain_ipv6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0) + if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) return NF_DROP; - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops, +static unsigned int nft_ipv6_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -40,7 +40,7 @@ static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops, return NF_ACCEPT; } - return nft_do_chain_ipv6(ops, skb, state); + return nft_do_chain_ipv6(priv, skb, state); } struct nft_af_info nft_af_ipv6 __read_mostly = { diff --git 
a/kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 951bb458b..443cd306c 100644 --- a/kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -24,44 +24,44 @@ #include #include -static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv6(&pkt, ops, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv6_fn(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv6_in(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_in(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv6_in(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv6_out(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_out(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv6_out(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv6_local_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv6_local_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv6 = { diff --git a/kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c 
b/kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c index 0dafdaac5..71d995ff3 100644 --- a/kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -22,7 +22,7 @@ #include #include -static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, +static unsigned int nf_route_table_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -33,7 +33,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, u32 mark, flowlabel; /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0) + if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) return NF_DROP; /* save source/dest address, mark, hoplimit, flowlabel, priority */ @@ -45,14 +45,14 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u32 *)ipv6_hdr(skb)); - ret = nft_do_chain(&pkt, ops); + ret = nft_do_chain(&pkt, priv); if (ret != NF_DROP && ret != NF_QUEUE && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) - return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP; + return ip6_route_me_harder(state->net, skb) == 0 ? 
ret : NF_DROP; return ret; } @@ -61,11 +61,11 @@ static const struct nf_chain_type nft_chain_route_ipv6 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, .family = NFPROTO_IPV6, - .owner = THIS_MODULE, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_OUT), .hooks = { - [NF_INET_LOCAL_OUT] = nf_route_table_hook, - }, + [NF_INET_LOCAL_OUT] = nf_route_table_hook, + }, }; static int __init nft_chain_route_init(void) diff --git a/kernel/net/ipv6/netfilter/nft_dup_ipv6.c b/kernel/net/ipv6/netfilter/nft_dup_ipv6.c new file mode 100644 index 000000000..8bfd470cb --- /dev/null +++ b/kernel/net/ipv6/netfilter/nft_dup_ipv6.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_dup_ipv6 { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +static void nft_dup_ipv6_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_dup_ipv6 *priv = nft_expr_priv(expr); + struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr]; + int oif = regs->data[priv->sreg_dev]; + + nf_dup_ipv6(pkt->net, pkt->skb, pkt->hook, gw, oif); +} + +static int nft_dup_ipv6_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dup_ipv6 *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_DUP_SREG_ADDR] == NULL) + return -EINVAL; + + priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); + err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr)); + if (err < 0) + return err; + + if (tb[NFTA_DUP_SREG_DEV] != NULL) { + priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev,
sizeof(int)); + } + return 0; +} + +static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_dup_ipv6 *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dup_ipv6_type; +static const struct nft_expr_ops nft_dup_ipv6_ops = { + .type = &nft_dup_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv6)), + .eval = nft_dup_ipv6_eval, + .init = nft_dup_ipv6_init, + .dump = nft_dup_ipv6_dump, +}; + +static const struct nla_policy nft_dup_ipv6_policy[NFTA_DUP_MAX + 1] = { + [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 }, + [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, +}; + +static struct nft_expr_type nft_dup_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "dup", + .ops = &nft_dup_ipv6_ops, + .policy = nft_dup_ipv6_policy, + .maxattr = NFTA_DUP_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_dup_ipv6_module_init(void) +{ + return nft_register_expr(&nft_dup_ipv6_type); +} + +static void __exit nft_dup_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_dup_ipv6_type); +} + +module_init(nft_dup_ipv6_module_init); +module_exit(nft_dup_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "dup"); diff --git a/kernel/net/ipv6/netfilter/nft_redir_ipv6.c b/kernel/net/ipv6/netfilter/nft_redir_ipv6.c index effd393bd..aca44e89a 100644 --- a/kernel/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/kernel/net/ipv6/netfilter/nft_redir_ipv6.c @@ -35,8 +35,7 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, range.flags |= priv->flags; - regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, - pkt->ops->hooknum); + regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, pkt->hook); } static struct nft_expr_type nft_redir_ipv6_type; 
diff --git a/kernel/net/ipv6/netfilter/nft_reject_ipv6.c b/kernel/net/ipv6/netfilter/nft_reject_ipv6.c index d0d1540ec..533cd5719 100644 --- a/kernel/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/kernel/net/ipv6/netfilter/nft_reject_ipv6.c @@ -24,15 +24,14 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach6(net, pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code, + pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + nf_send_reset6(pkt->net, pkt->skb, pkt->hook); break; default: break; diff --git a/kernel/net/ipv6/output_core.c b/kernel/net/ipv6/output_core.c index 85892af57..462f2a76b 100644 --- a/kernel/net/ipv6/output_core.c +++ b/kernel/net/ipv6/output_core.c @@ -8,9 +8,11 @@ #include #include #include +#include static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, - struct in6_addr *dst, struct in6_addr *src) + const struct in6_addr *dst, + const struct in6_addr *src) { u32 hash, id; @@ -60,17 +62,17 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); -void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr, - struct rt6_info *rt) +__be32 ipv6_select_ident(struct net *net, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { static u32 ip6_idents_hashrnd __read_mostly; u32 id; net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); - id = __ipv6_select_ident(net, ip6_idents_hashrnd, &rt->rt6i_dst.addr, - &rt->rt6i_src.addr); - fhdr->identification = htonl(id); + id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr); + return htonl(id); } EXPORT_SYMBOL(ipv6_select_ident); @@ -136,7 +138,7 @@ int ip6_dst_hoplimit(struct 
dst_entry *dst) EXPORT_SYMBOL(ip6_dst_hoplimit); #endif -static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int len; @@ -146,30 +148,20 @@ static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); - return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, skb_dst(skb)->dev, dst_output_sk); -} - -int __ip6_local_out(struct sk_buff *skb) -{ - return __ip6_local_out_sk(skb->sk, skb); + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, skb_dst(skb)->dev, + dst_output); } EXPORT_SYMBOL_GPL(__ip6_local_out); -int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) +int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; - err = __ip6_local_out_sk(sk, skb); + err = __ip6_local_out(net, sk, skb); if (likely(err == 1)) - err = dst_output_sk(sk, skb); + err = dst_output(net, sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip6_local_out_sk); - -int ip6_local_out(struct sk_buff *skb) -{ - return ip6_local_out_sk(skb->sk, skb); -} EXPORT_SYMBOL_GPL(ip6_local_out); diff --git a/kernel/net/ipv6/raw.c b/kernel/net/ipv6/raw.c index 8072bd413..99140986e 100644 --- a/kernel/net/ipv6/raw.c +++ b/kernel/net/ipv6/raw.c @@ -295,7 +295,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) * unspecified and mapped address have a v4 equivalent. 
*/ v4addr = LOOPBACK4_IPV6; - if (!(addr_type & IPV6_ADDR_MULTICAST)) { + if (!(addr_type & IPV6_ADDR_MULTICAST) && + !sock_net(sk)->ipv6.sysctl.ip_nonlocal_bind) { err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr, dev, 0)) { @@ -613,6 +614,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, unsigned int flags) { struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); struct ipv6hdr *iph; struct sk_buff *skb; int err; @@ -651,9 +653,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (err) goto error_fault; - IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, - NULL, rt->dst.dev, dst_output_sk); + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, + NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -665,7 +667,7 @@ error_fault: err = -EFAULT; kfree_skb(skb); error: - IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); if (err == -ENOBUFS && !np->recverr) err = 0; return err; @@ -731,6 +733,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; @@ -837,8 +840,10 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -865,6 +870,9 @@ static int 
rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_oif = np->ucast_oif; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + if (inet->hdrincl) + fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -901,6 +909,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: dst_confirm(dst); @@ -1324,13 +1333,7 @@ static struct inet_protosw rawv6_protosw = { int __init rawv6_init(void) { - int ret; - - ret = inet6_register_protosw(&rawv6_protosw); - if (ret) - goto out; -out: - return ret; + return inet6_register_protosw(&rawv6_protosw); } void rawv6_exit(void) diff --git a/kernel/net/ipv6/reassembly.c b/kernel/net/ipv6/reassembly.c index 8ffa2c8cc..45f5ae51d 100644 --- a/kernel/net/ipv6/reassembly.c +++ b/kernel/net/ipv6/reassembly.c @@ -108,7 +108,10 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) return fq->id == arg->id && fq->user == arg->user && ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst); + ipv6_addr_equal(&fq->daddr, arg->dst) && + (arg->iif == fq->iif || + !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))); } EXPORT_SYMBOL(ip6_frag_match); @@ -144,7 +147,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - if (fq->q.flags & INET_FRAG_EVICTED) + if (inet_frag_evicting(&fq->q)) goto out_rcu_unlock; IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); @@ -180,7 +183,7 @@ static void ip6_frag_expire(unsigned long data) static struct frag_queue * fq_find(struct net *net, __be32 id, const struct in6_addr *src, - const struct in6_addr *dst, u8 ecn) + const struct in6_addr *dst, int iif, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -190,6 +193,7 @@ fq_find(struct net *net, __be32 id, const 
struct in6_addr *src, arg.user = IP6_DEFRAG_LOCAL_DELIVER; arg.src = src; arg.dst = dst; + arg.iif = iif; arg.ecn = ecn; hash = inet6_hash_frag(id, src, dst); @@ -330,7 +334,7 @@ found: fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; - add_frag_mem_limit(&fq->q, skb->truesize); + add_frag_mem_limit(fq->q.net, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -443,7 +447,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - add_frag_mem_limit(&fq->q, clone->truesize); + add_frag_mem_limit(fq->q.net, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -481,7 +485,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } fp = next; } - sub_frag_mem_limit(&fq->q, sum_truesize); + sub_frag_mem_limit(fq->q.net, sum_truesize); head->next = NULL; head->dev = dev; @@ -551,7 +555,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - ip6_frag_ecn(hdr)); + skb->dev ? 
skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq) { int ret; @@ -706,13 +710,19 @@ static void ip6_frags_sysctl_unregister(void) static int __net_init ipv6_frags_init_net(struct net *net) { + int res; + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - inet_frags_init_net(&net->ipv6.frags); - - return ip6_frags_ns_sysctl_register(net); + res = inet_frags_init_net(&net->ipv6.frags); + if (res) + return res; + res = ip6_frags_ns_sysctl_register(net); + if (res) + inet_frags_uninit_net(&net->ipv6.frags); + return res; } static void __net_exit ipv6_frags_exit_net(struct net *net) diff --git a/kernel/net/ipv6/route.c b/kernel/net/ipv6/route.c index f371fefa7..3f164d3aa 100644 --- a/kernel/net/ipv6/route.c +++ b/kernel/net/ipv6/route.c @@ -54,10 +54,14 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include #include @@ -72,8 +76,7 @@ enum rt6_nud_state { RT6_NUD_SUCCEED = 1 }; -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest); +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); static unsigned int ip6_mtu(const struct dst_entry *dst); @@ -84,14 +87,15 @@ static void ip6_dst_ifdown(struct dst_entry *, static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); -static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb); +static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); -static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void 
ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); +static void rt6_dst_from_metrics_check(struct rt6_info *rt); static int rt6_score_route(struct rt6_info *rt, int oif, int strict); #ifdef CONFIG_IPV6_ROUTE_INFO @@ -104,65 +108,83 @@ static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *gwaddr, int ifindex); #endif -static void rt6_bind_peer(struct rt6_info *rt, int create) +struct uncached_list { + spinlock_t lock; + struct list_head head; +}; + +static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); + +static void rt6_uncached_list_add(struct rt6_info *rt) { - struct inet_peer_base *base; - struct inet_peer *peer; + struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); - base = inetpeer_base_ptr(rt->_rt6i_peer); - if (!base) - return; + rt->dst.flags |= DST_NOCACHE; + rt->rt6i_uncached_list = ul; - peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create); - if (peer) { - if (!rt6_set_peer(rt, peer)) - inet_putpeer(peer); - } + spin_lock_bh(&ul->lock); + list_add_tail(&rt->rt6i_uncached, &ul->head); + spin_unlock_bh(&ul->lock); } -static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create) +static void rt6_uncached_list_del(struct rt6_info *rt) { - if (rt6_has_peer(rt)) - return rt6_peer_ptr(rt); + if (!list_empty(&rt->rt6i_uncached)) { + struct uncached_list *ul = rt->rt6i_uncached_list; - rt6_bind_peer(rt, create); - return (rt6_has_peer(rt) ? 
rt6_peer_ptr(rt) : NULL); + spin_lock_bh(&ul->lock); + list_del(&rt->rt6i_uncached); + spin_unlock_bh(&ul->lock); + } } -static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt) +static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { - return __rt6_get_peer(rt, 1); -} + struct net_device *loopback_dev = net->loopback_dev; + int cpu; -static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - struct rt6_info *rt = (struct rt6_info *) dst; - struct inet_peer *peer; - u32 *p = NULL; - - if (!(rt->dst.flags & DST_HOST)) - return dst_cow_metrics_generic(dst, old); + if (dev == loopback_dev) + return; - peer = rt6_get_peer_create(rt); - if (peer) { - u32 *old_p = __DST_METRICS_PTR(old); - unsigned long prev, new; + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + struct rt6_info *rt; - p = peer->metrics; - if (inet_metrics_new(peer) || - (old & DST_METRICS_FORCE_OVERWRITE)) - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + spin_lock_bh(&ul->lock); + list_for_each_entry(rt, &ul->head, rt6i_uncached) { + struct inet6_dev *rt_idev = rt->rt6i_idev; + struct net_device *rt_dev = rt->dst.dev; - new = (unsigned long) p; - prev = cmpxchg(&dst->_metrics, old, new); + if (rt_idev->dev == dev) { + rt->rt6i_idev = in6_dev_get(loopback_dev); + in6_dev_put(rt_idev); + } - if (prev != old) { - p = __DST_METRICS_PTR(prev); - if (prev & DST_METRICS_READ_ONLY) - p = NULL; + if (rt_dev == dev) { + rt->dst.dev = loopback_dev; + dev_hold(rt->dst.dev); + dev_put(rt_dev); + } } + spin_unlock_bh(&ul->lock); } - return p; +} + +static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) +{ + return dst_metrics_write_ptr(rt->dst.from); +} + +static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + + if (rt->rt6i_flags & RTF_PCPU) + return rt6_pcpu_cow_metrics(rt); + else if (rt->rt6i_flags & RTF_CACHE) + return NULL; + else + return 
dst_cow_metrics_generic(dst, old); } static inline const void *choose_neigh_daddr(struct rt6_info *rt, @@ -227,12 +249,6 @@ static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, { } -static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, - unsigned long old) -{ - return NULL; -} - static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, .destroy = ip6_dst_destroy, @@ -241,7 +257,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { .default_advmss = ip6_default_advmss, .update_pmtu = ip6_rt_blackhole_update_pmtu, .redirect = ip6_rt_blackhole_redirect, - .cow_metrics = ip6_rt_blackhole_cow_metrics, + .cow_metrics = dst_cow_metrics_generic, .neigh_lookup = ip6_neigh_lookup, }; @@ -288,7 +304,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, - .output = dst_discard_sk, + .output = dst_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -298,34 +314,67 @@ static const struct rt6_info ip6_blk_hole_entry_template = { #endif +static void rt6_info_init(struct rt6_info *rt) +{ + struct dst_entry *dst = &rt->dst; + + memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); + INIT_LIST_HEAD(&rt->rt6i_siblings); + INIT_LIST_HEAD(&rt->rt6i_uncached); +} + /* allocate dst with ip6_dst_ops */ -static inline struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags, - struct fib6_table *table) +static struct rt6_info *__ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0, DST_OBSOLETE_FORCE_CHK, flags); + if (rt) + rt6_info_init(rt); + + return rt; +} + +static struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags) +{ + struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); + if (rt) { - struct dst_entry *dst = &rt->dst; + rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, 
GFP_ATOMIC); + if (rt->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **p; - memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); - rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); - INIT_LIST_HEAD(&rt->rt6i_siblings); + p = per_cpu_ptr(rt->rt6i_pcpu, cpu); + /* no one shares rt */ + *p = NULL; + } + } else { + dst_destroy((struct dst_entry *)rt); + return NULL; + } } + return rt; } static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct inet6_dev *idev = rt->rt6i_idev; struct dst_entry *from = dst->from; + struct inet6_dev *idev; - if (!(rt->dst.flags & DST_HOST)) - dst_destroy_metrics_generic(dst); + dst_destroy_metrics_generic(dst); + free_percpu(rt->rt6i_pcpu); + rt6_uncached_list_del(rt); + idev = rt->rt6i_idev; if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); @@ -333,11 +382,6 @@ static void ip6_dst_destroy(struct dst_entry *dst) dst->from = NULL; dst_release(from); - - if (rt6_has_peer(rt)) { - struct inet_peer *peer = rt6_peer_ptr(rt); - inet_putpeer(peer); - } } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -360,6 +404,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, } } +static bool __rt6_check_expired(const struct rt6_info *rt) +{ + if (rt->rt6i_flags & RTF_EXPIRES) + return time_after(jiffies, rt->dst.expires); + else + return false; +} + static bool rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) { @@ -378,31 +430,7 @@ static bool rt6_check_expired(const struct rt6_info *rt) static int rt6_info_hash_nhsfn(unsigned int candidate_count, const struct flowi6 *fl6) { - unsigned int val = fl6->flowi6_proto; - - val ^= ipv6_addr_hash(&fl6->daddr); - val ^= ipv6_addr_hash(&fl6->saddr); - - /* Work only if this not encapsulated */ - switch (fl6->flowi6_proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_SCTP: - val ^= (__force u16)fl6->fl6_sport; - val ^= (__force 
u16)fl6->fl6_dport; - break; - - case IPPROTO_ICMPV6: - val ^= (__force u16)fl6->fl6_icmp_type; - val ^= (__force u16)fl6->fl6_icmp_code; - break; - } - /* RFC6438 recommands to use flowlabel */ - val ^= (__force u32)fl6->flowlabel; - - /* Perhaps, we need to tune, this function? */ - val = val ^ (val >> 7) ^ (val >> 12); - return val % candidate_count; + return get_hash_from_flowi6(fl6) % candidate_count; } static struct rt6_info *rt6_multipath_select(struct rt6_info *match, @@ -455,10 +483,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (dev->flags & IFF_LOOPBACK) { if (!sprt->rt6i_idev || sprt->rt6i_idev->dev->ifindex != oif) { - if (flags & RT6_LOOKUP_F_IFACE && oif) + if (flags & RT6_LOOKUP_F_IFACE) continue; - if (local && (!oif || - local->rt6i_idev->dev->ifindex == oif)) + if (local && + local->rt6i_idev->dev->ifindex == oif) continue; } local = sprt; @@ -495,13 +523,14 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL); + ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL); dev_put(work->dev); kfree(work); } static void rt6_probe(struct rt6_info *rt) { + struct __rt6_probe_work *work; struct neighbour *neigh; /* * Okay, this does not seem to be appropriate @@ -516,34 +545,33 @@ static void rt6_probe(struct rt6_info *rt) rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (neigh) { - write_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) goto out; - } - - if (!neigh || - time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { - struct __rt6_probe_work *work; + work = NULL; + write_lock(&neigh->lock); + if (!(neigh->nud_state & NUD_VALID) && + time_after(jiffies, + neigh->updated + + rt->rt6i_idev->cnf.rtr_probe_interval)) { + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) + 
__neigh_set_probe_once(neigh); + } + write_unlock(&neigh->lock); + } else { work = kmalloc(sizeof(*work), GFP_ATOMIC); + } - if (neigh && work) - __neigh_set_probe_once(neigh); - - if (neigh) - write_unlock(&neigh->lock); + if (work) { + INIT_WORK(&work->work, rt6_probe_deferred); + work->target = rt->rt6i_gateway; + dev_hold(rt->dst.dev); + work->dev = rt->dst.dev; + schedule_work(&work->work); + } - if (work) { - INIT_WORK(&work->work, rt6_probe_deferred); - work->target = rt->rt6i_gateway; - dev_hold(rt->dst.dev); - work->dev = rt->dst.dev; - schedule_work(&work->work); - } - } else { out: - write_unlock(&neigh->lock); - } rcu_read_unlock_bh(); } #else @@ -622,6 +650,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, { int m; bool match_do_rr = false; + struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *dev = rt->dst.dev; + + if (dev && !netif_carrier_ok(dev) && + idev->cnf.ignore_routes_with_linkdown) + goto out; if (rt6_check_expired(rt)) goto out; @@ -652,15 +686,33 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, u32 metric, int oif, int strict, bool *do_rr) { - struct rt6_info *rt, *match; + struct rt6_info *rt, *match, *cont; int mpri = -1; match = NULL; - for (rt = rr_head; rt && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + cont = NULL; + for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + cont = rt; + break; + } + match = find_match(rt, oif, strict, &mpri, match, do_rr); - for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + } + + for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + cont = rt; + break; + } + + match = find_match(rt, oif, strict, &mpri, match, do_rr); + } + + if (match || !cont) + return match; + + for (rt = cont; rt; rt = rt->dst.rt6_next) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -694,6 +746,11 @@ static struct 
rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) return match ? match : net->ipv6.ip6_null_entry; } +static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); +} + #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) @@ -872,9 +929,9 @@ int ip6_ins_rt(struct rt6_info *rt) return __ip6_ins_rt(rt, &info, &mxc); } -static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_info *rt; @@ -882,15 +939,25 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, * Clone the route. */ - rt = ip6_rt_copy(ort, daddr); + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; - if (rt) { + rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0); + + if (!rt) + return NULL; + + ip6_rt_copy_init(rt, ort); + rt->rt6i_flags |= RTF_CACHE; + rt->rt6i_metric = 0; + rt->dst.flags |= DST_HOST; + rt->rt6i_dst.addr = *daddr; + rt->rt6i_dst.plen = 128; + + if (!rt6_is_gw_or_nonexthop(ort)) { if (ort->rt6i_dst.plen != 128 && ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; - - rt->rt6i_flags |= RTF_CACHE; - #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { rt->rt6i_src.addr = *saddr; @@ -902,35 +969,93 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, return rt; } -static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, - const struct in6_addr *daddr) +static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) { - struct rt6_info *rt = ip6_rt_copy(ort, daddr); + struct rt6_info *pcpu_rt; - if (rt) - rt->rt6i_flags |= RTF_CACHE; - return rt; + pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev), + rt->dst.dev, rt->dst.flags); + + if 
(!pcpu_rt) + return NULL; + ip6_rt_copy_init(pcpu_rt, rt); + pcpu_rt->rt6i_protocol = rt->rt6i_protocol; + pcpu_rt->rt6i_flags |= RTF_PCPU; + return pcpu_rt; +} + +/* It should be called with read_lock_bh(&tb6_lock) acquired */ +static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +{ + struct rt6_info *pcpu_rt, **p; + + p = this_cpu_ptr(rt->rt6i_pcpu); + pcpu_rt = *p; + + if (pcpu_rt) { + dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); + } + return pcpu_rt; +} + +static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) +{ + struct fib6_table *table = rt->rt6i_table; + struct rt6_info *pcpu_rt, *prev, **p; + + pcpu_rt = ip6_rt_pcpu_alloc(rt); + if (!pcpu_rt) { + struct net *net = dev_net(rt->dst.dev); + + dst_hold(&net->ipv6.ip6_null_entry->dst); + return net->ipv6.ip6_null_entry; + } + + read_lock_bh(&table->tb6_lock); + if (rt->rt6i_pcpu) { + p = this_cpu_ptr(rt->rt6i_pcpu); + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + dst_destroy(&pcpu_rt->dst); + pcpu_rt = prev; + } + } else { + /* rt has been removed from the fib6 tree + * before we have a chance to acquire the read_lock. + * In this case, don't brother to create a pcpu rt + * since rt is going away anyway. The next + * dst_check() will trigger a re-lookup. 
+ */ + dst_destroy(&pcpu_rt->dst); + pcpu_rt = rt; + } + dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); + read_unlock_bh(&table->tb6_lock); + return pcpu_rt; } static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt, *nrt; + struct rt6_info *rt; int strict = 0; - int attempts = 3; - int err; strict |= flags & RT6_LOOKUP_F_IFACE; if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; -redo_fib6_lookup_lock: read_lock_bh(&table->tb6_lock); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; + if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) + oif = 0; + redo_rt6_select: rt = rt6_select(fn, oif, strict); if (rt->rt6i_nsiblings) @@ -944,51 +1069,65 @@ redo_rt6_select: strict &= ~RT6_LOOKUP_F_REACHABLE; fn = saved_fn; goto redo_rt6_select; - } else { - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - goto out2; } } - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - if (rt->rt6i_flags & RTF_CACHE) - goto out2; + if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); - if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) - nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); - else if (!(rt->dst.flags & DST_HOST)) - nrt = rt6_alloc_clone(rt, &fl6->daddr); - else - goto out2; + rt6_dst_from_metrics_check(rt); + return rt; + } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && + !(rt->rt6i_flags & RTF_GATEWAY))) { + /* Create a RTF_CACHE clone which will not be + * owned by the fib6 tree. It is for the special case where + * the daddr in the skb during the neighbor look-up is different + * from the fl6->daddr used to look-up route here. + */ - ip6_rt_put(rt); - rt = nrt ? 
: net->ipv6.ip6_null_entry; + struct rt6_info *uncached_rt; - dst_hold(&rt->dst); - if (nrt) { - err = ip6_ins_rt(nrt); - if (!err) - goto out2; - } + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); - if (--attempts <= 0) - goto out2; + uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); + dst_release(&rt->dst); - /* - * Race condition! In the gap, when table->tb6_lock was - * released someone could insert this route. Relookup. - */ - ip6_rt_put(rt); - goto redo_fib6_lookup_lock; + if (uncached_rt) + rt6_uncached_list_add(uncached_rt); + else + uncached_rt = net->ipv6.ip6_null_entry; -out2: - rt->dst.lastuse = jiffies; - rt->dst.__use++; + dst_hold(&uncached_rt->dst); + return uncached_rt; - return rt; + } else { + /* Get a percpu copy */ + + struct rt6_info *pcpu_rt; + + rt->dst.lastuse = jiffies; + rt->dst.__use++; + pcpu_rt = rt6_get_pcpu_route(rt); + + if (pcpu_rt) { + read_unlock_bh(&table->tb6_lock); + } else { + /* We have to do the read_unlock first + * because rt6_make_pcpu_route() may trigger + * ip6_dst_gc() which will take the write_lock. 
+ */ + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); + pcpu_rt = rt6_make_pcpu_route(rt); + dst_release(&rt->dst); + } + + return pcpu_rt; + + } } static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, @@ -1012,8 +1151,9 @@ void ip6_route_input(struct sk_buff *skb) const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR; + struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { - .flowi6_iif = skb->dev->ifindex, + .flowi6_iif = l3mdev_fib_oif(skb->dev), .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), @@ -1021,6 +1161,10 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_proto = iph->nexthdr, }; + tun_info = skb_tunnel_info(skb); + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) + fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; + skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } @@ -1030,24 +1174,31 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, - struct flowi6 *fl6) +struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, + struct flowi6 *fl6, int flags) { - int flags = 0; + struct dst_entry *dst; + bool any_src; + + dst = l3mdev_rt6_dst_by_oif(net, fl6); + if (dst) + return dst; fl6->flowi6_iif = LOOPBACK_IFINDEX; - if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) + any_src = ipv6_addr_any(&fl6->saddr); + if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || + (fl6->flowi6_oif && any_src)) flags |= RT6_LOOKUP_F_IFACE; - if (!ipv6_addr_any(&fl6->saddr)) + if (!any_src) flags |= RT6_LOOKUP_F_HAS_SADDR; else if (sk) flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); } -EXPORT_SYMBOL(ip6_route_output); 
+EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { @@ -1056,25 +1207,20 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); if (rt) { - new = &rt->dst; - - memset(new + 1, 0, sizeof(*rt) - sizeof(*new)); - rt6_init_peer(rt, net->ipv6.peers); + rt6_info_init(rt); + new = &rt->dst; new->__use = 1; new->input = dst_discard; - new->output = dst_discard_sk; + new->output = dst_discard_out; - if (dst_metrics_read_only(&ort->dst)) - new->_metrics = ort->dst._metrics; - else - dst_copy_metrics(new, &ort->dst); + dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->rt6i_gateway = ort->rt6i_gateway; - rt->rt6i_flags = ort->rt6i_flags; + rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; rt->rt6i_metric = 0; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); @@ -1093,6 +1239,34 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ +static void rt6_dst_from_metrics_check(struct rt6_info *rt) +{ + if (rt->dst.from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); +} + +static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +{ + if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + return NULL; + + if (rt6_check_expired(rt)) + return NULL; + + return &rt->dst; +} + +static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) +{ + if (!__rt6_check_expired(rt) && + rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + return &rt->dst; + else + return NULL; +} + static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct rt6_info *rt; @@ -1103,13 +1277,14 @@ static 
struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) - return NULL; - if (rt6_check_expired(rt)) - return NULL; + rt6_dst_from_metrics_check(rt); - return dst; + if (rt->rt6i_flags & RTF_PCPU || + (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from)) + return rt6_dst_from_check(rt, cookie); + else + return rt6_check(rt, cookie); } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) @@ -1140,32 +1315,76 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { dst_hold(&rt->dst); - if (ip6_del_rt(rt)) - dst_free(&rt->dst); + ip6_del_rt(rt); } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; } } } -static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) +static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) +{ + struct net *net = dev_net(rt->dst.dev); + + rt->rt6i_flags |= RTF_MODIFIED; + rt->rt6i_pmtu = mtu; + rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); +} + +static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) +{ + return !(rt->rt6i_flags & RTF_CACHE) && + (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); +} + +static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, + const struct ipv6hdr *iph, u32 mtu) { struct rt6_info *rt6 = (struct rt6_info *)dst; - dst_confirm(dst); - if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { - struct net *net = dev_net(dst->dev); + if (rt6->rt6i_flags & RTF_LOCAL) + return; - rt6->rt6i_flags |= RTF_MODIFIED; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + dst_confirm(dst); + mtu = max_t(u32, mtu, IPV6_MIN_MTU); + if (mtu >= dst_mtu(dst)) + return; - dst_metric_set(dst, RTAX_MTU, mtu); - rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); + if 
(!rt6_cache_allowed_for_pmtu(rt6)) { + rt6_do_update_pmtu(rt6, mtu); + } else { + const struct in6_addr *daddr, *saddr; + struct rt6_info *nrt6; + + if (iph) { + daddr = &iph->daddr; + saddr = &iph->saddr; + } else if (sk) { + daddr = &sk->sk_v6_daddr; + saddr = &inet6_sk(sk)->saddr; + } else { + return; + } + nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); + if (nrt6) { + rt6_do_update_pmtu(nrt6, mtu); + + /* ip6_ins_rt(nrt6) will bump the + * rt6->rt6i_node->fn_sernum + * which will fail the next rt6_check() and + * invalidate the sk->sk_dst_cache. + */ + ip6_ins_rt(nrt6); + } } } +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); +} + void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark) { @@ -1182,7 +1401,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu)); + __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); @@ -1341,9 +1560,14 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) static unsigned int ip6_mtu(const struct dst_entry *dst) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + unsigned int mtu = rt->rt6i_pmtu; struct inet6_dev *idev; - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + goto out; + + mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; @@ -1373,7 +1597,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, if (unlikely(!idev)) return ERR_PTR(-ENODEV); - rt = ip6_dst_alloc(net, dev, 0, NULL); + rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); @@ -1472,6 +1696,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + bool ecn_ca = false; struct nlattr *nla; int 
remaining; u32 *mp; @@ -1485,51 +1710,57 @@ static int ip6_convert_metrics(struct mx6_config *mxc, nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (unlikely(type > RTAX_MAX)) + goto err; - if (type) { - u32 val; + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; - if (unlikely(type > RTAX_MAX)) + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) goto err; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + goto err; - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err; - } else { - val = nla_get_u32(nla); - } + mp[type - 1] = val; + __set_bit(type - 1, mxc->mx_valid); + } - mp[type - 1] = val; - __set_bit(type - 1, mxc->mx_valid); - } + if (ecn_ca) { + __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); + mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; } mxc->mx = mp; - return 0; err: kfree(mp); return -EINVAL; } -int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) +static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) { - int err; struct net *net = cfg->fc_nlinfo.nl_net; struct rt6_info *rt = NULL; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; int addr_type; + int err = -EINVAL; if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) - return -EINVAL; + goto out; #ifndef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len) - return -EINVAL; + goto out; #endif if (cfg->fc_ifindex) { err = -ENODEV; @@ -1559,7 +1790,8 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) if (!table) goto out; - rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table); + rt = ip6_dst_alloc(net, NULL, + (cfg->fc_flags & RTF_ADDRCONF) ? 
0 : DST_NOCOUNT); if (!rt) { err = -ENOMEM; @@ -1587,12 +1819,29 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) rt->dst.output = ip6_output; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, AF_INET6, cfg, + &lwtstate); + if (err) + goto out; + rt->dst.lwtstate = lwtstate_get(lwtstate); + if (lwtunnel_output_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_output = rt->dst.output; + rt->dst.output = lwtunnel_output; + } + if (lwtunnel_input_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_input = rt->dst.input; + rt->dst.input = lwtunnel_input; + } + } + ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) { + if (rt->rt6i_dst.plen == 128) rt->dst.flags |= DST_HOST; - dst_metrics_set_force_overwrite(&rt->dst); - } #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -1626,7 +1875,7 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) switch (cfg->fc_type) { case RTN_BLACKHOLE: rt->dst.error = -EINVAL; - rt->dst.output = dst_discard_sk; + rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: @@ -1635,9 +1884,11 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: + case RTN_UNREACHABLE: default: rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN - : -ENETUNREACH; + : (cfg->fc_type == RTN_UNREACHABLE) + ? 
-EHOSTUNREACH : -ENETUNREACH; rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; @@ -1650,9 +1901,21 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) int gwa_type; gw_addr = &cfg->fc_gateway; - rt->rt6i_gateway = *gw_addr; gwa_type = ipv6_addr_type(gw_addr); + /* if gw_addr is local we will fail to detect this in case + * address is still TENTATIVE (DAD in progress). rt6_lookup() + * will return already-added prefix route via interface that + * prefix route was assigned to, which might be non-loopback. + */ + err = -EINVAL; + if (ipv6_chk_addr_and_flags(net, gw_addr, + gwa_type & IPV6_ADDR_LINKLOCAL ? + dev : NULL, 0, 0)) + goto out; + + rt->rt6i_gateway = *gw_addr; + if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { struct rt6_info *grt; @@ -1663,7 +1926,6 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) (SIT, PtP, NBMA NOARP links) it is handy to allow some exceptions. --ANK */ - err = -EINVAL; if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; @@ -1718,9 +1980,7 @@ install_route: cfg->fc_nlinfo.nl_net = dev_net(dev); - *rt_ret = rt; - - return 0; + return rt; out: if (dev) dev_put(dev); @@ -1729,20 +1989,21 @@ out: if (rt) dst_free(&rt->dst); - *rt_ret = NULL; - - return err; + return ERR_PTR(err); } int ip6_route_add(struct fib6_config *cfg) { struct mx6_config mxc = { .mx = NULL, }; - struct rt6_info *rt = NULL; + struct rt6_info *rt; int err; - err = ip6_route_info_create(cfg, &rt); - if (err) + rt = ip6_route_info_create(cfg); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; goto out; + } err = ip6_convert_metrics(&mxc, cfg); if (err) @@ -1766,7 +2027,8 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.ip6_null_entry || + rt->dst.flags & DST_NOCACHE) { err = -ENOENT; goto out; } @@ -1808,6 +2070,9 @@ static int 
ip6_route_del(struct fib6_config *cfg) if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if ((rt->rt6i_flags & RTF_CACHE) && + !(cfg->fc_flags & RTF_CACHE)) + continue; if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -1830,7 +2095,6 @@ static int ip6_route_del(struct fib6_config *cfg) static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; struct ndisc_options ndopts; @@ -1891,7 +2155,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu } rt = (struct rt6_info *) dst; - if (rt == net->ipv6.ip6_null_entry) { + if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; } @@ -1917,7 +2181,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NEIGH_UPDATE_F_ISROUTER)) ); - nrt = ip6_rt_copy(rt, &msg->dest); + nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); if (!nrt) goto out; @@ -1949,42 +2213,36 @@ out: * Misc support functions */ -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest) +static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - struct net *net = dev_net(ort->dst.dev); - struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0, - ort->rt6i_table); + BUG_ON(from->dst.from); - if (rt) { - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; - rt->dst.flags |= DST_HOST; - - rt->rt6i_dst.addr = *dest; - rt->rt6i_dst.plen = 128; - dst_copy_metrics(&rt->dst, &ort->dst); - rt->dst.error = ort->dst.error; - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); - rt->dst.lastuse = jiffies; - - if (ort->rt6i_flags & RTF_GATEWAY) - rt->rt6i_gateway = ort->rt6i_gateway; - else - rt->rt6i_gateway = *dest; - rt->rt6i_flags = ort->rt6i_flags; - 
rt6_set_from(rt, ort); - rt->rt6i_metric = 0; + rt->rt6i_flags &= ~RTF_EXPIRES; + dst_hold(&from->dst); + rt->dst.from = &from->dst; + dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); +} +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +{ + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->rt6i_dst = ort->rt6i_dst; + rt->dst.error = ort->dst.error; + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->dst.lastuse = jiffies; + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; + rt6_set_from(rt, ort); + rt->rt6i_metric = ort->rt6i_metric; #ifdef CONFIG_IPV6_SUBTREES - memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); + rt->rt6i_src = ort->rt6i_src; #endif - memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); - rt->rt6i_table = ort->rt6i_table; - } - return rt; + rt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt->rt6i_table = ort->rt6i_table; + rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -2026,7 +2284,6 @@ static struct rt6_info *rt6_add_route_info(struct net *net, unsigned int pref) { struct fib6_config cfg = { - .fc_table = RT6_TABLE_INFO, .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = ifindex, .fc_dst_len = prefixlen, @@ -2037,6 +2294,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_nlinfo.nl_net = net, }; + cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO; cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; @@ -2077,7 +2335,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, unsigned int pref) { struct fib6_config cfg = { - .fc_table = RT6_TABLE_DFLT, + .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | @@ -2124,7 +2382,8 @@ static void rtmsg_to_fib6_config(struct net *net, { memset(cfg, 0, sizeof(*cfg)); - cfg->fc_table = RT6_TABLE_MAIN; + cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? + : RT6_TABLE_MAIN; cfg->fc_ifindex = rtmsg->rtmsg_ifindex; cfg->fc_metric = rtmsg->rtmsg_metric; cfg->fc_expires = rtmsg->rtmsg_info; @@ -2208,7 +2467,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb) +static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); @@ -2219,7 +2478,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb) +static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); @@ -2233,9 +2492,10 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, const struct in6_addr *addr, bool anycast) { + u32 tb_id; struct net *net = dev_net(idev->dev); struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, - DST_NOCOUNT, NULL); + DST_NOCOUNT); if (!rt) return ERR_PTR(-ENOMEM); @@ -2255,7 +2515,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; - rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); + tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; + rt->rt6i_table = fib6_get_table(net, tb_id); + rt->dst.flags |= DST_NOCACHE; atomic_set(&rt->dst.__refcnt, 1); @@ -2359,6 +2621,8 @@ void rt6_ifdown(struct net *net, struct net_device *dev) fib6_clean_all(net, fib6_ifdown, &adn); icmp6_clean_all(fib6_ifdown, &adn); + if (dev) + rt6_uncached_list_flush_dev(net, dev); } struct rt6_mtu_change_arg { @@ -2396,11 +2660,20 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) PMTU discouvery. */ if (rt->dst.dev == arg->dev && - !dst_metric_locked(&rt->dst, RTAX_MTU) && - (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6))) { - dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + !dst_metric_locked(&rt->dst, RTAX_MTU)) { + if (rt->rt6i_flags & RTF_CACHE) { + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. + */ + if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) + rt->rt6i_pmtu = arg->mtu; + } else if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + } } return 0; } @@ -2423,6 +2696,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_PREF] = { .type = NLA_U8 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2457,6 +2732,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (rtm->rtm_type == RTN_LOCAL) cfg->fc_flags |= RTF_LOCAL; + if (rtm->rtm_flags & RTM_F_CLONED) + cfg->fc_flags |= RTF_CACHE; + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); @@ -2514,6 +2792,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct 
nlmsghdr *nlh, cfg->fc_flags |= RTF_PREF(pref); } + if (tb[RTA_ENCAP]) + cfg->fc_encap = tb[RTA_ENCAP]; + + if (tb[RTA_ENCAP_TYPE]) + cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + err = 0; errout: return err; @@ -2605,11 +2889,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + r_cfg.fc_encap_type = nla_get_u16(nla); } - err = ip6_route_info_create(&r_cfg, &rt); - if (err) + rt = ip6_route_info_create(&r_cfg); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; goto cleanup; + } err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { @@ -2658,8 +2949,7 @@ cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) dst_free(&nh->rt6_info->dst); - if (nh->mxc.mx) - kfree(nh->mxc.mx); + kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); } @@ -2734,7 +3024,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return ip6_route_add(&cfg); } -static inline size_t rt6_nlmsg_size(void) +static inline size_t rt6_nlmsg_size(struct rt6_info *rt) { return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ @@ -2748,7 +3038,8 @@ static inline size_t rt6_nlmsg_size(void) + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ - + nla_total_size(1); /* RTA_PREF */ + + nla_total_size(1) /* RTA_PREF */ + + lwtunnel_get_encap_size(rt->dst.lwtstate); } static int rt6_fill_node(struct net *net, @@ -2757,6 +3048,7 @@ static int rt6_fill_node(struct net *net, int iif, int type, u32 portid, u32 seq, int prefix, int nowait, unsigned int flags) { + u32 metrics[RTAX_MAX]; struct rtmsg *rtm; struct nlmsghdr *nlh; long expires; @@ -2808,6 +3100,11 @@ static int rt6_fill_node(struct net *net, else rtm->rtm_type 
= RTN_UNICAST; rtm->rtm_flags = 0; + if (!netif_carrier_ok(rt->dst.dev)) { + rtm->rtm_flags |= RTNH_F_LINKDOWN; + if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + rtm->rtm_flags |= RTNH_F_DEAD; + } rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) @@ -2870,7 +3167,10 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt6i_pmtu) + metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (rt->rt6i_flags & RTF_GATEWAY) { @@ -2892,6 +3192,8 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; + lwtunnel_fill_encap(skb, rt->dst.lwtstate); + nlmsg_end(skb, nlh); return 0; @@ -2977,6 +3279,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) } else { fl6.flowi6_oif = oif; + if (netif_index_is_l3_master(net, oif)) { + fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | + FLOWI_FLAG_SKIP_NH_OIF; + } + rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } @@ -3008,7 +3315,8 @@ errout: return err; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int nlm_flags) { struct sk_buff *skb; struct net *net = info->nl_net; @@ -3018,12 +3326,12 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) err = -ENOBUFS; seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, 0, 0, 0); + event, info->portid, seq, 0, 0, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -3365,6 +3673,7 @@ static struct notifier_block ip6_route_dev_notifier = { int __init ip6_route_init(void) { int ret; + int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = @@ -3424,6 +3733,13 @@ int __init ip6_route_init(void) if (ret) goto out_register_late_subsys; + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + + INIT_LIST_HEAD(&ul->head); + spin_lock_init(&ul->lock); + } + out: return ret; diff --git a/kernel/net/ipv6/sit.c b/kernel/net/ipv6/sit.c index ac35a2859..dcccae861 100644 --- a/kernel/net/ipv6/sit.c +++ b/kernel/net/ipv6/sit.c @@ -742,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb) goto drop; if (iptunnel_pull_header(skb, 0, tpi.proto)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } return 1; @@ -1394,34 +1394,20 @@ static int ipip6_tunnel_init(struct net_device *dev) return 0; } -static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) +static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); - tunnel->dev = dev; - tunnel->net = dev_net(dev); - iph->version = 4; iph->protocol = IPPROTO_IPV6; iph->ihl = 5; iph->ttl = 64; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { - free_percpu(dev->tstats); - return 
-ENOMEM; - } - dev_hold(dev); rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); - return 0; } static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -1831,23 +1817,19 @@ static int __net_init sit_init_net(struct net *net) */ sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; - err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); - if (err) - goto err_dev_free; - - ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); err = register_netdev(sitn->fb_tunnel_dev); if (err) goto err_reg_dev; + ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); + ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); + t = netdev_priv(sitn->fb_tunnel_dev); strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; err_reg_dev: - dev_put(sitn->fb_tunnel_dev); -err_dev_free: ipip6_dev_free(sitn->fb_tunnel_dev); err_alloc_dev: return err; diff --git a/kernel/net/ipv6/syncookies.c b/kernel/net/ipv6/syncookies.c index 21bc2eb53..eaf7ac496 100644 --- a/kernel/net/ipv6/syncookies.c +++ b/kernel/net/ipv6/syncookies.c @@ -41,23 +41,6 @@ static __u16 const msstab[] = { 9000 - 60, }; -static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct sock *child; - - child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); - if (child) { - atomic_set(&req->rsk_refcnt, 1); - inet_csk_reqsk_queue_add(sk, req, child); - } else { - reqsk_free(req); - } - return child; -} - static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); @@ -131,14 +114,11 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, } EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); -__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) +__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - tcp_synq_overflow(sk); - 
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return __cookie_v6_init_sequence(iph, th, mssp); } @@ -190,7 +170,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk); + req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false); if (!req) goto out; @@ -227,7 +207,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; + treq->snt_synack.v64 = 0; treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; @@ -242,7 +222,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(&fl6, np->opt, &final); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = ireq->ir_mark; @@ -255,16 +235,16 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out_free; } - req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? 
:dst_metric(dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); - ret = get_cookie_sock(sk, skb, req, dst); + ret = tcp_get_cookie_sock(sk, skb, req, dst); out: return ret; out_free: diff --git a/kernel/net/ipv6/sysctl_net_ipv6.c b/kernel/net/ipv6/sysctl_net_ipv6.c index abcc79f64..45243bbe5 100644 --- a/kernel/net/ipv6/sysctl_net_ipv6.c +++ b/kernel/net/ipv6/sysctl_net_ipv6.c @@ -17,6 +17,9 @@ #include static int one = 1; +static int auto_flowlabels_min; +static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; + static struct ctl_table ipv6_table_template[] = { { @@ -45,7 +48,9 @@ static struct ctl_table ipv6_table_template[] = { .data = &init_net.ipv6.sysctl.auto_flowlabels, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &auto_flowlabels_min, + .extra2 = &auto_flowlabels_max }, { .procname = "fwmark_reflect", @@ -68,6 +73,20 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "flowlabel_state_ranges", + .data = &init_net.ipv6.sysctl.flowlabel_state_ranges, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_nonlocal_bind", + .data = &init_net.ipv6.sysctl.ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -109,6 +128,8 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect; ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries; ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay; + ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; + ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; 
ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) diff --git a/kernel/net/ipv6/tcp_ipv6.c b/kernel/net/ipv6/tcp_ipv6.c index e541d68db..b8d405623 100644 --- a/kernel/net/ipv6/tcp_ipv6.c +++ b/kernel/net/ipv6/tcp_ipv6.c @@ -70,8 +70,8 @@ #include #include -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); +static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); @@ -82,7 +82,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #else -static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr) { return NULL; @@ -93,14 +93,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { + if (dst && dst_hold_safe(dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst; - dst_hold(dst); sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - if (rt->rt6i_node) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; + inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); } } @@ -121,7 +119,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; - struct rt6_info *rt; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -237,7 +235,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - 
final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); @@ -257,18 +256,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); - rt = (struct rt6_info *) dst; if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && - ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr)) + ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr)) tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; - if (np->opt) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -279,7 +277,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) goto late_failure; - ip6_set_txhash(sk); + sk_set_txhash(sk); if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, @@ -330,6 +328,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; + bool fatal; int err; sk = __inet6_lookup_established(net, &tcp_hashinfo, @@ -348,8 +347,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } seq = ntohl(th->seq); + fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, fatal); bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -403,7 +403,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - icmpv6_err_convert(type, code, &err); /* Might be for an request_sock */ 
switch (sk->sk_state) { @@ -437,11 +436,11 @@ out: } -static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, +static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -450,10 +449,11 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, int err = -ENOMEM; /* First, grab a route. */ - if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) + if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, + IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, @@ -463,8 +463,10 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - skb_set_queue_mapping(skb, queue_mapping); - err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); + rcu_read_lock(); + err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), + np->tclass); + rcu_read_unlock(); err = net_xmit_eval(err); } @@ -479,13 +481,13 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req) } #ifdef CONFIG_TCP_MD5SIG -static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr) { return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6); } -static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk, +static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); @@ -624,8 +626,12 @@ clear_hash_noput: return 1; } -static bool tcp_v6_inbound_md5_hash(struct sock 
*sk, const struct sk_buff *skb) +#endif + +static bool tcp_v6_inbound_md5_hash(const struct sock *sk, + const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -662,26 +668,27 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) &ip6h->daddr, ntohs(th->dest)); return true; } +#endif return false; } -#endif -static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, +static void tcp_v6_init_req(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; /* So that link locals have meaning */ - if (!sk->sk_bound_dev_if && + if (!sk_listener->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && - (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { @@ -690,13 +697,14 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, } } -static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, +static struct dst_entry *tcp_v6_route_req(const struct sock *sk, + struct flowi *fl, const struct request_sock *req, bool *strict) { if (strict) *strict = true; - return inet6_csk_route_req(sk, &fl->u.ip6, req); + return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { @@ -723,10 +731,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .route_req = 
tcp_v6_route_req, .init_seq = tcp_v6_init_sequence, .send_synack = tcp_v6_send_synack, - .queue_hash_add = inet6_csk_reqsk_queue_hash_add, }; -static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq, +static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int rst, u8 tclass, u32 label) @@ -825,7 +832,7 @@ static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq, kfree_skb(buff); } -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); u32 seq = 0, ack_seq = 0; @@ -896,7 +903,7 @@ release_sk1: #endif } -static void tcp_v6_send_ack(struct sock *sk, struct sk_buff *skb, u32 seq, +static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, u32 label) @@ -919,7 +926,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) inet_twsk_put(tw); } -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV @@ -927,44 +934,18 @@ static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); } -static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - /* Find possible connection requests. */ - req = inet6_csk_search_req(sk, th->source, - &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, - &ipv6_hdr(skb)->saddr, th->source, - &ipv6_hdr(skb)->daddr, ntohs(th->dest), - tcp_v6_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v6_check(sk, skb); #endif @@ -987,12 +968,16 @@ drop: return 0; /* don't send reset */ } -static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq; - struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct ipv6_pinfo *newnp; + const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; @@ -1007,7 +992,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, * v6 mapped */ - newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst); + newsk = tcp_v4_syn_recv_sock(sk, 
skb, req, dst, + req_unhash, own_req); if (!newsk) return NULL; @@ -1060,7 +1046,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto out_overflow; if (!dst) { - dst = inet6_csk_route_req(sk, &fl6, req); + dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) goto out; } @@ -1076,7 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ newsk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; @@ -1093,8 +1079,6 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; - ip6_set_txhash(newsk); - /* Now IPv6 options... First: no IPv4 options. @@ -1106,16 +1090,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; - /* Clone pktoptions received with SYN */ newnp->pktoptions = NULL; - if (ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_atomic(sk, GFP_ATOMIC)); - consume_skb(ireq->pktopts); - ireq->pktopts = NULL; - if (newnp->pktoptions) - skb_set_owner_r(newnp->pktoptions, newsk); - } newnp->opt = NULL; newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; @@ -1129,13 +1104,15 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, but we make one more one thing there: reattach optmem to newsk. 
*/ - if (np->opt) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); @@ -1170,7 +1147,20 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_done(newsk); goto out; } - __inet_hash(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + if (*own_req) { + tcp_move_syn(newtp, req); + + /* Clone pktoptions received with SYN, if we own the req */ + if (ireq->pktopts) { + newnp->pktoptions = skb_clone(ireq->pktopts, + sk_gfp_atomic(sk, GFP_ATOMIC)); + consume_skb(ireq->pktopts); + ireq->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + } return newsk; @@ -1184,7 +1174,7 @@ out: } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1251,22 +1241,18 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v6_hnd_req(sk, skb); + struct sock *nsk = tcp_v6_cookie_check(sk, skb); + if (!nsk) goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. 
- */ if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) @@ -1276,7 +1262,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) + if (tcp_rcv_state_process(sk, skb)) goto reset; if (opt_skb) goto ipv6_pktoptions; @@ -1390,6 +1376,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); +lookup: sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, inet6_iif(skb)); if (!sk) @@ -1399,6 +1386,37 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk; + + sk = req->rsk_listener; + tcp_v6_fill_cb(skb, hdr, th); + if (tcp_v6_inbound_md5_hash(sk, skb)) { + reqsk_put(req); + goto discard_it; + } + if (unlikely(sk->sk_state != TCP_LISTEN)) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sock_hold(sk); + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_and_relse; + } + if (nsk == sk) { + reqsk_put(req); + tcp_v6_restore_cb(skb); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v6_send_reset(nsk, skb); + goto discard_and_relse; + } else { + sock_put(sk); + return 0; + } + } if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1409,18 +1427,23 @@ process: tcp_v6_fill_cb(skb, hdr, th); -#ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v6_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); + tcp_sk(sk)->segs_in += 
max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) @@ -1433,6 +1456,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret ? -1 : 0; @@ -1442,7 +1466,7 @@ no_tcp_socket: tcp_v6_fill_cb(skb, hdr, th); - if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + if (tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: @@ -1467,10 +1491,6 @@ do_time_wait: tcp_v6_fill_cb(skb, hdr, th); - if (skb->len < (th->doff<<2)) { - inet_twsk_put(inet_twsk(sk)); - goto bad_packet; - } if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; @@ -1487,8 +1507,7 @@ do_time_wait: ntohs(th->dest), tcp_v6_iif(skb)); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); goto process; @@ -1638,7 +1657,7 @@ static void tcp_v6_destroy_sock(struct sock *sk) #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. 
*/ static void get_openreq6(struct seq_file *seq, - struct request_sock *req, int i, kuid_t uid) + const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; @@ -1662,7 +1681,8 @@ static void get_openreq6(struct seq_file *seq, 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, - from_kuid_munged(seq_user_ns(seq), uid), + from_kuid_munged(seq_user_ns(seq), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -1677,7 +1697,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); - struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; + const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; + int rx_queue; + int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; @@ -1698,6 +1720,15 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) timer_expires = jiffies; } + state = sk_state_load(sp); + if (state == TCP_LISTEN) + rx_queue = sp->sk_ack_backlog; + else + /* Because we don't lock the socket, + * we might find a transient negative value. + */ + rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", @@ -1706,9 +1737,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - tp->write_seq-tp->snd_una, - (sp->sk_state == TCP_LISTEN) ? 
sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), + state, + tp->write_seq - tp->snd_una, + rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, @@ -1720,8 +1751,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - sp->sk_state == TCP_LISTEN ? - (fastopenq ? fastopenq->max_qlen : 0) : + state == TCP_LISTEN ? + fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } @@ -1767,18 +1798,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait6_sock(seq, v, st->num); - else - get_tcp6_sock(seq, v, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: - get_openreq6(seq, v, st->num, st->uid); - break; - } + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait6_sock(seq, v, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + get_openreq6(seq, v, st->num); + else + get_tcp6_sock(seq, v, st->num); out: return 0; } diff --git a/kernel/net/ipv6/tunnel6.c b/kernel/net/ipv6/tunnel6.c index 3c758007b..dae25cad0 100644 --- a/kernel/net/ipv6/tunnel6.c +++ b/kernel/net/ipv6/tunnel6.c @@ -144,6 +144,16 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, break; } +static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_tunnel *handler; + + for_each_tunnel_rcu(tunnel46_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + static const struct inet6_protocol tunnel6_protocol = { .handler = tunnel6_rcv, .err_handler = tunnel6_err, @@ -152,7 +162,7 @@ static const struct inet6_protocol tunnel6_protocol = { static const struct inet6_protocol tunnel46_protocol = { .handler = 
tunnel46_rcv, - .err_handler = tunnel6_err, + .err_handler = tunnel46_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; diff --git a/kernel/net/ipv6/udp.c b/kernel/net/ipv6/udp.c index e51fc3eee..9da3287a3 100644 --- a/kernel/net/ipv6/udp.c +++ b/kernel/net/ipv6/udp.c @@ -182,10 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net, score++; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } -#define SCORE2_MAX (1 + 1 + 1) static inline int compute_score2(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, @@ -223,6 +225,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, score++; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } @@ -251,8 +256,7 @@ begin: hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); matches = 1; - } else if (score == SCORE2_MAX) - goto exact_match; + } } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -269,7 +273,6 @@ begin: goto begin; if (result) { -exact_match: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, @@ -1107,6 +1110,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; + struct ipv6_txoptions *opt_to_free = NULL; struct ip6_flowlabel *flowlabel = NULL; struct flowi6 fl6; struct dst_entry *dst; @@ -1260,8 +1264,10 @@ do_udp_sendmsg: opt = NULL; connected = 0; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -1370,6 +1376,7 @@ release_dst: out: dst_release(dst); fl6_sock_release(flowlabel); + 
txopt_put(opt_to_free); if (!err) return len; /* @@ -1496,7 +1503,8 @@ int __net_init udp6_proc_init(struct net *net) return udp_proc_register(net, &udp6_seq_afinfo); } -void udp6_proc_exit(struct net *net) { +void udp6_proc_exit(struct net *net) +{ udp_proc_unregister(net, &udp6_seq_afinfo); } #endif /* CONFIG_PROC_FS */ diff --git a/kernel/net/ipv6/xfrm6_input.c b/kernel/net/ipv6/xfrm6_input.c index 74bd17882..0eaab1fa6 100644 --- a/kernel/net/ipv6/xfrm6_input.c +++ b/kernel/net/ipv6/xfrm6_input.c @@ -42,8 +42,8 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) ipv6_hdr(skb)->payload_len = htons(skb->len); __skb_push(skb, skb->data - skb_network_header(skb)); - NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, ip6_rcv_finish); return -1; } diff --git a/kernel/net/ipv6/xfrm6_mode_tunnel.c b/kernel/net/ipv6/xfrm6_mode_tunnel.c index 901ef6f8a..372855eea 100644 --- a/kernel/net/ipv6/xfrm6_mode_tunnel.c +++ b/kernel/net/ipv6/xfrm6_mode_tunnel.c @@ -20,11 +20,10 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) { - const struct ipv6hdr *outer_iph = ipv6_hdr(skb); struct ipv6hdr *inner_iph = ipipv6_hdr(skb); - if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) - IP6_ECN_set_ce(inner_iph); + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) + IP6_ECN_set_ce(skb, inner_iph); } /* Add encapsulation header. 
diff --git a/kernel/net/ipv6/xfrm6_output.c b/kernel/net/ipv6/xfrm6_output.c index 09c76a7b4..4d09ce6fa 100644 --- a/kernel/net/ipv6/xfrm6_output.c +++ b/kernel/net/ipv6/xfrm6_output.c @@ -79,6 +79,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) if (!skb->ignore_df && skb->len > mtu) { skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); if (xfrm6_local_dontfrag(skb)) xfrm6_local_rxpmtu(skb, mtu); @@ -131,44 +132,57 @@ int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) return xfrm_output(sk, skb); } -static int __xfrm6_output(struct sock *sk, struct sk_buff *skb) +static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct xfrm_state *x = skb_dst(skb)->xfrm; + + return x->outer_mode->afinfo->output_finish(sk, skb); +} + +static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; int mtu; + bool toobig; #ifdef CONFIG_NETFILTER if (!x) { IP6CB(skb)->flags |= IP6SKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif + if (x->props.mode != XFRM_MODE_TUNNEL) + goto skip_frag; + if (skb->protocol == htons(ETH_P_IPV6)) mtu = ip6_skb_dst_mtu(skb); else mtu = dst_mtu(skb_dst(skb)); - if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { + toobig = skb->len > mtu && !skb_is_gso(skb); + + if (toobig && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); return -EMSGSIZE; - } else if (!skb->ignore_df && skb->len > mtu && skb->sk) { + } else if (!skb->ignore_df && toobig && skb->sk) { xfrm_local_error(skb, mtu); return -EMSGSIZE; } - if (x->props.mode == XFRM_MODE_TUNNEL && - ((skb->len > mtu && !skb_is_gso(skb)) || - dst_allfrag(skb_dst(skb)))) { - return ip6_fragment(sk, skb, - x->outer_mode->afinfo->output_finish); - } + if (toobig || dst_allfrag(skb_dst(skb))) + return ip6_fragment(net, sk, skb, + __xfrm6_output_finish); + +skip_frag: return 
x->outer_mode->afinfo->output_finish(sk, skb); } -int xfrm6_output(struct sock *sk, struct sk_buff *skb) +int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb, - NULL, skb_dst(skb)->dev, __xfrm6_output, + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb_dst(skb)->dev, + __xfrm6_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/kernel/net/ipv6/xfrm6_policy.c b/kernel/net/ipv6/xfrm6_policy.c index f337a908a..c074771a1 100644 --- a/kernel/net/ipv6/xfrm6_policy.c +++ b/kernel/net/ipv6/xfrm6_policy.c @@ -20,13 +20,14 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6_MIP6) #include #endif static struct xfrm_policy_afinfo xfrm6_policy_afinfo; -static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, +static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { @@ -35,6 +36,8 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int err; memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = oif; + fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); @@ -50,13 +53,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, return dst; } -static int xfrm6_get_saddr(struct net *net, +static int xfrm6_get_saddr(struct net *net, int oif, xfrm_address_t *saddr, xfrm_address_t *daddr) { struct dst_entry *dst; struct net_device *dev; - dst = xfrm6_dst_lookup(net, 0, NULL, daddr); + dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr); if (IS_ERR(dst)) return -EHOSTUNREACH; @@ -71,20 +74,12 @@ static int xfrm6_get_tos(const struct flowi *fl) return 0; } -static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) -{ - struct rt6_info *rt = (struct rt6_info *)xdst; - - rt6_init_peer(rt, net->ipv6.peers); -} - static int 
xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_node) - path->path_cookie = rt->rt6i_node->fn_sernum; + path->path_cookie = rt6_get_cookie(rt); } path->u.rt6.rt6i_nfheader_len = nfheader_len; @@ -106,16 +101,13 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, return -ENODEV; } - rt6_transfer_peer(&xdst->u.rt6, rt); - /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); xdst->u.rt6.rt6i_metric = rt->rt6i_metric; xdst->u.rt6.rt6i_node = rt->rt6i_node; - if (rt->rt6i_node) - xdst->route_cookie = rt->rt6i_node->fn_sernum; + xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; @@ -142,7 +134,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) nexthdr = nh[nhoff]; if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; + oif = l3mdev_fib_oif(skb_dst(skb)->dev); memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; @@ -185,7 +177,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) return; case IPPROTO_ICMPV6: - if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) { + if (!onlyproto && (nh + offset + 2 < skb->data || + pskb_may_pull(skb, nh + offset + 2 - skb->data))) { u8 *icmp; nh = skb_network_header(skb); @@ -199,7 +192,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: offset += ipv6_optlen(exthdr); - if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) { + if (!onlyproto && (nh + offset + 3 < skb->data || + pskb_may_pull(skb, nh + offset + 3 - skb->data))) { struct ip6_mh *mh; nh = skb_network_header(skb); @@ -255,10 +249,6 @@ 
static void xfrm6_dst_destroy(struct dst_entry *dst) if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); dst_destroy_metrics_generic(dst); - if (rt6_has_peer(&xdst->u.rt6)) { - struct inet_peer *peer = rt6_peer_ptr(&xdst->u.rt6); - inet_putpeer(peer); - } xfrm_dst_destroy(xdst); } @@ -289,7 +279,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xfrm_dst_ifdown(dst, dev); } -static struct dst_ops xfrm6_dst_ops = { +static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, @@ -298,17 +288,16 @@ static struct dst_ops xfrm6_dst_ops = { .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, - .gc_thresh = 32768, + .gc_thresh = INT_MAX, }; static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, - .dst_ops = &xfrm6_dst_ops, + .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, - .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, @@ -336,7 +325,7 @@ static struct ctl_table xfrm6_policy_table[] = { { } }; -static int __net_init xfrm6_net_init(struct net *net) +static int __net_init xfrm6_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -364,7 +353,7 @@ err_alloc: return -ENOMEM; } -static void __net_exit xfrm6_net_exit(struct net *net) +static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { struct ctl_table *table; @@ -376,24 +365,52 @@ static void __net_exit xfrm6_net_exit(struct net *net) if (!net_eq(net, &init_net)) kfree(table); } +#else /* CONFIG_SYSCTL */ +static int inline xfrm6_net_sysctl_init(struct net *net) +{ + return 0; +} + +static void inline xfrm6_net_sysctl_exit(struct net *net) +{ +} +#endif + +static int __net_init xfrm6_net_init(struct net *net) +{ 
+ int ret; + + memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template, + sizeof(xfrm6_dst_ops_template)); + ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops); + if (ret) + return ret; + + ret = xfrm6_net_sysctl_init(net); + if (ret) + dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); + + return ret; +} + +static void __net_exit xfrm6_net_exit(struct net *net) +{ + xfrm6_net_sysctl_exit(net); + dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); +} static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, }; -#endif int __init xfrm6_init(void) { int ret; - dst_entries_init(&xfrm6_dst_ops); - ret = xfrm6_policy_init(); - if (ret) { - dst_entries_destroy(&xfrm6_dst_ops); + if (ret) goto out; - } ret = xfrm6_state_init(); if (ret) goto out_policy; @@ -402,9 +419,7 @@ int __init xfrm6_init(void) if (ret) goto out_state; -#ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm6_net_ops); -#endif out: return ret; out_state: @@ -416,11 +431,8 @@ out_policy: void xfrm6_fini(void) { -#ifdef CONFIG_SYSCTL unregister_pernet_subsys(&xfrm6_net_ops); -#endif xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); - dst_entries_destroy(&xfrm6_dst_ops); } diff --git a/kernel/net/ipx/af_ipx.c b/kernel/net/ipx/af_ipx.c index 4ea5d7497..48d0dc89b 100644 --- a/kernel/net/ipx/af_ipx.c +++ b/kernel/net/ipx/af_ipx.c @@ -1347,7 +1347,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto); + sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto, kern); if (!sk) goto out; diff --git a/kernel/net/irda/af_irda.c b/kernel/net/irda/af_irda.c index ee0ea25c8..923abd6b3 100644 --- a/kernel/net/irda/af_irda.c +++ b/kernel/net/irda/af_irda.c @@ -1086,6 +1086,9 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; struct irda_sock *self; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (net != &init_net) 
return -EAFNOSUPPORT; @@ -1100,7 +1103,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, } /* Allocate networking socket */ - sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto); + sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto, kern); if (sk == NULL) return -ENOMEM; @@ -2123,8 +2126,7 @@ static int irda_setsockopt(struct socket *sock, int level, int optname, } /* Unregister any old registration */ - if (self->skey) - irlmp_unregister_service(self->skey); + irlmp_unregister_service(self->skey); self->skey = irlmp_register_service((__u16) opt); break; diff --git a/kernel/net/irda/ircomm/ircomm_tty.c b/kernel/net/irda/ircomm/ircomm_tty.c index 683346d2d..a4237707f 100644 --- a/kernel/net/irda/ircomm/ircomm_tty.c +++ b/kernel/net/irda/ircomm/ircomm_tty.c @@ -335,8 +335,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, * specified, we cannot return before the IrCOMM link is * ready */ - if (!test_bit(ASYNCB_CLOSING, &port->flags) && - (do_clocal || tty_port_carrier_raised(port)) && + if ((do_clocal || tty_port_carrier_raised(port)) && self->state == IRCOMM_TTY_READY) { break; @@ -443,34 +442,6 @@ static int ircomm_tty_open(struct tty_struct *tty, struct file *filp) /* Not really used by us, but lets do it anyway */ self->port.low_latency = (self->port.flags & ASYNC_LOW_LATENCY) ? 1 : 0; - /* - * If the port is the middle of closing, bail out now - */ - if (test_bit(ASYNCB_CLOSING, &self->port.flags)) { - - /* Hm, why are we blocking on ASYNC_CLOSING if we - * do return -EAGAIN/-ERESTARTSYS below anyway? - * IMHO it's either not needed in the first place - * or for some reason we need to make sure the async - * closing has been finished - if so, wouldn't we - * probably better sleep uninterruptible? 
- */ - - if (wait_event_interruptible(self->port.close_wait, - !test_bit(ASYNCB_CLOSING, &self->port.flags))) { - net_warn_ratelimited("%s - got signal while blocking on ASYNC_CLOSING!\n", - __func__); - return -ERESTARTSYS; - } - -#ifdef SERIAL_DO_RESTART - return (self->port.flags & ASYNC_HUP_NOTIFY) ? - -EAGAIN : -ERESTARTSYS; -#else - return -EAGAIN; -#endif - } - /* Check if this is a "normal" ircomm device, or an irlpt device */ if (self->line < 0x10) { self->service_type = IRCOMM_3_WIRE | IRCOMM_9_WIRE; diff --git a/kernel/net/irda/irlmp.c b/kernel/net/irda/irlmp.c index a26c401ef..43964594a 100644 --- a/kernel/net/irda/irlmp.c +++ b/kernel/net/irda/irlmp.c @@ -1839,7 +1839,7 @@ static void *irlmp_seq_hb_idx(struct irlmp_iter_state *iter, loff_t *off) for (element = hashbin_get_first(iter->hashbin); element != NULL; element = hashbin_get_next(iter->hashbin)) { - if (!off || *off-- == 0) { + if (!off || (*off)-- == 0) { /* NB: hashbin left locked */ return element; } diff --git a/kernel/net/irda/timer.c b/kernel/net/irda/timer.c index 0c4c115a5..f2280f73b 100644 --- a/kernel/net/irda/timer.c +++ b/kernel/net/irda/timer.c @@ -60,8 +60,8 @@ void irlap_start_query_timer(struct irlap_cb *self, int S, int s) * to avoid messing with for incoming connections requests and * to accommodate devices that perform discovery slower than us. * Jean II */ - timeout = ((sysctl_slot_timeout * HZ / 1000) * (S - s) - + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT); + timeout = msecs_to_jiffies(sysctl_slot_timeout) * (S - s) + + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT; /* Set or re-set the timer. 
We reset the timer for each received * discovery query, which allow us to automatically adjust to diff --git a/kernel/net/iucv/af_iucv.c b/kernel/net/iucv/af_iucv.c index 6daa52a18..20ab7b2ec 100644 --- a/kernel/net/iucv/af_iucv.c +++ b/kernel/net/iucv/af_iucv.c @@ -95,11 +95,10 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify); /* Call Back functions */ static void iucv_callback_rx(struct iucv_path *, struct iucv_message *); static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *); -static void iucv_callback_connack(struct iucv_path *, u8 ipuser[16]); -static int iucv_callback_connreq(struct iucv_path *, u8 ipvmid[8], - u8 ipuser[16]); -static void iucv_callback_connrej(struct iucv_path *, u8 ipuser[16]); -static void iucv_callback_shutdown(struct iucv_path *, u8 ipuser[16]); +static void iucv_callback_connack(struct iucv_path *, u8 *); +static int iucv_callback_connreq(struct iucv_path *, u8 *, u8 *); +static void iucv_callback_connrej(struct iucv_path *, u8 *); +static void iucv_callback_shutdown(struct iucv_path *, u8 *); static struct iucv_sock_list iucv_sk_list = { .lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock), @@ -535,12 +534,12 @@ static void iucv_sock_init(struct sock *sk, struct sock *parent) sk->sk_type = parent->sk_type; } -static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio) +static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; struct iucv_sock *iucv; - sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto); + sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto, kern); if (!sk) return NULL; iucv = iucv_sk(sk); @@ -602,7 +601,7 @@ static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL); + sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); if (!sk) return -ENOMEM; @@ -709,6 +708,9 @@ static int iucv_sock_bind(struct 
socket *sock, struct sockaddr *addr, if (!addr || addr->sa_family != AF_IUCV) return -EINVAL; + if (addr_len < sizeof(struct sockaddr_iucv)) + return -EINVAL; + lock_sock(sk); if (sk->sk_state != IUCV_OPEN) { err = -EBADFD; @@ -1484,7 +1486,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && iucv_below_msglim(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } @@ -1723,7 +1725,7 @@ static int iucv_callback_connreq(struct iucv_path *path, } /* Create the new socket */ - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); if (!nsk) { err = pr_iucv->path_sever(path, user_data); iucv_path_free(path); @@ -1933,7 +1935,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) goto out; } - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); bh_lock_sock(sk); if ((sk->sk_state != IUCV_LISTEN) || sk_acceptq_is_full(sk) || diff --git a/kernel/net/iucv/iucv.c b/kernel/net/iucv/iucv.c index 2a6a1fdd6..7eaa000c9 100644 --- a/kernel/net/iucv/iucv.c +++ b/kernel/net/iucv/iucv.c @@ -713,7 +713,7 @@ static struct notifier_block __refdata iucv_cpu_notifier = { * * Sever an iucv path to free up the pathid. Used internally. */ -static int iucv_sever_pathid(u16 pathid, u8 userdata[16]) +static int iucv_sever_pathid(u16 pathid, u8 *userdata) { union iucv_param *parm; @@ -876,7 +876,7 @@ static struct notifier_block iucv_reboot_notifier = { * Returns the result of the CP IUCV call. */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, - u8 userdata[16], void *private) + u8 *userdata, void *private) { union iucv_param *parm; int rc; @@ -923,7 +923,7 @@ EXPORT_SYMBOL(iucv_path_accept); * Returns the result of the CP IUCV call. 
*/ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, - u8 userid[8], u8 system[8], u8 userdata[16], + u8 *userid, u8 *system, u8 *userdata, void *private) { union iucv_param *parm; @@ -985,7 +985,7 @@ EXPORT_SYMBOL(iucv_path_connect); * * Returns the result from the CP IUCV call. */ -int iucv_path_quiesce(struct iucv_path *path, u8 userdata[16]) +int iucv_path_quiesce(struct iucv_path *path, u8 *userdata) { union iucv_param *parm; int rc; @@ -1017,7 +1017,7 @@ EXPORT_SYMBOL(iucv_path_quiesce); * * Returns the result from the CP IUCV call. */ -int iucv_path_resume(struct iucv_path *path, u8 userdata[16]) +int iucv_path_resume(struct iucv_path *path, u8 *userdata) { union iucv_param *parm; int rc; @@ -1047,7 +1047,7 @@ out: * * Returns the result from the CP IUCV call. */ -int iucv_path_sever(struct iucv_path *path, u8 userdata[16]) +int iucv_path_sever(struct iucv_path *path, u8 *userdata) { int rc; diff --git a/kernel/net/key/af_key.c b/kernel/net/key/af_key.c index f0d52d721..f9c9ecb0c 100644 --- a/kernel/net/key/af_key.c +++ b/kernel/net/key/af_key.c @@ -149,7 +149,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, return -EPROTONOSUPPORT; err = -ENOMEM; - sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto); + sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) goto out; @@ -219,7 +219,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 -static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, +static int pfkey_broadcast(struct sk_buff *skb, int broadcast_flags, struct sock *one_sk, struct net *net) { @@ -244,7 +244,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, * socket. 
*/ if (pfk->promisc) - pfkey_broadcast_one(skb, &skb2, allocation, sk); + pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk); /* the exact target will be processed later */ if (sk == one_sk) @@ -259,9 +259,9 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, continue; } - err2 = pfkey_broadcast_one(skb, &skb2, allocation, sk); + err2 = pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk); - /* Error is cleare after succecful sending to at least one + /* Error is cleared after successful sending to at least one * registered KM */ if ((broadcast_flags & BROADCAST_REGISTERED) && err) err = err2; @@ -269,7 +269,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, rcu_read_unlock(); if (one_sk != NULL) - err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); + err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk); kfree_skb(skb2); kfree_skb(skb); @@ -292,7 +292,7 @@ static int pfkey_do_dump(struct pfkey_sock *pfk) hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; - pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } @@ -333,7 +333,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1190,6 +1190,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, memcpy(x->ealg->alg_key, key+1, keysize); } x->props.ealgo = sa->sadb_sa_encrypt; + x->geniv = a->uinfo.encr.geniv; } } /* x->algo.flags = sa->sadb_sa_flags; */ @@ -1364,7 +1365,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ xfrm_state_put(x); - pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); + pfkey_broadcast(resp_skb, BROADCAST_ONE, 
sk, net); return 0; } @@ -1451,7 +1452,7 @@ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); + pfkey_broadcast(skb, BROADCAST_ALL, NULL, xs_net(x)); return 0; } @@ -1564,7 +1565,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(out_skb, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1669,7 +1670,7 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad return -ENOBUFS; } - pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk)); + pfkey_broadcast(supp_skb, BROADCAST_REGISTERED, sk, sock_net(sk)); return 0; } @@ -1688,7 +1689,7 @@ static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); + return pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) @@ -1709,7 +1710,7 @@ static int key_notify_sa_flush(const struct km_event *c) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb, BROADCAST_ALL, NULL, c->net); return 0; } @@ -1766,7 +1767,7 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -1846,7 +1847,7 @@ 
static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb new_hdr->sadb_msg_errno = 0; } - pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); + pfkey_broadcast(skb, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } @@ -2180,7 +2181,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); + pfkey_broadcast(out_skb, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } @@ -2400,7 +2401,7 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); + pfkey_broadcast(out_skb, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: @@ -2654,7 +2655,7 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -2707,7 +2708,7 @@ static int key_notify_policy_flush(const struct km_event *c) hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb_out, BROADCAST_ALL, NULL, c->net); return 0; } @@ -2769,7 +2770,7 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb void *ext_hdrs[SADB_EXT_MAX]; int err; - pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, + pfkey_broadcast(skb_clone(skb, GFP_KERNEL), BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); @@ -2991,7 +2992,7 @@ 
static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; - pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); + pfkey_broadcast(out_skb, BROADCAST_REGISTERED, NULL, xs_net(x)); return 0; } @@ -3181,7 +3182,7 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_ctx->ctx_len); } - return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, @@ -3379,7 +3380,7 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; - return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE @@ -3571,7 +3572,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* broadcast migrate message to sockets */ - pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); + pfkey_broadcast(skb, BROADCAST_ALL, NULL, &init_net); return 0; diff --git a/kernel/net/l2tp/l2tp_core.c b/kernel/net/l2tp/l2tp_core.c index a29a50449..afca2eb4d 100644 --- a/kernel/net/l2tp/l2tp_core.c +++ b/kernel/net/l2tp/l2tp_core.c @@ -1319,7 +1319,7 @@ static void l2tp_tunnel_del_work(struct work_struct *work) tunnel = container_of(work, struct l2tp_tunnel, del_work); sk = l2tp_tunnel_sock_lookup(tunnel); if (!sk) - return; + goto out; sock = sk->sk_socket; @@ -1334,12 +1334,15 @@ static void l2tp_tunnel_del_work(struct work_struct *work) if (sock) inet_shutdown(sock, 2); } else { - if (sock) + if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sk); + sock_release(sock); + } } l2tp_tunnel_sock_put(sk); +out: + 
l2tp_tunnel_dec_refcount(tunnel); } /* Create a socket for the tunnel, if one isn't set up by @@ -1399,13 +1402,11 @@ static int l2tp_tunnel_sock_create(struct net *net, if (cfg->local_ip6 && cfg->peer_ip6) { struct sockaddr_l2tpip6 ip6_addr = {0}; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); @@ -1429,13 +1430,11 @@ static int l2tp_tunnel_sock_create(struct net *net, { struct sockaddr_l2tpip ip_addr = {0}; - err = sock_create_kern(AF_INET, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; @@ -1462,7 +1461,7 @@ out: *sockp = sock; if ((err < 0) && sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); *sockp = NULL; } @@ -1639,8 +1638,13 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); */ int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { + l2tp_tunnel_inc_refcount(tunnel); l2tp_tunnel_closeall(tunnel); - return (false == queue_work(l2tp_wq, &tunnel->del_work)); + if (false == queue_work(l2tp_wq, &tunnel->del_work)) { + l2tp_tunnel_dec_refcount(tunnel); + return 1; + } + return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); diff --git a/kernel/net/l2tp/l2tp_core.h b/kernel/net/l2tp/l2tp_core.h index 68aa9ffd4..5871537af 100644 --- a/kernel/net/l2tp/l2tp_core.h +++ b/kernel/net/l2tp/l2tp_core.h @@ -321,4 +321,7 @@ do { \ #define l2tp_dbg(ptr, type, fmt, ...) 
\ l2tp_printk(ptr, type, pr_debug, fmt, ##__VA_ARGS__) +#define MODULE_ALIAS_L2TP_PWTYPE(type) \ + MODULE_ALIAS("net-l2tp-type-" __stringify(type)) + #endif /* _L2TP_CORE_H_ */ diff --git a/kernel/net/l2tp/l2tp_eth.c b/kernel/net/l2tp/l2tp_eth.c index 4b552873b..e253c26f3 100644 --- a/kernel/net/l2tp/l2tp_eth.c +++ b/kernel/net/l2tp/l2tp_eth.c @@ -358,3 +358,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman "); MODULE_DESCRIPTION("L2TP ethernet pseudowire driver"); MODULE_VERSION("1.0"); +MODULE_ALIAS_L2TP_PWTYPE(5); diff --git a/kernel/net/l2tp/l2tp_ip.c b/kernel/net/l2tp/l2tp_ip.c index 79649937e..ec22078b0 100644 --- a/kernel/net/l2tp/l2tp_ip.c +++ b/kernel/net/l2tp/l2tp_ip.c @@ -655,3 +655,4 @@ MODULE_VERSION("1.0"); * enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP); +MODULE_ALIAS_NET_PF_PROTO(PF_INET, IPPROTO_L2TP); diff --git a/kernel/net/l2tp/l2tp_ip6.c b/kernel/net/l2tp/l2tp_ip6.c index d1ded3777..a2c8747d2 100644 --- a/kernel/net/l2tp/l2tp_ip6.c +++ b/kernel/net/l2tp/l2tp_ip6.c @@ -486,6 +486,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; @@ -575,8 +576,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = NULL; } - if (opt == NULL) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -631,6 +634,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? 
err : len; @@ -801,3 +805,4 @@ MODULE_VERSION("1.0"); * enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 2, IPPROTO_L2TP); +MODULE_ALIAS_NET_PF_PROTO(PF_INET6, IPPROTO_L2TP); diff --git a/kernel/net/l2tp/l2tp_netlink.c b/kernel/net/l2tp/l2tp_netlink.c index 9e13c2ff8..2caaa84ce 100644 --- a/kernel/net/l2tp/l2tp_netlink.c +++ b/kernel/net/l2tp/l2tp_netlink.c @@ -124,8 +124,13 @@ static int l2tp_tunnel_notify(struct genl_family *family, ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); @@ -147,8 +152,13 @@ static int l2tp_session_notify(struct genl_family *family, ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, session, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); @@ -576,6 +586,13 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_MRU]) cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]); +#ifdef CONFIG_MODULES + if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) { + genl_unlock(); + request_module("net-l2tp-type-%u", cfg.pw_type); + genl_lock(); + } +#endif if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) || (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) { ret = -EPROTONOSUPPORT; diff --git a/kernel/net/l2tp/l2tp_ppp.c b/kernel/net/l2tp/l2tp_ppp.c index e9b0dec56..1ad18c550 100644 --- a/kernel/net/l2tp/l2tp_ppp.c +++ b/kernel/net/l2tp/l2tp_ppp.c @@ -542,12 +542,12 @@ static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb) /* 
socket() handler. Initialize a new struct sock. */ -static int pppol2tp_create(struct net *net, struct socket *sock) +static int pppol2tp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; - sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto); + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern); if (!sk) goto out; @@ -1863,3 +1863,4 @@ MODULE_DESCRIPTION("PPP over L2TP over UDP"); MODULE_LICENSE("GPL"); MODULE_VERSION(PPPOL2TP_DRV_VERSION); MODULE_ALIAS("pppox-proto-" __stringify(PX_PROTO_OL2TP)); +MODULE_ALIAS_L2TP_PWTYPE(11); diff --git a/kernel/net/l3mdev/Kconfig b/kernel/net/l3mdev/Kconfig new file mode 100644 index 000000000..5d4732503 --- /dev/null +++ b/kernel/net/l3mdev/Kconfig @@ -0,0 +1,10 @@ +# +# Configuration for L3 master device support +# + +config NET_L3_MASTER_DEV + bool "L3 Master device support" + depends on INET || IPV6 + ---help--- + This module provides glue between core networking code and device + drivers to support L3 master devices like VRF. diff --git a/kernel/net/l3mdev/Makefile b/kernel/net/l3mdev/Makefile new file mode 100644 index 000000000..84a53a6f6 --- /dev/null +++ b/kernel/net/l3mdev/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the L3 device API +# + +obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev.o diff --git a/kernel/net/l3mdev/l3mdev.c b/kernel/net/l3mdev/l3mdev.c new file mode 100644 index 000000000..8e5ead366 --- /dev/null +++ b/kernel/net/l3mdev/l3mdev.c @@ -0,0 +1,92 @@ +/* + * net/l3mdev/l3mdev.c - L3 master device implementation + * Copyright (c) 2015 Cumulus Networks + * Copyright (c) 2015 David Ahern + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include + +/** + * l3mdev_master_ifindex - get index of L3 master device + * @dev: targeted interface + */ + +int l3mdev_master_ifindex_rcu(struct net_device *dev) +{ + int ifindex = 0; + + if (!dev) + return 0; + + if (netif_is_l3_master(dev)) { + ifindex = dev->ifindex; + } else if (netif_is_l3_slave(dev)) { + struct net_device *master; + + master = netdev_master_upper_dev_get_rcu(dev); + if (master) + ifindex = master->ifindex; + } + + return ifindex; +} +EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); + +/** + * l3mdev_fib_table - get FIB table id associated with an L3 + * master interface + * @dev: targeted interface + */ + +u32 l3mdev_fib_table_rcu(const struct net_device *dev) +{ + u32 tb_id = 0; + + if (!dev) + return 0; + + if (netif_is_l3_master(dev)) { + if (dev->l3mdev_ops->l3mdev_fib_table) + tb_id = dev->l3mdev_ops->l3mdev_fib_table(dev); + } else if (netif_is_l3_slave(dev)) { + /* Users of netdev_master_upper_dev_get_rcu need non-const, + * but current inet_*type functions take a const + */ + struct net_device *_dev = (struct net_device *) dev; + const struct net_device *master; + + master = netdev_master_upper_dev_get_rcu(_dev); + if (master && + master->l3mdev_ops->l3mdev_fib_table) + tb_id = master->l3mdev_ops->l3mdev_fib_table(master); + } + + return tb_id; +} +EXPORT_SYMBOL_GPL(l3mdev_fib_table_rcu); + +u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + u32 tb_id = 0; + + if (!ifindex) + return 0; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (dev) + tb_id = l3mdev_fib_table_rcu(dev); + + rcu_read_unlock(); + + return tb_id; +} +EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); diff --git a/kernel/net/llc/af_llc.c b/kernel/net/llc/af_llc.c index 17a8dff06..8dab4e569 100644 --- a/kernel/net/llc/af_llc.c +++ b/kernel/net/llc/af_llc.c @@ -168,7 +168,7 @@ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, if (likely(sock->type == SOCK_DGRAM || 
sock->type == SOCK_STREAM)) { rc = -ENOMEM; - sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto); + sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); @@ -613,7 +613,7 @@ static int llc_wait_data(struct sock *sk, long timeo) if (signal_pending(current)) break; rc = 0; - if (sk_wait_data(sk, &timeo)) + if (sk_wait_data(sk, &timeo, NULL)) break; } return rc; @@ -802,7 +802,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, release_sock(sk); lock_sock(sk); } else - sk_wait_data(sk, &timeo); + sk_wait_data(sk, &timeo, NULL); if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) { net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n", diff --git a/kernel/net/llc/llc_conn.c b/kernel/net/llc/llc_conn.c index 81a61fce3..3e821daf9 100644 --- a/kernel/net/llc/llc_conn.c +++ b/kernel/net/llc/llc_conn.c @@ -768,7 +768,7 @@ static struct sock *llc_create_incoming_sock(struct sock *sk, struct llc_addr *daddr) { struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC, - sk->sk_prot); + sk->sk_prot, 0); struct llc_sock *newllc, *llc = llc_sk(sk); if (!newsk) @@ -931,9 +931,9 @@ static void llc_sk_init(struct sock *sk) * Allocates a LLC sock and initializes it. 
Returns the new LLC sock * or %NULL if there's no memory available for one */ -struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot) +struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { - struct sock *sk = sk_alloc(net, family, priority, prot); + struct sock *sk = sk_alloc(net, family, priority, prot, kern); if (!sk) goto out; diff --git a/kernel/net/mac80211/Kconfig b/kernel/net/mac80211/Kconfig index 64a012a0c..3891cbd2a 100644 --- a/kernel/net/mac80211/Kconfig +++ b/kernel/net/mac80211/Kconfig @@ -7,7 +7,6 @@ config MAC80211 select CRYPTO_CCM select CRYPTO_GCM select CRC32 - select AVERAGE ---help--- This option enables the hardware independent IEEE 802.11 networking stack. @@ -302,6 +301,20 @@ config MAC80211_DEBUG_COUNTERS ---help--- Selecting this option causes mac80211 to keep additional and very verbose statistics about TX and RX handler use - and show them in debugfs. + as well as a few selected dot11 counters. These will be + exposed in debugfs. + + Note that some of the counters are not concurrency safe + and may thus not always be accurate. If unsure, say N. + +config MAC80211_STA_HASH_MAX_SIZE + int "Station hash table maximum size" if MAC80211_DEBUG_MENU + default 0 + ---help--- + Setting this option to a low value (e.g. 4) allows testing the + hash table with collisions relatively deterministically (just + connect more stations than the number selected here.) + + If unsure, leave the default of 0. 
diff --git a/kernel/net/mac80211/Makefile b/kernel/net/mac80211/Makefile index 3275f0188..f9137a834 100644 --- a/kernel/net/mac80211/Makefile +++ b/kernel/net/mac80211/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_MAC80211) += mac80211.o # mac80211 objects mac80211-y := \ main.o status.o \ + driver-ops.o \ sta_info.o \ wep.o \ wpa.o \ @@ -26,7 +27,6 @@ mac80211-y := \ key.o \ util.o \ wme.o \ - event.o \ chan.o \ trace.o mlme.o \ tdls.o \ diff --git a/kernel/net/mac80211/aes_ccm.c b/kernel/net/mac80211/aes_ccm.c index 208df7c0b..7663c28ba 100644 --- a/kernel/net/mac80211/aes_ccm.c +++ b/kernel/net/mac80211/aes_ccm.c @@ -11,9 +11,8 @@ #include #include -#include #include -#include +#include #include #include "key.h" @@ -23,7 +22,7 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, u8 *data, size_t data_len, u8 *mic, size_t mic_len) { - struct scatterlist assoc, pt, ct[2]; + struct scatterlist sg[3]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] @@ -32,15 +31,14 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, memset(aead_req, 0, sizeof(aead_req_data)); - sg_init_one(&pt, data, data_len); - sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); - sg_init_table(ct, 2); - sg_set_buf(&ct[0], data, data_len); - sg_set_buf(&ct[1], mic, mic_len); + sg_init_table(sg, 3); + sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad)); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, mic_len); aead_request_set_tfm(aead_req, tfm); - aead_request_set_assoc(aead_req, &assoc, assoc.length); - aead_request_set_crypt(aead_req, &pt, ct, data_len, b_0); + aead_request_set_crypt(aead_req, sg, sg, data_len, b_0); + aead_request_set_ad(aead_req, sg[0].length); crypto_aead_encrypt(aead_req); } @@ -49,7 +47,7 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, u8 *data, size_t data_len, u8 *mic, size_t mic_len) { - struct scatterlist assoc, pt, ct[2]; + struct scatterlist 
sg[3]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] __aligned(__alignof__(struct aead_request)); @@ -60,15 +58,14 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, memset(aead_req, 0, sizeof(aead_req_data)); - sg_init_one(&pt, data, data_len); - sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); - sg_init_table(ct, 2); - sg_set_buf(&ct[0], data, data_len); - sg_set_buf(&ct[1], mic, mic_len); + sg_init_table(sg, 3); + sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad)); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, mic_len); aead_request_set_tfm(aead_req, tfm); - aead_request_set_assoc(aead_req, &assoc, assoc.length); - aead_request_set_crypt(aead_req, ct, &pt, data_len + mic_len, b_0); + aead_request_set_crypt(aead_req, sg, sg, data_len + mic_len, b_0); + aead_request_set_ad(aead_req, sg[0].length); return crypto_aead_decrypt(aead_req); } diff --git a/kernel/net/mac80211/aes_cmac.c b/kernel/net/mac80211/aes_cmac.c index 4192806be..bdf0790d8 100644 --- a/kernel/net/mac80211/aes_cmac.c +++ b/kernel/net/mac80211/aes_cmac.c @@ -145,20 +145,3 @@ void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm) { crypto_free_cipher(tfm); } - -void ieee80211_aes_cmac_calculate_k1_k2(struct ieee80211_key_conf *keyconf, - u8 *k1, u8 *k2) -{ - u8 l[AES_BLOCK_SIZE] = {}; - struct ieee80211_key *key = - container_of(keyconf, struct ieee80211_key, conf); - - crypto_cipher_encrypt_one(key->u.aes_cmac.tfm, l, l); - - memcpy(k1, l, AES_BLOCK_SIZE); - gf_mulx(k1); - - memcpy(k2, k1, AES_BLOCK_SIZE); - gf_mulx(k2); -} -EXPORT_SYMBOL(ieee80211_aes_cmac_calculate_k1_k2); diff --git a/kernel/net/mac80211/aes_gcm.c b/kernel/net/mac80211/aes_gcm.c index fd278bbe1..3afe361fd 100644 --- a/kernel/net/mac80211/aes_gcm.c +++ b/kernel/net/mac80211/aes_gcm.c @@ -8,9 +8,8 @@ #include #include -#include #include -#include +#include #include #include "key.h" @@ -19,7 +18,7 @@ void ieee80211_aes_gcm_encrypt(struct 
crypto_aead *tfm, u8 *j_0, u8 *aad, u8 *data, size_t data_len, u8 *mic) { - struct scatterlist assoc, pt, ct[2]; + struct scatterlist sg[3]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] @@ -28,15 +27,14 @@ void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, memset(aead_req, 0, sizeof(aead_req_data)); - sg_init_one(&pt, data, data_len); - sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); - sg_init_table(ct, 2); - sg_set_buf(&ct[0], data, data_len); - sg_set_buf(&ct[1], mic, IEEE80211_GCMP_MIC_LEN); + sg_init_table(sg, 3); + sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad)); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN); aead_request_set_tfm(aead_req, tfm); - aead_request_set_assoc(aead_req, &assoc, assoc.length); - aead_request_set_crypt(aead_req, &pt, ct, data_len, j_0); + aead_request_set_crypt(aead_req, sg, sg, data_len, j_0); + aead_request_set_ad(aead_req, sg[0].length); crypto_aead_encrypt(aead_req); } @@ -44,7 +42,7 @@ void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, u8 *data, size_t data_len, u8 *mic) { - struct scatterlist assoc, pt, ct[2]; + struct scatterlist sg[3]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] __aligned(__alignof__(struct aead_request)); @@ -55,16 +53,15 @@ int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, memset(aead_req, 0, sizeof(aead_req_data)); - sg_init_one(&pt, data, data_len); - sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); - sg_init_table(ct, 2); - sg_set_buf(&ct[0], data, data_len); - sg_set_buf(&ct[1], mic, IEEE80211_GCMP_MIC_LEN); + sg_init_table(sg, 3); + sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad)); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN); aead_request_set_tfm(aead_req, tfm); - 
aead_request_set_assoc(aead_req, &assoc, assoc.length); - aead_request_set_crypt(aead_req, ct, &pt, + aead_request_set_crypt(aead_req, sg, sg, data_len + IEEE80211_GCMP_MIC_LEN, j_0); + aead_request_set_ad(aead_req, sg[0].length); return crypto_aead_decrypt(aead_req); } diff --git a/kernel/net/mac80211/aes_gmac.c b/kernel/net/mac80211/aes_gmac.c index f1321b7d6..3ddd927aa 100644 --- a/kernel/net/mac80211/aes_gmac.c +++ b/kernel/net/mac80211/aes_gmac.c @@ -9,8 +9,8 @@ #include #include -#include #include +#include #include #include @@ -24,7 +24,7 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce, const u8 *data, size_t data_len, u8 *mic) { - struct scatterlist sg[3], ct[1]; + struct scatterlist sg[4]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] __aligned(__alignof__(struct aead_request)); @@ -37,21 +37,19 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce, memset(aead_req, 0, sizeof(aead_req_data)); memset(zero, 0, GMAC_MIC_LEN); - sg_init_table(sg, 3); + sg_init_table(sg, 4); sg_set_buf(&sg[0], aad, AAD_LEN); sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN); sg_set_buf(&sg[2], zero, GMAC_MIC_LEN); + sg_set_buf(&sg[3], mic, GMAC_MIC_LEN); memcpy(iv, nonce, GMAC_NONCE_LEN); memset(iv + GMAC_NONCE_LEN, 0, sizeof(iv) - GMAC_NONCE_LEN); iv[AES_BLOCK_SIZE - 1] = 0x01; - sg_init_table(ct, 1); - sg_set_buf(&ct[0], mic, GMAC_MIC_LEN); - aead_request_set_tfm(aead_req, tfm); - aead_request_set_assoc(aead_req, sg, AAD_LEN + data_len); - aead_request_set_crypt(aead_req, NULL, ct, 0, iv); + aead_request_set_crypt(aead_req, sg, sg, 0, iv); + aead_request_set_ad(aead_req, AAD_LEN + data_len); crypto_aead_encrypt(aead_req); diff --git a/kernel/net/mac80211/agg-rx.c b/kernel/net/mac80211/agg-rx.c index 5c564a68f..367784be5 100644 --- a/kernel/net/mac80211/agg-rx.c +++ b/kernel/net/mac80211/agg-rx.c @@ -79,7 +79,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, (int)reason); if 
(drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP, - &sta->sta, tid, NULL, 0)) + &sta->sta, tid, NULL, 0, false)) sdata_info(sta->sdata, "HW problem - can not stop rx aggregation for %pM tid %d\n", sta->sta.addr, tid); @@ -189,6 +189,7 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; + bool amsdu = ieee80211_hw_check(&local->hw, SUPPORTS_AMSDU_IN_AMPDU); u16 capab; skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); @@ -217,7 +218,8 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP; mgmt->u.action.u.addba_resp.dialog_token = dialog_token; - capab = (u16)(policy << 1); /* bit 1 aggregation policy */ + capab = (u16)(amsdu << 0); /* bit 0 A-MSDU support */ + capab |= (u16)(policy << 1); /* bit 1 aggregation policy */ capab |= (u16)(tid << 2); /* bit 5:2 TID number */ capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */ @@ -289,7 +291,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, } /* prepare A-MPDU MLME for Rx aggregation */ - tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL); + tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL); if (!tid_agg_rx) goto end; @@ -321,7 +323,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, __skb_queue_head_init(&tid_agg_rx->reorder_buf[i]); ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START, - &sta->sta, tid, &start_seq_num, 0); + &sta->sta, tid, &start_seq_num, 0, false); ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n", sta->sta.addr, tid, ret); if (ret) { diff --git a/kernel/net/mac80211/agg-tx.c b/kernel/net/mac80211/agg-tx.c index cce9d425c..ff757181b 100644 --- a/kernel/net/mac80211/agg-tx.c +++ b/kernel/net/mac80211/agg-tx.c @@ -97,7 +97,8 @@ static void 
ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ; mgmt->u.action.u.addba_req.dialog_token = dialog_token; - capab = (u16)(1 << 1); /* bit 1 aggregation policy */ + capab = (u16)(1 << 0); /* bit 0 A-MSDU support */ + capab |= (u16)(1 << 1); /* bit 1 aggregation policy */ capab |= (u16)(tid << 2); /* bit 5:2 TID number */ capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggergation */ @@ -331,7 +332,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, return -EALREADY; ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_TX_STOP_FLUSH_CONT, - &sta->sta, tid, NULL, 0); + &sta->sta, tid, NULL, 0, false); WARN_ON_ONCE(ret); return 0; } @@ -381,7 +382,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST; ret = drv_ampdu_action(local, sta->sdata, action, - &sta->sta, tid, NULL, 0); + &sta->sta, tid, NULL, 0, false); /* HW shall not deny going back to legacy */ if (WARN_ON(ret)) { @@ -469,7 +470,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) start_seq_num = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START, - &sta->sta, tid, &start_seq_num, 0); + &sta->sta, tid, &start_seq_num, 0, false); if (ret) { ht_dbg(sdata, "BA request denied - HW unavailable for %pM tid %d\n", @@ -499,7 +500,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, tid_tx->dialog_token, start_seq_num, - local->hw.max_tx_aggregation_subframes, + IEEE80211_MAX_AMPDU_BUF, tid_tx->timeout); } @@ -564,8 +565,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, return -EINVAL; if ((tid >= IEEE80211_NUM_TIDS) || - !(local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) || - (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)) + 
!ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) || + ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) return -EINVAL; ht_dbg(sdata, "Open BA session requested for %pM tid %u\n", @@ -693,7 +694,8 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local, drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_TX_OPERATIONAL, - &sta->sta, tid, NULL, tid_tx->buf_size); + &sta->sta, tid, NULL, tid_tx->buf_size, + tid_tx->amsdu); /* * synchronize with TX path, while splicing the TX path @@ -918,10 +920,13 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, struct tid_ampdu_tx *tid_tx; u16 capab, tid; u8 buf_size; + bool amsdu; capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); + amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK; tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes); mutex_lock(&sta->ampdu_mlme.mtx); @@ -968,6 +973,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, } tid_tx->buf_size = buf_size; + tid_tx->amsdu = amsdu; if (test_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state)) ieee80211_agg_tx_operational(local, sta, tid); diff --git a/kernel/net/mac80211/cfg.c b/kernel/net/mac80211/cfg.c index f06d42267..c12f34813 100644 --- a/kernel/net/mac80211/cfg.c +++ b/kernel/net/mac80211/cfg.c @@ -2,7 +2,7 @@ * mac80211 configuration hooks for cfg80211 * * Copyright 2006-2010 Johannes Berg - * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2013-2015 Intel Mobile Communications GmbH * * This file is GPLv2 as found in COPYING. 
*/ @@ -17,7 +17,6 @@ #include #include "ieee80211_i.h" #include "driver-ops.h" -#include "cfg.h" #include "rate.h" #include "mesh.h" #include "wme.h" @@ -137,6 +136,9 @@ static int ieee80211_set_noack_map(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->noack_map = noack_map; + + ieee80211_check_fast_xmit_iface(sdata); + return 0; } @@ -309,6 +311,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, u32 iv32; u16 iv16; int err = -ENOENT; + struct ieee80211_key_seq kseq = {}; sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -339,10 +342,12 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, iv32 = key->u.tkip.tx.iv32; iv16 = key->u.tkip.tx.iv16; - if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) - drv_get_tkip_seq(sdata->local, - key->conf.hw_key_idx, - &iv32, &iv16); + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + drv_get_key_seq(sdata->local, key, &kseq); + iv32 = kseq.tkip.iv32; + iv16 = kseq.tkip.iv16; + } seq[0] = iv16 & 0xff; seq[1] = (iv16 >> 8) & 0xff; @@ -355,52 +360,44 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn64 = atomic64_read(&key->u.ccmp.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; - params.seq = seq; - params.seq_len = 6; - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; - params.seq = seq; - params.seq_len = 6; - break; + BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != + offsetof(typeof(kseq), aes_cmac)); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn64 = 
atomic64_read(&key->u.aes_gmac.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; - params.seq = seq; - params.seq_len = 6; - break; + BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != + offsetof(typeof(kseq), aes_gmac)); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn64 = atomic64_read(&key->u.gcmp.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; + BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != + offsetof(typeof(kseq), gcmp)); + + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + drv_get_key_seq(sdata->local, key, &kseq); + memcpy(seq, kseq.ccmp.pn, 6); + } else { + pn64 = atomic64_read(&key->conf.tx_pn); + seq[0] = pn64; + seq[1] = pn64 >> 8; + seq[2] = pn64 >> 16; + seq[3] = pn64 >> 24; + seq[4] = pn64 >> 32; + seq[5] = pn64 >> 40; + } params.seq = seq; params.seq_len = 6; break; + default: + if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + break; + if (WARN_ON(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) + break; + drv_get_key_seq(sdata->local, key, &kseq); + params.seq = kseq.hw.seq; + params.seq_len = kseq.hw.seq_len; + break; } params.key = key->conf.key; @@ -471,45 +468,6 @@ void sta_set_rate_info_tx(struct sta_info *sta, rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; } -void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) -{ - rinfo->flags = 0; - - if (sta->last_rx_rate_flag & RX_FLAG_HT) { - rinfo->flags |= RATE_INFO_FLAGS_MCS; - rinfo->mcs = sta->last_rx_rate_idx; - } else if (sta->last_rx_rate_flag & RX_FLAG_VHT) { - rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS; - rinfo->nss = sta->last_rx_rate_vht_nss; - rinfo->mcs = sta->last_rx_rate_idx; - } else { - struct ieee80211_supported_band *sband; - int shift = ieee80211_vif_get_shift(&sta->sdata->vif); - u16 brate; - - sband = 
sta->local->hw.wiphy->bands[ - ieee80211_get_sdata_band(sta->sdata)]; - brate = sband->bitrates[sta->last_rx_rate_idx].bitrate; - rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); - } - - if (sta->last_rx_rate_flag & RX_FLAG_SHORT_GI) - rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; - - if (sta->last_rx_rate_flag & RX_FLAG_5MHZ) - rinfo->bw = RATE_INFO_BW_5; - else if (sta->last_rx_rate_flag & RX_FLAG_10MHZ) - rinfo->bw = RATE_INFO_BW_10; - else if (sta->last_rx_rate_flag & RX_FLAG_40MHZ) - rinfo->bw = RATE_INFO_BW_40; - else if (sta->last_rx_rate_vht_flag & RX_VHT_FLAG_80MHZ) - rinfo->bw = RATE_INFO_BW_80; - else if (sta->last_rx_rate_vht_flag & RX_VHT_FLAG_160MHZ) - rinfo->bw = RATE_INFO_BW_160; - else - rinfo->bw = RATE_INFO_BW_20; -} - static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { @@ -983,7 +941,7 @@ static int sta_apply_auth_flags(struct ieee80211_local *local, * well. Some drivers require rate control initialized * before drv_sta_state() is called. 
*/ - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) rate_control_rate_init(sta); ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); @@ -1021,6 +979,65 @@ static int sta_apply_auth_flags(struct ieee80211_local *local, return 0; } +static void sta_apply_mesh_params(struct ieee80211_local *local, + struct sta_info *sta, + struct station_parameters *params) +{ +#ifdef CONFIG_MAC80211_MESH + struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 changed = 0; + + if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) { + switch (params->plink_state) { + case NL80211_PLINK_ESTAB: + if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) + changed = mesh_plink_inc_estab_count(sdata); + sta->mesh->plink_state = params->plink_state; + + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_set_sta_local_pm(sta, + sdata->u.mesh.mshcfg.power_mode); + break; + case NL80211_PLINK_LISTEN: + case NL80211_PLINK_BLOCKED: + case NL80211_PLINK_OPN_SNT: + case NL80211_PLINK_OPN_RCVD: + case NL80211_PLINK_CNF_RCVD: + case NL80211_PLINK_HOLDING: + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) + changed = mesh_plink_dec_estab_count(sdata); + sta->mesh->plink_state = params->plink_state; + + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_set_sta_local_pm(sta, + NL80211_MESH_POWER_UNKNOWN); + break; + default: + /* nothing */ + break; + } + } + + switch (params->plink_action) { + case NL80211_PLINK_ACTION_NO_ACTION: + /* nothing */ + break; + case NL80211_PLINK_ACTION_OPEN: + changed |= mesh_plink_open(sta); + break; + case NL80211_PLINK_ACTION_BLOCK: + changed |= mesh_plink_block(sta); + break; + } + + if (params->local_pm) + changed |= ieee80211_mps_set_sta_local_pm(sta, + params->local_pm); + + ieee80211_mbss_info_change_notify(sdata, changed); +#endif +} + static int sta_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) @@ -1063,8 +1080,11 @@ 
static int sta_apply_parameters(struct ieee80211_local *local, local->hw.queues >= IEEE80211_NUM_ACS) sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); - /* auth flags will be set later for TDLS stations */ - if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + /* auth flags will be set later for TDLS, + * and for unassociated stations that move to assocaited */ + if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + !((mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) && + (set & BIT(NL80211_STA_FLAG_ASSOCIATED)))) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; @@ -1099,6 +1119,13 @@ static int sta_apply_parameters(struct ieee80211_local *local, params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH) set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH); + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + !sdata->u.mgd.tdls_wider_bw_prohibited && + ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && + params->ext_capab_len >= 8 && + params->ext_capab[7] & WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED) + set_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW); + if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) { sta->sta.uapsd_queues = params->uapsd_queues; sta->sta.max_sp = params->max_sp; @@ -1142,69 +1169,15 @@ static int sta_apply_parameters(struct ieee80211_local *local, * rc isn't initialized here yet, so ignore it */ __ieee80211_vht_handle_opmode(sdata, sta, - params->opmode_notif, - band, false); + params->opmode_notif, band); } - if (ieee80211_vif_is_mesh(&sdata->vif)) { -#ifdef CONFIG_MAC80211_MESH - u32 changed = 0; - - if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) { - switch (params->plink_state) { - case NL80211_PLINK_ESTAB: - if (sta->plink_state != NL80211_PLINK_ESTAB) - changed = mesh_plink_inc_estab_count( - sdata); - sta->plink_state = params->plink_state; - - ieee80211_mps_sta_status_update(sta); - changed |= ieee80211_mps_set_sta_local_pm(sta, - sdata->u.mesh.mshcfg.power_mode); - break; - case NL80211_PLINK_LISTEN: - case NL80211_PLINK_BLOCKED: - 
case NL80211_PLINK_OPN_SNT: - case NL80211_PLINK_OPN_RCVD: - case NL80211_PLINK_CNF_RCVD: - case NL80211_PLINK_HOLDING: - if (sta->plink_state == NL80211_PLINK_ESTAB) - changed = mesh_plink_dec_estab_count( - sdata); - sta->plink_state = params->plink_state; - - ieee80211_mps_sta_status_update(sta); - changed |= ieee80211_mps_set_sta_local_pm(sta, - NL80211_MESH_POWER_UNKNOWN); - break; - default: - /* nothing */ - break; - } - } - - switch (params->plink_action) { - case NL80211_PLINK_ACTION_NO_ACTION: - /* nothing */ - break; - case NL80211_PLINK_ACTION_OPEN: - changed |= mesh_plink_open(sta); - break; - case NL80211_PLINK_ACTION_BLOCK: - changed |= mesh_plink_block(sta); - break; - } - - if (params->local_pm) - changed |= - ieee80211_mps_set_sta_local_pm(sta, - params->local_pm); - ieee80211_mbss_info_change_notify(sdata, changed); -#endif - } + if (ieee80211_vif_is_mesh(&sdata->vif)) + sta_apply_mesh_params(local, sta, params); /* set the STA state after all sta info from usermode has been set */ - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) || + set & BIT(NL80211_STA_FLAG_ASSOCIATED)) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; @@ -1246,12 +1219,14 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, * defaults -- if userspace wants something else we'll * change it accordingly in sta_apply_parameters() */ - if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) { + if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) && + !(params->sta_flags_set & (BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_ASSOCIATED)))) { sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); - } else { - sta->sta.tdls = true; } + if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) + sta->sta.tdls = true; err = sta_apply_parameters(local, sta, params); if (err) { @@ -1260,10 +1235,12 @@ static int 
ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, } /* - * for TDLS, rate control should be initialized only when - * rates are known and station is marked authorized + * for TDLS and for unassociated station, rate control should be + * initialized only when rates are known and station is marked + * authorized/associated */ - if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + test_sta_flag(sta, WLAN_STA_ASSOC)) rate_control_rate_init(sta); layer2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN || @@ -1338,7 +1315,10 @@ static int ieee80211_change_station(struct wiphy *wiphy, break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: - statype = CFG80211_STA_AP_CLIENT; + if (test_sta_flag(sta, WLAN_STA_ASSOC)) + statype = CFG80211_STA_AP_CLIENT; + else + statype = CFG80211_STA_AP_CLIENT_UNASSOC; break; default: err = -EOPNOTSUPP; @@ -1372,6 +1352,7 @@ static int ieee80211_change_station(struct wiphy *wiphy, } sta->sdata = vlansdata; + ieee80211_check_fast_xmit(sta); if (sta->sta_state == IEEE80211_STA_AUTHORIZED && prev_4addr != new_4addr) { @@ -1406,7 +1387,7 @@ static int ieee80211_change_station(struct wiphy *wiphy, if (sdata->vif.type == NL80211_IFTYPE_STATION && params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { - ieee80211_recalc_ps(local, -1); + ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); } @@ -1764,7 +1745,7 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, /* our RSSI threshold implementation is supported only for * devices that report signal in dBm. 
*/ - if (!(sdata->local->hw.flags & IEEE80211_HW_SIGNAL_DBM)) + if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM)) return -ENOTSUPP; conf->rssi_threshold = nconf->rssi_threshold; } @@ -2028,12 +2009,12 @@ ieee80211_sched_scan_start(struct wiphy *wiphy, static int ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = wiphy_priv(wiphy); - if (!sdata->local->ops->sched_scan_stop) + if (!local->ops->sched_scan_stop) return -EOPNOTSUPP; - return ieee80211_request_sched_scan_stop(sdata); + return ieee80211_request_sched_scan_stop(local); } static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev, @@ -2099,10 +2080,14 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) int err; if (changed & WIPHY_PARAM_FRAG_THRESHOLD) { + ieee80211_check_fast_xmit_all(local); + err = drv_set_frag_threshold(local, wiphy->frag_threshold); - if (err) + if (err) { + ieee80211_check_fast_xmit_all(local); return err; + } } if ((changed & WIPHY_PARAM_COVERAGE_CLASS) || @@ -2355,6 +2340,8 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, const u8 *ap; enum ieee80211_smps_mode old_req; int err; + struct sta_info *sta; + bool tdls_peer_found = false; lockdep_assert_held(&sdata->wdev.mtx); @@ -2379,11 +2366,22 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, ap = sdata->u.mgd.associated->bssid; + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + continue; + + tdls_peer_found = true; + break; + } + rcu_read_unlock(); + if (smps_mode == IEEE80211_SMPS_AUTOMATIC) { - if (sdata->u.mgd.powersave) - smps_mode = IEEE80211_SMPS_DYNAMIC; - else + if (tdls_peer_found || !sdata->u.mgd.powersave) smps_mode = IEEE80211_SMPS_OFF; + else + smps_mode = 
IEEE80211_SMPS_DYNAMIC; } /* send SM PS frame to AP */ @@ -2391,6 +2389,8 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, ap, ap); if (err) sdata->u.mgd.req_smps = old_req; + else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found) + ieee80211_teardown_tdls_peers(sdata); return err; } @@ -2404,7 +2404,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) + if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return -EOPNOTSUPP; if (enabled == sdata->u.mgd.powersave && @@ -2419,10 +2419,10 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, __ieee80211_request_smps_mgd(sdata, sdata->u.mgd.req_smps); sdata_unlock(sdata); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); - ieee80211_recalc_ps(local, -1); + ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); return 0; @@ -2440,8 +2440,13 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, rssi_hyst == bss_conf->cqm_rssi_hyst) return 0; + if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER && + !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) + return -EOPNOTSUPP; + bss_conf->cqm_rssi_thold = rssi_thold; bss_conf->cqm_rssi_hyst = rssi_hyst; + sdata->u.mgd.last_cqm_event_signal = 0; /* tell the driver upon association, unless already associated */ if (sdata->u.mgd.associated && @@ -2463,7 +2468,7 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) { + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { ret = drv_set_bitrate_mask(local, sdata, mask); if (ret) return ret; @@ -2476,16 +2481,28 @@ static int 
ieee80211_set_bitrate_mask(struct wiphy *wiphy, sdata->rc_rateidx_mask[i] = mask->control[i].legacy; memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].ht_mcs, sizeof(mask->control[i].ht_mcs)); + memcpy(sdata->rc_rateidx_vht_mcs_mask[i], + mask->control[i].vht_mcs, + sizeof(mask->control[i].vht_mcs)); sdata->rc_has_mcs_mask[i] = false; + sdata->rc_has_vht_mcs_mask[i] = false; if (!sband) continue; - for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) + for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { if (~sdata->rc_rateidx_mcs_mask[i][j]) { sdata->rc_has_mcs_mask[i] = true; break; } + } + + for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { + if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) { + sdata->rc_has_vht_mcs_mask[i] = true; + break; + } + } } return 0; @@ -2514,6 +2531,19 @@ static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local, return true; } +static u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local) +{ + lockdep_assert_held(&local->mtx); + + local->roc_cookie_counter++; + + /* wow, you wrapped 64 bits ... more likely a bug */ + if (WARN_ON(local->roc_cookie_counter == 0)) + local->roc_cookie_counter++; + + return local->roc_cookie_counter; +} + static int ieee80211_start_roc_work(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, @@ -2551,7 +2581,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, roc->req_duration = duration; roc->frame = txskb; roc->type = type; - roc->mgmt_tx_cookie = (unsigned long)txskb; roc->sdata = sdata; INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work); INIT_LIST_HEAD(&roc->dependents); @@ -2561,17 +2590,10 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, * or the SKB (for mgmt TX) */ if (!txskb) { - /* local->mtx protects this */ - local->roc_cookie_counter++; - roc->cookie = local->roc_cookie_counter; - /* wow, you wrapped 64 bits ... 
more likely a bug */ - if (WARN_ON(roc->cookie == 0)) { - roc->cookie = 1; - local->roc_cookie_counter++; - } + roc->cookie = ieee80211_mgmt_tx_cookie(local); *cookie = roc->cookie; } else { - *cookie = (unsigned long)txskb; + roc->mgmt_tx_cookie = *cookie; } /* if there's one pending or we're scanning, queue this one */ @@ -3244,13 +3266,43 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, return err; } +static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local, + struct sk_buff *skb, u64 *cookie, + gfp_t gfp) +{ + unsigned long spin_flags; + struct sk_buff *ack_skb; + int id; + + ack_skb = skb_copy(skb, gfp); + if (!ack_skb) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&local->ack_status_lock, spin_flags); + id = idr_alloc(&local->ack_status_frames, ack_skb, + 1, 0x10000, GFP_ATOMIC); + spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); + + if (id < 0) { + kfree_skb(ack_skb); + return ERR_PTR(-ENOMEM); + } + + IEEE80211_SKB_CB(skb)->ack_frame_id = id; + + *cookie = ieee80211_mgmt_tx_cookie(local); + IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; + + return ack_skb; +} + static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; - struct sk_buff *skb; + struct sk_buff *skb, *ack_skb; struct sta_info *sta; const struct ieee80211_mgmt *mgmt = (void *)params->buf; bool need_offchan = false; @@ -3299,8 +3351,14 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (!sdata->u.mgd.associated) + sdata_lock(sdata); + if (!sdata->u.mgd.associated || + (params->offchan && params->wait && + local->ops->remain_on_channel && + memcmp(sdata->u.mgd.associated->bssid, + mgmt->bssid, ETH_ALEN))) need_offchan = true; + sdata_unlock(sdata); 
break; case NL80211_IFTYPE_P2P_DEVICE: need_offchan = true; @@ -3383,8 +3441,27 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, skb->dev = sdata->dev; + if (!params->dont_wait_for_ack) { + /* make a copy to preserve the frame contents + * in case of encryption. + */ + ack_skb = ieee80211_make_ack_skb(local, skb, cookie, + GFP_KERNEL); + if (IS_ERR(ack_skb)) { + ret = PTR_ERR(ack_skb); + kfree_skb(skb); + goto out_unlock; + } + } else { + /* Assign a dummy non-zero cookie, it's not sent to + * userspace in this case but we rely on its value + * internally in the need_offchan case to distinguish + * mgmt-tx from remain-on-channel. + */ + *cookie = 0xffffffff; + } + if (!need_offchan) { - *cookie = (unsigned long) skb; ieee80211_tx_skb(sdata, skb); ret = 0; goto out_unlock; @@ -3392,7 +3469,7 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; - if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) IEEE80211_SKB_CB(skb)->hw_queue = local->hw.offchannel_tx_hw_queue; @@ -3421,18 +3498,32 @@ static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, u16 frame_type, bool reg) { struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); switch (frame_type) { case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ: - if (reg) + if (reg) { local->probe_req_reg++; - else - local->probe_req_reg--; + sdata->vif.probe_req_reg++; + } else { + if (local->probe_req_reg) + local->probe_req_reg--; + + if (sdata->vif.probe_req_reg) + sdata->vif.probe_req_reg--; + } if (!local->open_count) break; - ieee80211_queue_work(&local->hw, &local->reconfig_filter); + if (sdata->vif.probe_req_reg == 1) + drv_config_iface_filter(local, sdata, FIF_PROBE_REQ, + FIF_PROBE_REQ); + else if (sdata->vif.probe_req_reg == 0) + 
drv_config_iface_filter(local, sdata, 0, + FIF_PROBE_REQ); + + ieee80211_configure_filter(local); break; default: break; @@ -3477,7 +3568,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; - struct sk_buff *skb; + struct sk_buff *skb, *ack_skb; int size = sizeof(*nullfunc); __le16 fc; bool qos; @@ -3485,20 +3576,24 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; enum ieee80211_band band; + int ret; + + /* the lock is needed to assign the cookie later */ + mutex_lock(&local->mtx); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { - rcu_read_unlock(); - return -EINVAL; + ret = -EINVAL; + goto unlock; } band = chanctx_conf->def.chan->band; sta = sta_info_get_bss(sdata, peer); if (sta) { qos = sta->sta.wme; } else { - rcu_read_unlock(); - return -ENOLINK; + ret = -ENOLINK; + goto unlock; } if (qos) { @@ -3514,8 +3609,8 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) { - rcu_read_unlock(); - return -ENOMEM; + ret = -ENOMEM; + goto unlock; } skb->dev = dev; @@ -3541,13 +3636,23 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, if (qos) nullfunc->qos_ctrl = cpu_to_le16(7); + ack_skb = ieee80211_make_ack_skb(local, skb, cookie, GFP_ATOMIC); + if (IS_ERR(ack_skb)) { + kfree_skb(skb); + ret = PTR_ERR(ack_skb); + goto unlock; + } + local_bh_disable(); ieee80211_xmit(sdata, sta, skb); local_bh_enable(); + + ret = 0; +unlock: rcu_read_unlock(); + mutex_unlock(&local->mtx); - *cookie = (unsigned long) skb; - return 0; + return ret; } static int ieee80211_cfg_get_channel(struct wiphy *wiphy, diff --git 
a/kernel/net/mac80211/cfg.h b/kernel/net/mac80211/cfg.h deleted file mode 100644 index 2d51f62dc..000000000 --- a/kernel/net/mac80211/cfg.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * mac80211 configuration hooks for cfg80211 - */ -#ifndef __CFG_H -#define __CFG_H - -extern const struct cfg80211_ops mac80211_config_ops; - -#endif /* __CFG_H */ diff --git a/kernel/net/mac80211/chan.c b/kernel/net/mac80211/chan.c index 5bcd4e558..1d1b9b7bd 100644 --- a/kernel/net/mac80211/chan.c +++ b/kernel/net/mac80211/chan.c @@ -190,7 +190,7 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local, return NULL; } -static enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta) +enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta) { switch (sta->bandwidth) { case IEEE80211_STA_RX_BW_20: @@ -264,9 +264,17 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, case NL80211_IFTYPE_AP_VLAN: width = ieee80211_get_max_required_bw(sdata); break; + case NL80211_IFTYPE_STATION: + /* + * The ap's sta->bandwidth is not set yet at this + * point, so take the width from the chandef, but + * account also for TDLS peers + */ + width = max(vif->bss_conf.chandef.width, + ieee80211_get_max_required_bw(sdata)); + break; case NL80211_IFTYPE_P2P_DEVICE: continue; - case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: @@ -554,12 +562,13 @@ static void ieee80211_free_chanctx(struct ieee80211_local *local, kfree_rcu(ctx, rcu_head); } -static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) +void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) { struct ieee80211_chanctx_conf *conf = &ctx->conf; struct ieee80211_sub_if_data *sdata; const struct cfg80211_chan_def *compat = NULL; + struct sta_info *sta; lockdep_assert_held(&local->chanctx_mtx); @@ -581,6 +590,20 @@ static void 
ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, if (WARN_ON_ONCE(!compat)) break; } + + /* TDLS peers can sometimes affect the chandef width */ + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (!sta->uploaded || + !test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || + !sta->tdls_chandef.chan) + continue; + + compat = cfg80211_chandef_compatible(&sta->tdls_chandef, + compat); + if (WARN_ON_ONCE(!compat)) + break; + } rcu_read_unlock(); if (!compat) @@ -664,6 +687,8 @@ out: ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); + ieee80211_check_fast_xmit_iface(sdata); + return ret; } @@ -1008,6 +1033,8 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; + ieee80211_change_chanctx(local, new_ctx, chandef); + vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; vif_chsw[0].new_ctx = &new_ctx->conf; @@ -1030,6 +1057,8 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (sdata->vif.type == NL80211_IFTYPE_AP) __ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + ieee80211_check_fast_xmit_iface(sdata); + if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx); @@ -1079,6 +1108,8 @@ ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; + ieee80211_change_chanctx(local, new_ctx, chandef); + list_del(&sdata->reserved_chanctx_list); sdata->reserved_chanctx = NULL; @@ -1376,6 +1407,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) __ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + ieee80211_check_fast_xmit_iface(sdata); + sdata->radar_required = sdata->reserved_radar_required; if (sdata->vif.bss_conf.chandef.width != diff --git a/kernel/net/mac80211/debugfs.c b/kernel/net/mac80211/debugfs.c index 23813ebb3..4d2aaebd4 100644 --- a/kernel/net/mac80211/debugfs.c +++ 
b/kernel/net/mac80211/debugfs.c @@ -1,4 +1,3 @@ - /* * mac80211 debugfs for wireless PHYs * @@ -92,62 +91,69 @@ static const struct file_operations reset_ops = { }; #endif +static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { +#define FLAG(F) [IEEE80211_HW_##F] = #F + FLAG(HAS_RATE_CONTROL), + FLAG(RX_INCLUDES_FCS), + FLAG(HOST_BROADCAST_PS_BUFFERING), + FLAG(SIGNAL_UNSPEC), + FLAG(SIGNAL_DBM), + FLAG(NEED_DTIM_BEFORE_ASSOC), + FLAG(SPECTRUM_MGMT), + FLAG(AMPDU_AGGREGATION), + FLAG(SUPPORTS_PS), + FLAG(PS_NULLFUNC_STACK), + FLAG(SUPPORTS_DYNAMIC_PS), + FLAG(MFP_CAPABLE), + FLAG(WANT_MONITOR_VIF), + FLAG(NO_AUTO_VIF), + FLAG(SW_CRYPTO_CONTROL), + FLAG(SUPPORT_FAST_XMIT), + FLAG(REPORTS_TX_ACK_STATUS), + FLAG(CONNECTION_MONITOR), + FLAG(QUEUE_CONTROL), + FLAG(SUPPORTS_PER_STA_GTK), + FLAG(AP_LINK_PS), + FLAG(TX_AMPDU_SETUP_IN_HW), + FLAG(SUPPORTS_RC_TABLE), + FLAG(P2P_DEV_ADDR_FOR_INTF), + FLAG(TIMING_BEACON_ONLY), + FLAG(SUPPORTS_HT_CCK_RATES), + FLAG(CHANCTX_STA_CSA), + FLAG(SUPPORTS_CLONED_SKBS), + FLAG(SINGLE_SCAN_ON_ALL_BANDS), + FLAG(TDLS_WIDER_BW), + FLAG(SUPPORTS_AMSDU_IN_AMPDU), + FLAG(BEACON_TX_STATUS), + + /* keep last for the build bug below */ + (void *)0x1 +#undef FLAG +}; + static ssize_t hwflags_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; - int mxln = 500; + size_t bufsz = 30 * NUM_IEEE80211_HW_FLAGS; + char *buf = kzalloc(bufsz, GFP_KERNEL); + char *pos = buf, *end = buf + bufsz - 1; ssize_t rv; - char *buf = kzalloc(mxln, GFP_KERNEL); - int sf = 0; /* how many written so far */ + int i; if (!buf) - return 0; - - sf += scnprintf(buf, mxln - sf, "0x%x\n", local->hw.flags); - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) - sf += scnprintf(buf + sf, mxln - sf, "HAS_RATE_CONTROL\n"); - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) - sf += scnprintf(buf + sf, mxln - sf, "RX_INCLUDES_FCS\n"); - if (local->hw.flags & 
IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING) - sf += scnprintf(buf + sf, mxln - sf, - "HOST_BCAST_PS_BUFFERING\n"); - if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE) - sf += scnprintf(buf + sf, mxln - sf, - "2GHZ_SHORT_SLOT_INCAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE) - sf += scnprintf(buf + sf, mxln - sf, - "2GHZ_SHORT_PREAMBLE_INCAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) - sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_UNSPEC\n"); - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) - sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_DBM\n"); - if (local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC) - sf += scnprintf(buf + sf, mxln - sf, - "NEED_DTIM_BEFORE_ASSOC\n"); - if (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT) - sf += scnprintf(buf + sf, mxln - sf, "SPECTRUM_MGMT\n"); - if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) - sf += scnprintf(buf + sf, mxln - sf, "AMPDU_AGGREGATION\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_PS) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PS\n"); - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) - sf += scnprintf(buf + sf, mxln - sf, "PS_NULLFUNC_STACK\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_PS\n"); - if (local->hw.flags & IEEE80211_HW_MFP_CAPABLE) - sf += scnprintf(buf + sf, mxln - sf, "MFP_CAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) - sf += scnprintf(buf + sf, mxln - sf, - "REPORTS_TX_ACK_STATUS\n"); - if (local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) - sf += scnprintf(buf + sf, mxln - sf, "CONNECTION_MONITOR\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PER_STA_GTK\n"); - if (local->hw.flags & IEEE80211_HW_AP_LINK_PS) - sf += scnprintf(buf + sf, mxln - sf, "AP_LINK_PS\n"); - if (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW) - sf += scnprintf(buf + sf, mxln 
- sf, "TX_AMPDU_SETUP_IN_HW\n"); + return -ENOMEM; + + /* fail compilation if somebody adds or removes + * a flag without updating the name array above + */ + BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1); + + for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { + if (test_bit(i, local->hw.flags)) + pos += scnprintf(pos, end - pos, "%s\n", + hw_flag_names[i]); + } rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); @@ -219,8 +225,8 @@ static const struct file_operations stats_ ##name## _ops = { \ .llseek = generic_file_llseek, \ }; -#define DEBUGFS_STATS_ADD(name, field) \ - debugfs_create_u32(#name, 0400, statsd, (u32 *) &field); +#define DEBUGFS_STATS_ADD(name) \ + debugfs_create_u32(#name, 0400, statsd, &local->name); #define DEBUGFS_DEVSTATS_ADD(name) \ debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops); @@ -255,53 +261,30 @@ void debugfs_hw_add(struct ieee80211_local *local) if (!statsd) return; - DEBUGFS_STATS_ADD(transmitted_fragment_count, - local->dot11TransmittedFragmentCount); - DEBUGFS_STATS_ADD(multicast_transmitted_frame_count, - local->dot11MulticastTransmittedFrameCount); - DEBUGFS_STATS_ADD(failed_count, local->dot11FailedCount); - DEBUGFS_STATS_ADD(retry_count, local->dot11RetryCount); - DEBUGFS_STATS_ADD(multiple_retry_count, - local->dot11MultipleRetryCount); - DEBUGFS_STATS_ADD(frame_duplicate_count, - local->dot11FrameDuplicateCount); - DEBUGFS_STATS_ADD(received_fragment_count, - local->dot11ReceivedFragmentCount); - DEBUGFS_STATS_ADD(multicast_received_frame_count, - local->dot11MulticastReceivedFrameCount); - DEBUGFS_STATS_ADD(transmitted_frame_count, - local->dot11TransmittedFrameCount); #ifdef CONFIG_MAC80211_DEBUG_COUNTERS - DEBUGFS_STATS_ADD(tx_handlers_drop, local->tx_handlers_drop); - DEBUGFS_STATS_ADD(tx_handlers_queued, local->tx_handlers_queued); - DEBUGFS_STATS_ADD(tx_handlers_drop_fragment, - local->tx_handlers_drop_fragment); - 
DEBUGFS_STATS_ADD(tx_handlers_drop_wep, - local->tx_handlers_drop_wep); - DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc, - local->tx_handlers_drop_not_assoc); - DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port, - local->tx_handlers_drop_unauth_port); - DEBUGFS_STATS_ADD(rx_handlers_drop, local->rx_handlers_drop); - DEBUGFS_STATS_ADD(rx_handlers_queued, local->rx_handlers_queued); - DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc, - local->rx_handlers_drop_nullfunc); - DEBUGFS_STATS_ADD(rx_handlers_drop_defrag, - local->rx_handlers_drop_defrag); - DEBUGFS_STATS_ADD(rx_handlers_drop_short, - local->rx_handlers_drop_short); - DEBUGFS_STATS_ADD(tx_expand_skb_head, - local->tx_expand_skb_head); - DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned, - local->tx_expand_skb_head_cloned); - DEBUGFS_STATS_ADD(rx_expand_skb_head, - local->rx_expand_skb_head); - DEBUGFS_STATS_ADD(rx_expand_skb_head2, - local->rx_expand_skb_head2); - DEBUGFS_STATS_ADD(rx_handlers_fragments, - local->rx_handlers_fragments); - DEBUGFS_STATS_ADD(tx_status_drop, - local->tx_status_drop); + DEBUGFS_STATS_ADD(dot11TransmittedFragmentCount); + DEBUGFS_STATS_ADD(dot11MulticastTransmittedFrameCount); + DEBUGFS_STATS_ADD(dot11FailedCount); + DEBUGFS_STATS_ADD(dot11RetryCount); + DEBUGFS_STATS_ADD(dot11MultipleRetryCount); + DEBUGFS_STATS_ADD(dot11FrameDuplicateCount); + DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); + DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); + DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); + DEBUGFS_STATS_ADD(tx_handlers_drop); + DEBUGFS_STATS_ADD(tx_handlers_queued); + DEBUGFS_STATS_ADD(tx_handlers_drop_wep); + DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc); + DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port); + DEBUGFS_STATS_ADD(rx_handlers_drop); + DEBUGFS_STATS_ADD(rx_handlers_queued); + DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc); + DEBUGFS_STATS_ADD(rx_handlers_drop_defrag); + DEBUGFS_STATS_ADD(tx_expand_skb_head); + DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned); + 
DEBUGFS_STATS_ADD(rx_expand_skb_head_defrag); + DEBUGFS_STATS_ADD(rx_handlers_fragments); + DEBUGFS_STATS_ADD(tx_status_drop); #endif DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount); DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount); diff --git a/kernel/net/mac80211/debugfs_key.c b/kernel/net/mac80211/debugfs_key.c index 71ac1b5f4..7961e7d0b 100644 --- a/kernel/net/mac80211/debugfs_key.c +++ b/kernel/net/mac80211/debugfs_key.c @@ -2,6 +2,7 @@ * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc * Copyright 2007 Johannes Berg + * Copyright (C) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -34,6 +35,14 @@ static const struct file_operations key_ ##name## _ops = { \ .llseek = generic_file_llseek, \ } +#define KEY_OPS_W(name) \ +static const struct file_operations key_ ##name## _ops = { \ + .read = key_##name##_read, \ + .write = key_##name##_write, \ + .open = simple_open, \ + .llseek = generic_file_llseek, \ +} + #define KEY_FILE(name, format) \ KEY_READ_##format(name) \ KEY_OPS(name) @@ -57,7 +66,6 @@ KEY_CONF_FILE(keylen, D); KEY_CONF_FILE(keyidx, D); KEY_CONF_FILE(hw_key_idx, D); KEY_FILE(flags, X); -KEY_FILE(tx_rx_count, D); KEY_READ(ifindex, sdata->name, "%s\n"); KEY_OPS(ifindex); @@ -75,6 +83,41 @@ static ssize_t key_algorithm_read(struct file *file, } KEY_OPS(algorithm); +static ssize_t key_tx_spec_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct ieee80211_key *key = file->private_data; + u64 pn; + int ret; + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + return -EINVAL; + case WLAN_CIPHER_SUITE_TKIP: + /* not supported yet */ + return -EOPNOTSUPP; + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + 
case WLAN_CIPHER_SUITE_BIP_GMAC_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + ret = kstrtou64_from_user(userbuf, count, 16, &pn); + if (ret) + return ret; + /* PN is a 48-bit counter */ + if (pn >= (1ULL << 48)) + return -ERANGE; + atomic64_set(&key->conf.tx_pn, pn); + return count; + default: + return 0; + } +} + static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -95,28 +138,13 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn = atomic64_read(&key->u.ccmp.tx_pn); - len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", - (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), - (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn = atomic64_read(&key->u.aes_cmac.tx_pn); - len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", - (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), - (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); - break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn = atomic64_read(&key->u.aes_gmac.tx_pn); - len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", - (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), - (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); - break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn = atomic64_read(&key->u.gcmp.tx_pn); + pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); @@ -126,7 +154,7 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } -KEY_OPS(tx_spec); +KEY_OPS_W(tx_spec); static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) @@ -294,6 
+322,9 @@ KEY_OPS(key); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, key->debugfs.dir, \ key, &key_##name##_ops); +#define DEBUGFS_ADD_W(name) \ + debugfs_create_file(#name, 0600, key->debugfs.dir, \ + key, &key_##name##_ops); void ieee80211_debugfs_key_add(struct ieee80211_key *key) { @@ -325,9 +356,8 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key) DEBUGFS_ADD(flags); DEBUGFS_ADD(keyidx); DEBUGFS_ADD(hw_key_idx); - DEBUGFS_ADD(tx_rx_count); DEBUGFS_ADD(algorithm); - DEBUGFS_ADD(tx_spec); + DEBUGFS_ADD_W(tx_spec); DEBUGFS_ADD(rx_spec); DEBUGFS_ADD(replays); DEBUGFS_ADD(icverrors); diff --git a/kernel/net/mac80211/debugfs_netdev.c b/kernel/net/mac80211/debugfs_netdev.c index c09c0131b..37ea30e07 100644 --- a/kernel/net/mac80211/debugfs_netdev.c +++ b/kernel/net/mac80211/debugfs_netdev.c @@ -114,14 +114,6 @@ static ssize_t ieee80211_if_fmt_##name( \ return scnprintf(buf, buflen, "%pM\n", sdata->field); \ } -#define IEEE80211_IF_FMT_DEC_DIV_16(name, field) \ -static ssize_t ieee80211_if_fmt_##name( \ - const struct ieee80211_sub_if_data *sdata, \ - char *buf, int buflen) \ -{ \ - return scnprintf(buf, buflen, "%d\n", sdata->field / 16); \ -} - #define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, field) \ static ssize_t ieee80211_if_fmt_##name( \ const struct ieee80211_sub_if_data *sdata, \ @@ -186,6 +178,38 @@ IEEE80211_IF_FILE(rc_rateidx_mcs_mask_2ghz, IEEE80211_IF_FILE(rc_rateidx_mcs_mask_5ghz, rc_rateidx_mcs_mask[IEEE80211_BAND_5GHZ], HEXARRAY); +static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_2ghz( + const struct ieee80211_sub_if_data *sdata, + char *buf, int buflen) +{ + int i, len = 0; + const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_2GHZ]; + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) + len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]); + len += scnprintf(buf + len, buflen - len, "\n"); + + return len; +} + +IEEE80211_IF_FILE_R(rc_rateidx_vht_mcs_mask_2ghz); + +static ssize_t 
ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_5ghz( + const struct ieee80211_sub_if_data *sdata, + char *buf, int buflen) +{ + int i, len = 0; + const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_5GHZ]; + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) + len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]); + len += scnprintf(buf + len, buflen - len, "\n"); + + return len; +} + +IEEE80211_IF_FILE_R(rc_rateidx_vht_mcs_mask_5ghz); + IEEE80211_IF_FILE(flags, flags, HEX); IEEE80211_IF_FILE(state, state, LHEX); IEEE80211_IF_FILE(txpower, vif.bss_conf.txpower, DEC); @@ -215,8 +239,6 @@ IEEE80211_IF_FILE_R(hw_queues); /* STA attributes */ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); -IEEE80211_IF_FILE(last_beacon, u.mgd.last_beacon_signal, DEC); -IEEE80211_IF_FILE(ave_beacon, u.mgd.ave_beacon_signal, DEC_DIV_16); IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS); static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, @@ -423,6 +445,34 @@ static ssize_t ieee80211_if_parse_uapsd_max_sp_len( } IEEE80211_IF_FILE_RW(uapsd_max_sp_len); +static ssize_t ieee80211_if_fmt_tdls_wider_bw( + const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) +{ + const struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + bool tdls_wider_bw; + + tdls_wider_bw = ieee80211_hw_check(&sdata->local->hw, TDLS_WIDER_BW) && + !ifmgd->tdls_wider_bw_prohibited; + + return snprintf(buf, buflen, "%d\n", tdls_wider_bw); +} + +static ssize_t ieee80211_if_parse_tdls_wider_bw( + struct ieee80211_sub_if_data *sdata, const char *buf, int buflen) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + u8 val; + int ret; + + ret = kstrtou8(buf, 0, &val); + if (ret) + return ret; + + ifmgd->tdls_wider_bw_prohibited = !val; + return buflen; +} +IEEE80211_IF_FILE_RW(tdls_wider_bw); + /* AP attributes */ IEEE80211_IF_FILE(num_mcast_sta, u.ap.num_mcast_sta, ATOMIC); IEEE80211_IF_FILE(num_sta_ps, u.ap.ps.num_sta_ps, ATOMIC); @@ 
-565,6 +615,8 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(rc_rateidx_mask_5ghz); DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz); DEBUGFS_ADD(rc_rateidx_mcs_mask_5ghz); + DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_2ghz); + DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); DEBUGFS_ADD(hw_queues); } @@ -572,14 +624,13 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_ADD(bssid); DEBUGFS_ADD(aid); - DEBUGFS_ADD(last_beacon); - DEBUGFS_ADD(ave_beacon); DEBUGFS_ADD(beacon_timeout); DEBUGFS_ADD_MODE(smps, 0600); DEBUGFS_ADD_MODE(tkip_mic_test, 0200); DEBUGFS_ADD_MODE(beacon_loss, 0200); DEBUGFS_ADD_MODE(uapsd_queues, 0600); DEBUGFS_ADD_MODE(uapsd_max_sp_len, 0600); + DEBUGFS_ADD_MODE(tdls_wider_bw, 0600); } static void add_ap_files(struct ieee80211_sub_if_data *sdata) diff --git a/kernel/net/mac80211/debugfs_sta.c b/kernel/net/mac80211/debugfs_sta.c index 252859e90..a39512f09 100644 --- a/kernel/net/mac80211/debugfs_sta.c +++ b/kernel/net/mac80211/debugfs_sta.c @@ -29,8 +29,6 @@ static ssize_t sta_ ##name## _read(struct file *file, \ format_string, sta->field); \ } #define STA_READ_D(name, field) STA_READ(name, field, "%d\n") -#define STA_READ_U(name, field) STA_READ(name, field, "%u\n") -#define STA_READ_S(name, field) STA_READ(name, field, "%s\n") #define STA_OPS(name) \ static const struct file_operations sta_ ##name## _ops = { \ @@ -52,10 +50,6 @@ static const struct file_operations sta_ ##name## _ops = { \ STA_OPS(name) STA_FILE(aid, sta.aid, D); -STA_FILE(dev, sdata->name, S); -STA_FILE(last_signal, last_signal, D); -STA_FILE(last_ack_signal, last_ack_signal, D); -STA_FILE(beacon_loss_count, beacon_loss_count, D); static ssize_t sta_flags_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) @@ -101,40 +95,6 @@ static ssize_t sta_num_ps_buf_frames_read(struct file *file, } STA_OPS(num_ps_buf_frames); -static ssize_t sta_inactive_ms_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) 
-{ - struct sta_info *sta = file->private_data; - return mac80211_format_buffer(userbuf, count, ppos, "%d\n", - jiffies_to_msecs(jiffies - sta->last_rx)); -} -STA_OPS(inactive_ms); - - -static ssize_t sta_connected_time_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct timespec uptime; - struct tm result; - long connected_time_secs; - char buf[100]; - int res; - ktime_get_ts(&uptime); - connected_time_secs = uptime.tv_sec - sta->last_connected; - time_to_tm(connected_time_secs, 0, &result); - result.tm_year -= 70; - result.tm_mday -= 1; - res = scnprintf(buf, sizeof(buf), - "years - %ld\nmonths - %d\ndays - %d\nclock - %d:%d:%d\n\n", - result.tm_year, result.tm_mon, result.tm_mday, - result.tm_hour, result.tm_min, result.tm_sec); - return simple_read_from_buffer(userbuf, count, ppos, buf, res); -} -STA_OPS(connected_time); - - - static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -359,37 +319,6 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, } STA_OPS(vht_capa); -static ssize_t sta_current_tx_rate_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct rate_info rinfo; - u16 rate; - sta_set_rate_info_tx(sta, &sta->last_tx_rate, &rinfo); - rate = cfg80211_calculate_bitrate(&rinfo); - - return mac80211_format_buffer(userbuf, count, ppos, - "%d.%d MBit/s\n", - rate/10, rate%10); -} -STA_OPS(current_tx_rate); - -static ssize_t sta_last_rx_rate_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct rate_info rinfo; - u16 rate; - - sta_set_rate_info_rx(sta, &rinfo); - - rate = cfg80211_calculate_bitrate(&rinfo); - - return mac80211_format_buffer(userbuf, count, ppos, - "%d.%d MBit/s\n", - rate/10, rate%10); -} -STA_OPS(last_rx_rate); #define 
DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ @@ -432,30 +361,14 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD(flags); DEBUGFS_ADD(num_ps_buf_frames); - DEBUGFS_ADD(inactive_ms); - DEBUGFS_ADD(connected_time); DEBUGFS_ADD(last_seq_ctrl); DEBUGFS_ADD(agg_status); - DEBUGFS_ADD(dev); - DEBUGFS_ADD(last_signal); - DEBUGFS_ADD(beacon_loss_count); DEBUGFS_ADD(ht_capa); DEBUGFS_ADD(vht_capa); - DEBUGFS_ADD(last_ack_signal); - DEBUGFS_ADD(current_tx_rate); - DEBUGFS_ADD(last_rx_rate); - - DEBUGFS_ADD_COUNTER(rx_packets, rx_packets); - DEBUGFS_ADD_COUNTER(tx_packets, tx_packets); - DEBUGFS_ADD_COUNTER(rx_bytes, rx_bytes); - DEBUGFS_ADD_COUNTER(tx_bytes, tx_bytes); - DEBUGFS_ADD_COUNTER(rx_duplicates, num_duplicates); - DEBUGFS_ADD_COUNTER(rx_fragments, rx_fragments); - DEBUGFS_ADD_COUNTER(rx_dropped, rx_dropped); - DEBUGFS_ADD_COUNTER(tx_fragments, tx_fragments); - DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count); - DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed); - DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count); + + DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates); + DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); + DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered); if (sizeof(sta->driver_buffered_tids) == sizeof(u32)) debugfs_create_x32("driver_buffered_tids", 0400, diff --git a/kernel/net/mac80211/driver-ops.c b/kernel/net/mac80211/driver-ops.c new file mode 100644 index 000000000..ca1fe5576 --- /dev/null +++ b/kernel/net/mac80211/driver-ops.c @@ -0,0 +1,309 @@ +/* + * Copyright 2015 Intel Deutschland GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include "ieee80211_i.h" +#include "trace.h" +#include "driver-ops.h" + +int drv_start(struct ieee80211_local *local) +{ + int ret; + + might_sleep(); + + if (WARN_ON(local->started)) + return -EALREADY; + + trace_drv_start(local); + local->started = true; + /* allow rx frames */ + smp_mb(); + ret = local->ops->start(&local->hw); + trace_drv_return_int(local, ret); + + if (ret) + local->started = false; + + return ret; +} + +void drv_stop(struct ieee80211_local *local) +{ + might_sleep(); + + if (WARN_ON(!local->started)) + return; + + trace_drv_stop(local); + local->ops->stop(&local->hw); + trace_drv_return_void(local); + + /* sync away all work on the tasklet before clearing started */ + tasklet_disable(&local->tasklet); + tasklet_enable(&local->tasklet); + + barrier(); + + local->started = false; +} + +int drv_add_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + int ret; + + might_sleep(); + + if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || + (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && + !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)))) + return -EINVAL; + + trace_drv_add_interface(local, sdata); + ret = local->ops->add_interface(&local->hw, &sdata->vif); + trace_drv_return_int(local, ret); + + if (ret == 0) + sdata->flags |= IEEE80211_SDATA_IN_DRIVER; + + return ret; +} + +int drv_change_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type, bool p2p) +{ + int ret; + + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return -EIO; + + trace_drv_change_interface(local, sdata, type, p2p); + ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); + trace_drv_return_int(local, ret); + return ret; +} + +void drv_remove_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + 
trace_drv_remove_interface(local, sdata); + local->ops->remove_interface(&local->hw, &sdata->vif); + sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; + trace_drv_return_void(local); +} + +__must_check +int drv_sta_state(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + enum ieee80211_sta_state old_state, + enum ieee80211_sta_state new_state) +{ + int ret = 0; + + might_sleep(); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; + + trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state); + if (local->ops->sta_state) { + ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta, + old_state, new_state); + } else if (old_state == IEEE80211_STA_AUTH && + new_state == IEEE80211_STA_ASSOC) { + ret = drv_sta_add(local, sdata, &sta->sta); + if (ret == 0) + sta->uploaded = true; + } else if (old_state == IEEE80211_STA_ASSOC && + new_state == IEEE80211_STA_AUTH) { + drv_sta_remove(local, sdata, &sta->sta); + } + trace_drv_return_int(local, ret); + return ret; +} + +void drv_sta_rc_update(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, u32 changed) +{ + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && + (sdata->vif.type != NL80211_IFTYPE_ADHOC && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); + + trace_drv_sta_rc_update(local, sdata, sta, changed); + if (local->ops->sta_rc_update) + local->ops->sta_rc_update(&local->hw, &sdata->vif, + sta, changed); + + trace_drv_return_void(local); +} + +int drv_conf_tx(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, u16 ac, + const struct ieee80211_tx_queue_params *params) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return -EIO; + + if (WARN_ONCE(params->cw_min == 0 || + params->cw_min > params->cw_max, + "%s: invalid CW_min/CW_max: %d/%d\n", + 
sdata->name, params->cw_min, params->cw_max)) + return -EINVAL; + + trace_drv_conf_tx(local, sdata, ac, params); + if (local->ops->conf_tx) + ret = local->ops->conf_tx(&local->hw, &sdata->vif, + ac, params); + trace_drv_return_int(local, ret); + return ret; +} + +u64 drv_get_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + u64 ret = -1ULL; + + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return ret; + + trace_drv_get_tsf(local, sdata); + if (local->ops->get_tsf) + ret = local->ops->get_tsf(&local->hw, &sdata->vif); + trace_drv_return_u64(local, ret); + return ret; +} + +void drv_set_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u64 tsf) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_set_tsf(local, sdata, tsf); + if (local->ops->set_tsf) + local->ops->set_tsf(&local->hw, &sdata->vif, tsf); + trace_drv_return_void(local); +} + +void drv_reset_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_reset_tsf(local, sdata); + if (local->ops->reset_tsf) + local->ops->reset_tsf(&local->hw, &sdata->vif); + trace_drv_return_void(local); +} + +int drv_switch_vif_chanctx(struct ieee80211_local *local, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, enum ieee80211_chanctx_switch_mode mode) +{ + int ret = 0; + int i; + + might_sleep(); + + if (!local->ops->switch_vif_chanctx) + return -EOPNOTSUPP; + + for (i = 0; i < n_vifs; i++) { + struct ieee80211_chanctx *new_ctx = + container_of(vifs[i].new_ctx, + struct ieee80211_chanctx, + conf); + struct ieee80211_chanctx *old_ctx = + container_of(vifs[i].old_ctx, + struct ieee80211_chanctx, + conf); + + WARN_ON_ONCE(!old_ctx->driver_present); + WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS && + new_ctx->driver_present) || + (mode == CHANCTX_SWMODE_REASSIGN_VIF && + !new_ctx->driver_present)); + } + + 
trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode); + ret = local->ops->switch_vif_chanctx(&local->hw, + vifs, n_vifs, mode); + trace_drv_return_int(local, ret); + + if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) { + for (i = 0; i < n_vifs; i++) { + struct ieee80211_chanctx *new_ctx = + container_of(vifs[i].new_ctx, + struct ieee80211_chanctx, + conf); + struct ieee80211_chanctx *old_ctx = + container_of(vifs[i].old_ctx, + struct ieee80211_chanctx, + conf); + + new_ctx->driver_present = true; + old_ctx->driver_present = false; + } + } + + return ret; +} + +int drv_ampdu_action(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum ieee80211_ampdu_mlme_action action, + struct ieee80211_sta *sta, u16 tid, + u16 *ssn, u8 buf_size, bool amsdu) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; + + trace_drv_ampdu_action(local, sdata, action, sta, tid, + ssn, buf_size, amsdu); + + if (local->ops->ampdu_action) + ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action, + sta, tid, ssn, buf_size, amsdu); + + trace_drv_return_int(local, ret); + + return ret; +} diff --git a/kernel/net/mac80211/driver-ops.h b/kernel/net/mac80211/driver-ops.h index 26e1ca8a4..154ce4b13 100644 --- a/kernel/net/mac80211/driver-ops.h +++ b/kernel/net/mac80211/driver-ops.h @@ -66,36 +66,8 @@ static inline int drv_get_et_sset_count(struct ieee80211_sub_if_data *sdata, return rv; } -static inline int drv_start(struct ieee80211_local *local) -{ - int ret; - - might_sleep(); - - trace_drv_start(local); - local->started = true; - smp_mb(); - ret = local->ops->start(&local->hw); - trace_drv_return_int(local, ret); - return ret; -} - -static inline void drv_stop(struct ieee80211_local *local) -{ - might_sleep(); - - trace_drv_stop(local); - local->ops->stop(&local->hw); - trace_drv_return_void(local); - - /* sync away all work on the tasklet before clearing started */ - 
tasklet_disable(&local->tasklet); - tasklet_enable(&local->tasklet); - - barrier(); - - local->started = false; -} +int drv_start(struct ieee80211_local *local); +void drv_stop(struct ieee80211_local *local); #ifdef CONFIG_PM static inline int drv_suspend(struct ieee80211_local *local, @@ -137,59 +109,15 @@ static inline void drv_set_wakeup(struct ieee80211_local *local, } #endif -static inline int drv_add_interface(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) -{ - int ret; - - might_sleep(); - - if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF) && - !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)))) - return -EINVAL; - - trace_drv_add_interface(local, sdata); - ret = local->ops->add_interface(&local->hw, &sdata->vif); - trace_drv_return_int(local, ret); - - if (ret == 0) - sdata->flags |= IEEE80211_SDATA_IN_DRIVER; - - return ret; -} - -static inline int drv_change_interface(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - enum nl80211_iftype type, bool p2p) -{ - int ret; - - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return -EIO; - - trace_drv_change_interface(local, sdata, type, p2p); - ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); - trace_drv_return_int(local, ret); - return ret; -} - -static inline void drv_remove_interface(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) -{ - might_sleep(); +int drv_add_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); - if (!check_sdata_in_driver(sdata)) - return; +int drv_change_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type, bool p2p); - trace_drv_remove_interface(local, sdata); - local->ops->remove_interface(&local->hw, &sdata->vif); - sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; - trace_drv_return_void(local); -} +void 
drv_remove_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); static inline int drv_config(struct ieee80211_local *local, u32 changed) { @@ -260,6 +188,22 @@ static inline void drv_configure_filter(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void drv_config_iface_filter(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + unsigned int filter_flags, + unsigned int changed_flags) +{ + might_sleep(); + + trace_drv_config_iface_filter(local, sdata, filter_flags, + changed_flags); + if (local->ops->config_iface_filter) + local->ops->config_iface_filter(&local->hw, &sdata->vif, + filter_flags, + changed_flags); + trace_drv_return_void(local); +} + static inline int drv_set_tim(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set) { @@ -417,12 +361,13 @@ static inline int drv_get_stats(struct ieee80211_local *local, return ret; } -static inline void drv_get_tkip_seq(struct ieee80211_local *local, - u8 hw_key_idx, u32 *iv32, u16 *iv16) +static inline void drv_get_key_seq(struct ieee80211_local *local, + struct ieee80211_key *key, + struct ieee80211_key_seq *seq) { - if (local->ops->get_tkip_seq) - local->ops->get_tkip_seq(&local->hw, hw_key_idx, iv32, iv16); - trace_drv_get_tkip_seq(local, hw_key_idx, iv32, iv16); + if (local->ops->get_key_seq) + local->ops->get_key_seq(&local->hw, &key->conf, seq); + trace_drv_get_key_seq(local, &key->conf); } static inline int drv_set_frag_threshold(struct ieee80211_local *local, @@ -572,57 +517,16 @@ static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, trace_drv_return_void(local); } -static inline __must_check +__must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, - enum ieee80211_sta_state new_state) -{ - int ret = 0; - - might_sleep(); + enum ieee80211_sta_state new_state); - sdata = get_bss_sdata(sdata); - if 
(!check_sdata_in_driver(sdata)) - return -EIO; - - trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state); - if (local->ops->sta_state) { - ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta, - old_state, new_state); - } else if (old_state == IEEE80211_STA_AUTH && - new_state == IEEE80211_STA_ASSOC) { - ret = drv_sta_add(local, sdata, &sta->sta); - if (ret == 0) - sta->uploaded = true; - } else if (old_state == IEEE80211_STA_ASSOC && - new_state == IEEE80211_STA_AUTH) { - drv_sta_remove(local, sdata, &sta->sta); - } - trace_drv_return_int(local, ret); - return ret; -} - -static inline void drv_sta_rc_update(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, u32 changed) -{ - sdata = get_bss_sdata(sdata); - if (!check_sdata_in_driver(sdata)) - return; - - WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && - (sdata->vif.type != NL80211_IFTYPE_ADHOC && - sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); - - trace_drv_sta_rc_update(local, sdata, sta, changed); - if (local->ops->sta_rc_update) - local->ops->sta_rc_update(&local->hw, &sdata->vif, - sta, changed); - - trace_drv_return_void(local); -} +void drv_sta_rc_update(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, u32 changed); static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, @@ -654,76 +558,17 @@ static inline void drv_sta_statistics(struct ieee80211_local *local, trace_drv_return_void(local); } -static inline int drv_conf_tx(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, u16 ac, - const struct ieee80211_tx_queue_params *params) -{ - int ret = -EOPNOTSUPP; - - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return -EIO; - - if (WARN_ONCE(params->cw_min == 0 || - params->cw_min > params->cw_max, - "%s: invalid CW_min/CW_max: %d/%d\n", - sdata->name, params->cw_min, params->cw_max)) - return -EINVAL; - - 
trace_drv_conf_tx(local, sdata, ac, params); - if (local->ops->conf_tx) - ret = local->ops->conf_tx(&local->hw, &sdata->vif, - ac, params); - trace_drv_return_int(local, ret); - return ret; -} - -static inline u64 drv_get_tsf(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) -{ - u64 ret = -1ULL; - - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return ret; - - trace_drv_get_tsf(local, sdata); - if (local->ops->get_tsf) - ret = local->ops->get_tsf(&local->hw, &sdata->vif); - trace_drv_return_u64(local, ret); - return ret; -} - -static inline void drv_set_tsf(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - u64 tsf) -{ - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return; - - trace_drv_set_tsf(local, sdata, tsf); - if (local->ops->set_tsf) - local->ops->set_tsf(&local->hw, &sdata->vif, tsf); - trace_drv_return_void(local); -} - -static inline void drv_reset_tsf(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) -{ - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return; +int drv_conf_tx(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, u16 ac, + const struct ieee80211_tx_queue_params *params); - trace_drv_reset_tsf(local, sdata); - if (local->ops->reset_tsf) - local->ops->reset_tsf(&local->hw, &sdata->vif); - trace_drv_return_void(local); -} +u64 drv_get_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); +void drv_set_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u64 tsf); +void drv_reset_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); static inline int drv_tx_last_beacon(struct ieee80211_local *local) { @@ -738,30 +583,11 @@ static inline int drv_tx_last_beacon(struct ieee80211_local *local) return ret; } -static inline int drv_ampdu_action(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta 
*sta, u16 tid, - u16 *ssn, u8 buf_size) -{ - int ret = -EOPNOTSUPP; - - might_sleep(); - - sdata = get_bss_sdata(sdata); - if (!check_sdata_in_driver(sdata)) - return -EIO; - - trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn, buf_size); - - if (local->ops->ampdu_action) - ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action, - sta, tid, ssn, buf_size); - - trace_drv_return_int(local, ret); - - return ret; -} +int drv_ampdu_action(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum ieee80211_ampdu_mlme_action action, + struct ieee80211_sta *sta, u16 tid, + u16 *ssn, u8 buf_size, bool amsdu); static inline int drv_get_survey(struct ieee80211_local *local, int idx, struct survey_info *survey) @@ -1017,6 +843,8 @@ static inline int drv_add_chanctx(struct ieee80211_local *local, { int ret = -EOPNOTSUPP; + might_sleep(); + trace_drv_add_chanctx(local, ctx); if (local->ops->add_chanctx) ret = local->ops->add_chanctx(&local->hw, &ctx->conf); @@ -1030,6 +858,8 @@ static inline int drv_add_chanctx(struct ieee80211_local *local, static inline void drv_remove_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { + might_sleep(); + if (WARN_ON(!ctx->driver_present)) return; @@ -1044,6 +874,8 @@ static inline void drv_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, u32 changed) { + might_sleep(); + trace_drv_change_chanctx(local, ctx, changed); if (local->ops->change_chanctx) { WARN_ON_ONCE(!ctx->driver_present); @@ -1077,6 +909,8 @@ static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *ctx) { + might_sleep(); + if (!check_sdata_in_driver(sdata)) return; @@ -1090,64 +924,17 @@ static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local, trace_drv_return_void(local); } -static inline int -drv_switch_vif_chanctx(struct ieee80211_local *local, - struct ieee80211_vif_chanctx_switch *vifs, - int 
n_vifs, - enum ieee80211_chanctx_switch_mode mode) -{ - int ret = 0; - int i; - - if (!local->ops->switch_vif_chanctx) - return -EOPNOTSUPP; - - for (i = 0; i < n_vifs; i++) { - struct ieee80211_chanctx *new_ctx = - container_of(vifs[i].new_ctx, - struct ieee80211_chanctx, - conf); - struct ieee80211_chanctx *old_ctx = - container_of(vifs[i].old_ctx, - struct ieee80211_chanctx, - conf); - - WARN_ON_ONCE(!old_ctx->driver_present); - WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS && - new_ctx->driver_present) || - (mode == CHANCTX_SWMODE_REASSIGN_VIF && - !new_ctx->driver_present)); - } - - trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode); - ret = local->ops->switch_vif_chanctx(&local->hw, - vifs, n_vifs, mode); - trace_drv_return_int(local, ret); - - if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) { - for (i = 0; i < n_vifs; i++) { - struct ieee80211_chanctx *new_ctx = - container_of(vifs[i].new_ctx, - struct ieee80211_chanctx, - conf); - struct ieee80211_chanctx *old_ctx = - container_of(vifs[i].old_ctx, - struct ieee80211_chanctx, - conf); - - new_ctx->driver_present = true; - old_ctx->driver_present = false; - } - } - - return ret; -} +int drv_switch_vif_chanctx(struct ieee80211_local *local, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, enum ieee80211_chanctx_switch_mode mode); static inline int drv_start_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret = 0; + might_sleep(); + if (!check_sdata_in_driver(sdata)) return -EIO; diff --git a/kernel/net/mac80211/ethtool.c b/kernel/net/mac80211/ethtool.c index 52bcea6ad..9cc986ded 100644 --- a/kernel/net/mac80211/ethtool.c +++ b/kernel/net/mac80211/ethtool.c @@ -38,9 +38,9 @@ static void ieee80211_get_ringparam(struct net_device *dev, static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_packets", "rx_bytes", "rx_duplicates", "rx_fragments", "rx_dropped", - "tx_packets", "tx_bytes", "tx_fragments", + "tx_packets", "tx_bytes", "tx_filtered", 
"tx_retry_failed", "tx_retries", - "beacon_loss", "sta_state", "txrate", "rxrate", "signal", + "sta_state", "txrate", "rxrate", "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; @@ -77,21 +77,19 @@ static void ieee80211_get_stats(struct net_device *dev, memset(data, 0, sizeof(u64) * STA_STATS_LEN); -#define ADD_STA_STATS(sta) \ - do { \ - data[i++] += sta->rx_packets; \ - data[i++] += sta->rx_bytes; \ - data[i++] += sta->num_duplicates; \ - data[i++] += sta->rx_fragments; \ - data[i++] += sta->rx_dropped; \ - \ - data[i++] += sinfo.tx_packets; \ - data[i++] += sinfo.tx_bytes; \ - data[i++] += sta->tx_fragments; \ - data[i++] += sta->tx_filtered_count; \ - data[i++] += sta->tx_retry_failed; \ - data[i++] += sta->tx_retry_count; \ - data[i++] += sta->beacon_loss_count; \ +#define ADD_STA_STATS(sta) \ + do { \ + data[i++] += sta->rx_stats.packets; \ + data[i++] += sta->rx_stats.bytes; \ + data[i++] += sta->rx_stats.num_duplicates; \ + data[i++] += sta->rx_stats.fragments; \ + data[i++] += sta->rx_stats.dropped; \ + \ + data[i++] += sinfo.tx_packets; \ + data[i++] += sinfo.tx_bytes; \ + data[i++] += sta->status_stats.filtered; \ + data[i++] += sta->status_stats.retry_failed; \ + data[i++] += sta->status_stats.retry_count; \ } while (0) /* For Managed stations, find the single station based on BSSID diff --git a/kernel/net/mac80211/event.c b/kernel/net/mac80211/event.c deleted file mode 100644 index 01ae75951..000000000 --- a/kernel/net/mac80211/event.c +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright 2007 Johannes Berg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * mac80211 - events - */ -#include -#include "ieee80211_i.h" - -/* - * Indicate a failed Michael MIC to userspace. 
If the caller knows the TSC of - * the frame that generated the MIC failure (i.e., if it was provided by the - * driver or is still in the frame), it should provide that information. - */ -void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx, - struct ieee80211_hdr *hdr, const u8 *tsc, - gfp_t gfp) -{ - cfg80211_michael_mic_failure(sdata->dev, hdr->addr2, - (hdr->addr1[0] & 0x01) ? - NL80211_KEYTYPE_GROUP : - NL80211_KEYTYPE_PAIRWISE, - keyidx, tsc, gfp); -} diff --git a/kernel/net/mac80211/ibss.c b/kernel/net/mac80211/ibss.c index a9c9d961f..6a12b0f5c 100644 --- a/kernel/net/mac80211/ibss.c +++ b/kernel/net/mac80211/ibss.c @@ -188,7 +188,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, * keep them at 0 */ pos = ieee80211_ie_build_ht_oper(pos, &sband->ht_cap, - chandef, 0); + chandef, 0, false); /* add VHT capability and information IEs */ if (chandef->width != NL80211_CHAN_WIDTH_20 && @@ -229,7 +229,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def chandef; struct ieee80211_channel *chan; struct beacon_data *presp; - enum nl80211_bss_scan_width scan_width; + struct cfg80211_inform_bss bss_meta = {}; bool have_higher_than_11mbit; bool radar_required; int err; @@ -356,7 +356,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, else sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; - ieee80211_set_wmm_default(sdata, true); + ieee80211_set_wmm_default(sdata, true, false); sdata->vif.bss_conf.ibss_joined = true; sdata->vif.bss_conf.ibss_creator = creator; @@ -383,10 +383,11 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, mod_timer(&ifibss->timer, round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); - scan_width = cfg80211_chandef_to_scan_width(&chandef); - bss = cfg80211_inform_bss_width_frame(local->hw.wiphy, chan, - scan_width, mgmt, - presp->head_len, 0, GFP_KERNEL); + bss_meta.chan = chan; + 
bss_meta.scan_width = cfg80211_chandef_to_scan_width(&chandef); + bss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, + presp->head_len, GFP_KERNEL); + cfg80211_put_bss(local->hw.wiphy, bss); netif_carrier_on(sdata->dev); cfg80211_ibss_joined(sdata->dev, ifibss->bssid, chan, GFP_KERNEL); @@ -646,7 +647,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, return NULL; } - sta->last_rx = jiffies; + sta->rx_stats.last_rx = jiffies; /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; @@ -668,7 +669,8 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sta->sdata == sdata && - time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL, + time_after(sta->rx_stats.last_rx + + IEEE80211_IBSS_MERGE_INTERVAL, jiffies)) { active++; break; @@ -1032,8 +1034,11 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, } } - if (sta && elems->wmm_info && local->hw.queues >= IEEE80211_NUM_ACS) + if (sta && !sta->sta.wme && + elems->wmm_info && local->hw.queues >= IEEE80211_NUM_ACS) { sta->sta.wme = true; + ieee80211_check_fast_xmit(sta); + } if (sta && elems->ht_operation && elems->ht_cap_elem && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && @@ -1231,7 +1236,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, if (!sta) return; - sta->last_rx = jiffies; + sta->rx_stats.last_rx = jiffies; /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; @@ -1249,7 +1254,7 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; unsigned long exp_time = IEEE80211_IBSS_INACTIVITY_LIMIT; - unsigned long exp_rsn_time = IEEE80211_IBSS_RSN_INACTIVITY_LIMIT; + unsigned long exp_rsn = IEEE80211_IBSS_RSN_INACTIVITY_LIMIT; mutex_lock(&local->sta_mtx); @@ -1257,8 
+1262,8 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) if (sdata != sta->sdata) continue; - if (time_after(jiffies, sta->last_rx + exp_time) || - (time_after(jiffies, sta->last_rx + exp_rsn_time) && + if (time_after(jiffies, sta->rx_stats.last_rx + exp_time) || + (time_after(jiffies, sta->rx_stats.last_rx + exp_rsn) && sta->sta_state != IEEE80211_STA_AUTHORIZED)) { sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n", sta->sta_state != IEEE80211_STA_AUTHORIZED ? @@ -1727,7 +1732,6 @@ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local) if (sdata->vif.type != NL80211_IFTYPE_ADHOC) continue; sdata->u.ibss.last_scan_completed = jiffies; - ieee80211_queue_work(&local->hw, &sdata->work); } mutex_unlock(&local->iflist_mtx); } diff --git a/kernel/net/mac80211/ieee80211_i.h b/kernel/net/mac80211/ieee80211_i.h index c0a9187bc..6837a46ca 100644 --- a/kernel/net/mac80211/ieee80211_i.h +++ b/kernel/net/mac80211/ieee80211_i.h @@ -34,6 +34,8 @@ #include "sta_info.h" #include "debug.h" +extern const struct cfg80211_ops mac80211_config_ops; + struct ieee80211_local; /* Maximum number of broadcast/multicast frames to buffer when some of the @@ -84,13 +86,13 @@ struct ieee80211_local; #define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) struct ieee80211_fragment_entry { - unsigned long first_frag_time; - unsigned int seq; - unsigned int rx_queue; - unsigned int last_frag; - unsigned int extra_len; struct sk_buff_head skb_list; - int ccmp; /* Whether fragments were encrypted with CCMP */ + unsigned long first_frag_time; + u16 seq; + u16 extra_len; + u16 last_frag; + u8 rx_queue; + bool check_sequential_pn; /* needed for CCMP/GCMP */ u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ }; @@ -181,9 +183,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result; /** * enum ieee80211_packet_rx_flags - packet RX flags - * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed - * (incl. 
multicast frames) - * @IEEE80211_RX_FRAGMENTED: fragmented frame * @IEEE80211_RX_AMSDU: a-MSDU packet * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering @@ -192,8 +191,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result; * @rx_flags field of &struct ieee80211_rx_status. */ enum ieee80211_packet_rx_flags { - IEEE80211_RX_RA_MATCH = BIT(1), - IEEE80211_RX_FRAGMENTED = BIT(2), IEEE80211_RX_AMSDU = BIT(3), IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4), IEEE80211_RX_DEFERRED_RELEASE = BIT(5), @@ -205,8 +202,6 @@ enum ieee80211_packet_rx_flags { * @IEEE80211_RX_CMNTR: received on cooked monitor already * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). - * @IEEE80211_RX_REORDER_TIMER: this frame is released by the - * reorder buffer timeout timer, not the normal RX path * * These flags are used across handling multiple interfaces * for a single frame. @@ -214,10 +209,10 @@ enum ieee80211_packet_rx_flags { enum ieee80211_rx_flags { IEEE80211_RX_CMNTR = BIT(0), IEEE80211_RX_BEACON_REPORTED = BIT(1), - IEEE80211_RX_REORDER_TIMER = BIT(2), }; struct ieee80211_rx_data { + struct napi_struct *napi; struct sk_buff *skb; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -426,6 +421,8 @@ struct ieee80211_sta_tx_tspec { bool downgraded; }; +DECLARE_EWMA(beacon_signal, 16, 4) + struct ieee80211_if_managed { struct timer_list timer; struct timer_list conn_mon_timer; @@ -497,16 +494,7 @@ struct ieee80211_if_managed { s16 p2p_noa_index; - /* Signal strength from the last Beacon frame in the current BSS. */ - int last_beacon_signal; - - /* - * Weighted average of the signal strength from Beacon frames in the - * current BSS. This is in units of 1/16 of the signal unit to maintain - * accuracy and to speed up calculations, i.e., the value need to be - * divided by 16 to get the actual value. 
- */ - int ave_beacon_signal; + struct ewma_beacon_signal ave_beacon_signal; /* * Number of Beacon frames used in ave_beacon_signal. This can be used @@ -515,6 +503,9 @@ struct ieee80211_if_managed { */ unsigned int count_beacon_signal; + /* Number of times beacon loss was invoked. */ + unsigned int beacon_loss_count; + /* * Last Beacon frame signal strength average (ave_beacon_signal / 16) * that triggered a cqm event. 0 indicates that no event has been @@ -542,6 +533,7 @@ struct ieee80211_if_managed { struct sk_buff *teardown_skb; /* A copy to send through the AP */ spinlock_t teardown_lock; /* To lock changing teardown_skb */ bool tdls_chan_switch_prohibited; + bool tdls_wider_bw_prohibited; /* WMM-AC TSPEC support */ struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS]; @@ -722,21 +714,21 @@ struct ieee80211_if_mesh { * enum ieee80211_sub_if_data_flags - virtual interface flags * * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets - * @IEEE80211_SDATA_PROMISC: interface is promisc * @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between * associated stations and deliver multicast frames both * back to wireless media and to the local net stack. * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume. 
* @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver + * @IEEE80211_SDATA_MU_MIMO_OWNER: indicates interface owns MU-MIMO capability */ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_ALLMULTI = BIT(0), - IEEE80211_SDATA_PROMISC = BIT(1), IEEE80211_SDATA_OPERATING_GMODE = BIT(2), IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4), IEEE80211_SDATA_IN_DRIVER = BIT(5), + IEEE80211_SDATA_MU_MIMO_OWNER = BIT(6), }; /** @@ -908,6 +900,9 @@ struct ieee80211_sub_if_data { bool rc_has_mcs_mask[IEEE80211_NUM_BANDS]; u8 rc_rateidx_mcs_mask[IEEE80211_NUM_BANDS][IEEE80211_HT_MCS_MASK_LEN]; + bool rc_has_vht_mcs_mask[IEEE80211_NUM_BANDS]; + u16 rc_rateidx_vht_mcs_mask[IEEE80211_NUM_BANDS][NL80211_VHT_NSS_MAX]; + union { struct ieee80211_if_ap ap; struct ieee80211_if_wds wds; @@ -1015,7 +1010,6 @@ enum sdata_queue_type { IEEE80211_SDATA_QUEUE_AGG_STOP = 2, IEEE80211_SDATA_QUEUE_RX_AGG_START = 3, IEEE80211_SDATA_QUEUE_RX_AGG_STOP = 4, - IEEE80211_SDATA_QUEUE_TDLS_CHSW = 5, }; enum { @@ -1040,7 +1034,6 @@ enum queue_stop_reason { #ifdef CONFIG_MAC80211_LEDS struct tpt_led_trigger { - struct led_trigger trig; char name[32]; const struct ieee80211_tpt_blink *blink_table; unsigned int blink_table_len; @@ -1208,8 +1201,8 @@ struct ieee80211_local { atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES]; - /* number of interfaces with corresponding IFF_ flags */ - atomic_t iff_allmultis, iff_promiscs; + /* number of interfaces with allmulti RX */ + atomic_t iff_allmultis; struct rate_control_ref *rate_ctrl; @@ -1261,6 +1254,15 @@ struct ieee80211_local { struct list_head chanctx_list; struct mutex chanctx_mtx; +#ifdef CONFIG_MAC80211_LEDS + struct led_trigger tx_led, rx_led, assoc_led, radio_led; + struct led_trigger tpt_led; + atomic_t tx_led_active, rx_led_active, assoc_led_active; + atomic_t radio_led_active, tpt_led_active; + struct tpt_led_trigger *tpt_led_trigger; +#endif + +#ifdef CONFIG_MAC80211_DEBUG_COUNTERS /* SNMP counters */ 
/* dot11CountersTable */ u32 dot11TransmittedFragmentCount; @@ -1273,18 +1275,9 @@ struct ieee80211_local { u32 dot11MulticastReceivedFrameCount; u32 dot11TransmittedFrameCount; -#ifdef CONFIG_MAC80211_LEDS - struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led; - struct tpt_led_trigger *tpt_led_trigger; - char tx_led_name[32], rx_led_name[32], - assoc_led_name[32], radio_led_name[32]; -#endif - -#ifdef CONFIG_MAC80211_DEBUG_COUNTERS /* TX/RX handler statistics */ unsigned int tx_handlers_drop; unsigned int tx_handlers_queued; - unsigned int tx_handlers_drop_fragment; unsigned int tx_handlers_drop_wep; unsigned int tx_handlers_drop_not_assoc; unsigned int tx_handlers_drop_unauth_port; @@ -1292,11 +1285,9 @@ struct ieee80211_local { unsigned int rx_handlers_queued; unsigned int rx_handlers_drop_nullfunc; unsigned int rx_handlers_drop_defrag; - unsigned int rx_handlers_drop_short; unsigned int tx_expand_skb_head; unsigned int tx_expand_skb_head_cloned; - unsigned int rx_expand_skb_head; - unsigned int rx_expand_skb_head2; + unsigned int rx_expand_skb_head_defrag; unsigned int rx_handlers_fragments; unsigned int tx_status_drop; #define I802_DEBUG_INC(c) (c)++ @@ -1319,7 +1310,6 @@ struct ieee80211_local { struct work_struct dynamic_ps_enable_work; struct work_struct dynamic_ps_disable_work; struct timer_list dynamic_ps_timer; - struct notifier_block network_latency_notifier; struct notifier_block ifa_notifier; struct notifier_block ifa6_notifier; @@ -1355,14 +1345,16 @@ struct ieee80211_local { struct ieee80211_sub_if_data __rcu *p2p_sdata; - struct napi_struct *napi; - /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct cfg80211_chan_def monitor_chandef; /* extended capabilities provided by mac80211 */ u8 ext_capa[8]; + + /* TDLS channel switch */ + struct work_struct tdls_chsw_work; + struct sk_buff_head skb_queue_tdls_chsw; }; static inline struct ieee80211_sub_if_data * @@ -1503,10 +1495,8 @@ int 
ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_disassoc_request *req); void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); -void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency); +void ieee80211_recalc_ps(struct ieee80211_local *local); void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata); -int ieee80211_max_network_latency(struct notifier_block *nb, - unsigned long data, void *dummy); int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, @@ -1583,7 +1573,7 @@ __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req); int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req); -int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata); +int ieee80211_request_sched_scan_stop(struct ieee80211_local *local); void ieee80211_sched_scan_end(struct ieee80211_local *local); void ieee80211_sched_scan_stopped_work(struct work_struct *work); @@ -1647,6 +1637,14 @@ void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); +void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, + struct ieee80211_supported_band *sband, + int retry_count, int shift, bool send_to_cooked); + +void ieee80211_check_fast_xmit(struct sta_info *sta); +void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); +void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); +void ieee80211_clear_fast_xmit(struct sta_info *sta); /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, @@ -1711,12 +1709,14 @@ enum ieee80211_sta_rx_bandwidth 
ieee80211_sta_cur_vht_bw(struct sta_info *sta); void ieee80211_sta_set_rx_nss(struct sta_info *sta); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band, bool nss_only); + enum ieee80211_band band); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band, bool nss_only); + enum ieee80211_band band); void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); +void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, + u16 vht_mask[NL80211_VHT_NSS_MAX]); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, @@ -1765,16 +1765,11 @@ static inline int __ieee80211_resume(struct ieee80211_hw *hw) /* utility functions/constants */ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ -u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, - enum nl80211_iftype type); int ieee80211_frame_duration(enum ieee80211_band band, size_t len, int rate, int erp, int short_preamble, int shift); -void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx, - struct ieee80211_hdr *hdr, const u8 *tsc, - gfp_t gfp); void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, - bool bss_notify); + bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb); @@ -1854,7 +1849,7 @@ void ieee80211_dynamic_ps_disable_work(struct work_struct *work); void ieee80211_dynamic_ps_timer(unsigned long data); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - int powersave); + bool powersave); void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr); void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, @@ -1967,7 +1962,7 @@ u8 
*ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap); u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, - u16 prot_mode); + u16 prot_mode, bool rifs_mode); u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, @@ -2044,6 +2039,9 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, enum ieee80211_chanctx_mode chanmode, u8 radar_detect); int ieee80211_max_num_channels(struct ieee80211_local *local); +enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta); +void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx); /* TDLS */ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, @@ -2060,8 +2058,8 @@ int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr); -void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb); +void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata); +void ieee80211_tdls_chsw_work(struct work_struct *wk); extern const struct ethtool_ops ieee80211_ethtool_ops; diff --git a/kernel/net/mac80211/iface.c b/kernel/net/mac80211/iface.c index 84cef600c..c9e325d2e 100644 --- a/kernel/net/mac80211/iface.c +++ b/kernel/net/mac80211/iface.c @@ -76,7 +76,8 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata, bool update_bss) { - if (__ieee80211_recalc_txpower(sdata) || update_bss) + if (__ieee80211_recalc_txpower(sdata) || + (update_bss && ieee80211_sdata_running(sdata))) ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_TXPOWER); } @@ -338,7 +339,7 @@ static int 
ieee80211_check_queues(struct ieee80211_sub_if_data *sdata, if ((iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_P2P_GO && iftype != NL80211_IFTYPE_MESH_POINT) || - !(sdata->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) { + !ieee80211_hw_check(&sdata->local->hw, QUEUE_CONTROL)) { sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE; return 0; } @@ -378,7 +379,7 @@ static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata) int i; for (i = 0; i < IEEE80211_NUM_ACS; i++) { - if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE; else if (local->hw.queues >= IEEE80211_NUM_ACS) sdata->vif.hw_queue[i] = i; @@ -393,7 +394,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata; int ret; - if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF)) + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return 0; ASSERT_RTNL(); @@ -454,7 +455,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; - if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF)) + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return; ASSERT_RTNL(); @@ -661,11 +662,13 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) } /* - * set default queue parameters so drivers don't + * Set default queue parameters so drivers don't * need to initialise the hardware if the hardware - * doesn't start up with sane defaults + * doesn't start up with sane defaults. + * Enable QoS for anything but station interfaces. 
*/ - ieee80211_set_wmm_default(sdata, true); + ieee80211_set_wmm_default(sdata, true, + sdata->vif.type != NL80211_IFTYPE_STATION); } set_bit(SDATA_STATE_RUNNING, &sdata->state); @@ -703,16 +706,13 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (sdata->flags & IEEE80211_SDATA_ALLMULTI) atomic_inc(&local->iff_allmultis); - if (sdata->flags & IEEE80211_SDATA_PROMISC) - atomic_inc(&local->iff_promiscs); - if (coming_up) local->open_count++; if (hw_reconf_flags) ieee80211_hw_config(local, hw_reconf_flags); - ieee80211_recalc_ps(local, -1); + ieee80211_recalc_ps(local); if (sdata->vif.type == NL80211_IFTYPE_MONITOR || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { @@ -835,13 +835,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1))); - /* don't count this interface for promisc/allmulti while it is down */ + /* don't count this interface for allmulti while it is down */ if (sdata->flags & IEEE80211_SDATA_ALLMULTI) atomic_dec(&local->iff_allmultis); - if (sdata->flags & IEEE80211_SDATA_PROMISC) - atomic_dec(&local->iff_promiscs); - if (sdata->vif.type == NL80211_IFTYPE_AP) { local->fif_pspoll--; local->fif_probe_req--; @@ -1022,7 +1019,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, drv_remove_interface(local, sdata); } - ieee80211_recalc_ps(local, -1); + ieee80211_recalc_ps(local); if (cancel_scan) flush_delayed_work(&local->scan_work); @@ -1055,12 +1052,10 @@ static void ieee80211_set_multicast_list(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; - int allmulti, promisc, sdata_allmulti, sdata_promisc; + int allmulti, sdata_allmulti; allmulti = !!(dev->flags & IFF_ALLMULTI); - promisc = !!(dev->flags & IFF_PROMISC); sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI); - sdata_promisc = 
!!(sdata->flags & IEEE80211_SDATA_PROMISC); if (allmulti != sdata_allmulti) { if (dev->flags & IFF_ALLMULTI) @@ -1070,13 +1065,6 @@ static void ieee80211_set_multicast_list(struct net_device *dev) sdata->flags ^= IEEE80211_SDATA_ALLMULTI; } - if (promisc != sdata_promisc) { - if (dev->flags & IFF_PROMISC) - atomic_inc(&local->iff_promiscs); - else - atomic_dec(&local->iff_promiscs); - sdata->flags ^= IEEE80211_SDATA_PROMISC; - } spin_lock_bh(&local->filter_lock); __hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len); spin_unlock_bh(&local->filter_lock); @@ -1117,6 +1105,35 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev, return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } +static struct rtnl_link_stats64 * +ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + int i; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *tstats; + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + tstats = per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + stats->rx_packets += rx_packets; + stats->tx_packets += tx_packets; + stats->rx_bytes += rx_bytes; + stats->tx_bytes += tx_bytes; + } + + return stats; +} + static const struct net_device_ops ieee80211_dataif_ops = { .ndo_open = ieee80211_open, .ndo_stop = ieee80211_stop, @@ -1126,6 +1143,7 @@ static const struct net_device_ops ieee80211_dataif_ops = { .ndo_change_mtu = ieee80211_change_mtu, .ndo_set_mac_address = ieee80211_change_mac, .ndo_select_queue = ieee80211_netdev_select_queue, + .ndo_get_stats64 = ieee80211_get_stats64, }; static u16 ieee80211_monitor_select_queue(struct net_device *dev, @@ -1159,14 +1177,21 @@ static const struct net_device_ops ieee80211_monitorif_ops = { .ndo_change_mtu 
= ieee80211_change_mtu, .ndo_set_mac_address = ieee80211_change_mac, .ndo_select_queue = ieee80211_monitor_select_queue, + .ndo_get_stats64 = ieee80211_get_stats64, }; +static void ieee80211_if_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ieee80211_if_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->netdev_ops = &ieee80211_dataif_ops; - dev->destructor = free_netdev; + dev->destructor = ieee80211_if_free; } static void ieee80211_iface_work(struct work_struct *work) @@ -1182,7 +1207,7 @@ static void ieee80211_iface_work(struct work_struct *work) if (!ieee80211_sdata_running(sdata)) return; - if (local->scanning) + if (test_bit(SCAN_SW_SCANNING, &local->scanning)) return; if (!ieee80211_can_run_worker(local)) @@ -1220,8 +1245,6 @@ static void ieee80211_iface_work(struct work_struct *work) WLAN_BACK_RECIPIENT, 0, false); mutex_unlock(&local->sta_mtx); - } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_TDLS_CHSW) { - ieee80211_process_tdls_channel_switch(sdata, skb); } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK) { int len = skb->len; @@ -1564,7 +1587,7 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: - if (local->hw.flags & IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF) { + if (ieee80211_hw_check(&local->hw, P2P_DEV_ADDR_FOR_INTF)) { list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) continue; @@ -1707,6 +1730,12 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, return -ENOMEM; dev_net_set(ndev, wiphy_net(local->hw.wiphy)); + ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!ndev->tstats) { + free_netdev(ndev); + return -ENOMEM; + } + ndev->needed_headroom = local->tx_headroom + 4*6 /* four MAC addresses */ + 2 + 2 + 2 + 2 /* ctl, dur, seq, 
qos */ @@ -1762,13 +1791,23 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, sband = local->hw.wiphy->bands[i]; sdata->rc_rateidx_mask[i] = sband ? (1 << sband->n_bitrates) - 1 : 0; - if (sband) + if (sband) { + __le16 cap; + u16 *vht_rate_mask; + memcpy(sdata->rc_rateidx_mcs_mask[i], sband->ht_cap.mcs.rx_mask, sizeof(sdata->rc_rateidx_mcs_mask[i])); - else + + cap = sband->vht_cap.vht_mcs.rx_mcs_map; + vht_rate_mask = sdata->rc_rateidx_vht_mcs_mask[i]; + ieee80211_get_vht_mask_from_cap(cap, vht_rate_mask); + } else { memset(sdata->rc_rateidx_mcs_mask[i], 0, sizeof(sdata->rc_rateidx_mcs_mask[i])); + memset(sdata->rc_rateidx_vht_mcs_mask[i], 0, + sizeof(sdata->rc_rateidx_vht_mcs_mask[i])); + } } ieee80211_set_default_queues(sdata); @@ -1823,6 +1862,7 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata) unregister_netdevice(sdata->dev); } else { cfg80211_unregister_wdev(&sdata->wdev); + ieee80211_teardown_sdata(sdata); kfree(sdata); } } @@ -1832,13 +1872,8 @@ void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata) if (WARN_ON_ONCE(!test_bit(SDATA_STATE_RUNNING, &sdata->state))) return; ieee80211_do_stop(sdata, true); - ieee80211_teardown_sdata(sdata); } -/* - * Remove all interfaces, may only be called at hardware unregistration - * time because it doesn't do RCU-safe list removals. - */ void ieee80211_remove_interfaces(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata, *tmp; @@ -1847,14 +1882,21 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) ASSERT_RTNL(); - /* - * Close all AP_VLAN interfaces first, as otherwise they - * might be closed while the AP interface they belong to - * is closed, causing unregister_netdevice_many() to crash. + /* Before destroying the interfaces, make sure they're all stopped so + * that the hardware is stopped. Otherwise, the driver might still be + * iterating the interfaces during the shutdown, e.g. 
from a worker + * or from RX processing or similar, and if it does so (using atomic + * iteration) while we're manipulating the list, the iteration will + * crash. + * + * After this, the hardware should be stopped and the driver should + * have stopped all of its activities, so that we can do RCU-unaware + * manipulations of the interface list below. */ - list_for_each_entry(sdata, &local->interfaces, list) - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - dev_close(sdata->dev); + cfg80211_shutdown_all_interfaces(local->hw.wiphy); + + WARN(local->open_count, "%s: open count remains %d\n", + wiphy_name(local->hw.wiphy), local->open_count); mutex_lock(&local->iflist_mtx); list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { diff --git a/kernel/net/mac80211/key.c b/kernel/net/mac80211/key.c index 81e9785f3..44388d6a1 100644 --- a/kernel/net/mac80211/key.c +++ b/kernel/net/mac80211/key.c @@ -154,7 +154,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) * is supported; if not, return. 
*/ if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) && - !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK)) + !ieee80211_hw_check(&key->local->hw, SUPPORTS_PER_STA_GTK)) goto out_unsupported; if (sta && !sta->uploaded) @@ -208,7 +208,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) /* all of these we can do in software - if driver can */ if (ret == 1) return 0; - if (key->local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL) + if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) return -EINVAL; return 0; default: @@ -263,6 +263,7 @@ static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, if (uni) { rcu_assign_pointer(sdata->default_unicast_key, key); + ieee80211_check_fast_xmit_iface(sdata); drv_set_default_unicast_key(sdata->local, sdata, idx); } @@ -332,9 +333,9 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, if (pairwise) { rcu_assign_pointer(sta->ptk[idx], new); sta->ptk_idx = idx; + ieee80211_check_fast_xmit(sta); } else { rcu_assign_pointer(sta->gtk[idx], new); - sta->gtk_idx = idx; } } else { defunikey = old && @@ -517,15 +518,17 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, break; default: if (cs) { - size_t len = (seq_len > MAX_PN_LEN) ? 
- MAX_PN_LEN : seq_len; + if (seq_len && seq_len != cs->pn_len) { + kfree(key); + return ERR_PTR(-EINVAL); + } key->conf.iv_len = cs->hdr_len; key->conf.icv_len = cs->mic_len; for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) - for (j = 0; j < len; j++) + for (j = 0; j < seq_len; j++) key->u.gen.rx_pn[i][j] = - seq[len - j - 1]; + seq[seq_len - j - 1]; key->flags |= KEY_FLAG_CIPHER_SCHEME; } } @@ -899,27 +902,19 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn64 = atomic64_read(&key->u.ccmp.tx_pn); - seq->ccmp.pn[5] = pn64; - seq->ccmp.pn[4] = pn64 >> 8; - seq->ccmp.pn[3] = pn64 >> 16; - seq->ccmp.pn[2] = pn64 >> 24; - seq->ccmp.pn[1] = pn64 >> 32; - seq->ccmp.pn[0] = pn64 >> 40; - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); - seq->ccmp.pn[5] = pn64; - seq->ccmp.pn[4] = pn64 >> 8; - seq->ccmp.pn[3] = pn64 >> 16; - seq->ccmp.pn[2] = pn64 >> 24; - seq->ccmp.pn[1] = pn64 >> 32; - seq->ccmp.pn[0] = pn64 >> 40; - break; + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_cmac)); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn64 = atomic64_read(&key->u.aes_gmac.tx_pn); + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_gmac)); + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), gcmp)); + pn64 = atomic64_read(&key->conf.tx_pn); seq->ccmp.pn[5] = pn64; seq->ccmp.pn[4] = pn64 >> 8; seq->ccmp.pn[3] = pn64 >> 16; @@ -927,16 +922,6 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, seq->ccmp.pn[1] = pn64 >> 32; seq->ccmp.pn[0] = pn64 >> 40; break; - case WLAN_CIPHER_SUITE_GCMP: - case WLAN_CIPHER_SUITE_GCMP_256: - pn64 = atomic64_read(&key->u.gcmp.tx_pn); - seq->gcmp.pn[5] = pn64; - seq->gcmp.pn[4] = pn64 >> 8; - seq->gcmp.pn[3] 
= pn64 >> 16; - seq->gcmp.pn[2] = pn64 >> 24; - seq->gcmp.pn[1] = pn64 >> 32; - seq->gcmp.pn[0] = pn64 >> 40; - break; default: WARN_ON(1); } @@ -1011,43 +996,25 @@ void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn64 = (u64)seq->ccmp.pn[5] | - ((u64)seq->ccmp.pn[4] << 8) | - ((u64)seq->ccmp.pn[3] << 16) | - ((u64)seq->ccmp.pn[2] << 24) | - ((u64)seq->ccmp.pn[1] << 32) | - ((u64)seq->ccmp.pn[0] << 40); - atomic64_set(&key->u.ccmp.tx_pn, pn64); - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn64 = (u64)seq->aes_cmac.pn[5] | - ((u64)seq->aes_cmac.pn[4] << 8) | - ((u64)seq->aes_cmac.pn[3] << 16) | - ((u64)seq->aes_cmac.pn[2] << 24) | - ((u64)seq->aes_cmac.pn[1] << 32) | - ((u64)seq->aes_cmac.pn[0] << 40); - atomic64_set(&key->u.aes_cmac.tx_pn, pn64); - break; + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_cmac)); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn64 = (u64)seq->aes_gmac.pn[5] | - ((u64)seq->aes_gmac.pn[4] << 8) | - ((u64)seq->aes_gmac.pn[3] << 16) | - ((u64)seq->aes_gmac.pn[2] << 24) | - ((u64)seq->aes_gmac.pn[1] << 32) | - ((u64)seq->aes_gmac.pn[0] << 40); - atomic64_set(&key->u.aes_gmac.tx_pn, pn64); - break; + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_gmac)); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn64 = (u64)seq->gcmp.pn[5] | - ((u64)seq->gcmp.pn[4] << 8) | - ((u64)seq->gcmp.pn[3] << 16) | - ((u64)seq->gcmp.pn[2] << 24) | - ((u64)seq->gcmp.pn[1] << 32) | - ((u64)seq->gcmp.pn[0] << 40); - atomic64_set(&key->u.gcmp.tx_pn, pn64); + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), gcmp)); + pn64 = (u64)seq->ccmp.pn[5] | + ((u64)seq->ccmp.pn[4] << 8) | + ((u64)seq->ccmp.pn[3] << 16) | + ((u64)seq->ccmp.pn[2] << 24) | + ((u64)seq->ccmp.pn[1] << 32) | + ((u64)seq->ccmp.pn[0] << 40); + 
atomic64_set(&key->conf.tx_pn, pn64); break; default: WARN_ON(1); diff --git a/kernel/net/mac80211/key.h b/kernel/net/mac80211/key.h index 96557dd1e..9951ef063 100644 --- a/kernel/net/mac80211/key.h +++ b/kernel/net/mac80211/key.h @@ -18,7 +18,6 @@ #define NUM_DEFAULT_KEYS 4 #define NUM_DEFAULT_MGMT_KEYS 2 -#define MAX_PN_LEN 16 struct ieee80211_local; struct ieee80211_sub_if_data; @@ -78,7 +77,6 @@ struct ieee80211_key { u32 mic_failures; } tkip; struct { - atomic64_t tx_pn; /* * Last received packet number. The first * IEEE80211_NUM_TIDS counters are used with Data @@ -90,21 +88,18 @@ struct ieee80211_key { u32 replays; /* dot11RSNAStatsCCMPReplays */ } ccmp; struct { - atomic64_t tx_pn; u8 rx_pn[IEEE80211_CMAC_PN_LEN]; struct crypto_cipher *tfm; u32 replays; /* dot11RSNAStatsCMACReplays */ u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ } aes_cmac; struct { - atomic64_t tx_pn; u8 rx_pn[IEEE80211_GMAC_PN_LEN]; struct crypto_aead *tfm; u32 replays; /* dot11RSNAStatsCMACReplays */ u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ } aes_gmac; struct { - atomic64_t tx_pn; /* Last received packet number. 
The first * IEEE80211_NUM_TIDS counters are used with Data * frames and the last counter is used with Robust @@ -116,13 +111,10 @@ struct ieee80211_key { } gcmp; struct { /* generic cipher scheme */ - u8 rx_pn[IEEE80211_NUM_TIDS + 1][MAX_PN_LEN]; + u8 rx_pn[IEEE80211_NUM_TIDS + 1][IEEE80211_MAX_PN_LEN]; } gen; } u; - /* number of times this key has been used */ - int tx_rx_count; - #ifdef CONFIG_MAC80211_DEBUGFS struct { struct dentry *stalink; diff --git a/kernel/net/mac80211/led.c b/kernel/net/mac80211/led.c index e2b836446..0505845b7 100644 --- a/kernel/net/mac80211/led.c +++ b/kernel/net/mac80211/led.c @@ -12,96 +12,175 @@ #include #include "led.h" -#define MAC80211_BLINK_DELAY 50 /* ms */ - -void ieee80211_led_rx(struct ieee80211_local *local) -{ - unsigned long led_delay = MAC80211_BLINK_DELAY; - if (unlikely(!local->rx_led)) - return; - led_trigger_blink_oneshot(local->rx_led, &led_delay, &led_delay, 0); -} - -void ieee80211_led_tx(struct ieee80211_local *local) -{ - unsigned long led_delay = MAC80211_BLINK_DELAY; - if (unlikely(!local->tx_led)) - return; - led_trigger_blink_oneshot(local->tx_led, &led_delay, &led_delay, 0); -} - void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { - if (unlikely(!local->assoc_led)) + if (!atomic_read(&local->assoc_led_active)) return; if (associated) - led_trigger_event(local->assoc_led, LED_FULL); + led_trigger_event(&local->assoc_led, LED_FULL); else - led_trigger_event(local->assoc_led, LED_OFF); + led_trigger_event(&local->assoc_led, LED_OFF); } void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { - if (unlikely(!local->radio_led)) + if (!atomic_read(&local->radio_led_active)) return; if (enabled) - led_trigger_event(local->radio_led, LED_FULL); + led_trigger_event(&local->radio_led, LED_FULL); else - led_trigger_event(local->radio_led, LED_OFF); + led_trigger_event(&local->radio_led, LED_OFF); +} + +void ieee80211_alloc_led_names(struct ieee80211_local *local) +{ + 
local->rx_led.name = kasprintf(GFP_KERNEL, "%srx", + wiphy_name(local->hw.wiphy)); + local->tx_led.name = kasprintf(GFP_KERNEL, "%stx", + wiphy_name(local->hw.wiphy)); + local->assoc_led.name = kasprintf(GFP_KERNEL, "%sassoc", + wiphy_name(local->hw.wiphy)); + local->radio_led.name = kasprintf(GFP_KERNEL, "%sradio", + wiphy_name(local->hw.wiphy)); +} + +void ieee80211_free_led_names(struct ieee80211_local *local) +{ + kfree(local->rx_led.name); + kfree(local->tx_led.name); + kfree(local->assoc_led.name); + kfree(local->radio_led.name); +} + +static void ieee80211_tx_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tx_led); + + atomic_inc(&local->tx_led_active); +} + +static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tx_led); + + atomic_dec(&local->tx_led_active); +} + +static void ieee80211_rx_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + rx_led); + + atomic_inc(&local->rx_led_active); +} + +static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + rx_led); + + atomic_dec(&local->rx_led_active); +} + +static void ieee80211_assoc_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + assoc_led); + + atomic_inc(&local->assoc_led_active); +} + +static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + assoc_led); + + atomic_dec(&local->assoc_led_active); +} + +static void ieee80211_radio_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local 
= container_of(led_cdev->trigger, + struct ieee80211_local, + radio_led); + + atomic_inc(&local->radio_led_active); +} + +static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + radio_led); + + atomic_dec(&local->radio_led_active); +} + +static void ieee80211_tpt_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tpt_led); + + atomic_inc(&local->tpt_led_active); } -void ieee80211_led_names(struct ieee80211_local *local) +static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev) { - snprintf(local->rx_led_name, sizeof(local->rx_led_name), - "%srx", wiphy_name(local->hw.wiphy)); - snprintf(local->tx_led_name, sizeof(local->tx_led_name), - "%stx", wiphy_name(local->hw.wiphy)); - snprintf(local->assoc_led_name, sizeof(local->assoc_led_name), - "%sassoc", wiphy_name(local->hw.wiphy)); - snprintf(local->radio_led_name, sizeof(local->radio_led_name), - "%sradio", wiphy_name(local->hw.wiphy)); + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tpt_led); + + atomic_dec(&local->tpt_led_active); } void ieee80211_led_init(struct ieee80211_local *local) { - local->rx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->rx_led) { - local->rx_led->name = local->rx_led_name; - if (led_trigger_register(local->rx_led)) { - kfree(local->rx_led); - local->rx_led = NULL; - } + atomic_set(&local->rx_led_active, 0); + local->rx_led.activate = ieee80211_rx_led_activate; + local->rx_led.deactivate = ieee80211_rx_led_deactivate; + if (local->rx_led.name && led_trigger_register(&local->rx_led)) { + kfree(local->rx_led.name); + local->rx_led.name = NULL; } - local->tx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->tx_led) { - local->tx_led->name = local->tx_led_name; - if 
(led_trigger_register(local->tx_led)) { - kfree(local->tx_led); - local->tx_led = NULL; - } + atomic_set(&local->tx_led_active, 0); + local->tx_led.activate = ieee80211_tx_led_activate; + local->tx_led.deactivate = ieee80211_tx_led_deactivate; + if (local->tx_led.name && led_trigger_register(&local->tx_led)) { + kfree(local->tx_led.name); + local->tx_led.name = NULL; } - local->assoc_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->assoc_led) { - local->assoc_led->name = local->assoc_led_name; - if (led_trigger_register(local->assoc_led)) { - kfree(local->assoc_led); - local->assoc_led = NULL; - } + atomic_set(&local->assoc_led_active, 0); + local->assoc_led.activate = ieee80211_assoc_led_activate; + local->assoc_led.deactivate = ieee80211_assoc_led_deactivate; + if (local->assoc_led.name && led_trigger_register(&local->assoc_led)) { + kfree(local->assoc_led.name); + local->assoc_led.name = NULL; } - local->radio_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->radio_led) { - local->radio_led->name = local->radio_led_name; - if (led_trigger_register(local->radio_led)) { - kfree(local->radio_led); - local->radio_led = NULL; - } + atomic_set(&local->radio_led_active, 0); + local->radio_led.activate = ieee80211_radio_led_activate; + local->radio_led.deactivate = ieee80211_radio_led_deactivate; + if (local->radio_led.name && led_trigger_register(&local->radio_led)) { + kfree(local->radio_led.name); + local->radio_led.name = NULL; } + atomic_set(&local->tpt_led_active, 0); if (local->tpt_led_trigger) { - if (led_trigger_register(&local->tpt_led_trigger->trig)) { + local->tpt_led.activate = ieee80211_tpt_led_activate; + local->tpt_led.deactivate = ieee80211_tpt_led_deactivate; + if (led_trigger_register(&local->tpt_led)) { kfree(local->tpt_led_trigger); local->tpt_led_trigger = NULL; } @@ -110,58 +189,50 @@ void ieee80211_led_init(struct ieee80211_local *local) void ieee80211_led_exit(struct ieee80211_local *local) { - if 
(local->radio_led) { - led_trigger_unregister(local->radio_led); - kfree(local->radio_led); - } - if (local->assoc_led) { - led_trigger_unregister(local->assoc_led); - kfree(local->assoc_led); - } - if (local->tx_led) { - led_trigger_unregister(local->tx_led); - kfree(local->tx_led); - } - if (local->rx_led) { - led_trigger_unregister(local->rx_led); - kfree(local->rx_led); - } + if (local->radio_led.name) + led_trigger_unregister(&local->radio_led); + if (local->assoc_led.name) + led_trigger_unregister(&local->assoc_led); + if (local->tx_led.name) + led_trigger_unregister(&local->tx_led); + if (local->rx_led.name) + led_trigger_unregister(&local->rx_led); if (local->tpt_led_trigger) { - led_trigger_unregister(&local->tpt_led_trigger->trig); + led_trigger_unregister(&local->tpt_led); kfree(local->tpt_led_trigger); } } -char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->radio_led_name; + return local->radio_led.name; } EXPORT_SYMBOL(__ieee80211_get_radio_led_name); -char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->assoc_led_name; + return local->assoc_led.name; } EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); -char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->tx_led_name; + return local->tx_led.name; } EXPORT_SYMBOL(__ieee80211_get_tx_led_name); -char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->rx_led_name; + return local->rx_led.name; } EXPORT_SYMBOL(__ieee80211_get_rx_led_name); @@ -205,16 
+276,17 @@ static void tpt_trig_timer(unsigned long data) } } - read_lock(&tpt_trig->trig.leddev_list_lock); - list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list) + read_lock(&local->tpt_led.leddev_list_lock); + list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) led_blink_set(led_cdev, &on, &off); - read_unlock(&tpt_trig->trig.leddev_list_lock); + read_unlock(&local->tpt_led.leddev_list_lock); } -char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, - unsigned int flags, - const struct ieee80211_tpt_blink *blink_table, - unsigned int blink_table_len) +const char * +__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, + unsigned int flags, + const struct ieee80211_tpt_blink *blink_table, + unsigned int blink_table_len) { struct ieee80211_local *local = hw_to_local(hw); struct tpt_led_trigger *tpt_trig; @@ -229,7 +301,7 @@ char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, snprintf(tpt_trig->name, sizeof(tpt_trig->name), "%stpt", wiphy_name(local->hw.wiphy)); - tpt_trig->trig.name = tpt_trig->name; + local->tpt_led.name = tpt_trig->name; tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; @@ -269,10 +341,10 @@ static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) tpt_trig->running = false; del_timer_sync(&tpt_trig->timer); - read_lock(&tpt_trig->trig.leddev_list_lock); - list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list) + read_lock(&local->tpt_led.leddev_list_lock); + list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) led_set_brightness(led_cdev, LED_OFF); - read_unlock(&tpt_trig->trig.leddev_list_lock); + read_unlock(&local->tpt_led.leddev_list_lock); } void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, diff --git a/kernel/net/mac80211/led.h b/kernel/net/mac80211/led.h index 89f4344f1..a7893a1ac 100644 --- a/kernel/net/mac80211/led.h +++ b/kernel/net/mac80211/led.h @@ -11,25 +11,42 @@ #include #include 
"ieee80211_i.h" +#define MAC80211_BLINK_DELAY 50 /* ms */ + +static inline void ieee80211_led_rx(struct ieee80211_local *local) +{ +#ifdef CONFIG_MAC80211_LEDS + unsigned long led_delay = MAC80211_BLINK_DELAY; + + if (!atomic_read(&local->rx_led_active)) + return; + led_trigger_blink_oneshot(&local->rx_led, &led_delay, &led_delay, 0); +#endif +} + +static inline void ieee80211_led_tx(struct ieee80211_local *local) +{ +#ifdef CONFIG_MAC80211_LEDS + unsigned long led_delay = MAC80211_BLINK_DELAY; + + if (!atomic_read(&local->tx_led_active)) + return; + led_trigger_blink_oneshot(&local->tx_led, &led_delay, &led_delay, 0); +#endif +} + #ifdef CONFIG_MAC80211_LEDS -void ieee80211_led_rx(struct ieee80211_local *local); -void ieee80211_led_tx(struct ieee80211_local *local); void ieee80211_led_assoc(struct ieee80211_local *local, bool associated); void ieee80211_led_radio(struct ieee80211_local *local, bool enabled); -void ieee80211_led_names(struct ieee80211_local *local); +void ieee80211_alloc_led_names(struct ieee80211_local *local); +void ieee80211_free_led_names(struct ieee80211_local *local); void ieee80211_led_init(struct ieee80211_local *local); void ieee80211_led_exit(struct ieee80211_local *local); void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off); #else -static inline void ieee80211_led_rx(struct ieee80211_local *local) -{ -} -static inline void ieee80211_led_tx(struct ieee80211_local *local) -{ -} static inline void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { @@ -38,7 +55,10 @@ static inline void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { } -static inline void ieee80211_led_names(struct ieee80211_local *local) +static inline void ieee80211_alloc_led_names(struct ieee80211_local *local) +{ +} +static inline void ieee80211_free_led_names(struct ieee80211_local *local) { } static inline void ieee80211_led_init(struct ieee80211_local *local) @@ -58,7 +78,7 
@@ static inline void ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes) { #ifdef CONFIG_MAC80211_LEDS - if (local->tpt_led_trigger && ieee80211_is_data(fc)) + if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->tx_bytes += bytes; #endif } @@ -67,7 +87,7 @@ static inline void ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes) { #ifdef CONFIG_MAC80211_LEDS - if (local->tpt_led_trigger && ieee80211_is_data(fc)) + if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->rx_bytes += bytes; #endif } diff --git a/kernel/net/mac80211/main.c b/kernel/net/mac80211/main.c index e86daed83..175ffcf7f 100644 --- a/kernel/net/mac80211/main.c +++ b/kernel/net/mac80211/main.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -32,7 +31,6 @@ #include "mesh.h" #include "wep.h" #include "led.h" -#include "cfg.h" #include "debugfs.h" void ieee80211_configure_filter(struct ieee80211_local *local) @@ -41,9 +39,6 @@ void ieee80211_configure_filter(struct ieee80211_local *local) unsigned int changed_flags; unsigned int new_flags = 0; - if (atomic_read(&local->iff_promiscs)) - new_flags |= FIF_PROMISC_IN_BSS; - if (atomic_read(&local->iff_allmultis)) new_flags |= FIF_ALLMULTI; @@ -286,7 +281,7 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw) local->in_reconfig = true; barrier(); - schedule_work(&local->restart_work); + queue_work(system_freezable_wq, &local->restart_work); } EXPORT_SYMBOL(ieee80211_restart_hw); @@ -632,6 +627,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, INIT_WORK(&local->sched_scan_stopped_work, ieee80211_sched_scan_stopped_work); + INIT_WORK(&local->tdls_chsw_work, ieee80211_tdls_chsw_work); + spin_lock_init(&local->ack_status_lock); idr_init(&local->ack_status_frames); @@ -648,8 +645,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, 
skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); + skb_queue_head_init(&local->skb_queue_tdls_chsw); - ieee80211_led_names(local); + ieee80211_alloc_led_names(local); ieee80211_roc_setup(local); @@ -664,7 +662,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) { bool have_wep = !(IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)); - bool have_mfp = local->hw.flags & IEEE80211_HW_MFP_CAPABLE; + bool have_mfp = ieee80211_hw_check(&local->hw, MFP_CAPABLE); int n_suites = 0, r = 0, w = 0; u32 *suites; static const u32 cipher_suites[] = { @@ -684,7 +682,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) WLAN_CIPHER_SUITE_BIP_GMAC_256, }; - if (local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL || + if (ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL) || local->hw.wiphy->cipher_suites) { /* If the driver advertises, or doesn't support SW crypto, * we only need to remove WEP if necessary. @@ -774,8 +772,13 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) suites[w++] = WLAN_CIPHER_SUITE_BIP_GMAC_256; } - for (r = 0; r < local->hw.n_cipher_schemes; r++) + for (r = 0; r < local->hw.n_cipher_schemes; r++) { suites[w++] = cs[r].cipher; + if (WARN_ON(cs[r].pn_len > IEEE80211_MAX_PN_LEN)) { + kfree(suites); + return -EINVAL; + } + } } local->hw.wiphy->cipher_suites = suites; @@ -795,7 +798,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) netdev_features_t feature_whitelist; struct cfg80211_chan_def dflt_chandef = {}; - if (hw->flags & IEEE80211_HW_QUEUE_CONTROL && + if (ieee80211_hw_check(hw, QUEUE_CONTROL) && (local->hw.offchannel_tx_hw_queue == IEEE80211_INVAL_HW_QUEUE || local->hw.offchannel_tx_hw_queue >= local->hw.queues)) return -EINVAL; @@ -843,7 +846,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* Only HW csum features are currently compatible with mac80211 */ feature_whitelist = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | - 
NETIF_F_HW_CSUM; + NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | + NETIF_F_GSO_SOFTWARE; if (WARN_ON(hw->netdev_features & ~feature_whitelist)) return -EINVAL; @@ -942,9 +946,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* mac80211 supports control port protocol changing */ local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) { local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; - } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) { + } else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) { local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC; if (hw->max_signal <= 0) { result = -EINVAL; @@ -998,7 +1002,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->hw.wiphy->flags |= WIPHY_FLAG_TDLS_EXTERNAL_SETUP; /* mac80211 supports eCSA, if the driver supports STA CSA at all */ - if (local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA) + if (ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) local->ext_capa[0] |= WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING; local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM; @@ -1066,7 +1070,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) && - !(hw->flags & IEEE80211_HW_NO_AUTO_VIF)) { + !ieee80211_hw_check(hw, NO_AUTO_VIF)) { result = ieee80211_if_add(local, "wlan%d", NET_NAME_ENUM, NULL, NL80211_IFTYPE_STATION, NULL); if (result) @@ -1076,13 +1080,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); - local->network_latency_notifier.notifier_call = - ieee80211_max_network_latency; - result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY, - &local->network_latency_notifier); - if (result) - goto fail_pm_qos; - #ifdef CONFIG_INET local->ifa_notifier.notifier_call = ieee80211_ifa_changed; result = register_inetaddr_notifier(&local->ifa_notifier); 
@@ -1107,10 +1104,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) #endif #if defined(CONFIG_INET) || defined(CONFIG_IPV6) fail_ifa: - pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY, - &local->network_latency_notifier); #endif - fail_pm_qos: rtnl_lock(); rate_control_deinitialize(local); ieee80211_remove_interfaces(local); @@ -1129,18 +1123,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) } EXPORT_SYMBOL(ieee80211_register_hw); -void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi, - struct net_device *napi_dev, - int (*poll)(struct napi_struct *, int), - int weight) -{ - struct ieee80211_local *local = hw_to_local(hw); - - netif_napi_add(napi_dev, napi, poll, weight); - local->napi = napi; -} -EXPORT_SYMBOL_GPL(ieee80211_napi_add); - void ieee80211_unregister_hw(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); @@ -1148,8 +1130,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) tasklet_kill(&local->tx_pending_tasklet); tasklet_kill(&local->tasklet); - pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY, - &local->network_latency_notifier); #ifdef CONFIG_INET unregister_inetaddr_notifier(&local->ifa_notifier); #endif @@ -1170,6 +1150,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) cancel_work_sync(&local->restart_work); cancel_work_sync(&local->reconfig_filter); + cancel_work_sync(&local->tdls_chsw_work); flush_work(&local->sched_scan_stopped_work); ieee80211_clear_tx_pending(local); @@ -1180,6 +1161,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) wiphy_warn(local->hw.wiphy, "skb_queue not empty\n"); skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); + skb_queue_purge(&local->skb_queue_tdls_chsw); destroy_workqueue(local->workqueue); wiphy_unregister(local->hw.wiphy); @@ -1212,6 +1194,8 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) sta_info_stop(local); + ieee80211_free_led_names(local); + wiphy_free(local->hw.wiphy); } 
EXPORT_SYMBOL(ieee80211_free_hw); diff --git a/kernel/net/mac80211/mesh.c b/kernel/net/mac80211/mesh.c index 817098add..6f85b6ab8 100644 --- a/kernel/net/mac80211/mesh.c +++ b/kernel/net/mac80211/mesh.c @@ -94,6 +94,9 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, ie->ht_operation, &sta_chan_def); + ieee80211_vht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, + ie->vht_operation, &sta_chan_def); + if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) return false; @@ -158,7 +161,7 @@ void mesh_sta_cleanup(struct sta_info *sta) changed = mesh_accept_plinks_update(sdata); if (!sdata->u.mesh.user_mpm) { changed |= mesh_plink_deactivate(sta); - del_timer_sync(&sta->plink_timer); + del_timer_sync(&sta->mesh->plink_timer); } if (changed) @@ -436,8 +439,6 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; - enum nl80211_channel_type channel_type = - cfg80211_get_chandef_type(&sdata->vif.bss_conf.chandef); struct ieee80211_supported_band *sband; struct ieee80211_sta_ht_cap *ht_cap; u8 *pos; @@ -454,7 +455,10 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[channel->band]; ht_cap = &sband->ht_cap; - if (!ht_cap->ht_supported || channel_type == NL80211_CHAN_NO_HT) + if (!ht_cap->ht_supported || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) return 0; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_operation)) @@ -462,7 +466,70 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation)); ieee80211_ie_build_ht_oper(pos, ht_cap, &sdata->vif.bss_conf.chandef, - 
sdata->vif.bss_conf.ht_operation_mode); + sdata->vif.bss_conf.ht_operation_mode, + false); + + return 0; +} + +int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + struct ieee80211_supported_band *sband; + u8 *pos; + + sband = local->hw.wiphy->bands[band]; + if (!sband->vht_cap.vht_supported || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + return 0; + + if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap)) + return -ENOMEM; + + pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_cap)); + ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap); + + return 0; +} + +int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *chanctx_conf; + struct ieee80211_channel *channel; + struct ieee80211_supported_band *sband; + struct ieee80211_sta_vht_cap *vht_cap; + u8 *pos; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (WARN_ON(!chanctx_conf)) { + rcu_read_unlock(); + return -EINVAL; + } + channel = chanctx_conf->def.chan; + rcu_read_unlock(); + + sband = local->hw.wiphy->bands[channel->band]; + vht_cap = &sband->vht_cap; + + if (!vht_cap->vht_supported || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + return 0; + + if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_operation)) + return -ENOMEM; + + pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation)); + ieee80211_ie_build_vht_oper(pos, vht_cap, + &sdata->vif.bss_conf.chandef); return 0; } @@ -540,9 +607,9 @@ 
int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, * * Return the header length. */ -int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, - struct ieee80211s_hdr *meshhdr, - const char *addr4or5, const char *addr6) +unsigned int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, + struct ieee80211s_hdr *meshhdr, + const char *addr4or5, const char *addr6) { if (WARN_ON(!addr4or5 && addr6)) return 0; @@ -637,6 +704,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + ifmsh->mesh_id_len + 2 + sizeof(struct ieee80211_meshconf_ie) + 2 + sizeof(__le16) + /* awake window */ + 2 + sizeof(struct ieee80211_vht_cap) + + 2 + sizeof(struct ieee80211_vht_operation) + ifmsh->ie_len; bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL); @@ -718,6 +787,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) mesh_add_meshid_ie(sdata, skb) || mesh_add_meshconf_ie(sdata, skb) || mesh_add_awake_window_ie(sdata, skb) || + mesh_add_vht_cap_ie(sdata, skb) || + mesh_add_vht_oper_ie(sdata, skb) || mesh_add_vendor_ies(sdata, skb)) goto out_free; @@ -1299,17 +1370,6 @@ out: sdata_unlock(sdata); } -void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) -{ - struct ieee80211_sub_if_data *sdata; - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) - if (ieee80211_vif_is_mesh(&sdata->vif) && - ieee80211_sdata_running(sdata)) - ieee80211_queue_work(&local->hw, &sdata->work); - rcu_read_unlock(); -} void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) { diff --git a/kernel/net/mac80211/mesh.h b/kernel/net/mac80211/mesh.h index 50c8473cf..4a8019f79 100644 --- a/kernel/net/mac80211/mesh.h +++ b/kernel/net/mac80211/mesh.h @@ -207,9 +207,9 @@ struct mesh_rmc { /* Various */ int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, const u8 *da, const u8 *sa); -int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, - struct 
ieee80211s_hdr *meshhdr, - const char *addr4or5, const char *addr6); +unsigned int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, + struct ieee80211s_hdr *meshhdr, + const char *addr4or5, const char *addr6); int mesh_rmc_check(struct ieee80211_sub_if_data *sdata, const u8 *addr, struct ieee80211s_hdr *mesh_hdr); bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, @@ -227,6 +227,10 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); @@ -358,14 +362,10 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) return sdata->u.mesh.mesh_pp_id == IEEE80211_PATH_PROTOCOL_HWMP; } -void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local); - void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata); void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata); void ieee80211s_stop(void); #else -static inline void -ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {} static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) { return false; } static inline void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) diff --git a/kernel/net/mac80211/mesh_hwmp.c b/kernel/net/mac80211/mesh_hwmp.c index 214e63b84..c6be0b4f4 100644 --- a/kernel/net/mac80211/mesh_hwmp.c +++ b/kernel/net/mac80211/mesh_hwmp.c @@ -19,15 +19,6 @@ #define MAX_PREQ_QUEUE_LEN 64 -/* Destination only */ -#define MP_F_DO 0x1 -/* Reply and forward */ -#define MP_F_RF 0x2 -/* Unknown Sequence Number */ -#define MP_F_USN 0x01 -/* Reason code Present */ -#define 
MP_F_RCODE 0x02 - static void mesh_queue_preq(struct mesh_path *, u8); static inline u32 u32_field_get(const u8 *preq_elem, int offset, bool ae) @@ -79,6 +70,12 @@ static inline u16 u16_field_get(const u8 *preq_elem, int offset, bool ae) #define MSEC_TO_TU(x) (x*1000/1024) #define SN_GT(x, y) ((s32)(y - x) < 0) #define SN_LT(x, y) ((s32)(x - y) < 0) +#define MAX_SANE_SN_DELTA 32 + +static inline u32 SN_DELTA(u32 x, u32 y) +{ + return x >= y ? x - y : y - x; +} #define net_traversal_jiffies(s) \ msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime) @@ -279,15 +276,10 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, *pos++ = ttl; /* number of destinations */ *pos++ = 1; - /* - * flags bit, bit 1 is unset if we know the sequence number and - * bit 2 is set if we have a reason code + /* Flags field has AE bit only as defined in + * sec 8.4.2.117 IEEE802.11-2012 */ *pos = 0; - if (!target_sn) - *pos |= MP_F_USN; - if (target_rcode) - *pos |= MP_F_RCODE; pos++; memcpy(pos, target, ETH_ALEN); pos += ETH_ALEN; @@ -316,8 +308,9 @@ void ieee80211s_update_metric(struct ieee80211_local *local, failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK); /* moving average, scaled to 100 */ - sta->fail_avg = ((80 * sta->fail_avg + 5) / 100 + 20 * failed); - if (sta->fail_avg > 95) + sta->mesh->fail_avg = + ((80 * sta->mesh->fail_avg + 5) / 100 + 20 * failed); + if (sta->mesh->fail_avg > 95) mesh_plink_broken(sta); } @@ -333,15 +326,15 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, u32 tx_time, estimated_retx; u64 result; - if (sta->fail_avg >= 100) + if (sta->mesh->fail_avg >= 100) return MAX_METRIC; - sta_set_rate_info_tx(sta, &sta->last_tx_rate, &rinfo); + sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo); rate = cfg80211_calculate_bitrate(&rinfo); if (WARN_ON(!rate)) return MAX_METRIC; - err = (sta->fail_avg << ARITH_SHIFT) / 100; + err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100; /* bitrate is in units of 100 Kbps, 
while we need rate in units of * 1Mbps. This will be corrected on tx_time computation. @@ -441,6 +434,26 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, process = false; fresh_info = false; } + } else if (!(mpath->flags & MESH_PATH_ACTIVE)) { + bool have_sn, newer_sn, bounced; + + have_sn = mpath->flags & MESH_PATH_SN_VALID; + newer_sn = have_sn && SN_GT(orig_sn, mpath->sn); + bounced = have_sn && + (SN_DELTA(orig_sn, mpath->sn) > + MAX_SANE_SN_DELTA); + + if (!have_sn || newer_sn) { + /* if SN is newer than what we had + * then we can take it */; + } else if (bounced) { + /* if SN is way different than what + * we had then assume the other side + * rebooted or restarted */; + } else { + process = false; + fresh_info = false; + } } } else { mpath = mesh_path_add(sdata, orig_addr); @@ -510,14 +523,14 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, - const u8 *preq_elem, u32 metric) + const u8 *preq_elem, u32 orig_metric) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath = NULL; const u8 *target_addr, *orig_addr; const u8 *da; u8 target_flags, ttl, flags; - u32 orig_sn, target_sn, lifetime, orig_metric; + u32 orig_sn, target_sn, lifetime, target_metric; bool reply = false; bool forward = true; bool root_is_gate; @@ -528,7 +541,6 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, target_sn = PREQ_IE_TARGET_SN(preq_elem); orig_sn = PREQ_IE_ORIG_SN(preq_elem); target_flags = PREQ_IE_TARGET_F(preq_elem); - orig_metric = metric; /* Proactive PREQ gate announcements */ flags = PREQ_IE_FLAGS(preq_elem); root_is_gate = !!(flags & RANN_FLAG_IS_GATE); @@ -539,7 +551,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, mhwmp_dbg(sdata, "PREQ is for us\n"); forward = false; reply = true; - metric = 0; + target_metric = 0; if (time_after(jiffies, 
ifmsh->last_sn_update + net_traversal_jiffies(sdata)) || time_before(jiffies, ifmsh->last_sn_update)) { @@ -556,7 +568,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, reply = true; target_addr = sdata->vif.addr; target_sn = ++ifmsh->sn; - metric = 0; + target_metric = 0; ifmsh->last_sn_update = jiffies; } if (root_is_gate) @@ -571,15 +583,13 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, SN_LT(mpath->sn, target_sn)) { mpath->sn = target_sn; mpath->flags |= MESH_PATH_SN_VALID; - } else if ((!(target_flags & MP_F_DO)) && + } else if ((!(target_flags & IEEE80211_PREQ_TO_FLAG)) && (mpath->flags & MESH_PATH_ACTIVE)) { reply = true; - metric = mpath->metric; + target_metric = mpath->metric; target_sn = mpath->sn; - if (target_flags & MP_F_RF) - target_flags |= MP_F_DO; - else - forward = false; + /* Case E2 of sec 13.10.9.3 IEEE 802.11-2012*/ + target_flags |= IEEE80211_PREQ_TO_FLAG; } } rcu_read_unlock(); @@ -593,7 +603,8 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, mesh_path_sel_frame_tx(MPATH_PREP, 0, orig_addr, orig_sn, 0, target_addr, target_sn, mgmt->sa, 0, ttl, - lifetime, metric, 0, sdata); + lifetime, target_metric, 0, + sdata); } else { ifmsh->mshstats.dropped_frames_ttl++; } @@ -619,13 +630,12 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, if (flags & IEEE80211_PREQ_PROACTIVE_PREP_FLAG) { target_addr = PREQ_IE_TARGET_ADDR(preq_elem); target_sn = PREQ_IE_TARGET_SN(preq_elem); - metric = orig_metric; } mesh_path_sel_frame_tx(MPATH_PREQ, flags, orig_addr, orig_sn, target_flags, target_addr, target_sn, da, hopcount, ttl, lifetime, - metric, preq_id, sdata); + orig_metric, preq_id, sdata); if (!is_multicast_ether_addr(da)) ifmsh->mshstats.fwded_unicast++; else @@ -737,9 +747,12 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, if (mpath->flags & MESH_PATH_ACTIVE && ether_addr_equal(ta, sta->sta.addr) && (!(mpath->flags & 
MESH_PATH_SN_VALID) || - SN_GT(target_sn, mpath->sn))) { + SN_GT(target_sn, mpath->sn) || target_sn == 0)) { mpath->flags &= ~MESH_PATH_ACTIVE; - mpath->sn = target_sn; + if (target_sn != 0) + mpath->sn = target_sn; + else + mpath->sn += 1; spin_unlock_bh(&mpath->state_lock); if (!ifmsh->mshcfg.dot11MeshForwarding) goto endperr; @@ -854,7 +867,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, { struct ieee802_11_elems elems; size_t baselen; - u32 last_hop_metric; + u32 path_metric; struct sta_info *sta; /* need action_code */ @@ -863,7 +876,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); sta = sta_info_get(sdata, mgmt->sa); - if (!sta || sta->plink_state != NL80211_PLINK_ESTAB) { + if (!sta || sta->mesh->plink_state != NL80211_PLINK_ESTAB) { rcu_read_unlock(); return; } @@ -877,21 +890,21 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, if (elems.preq_len != 37) /* Right now we support just 1 destination and no AE */ return; - last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.preq, - MPATH_PREQ); - if (last_hop_metric) + path_metric = hwmp_route_info_get(sdata, mgmt, elems.preq, + MPATH_PREQ); + if (path_metric) hwmp_preq_frame_process(sdata, mgmt, elems.preq, - last_hop_metric); + path_metric); } if (elems.prep) { if (elems.prep_len != 31) /* Right now we support no AE */ return; - last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.prep, - MPATH_PREP); - if (last_hop_metric) + path_metric = hwmp_route_info_get(sdata, mgmt, elems.prep, + MPATH_PREP); + if (path_metric) hwmp_prep_frame_process(sdata, mgmt, elems.prep, - last_hop_metric); + path_metric); } if (elems.perr) { if (elems.perr_len != 15) @@ -975,7 +988,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_preq_queue *preq_node; struct mesh_path *mpath; - u8 ttl, target_flags; + u8 ttl, target_flags = 0; const u8 *da; u32 lifetime; @@ 
-1034,9 +1047,9 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) } if (preq_node->flags & PREQ_Q_F_REFRESH) - target_flags = MP_F_DO; + target_flags |= IEEE80211_PREQ_TO_FLAG; else - target_flags = MP_F_RF; + target_flags &= ~IEEE80211_PREQ_TO_FLAG; spin_unlock_bh(&mpath->state_lock); da = (mpath->is_root) ? mpath->rann_snd_addr : broadcast_addr; @@ -1177,7 +1190,9 @@ void mesh_path_timer(unsigned long data) spin_unlock_bh(&mpath->state_lock); mesh_queue_preq(mpath, 0); } else { - mpath->flags = 0; + mpath->flags &= ~(MESH_PATH_RESOLVING | + MESH_PATH_RESOLVED | + MESH_PATH_REQ_QUEUED); mpath->exp_time = jiffies; spin_unlock_bh(&mpath->state_lock); if (!mpath->is_gate && mesh_gate_num(sdata) > 0) { diff --git a/kernel/net/mac80211/mesh_pathtbl.c b/kernel/net/mac80211/mesh_pathtbl.c index b890e225a..b3b44a5dd 100644 --- a/kernel/net/mac80211/mesh_pathtbl.c +++ b/kernel/net/mac80211/mesh_pathtbl.c @@ -779,10 +779,8 @@ void mesh_plink_broken(struct sta_info *sta) static void mesh_path_node_reclaim(struct rcu_head *rp) { struct mpath_node *node = container_of(rp, struct mpath_node, rcu); - struct ieee80211_sub_if_data *sdata = node->mpath->sdata; del_timer_sync(&node->mpath->timer); - atomic_dec(&sdata->u.mesh.mpaths); kfree(node->mpath); kfree(node); } @@ -790,8 +788,9 @@ static void mesh_path_node_reclaim(struct rcu_head *rp) /* needs to be called with the corresponding hashwlock taken */ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node) { - struct mesh_path *mpath; - mpath = node->mpath; + struct mesh_path *mpath = node->mpath; + struct ieee80211_sub_if_data *sdata = node->mpath->sdata; + spin_lock(&mpath->state_lock); mpath->flags |= MESH_PATH_RESOLVING; if (mpath->is_gate) @@ -799,6 +798,7 @@ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node) hlist_del_rcu(&node->list); call_rcu(&node->rcu, mesh_path_node_reclaim); spin_unlock(&mpath->state_lock); + atomic_dec(&sdata->u.mesh.mpaths); 
atomic_dec(&tbl->entries); } diff --git a/kernel/net/mac80211/mesh_plink.c b/kernel/net/mac80211/mesh_plink.c index 60d737f14..bd3d55eb2 100644 --- a/kernel/net/mac80211/mesh_plink.c +++ b/kernel/net/mac80211/mesh_plink.c @@ -13,10 +13,11 @@ #include "rate.h" #include "mesh.h" +#define PLINK_CNF_AID(mgmt) ((mgmt)->u.action.u.self_prot.variable + 2) #define PLINK_GET_LLID(p) (p + 2) #define PLINK_GET_PLID(p) (p + 4) -#define mod_plink_timer(s, t) (mod_timer(&s->plink_timer, \ +#define mod_plink_timer(s, t) (mod_timer(&s->mesh->plink_timer, \ jiffies + msecs_to_jiffies(t))) enum plink_event { @@ -53,18 +54,15 @@ static const char * const mplevents[] = { [CLS_IGNR] = "CLS_IGNR" }; -static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, - enum ieee80211_self_protected_actioncode action, - u8 *da, u16 llid, u16 plid, u16 reason); - - /* We only need a valid sta if user configured a minimum rssi_threshold. */ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { s32 rssi_threshold = sdata->u.mesh.mshcfg.rssi_threshold; return rssi_threshold == 0 || - (sta && (s8) -ewma_read(&sta->avg_signal) > rssi_threshold); + (sta && + (s8)-ewma_signal_read(&sta->rx_stats.avg_signal) > + rssi_threshold); } /** @@ -72,13 +70,14 @@ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata, * * @sta: mesh peer link to restart * - * Locking: this function must be called holding sta->lock + * Locking: this function must be called holding sta->mesh->plink_lock */ static inline void mesh_plink_fsm_restart(struct sta_info *sta) { - sta->plink_state = NL80211_PLINK_LISTEN; - sta->llid = sta->plid = sta->reason = 0; - sta->plink_retries = 0; + lockdep_assert_held(&sta->mesh->plink_lock); + sta->mesh->plink_state = NL80211_PLINK_LISTEN; + sta->mesh->llid = sta->mesh->plid = sta->mesh->reason = 0; + sta->mesh->plink_retries = 0; } /* @@ -105,9 +104,7 @@ static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) /* (IEEE 
802.11-2012 19.4.5) */ short_slot = true; goto out; - } else if (band != IEEE80211_BAND_2GHZ || - (band == IEEE80211_BAND_2GHZ && - local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) + } else if (band != IEEE80211_BAND_2GHZ) goto out; for (i = 0; i < sband->n_bitrates; i++) @@ -120,7 +117,7 @@ static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || - sta->plink_state != NL80211_PLINK_ESTAB) + sta->mesh->plink_state != NL80211_PLINK_ESTAB) continue; short_slot = false; @@ -170,7 +167,7 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || - sta->plink_state != NL80211_PLINK_ESTAB) + sta->mesh->plink_state != NL80211_PLINK_ESTAB) continue; if (sta->sta.bandwidth > IEEE80211_STA_RX_BW_20) @@ -205,57 +202,8 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) return BSS_CHANGED_HT; } -/** - * __mesh_plink_deactivate - deactivate mesh peer link - * - * @sta: mesh peer link to deactivate - * - * All mesh paths with this peer as next hop will be flushed - * Returns beacon changed flag if the beacon content changed. 
- * - * Locking: the caller must hold sta->lock - */ -static u32 __mesh_plink_deactivate(struct sta_info *sta) -{ - struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed = 0; - - if (sta->plink_state == NL80211_PLINK_ESTAB) - changed = mesh_plink_dec_estab_count(sdata); - sta->plink_state = NL80211_PLINK_BLOCKED; - mesh_path_flush_by_nexthop(sta); - - ieee80211_mps_sta_status_update(sta); - changed |= ieee80211_mps_set_sta_local_pm(sta, - NL80211_MESH_POWER_UNKNOWN); - - return changed; -} - -/** - * mesh_plink_deactivate - deactivate mesh peer link - * - * @sta: mesh peer link to deactivate - * - * All mesh paths with this peer as next hop will be flushed - */ -u32 mesh_plink_deactivate(struct sta_info *sta) -{ - struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed; - - spin_lock_bh(&sta->lock); - changed = __mesh_plink_deactivate(sta); - sta->reason = WLAN_REASON_MESH_PEER_CANCELED; - mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, - sta->sta.addr, sta->llid, sta->plid, - sta->reason); - spin_unlock_bh(&sta->lock); - - return changed; -} - static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, enum ieee80211_self_protected_actioncode action, u8 *da, u16 llid, u16 plid, u16 reason) { @@ -280,6 +228,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_meshconf_ie) + 2 + sizeof(struct ieee80211_ht_cap) + 2 + sizeof(struct ieee80211_ht_operation) + + 2 + sizeof(struct ieee80211_vht_cap) + + 2 + sizeof(struct ieee80211_vht_operation) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); if (!skb) @@ -305,7 +255,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (action == WLAN_SP_MESH_PEERING_CONFIRM) { /* AID */ pos = skb_put(skb, 2); - put_unaligned_le16(plid, pos + 2); + put_unaligned_le16(sta->sta.aid, pos); } if (ieee80211_add_srates_ie(sdata, skb, true, band) || ieee80211_add_ext_srates_ie(sdata, skb, true, band) || @@ -360,7 
+310,9 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (action != WLAN_SP_MESH_PEERING_CLOSE) { if (mesh_add_ht_cap_ie(sdata, skb) || - mesh_add_ht_oper_ie(sdata, skb)) + mesh_add_ht_oper_ie(sdata, skb) || + mesh_add_vht_cap_ie(sdata, skb) || + mesh_add_vht_oper_ie(sdata, skb)) goto free; } @@ -374,6 +326,58 @@ free: return err; } +/** + * __mesh_plink_deactivate - deactivate mesh peer link + * + * @sta: mesh peer link to deactivate + * + * All mesh paths with this peer as next hop will be flushed + * Returns beacon changed flag if the beacon content changed. + * + * Locking: the caller must hold sta->mesh->plink_lock + */ +static u32 __mesh_plink_deactivate(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 changed = 0; + + lockdep_assert_held(&sta->mesh->plink_lock); + + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) + changed = mesh_plink_dec_estab_count(sdata); + sta->mesh->plink_state = NL80211_PLINK_BLOCKED; + mesh_path_flush_by_nexthop(sta); + + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_set_sta_local_pm(sta, + NL80211_MESH_POWER_UNKNOWN); + + return changed; +} + +/** + * mesh_plink_deactivate - deactivate mesh peer link + * + * @sta: mesh peer link to deactivate + * + * All mesh paths with this peer as next hop will be flushed + */ +u32 mesh_plink_deactivate(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 changed; + + spin_lock_bh(&sta->mesh->plink_lock); + changed = __mesh_plink_deactivate(sta); + sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED; + mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CLOSE, + sta->sta.addr, sta->mesh->llid, sta->mesh->plid, + sta->mesh->reason); + spin_unlock_bh(&sta->mesh->plink_lock); + + return changed; +} + static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee802_11_elems *elems, bool insert) @@ -387,12 +391,14 @@ static void mesh_sta_info_init(struct 
ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[band]; rates = ieee80211_sta_get_rates(sdata, elems, band, &basic_rates); - spin_lock_bh(&sta->lock); - sta->last_rx = jiffies; + spin_lock_bh(&sta->mesh->plink_lock); + sta->rx_stats.last_rx = jiffies; /* rates and capabilities don't change during peering */ - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB && + sta->mesh->processed_beacon) goto out; + sta->mesh->processed_beacon = true; if (sta->sta.supp_rates[band] != rates) changed |= IEEE80211_RC_SUPP_RATES_CHANGED; @@ -402,6 +408,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, elems->ht_cap_elem, sta)) changed |= IEEE80211_RC_BW_CHANGED; + ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, + elems->vht_cap_elem, sta); + if (bw != sta->sta.bandwidth) changed |= IEEE80211_RC_BW_CHANGED; @@ -419,23 +428,57 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, else rate_control_rate_update(local, sband, sta, changed); out: - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->mesh->plink_lock); +} + +static int mesh_allocate_aid(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + unsigned long *aid_map; + int aid; + + aid_map = kcalloc(BITS_TO_LONGS(IEEE80211_MAX_AID + 1), + sizeof(*aid_map), GFP_KERNEL); + if (!aid_map) + return -ENOMEM; + + /* reserve aid 0 for mcast indication */ + __set_bit(0, aid_map); + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) + __set_bit(sta->sta.aid, aid_map); + rcu_read_unlock(); + + aid = find_first_zero_bit(aid_map, IEEE80211_MAX_AID + 1); + kfree(aid_map); + + if (aid > IEEE80211_MAX_AID) + return -ENOBUFS; + + return aid; } static struct sta_info * __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) { struct sta_info *sta; + int aid; if (sdata->local->num_sta >= MESH_MAX_PLINKS) return NULL; + aid = mesh_allocate_aid(sdata); + if (aid < 0) + return NULL; + sta = 
sta_info_alloc(sdata, hw_addr, GFP_KERNEL); if (!sta) return NULL; - sta->plink_state = NL80211_PLINK_LISTEN; + sta->mesh->plink_state = NL80211_PLINK_LISTEN; sta->sta.wme = true; + sta->sta.aid = aid; sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); @@ -522,7 +565,7 @@ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, goto out; if (mesh_peer_accepts_plinks(elems) && - sta->plink_state == NL80211_PLINK_LISTEN && + sta->mesh->plink_state == NL80211_PLINK_LISTEN && sdata->u.mesh.accepting_plinks && sdata->u.mesh.mshcfg.auto_open_plinks && rssi_threshold_check(sdata, sta)) @@ -552,52 +595,52 @@ static void mesh_plink_timer(unsigned long data) if (sta->sdata->local->quiescing) return; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->mesh->plink_lock); /* If a timer fires just before a state transition on another CPU, * we may have already extended the timeout and changed state by the * time we've acquired the lock and arrived here. In that case, * skip this timer and wait for the new one. 
*/ - if (time_before(jiffies, sta->plink_timer.expires)) { + if (time_before(jiffies, sta->mesh->plink_timer.expires)) { mpl_dbg(sta->sdata, "Ignoring timer for %pM in state %s (timer adjusted)", - sta->sta.addr, mplstates[sta->plink_state]); - spin_unlock_bh(&sta->lock); + sta->sta.addr, mplstates[sta->mesh->plink_state]); + spin_unlock_bh(&sta->mesh->plink_lock); return; } /* del_timer() and handler may race when entering these states */ - if (sta->plink_state == NL80211_PLINK_LISTEN || - sta->plink_state == NL80211_PLINK_ESTAB) { + if (sta->mesh->plink_state == NL80211_PLINK_LISTEN || + sta->mesh->plink_state == NL80211_PLINK_ESTAB) { mpl_dbg(sta->sdata, "Ignoring timer for %pM in state %s (timer deleted)", - sta->sta.addr, mplstates[sta->plink_state]); - spin_unlock_bh(&sta->lock); + sta->sta.addr, mplstates[sta->mesh->plink_state]); + spin_unlock_bh(&sta->mesh->plink_lock); return; } mpl_dbg(sta->sdata, "Mesh plink timer for %pM fired on state %s\n", - sta->sta.addr, mplstates[sta->plink_state]); + sta->sta.addr, mplstates[sta->mesh->plink_state]); sdata = sta->sdata; mshcfg = &sdata->u.mesh.mshcfg; - switch (sta->plink_state) { + switch (sta->mesh->plink_state) { case NL80211_PLINK_OPN_RCVD: case NL80211_PLINK_OPN_SNT: /* retry timer */ - if (sta->plink_retries < mshcfg->dot11MeshMaxRetries) { + if (sta->mesh->plink_retries < mshcfg->dot11MeshMaxRetries) { u32 rand; mpl_dbg(sta->sdata, "Mesh plink for %pM (retry, timeout): %d %d\n", - sta->sta.addr, sta->plink_retries, - sta->plink_timeout); + sta->sta.addr, sta->mesh->plink_retries, + sta->mesh->plink_timeout); get_random_bytes(&rand, sizeof(u32)); - sta->plink_timeout = sta->plink_timeout + - rand % sta->plink_timeout; - ++sta->plink_retries; - mod_plink_timer(sta, sta->plink_timeout); + sta->mesh->plink_timeout = sta->mesh->plink_timeout + + rand % sta->mesh->plink_timeout; + ++sta->mesh->plink_retries; + mod_plink_timer(sta, sta->mesh->plink_timeout); action = WLAN_SP_MESH_PEERING_OPEN; break; } @@ 
-607,31 +650,31 @@ static void mesh_plink_timer(unsigned long data) /* confirm timer */ if (!reason) reason = WLAN_REASON_MESH_CONFIRM_TIMEOUT; - sta->plink_state = NL80211_PLINK_HOLDING; + sta->mesh->plink_state = NL80211_PLINK_HOLDING; mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout); action = WLAN_SP_MESH_PEERING_CLOSE; break; case NL80211_PLINK_HOLDING: /* holding timer */ - del_timer(&sta->plink_timer); + del_timer(&sta->mesh->plink_timer); mesh_plink_fsm_restart(sta); break; default: break; } - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->mesh->plink_lock); if (action) - mesh_plink_frame_tx(sdata, action, sta->sta.addr, - sta->llid, sta->plid, reason); + mesh_plink_frame_tx(sdata, sta, action, sta->sta.addr, + sta->mesh->llid, sta->mesh->plid, reason); } static inline void mesh_plink_timer_set(struct sta_info *sta, u32 timeout) { - sta->plink_timer.expires = jiffies + msecs_to_jiffies(timeout); - sta->plink_timer.data = (unsigned long) sta; - sta->plink_timer.function = mesh_plink_timer; - sta->plink_timeout = timeout; - add_timer(&sta->plink_timer); + sta->mesh->plink_timer.expires = jiffies + msecs_to_jiffies(timeout); + sta->mesh->plink_timer.data = (unsigned long) sta; + sta->mesh->plink_timer.function = mesh_plink_timer; + sta->mesh->plink_timeout = timeout; + add_timer(&sta->mesh->plink_timer); } static bool llid_in_use(struct ieee80211_sub_if_data *sdata, @@ -643,7 +686,10 @@ static bool llid_in_use(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { - if (!memcmp(&sta->llid, &llid, sizeof(llid))) { + if (sdata != sta->sdata) + continue; + + if (!memcmp(&sta->mesh->llid, &llid, sizeof(llid))) { in_use = true; break; } @@ -659,8 +705,6 @@ static u16 mesh_get_new_llid(struct ieee80211_sub_if_data *sdata) do { get_random_bytes(&llid, sizeof(llid)); - /* for mesh PS we still only have the AID range for TIM bits */ - llid = (llid % IEEE80211_MAX_AID) + 1; } while (llid_in_use(sdata, llid)); 
return llid; @@ -674,16 +718,16 @@ u32 mesh_plink_open(struct sta_info *sta) if (!test_sta_flag(sta, WLAN_STA_AUTH)) return 0; - spin_lock_bh(&sta->lock); - sta->llid = mesh_get_new_llid(sdata); - if (sta->plink_state != NL80211_PLINK_LISTEN && - sta->plink_state != NL80211_PLINK_BLOCKED) { - spin_unlock_bh(&sta->lock); + spin_lock_bh(&sta->mesh->plink_lock); + sta->mesh->llid = mesh_get_new_llid(sdata); + if (sta->mesh->plink_state != NL80211_PLINK_LISTEN && + sta->mesh->plink_state != NL80211_PLINK_BLOCKED) { + spin_unlock_bh(&sta->mesh->plink_lock); return 0; } - sta->plink_state = NL80211_PLINK_OPN_SNT; + sta->mesh->plink_state = NL80211_PLINK_OPN_SNT; mesh_plink_timer_set(sta, sdata->u.mesh.mshcfg.dot11MeshRetryTimeout); - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->mesh->plink_lock); mpl_dbg(sdata, "Mesh plink: starting establishment with %pM\n", sta->sta.addr); @@ -691,8 +735,8 @@ u32 mesh_plink_open(struct sta_info *sta) /* set the non-peer mode to active during peering */ changed = ieee80211_mps_local_status_update(sdata); - mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_OPEN, - sta->sta.addr, sta->llid, 0, 0); + mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_OPEN, + sta->sta.addr, sta->mesh->llid, 0, 0); return changed; } @@ -700,10 +744,10 @@ u32 mesh_plink_block(struct sta_info *sta) { u32 changed; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->mesh->plink_lock); changed = __mesh_plink_deactivate(sta); - sta->plink_state = NL80211_PLINK_BLOCKED; - spin_unlock_bh(&sta->lock); + sta->mesh->plink_state = NL80211_PLINK_BLOCKED; + spin_unlock_bh(&sta->mesh->plink_lock); return changed; } @@ -713,12 +757,11 @@ static void mesh_plink_close(struct ieee80211_sub_if_data *sdata, enum plink_event event) { struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg; - u16 reason = (event == CLS_ACPT) ? 
WLAN_REASON_MESH_CLOSE : WLAN_REASON_MESH_CONFIG; - sta->reason = reason; - sta->plink_state = NL80211_PLINK_HOLDING; + sta->mesh->reason = reason; + sta->mesh->plink_state = NL80211_PLINK_HOLDING; mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout); } @@ -728,8 +771,8 @@ static u32 mesh_plink_establish(struct ieee80211_sub_if_data *sdata, struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg; u32 changed = 0; - del_timer(&sta->plink_timer); - sta->plink_state = NL80211_PLINK_ESTAB; + del_timer(&sta->mesh->plink_timer); + sta->mesh->plink_state = NL80211_PLINK_ESTAB; changed |= mesh_plink_inc_estab_count(sdata); changed |= mesh_set_ht_prot_mode(sdata); changed |= mesh_set_short_slot_time(sdata); @@ -756,18 +799,18 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, u32 changed = 0; mpl_dbg(sdata, "peer %pM in state %s got event %s\n", sta->sta.addr, - mplstates[sta->plink_state], mplevents[event]); + mplstates[sta->mesh->plink_state], mplevents[event]); - spin_lock_bh(&sta->lock); - switch (sta->plink_state) { + spin_lock_bh(&sta->mesh->plink_lock); + switch (sta->mesh->plink_state) { case NL80211_PLINK_LISTEN: switch (event) { case CLS_ACPT: mesh_plink_fsm_restart(sta); break; case OPN_ACPT: - sta->plink_state = NL80211_PLINK_OPN_RCVD; - sta->llid = mesh_get_new_llid(sdata); + sta->mesh->plink_state = NL80211_PLINK_OPN_RCVD; + sta->mesh->llid = mesh_get_new_llid(sdata); mesh_plink_timer_set(sta, mshcfg->dot11MeshRetryTimeout); @@ -789,11 +832,11 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, break; case OPN_ACPT: /* retry timer is left untouched */ - sta->plink_state = NL80211_PLINK_OPN_RCVD; + sta->mesh->plink_state = NL80211_PLINK_OPN_RCVD; action = WLAN_SP_MESH_PEERING_CONFIRM; break; case CNF_ACPT: - sta->plink_state = NL80211_PLINK_CNF_RCVD; + sta->mesh->plink_state = NL80211_PLINK_CNF_RCVD; mod_plink_timer(sta, mshcfg->dot11MeshConfirmTimeout); break; default: @@ -853,7 +896,7 @@ static u32 mesh_plink_fsm(struct 
ieee80211_sub_if_data *sdata, case NL80211_PLINK_HOLDING: switch (event) { case CLS_ACPT: - del_timer(&sta->plink_timer); + del_timer(&sta->mesh->plink_timer); mesh_plink_fsm_restart(sta); break; case OPN_ACPT: @@ -872,17 +915,18 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, */ break; } - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->mesh->plink_lock); if (action) { - mesh_plink_frame_tx(sdata, action, sta->sta.addr, - sta->llid, sta->plid, sta->reason); + mesh_plink_frame_tx(sdata, sta, action, sta->sta.addr, + sta->mesh->llid, sta->mesh->plid, + sta->mesh->reason); /* also send confirm in open case */ if (action == WLAN_SP_MESH_PEERING_OPEN) { - mesh_plink_frame_tx(sdata, + mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CONFIRM, - sta->sta.addr, sta->llid, - sta->plid, 0); + sta->sta.addr, sta->mesh->llid, + sta->mesh->plid, 0); } } @@ -937,7 +981,7 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, mpl_dbg(sdata, "Mesh plink: Action frame from non-authed peer\n"); goto out; } - if (sta->plink_state == NL80211_PLINK_BLOCKED) + if (sta->mesh->plink_state == NL80211_PLINK_BLOCKED) goto out; } @@ -952,7 +996,7 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, if (!matches_local) event = OPN_RJCT; if (!mesh_plink_free_count(sdata) || - (sta->plid && sta->plid != plid)) + (sta->mesh->plid && sta->mesh->plid != plid)) event = OPN_IGNR; else event = OPN_ACPT; @@ -961,14 +1005,14 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, if (!matches_local) event = CNF_RJCT; if (!mesh_plink_free_count(sdata) || - sta->llid != llid || - (sta->plid && sta->plid != plid)) + sta->mesh->llid != llid || + (sta->mesh->plid && sta->mesh->plid != plid)) event = CNF_IGNR; else event = CNF_ACPT; break; case WLAN_SP_MESH_PEERING_CLOSE: - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) /* Do not check for llid or plid. 
This does not * follow the standard but since multiple plinks * per sta are not supported, it is necessary in @@ -979,9 +1023,9 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, * restarted. */ event = CLS_ACPT; - else if (sta->plid != plid) + else if (sta->mesh->plid != plid) event = CLS_IGNR; - else if (ie_len == 8 && sta->llid != llid) + else if (ie_len == 8 && sta->mesh->llid != llid) event = CLS_IGNR; else event = CLS_ACPT; @@ -1068,9 +1112,9 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, mpl_dbg(sdata, "Mesh plink: failed to init peer!\n"); goto unlock_rcu; } - sta->plid = plid; + sta->mesh->plid = plid; } else if (!sta && event == OPN_RJCT) { - mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, + mesh_plink_frame_tx(sdata, NULL, WLAN_SP_MESH_PEERING_CLOSE, mgmt->sa, 0, plid, WLAN_REASON_MESH_CONFIG); goto unlock_rcu; @@ -1079,9 +1123,13 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, goto unlock_rcu; } - /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */ - if (!sta->plid && event == CNF_ACPT) - sta->plid = plid; + if (event == CNF_ACPT) { + /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */ + if (!sta->mesh->plid) + sta->mesh->plid = plid; + + sta->mesh->aid = get_unaligned_le16(PLINK_CNF_AID(mgmt)); + } changed |= mesh_plink_fsm(sdata, sta, event); @@ -1120,6 +1168,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, WLAN_SP_MESH_PEERING_CONFIRM) { baseaddr += 4; baselen += 4; + + if (baselen > len) + return; } ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems); mesh_process_plink_frame(sdata, mgmt, &elems); diff --git a/kernel/net/mac80211/mesh_ps.c b/kernel/net/mac80211/mesh_ps.c index ad8b377b4..90a268abe 100644 --- a/kernel/net/mac80211/mesh_ps.c +++ b/kernel/net/mac80211/mesh_ps.c @@ -92,16 +92,16 @@ u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata) if (sdata != sta->sdata) continue; - switch (sta->plink_state) { + switch 
(sta->mesh->plink_state) { case NL80211_PLINK_OPN_SNT: case NL80211_PLINK_OPN_RCVD: case NL80211_PLINK_CNF_RCVD: peering = true; break; case NL80211_PLINK_ESTAB: - if (sta->local_pm == NL80211_MESH_POWER_LIGHT_SLEEP) + if (sta->mesh->local_pm == NL80211_MESH_POWER_LIGHT_SLEEP) light_sleep_cnt++; - else if (sta->local_pm == NL80211_MESH_POWER_DEEP_SLEEP) + else if (sta->mesh->local_pm == NL80211_MESH_POWER_DEEP_SLEEP) deep_sleep_cnt++; break; default: @@ -153,19 +153,19 @@ u32 ieee80211_mps_set_sta_local_pm(struct sta_info *sta, { struct ieee80211_sub_if_data *sdata = sta->sdata; - if (sta->local_pm == pm) + if (sta->mesh->local_pm == pm) return 0; mps_dbg(sdata, "local STA operates in mode %d with %pM\n", pm, sta->sta.addr); - sta->local_pm = pm; + sta->mesh->local_pm = pm; /* * announce peer-specific power mode transition * (see IEEE802.11-2012 13.14.3.2 and 13.14.3.3) */ - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) mps_qos_null_tx(sta); return ieee80211_mps_local_status_update(sdata); @@ -197,8 +197,8 @@ void ieee80211_mps_set_frame_flags(struct ieee80211_sub_if_data *sdata, if (is_unicast_ether_addr(hdr->addr1) && ieee80211_is_data_qos(hdr->frame_control) && - sta->plink_state == NL80211_PLINK_ESTAB) - pm = sta->local_pm; + sta->mesh->plink_state == NL80211_PLINK_ESTAB) + pm = sta->mesh->local_pm; else pm = sdata->u.mesh.nonpeer_pm; @@ -241,16 +241,16 @@ void ieee80211_mps_sta_status_update(struct sta_info *sta) * use peer-specific power mode if peering is established and the * peer's power mode is known */ - if (sta->plink_state == NL80211_PLINK_ESTAB && - sta->peer_pm != NL80211_MESH_POWER_UNKNOWN) - pm = sta->peer_pm; + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB && + sta->mesh->peer_pm != NL80211_MESH_POWER_UNKNOWN) + pm = sta->mesh->peer_pm; else - pm = sta->nonpeer_pm; + pm = sta->mesh->nonpeer_pm; do_buffer = (pm != NL80211_MESH_POWER_ACTIVE); /* clear the MPSP flags for non-peers or active STA 
*/ - if (sta->plink_state != NL80211_PLINK_ESTAB) { + if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) { clear_sta_flag(sta, WLAN_STA_MPSP_OWNER); clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); } else if (!do_buffer) { @@ -296,13 +296,13 @@ static void mps_set_sta_peer_pm(struct sta_info *sta, pm = NL80211_MESH_POWER_ACTIVE; } - if (sta->peer_pm == pm) + if (sta->mesh->peer_pm == pm) return; mps_dbg(sta->sdata, "STA %pM enters mode %d\n", sta->sta.addr, pm); - sta->peer_pm = pm; + sta->mesh->peer_pm = pm; ieee80211_mps_sta_status_update(sta); } @@ -317,13 +317,13 @@ static void mps_set_sta_nonpeer_pm(struct sta_info *sta, else pm = NL80211_MESH_POWER_ACTIVE; - if (sta->nonpeer_pm == pm) + if (sta->mesh->nonpeer_pm == pm) return; mps_dbg(sta->sdata, "STA %pM sets non-peer mode to %d\n", sta->sta.addr, pm); - sta->nonpeer_pm = pm; + sta->mesh->nonpeer_pm = pm; ieee80211_mps_sta_status_update(sta); } @@ -552,7 +552,7 @@ void ieee80211_mpsp_trigger_process(u8 *qc, struct sta_info *sta, } else { if (eosp) clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); - else if (sta->local_pm != NL80211_MESH_POWER_ACTIVE) + else if (sta->mesh->local_pm != NL80211_MESH_POWER_ACTIVE) set_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); if (rspi && !test_and_set_sta_flag(sta, WLAN_STA_MPSP_OWNER)) @@ -577,9 +577,9 @@ void ieee80211_mps_frame_release(struct sta_info *sta, int ac, buffer_local = 0; bool has_buffered = false; - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len, - sta->llid); + sta->mesh->aid); if (has_buffered) mps_dbg(sta->sdata, "%pM indicates buffered frames\n", @@ -598,7 +598,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, if (!has_buffered && !buffer_local) return; - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) mpsp_trigger_send(sta, has_buffered, !buffer_local); else mps_frame_deliver(sta, 1); diff --git 
a/kernel/net/mac80211/mesh_sync.c b/kernel/net/mac80211/mesh_sync.c index 09625d620..64bc22ad9 100644 --- a/kernel/net/mac80211/mesh_sync.c +++ b/kernel/net/mac80211/mesh_sync.c @@ -127,14 +127,14 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, /* Timing offset calculation (see 13.13.2.2.2) */ t_t = le64_to_cpu(mgmt->u.beacon.timestamp); - sta->t_offset = t_t - t_r; + sta->mesh->t_offset = t_t - t_r; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { - s64 t_clockdrift = sta->t_offset_setpoint - sta->t_offset; + s64 t_clockdrift = sta->mesh->t_offset_setpoint - sta->mesh->t_offset; msync_dbg(sdata, - "STA %pM : sta->t_offset=%lld, sta->t_offset_setpoint=%lld, t_clockdrift=%lld\n", - sta->sta.addr, (long long) sta->t_offset, - (long long) sta->t_offset_setpoint, + "STA %pM : t_offset=%lld, t_offset_setpoint=%lld, t_clockdrift=%lld\n", + sta->sta.addr, (long long) sta->mesh->t_offset, + (long long) sta->mesh->t_offset_setpoint, (long long) t_clockdrift); if (t_clockdrift > TOFFSET_MAXIMUM_ADJUSTMENT || @@ -152,12 +152,12 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, ifmsh->sync_offset_clockdrift_max = t_clockdrift; spin_unlock_bh(&ifmsh->sync_offset_lock); } else { - sta->t_offset_setpoint = sta->t_offset - TOFFSET_SET_MARGIN; + sta->mesh->t_offset_setpoint = sta->mesh->t_offset - TOFFSET_SET_MARGIN; set_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN); msync_dbg(sdata, - "STA %pM : offset was invalid, sta->t_offset=%lld\n", + "STA %pM : offset was invalid, t_offset=%lld\n", sta->sta.addr, - (long long) sta->t_offset); + (long long) sta->mesh->t_offset); } no_sync: diff --git a/kernel/net/mac80211/mlme.c b/kernel/net/mac80211/mlme.c index 26053bf2f..83097c383 100644 --- a/kernel/net/mac80211/mlme.c +++ b/kernel/net/mac80211/mlme.c @@ -6,6 +6,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2015 Intel Deutschland GmbH * * 
This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -80,13 +80,6 @@ MODULE_PARM_DESC(probe_wait_ms, "Maximum time(ms) to wait for probe response" " before disconnecting (reason 4)."); -/* - * Weight given to the latest Beacon frame when calculating average signal - * strength for Beacon frames received in the current BSS. This must be - * between 1 and 15. - */ -#define IEEE80211_SIGNAL_AVE_WEIGHT 3 - /* * How many Beacon frames need to have been used in average signal strength * before starting to indicate signal change events. @@ -118,7 +111,7 @@ void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata) if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) return; - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&sdata->u.mgd.bcn_mon_timer, @@ -134,7 +127,7 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) ifmgd->probe_send_count = 0; - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&sdata->u.mgd.conn_mon_timer, @@ -538,11 +531,16 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_ht_cap(pos, &ht_cap, cap); } +/* This function determines vht capability flags for the association + * and builds the IE. 
+ * Note - the function may set the owner of the MU-MIMO capability + */ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_supported_band *sband, struct ieee80211_vht_cap *ap_vht_cap) { + struct ieee80211_local *local = sdata->local; u8 *pos; u32 cap; struct ieee80211_sta_vht_cap vht_cap; @@ -576,7 +574,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, */ if (!(ap_vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) - cap &= ~IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE; + cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | + IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); + else if (!(ap_vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE))) + cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + + /* + * If some other vif is using the MU-MIMO capablity we cannot associate + * using MU-MIMO - this will lead to contradictions in the group-id + * mechanism. + * Ownership is defined since association request, in order to avoid + * simultaneous associations with MU-MIMO. 
+ */ + if (cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) { + bool disable_mu_mimo = false; + struct ieee80211_sub_if_data *other; + + list_for_each_entry_rcu(other, &local->interfaces, list) { + if (other->flags & IEEE80211_SDATA_MU_MIMO_OWNER) { + disable_mu_mimo = true; + break; + } + } + if (disable_mu_mimo) + cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + else + sdata->flags |= IEEE80211_SDATA_MU_MIMO_OWNER; + } mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK; @@ -669,17 +694,15 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) capab = WLAN_CAPABILITY_ESS; if (sband->band == IEEE80211_BAND_2GHZ) { - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; + capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; + capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } if (assoc_data->capability & WLAN_CAPABILITY_PRIVACY) capab |= WLAN_CAPABILITY_PRIVACY; if ((assoc_data->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) && - (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT)) + ieee80211_hw_check(&local->hw, SPECTRUM_MGMT)) capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM) @@ -887,7 +910,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) drv_mgd_prepare_tx(local, sdata); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_tx_skb(sdata, skb); @@ -912,7 +935,7 @@ void ieee80211_send_pspoll(struct ieee80211_local *local, void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - int powersave) + bool powersave) { struct sk_buff *skb; struct ieee80211_hdr_3addr *nullfunc; @@ 
-929,7 +952,7 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) @@ -1161,6 +1184,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; } + /* + * Drop all TDLS peers - either we disconnect or move to a different + * channel from this point on. There's no telling what our peer will do. + * The TDLS WIDER_BW scenario is also problematic, as peers might now + * have an incompatible wider chandef. + */ + ieee80211_teardown_tdls_peers(sdata); + mutex_lock(&local->mtx); mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, @@ -1174,7 +1205,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, chanctx = container_of(conf, struct ieee80211_chanctx, conf); if (local->use_chanctx && - !(local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA)) { + !ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) { sdata_info(sdata, "driver doesn't support chan-switch with channel contexts\n"); goto drop_connection; @@ -1348,21 +1379,26 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, */ if (has_80211h_pwr && (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { + new_ap_level = pwr_level_80211h; + + if (sdata->ap_power_level == new_ap_level) + return 0; + sdata_dbg(sdata, "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", pwr_level_80211h, chan_pwr, pwr_reduction_80211h, sdata->u.mgd.bssid); - new_ap_level = pwr_level_80211h; } else { /* has_cisco_pwr is always true here. 
*/ + new_ap_level = pwr_level_cisco; + + if (sdata->ap_power_level == new_ap_level) + return 0; + sdata_dbg(sdata, "Limiting TX power to %d dBm as advertised by %pM\n", pwr_level_cisco, sdata->u.mgd.bssid); - new_ap_level = pwr_level_cisco; } - if (sdata->ap_power_level == new_ap_level) - return 0; - sdata->ap_power_level = new_ap_level; if (__ieee80211_recalc_txpower(sdata)) return BSS_CHANGED_TXPOWER; @@ -1383,15 +1419,15 @@ static void ieee80211_enable_ps(struct ieee80211_local *local, return; if (conf->dynamic_ps_timeout > 0 && - !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)) { + !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(conf->dynamic_ps_timeout)); } else { - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) - ieee80211_send_nullfunc(local, sdata, 1); + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) + ieee80211_send_nullfunc(local, sdata, true); - if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && - (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) return; conf->flags |= IEEE80211_CONF_PS; @@ -1444,13 +1480,13 @@ static bool ieee80211_powersave_allowed(struct ieee80211_sub_if_data *sdata) } /* need to hold RTNL or interface lock */ -void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency) +void ieee80211_recalc_ps(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata, *found = NULL; int count = 0; int timeout; - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) { + if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) { local->ps_sdata = NULL; return; } @@ -1473,48 +1509,23 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency) } if (count == 1 && ieee80211_powersave_allowed(found)) { + u8 dtimper = found->u.mgd.dtim_period; s32 beaconint_us; - if (latency < 0) - latency = 
pm_qos_request(PM_QOS_NETWORK_LATENCY); - beaconint_us = ieee80211_tu_to_usec( found->vif.bss_conf.beacon_int); timeout = local->dynamic_ps_forced_timeout; - if (timeout < 0) { - /* - * Go to full PSM if the user configures a very low - * latency requirement. - * The 2000 second value is there for compatibility - * until the PM_QOS_NETWORK_LATENCY is configured - * with real values. - */ - if (latency > (1900 * USEC_PER_MSEC) && - latency != (2000 * USEC_PER_SEC)) - timeout = 0; - else - timeout = 100; - } + if (timeout < 0) + timeout = 100; local->hw.conf.dynamic_ps_timeout = timeout; - if (beaconint_us > latency) { - local->ps_sdata = NULL; - } else { - int maxslp = 1; - u8 dtimper = found->u.mgd.dtim_period; - - /* If the TIM IE is invalid, pretend the value is 1 */ - if (!dtimper) - dtimper = 1; - else if (dtimper > 1) - maxslp = min_t(int, dtimper, - latency / beaconint_us); - - local->hw.conf.max_sleep_period = maxslp; - local->hw.conf.ps_dtim_period = dtimper; - local->ps_sdata = found; - } + /* If the TIM IE is invalid, pretend the value is 1 */ + if (!dtimper) + dtimper = 1; + + local->hw.conf.ps_dtim_period = dtimper; + local->ps_sdata = found; } else { local->ps_sdata = NULL; } @@ -1596,21 +1607,21 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } - if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && !(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); } else { - ieee80211_send_nullfunc(local, sdata, 1); + ieee80211_send_nullfunc(local, sdata, true); /* Flush to get the tx status of nullfunc frame */ ieee80211_flush_queues(local, sdata, false); } } - if (!((local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) && - (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) || + if 
(!(ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && + ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) || (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; local->hw.conf.flags |= IEEE80211_CONF_PS; @@ -1738,10 +1749,10 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const u8 *wmm_param, size_t wmm_param_len) { - struct ieee80211_tx_queue_params params; + struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t left; - int count; + int count, ac; const u8 *pos; u8 uapsd_queues = 0; @@ -1775,25 +1786,24 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, int aci = (pos[0] >> 5) & 0x03; int acm = (pos[0] >> 4) & 0x01; bool uapsd = false; - int queue; switch (aci) { case 1: /* AC_BK */ - queue = 3; + ac = IEEE80211_AC_BK; if (acm) sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) uapsd = true; break; case 2: /* AC_VI */ - queue = 1; + ac = IEEE80211_AC_VI; if (acm) sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) uapsd = true; break; case 3: /* AC_VO */ - queue = 0; + ac = IEEE80211_AC_VO; if (acm) sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) @@ -1801,7 +1811,7 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, break; case 0: /* AC_BE */ default: - queue = 2; + ac = IEEE80211_AC_BE; if (acm) sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) @@ -1809,25 +1819,41 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, break; } - params.aifs = pos[0] & 0x0f; - params.cw_max = ecw2cw((pos[1] & 0xf0) >> 4); - params.cw_min = ecw2cw(pos[1] & 0x0f); - params.txop = get_unaligned_le16(pos + 2); - params.acm = acm; - params.uapsd = uapsd; + 
params[ac].aifs = pos[0] & 0x0f; + + if (params[ac].aifs < 2) { + sdata_info(sdata, + "AP has invalid WMM params (AIFSN=%d for ACI %d), will use 2\n", + params[ac].aifs, aci); + params[ac].aifs = 2; + } + params[ac].cw_max = ecw2cw((pos[1] & 0xf0) >> 4); + params[ac].cw_min = ecw2cw(pos[1] & 0x0f); + params[ac].txop = get_unaligned_le16(pos + 2); + params[ac].acm = acm; + params[ac].uapsd = uapsd; + + if (params[ac].cw_min > params[ac].cw_max) { + sdata_info(sdata, + "AP has invalid WMM params (CWmin/max=%d/%d for ACI %d), using defaults\n", + params[ac].cw_min, params[ac].cw_max, aci); + return false; + } + } + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { mlme_dbg(sdata, - "WMM queue=%d aci=%d acm=%d aifs=%d cWmin=%d cWmax=%d txop=%d uapsd=%d, downgraded=%d\n", - queue, aci, acm, - params.aifs, params.cw_min, params.cw_max, - params.txop, params.uapsd, - ifmgd->tx_tspec[queue].downgraded); - sdata->tx_conf[queue] = params; - if (!ifmgd->tx_tspec[queue].downgraded && - drv_conf_tx(local, sdata, queue, &params)) + "WMM AC=%d acm=%d aifs=%d cWmin=%d cWmax=%d txop=%d uapsd=%d, downgraded=%d\n", + ac, params[ac].acm, + params[ac].aifs, params[ac].cw_min, params[ac].cw_max, + params[ac].txop, params[ac].uapsd, + ifmgd->tx_tspec[ac].downgraded); + sdata->tx_conf[ac] = params[ac]; + if (!ifmgd->tx_tspec[ac].downgraded && + drv_conf_tx(local, sdata, ac, &params[ac])) sdata_err(sdata, - "failed to set TX queue parameters for queue %d\n", - queue); + "failed to set TX queue parameters for AC %d\n", + ac); } /* enable WMM or activate new settings */ @@ -1965,7 +1991,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, ieee80211_bss_info_change_notify(sdata, bss_info_changed); mutex_lock(&local->iflist_mtx); - ieee80211_recalc_ps(local, -1); + ieee80211_recalc_ps(local); mutex_unlock(&local->iflist_mtx); ieee80211_recalc_smps(sdata); @@ -2052,6 +2078,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, memset(&ifmgd->ht_capa_mask, 0, 
sizeof(ifmgd->ht_capa_mask)); memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa)); memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask)); + sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER; sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL; @@ -2070,7 +2097,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_bss_info_change_notify(sdata, changed); /* disassociated - set to defaults now */ - ieee80211_set_wmm_default(sdata, false); + ieee80211_set_wmm_default(sdata, false, false); del_timer_sync(&sdata->u.mgd.conn_mon_timer); del_timer_sync(&sdata->u.mgd.bcn_mon_timer); @@ -2132,10 +2159,10 @@ static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) __ieee80211_stop_poll(sdata); mutex_lock(&local->iflist_mtx); - ieee80211_recalc_ps(local, -1); + ieee80211_recalc_ps(local); mutex_unlock(&local->iflist_mtx); - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) goto out; /* @@ -2233,9 +2260,9 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) */ ifmgd->probe_send_count++; - if (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; - ieee80211_send_nullfunc(sdata->local, sdata, 0); + ieee80211_send_nullfunc(sdata->local, sdata, false); } else { int ssid_len; @@ -2308,7 +2335,7 @@ static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, goto out; mutex_lock(&sdata->local->iflist_mtx); - ieee80211_recalc_ps(sdata->local, -1); + ieee80211_recalc_ps(sdata->local); mutex_unlock(&sdata->local->iflist_mtx); ifmgd->probe_send_count = 0; @@ -2413,15 +2440,9 @@ static void ieee80211_beacon_connection_loss_work(struct work_struct *work) container_of(work, struct ieee80211_sub_if_data, u.mgd.beacon_connection_loss_work); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct sta_info 
*sta; - if (ifmgd->associated) { - rcu_read_lock(); - sta = sta_info_get(sdata, ifmgd->bssid); - if (sta) - sta->beacon_loss_count++; - rcu_read_unlock(); - } + if (ifmgd->associated) + ifmgd->beacon_loss_count++; if (ifmgd->connection_loss) { sdata_info(sdata, "Connection to AP %pM lost\n", @@ -2495,6 +2516,35 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.auth_data = NULL; } +static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, + bool assoc) +{ + struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; + + sdata_assert_lock(sdata); + + if (!assoc) { + /* + * we are not associated yet, the only timer that could be + * running is the timeout for the association response which + * which is not relevant anymore. + */ + del_timer_sync(&sdata->u.mgd.timer); + sta_info_destroy_addr(sdata, assoc_data->bss->bssid); + + eth_zero_addr(sdata->u.mgd.bssid); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); + sdata->u.mgd.flags = 0; + sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER; + mutex_lock(&sdata->local->mtx); + ieee80211_vif_release_channel(sdata); + mutex_unlock(&sdata->local->mtx); + } + + kfree(assoc_data); + sdata->u.mgd.assoc_data = NULL; +} + static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { @@ -2510,7 +2560,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, return; auth_data->expected_transaction = 4; drv_mgd_prepare_tx(sdata->local, sdata); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0, @@ -2687,28 +2737,42 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = 
&sdata->u.mgd; - const u8 *bssid = NULL; - u16 reason_code; + u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); sdata_assert_lock(sdata); if (len < 24 + 2) return; - if (!ifmgd->associated || - !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) - return; + if (ifmgd->associated && + ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) { + const u8 *bssid = ifmgd->associated->bssid; - bssid = ifmgd->associated->bssid; + sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", + bssid, reason_code, + ieee80211_get_reason_code_string(reason_code)); - reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); + ieee80211_set_disassoc(sdata, 0, 0, false, NULL); - sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", - bssid, reason_code, ieee80211_get_reason_code_string(reason_code)); + ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, + reason_code); + return; + } - ieee80211_set_disassoc(sdata, 0, 0, false, NULL); + if (ifmgd->assoc_data && + ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) { + const u8 *bssid = ifmgd->assoc_data->bss->bssid; - ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code); + sdata_info(sdata, + "deauthenticated from %pM while associating (Reason: %u=%s)\n", + bssid, reason_code, + ieee80211_get_reason_code_string(reason_code)); + + ieee80211_destroy_assoc_data(sdata, false); + + cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); + return; + } } @@ -2788,34 +2852,6 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, } } -static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, - bool assoc) -{ - struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; - - sdata_assert_lock(sdata); - - if (!assoc) { - /* - * we are not associated yet, the only timer that could be - * running is the timeout for the association response which - * which is not relevant anymore. 
- */ - del_timer_sync(&sdata->u.mgd.timer); - sta_info_destroy_addr(sdata, assoc_data->bss->bssid); - - eth_zero_addr(sdata->u.mgd.bssid); - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); - sdata->u.mgd.flags = 0; - mutex_lock(&sdata->local->mtx); - ieee80211_vif_release_channel(sdata); - mutex_unlock(&sdata->local->mtx); - } - - kfree(assoc_data); - sdata->u.mgd.assoc_data = NULL; -} - static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, size_t len) @@ -3028,11 +3064,21 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, */ ifmgd->wmm_last_param_set = -1; - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) && elems.wmm_param) - ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len); - else - ieee80211_set_wmm_default(sdata, false); + if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { + ieee80211_set_wmm_default(sdata, false, false); + } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, + elems.wmm_param_len)) { + /* still enable QoS since we might have HT/VHT */ + ieee80211_set_wmm_default(sdata, false, true); + /* set the disable-WMM flag in this case to disable + * tracking WMM parameter changes in the beacon if + * the parameters weren't actually valid. Doing so + * avoids changing parameters very strangely when + * the AP is going back and forth between valid and + * invalid parameters. 
+ */ + ifmgd->flags |= IEEE80211_STA_DISABLE_WMM; + } changed |= BSS_CHANGED_QOS; /* set AID and assoc capability, @@ -3211,16 +3257,6 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, if (ifmgd->associated && ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) ieee80211_reset_ap_probe(sdata); - - if (ifmgd->auth_data && !ifmgd->auth_data->bss->proberesp_ies && - ether_addr_equal(mgmt->bssid, ifmgd->auth_data->bss->bssid)) { - /* got probe response, continue with auth */ - sdata_info(sdata, "direct probe responded\n"); - ifmgd->auth_data->tries = 0; - ifmgd->auth_data->timeout = jiffies; - ifmgd->auth_data->timeout_started = true; - run_again(sdata, ifmgd->auth_data->timeout); - } } /* @@ -3299,7 +3335,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, } ifmgd->have_beacon = true; ifmgd->assoc_data->need_beacon = false; - if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { sdata->vif.bss_conf.sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); sdata->vif.bss_conf.sync_device_ts = @@ -3323,24 +3359,21 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, bssid = ifmgd->associated->bssid; /* Track average RSSI from the Beacon frames of the current AP */ - ifmgd->last_beacon_signal = rx_status->signal; if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) { ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; - ifmgd->ave_beacon_signal = rx_status->signal * 16; + ewma_beacon_signal_init(&ifmgd->ave_beacon_signal); ifmgd->last_cqm_event_signal = 0; ifmgd->count_beacon_signal = 1; ifmgd->last_ave_beacon_signal = 0; } else { - ifmgd->ave_beacon_signal = - (IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 + - (16 - IEEE80211_SIGNAL_AVE_WEIGHT) * - ifmgd->ave_beacon_signal) / 16; ifmgd->count_beacon_signal++; } + ewma_beacon_signal_add(&ifmgd->ave_beacon_signal, -rx_status->signal); + if (ifmgd->rssi_min_thold != 
ifmgd->rssi_max_thold && ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { - int sig = ifmgd->ave_beacon_signal; + int sig = -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal); int last_sig = ifmgd->last_ave_beacon_signal; struct ieee80211_event event = { .type = RSSI_EVENT, @@ -3367,10 +3400,11 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (bss_conf->cqm_rssi_thold && ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT && !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) { - int sig = ifmgd->ave_beacon_signal / 16; + int sig = -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal); int last_event = ifmgd->last_cqm_event_signal; int thold = bss_conf->cqm_rssi_thold; int hyst = bss_conf->cqm_rssi_hyst; + if (sig < thold && (last_event == 0 || sig < last_event - hyst)) { ifmgd->last_cqm_event_signal = sig; @@ -3405,31 +3439,27 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, len - baselen, false, &elems, care_about_ies, ncrc); - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) { - bool directed_tim = ieee80211_check_tim(elems.tim, - elems.tim_len, - ifmgd->aid); - if (directed_tim) { - if (local->hw.conf.dynamic_ps_timeout > 0) { - if (local->hw.conf.flags & IEEE80211_CONF_PS) { - local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, - IEEE80211_CONF_CHANGE_PS); - } - ieee80211_send_nullfunc(local, sdata, 0); - } else if (!local->pspolling && sdata->u.mgd.powersave) { - local->pspolling = true; - - /* - * Here is assumed that the driver will be - * able to send ps-poll frame and receive a - * response even though power save mode is - * enabled, but some drivers might require - * to disable power save here. This needs - * to be investigated. 
- */ - ieee80211_send_pspoll(local, sdata); + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && + ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid)) { + if (local->hw.conf.dynamic_ps_timeout > 0) { + if (local->hw.conf.flags & IEEE80211_CONF_PS) { + local->hw.conf.flags &= ~IEEE80211_CONF_PS; + ieee80211_hw_config(local, + IEEE80211_CONF_CHANGE_PS); } + ieee80211_send_nullfunc(local, sdata, false); + } else if (!local->pspolling && sdata->u.mgd.powersave) { + local->pspolling = true; + + /* + * Here is assumed that the driver will be + * able to send ps-poll frame and receive a + * response even though power save mode is + * enabled, but some drivers might require + * to disable power save here. This needs + * to be investigated. + */ + ieee80211_send_pspoll(local, sdata); } } @@ -3473,7 +3503,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, * the driver will use them. The synchronized view is currently * guaranteed only in certain callbacks. 
*/ - if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { sdata->vif.bss_conf.sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); sdata->vif.bss_conf.sync_device_ts = @@ -3516,7 +3546,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ifmgd->have_beacon = true; mutex_lock(&local->iflist_mtx); - ieee80211_recalc_ps(local, -1); + ieee80211_recalc_ps(local); mutex_unlock(&local->iflist_mtx); ieee80211_recalc_ps_vif(sdata); @@ -3550,7 +3580,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (sta && elems.opmode_notif) ieee80211_vht_handle_opmode(sdata, sta, *elems.opmode_notif, - rx_status->band, true); + rx_status->band); mutex_unlock(&local->sta_mtx); changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt, @@ -3666,12 +3696,14 @@ static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, reason); } -static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) +static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data = ifmgd->auth_data; u32 tx_flags = 0; + u16 trans = 1; + u16 status = 0; sdata_assert_lock(sdata); @@ -3695,54 +3727,27 @@ static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) drv_mgd_prepare_tx(local, sdata); - if (auth_data->bss->proberesp_ies) { - u16 trans = 1; - u16 status = 0; - - sdata_info(sdata, "send auth to %pM (try %d/%d)\n", - auth_data->bss->bssid, auth_data->tries, - IEEE80211_AUTH_MAX_TRIES); + sdata_info(sdata, "send auth to %pM (try %d/%d)\n", + auth_data->bss->bssid, auth_data->tries, + IEEE80211_AUTH_MAX_TRIES); - auth_data->expected_transaction = 2; + auth_data->expected_transaction = 2; - if (auth_data->algorithm == WLAN_AUTH_SAE) { - trans = auth_data->sae_trans; - status = auth_data->sae_status; - 
auth_data->expected_transaction = trans; - } - - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) - tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | - IEEE80211_TX_INTFL_MLME_CONN_TX; - - ieee80211_send_auth(sdata, trans, auth_data->algorithm, status, - auth_data->data, auth_data->data_len, - auth_data->bss->bssid, - auth_data->bss->bssid, NULL, 0, 0, - tx_flags); - } else { - const u8 *ssidie; + if (auth_data->algorithm == WLAN_AUTH_SAE) { + trans = auth_data->sae_trans; + status = auth_data->sae_status; + auth_data->expected_transaction = trans; + } - sdata_info(sdata, "direct probe to %pM (try %d/%i)\n", - auth_data->bss->bssid, auth_data->tries, - IEEE80211_AUTH_MAX_TRIES); + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) + tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | + IEEE80211_TX_INTFL_MLME_CONN_TX; - rcu_read_lock(); - ssidie = ieee80211_bss_get_ie(auth_data->bss, WLAN_EID_SSID); - if (!ssidie) { - rcu_read_unlock(); - return -EINVAL; - } - /* - * Direct probe is sent to broadcast address as some APs - * will not answer to direct packet in unassociated state. 
- */ - ieee80211_send_probe_req(sdata, sdata->vif.addr, NULL, - ssidie + 2, ssidie[1], - NULL, 0, (u32) -1, true, 0, - auth_data->bss->channel, false); - rcu_read_unlock(); - } + ieee80211_send_auth(sdata, trans, auth_data->algorithm, status, + auth_data->data, auth_data->data_len, + auth_data->bss->bssid, + auth_data->bss->bssid, NULL, 0, 0, + tx_flags); if (tx_flags == 0) { auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; @@ -3784,7 +3789,7 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) IEEE80211_ASSOC_MAX_TRIES); ieee80211_send_assoc(sdata); - if (!(local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) { + if (!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); @@ -3823,8 +3828,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) bool status_acked = ifmgd->status_acked; ifmgd->status_received = false; - if (ifmgd->auth_data && - (ieee80211_is_probe_req(fc) || ieee80211_is_auth(fc))) { + if (ifmgd->auth_data && ieee80211_is_auth(fc)) { if (status_acked) { ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT_SHORT; @@ -3855,7 +3859,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) * so let's just kill the auth data */ ieee80211_destroy_auth_data(sdata, false); - } else if (ieee80211_probe_auth(sdata)) { + } else if (ieee80211_auth(sdata)) { u8 bssid[ETH_ALEN]; struct ieee80211_event event = { .type = MLME_EVENT, @@ -3898,7 +3902,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) max_tries = max_nullfunc_tries; else max_tries = max_probe_tries; @@ -3923,7 +3927,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) } } else if 
(time_is_after_jiffies(ifmgd->probe_timeout)) run_again(sdata, ifmgd->probe_timeout); - else if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + else if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { mlme_dbg(sdata, "Failed to send nullfunc to AP %pM after %dms, disconnecting\n", bssid, probe_wait_ms); @@ -3992,18 +3996,13 @@ static void ieee80211_sta_monitor_work(struct work_struct *work) static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) { - u32 flags; - if (sdata->vif.type == NL80211_IFTYPE_STATION) { __ieee80211_stop_poll(sdata); /* let's probe the connection once */ - flags = sdata->local->hw.flags; - if (!(flags & IEEE80211_HW_CONNECTION_MONITOR)) + if (!ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.monitor_work); - /* and do all the other regular work too */ - ieee80211_queue_work(&sdata->local->hw, &sdata->work); } } @@ -4149,21 +4148,6 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) rcu_read_unlock(); } -int ieee80211_max_network_latency(struct notifier_block *nb, - unsigned long data, void *dummy) -{ - s32 latency_usec = (s32) data; - struct ieee80211_local *local = - container_of(nb, struct ieee80211_local, - network_latency_notifier); - - mutex_lock(&local->iflist_mtx); - ieee80211_recalc_ps(local, latency_usec); - mutex_unlock(&local->iflist_mtx); - - return NOTIFY_OK; -} - static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss) { @@ -4219,6 +4203,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; int ret; + u32 i; + bool have_80mhz; sband = local->hw.wiphy->bands[cbss->channel->band]; @@ -4269,6 +4255,20 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = 
false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if (!have_80mhz) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, ht_cap, ht_oper, vht_oper, @@ -4307,15 +4307,15 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, - struct cfg80211_bss *cbss, bool assoc) + struct cfg80211_bss *cbss, bool assoc, + bool override) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)cbss->priv; struct sta_info *new_sta = NULL; struct ieee80211_supported_band *sband; - struct ieee80211_sta_ht_cap sta_ht_cap; - bool have_sta = false, is_override = false; + bool have_sta = false; int err; sband = local->hw.wiphy->bands[cbss->channel->band]; @@ -4335,14 +4335,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, return -ENOMEM; } - memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); - ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); - - is_override = (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) != - (sband->ht_cap.cap & - IEEE80211_HT_CAP_SUP_WIDTH_20_40); - - if (new_sta || is_override) { + if (new_sta || override) { err = ieee80211_prep_channel(sdata, cbss); if (err) { if (new_sta) @@ -4419,8 +4412,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.sync_dtim_count = tim_ie[2]; else sdata->vif.bss_conf.sync_dtim_count = 0; - } else if (!(local->hw.flags & - IEEE80211_HW_TIMING_BEACON_ONLY)) { + } else if (!ieee80211_hw_check(&sdata->local->hw, + TIMING_BEACON_ONLY)) { ies = rcu_dereference(cbss->proberesp_ies); /* must be non-NULL since beacon IEs were NULL */ sdata->vif.bss_conf.sync_tsf = ies->tsf; @@ -4552,11 
+4545,11 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid); - err = ieee80211_prep_connection(sdata, req->bss, false); + err = ieee80211_prep_connection(sdata, req->bss, false, false); if (err) goto err_clear; - err = ieee80211_probe_auth(sdata); + err = ieee80211_auth(sdata); if (err) { sta_info_destroy_addr(sdata, req->bss->bssid); goto err_clear; @@ -4570,49 +4563,14 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, eth_zero_addr(ifmgd->bssid); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->auth_data = NULL; + mutex_lock(&sdata->local->mtx); + ieee80211_vif_release_channel(sdata); + mutex_unlock(&sdata->local->mtx); err_free: kfree(auth_data); return err; } -static bool ieee80211_usable_wmm_params(struct ieee80211_sub_if_data *sdata, - const u8 *wmm_param, int len) -{ - const u8 *pos; - size_t left; - - if (len < 8) - return false; - - if (wmm_param[5] != 1 /* version */) - return false; - - pos = wmm_param + 8; - left = len - 8; - - for (; left >= 4; left -= 4, pos += 4) { - u8 aifsn = pos[0] & 0x0f; - u8 ecwmin = pos[1] & 0x0f; - u8 ecwmax = (pos[1] & 0xf0) >> 4; - int aci = (pos[0] >> 5) & 0x03; - - if (aifsn < 2) { - sdata_info(sdata, - "AP has invalid WMM params (AIFSN=%d for ACI %d), disabling WMM\n", - aifsn, aci); - return false; - } - if (ecwmin > ecwmax) { - sdata_info(sdata, - "AP has invalid WMM params (ECWmin/max=%d/%d for ACI %d), disabling WMM\n", - ecwmin, ecwmax, aci); - return false; - } - } - - return true; -} - int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req) { @@ -4624,6 +4582,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband; const u8 *ssidie, *ht_ie, *vht_ie; int i, err; + bool override = false; assoc_data = kzalloc(sizeof(*assoc_data) + req->ie_len, GFP_KERNEL); if (!assoc_data) @@ -4676,39 +4635,6 @@ int ieee80211_mgd_assoc(struct 
ieee80211_sub_if_data *sdata, assoc_data->wmm = bss->wmm_used && (local->hw.queues >= IEEE80211_NUM_ACS); - if (assoc_data->wmm) { - /* try to check validity of WMM params IE */ - const struct cfg80211_bss_ies *ies; - const u8 *wp, *start, *end; - - rcu_read_lock(); - ies = rcu_dereference(req->bss->ies); - start = ies->data; - end = start + ies->len; - - while (true) { - wp = cfg80211_find_vendor_ie( - WLAN_OUI_MICROSOFT, - WLAN_OUI_TYPE_MICROSOFT_WMM, - start, end - start); - if (!wp) - break; - start = wp + wp[1] + 2; - /* if this IE is too short, try the next */ - if (wp[1] <= 4) - continue; - /* if this IE is WMM params, we found what we wanted */ - if (wp[6] == 1) - break; - } - - if (!wp || !ieee80211_usable_wmm_params(sdata, wp + 2, - wp[1] - 2)) { - assoc_data->wmm = false; - ifmgd->flags |= IEEE80211_STA_DISABLE_WMM; - } - rcu_read_unlock(); - } /* * IEEE802.11n does not allow TKIP/WEP as pairwise ciphers in HT mode. @@ -4728,14 +4654,6 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, } } - if (req->flags & ASSOC_REQ_DISABLE_HT) { - ifmgd->flags |= IEEE80211_STA_DISABLE_HT; - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - } - - if (req->flags & ASSOC_REQ_DISABLE_VHT) - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - /* Also disable HT if we don't support it or the AP doesn't use WMM */ sband = local->hw.wiphy->bands[req->bss->channel->band]; if (!sband->ht_cap.ht_supported || @@ -4802,7 +4720,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && - (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK), + ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK), "U-APSD not supported with HW_PS_NULLFUNC_STACK\n")) sdata->vif.driver_flags &= ~IEEE80211_VIF_SUPPORTS_UAPSD; @@ -4847,14 +4765,43 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ifmgd->dtim_period = 0; ifmgd->have_beacon = false; - err = ieee80211_prep_connection(sdata, req->bss, 
true); + /* override HT/VHT configuration only if the AP and we support it */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { + struct ieee80211_sta_ht_cap sta_ht_cap; + + if (req->flags & ASSOC_REQ_DISABLE_HT) + override = true; + + memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); + ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + + /* check for 40 MHz disable override */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_40MHZ) && + sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && + !(sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) + override = true; + + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + req->flags & ASSOC_REQ_DISABLE_VHT) + override = true; + } + + if (req->flags & ASSOC_REQ_DISABLE_HT) { + ifmgd->flags |= IEEE80211_STA_DISABLE_HT; + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + } + + if (req->flags & ASSOC_REQ_DISABLE_VHT) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + + err = ieee80211_prep_connection(sdata, req->bss, true, override); if (err) goto err_clear; rcu_read_lock(); beacon_ies = rcu_dereference(req->bss->beacon_ies); - if (sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC && + if (ieee80211_hw_check(&sdata->local->hw, NEED_DTIM_BEFORE_ASSOC) && !beacon_ies) { /* * Wait up to one beacon interval ... 
@@ -4881,7 +4828,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->timeout = jiffies; assoc_data->timeout_started = true; - if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { sdata->vif.bss_conf.sync_tsf = beacon_ies->tsf; sdata->vif.bss_conf.sync_device_ts = bss->device_ts_beacon; @@ -4946,6 +4893,25 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, return 0; } + if (ifmgd->assoc_data && + ether_addr_equal(ifmgd->assoc_data->bss->bssid, req->bssid)) { + sdata_info(sdata, + "aborting association with %pM by local choice (Reason: %u=%s)\n", + req->bssid, req->reason_code, + ieee80211_get_reason_code_string(req->reason_code)); + + drv_mgd_prepare_tx(sdata->local, sdata); + ieee80211_send_deauth_disassoc(sdata, req->bssid, + IEEE80211_STYPE_DEAUTH, + req->reason_code, tx, + frame_buf); + ieee80211_destroy_assoc_data(sdata, false); + ieee80211_report_disconnect(sdata, frame_buf, + sizeof(frame_buf), true, + req->reason_code); + return 0; + } + if (ifmgd->associated && ether_addr_equal(ifmgd->associated->bssid, req->bssid)) { sdata_info(sdata, diff --git a/kernel/net/mac80211/ocb.c b/kernel/net/mac80211/ocb.c index 358d5f9d8..0be0aadfc 100644 --- a/kernel/net/mac80211/ocb.c +++ b/kernel/net/mac80211/ocb.c @@ -75,7 +75,7 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, if (!sta) return; - sta->last_rx = jiffies; + sta->rx_stats.last_rx = jiffies; /* Add only mandatory rates for now */ sband = local->hw.wiphy->bands[band]; @@ -179,7 +179,7 @@ int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; - u32 changed = BSS_CHANGED_OCB; + u32 changed = BSS_CHANGED_OCB | BSS_CHANGED_BSSID; int err; if (ifocb->joined == true) diff --git a/kernel/net/mac80211/offchannel.c b/kernel/net/mac80211/offchannel.c index 683f0e3cb..044010371 100644 --- 
a/kernel/net/mac80211/offchannel.c +++ b/kernel/net/mac80211/offchannel.c @@ -46,7 +46,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) } if (!local->offchannel_ps_enabled || - !(local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) + !ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) /* * If power save was enabled, no need to send a nullfunc * frame because AP knows that we are sleeping. But if the @@ -57,7 +57,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) * to send a new nullfunc frame to inform the AP that we * are again sleeping. */ - ieee80211_send_nullfunc(local, sdata, 1); + ieee80211_send_nullfunc(local, sdata, true); } /* inform AP that we are awake again, unless power save is enabled */ @@ -66,7 +66,7 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; if (!local->ps_sdata) - ieee80211_send_nullfunc(local, sdata, 0); + ieee80211_send_nullfunc(local, sdata, false); else if (local->offchannel_ps_enabled) { /* * In !IEEE80211_HW_PS_NULLFUNC_STACK case the hardware @@ -93,7 +93,7 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) * restart the timer now and send a nullfunc frame to inform * the AP that we are awake. 
*/ - ieee80211_send_nullfunc(local, sdata, 0); + ieee80211_send_nullfunc(local, sdata, false); mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); } diff --git a/kernel/net/mac80211/pm.c b/kernel/net/mac80211/pm.c index ac6ad6238..00a43a70e 100644 --- a/kernel/net/mac80211/pm.c +++ b/kernel/net/mac80211/pm.c @@ -6,6 +6,13 @@ #include "driver-ops.h" #include "led.h" +static void ieee80211_sched_scan_cancel(struct ieee80211_local *local) +{ + if (ieee80211_request_sched_scan_stop(local)) + return; + cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy); +} + int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) { struct ieee80211_local *local = hw_to_local(hw); @@ -23,7 +30,8 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) ieee80211_del_virtual_monitor(local); - if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION) && + !(wowlan && wowlan->any)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); @@ -33,6 +41,10 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) mutex_unlock(&local->sta_mtx); } + /* keep sched_scan only in case of 'any' trigger */ + if (!(wowlan && wowlan->any)) + ieee80211_sched_scan_cancel(local); + ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, @@ -76,13 +88,29 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; ieee80211_mgd_quiesce(sdata); + /* If suspended during TX in progress, and wowlan + * is enabled (connection will be active) there + * can be a race where the driver is put out + * of power-save due to TX and during suspend + * dynamic_ps_timer is cancelled and TX packet + * is flushed, leaving the driver in ACTIVE even + * after resuming until 
dynamic_ps_timer puts + * driver back in DOZE. + */ + if (sdata->u.mgd.associated && + sdata->u.mgd.powersave && + !(local->hw.conf.flags & IEEE80211_CONF_PS)) { + local->hw.conf.flags |= IEEE80211_CONF_PS; + ieee80211_hw_config(local, + IEEE80211_CONF_CHANGE_PS); + } } err = drv_suspend(local, wowlan); if (err < 0) { local->quiescing = false; local->wowlan = false; - if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { diff --git a/kernel/net/mac80211/rate.c b/kernel/net/mac80211/rate.c index d53355b01..a4e2f4e67 100644 --- a/kernel/net/mac80211/rate.c +++ b/kernel/net/mac80211/rate.c @@ -29,6 +29,65 @@ module_param(ieee80211_default_rc_algo, charp, 0644); MODULE_PARM_DESC(ieee80211_default_rc_algo, "Default rate control algorithm for mac80211 to use"); +void rate_control_rate_init(struct sta_info *sta) +{ + struct ieee80211_local *local = sta->sdata->local; + struct rate_control_ref *ref = sta->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = sta->rate_ctrl_priv; + struct ieee80211_supported_band *sband; + struct ieee80211_chanctx_conf *chanctx_conf; + + ieee80211_sta_set_rx_nss(sta); + + if (!ref) + return; + + rcu_read_lock(); + + chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); + if (WARN_ON(!chanctx_conf)) { + rcu_read_unlock(); + return; + } + + sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; + + spin_lock_bh(&sta->rate_ctrl_lock); + ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, + priv_sta); + spin_unlock_bh(&sta->rate_ctrl_lock); + rcu_read_unlock(); + set_sta_flag(sta, WLAN_STA_RATE_CONTROL); +} + +void rate_control_rate_update(struct ieee80211_local *local, + struct ieee80211_supported_band *sband, + struct sta_info *sta, u32 changed) +{ + struct rate_control_ref *ref = local->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = 
sta->rate_ctrl_priv; + struct ieee80211_chanctx_conf *chanctx_conf; + + if (ref && ref->ops->rate_update) { + rcu_read_lock(); + + chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); + if (WARN_ON(!chanctx_conf)) { + rcu_read_unlock(); + return; + } + + spin_lock_bh(&sta->rate_ctrl_lock); + ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, + ista, priv_sta, changed); + spin_unlock_bh(&sta->rate_ctrl_lock); + rcu_read_unlock(); + } + drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); +} + int ieee80211_rate_control_register(const struct rate_control_ops *ops) { struct rate_control_alg *alg; @@ -103,7 +162,7 @@ ieee80211_rate_control_ops_get(const char *name) const struct rate_control_ops *ops; const char *alg_name; - kparam_block_sysfs_write(ieee80211_default_rc_algo); + kernel_param_lock(THIS_MODULE); if (!name) alg_name = ieee80211_default_rc_algo; else @@ -117,7 +176,7 @@ ieee80211_rate_control_ops_get(const char *name) /* try built-in one if specific alg requested but not found */ if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT)) ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT); - kparam_unblock_sysfs_write(ieee80211_default_rc_algo); + kernel_param_unlock(THIS_MODULE); return ops; } @@ -246,7 +305,10 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, info->control.rates[0].idx = i; break; } - WARN_ON_ONCE(i == sband->n_bitrates); + WARN_ONCE(i == sband->n_bitrates, + "no supported rates (0x%x) in rate_mask 0x%x with flags 0x%x\n", + sta ? sta->supp_rates[sband->band] : -1, + rate_mask, rate_flags); info->control.rates[0].count = (info->flags & IEEE80211_TX_CTL_NO_ACK) ? 
@@ -294,39 +356,37 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta, } EXPORT_SYMBOL(rate_control_send_low); -static bool rate_idx_match_legacy_mask(struct ieee80211_tx_rate *rate, - int n_bitrates, u32 mask) +static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask) { int j; /* See whether the selected rate or anything below it is allowed. */ - for (j = rate->idx; j >= 0; j--) { + for (j = *rate_idx; j >= 0; j--) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ - rate->idx = j; + *rate_idx = j; return true; } } /* Try to find a higher rate that would be allowed */ - for (j = rate->idx + 1; j < n_bitrates; j++) { + for (j = *rate_idx + 1; j < n_bitrates; j++) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ - rate->idx = j; + *rate_idx = j; return true; } } return false; } -static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate, - u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]) +static bool rate_idx_match_mcs_mask(s8 *rate_idx, u8 *mcs_mask) { int i, j; int ridx, rbit; - ridx = rate->idx / 8; - rbit = rate->idx % 8; + ridx = *rate_idx / 8; + rbit = *rate_idx % 8; /* sanity check */ if (ridx < 0 || ridx >= IEEE80211_HT_MCS_MASK_LEN) @@ -336,20 +396,20 @@ static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate, for (i = ridx; i >= 0; i--) { for (j = rbit; j >= 0; j--) if (mcs_mask[i] & BIT(j)) { - rate->idx = i * 8 + j; + *rate_idx = i * 8 + j; return true; } rbit = 7; } /* Try to find a higher rate that would be allowed */ - ridx = (rate->idx + 1) / 8; - rbit = (rate->idx + 1) % 8; + ridx = (*rate_idx + 1) / 8; + rbit = (*rate_idx + 1) % 8; for (i = ridx; i < IEEE80211_HT_MCS_MASK_LEN; i++) { for (j = rbit; j < 8; j++) if (mcs_mask[i] & BIT(j)) { - rate->idx = i * 8 + j; + *rate_idx = i * 8 + j; return true; } rbit = 0; @@ -357,37 +417,93 @@ static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate, return false; } +static bool rate_idx_match_vht_mcs_mask(s8 *rate_idx, u16 
*vht_mask) +{ + int i, j; + int ridx, rbit; + + ridx = *rate_idx >> 4; + rbit = *rate_idx & 0xf; + + if (ridx < 0 || ridx >= NL80211_VHT_NSS_MAX) + return false; + + /* See whether the selected rate or anything below it is allowed. */ + for (i = ridx; i >= 0; i--) { + for (j = rbit; j >= 0; j--) { + if (vht_mask[i] & BIT(j)) { + *rate_idx = (i << 4) | j; + return true; + } + } + rbit = 15; + } + + /* Try to find a higher rate that would be allowed */ + ridx = (*rate_idx + 1) >> 4; + rbit = (*rate_idx + 1) & 0xf; + for (i = ridx; i < NL80211_VHT_NSS_MAX; i++) { + for (j = rbit; j < 16; j++) { + if (vht_mask[i] & BIT(j)) { + *rate_idx = (i << 4) | j; + return true; + } + } + rbit = 0; + } + return false; +} -static void rate_idx_match_mask(struct ieee80211_tx_rate *rate, +static void rate_idx_match_mask(s8 *rate_idx, u16 *rate_flags, struct ieee80211_supported_band *sband, enum nl80211_chan_width chan_width, u32 mask, - u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]) + u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], + u16 vht_mask[NL80211_VHT_NSS_MAX]) { - struct ieee80211_tx_rate alt_rate; + if (*rate_flags & IEEE80211_TX_RC_VHT_MCS) { + /* handle VHT rates */ + if (rate_idx_match_vht_mcs_mask(rate_idx, vht_mask)) + return; + + *rate_idx = 0; + /* keep protection flags */ + *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | + IEEE80211_TX_RC_USE_CTS_PROTECT | + IEEE80211_TX_RC_USE_SHORT_PREAMBLE); - /* handle HT rates */ - if (rate->flags & IEEE80211_TX_RC_MCS) { - if (rate_idx_match_mcs_mask(rate, mcs_mask)) + *rate_flags |= IEEE80211_TX_RC_MCS; + if (chan_width == NL80211_CHAN_WIDTH_40) + *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; + + if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; /* also try the legacy rates. 
*/ - alt_rate.idx = 0; + *rate_flags &= ~(IEEE80211_TX_RC_MCS | + IEEE80211_TX_RC_40_MHZ_WIDTH); + if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, + mask)) + return; + } else if (*rate_flags & IEEE80211_TX_RC_MCS) { + /* handle HT rates */ + if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) + return; + + /* also try the legacy rates. */ + *rate_idx = 0; /* keep protection flags */ - alt_rate.flags = rate->flags & - (IEEE80211_TX_RC_USE_RTS_CTS | - IEEE80211_TX_RC_USE_CTS_PROTECT | - IEEE80211_TX_RC_USE_SHORT_PREAMBLE); - alt_rate.count = rate->count; - if (rate_idx_match_legacy_mask(&alt_rate, - sband->n_bitrates, mask)) { - *rate = alt_rate; + *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | + IEEE80211_TX_RC_USE_CTS_PROTECT | + IEEE80211_TX_RC_USE_SHORT_PREAMBLE); + if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, + mask)) return; - } - } else if (!(rate->flags & IEEE80211_TX_RC_VHT_MCS)) { + } else { /* handle legacy rates */ - if (rate_idx_match_legacy_mask(rate, sband->n_bitrates, mask)) + if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, + mask)) return; /* if HT BSS, and we handle a data frame, also try HT rates */ @@ -400,23 +516,19 @@ static void rate_idx_match_mask(struct ieee80211_tx_rate *rate, break; } - alt_rate.idx = 0; + *rate_idx = 0; /* keep protection flags */ - alt_rate.flags = rate->flags & - (IEEE80211_TX_RC_USE_RTS_CTS | - IEEE80211_TX_RC_USE_CTS_PROTECT | - IEEE80211_TX_RC_USE_SHORT_PREAMBLE); - alt_rate.count = rate->count; + *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | + IEEE80211_TX_RC_USE_CTS_PROTECT | + IEEE80211_TX_RC_USE_SHORT_PREAMBLE); - alt_rate.flags |= IEEE80211_TX_RC_MCS; + *rate_flags |= IEEE80211_TX_RC_MCS; if (chan_width == NL80211_CHAN_WIDTH_40) - alt_rate.flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; + *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; - if (rate_idx_match_mcs_mask(&alt_rate, mcs_mask)) { - *rate = alt_rate; + if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; - } } /* @@ -569,18 
+681,92 @@ static void rate_control_fill_sta_table(struct ieee80211_sta *sta, } } +static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, u32 *mask, + u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], + u16 vht_mask[NL80211_VHT_NSS_MAX]) +{ + u32 i, flags; + + *mask = sdata->rc_rateidx_mask[sband->band]; + flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); + for (i = 0; i < sband->n_bitrates; i++) { + if ((flags & sband->bitrates[i].flags) != flags) + *mask &= ~BIT(i); + } + + if (*mask == (1 << sband->n_bitrates) - 1 && + !sdata->rc_has_mcs_mask[sband->band] && + !sdata->rc_has_vht_mcs_mask[sband->band]) + return false; + + if (sdata->rc_has_mcs_mask[sband->band]) + memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[sband->band], + IEEE80211_HT_MCS_MASK_LEN); + else + memset(mcs_mask, 0xff, IEEE80211_HT_MCS_MASK_LEN); + + if (sdata->rc_has_vht_mcs_mask[sband->band]) + memcpy(vht_mask, sdata->rc_rateidx_vht_mcs_mask[sband->band], + sizeof(u16) * NL80211_VHT_NSS_MAX); + else + memset(vht_mask, 0xff, sizeof(u16) * NL80211_VHT_NSS_MAX); + + if (sta) { + __le16 sta_vht_cap; + u16 sta_vht_mask[NL80211_VHT_NSS_MAX]; + + /* Filter out rates that the STA does not support */ + *mask &= sta->supp_rates[sband->band]; + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) + mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; + + sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map; + ieee80211_get_vht_mask_from_cap(sta_vht_cap, sta_vht_mask); + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) + vht_mask[i] &= sta_vht_mask[i]; + } + + return true; +} + +static void +rate_control_apply_mask_ratetbl(struct sta_info *sta, + struct ieee80211_supported_band *sband, + struct ieee80211_sta_rates *rates) +{ + int i; + u32 mask; + u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; + u16 vht_mask[NL80211_VHT_NSS_MAX]; + enum nl80211_chan_width chan_width; + + if (!rate_control_cap_mask(sta->sdata, sband, &sta->sta, &mask, + mcs_mask, 
vht_mask)) + return; + + chan_width = sta->sdata->vif.bss_conf.chandef.width; + for (i = 0; i < IEEE80211_TX_RATE_TABLE_SIZE; i++) { + if (rates->rate[i].idx < 0) + break; + + rate_idx_match_mask(&rates->rate[i].idx, &rates->rate[i].flags, + sband, chan_width, mask, mcs_mask, + vht_mask); + } +} + static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_supported_band *sband, - struct ieee80211_tx_info *info, struct ieee80211_tx_rate *rates, int max_rates) { enum nl80211_chan_width chan_width; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; - bool has_mcs_mask; u32 mask; - u32 rate_flags; + u16 rate_flags, vht_mask[NL80211_VHT_NSS_MAX]; int i; /* @@ -588,30 +774,10 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, * default mask (allow all rates) is used to save some processing for * the common case. */ - mask = sdata->rc_rateidx_mask[info->band]; - has_mcs_mask = sdata->rc_has_mcs_mask[info->band]; - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); - for (i = 0; i < sband->n_bitrates; i++) - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - mask &= ~BIT(i); - - if (mask == (1 << sband->n_bitrates) - 1 && !has_mcs_mask) + if (!rate_control_cap_mask(sdata, sband, sta, &mask, mcs_mask, + vht_mask)) return; - if (has_mcs_mask) - memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[info->band], - sizeof(mcs_mask)); - else - memset(mcs_mask, 0xff, sizeof(mcs_mask)); - - if (sta) { - /* Filter out rates that the STA does not support */ - mask &= sta->supp_rates[info->band]; - for (i = 0; i < sizeof(mcs_mask); i++) - mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; - } - /* * Make sure the rate index selected for each TX rate is * included in the configured mask and change the rate indexes @@ -623,8 +789,10 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, if (rates[i].idx < 0) break; - rate_idx_match_mask(&rates[i], sband, chan_width, mask, - 
mcs_mask); + rate_flags = rates[i].flags; + rate_idx_match_mask(&rates[i].idx, &rate_flags, sband, + chan_width, mask, mcs_mask, vht_mask); + rates[i].flags = rate_flags; } } @@ -648,7 +816,7 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_data(hdr->frame_control)) - rate_control_apply_mask(sdata, sta, sband, info, dest, max_rates); + rate_control_apply_mask(sdata, sta, sband, dest, max_rates); if (dest[0].idx < 0) __rate_control_send_low(&sdata->local->hw, sband, sta, info, @@ -680,12 +848,18 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, info->control.rates[i].count = 0; } - if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL)) return; - ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); + if (ista) { + spin_lock_bh(&sta->rate_ctrl_lock); + ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); + spin_unlock_bh(&sta->rate_ctrl_lock); + } else { + ref->ops->get_rate(ref->priv, NULL, NULL, txrc); + } - if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_RC_TABLE) + if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE)) return; ieee80211_get_tx_rates(&sdata->vif, ista, txrc->skb, @@ -699,7 +873,10 @@ int rate_control_set_rates(struct ieee80211_hw *hw, { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sta_rates *old; + struct ieee80211_supported_band *sband; + sband = hw->wiphy->bands[ieee80211_get_sdata_band(sta->sdata)]; + rate_control_apply_mask_ratetbl(sta, sband, rates); /* * mac80211 guarantees that this function will not be called * concurrently, so the following RCU access is safe, even without @@ -727,7 +904,7 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, if (local->open_count) return -EBUSY; - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) { + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { if 
(WARN_ON(!local->ops->set_rts_threshold)) return -EINVAL; return 0; diff --git a/kernel/net/mac80211/rate.h b/kernel/net/mac80211/rate.h index 38652f09f..624fe5b81 100644 --- a/kernel/net/mac80211/rate.h +++ b/kernel/net/mac80211/rate.h @@ -42,10 +42,12 @@ static inline void rate_control_tx_status(struct ieee80211_local *local, if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; + spin_lock_bh(&sta->rate_ctrl_lock); if (ref->ops->tx_status) ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb); else ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info); + spin_unlock_bh(&sta->rate_ctrl_lock); } static inline void @@ -64,69 +66,21 @@ rate_control_tx_status_noskb(struct ieee80211_local *local, if (WARN_ON_ONCE(!ref->ops->tx_status_noskb)) return; + spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info); + spin_unlock_bh(&sta->rate_ctrl_lock); } -static inline void rate_control_rate_init(struct sta_info *sta) -{ - struct ieee80211_local *local = sta->sdata->local; - struct rate_control_ref *ref = sta->rate_ctrl; - struct ieee80211_sta *ista = &sta->sta; - void *priv_sta = sta->rate_ctrl_priv; - struct ieee80211_supported_band *sband; - struct ieee80211_chanctx_conf *chanctx_conf; - - ieee80211_sta_set_rx_nss(sta); - - if (!ref) - return; - - rcu_read_lock(); - - chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); - if (WARN_ON(!chanctx_conf)) { - rcu_read_unlock(); - return; - } - - sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; - - ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, - priv_sta); - rcu_read_unlock(); - set_sta_flag(sta, WLAN_STA_RATE_CONTROL); -} - -static inline void rate_control_rate_update(struct ieee80211_local *local, +void rate_control_rate_init(struct sta_info *sta); +void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, - struct sta_info *sta, u32 changed) -{ - struct rate_control_ref 
*ref = local->rate_ctrl; - struct ieee80211_sta *ista = &sta->sta; - void *priv_sta = sta->rate_ctrl_priv; - struct ieee80211_chanctx_conf *chanctx_conf; - - if (ref && ref->ops->rate_update) { - rcu_read_lock(); - - chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); - if (WARN_ON(!chanctx_conf)) { - rcu_read_unlock(); - return; - } - - ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, - ista, priv_sta, changed); - rcu_read_unlock(); - } - drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); -} + struct sta_info *sta, u32 changed); static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, - struct ieee80211_sta *sta, - gfp_t gfp) + struct sta_info *sta, gfp_t gfp) { - return ref->ops->alloc_sta(ref->priv, sta, gfp); + spin_lock_init(&sta->rate_ctrl_lock); + return ref->ops->alloc_sta(ref->priv, &sta->sta, gfp); } static inline void rate_control_free_sta(struct sta_info *sta) diff --git a/kernel/net/mac80211/rc80211_minstrel.c b/kernel/net/mac80211/rc80211_minstrel.c index 3ece7d103..b54f398cd 100644 --- a/kernel/net/mac80211/rc80211_minstrel.c +++ b/kernel/net/mac80211/rc80211_minstrel.c @@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) * computing cur_tp */ tmp_mrs = &mi->r[idx].stats; - tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma); + tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10; tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; return tmp_cur_tp; diff --git a/kernel/net/mac80211/rc80211_minstrel_debugfs.c b/kernel/net/mac80211/rc80211_minstrel_debugfs.c index 1db5f7c33..820b0abc9 100644 --- a/kernel/net/mac80211/rc80211_minstrel_debugfs.c +++ b/kernel/net/mac80211/rc80211_minstrel_debugfs.c @@ -85,12 +85,10 @@ minstrel_stats_open(struct inode *inode, struct file *file) file->private_data = ms; p = ms->buf; p += sprintf(p, "\n"); - p += sprintf(p, "best __________rate_________ ______" - "statistics______ ________last_______ " - "______sum-of________\n"); 
- p += sprintf(p, "rate [name idx airtime max_tp] [ ø(tp) ø(prob) " - "sd(prob)] [prob.|retry|suc|att] " - "[#success | #attempts]\n"); + p += sprintf(p, + "best __________rate_________ ________statistics________ ________last_______ ______sum-of________\n"); + p += sprintf(p, + "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; @@ -112,7 +110,7 @@ minstrel_stats_open(struct inode *inode, struct file *file) prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); - p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" + p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" " %3u.%1u %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, diff --git a/kernel/net/mac80211/rc80211_minstrel_ht.c b/kernel/net/mac80211/rc80211_minstrel_ht.c index 7430a1df2..239ed6e92 100644 --- a/kernel/net/mac80211/rc80211_minstrel_ht.c +++ b/kernel/net/mac80211/rc80211_minstrel_ht.c @@ -691,7 +691,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) if (likely(sta->ampdu_mlme.tid_tx[tid])) return; - ieee80211_start_tx_ba_session(pubsta, tid, 5000); + ieee80211_start_tx_ba_session(pubsta, tid, 0); } static void @@ -867,7 +867,13 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, else idx = index % MCS_GROUP_RATES + (group->streams - 1) * 8; - if (offset > 0) { + /* enable RTS/CTS if needed: + * - if station is in dynamic SMPS (and streams > 1) + * - for fallback rates, to increase chances of getting through + */ + if (offset > 0 || + (mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC && + group->streams > 1)) { ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts; flags |= IEEE80211_TX_RC_USE_RTS_CTS; } @@ -1070,7 +1076,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (sband->band != IEEE80211_BAND_2GHZ) return; - if (!(mp->hw->flags & 
IEEE80211_HW_SUPPORTS_HT_CCK_RATES)) + if (!ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) return; mi->cck_supported = 0; @@ -1328,7 +1334,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) prob = mi->groups[i].rates[j].prob_ewma; /* convert tp_avg from pkt per second in kbps */ - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024; + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; + tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024; return tp_avg; } diff --git a/kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c b/kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c index 6822ce0f9..5320e35ed 100644 --- a/kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -86,7 +86,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); - p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" + p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" " %3u.%1u %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, @@ -129,12 +129,10 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) p = ms->buf; p += sprintf(p, "\n"); - p += sprintf(p, " best ____________rate__________ " - "______statistics______ ________last_______ " - "______sum-of________\n"); - p += sprintf(p, "mode guard # rate [name idx airtime max_tp] " - "[ ø(tp) ø(prob) sd(prob)] [prob.|retry|suc|att] [#success | " - "#attempts]\n"); + p += sprintf(p, + " best ____________rate__________ ________statistics________ ________last_______ ______sum-of________\n"); + p += sprintf(p, + "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); for (i = 0; i < MINSTREL_CCK_GROUP; i++) diff --git a/kernel/net/mac80211/rx.c b/kernel/net/mac80211/rx.c index f6f8d9880..0e9e264bc 100644 --- 
a/kernel/net/mac80211/rx.c +++ b/kernel/net/mac80211/rx.c @@ -32,6 +32,61 @@ #include "wme.h" #include "rate.h" +static inline void ieee80211_rx_stats(struct net_device *dev, u32 len) +{ + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += len; + u64_stats_update_end(&tstats->syncp); +} + +static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, + enum nl80211_iftype type) +{ + __le16 fc = hdr->frame_control; + + if (ieee80211_is_data(fc)) { + if (len < 24) /* drop incorrect hdr len (data) */ + return NULL; + + if (ieee80211_has_a4(fc)) + return NULL; + if (ieee80211_has_tods(fc)) + return hdr->addr1; + if (ieee80211_has_fromds(fc)) + return hdr->addr2; + + return hdr->addr3; + } + + if (ieee80211_is_mgmt(fc)) { + if (len < 24) /* drop incorrect hdr len (mgmt) */ + return NULL; + return hdr->addr3; + } + + if (ieee80211_is_ctl(fc)) { + if (ieee80211_is_pspoll(fc)) + return hdr->addr1; + + if (ieee80211_is_back_req(fc)) { + switch (type) { + case NL80211_IFTYPE_STATION: + return hdr->addr2; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + return hdr->addr1; + default: + break; /* fall through to the return */ + } + } + } + + return NULL; +} + /* * monitor mode reception * @@ -42,7 +97,7 @@ static struct sk_buff *remove_monitor_info(struct ieee80211_local *local, struct sk_buff *skb, unsigned int rtap_vendor_space) { - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) { + if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) { if (likely(skb->len > FCS_LEN)) __pskb_trim(skb, skb->len - FCS_LEN); else { @@ -67,8 +122,7 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, hdr = (void *)(skb->data + rtap_vendor_space); if (status->flag & (RX_FLAG_FAILED_FCS_CRC | - RX_FLAG_FAILED_PLCP_CRC | - RX_FLAG_AMPDU_IS_ZEROLEN)) + RX_FLAG_FAILED_PLCP_CRC)) return true; if (unlikely(skb->len < 16 + present_fcs_len + 
rtap_vendor_space)) @@ -100,7 +154,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len = ALIGN(len, 8); len += 8; } - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) len += 1; /* antenna field, if we don't have per-chain info */ @@ -175,7 +229,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } mpdulen = skb->len; - if (!(has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS))) + if (!(has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS))) mpdulen += FCS_LEN; rthdr = (struct ieee80211_radiotap_header *)skb_push(skb, rtap_len); @@ -229,7 +283,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } /* IEEE80211_RADIOTAP_FLAGS */ - if (has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)) + if (has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) *pos |= IEEE80211_RADIOTAP_F_FCS; if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC)) *pos |= IEEE80211_RADIOTAP_F_BADFCS; @@ -279,7 +333,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos += 2; /* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */ - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM && + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM) && !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { *pos = status->signal; rthdr->it_present |= @@ -336,10 +390,6 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, cpu_to_le32(1 << IEEE80211_RADIOTAP_AMPDU_STATUS); put_unaligned_le32(status->ampdu_reference, pos); pos += 4; - if (status->flag & RX_FLAG_AMPDU_REPORT_ZEROLEN) - flags |= IEEE80211_RADIOTAP_AMPDU_REPORT_ZEROLEN; - if (status->flag & RX_FLAG_AMPDU_IS_ZEROLEN) - flags |= IEEE80211_RADIOTAP_AMPDU_IS_ZEROLEN; if (status->flag & RX_FLAG_AMPDU_LAST_KNOWN) flags |= IEEE80211_RADIOTAP_AMPDU_LAST_KNOWN; if (status->flag & RX_FLAG_AMPDU_IS_LAST) @@ -448,7 +498,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, * the SKB because it has a bad 
FCS/PLCP checksum. */ - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) + if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) present_fcs_len = FCS_LEN; /* ensure hdr->frame_control and vendor radiotap data are in skb head */ @@ -529,8 +579,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, } prev_dev = sdata->dev; - sdata->dev->stats.rx_packets++; - sdata->dev->stats.rx_bytes += skb->len; + ieee80211_rx_stats(sdata->dev, skb->len); } if (prev_dev) { @@ -981,7 +1030,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, struct sk_buff *skb = rx->skb; struct ieee80211_local *local = rx->local; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct sta_info *sta = rx->sta; struct tid_ampdu_rx *tid_agg_rx; u16 sc; @@ -1016,10 +1064,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_NORMAL) goto dont_reorder; - /* not actually part of this BA session */ - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - goto dont_reorder; - /* new, potentially un-ordered, ampdu frame - process it */ /* reset session timer */ @@ -1069,18 +1113,16 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; - if (rx->sta) { - if (unlikely(ieee80211_has_retry(hdr->frame_control) && - rx->sta->last_seq_ctrl[rx->seqno_idx] == - hdr->seq_ctrl)) { - if (status->rx_flags & IEEE80211_RX_RA_MATCH) { - rx->local->dot11FrameDuplicateCount++; - rx->sta->num_duplicates++; - } - return RX_DROP_UNUSABLE; - } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) { - rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl; - } + if (!rx->sta) + return RX_CONTINUE; + + if (unlikely(ieee80211_has_retry(hdr->frame_control) && + rx->sta->last_seq_ctrl[rx->seqno_idx] == hdr->seq_ctrl)) { + I802_DEBUG_INC(rx->local->dot11FrameDuplicateCount); + 
rx->sta->rx_stats.num_duplicates++; + return RX_DROP_UNUSABLE; + } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) { + rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl; } return RX_CONTINUE; @@ -1091,11 +1133,6 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; - if (unlikely(rx->skb->len < 16)) { - I802_DEBUG_INC(rx->local->rx_handlers_drop_short); - return RX_DROP_MONITOR; - } - /* Drop disallowed frame classes based on STA auth/assoc state; * IEEE 802.11, Chap 5.5. * @@ -1195,11 +1232,13 @@ static void sta_ps_start(struct sta_info *sta) atomic_inc(&ps->num_sta_ps); set_sta_flag(sta, WLAN_STA_PS_STA); - if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) + if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_SLEEP, &sta->sta); ps_dbg(sdata, "STA %pM aid %d enters power save mode\n", sta->sta.addr, sta->sta.aid); + ieee80211_clear_fast_xmit(sta); + if (!sta->sta.txq[0]) return; @@ -1236,22 +1275,22 @@ static void sta_ps_end(struct sta_info *sta) ieee80211_sta_ps_deliver_wakeup(sta); } -int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start) +int ieee80211_sta_ps_transition(struct ieee80211_sta *pubsta, bool start) { - struct sta_info *sta_inf = container_of(sta, struct sta_info, sta); + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); bool in_ps; - WARN_ON(!(sta_inf->local->hw.flags & IEEE80211_HW_AP_LINK_PS)); + WARN_ON(!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS)); /* Don't let the same PS state be set twice */ - in_ps = test_sta_flag(sta_inf, WLAN_STA_PS_STA); + in_ps = test_sta_flag(sta, WLAN_STA_PS_STA); if ((start && in_ps) || (!start && !in_ps)) return -EINVAL; if (start) - sta_ps_start(sta_inf); + sta_ps_start(sta); else - sta_ps_end(sta_inf); + sta_ps_end(sta); return 0; } @@ -1265,7 +1304,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) struct ieee80211_rx_status *status = 
IEEE80211_SKB_RXCB(rx->skb); int tid, ac; - if (!rx->sta || !(status->rx_flags & IEEE80211_RX_RA_MATCH)) + if (!rx->sta) return RX_CONTINUE; if (sdata->vif.type != NL80211_IFTYPE_AP && @@ -1277,7 +1316,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) * uAPSD and PS-Poll frames (the latter shouldn't even come up from * it to mac80211 since they're handled.) */ - if (sdata->local->hw.flags & IEEE80211_HW_AP_LINK_PS) + if (ieee80211_hw_check(&sdata->local->hw, AP_LINK_PS)) return RX_CONTINUE; /* @@ -1357,58 +1396,56 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) NL80211_IFTYPE_ADHOC); if (ether_addr_equal(bssid, rx->sdata->u.ibss.bssid) && test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { - sta->last_rx = jiffies; + sta->rx_stats.last_rx = jiffies; if (ieee80211_is_data(hdr->frame_control) && !is_multicast_ether_addr(hdr->addr1)) { - sta->last_rx_rate_idx = status->rate_idx; - sta->last_rx_rate_flag = status->flag; - sta->last_rx_rate_vht_flag = status->vht_flag; - sta->last_rx_rate_vht_nss = status->vht_nss; + sta->rx_stats.last_rate_idx = + status->rate_idx; + sta->rx_stats.last_rate_flag = + status->flag; + sta->rx_stats.last_rate_vht_flag = + status->vht_flag; + sta->rx_stats.last_rate_vht_nss = + status->vht_nss; } } } else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) { - u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len, - NL80211_IFTYPE_OCB); - /* OCB uses wild-card BSSID */ - if (is_broadcast_ether_addr(bssid)) - sta->last_rx = jiffies; + sta->rx_stats.last_rx = jiffies; } else if (!is_multicast_ether_addr(hdr->addr1)) { /* * Mesh beacons will update last_rx when if they are found to * match the current local configuration when processed. 
*/ - sta->last_rx = jiffies; + sta->rx_stats.last_rx = jiffies; if (ieee80211_is_data(hdr->frame_control)) { - sta->last_rx_rate_idx = status->rate_idx; - sta->last_rx_rate_flag = status->flag; - sta->last_rx_rate_vht_flag = status->vht_flag; - sta->last_rx_rate_vht_nss = status->vht_nss; + sta->rx_stats.last_rate_idx = status->rate_idx; + sta->rx_stats.last_rate_flag = status->flag; + sta->rx_stats.last_rate_vht_flag = status->vht_flag; + sta->rx_stats.last_rate_vht_nss = status->vht_nss; } } - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_CONTINUE; - if (rx->sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_rx_notify(rx->sdata, hdr); - sta->rx_fragments++; - sta->rx_bytes += rx->skb->len; + sta->rx_stats.fragments++; + sta->rx_stats.bytes += rx->skb->len; if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { - sta->last_signal = status->signal; - ewma_add(&sta->avg_signal, -status->signal); + sta->rx_stats.last_signal = status->signal; + ewma_signal_add(&sta->rx_stats.avg_signal, -status->signal); } if (status->chains) { - sta->chains = status->chains; + sta->rx_stats.chains = status->chains; for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) { int signal = status->chain_signal[i]; if (!(status->chains & BIT(i))) continue; - sta->chain_signal_last[i] = signal; - ewma_add(&sta->chain_signal_avg[i], -signal); + sta->rx_stats.chain_signal_last[i] = signal; + ewma_signal_add(&sta->rx_stats.chain_signal_avg[i], + -signal); } } @@ -1416,7 +1453,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) * Change STA power saving mode only at the end of a frame * exchange sequence. 
*/ - if (!(sta->local->hw.flags & IEEE80211_HW_AP_LINK_PS) && + if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) && !ieee80211_has_morefrags(hdr->frame_control) && !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) && (rx->sdata->vif.type == NL80211_IFTYPE_AP || @@ -1468,7 +1505,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) * Update counter and free packet here to avoid * counting this as a dropped packed. */ - sta->rx_packets++; + sta->rx_stats.packets++; dev_kfree_skb(rx->skb); return RX_QUEUED; } @@ -1517,13 +1554,6 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) * possible. */ - /* - * No point in finding a key and decrypting if the frame is neither - * addressed to us nor a multicast frame. - */ - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_CONTINUE; - /* start without a key */ rx->key = NULL; fc = hdr->frame_control; @@ -1657,7 +1687,6 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (unlikely(rx->key->flags & KEY_FLAG_TAINTED)) return RX_DROP_MONITOR; - rx->key->tx_rx_count++; /* TODO: add threshold stuff again */ } else { return RX_DROP_MONITOR; @@ -1725,7 +1754,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata, entry->seq = seq; entry->rx_queue = rx_queue; entry->last_frag = frag; - entry->ccmp = 0; + entry->check_sequential_pn = false; entry->extra_len = 0; return entry; @@ -1795,7 +1824,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) frag = sc & IEEE80211_SCTL_FRAG; if (is_multicast_ether_addr(hdr->addr1)) { - rx->local->dot11MulticastReceivedFrameCount++; + I802_DEBUG_INC(rx->local->dot11MulticastReceivedFrameCount); goto out_no_led; } @@ -1821,15 +1850,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->seqno_idx, &(rx->skb)); if (rx->key && (rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP || - rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) && + rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP || + 
rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) && ieee80211_has_protected(fc)) { int queue = rx->security_idx; - /* Store CCMP PN so that we can verify that the next - * fragment has a sequential PN value. */ - entry->ccmp = 1; + + /* Store CCMP/GCMP PN so that we can verify that the + * next fragment has a sequential PN value. + */ + entry->check_sequential_pn = true; memcpy(entry->last_pn, rx->key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN); + BUILD_BUG_ON(offsetof(struct ieee80211_key, + u.ccmp.rx_pn) != + offsetof(struct ieee80211_key, + u.gcmp.rx_pn)); + BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) != + sizeof(rx->key->u.gcmp.rx_pn[queue])); + BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN != + IEEE80211_GCMP_PN_LEN); } return RX_QUEUED; } @@ -1844,15 +1885,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; } - /* Verify that MPDUs within one MSDU have sequential PN values. - * (IEEE 802.11i, 8.3.3.4.5) */ - if (entry->ccmp) { + /* "The receiver shall discard MSDUs and MMPDUs whose constituent + * MPDU PN values are not incrementing in steps of 1." 
+ * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP) + * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP) + */ + if (entry->check_sequential_pn) { int i; u8 pn[IEEE80211_CCMP_PN_LEN], *rpn; int queue; + if (!rx->key || (rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP && - rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256)) + rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256)) return RX_DROP_UNUSABLE; memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN); for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) { @@ -1878,7 +1925,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->skb = __skb_dequeue(&entry->skb_list); if (skb_tailroom(rx->skb) < entry->extra_len) { - I802_DEBUG_INC(rx->local->rx_expand_skb_head2); + I802_DEBUG_INC(rx->local->rx_expand_skb_head_defrag); if (unlikely(pskb_expand_head(rx->skb, 0, entry->extra_len, GFP_ATOMIC))) { I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); @@ -1893,13 +1940,12 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) /* Complete frame has been reassembled - process it now */ status = IEEE80211_SKB_RXCB(rx->skb); - status->rx_flags |= IEEE80211_RX_FRAGMENTED; out: ieee80211_led_rx(rx->local); out_no_led: if (rx->sta) - rx->sta->rx_packets++; + rx->sta->rx_stats.packets++; return RX_CONTINUE; } @@ -2054,18 +2100,15 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) struct sk_buff *skb, *xmit_skb; struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; struct sta_info *dsta; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); - - dev->stats.rx_packets++; - dev->stats.rx_bytes += rx->skb->len; skb = rx->skb; xmit_skb = NULL; + ieee80211_rx_stats(dev, skb->len); + if ((sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && - (status->rx_flags & IEEE80211_RX_RA_MATCH) && 
(sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) { if (is_multicast_ether_addr(ehdr->h_dest)) { /* @@ -2121,9 +2164,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) /* deliver to local stack */ skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); - if (!(rx->flags & IEEE80211_RX_REORDER_TIMER) && - rx->local->napi) - napi_gro_receive(rx->local->napi, skb); + if (rx->napi) + napi_gro_receive(rx->napi, skb); else netif_receive_skb(skb); } @@ -2207,7 +2249,6 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) struct sk_buff *skb = rx->skb, *fwd_skb; struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u16 q, hdrlen; @@ -2238,8 +2279,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) mesh_rmc_check(rx->sdata, hdr->addr3, mesh_hdr)) return RX_DROP_MONITOR; - if (!ieee80211_is_data(hdr->frame_control) || - !(status->rx_flags & IEEE80211_RX_RA_MATCH)) + if (!ieee80211_is_data(hdr->frame_control)) return RX_CONTINUE; if (!mesh_hdr->ttl) @@ -2330,11 +2370,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames); ieee80211_add_pending_skb(local, fwd_skb); out: - if (is_multicast_ether_addr(hdr->addr1) || - sdata->dev->flags & IFF_PROMISC) + if (is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; - else - return RX_DROP_MONITOR; + return RX_DROP_MONITOR; } #endif @@ -2361,7 +2399,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) * for non-QoS-data frames. Here we know it's a data * frame, so count MSDUs. 
*/ - rx->sta->rx_msdu[rx->seqno_idx]++; + rx->sta->rx_stats.msdu[rx->seqno_idx]++; } /* @@ -2395,11 +2433,10 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) tf->category == WLAN_CATEGORY_TDLS && (tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_REQUEST || tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_RESPONSE)) { - rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TDLS_CHSW; - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&rx->local->hw, &sdata->work); + skb_queue_tail(&local->skb_queue_tdls_chsw, rx->skb); + schedule_work(&local->tdls_chsw_work); if (rx->sta) - rx->sta->rx_packets++; + rx->sta->rx_stats.packets++; return RX_QUEUED; } @@ -2445,6 +2482,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) struct { __le16 control, start_seq_num; } __packed bar_data; + struct ieee80211_event event = { + .type = BAR_RX_EVENT, + }; if (!rx->sta) return RX_DROP_MONITOR; @@ -2460,6 +2500,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) return RX_DROP_MONITOR; start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4; + event.u.ba.tid = tid; + event.u.ba.ssn = start_seq_num; + event.u.ba.sta = &rx->sta->sta; /* reset session timer */ if (tid_agg_rx->timeout) @@ -2472,6 +2515,8 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) start_seq_num, frames); spin_unlock(&tid_agg_rx->reorder_lock); + drv_event_callback(rx->local, rx->sdata, &event); + kfree_skb(skb); return RX_QUEUED; } @@ -2552,7 +2597,7 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) { int sig = 0; - if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) sig = status->signal; cfg80211_report_obss_beacon(rx->local->hw.wiphy, @@ -2561,9 +2606,6 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) rx->flags |= IEEE80211_RX_BEACON_REPORTED; } - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return 
RX_DROP_MONITOR; - if (ieee80211_drop_unencrypted_mgmt(rx)) return RX_DROP_UNUSABLE; @@ -2591,9 +2633,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) mgmt->u.action.category != WLAN_CATEGORY_SPECTRUM_MGMT) return RX_DROP_UNUSABLE; - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_DROP_UNUSABLE; - switch (mgmt->u.action.category) { case WLAN_CATEGORY_HT: /* reject HT action frames from stations not supporting HT */ @@ -2715,8 +2754,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; ieee80211_vht_handle_opmode(rx->sdata, rx->sta, - opmode, status->band, - false); + opmode, status->band); goto handled; } default: @@ -2859,7 +2897,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) handled: if (rx->sta) - rx->sta->rx_packets++; + rx->sta->rx_stats.packets++; dev_kfree_skb(rx->skb); return RX_QUEUED; @@ -2868,7 +2906,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) skb_queue_tail(&sdata->skb_queue, rx->skb); ieee80211_queue_work(&local->hw, &sdata->work); if (rx->sta) - rx->sta->rx_packets++; + rx->sta->rx_stats.packets++; return RX_QUEUED; } @@ -2889,13 +2927,13 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) * it transmitted were processed or returned. 
*/ - if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, rx->skb->data, rx->skb->len, 0)) { if (rx->sta) - rx->sta->rx_packets++; + rx->sta->rx_stats.packets++; dev_kfree_skb(rx->skb); return RX_QUEUED; } @@ -2954,7 +2992,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) info->flags = IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK | IEEE80211_TX_CTL_NO_CCK_RATE; - if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) info->hw_queue = local->hw.offchannel_tx_hw_queue; } @@ -3014,12 +3052,11 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) skb_queue_tail(&sdata->skb_queue, rx->skb); ieee80211_queue_work(&rx->local->hw, &sdata->work); if (rx->sta) - rx->sta->rx_packets++; + rx->sta->rx_stats.packets++; return RX_QUEUED; } -/* TODO: use IEEE80211_RX_FRAGMENTED */ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, struct ieee80211_rate *rate) { @@ -3077,8 +3114,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, } prev_dev = sdata->dev; - sdata->dev->stats.rx_packets++; - sdata->dev->stats.rx_bytes += skb->len; + ieee80211_rx_stats(sdata->dev, skb->len); } if (prev_dev) { @@ -3098,7 +3134,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, case RX_DROP_MONITOR: I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); if (rx->sta) - rx->sta->rx_dropped++; + rx->sta->rx_stats.dropped++; /* fall through */ case RX_CONTINUE: { struct ieee80211_rate *rate = NULL; @@ -3118,7 +3154,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, case RX_DROP_UNUSABLE: I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); if (rx->sta) - rx->sta->rx_dropped++; + rx->sta->rx_stats.dropped++; dev_kfree_skb(rx->skb); break; case RX_QUEUED: @@ -3232,7 +3268,7 @@ void 
ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* This is OK -- must be QoS data frame */ .security_idx = tid, .seqno_idx = tid, - .flags = IEEE80211_RX_REORDER_TIMER, + .napi = NULL, /* must be NULL to not have races */ }; struct tid_ampdu_rx *tid_agg_rx; @@ -3246,16 +3282,25 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames); spin_unlock(&tid_agg_rx->reorder_lock); + if (!skb_queue_empty(&frames)) { + struct ieee80211_event event = { + .type = BA_FRAME_TIMEOUT, + .u.ba.tid = tid, + .u.ba.sta = &sta->sta, + }; + drv_event_callback(rx.local, rx.sdata, &event); + } + ieee80211_rx_handlers(&rx, &frames); } /* main receive path */ -static bool prepare_for_handlers(struct ieee80211_rx_data *rx, - struct ieee80211_hdr *hdr) +static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) { struct ieee80211_sub_if_data *sdata = rx->sdata; struct sk_buff *skb = rx->skb; + struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type); int multicast = is_multicast_ether_addr(hdr->addr1); @@ -3264,30 +3309,23 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, case NL80211_IFTYPE_STATION: if (!bssid && !sdata->u.mgd.use_4addr) return false; - if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { - if (!(sdata->dev->flags & IFF_PROMISC) || - sdata->u.mgd.use_4addr) - return false; - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } - break; + if (multicast) + return true; + return ether_addr_equal(sdata->vif.addr, hdr->addr1); case NL80211_IFTYPE_ADHOC: if (!bssid) return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) return false; - if (ieee80211_is_beacon(hdr->frame_control)) { + if (ieee80211_is_beacon(hdr->frame_control)) return true; - } else if 
(!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { + if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) return false; - } else if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { - if (!(sdata->dev->flags & IFF_PROMISC)) - return false; - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } else if (!rx->sta) { + if (!multicast && + !ether_addr_equal(sdata->vif.addr, hdr->addr1)) + return false; + if (!rx->sta) { int rate_idx; if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) rate_idx = 0; /* TODO: HT/VHT rates */ @@ -3296,25 +3334,18 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, ieee80211_ibss_rx_no_sta(sdata, bssid, hdr->addr2, BIT(rate_idx)); } - break; + return true; case NL80211_IFTYPE_OCB: if (!bssid) return false; - if (ieee80211_is_beacon(hdr->frame_control)) { + if (!ieee80211_is_data_present(hdr->frame_control)) return false; - } else if (!is_broadcast_ether_addr(bssid)) { - ocb_dbg(sdata, "BSSID mismatch in OCB mode!\n"); + if (!is_broadcast_ether_addr(bssid)) return false; - } else if (!multicast && - !ether_addr_equal(sdata->dev->dev_addr, - hdr->addr1)) { - /* if we are in promisc mode we also accept - * packets not destined for us - */ - if (!(sdata->dev->flags & IFF_PROMISC)) - return false; - rx->flags &= ~IEEE80211_RX_RA_MATCH; - } else if (!rx->sta) { + if (!multicast && + !ether_addr_equal(sdata->dev->dev_addr, hdr->addr1)) + return false; + if (!rx->sta) { int rate_idx; if (status->flag & RX_FLAG_HT) rate_idx = 0; /* TODO: HT rates */ @@ -3323,22 +3354,17 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, ieee80211_ocb_rx_no_sta(sdata, bssid, hdr->addr2, BIT(rate_idx)); } - break; + return true; case NL80211_IFTYPE_MESH_POINT: - if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { - if (!(sdata->dev->flags & IFF_PROMISC)) - return false; - - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } - break; + if (multicast) + return true; + return ether_addr_equal(sdata->vif.addr, 
hdr->addr1); case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: - if (!bssid) { - if (!ether_addr_equal(sdata->vif.addr, hdr->addr1)) - return false; - } else if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) { + if (!bssid) + return ether_addr_equal(sdata->vif.addr, hdr->addr1); + + if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) { /* * Accept public action frames even when the * BSSID doesn't match, this is used for P2P @@ -3350,42 +3376,35 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, return false; if (ieee80211_is_public_action(hdr, skb->len)) return true; - if (!ieee80211_is_beacon(hdr->frame_control)) - return false; - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } else if (!ieee80211_has_tods(hdr->frame_control)) { + return ieee80211_is_beacon(hdr->frame_control); + } + + if (!ieee80211_has_tods(hdr->frame_control)) { /* ignore data frames to TDLS-peers */ if (ieee80211_is_data(hdr->frame_control)) return false; /* ignore action frames to TDLS-peers */ if (ieee80211_is_action(hdr->frame_control) && + !is_broadcast_ether_addr(bssid) && !ether_addr_equal(bssid, hdr->addr1)) return false; } - break; + return true; case NL80211_IFTYPE_WDS: if (bssid || !ieee80211_is_data(hdr->frame_control)) return false; - if (!ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2)) - return false; - break; + return ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2); case NL80211_IFTYPE_P2P_DEVICE: - if (!ieee80211_is_public_action(hdr, skb->len) && - !ieee80211_is_probe_req(hdr->frame_control) && - !ieee80211_is_probe_resp(hdr->frame_control) && - !ieee80211_is_beacon(hdr->frame_control)) - return false; - if (!ether_addr_equal(sdata->vif.addr, hdr->addr1) && - !multicast) - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - break; + return ieee80211_is_public_action(hdr, skb->len) || + ieee80211_is_probe_req(hdr->frame_control) || + ieee80211_is_probe_resp(hdr->frame_control) || + ieee80211_is_beacon(hdr->frame_control); default: - /* should never 
get here */ - WARN_ON_ONCE(1); break; } - return true; + WARN_ON_ONCE(1); + return false; } /* @@ -3399,13 +3418,10 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, { struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - struct ieee80211_hdr *hdr = (void *)skb->data; rx->skb = skb; - status->rx_flags |= IEEE80211_RX_RA_MATCH; - if (!prepare_for_handlers(rx, hdr)) + if (!ieee80211_accept_frame(rx)) return false; if (!consume) { @@ -3430,7 +3446,8 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, * be called with rcu_read_lock protection. */ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, - struct sk_buff *skb) + struct sk_buff *skb, + struct napi_struct *napi) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; @@ -3446,9 +3463,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, memset(&rx, 0, sizeof(rx)); rx.skb = skb; rx.local = local; + rx.napi = napi; if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) - local->dot11ReceivedFragmentCount++; + I802_DEBUG_INC(local->dot11ReceivedFragmentCount); if (ieee80211_is_mgmt(fc)) { /* drop frame if too short for header */ @@ -3547,7 +3565,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, * This is the receive path handler. It is called by a low level driver when an * 802.11 MPDU is received from the hardware. 
*/ -void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb, + struct napi_struct *napi) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate = NULL; @@ -3646,7 +3665,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) ieee80211_tpt_led_trig_rx(local, ((struct ieee80211_hdr *)skb->data)->frame_control, skb->len); - __ieee80211_rx_handle_packet(hw, skb); + __ieee80211_rx_handle_packet(hw, skb, napi); rcu_read_unlock(); @@ -3654,7 +3673,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) drop: kfree_skb(skb); } -EXPORT_SYMBOL(ieee80211_rx); +EXPORT_SYMBOL(ieee80211_rx_napi); /* This is a version of the rx handler that can be called from hard irq * context. Post the skb on the queue and schedule the tasklet */ diff --git a/kernel/net/mac80211/scan.c b/kernel/net/mac80211/scan.c index 7bb6a9383..acbe182b7 100644 --- a/kernel/net/mac80211/scan.c +++ b/kernel/net/mac80211/scan.c @@ -6,7 +6,7 @@ * Copyright 2005, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu - * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2013-2015 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -67,25 +66,30 @@ ieee80211_bss_info_update(struct ieee80211_local *local, struct cfg80211_bss *cbss; struct ieee80211_bss *bss; int clen, srlen; - enum nl80211_bss_scan_width scan_width; - s32 signal = 0; + struct cfg80211_inform_bss bss_meta = {}; + bool signal_valid; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) - signal = rx_status->signal * 100; - else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) - signal = (rx_status->signal * 100) / local->hw.max_signal; + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) + bss_meta.signal = rx_status->signal * 100; + else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) + bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; - scan_width = NL80211_BSS_CHAN_WIDTH_20; + bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_20; if (rx_status->flag & RX_FLAG_5MHZ) - scan_width = NL80211_BSS_CHAN_WIDTH_5; + bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_5; if (rx_status->flag & RX_FLAG_10MHZ) - scan_width = NL80211_BSS_CHAN_WIDTH_10; + bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10; - cbss = cfg80211_inform_bss_width_frame(local->hw.wiphy, channel, - scan_width, mgmt, len, signal, - GFP_ATOMIC); + bss_meta.chan = channel; + cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, + mgmt, len, GFP_ATOMIC); if (!cbss) return NULL; + /* In case the signal is invalid update the status */ + signal_valid = abs(channel->center_freq - cbss->channel->center_freq) + <= local->hw.wiphy->max_adj_channel_rssi_comp; + if (!signal_valid) + rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; bss = (void *)cbss->priv; @@ -257,7 +261,7 @@ 
static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; - if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) { + if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); @@ -310,6 +314,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) bool was_scanning = local->scanning; struct cfg80211_scan_request *scan_req; struct ieee80211_sub_if_data *scan_sdata; + struct ieee80211_sub_if_data *sdata; lockdep_assert_held(&local->mtx); @@ -326,7 +331,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) return; if (hw_scan && !aborted && - !(local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) && + !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(local)) { int rc; @@ -369,7 +374,16 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); - ieee80211_mesh_notify_scan_completed(local); + + /* Requeue all the work that might have been ignored while + * the scan was in progress; if there was none this will + * just be a no-op for the particular interface. 
+ */ + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (ieee80211_sdata_running(sdata)) + ieee80211_queue_work(&sdata->local->hw, &sdata->work); + } + if (was_scanning) ieee80211_start_next_roc(local); } @@ -520,7 +534,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; - if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) { + if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { int i, n_bands = 0; u8 bands_counted = 0; @@ -593,8 +607,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, /* We need to ensure power level is at max for scanning. */ ieee80211_hw_config(local, 0); - if ((req->channels[0]->flags & - IEEE80211_CHAN_NO_IR) || + if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR | + IEEE80211_CHAN_RADAR)) || !req->n_ssids) { next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; } else { @@ -641,7 +655,7 @@ ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) * TODO: channel switching also consumes quite some time, * add that delay as well to get a better estimation */ - if (chan->flags & IEEE80211_CHAN_NO_IR) + if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) return IEEE80211_PASSIVE_CHANNEL_TIME; return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; } @@ -773,7 +787,8 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, * * In any case, it is not necessary for a passive scan. 
*/ - if (chan->flags & IEEE80211_CHAN_NO_IR || !scan_req->n_ssids) { + if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || + !scan_req->n_ssids) { *next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; return; @@ -1136,10 +1151,10 @@ int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, return ret; } -int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata) +int ieee80211_request_sched_scan_stop(struct ieee80211_local *local) { - struct ieee80211_local *local = sdata->local; - int ret = 0; + struct ieee80211_sub_if_data *sched_scan_sdata; + int ret = -ENOENT; mutex_lock(&local->mtx); @@ -1151,8 +1166,10 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata) /* We don't want to restart sched scan anymore. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); - if (rcu_access_pointer(local->sched_scan_sdata)) { - ret = drv_sched_scan_stop(local, sdata); + sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, + lockdep_is_held(&local->mtx)); + if (sched_scan_sdata) { + ret = drv_sched_scan_stop(local, sched_scan_sdata); if (!ret) RCU_INIT_POINTER(local->sched_scan_sdata, NULL); } diff --git a/kernel/net/mac80211/sta_info.c b/kernel/net/mac80211/sta_info.c index 2880f2ae9..f91d18732 100644 --- a/kernel/net/mac80211/sta_info.c +++ b/kernel/net/mac80211/sta_info.c @@ -68,9 +68,10 @@ static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), - .key_offset = offsetof(struct sta_info, sta.addr), + .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, .hashfn = sta_addr_hash, + .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; /* Caller must hold local->sta_mtx */ @@ -248,6 +249,9 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) if (sta->sta.txq[0]) kfree(to_txq_info(sta->sta.txq[0])); 
kfree(rcu_dereference_raw(sta->sta.rates)); +#ifdef CONFIG_MAC80211_MESH + kfree(sta->mesh); +#endif kfree(sta); } @@ -281,12 +285,12 @@ static void sta_deliver_ps_frames(struct work_struct *wk) static int sta_prepare_rate_control(struct ieee80211_local *local, struct sta_info *sta, gfp_t gfp) { - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) return 0; sta->rate_ctrl = local->rate_ctrl; sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, - &sta->sta, gfp); + sta, gfp); if (!sta->rate_ctrl_priv) return -ENOMEM; @@ -299,7 +303,6 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; - struct timespec uptime; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); @@ -312,27 +315,33 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); mutex_init(&sta->ampdu_mlme.mtx); #ifdef CONFIG_MAC80211_MESH - if (ieee80211_vif_is_mesh(&sdata->vif) && - !sdata->u.mesh.user_mpm) - init_timer(&sta->plink_timer); - sta->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; + if (ieee80211_vif_is_mesh(&sdata->vif)) { + sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); + if (!sta->mesh) + goto free; + spin_lock_init(&sta->mesh->plink_lock); + if (ieee80211_vif_is_mesh(&sdata->vif) && + !sdata->u.mesh.user_mpm) + init_timer(&sta->mesh->plink_timer); + sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; + } #endif + memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); sta->local = local; sta->sdata = sdata; - sta->last_rx = jiffies; + sta->rx_stats.last_rx = jiffies; sta->sta_state = IEEE80211_STA_NONE; /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; - ktime_get_ts(&uptime); - sta->last_connected = uptime.tv_sec; - ewma_init(&sta->avg_signal, 1024, 8); - for (i = 0; i < 
ARRAY_SIZE(sta->chain_signal_avg); i++) - ewma_init(&sta->chain_signal_avg[i], 1024, 8); + sta->last_connected = ktime_get_seconds(); + ewma_signal_init(&sta->rx_stats.avg_signal); + for (i = 0; i < ARRAY_SIZE(sta->rx_stats.chain_signal_avg); i++) + ewma_signal_init(&sta->rx_stats.chain_signal_avg[i]); if (local->ops->wake_tx_queue) { void *txq_data; @@ -403,6 +412,9 @@ free_txq: if (sta->sta.txq[0]) kfree(to_txq_info(sta->sta.txq[0])); free: +#ifdef CONFIG_MAC80211_MESH + kfree(sta->mesh); +#endif kfree(sta); return NULL; } @@ -621,7 +633,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) bool indicate_tim = false; u8 ignore_for_tim = sta->sta.uapsd_queues; int ac; - u16 id; + u16 id = sta->sta.aid; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { @@ -629,19 +641,16 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) return; ps = &sta->sdata->bss->ps; - id = sta->sta.aid; #ifdef CONFIG_MAC80211_MESH } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { ps = &sta->sdata->u.mesh.ps; - /* TIM map only for 1 <= PLID <= IEEE80211_MAX_AID */ - id = sta->plid % (IEEE80211_MAX_AID + 1); #endif } else { return; } /* No need to do anything if the driver does all */ - if (local->hw.flags & IEEE80211_HW_AP_LINK_PS) + if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) return; if (sta->dead) @@ -1057,7 +1066,7 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, if (sdata != sta->sdata) continue; - if (time_after(jiffies, sta->last_rx + exp_time)) { + if (time_after(jiffies, sta->rx_stats.last_rx + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); @@ -1146,7 +1155,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) sta->driver_buffered_tids = 0; sta->txq_buffered_tids = 0; - if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) + if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, 
&sta->sta); if (sta->sta.txq[0]) { @@ -1217,6 +1226,8 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) ps_dbg(sdata, "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n", sta->sta.addr, sta->sta.aid, filtered, buffered); + + ieee80211_check_fast_xmit(sta); } static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, @@ -1615,6 +1626,7 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); + ieee80211_clear_fast_xmit(sta); return; } @@ -1632,6 +1644,7 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else { clear_sta_flag(sta, WLAN_STA_PS_DRIVER); + ieee80211_check_fast_xmit(sta); } } EXPORT_SYMBOL(ieee80211_sta_block_awake); @@ -1736,6 +1749,7 @@ int sta_info_move_state(struct sta_info *sta, !sta->sdata->u.vlan.sta)) atomic_dec(&sta->sdata->bss->num_mcast_sta); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); + ieee80211_clear_fast_xmit(sta); } break; case IEEE80211_STA_AUTHORIZED: @@ -1745,6 +1759,7 @@ int sta_info_move_state(struct sta_info *sta, !sta->sdata->u.vlan.sta)) atomic_inc(&sta->sdata->bss->num_mcast_sta); set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); + ieee80211_check_fast_xmit(sta); } break; default: @@ -1791,12 +1806,50 @@ u8 sta_info_tx_streams(struct sta_info *sta) >> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1; } +static void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) +{ + rinfo->flags = 0; + + if (sta->rx_stats.last_rate_flag & RX_FLAG_HT) { + rinfo->flags |= RATE_INFO_FLAGS_MCS; + rinfo->mcs = sta->rx_stats.last_rate_idx; + } else if (sta->rx_stats.last_rate_flag & RX_FLAG_VHT) { + rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS; + rinfo->nss = sta->rx_stats.last_rate_vht_nss; + rinfo->mcs = sta->rx_stats.last_rate_idx; + } else { + struct ieee80211_supported_band *sband; + int shift = ieee80211_vif_get_shift(&sta->sdata->vif); + u16 brate; + + sband = 
sta->local->hw.wiphy->bands[ + ieee80211_get_sdata_band(sta->sdata)]; + brate = sband->bitrates[sta->rx_stats.last_rate_idx].bitrate; + rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); + } + + if (sta->rx_stats.last_rate_flag & RX_FLAG_SHORT_GI) + rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; + + if (sta->rx_stats.last_rate_flag & RX_FLAG_5MHZ) + rinfo->bw = RATE_INFO_BW_5; + else if (sta->rx_stats.last_rate_flag & RX_FLAG_10MHZ) + rinfo->bw = RATE_INFO_BW_10; + else if (sta->rx_stats.last_rate_flag & RX_FLAG_40MHZ) + rinfo->bw = RATE_INFO_BW_40; + else if (sta->rx_stats.last_rate_vht_flag & RX_VHT_FLAG_80MHZ) + rinfo->bw = RATE_INFO_BW_80; + else if (sta->rx_stats.last_rate_vht_flag & RX_VHT_FLAG_160MHZ) + rinfo->bw = RATE_INFO_BW_160; + else + rinfo->bw = RATE_INFO_BW_20; +} + void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; - struct timespec uptime; u32 thr = 0; int i, ac; @@ -1818,51 +1871,54 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) BIT(NL80211_STA_INFO_STA_FLAGS) | BIT(NL80211_STA_INFO_BSS_PARAM) | BIT(NL80211_STA_INFO_CONNECTED_TIME) | - BIT(NL80211_STA_INFO_RX_DROP_MISC) | - BIT(NL80211_STA_INFO_BEACON_LOSS); + BIT(NL80211_STA_INFO_RX_DROP_MISC); + + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count; + sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_LOSS); + } - ktime_get_ts(&uptime); - sinfo->connected_time = uptime.tv_sec - sta->last_connected; - sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); + sinfo->connected_time = ktime_get_seconds() - sta->last_connected; + sinfo->inactive_time = + jiffies_to_msecs(jiffies - sta->rx_stats.last_rx); if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) | BIT(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) - 
sinfo->tx_bytes += sta->tx_bytes[ac]; + sinfo->tx_bytes += sta->tx_stats.bytes[ac]; sinfo->filled |= BIT(NL80211_STA_INFO_TX_BYTES64); } if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) - sinfo->tx_packets += sta->tx_packets[ac]; + sinfo->tx_packets += sta->tx_stats.packets[ac]; sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS); } if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) | BIT(NL80211_STA_INFO_RX_BYTES)))) { - sinfo->rx_bytes = sta->rx_bytes; + sinfo->rx_bytes = sta->rx_stats.bytes; sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64); } if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) { - sinfo->rx_packets = sta->rx_packets; + sinfo->rx_packets = sta->rx_stats.packets; sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS); } if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_RETRIES))) { - sinfo->tx_retries = sta->tx_retry_count; + sinfo->tx_retries = sta->status_stats.retry_count; sinfo->filled |= BIT(NL80211_STA_INFO_TX_RETRIES); } if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_FAILED))) { - sinfo->tx_failed = sta->tx_retry_failed; + sinfo->tx_failed = sta->status_stats.retry_failed; sinfo->filled |= BIT(NL80211_STA_INFO_TX_FAILED); } - sinfo->rx_dropped_misc = sta->rx_dropped; - sinfo->beacon_loss_count = sta->beacon_loss_count; + sinfo->rx_dropped_misc = sta->rx_stats.dropped; if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { @@ -1871,35 +1927,38 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } - if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) || - (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) { + if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || + ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) { - sinfo->signal = (s8)sta->last_signal; + sinfo->signal = 
(s8)sta->rx_stats.last_signal; sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL); } if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) { - sinfo->signal_avg = (s8) -ewma_read(&sta->avg_signal); + sinfo->signal_avg = + -ewma_signal_read(&sta->rx_stats.avg_signal); sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG); } } - if (sta->chains && + if (sta->rx_stats.chains && !(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); - sinfo->chains = sta->chains; + sinfo->chains = sta->rx_stats.chains; for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { - sinfo->chain_signal[i] = sta->chain_signal_last[i]; + sinfo->chain_signal[i] = + sta->rx_stats.chain_signal_last[i]; sinfo->chain_signal_avg[i] = - (s8) -ewma_read(&sta->chain_signal_avg[i]); + -ewma_signal_read(&sta->rx_stats.chain_signal_avg[i]); } } if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) { - sta_set_rate_info_tx(sta, &sta->last_tx_rate, &sinfo->txrate); + sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, + &sinfo->txrate); sinfo->filled |= BIT(NL80211_STA_INFO_TX_BITRATE); } @@ -1914,28 +1973,30 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); - tidstats->rx_msdu = sta->rx_msdu[i]; + tidstats->rx_msdu = sta->rx_stats.msdu[i]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); - tidstats->tx_msdu = sta->tx_msdu[i]; + tidstats->tx_msdu = sta->tx_stats.msdu[i]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && - local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); - tidstats->tx_msdu_retries = sta->tx_msdu_retries[i]; + 
tidstats->tx_msdu_retries = + sta->status_stats.msdu_retries[i]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && - local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); - tidstats->tx_msdu_failed = sta->tx_msdu_failed[i]; + tidstats->tx_msdu_failed = + sta->status_stats.msdu_failed[i]; } } @@ -1948,16 +2009,16 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) BIT(NL80211_STA_INFO_PEER_PM) | BIT(NL80211_STA_INFO_NONPEER_PM); - sinfo->llid = sta->llid; - sinfo->plid = sta->plid; - sinfo->plink_state = sta->plink_state; + sinfo->llid = sta->mesh->llid; + sinfo->plid = sta->mesh->plid; + sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET); - sinfo->t_offset = sta->t_offset; + sinfo->t_offset = sta->mesh->t_offset; } - sinfo->local_pm = sta->local_pm; - sinfo->peer_pm = sta->peer_pm; - sinfo->nonpeer_pm = sta->nonpeer_pm; + sinfo->local_pm = sta->mesh->local_pm; + sinfo->peer_pm = sta->mesh->peer_pm; + sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; #endif } diff --git a/kernel/net/mac80211/sta_info.h b/kernel/net/mac80211/sta_info.h index 5c164fb3f..2cafb21b4 100644 --- a/kernel/net/mac80211/sta_info.h +++ b/kernel/net/mac80211/sta_info.h @@ -53,6 +53,8 @@ * @WLAN_STA_TDLS_CHAN_SWITCH: This TDLS peer supports TDLS channel-switching * @WLAN_STA_TDLS_OFF_CHANNEL: The local STA is currently off-channel with this * TDLS peer + * @WLAN_STA_TDLS_WIDER_BW: This TDLS peer supports working on a wider bw on + * the BSS base channel. * @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was * keeping station in power-save mode, reply when the driver * unblocks the station. 
@@ -84,6 +86,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_TDLS_INITIATOR, WLAN_STA_TDLS_CHAN_SWITCH, WLAN_STA_TDLS_OFF_CHANNEL, + WLAN_STA_TDLS_WIDER_BW, WLAN_STA_UAPSD, WLAN_STA_SP, WLAN_STA_4ADDR_EVENT, @@ -130,6 +133,7 @@ enum ieee80211_agg_stop_reason { * @buf_size: reorder buffer size at receiver * @failed_bar_ssn: ssn of the last failed BAR tx attempt * @bar_pending: BAR needs to be re-sent + * @amsdu: support A-MSDU within A-MPDU * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. @@ -155,6 +159,7 @@ struct tid_ampdu_tx { u16 failed_bar_ssn; bool bar_pending; + bool amsdu; }; /** @@ -241,6 +246,84 @@ struct sta_ampdu_mlme { /* Value to indicate no TID reservation */ #define IEEE80211_TID_UNRESERVED 0xff +#define IEEE80211_FAST_XMIT_MAX_IV 18 + +/** + * struct ieee80211_fast_tx - TX fastpath information + * @key: key to use for hw crypto + * @hdr: the 802.11 header to put with the frame + * @hdr_len: actual 802.11 header length + * @sa_offs: offset of the SA + * @da_offs: offset of the DA + * @pn_offs: offset where to put PN for crypto (or 0 if not needed) + * @band: band this will be transmitted on, for tx_info + * @rcu_head: RCU head to free this struct + * + * This struct is small enough so that the common case (maximum crypto + * header length of 8 like for CCMP/GCMP) fits into a single 64-byte + * cache line. 
+ */ +struct ieee80211_fast_tx { + struct ieee80211_key *key; + u8 hdr_len; + u8 sa_offs, da_offs, pn_offs; + u8 band; + u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV + + sizeof(rfc1042_header)]; + + struct rcu_head rcu_head; +}; + +/** + * struct mesh_sta - mesh STA information + * @plink_lock: serialize access to plink fields + * @llid: Local link ID + * @plid: Peer link ID + * @aid: local aid supplied by peer + * @reason: Cancel reason on PLINK_HOLDING state + * @plink_retries: Retries in establishment + * @plink_state: peer link state + * @plink_timeout: timeout of peer link + * @plink_timer: peer link watch timer + * @t_offset: timing offset relative to this host + * @t_offset_setpoint: reference timing offset of this sta to be used when + * calculating clockdrift + * @local_pm: local link-specific power save mode + * @peer_pm: peer-specific power save mode towards local STA + * @nonpeer_pm: STA power save mode towards non-peer neighbors + * @processed_beacon: set to true after peer rates and capabilities are + * processed + * @fail_avg: moving percentage of failed MSDUs + */ +struct mesh_sta { + struct timer_list plink_timer; + + s64 t_offset; + s64 t_offset_setpoint; + + spinlock_t plink_lock; + u16 llid; + u16 plid; + u16 aid; + u16 reason; + u8 plink_retries; + + bool processed_beacon; + + enum nl80211_plink_state plink_state; + u32 plink_timeout; + + /* mesh power save */ + enum nl80211_mesh_power_mode local_pm; + enum nl80211_mesh_power_mode peer_pm; + enum nl80211_mesh_power_mode nonpeer_pm; + + /* moving percentage of failed MSDUs */ + unsigned int fail_avg; +}; + +DECLARE_EWMA(signal, 1024, 8) + /** * struct sta_info - STA information * @@ -250,20 +333,17 @@ struct sta_ampdu_mlme { * @list: global linked list entry * @free_list: list entry for keeping track of stations to free * @hash_node: hash node for rhashtable + * @addr: station's MAC address - duplicated from public part to + * let the hash table work with just a single cacheline * @local: pointer 
to the global information * @sdata: virtual interface this station belongs to * @ptk: peer keys negotiated with this station, if any * @ptk_idx: last installed peer key index * @gtk: group keys negotiated with this station, if any - * @gtk_idx: last installed group key index * @rate_ctrl: rate control algorithm reference + * @rate_ctrl_lock: spinlock used to protect rate control data + * (data inside the algorithm, so serializes calls there) * @rate_ctrl_priv: rate control private per-STA pointer - * @last_tx_rate: rate used for last transmit, to report to userspace as - * "the" transmit rate - * @last_rx_rate_idx: rx status rate index of the last data packet - * @last_rx_rate_flag: rx status flag of the last data packet - * @last_rx_rate_vht_flag: rx status vht flag of the last data packet - * @last_rx_rate_vht_nss: rx status nss of last data packet * @lock: used for locking all fields that require locking, see comments * in the header file. * @drv_deliver_wk: used for delivering frames after driver PS unblocking @@ -278,82 +358,55 @@ struct sta_ampdu_mlme { * the station when it leaves powersave or polls for frames * @driver_buffered_tids: bitmap of TIDs the driver has data buffered on * @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on - * @rx_packets: Number of MSDUs received from this STA - * @rx_bytes: Number of bytes received from this STA - * @last_rx: time (in jiffies) when last frame was received from this STA * @last_connected: time (in seconds) when a station got connected - * @num_duplicates: number of duplicate frames received from this STA - * @rx_fragments: number of received MPDUs - * @rx_dropped: number of dropped MPDUs from this STA - * @last_signal: signal of last received frame from this STA - * @avg_signal: moving average of signal of received frames from this STA - * @last_ack_signal: signal of last received Ack frame from this STA - * @last_seq_ctrl: last received seq/frag number from this STA (per RX queue) - * 
@tx_filtered_count: number of frames the hardware filtered for this STA - * @tx_retry_failed: number of frames that failed retry - * @tx_retry_count: total number of retries for frames to this STA - * @fail_avg: moving percentage of failed MSDUs - * @tx_packets: number of RX/TX MSDUs - * @tx_bytes: number of bytes transmitted to this STA - * @tx_fragments: number of transmitted MPDUs + * @last_seq_ctrl: last received seq/frag number from this STA (per TID + * plus one for non-QoS frames) * @tid_seq: per-TID sequence numbers for sending to this STA * @ampdu_mlme: A-MPDU state machine state * @timer_to_tid: identity mapping to ID timers - * @llid: Local link ID - * @plid: Peer link ID - * @reason: Cancel reason on PLINK_HOLDING state - * @plink_retries: Retries in establishment - * @plink_state: peer link state - * @plink_timeout: timeout of peer link - * @plink_timer: peer link watch timer - * @t_offset: timing offset relative to this host - * @t_offset_setpoint: reference timing offset of this sta to be used when - * calculating clockdrift - * @local_pm: local link-specific power save mode - * @peer_pm: peer-specific power save mode towards local STA - * @nonpeer_pm: STA power save mode towards non-peer neighbors + * @mesh: mesh STA information * @debugfs: debug filesystem info * @dead: set to true when sta is unlinked * @uploaded: set to true when sta is uploaded to the driver - * @lost_packets: number of consecutive lost packets * @sta: station information we share with the driver * @sta_state: duplicates information about station state (for debug) * @beacon_loss_count: number of times beacon loss has triggered * @rcu_head: RCU head used for freeing this station struct * @cur_max_bandwidth: maximum bandwidth to use for TX to the station, * taken from HT/VHT capabilities or VHT operating mode notification - * @chains: chains ever used for RX from this station - * @chain_signal_last: last signal (per chain) - * @chain_signal_avg: signal average (per chain) * 
@known_smps_mode: the smps_mode the client thinks we are in. Relevant for * AP only. * @cipher_scheme: optional cipher scheme for this station - * @last_tdls_pkt_time: holds the time in jiffies of last TDLS pkt ACKed * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED) - * @tx_msdu: MSDUs transmitted to this station, using IEEE80211_NUM_TID - * entry for non-QoS frames - * @tx_msdu_retries: MSDU retries for transmissions to to this station, - * using IEEE80211_NUM_TID entry for non-QoS frames - * @tx_msdu_failed: MSDU failures for transmissions to to this station, - * using IEEE80211_NUM_TID entry for non-QoS frames - * @rx_msdu: MSDUs received from this station, using IEEE80211_NUM_TID - * entry for non-QoS frames + * @fast_tx: TX fastpath information + * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to + * the BSS one. + * @tx_stats: TX statistics + * @rx_stats: RX statistics + * @status_stats: TX status statistics */ struct sta_info { /* General information, mostly static */ struct list_head list, free_list; struct rcu_head rcu_head; struct rhash_head hash_node; + u8 addr[ETH_ALEN]; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS]; - u8 gtk_idx; u8 ptk_idx; struct rate_control_ref *rate_ctrl; void *rate_ctrl_priv; + spinlock_t rate_ctrl_lock; spinlock_t lock; + struct ieee80211_fast_tx __rcu *fast_tx; + +#ifdef CONFIG_MAC80211_MESH + struct mesh_sta *mesh; +#endif + struct work_struct drv_deliver_wk; u16 listen_interval; @@ -374,45 +427,49 @@ struct sta_info { unsigned long driver_buffered_tids; unsigned long txq_buffered_tids; - /* Updated from RX path only, no locking requirements */ - unsigned long rx_packets; - u64 rx_bytes; - unsigned long last_rx; long last_connected; - unsigned long num_duplicates; - unsigned long rx_fragments; - unsigned long rx_dropped; - int 
last_signal; - struct ewma avg_signal; - int last_ack_signal; - u8 chains; - s8 chain_signal_last[IEEE80211_MAX_CHAINS]; - struct ewma chain_signal_avg[IEEE80211_MAX_CHAINS]; + /* Updated from RX path only, no locking requirements */ + struct { + unsigned long packets; + u64 bytes; + unsigned long last_rx; + unsigned long num_duplicates; + unsigned long fragments; + unsigned long dropped; + int last_signal; + struct ewma_signal avg_signal; + u8 chains; + s8 chain_signal_last[IEEE80211_MAX_CHAINS]; + struct ewma_signal chain_signal_avg[IEEE80211_MAX_CHAINS]; + int last_rate_idx; + u32 last_rate_flag; + u32 last_rate_vht_flag; + u8 last_rate_vht_nss; + u64 msdu[IEEE80211_NUM_TIDS + 1]; + } rx_stats; /* Plus 1 for non-QoS frames */ __le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1]; /* Updated from TX status path only, no locking requirements */ - unsigned long tx_filtered_count; - unsigned long tx_retry_failed, tx_retry_count; - /* moving percentage of failed MSDUs */ - unsigned int fail_avg; + struct { + unsigned long filtered; + unsigned long retry_failed, retry_count; + unsigned int lost_packets; + unsigned long last_tdls_pkt_time; + u64 msdu_retries[IEEE80211_NUM_TIDS + 1]; + u64 msdu_failed[IEEE80211_NUM_TIDS + 1]; + } status_stats; /* Updated from TX path only, no locking requirements */ - u32 tx_fragments; - u64 tx_packets[IEEE80211_NUM_ACS]; - u64 tx_bytes[IEEE80211_NUM_ACS]; - struct ieee80211_tx_rate last_tx_rate; - int last_rx_rate_idx; - u32 last_rx_rate_flag; - u32 last_rx_rate_vht_flag; - u8 last_rx_rate_vht_nss; + struct { + u64 packets[IEEE80211_NUM_ACS]; + u64 bytes[IEEE80211_NUM_ACS]; + struct ieee80211_tx_rate last_rate; + u64 msdu[IEEE80211_NUM_TIDS + 1]; + } tx_stats; u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]; - u64 tx_msdu[IEEE80211_NUM_TIDS + 1]; - u64 tx_msdu_retries[IEEE80211_NUM_TIDS + 1]; - u64 tx_msdu_failed[IEEE80211_NUM_TIDS + 1]; - u64 rx_msdu[IEEE80211_NUM_TIDS + 1]; /* * Aggregation information, locked with lock. 
@@ -420,26 +477,6 @@ struct sta_info { struct sta_ampdu_mlme ampdu_mlme; u8 timer_to_tid[IEEE80211_NUM_TIDS]; -#ifdef CONFIG_MAC80211_MESH - /* - * Mesh peer link attributes - * TODO: move to a sub-structure that is referenced with pointer? - */ - u16 llid; - u16 plid; - u16 reason; - u8 plink_retries; - enum nl80211_plink_state plink_state; - u32 plink_timeout; - struct timer_list plink_timer; - s64 t_offset; - s64 t_offset_setpoint; - /* mesh power save */ - enum nl80211_mesh_power_mode local_pm; - enum nl80211_mesh_power_mode peer_pm; - enum nl80211_mesh_power_mode nonpeer_pm; -#endif - #ifdef CONFIG_MAC80211_DEBUGFS struct sta_info_debugfsdentries { struct dentry *dir; @@ -449,17 +486,13 @@ struct sta_info { enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; - unsigned int lost_packets; - unsigned int beacon_loss_count; - enum ieee80211_smps_mode known_smps_mode; const struct ieee80211_cipher_scheme *cipher_scheme; - /* TDLS timeout data */ - unsigned long last_tdls_pkt_time; - u8 reserved_tid; + struct cfg80211_chan_def tdls_chandef; + /* keep last! */ struct ieee80211_sta sta; }; @@ -467,7 +500,7 @@ struct sta_info { static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta) { #ifdef CONFIG_MAC80211_MESH - return sta->plink_state; + return sta->mesh->plink_state; #endif return NL80211_PLINK_LISTEN; } @@ -570,7 +603,7 @@ u32 sta_addr_hash(const void *key, u32 length, u32 seed); _sta_bucket_idx(tbl, _addr), \ hash_node) \ /* compare address and run code only if it matches */ \ - if (ether_addr_equal(_sta->sta.addr, (_addr))) + if (ether_addr_equal(_sta->addr, (_addr))) /* * Get STA info by index, BROKEN! 
@@ -626,8 +659,6 @@ static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata) void sta_set_rate_info_tx(struct sta_info *sta, const struct ieee80211_tx_rate *rate, struct rate_info *rinfo); -void sta_set_rate_info_rx(struct sta_info *sta, - struct rate_info *rinfo); void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo); void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, diff --git a/kernel/net/mac80211/status.c b/kernel/net/mac80211/status.c index 005fdbe39..5bad05e9a 100644 --- a/kernel/net/mac80211/status.c +++ b/kernel/net/mac80211/status.c @@ -67,7 +67,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, IEEE80211_TX_INTFL_RETRANSMISSION; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; - sta->tx_filtered_count++; + sta->status_stats.filtered++; /* * Clear more-data bit on filtered frames, it might be set @@ -101,6 +101,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, * when it wakes up for the next time. 
*/ set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT); + ieee80211_clear_fast_xmit(sta); /* * This code races in the following way: @@ -181,8 +182,8 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) - sta->last_rx = jiffies; + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) + sta->rx_stats.last_rx = jiffies; if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; @@ -414,8 +415,7 @@ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, if (is_teardown) { /* This mechanism relies on being able to get ACKs */ - WARN_ON(!(local->hw.flags & - IEEE80211_HW_REPORTS_TX_ACK_STATUS)); + WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)); /* Check if peer has ACKed */ if (flags & IEEE80211_TX_STAT_ACK) { @@ -429,6 +429,74 @@ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, } } +static struct ieee80211_sub_if_data * +ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) +{ + struct ieee80211_sub_if_data *sdata; + + if (skb->dev) { + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (!sdata->dev) + continue; + + if (skb->dev == sdata->dev) + return sdata; + } + + return NULL; + } + + return rcu_dereference(local->p2p_sdata); +} + +static void ieee80211_report_ack_skb(struct ieee80211_local *local, + struct ieee80211_tx_info *info, + bool acked, bool dropped) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&local->ack_status_lock, flags); + skb = idr_find(&local->ack_status_frames, info->ack_frame_id); + if (skb) + idr_remove(&local->ack_status_frames, info->ack_frame_id); + spin_unlock_irqrestore(&local->ack_status_lock, flags); + + if (!skb) + return; + + if (dropped) { + dev_kfree_skb_any(skb); + return; + } + + if (info->flags & 
IEEE80211_TX_INTFL_NL80211_FRAME_TX) { + u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_hdr *hdr = (void *)skb->data; + + rcu_read_lock(); + sdata = ieee80211_sdata_from_skb(local, skb); + if (sdata) { + if (ieee80211_is_nullfunc(hdr->frame_control) || + ieee80211_is_qos_nullfunc(hdr->frame_control)) + cfg80211_probe_status(sdata->dev, hdr->addr1, + cookie, acked, + GFP_ATOMIC); + else + cfg80211_mgmt_tx_status(&sdata->wdev, cookie, + skb->data, skb->len, + acked, GFP_ATOMIC); + } + rcu_read_unlock(); + + dev_kfree_skb_any(skb); + } else { + /* consumes skb */ + skb_complete_wifi_ack(skb, acked); + } +} + static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped) { @@ -439,32 +507,16 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, if (dropped) acked = false; - if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX | - IEEE80211_TX_INTFL_MLME_CONN_TX)) { - struct ieee80211_sub_if_data *sdata = NULL; - struct ieee80211_sub_if_data *iter_sdata; - u64 cookie = (unsigned long)skb; + if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { + struct ieee80211_sub_if_data *sdata; rcu_read_lock(); - if (skb->dev) { - list_for_each_entry_rcu(iter_sdata, &local->interfaces, - list) { - if (!iter_sdata->dev) - continue; - - if (skb->dev == iter_sdata->dev) { - sdata = iter_sdata; - break; - } - } - } else { - sdata = rcu_dereference(local->p2p_sdata); - } + sdata = ieee80211_sdata_from_skb(local, skb); if (!sdata) { skb->dev = NULL; - } else if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { + } else { unsigned int hdr_size = ieee80211_hdrlen(hdr->frame_control); @@ -478,38 +530,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); - } else if (ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control)) { - 
cfg80211_probe_status(sdata->dev, hdr->addr1, - cookie, acked, GFP_ATOMIC); - } else { - cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data, - skb->len, acked, GFP_ATOMIC); } rcu_read_unlock(); - } - - if (unlikely(info->ack_frame_id)) { - struct sk_buff *ack_skb; - unsigned long flags; - - spin_lock_irqsave(&local->ack_status_lock, flags); - ack_skb = idr_find(&local->ack_status_frames, - info->ack_frame_id); - if (ack_skb) - idr_remove(&local->ack_status_frames, - info->ack_frame_id); - spin_unlock_irqrestore(&local->ack_status_lock, flags); - - if (ack_skb) { - if (!dropped) { - /* consumes ack_skb */ - skb_complete_wifi_ack(ack_skb, acked); - } else { - dev_kfree_skb_any(ack_skb); - } - } + } else if (info->ack_frame_id) { + ieee80211_report_ack_skb(local, info, acked, dropped); } } @@ -532,8 +557,9 @@ static void ieee80211_lost_packet(struct sta_info *sta, !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; - sta->lost_packets++; - if (!sta->sta.tdls && sta->lost_packets < STA_LOST_PKT_THRESHOLD) + sta->status_stats.lost_packets++; + if (!sta->sta.tdls && + sta->status_stats.lost_packets < STA_LOST_PKT_THRESHOLD) return; /* @@ -543,14 +569,15 @@ static void ieee80211_lost_packet(struct sta_info *sta, * mechanism. 
*/ if (sta->sta.tdls && - (sta->lost_packets < STA_LOST_TDLS_PKT_THRESHOLD || + (sta->status_stats.lost_packets < STA_LOST_TDLS_PKT_THRESHOLD || time_before(jiffies, - sta->last_tdls_pkt_time + STA_LOST_TDLS_PKT_TIME))) + sta->status_stats.last_tdls_pkt_time + + STA_LOST_TDLS_PKT_TIME))) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, - sta->lost_packets, GFP_ATOMIC); - sta->lost_packets = 0; + sta->status_stats.lost_packets, GFP_ATOMIC); + sta->status_stats.lost_packets = 0; } static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, @@ -611,18 +638,18 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, sta = container_of(pubsta, struct sta_info, sta); if (!acked) - sta->tx_retry_failed++; - sta->tx_retry_count += retry_count; + sta->status_stats.retry_failed++; + sta->status_stats.retry_count += retry_count; if (acked) { - sta->last_rx = jiffies; + sta->rx_stats.last_rx = jiffies; - if (sta->lost_packets) - sta->lost_packets = 0; + if (sta->status_stats.lost_packets) + sta->status_stats.lost_packets = 0; /* Track when last TDLS packet was ACKed */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) - sta->last_tdls_pkt_time = jiffies; + sta->status_stats.last_tdls_pkt_time = jiffies; } else { ieee80211_lost_packet(sta, info); } @@ -631,29 +658,83 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, } if (acked || noack_success) { - local->dot11TransmittedFrameCount++; - if (!pubsta) - local->dot11MulticastTransmittedFrameCount++; - if (retry_count > 0) - local->dot11RetryCount++; - if (retry_count > 1) - local->dot11MultipleRetryCount++; + I802_DEBUG_INC(local->dot11TransmittedFrameCount); + if (!pubsta) + I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); + if (retry_count > 0) + I802_DEBUG_INC(local->dot11RetryCount); + if (retry_count > 1) + I802_DEBUG_INC(local->dot11MultipleRetryCount); } else { - local->dot11FailedCount++; + I802_DEBUG_INC(local->dot11FailedCount); } } EXPORT_SYMBOL(ieee80211_tx_status_noskb); -void 
ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) +void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, + struct ieee80211_supported_band *sband, + int retry_count, int shift, bool send_to_cooked) { struct sk_buff *skb2; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_sub_if_data *sdata; + struct net_device *prev_dev = NULL; + int rtap_len; + + /* send frame to monitor interfaces now */ + rtap_len = ieee80211_tx_radiotap_len(info); + if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { + pr_err("ieee80211_tx_status: headroom too small\n"); + dev_kfree_skb(skb); + return; + } + ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count, + rtap_len, shift); + + /* XXX: is this sufficient for BPF? */ + skb_set_mac_header(skb, 0); + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->pkt_type = PACKET_OTHERHOST; + skb->protocol = htons(ETH_P_802_2); + memset(skb->cb, 0, sizeof(skb->cb)); + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (!ieee80211_sdata_running(sdata)) + continue; + + if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) && + !send_to_cooked) + continue; + + if (prev_dev) { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) { + skb2->dev = prev_dev; + netif_rx(skb2); + } + } + + prev_dev = sdata->dev; + } + } + if (prev_dev) { + skb->dev = prev_dev; + netif_rx(skb); + skb = NULL; + } + rcu_read_unlock(); + dev_kfree_skb(skb); +} + +void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) +{ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); __le16 fc; struct ieee80211_supported_band *sband; - struct ieee80211_sub_if_data *sdata; - struct net_device *prev_dev = NULL; struct sta_info *sta; struct rhash_head *tmp; int retry_count; @@ -661,7 +742,6 @@ void 
ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) bool send_to_cooked; bool acked; struct ieee80211_bar *bar; - int rtap_len; int shift = 0; int tid = IEEE80211_NUM_TIDS; const struct bucket_table *tbl; @@ -703,10 +783,11 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) ieee80211_get_qos_ctl(hdr), sta, true, acked); - if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) && + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) - sta->last_tx_rate = info->status.rates[rates_idx]; + sta->tx_stats.last_rate = + info->status.rates[rates_idx]; if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) && (ieee80211_is_data_qos(fc))) { @@ -752,13 +833,15 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) return; } else { if (!acked) - sta->tx_retry_failed++; - sta->tx_retry_count += retry_count; + sta->status_stats.retry_failed++; + sta->status_stats.retry_count += retry_count; if (ieee80211_is_data_present(fc)) { if (!acked) - sta->tx_msdu_failed[tid]++; - sta->tx_msdu_retries[tid] += retry_count; + sta->status_stats.msdu_failed[tid]++; + + sta->status_stats.msdu_retries[tid] += + retry_count; } } @@ -770,25 +853,23 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) ieee80211_frame_acked(sta, skb); if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) && - (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data, acked, info->status.tx_time); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (info->flags & IEEE80211_TX_STAT_ACK) { - if (sta->lost_packets) - sta->lost_packets = 0; + if (sta->status_stats.lost_packets) + sta->status_stats.lost_packets = 0; /* Track when last TDLS packet was ACKed */ if (test_sta_flag(sta, 
WLAN_STA_TDLS_PEER_AUTH)) - sta->last_tdls_pkt_time = jiffies; + sta->status_stats.last_tdls_pkt_time = + jiffies; } else { ieee80211_lost_packet(sta, info); } } - - if (acked) - sta->last_ack_signal = info->status.ack_signal; } rcu_read_unlock(); @@ -802,13 +883,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if ((info->flags & IEEE80211_TX_STAT_ACK) || (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) { if (ieee80211_is_first_frag(hdr->seq_ctrl)) { - local->dot11TransmittedFrameCount++; + I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (is_multicast_ether_addr(ieee80211_get_DA(hdr))) - local->dot11MulticastTransmittedFrameCount++; + I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) - local->dot11RetryCount++; + I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) - local->dot11MultipleRetryCount++; + I802_DEBUG_INC(local->dot11MultipleRetryCount); } /* This counter shall be incremented for an acknowledged MPDU @@ -818,14 +899,14 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if (!is_multicast_ether_addr(hdr->addr1) || ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) - local->dot11TransmittedFragmentCount++; + I802_DEBUG_INC(local->dot11TransmittedFragmentCount); } else { if (ieee80211_is_first_frag(hdr->seq_ctrl)) - local->dot11FailedCount++; + I802_DEBUG_INC(local->dot11FailedCount); } if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) && - (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) && + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { if (info->flags & IEEE80211_TX_STAT_ACK) { @@ -854,51 +935,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) return; } - /* send frame to monitor interfaces now */ - rtap_len = ieee80211_tx_radiotap_len(info); - if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { - pr_err("ieee80211_tx_status: 
headroom too small\n"); - dev_kfree_skb(skb); - return; - } - ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count, - rtap_len, shift); - - /* XXX: is this sufficient for BPF? */ - skb_set_mac_header(skb, 0); - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->pkt_type = PACKET_OTHERHOST; - skb->protocol = htons(ETH_P_802_2); - memset(skb->cb, 0, sizeof(skb->cb)); - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { - if (!ieee80211_sdata_running(sdata)) - continue; - - if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) && - !send_to_cooked) - continue; - - if (prev_dev) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2) { - skb2->dev = prev_dev; - netif_rx(skb2); - } - } - - prev_dev = sdata->dev; - } - } - if (prev_dev) { - skb->dev = prev_dev; - netif_rx(skb); - skb = NULL; - } - rcu_read_unlock(); - dev_kfree_skb(skb); + /* send to monitor interfaces */ + ieee80211_tx_monitor(local, skb, sband, retry_count, shift, send_to_cooked); } EXPORT_SYMBOL(ieee80211_tx_status); diff --git a/kernel/net/mac80211/tdls.c b/kernel/net/mac80211/tdls.c index fff0d864a..c9eeb3f12 100644 --- a/kernel/net/mac80211/tdls.c +++ b/kernel/net/mac80211/tdls.c @@ -4,6 +4,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH + * Copyright 2015 Intel Deutschland GmbH * * This file is GPLv2 as found in COPYING. 
*/ @@ -11,6 +12,7 @@ #include #include #include +#include #include "ieee80211_i.h" #include "driver-ops.h" @@ -35,20 +37,30 @@ void ieee80211_tdls_peer_del_work(struct work_struct *wk) mutex_unlock(&local->mtx); } -static void ieee80211_tdls_add_ext_capab(struct ieee80211_local *local, +static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - u8 *pos = (void *)skb_put(skb, 7); + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool chan_switch = local->hw.wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH; + bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && + !ifmgd->tdls_wider_bw_prohibited; + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; + bool vht = sband && sband->vht_cap.vht_supported; + u8 *pos = (void *)skb_put(skb, 10); *pos++ = WLAN_EID_EXT_CAPABILITY; - *pos++ = 5; /* len */ + *pos++ = 8; /* len */ *pos++ = 0x0; *pos++ = 0x0; *pos++ = 0x0; *pos++ = chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0; *pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED; + *pos++ = 0; + *pos++ = 0; + *pos++ = (vht && wider_band) ? 
WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED : 0; } static u8 @@ -60,6 +72,7 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *ch; struct cfg80211_chan_def chandef; int i, subband_start; + struct wiphy *wiphy = sdata->local->hw.wiphy; for (i = start; i <= end; i += spacing) { if (!ch_cnt) @@ -70,9 +83,8 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata, /* we will be active on the channel */ cfg80211_chandef_create(&chandef, ch, NL80211_CHAN_NO_HT); - if (cfg80211_reg_can_beacon(sdata->local->hw.wiphy, - &chandef, - sdata->wdev.iftype)) { + if (cfg80211_reg_can_beacon_relax(wiphy, &chandef, + sdata->wdev.iftype)) { ch_cnt++; /* * check if the next channel is also part of @@ -167,23 +179,16 @@ static void ieee80211_tdls_add_bss_coex_ie(struct sk_buff *skb) static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata, u16 status_code) { - struct ieee80211_local *local = sdata->local; - u16 capab; - /* The capability will be 0 when sending a failure code */ if (status_code != 0) return 0; - capab = 0; - if (ieee80211_get_sdata_band(sdata) != IEEE80211_BAND_2GHZ) - return capab; - - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; + if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_2GHZ) { + return WLAN_CAPABILITY_SHORT_SLOT_TIME | + WLAN_CAPABILITY_SHORT_PREAMBLE; + } - return capab; + return 0; } static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata, @@ -290,6 +295,60 @@ static void ieee80211_tdls_add_wmm_param_ie(struct ieee80211_sub_if_data *sdata, } } +static void +ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + /* IEEE802.11ac-2013 Table E-4 */ + u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 }; + struct cfg80211_chan_def uc = 
sta->tdls_chandef; + enum nl80211_chan_width max_width = ieee80211_get_sta_bw(&sta->sta); + int i; + + /* only support upgrading non-narrow channels up to 80Mhz */ + if (max_width == NL80211_CHAN_WIDTH_5 || + max_width == NL80211_CHAN_WIDTH_10) + return; + + if (max_width > NL80211_CHAN_WIDTH_80) + max_width = NL80211_CHAN_WIDTH_80; + + if (uc.width == max_width) + return; + /* + * Channel usage constrains in the IEEE802.11ac-2013 specification only + * allow expanding a 20MHz channel to 80MHz in a single way. In + * addition, there are no 40MHz allowed channels that are not part of + * the allowed 80MHz range in the 5GHz spectrum (the relevant one here). + */ + for (i = 0; i < ARRAY_SIZE(centers_80mhz); i++) + if (abs(uc.chan->center_freq - centers_80mhz[i]) <= 30) { + uc.center_freq1 = centers_80mhz[i]; + uc.width = NL80211_CHAN_WIDTH_80; + break; + } + + if (!uc.center_freq1) + return; + + /* proceed to downgrade the chandef until usable or the same */ + while (uc.width > max_width && + !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, + sdata->wdev.iftype)) + ieee80211_chandef_downgrade(&uc); + + if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) { + tdls_dbg(sdata, "TDLS ch width upgraded %d -> %d\n", + sta->tdls_chandef.width, uc.width); + + /* + * the station is not yet authorized when BW upgrade is done, + * locking is not required + */ + sta->tdls_chandef = uc; + } +} + static void ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, const u8 *peer, @@ -327,7 +386,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, offset = noffset; } - ieee80211_tdls_add_ext_capab(local, skb); + ieee80211_tdls_add_ext_capab(sdata, skb); /* add the QoS element if we support it */ if (local->hw.queues >= IEEE80211_NUM_ACS && @@ -357,15 +416,17 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, offset = noffset; } - rcu_read_lock(); + mutex_lock(&local->sta_mtx); /* we 
should have the peer STA if we're already responding */ if (action_code == WLAN_TDLS_SETUP_RESPONSE) { sta = sta_info_get(sdata, peer); if (WARN_ON_ONCE(!sta)) { - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); return; } + + sta->tdls_chandef = sdata->vif.bss_conf.chandef; } ieee80211_tdls_add_oper_classes(sdata, skb); @@ -391,10 +452,6 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap); } else if (action_code == WLAN_TDLS_SETUP_RESPONSE && ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) { - /* disable SMPS in TDLS responder */ - sta->sta.ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED - << IEEE80211_HT_CAP_SM_PS_SHIFT; - /* the peer caps are already intersected with our own */ memcpy(&ht_cap, &sta->sta.ht_cap, sizeof(ht_cap)); @@ -455,9 +512,16 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); ieee80211_ie_build_vht_cap(pos, &vht_cap, vht_cap.cap); + + /* + * if both peers support WIDER_BW, we can expand the chandef to + * a wider compatible one, up to 80MHz + */ + if (test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) + ieee80211_tdls_chandef_vht_upgrade(sdata, sta); } - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); /* add any remaining IEs */ if (extra_ies_len) { @@ -481,15 +545,17 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, enum ieee80211_band band = ieee80211_get_sdata_band(sdata); u8 *pos; - rcu_read_lock(); + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); ap_sta = sta_info_get(sdata, ifmgd->bssid); if (WARN_ON_ONCE(!sta || !ap_sta)) { - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); return; } + sta->tdls_chandef = sdata->vif.bss_conf.chandef; + /* add any custom IEs that go before the QoS IE */ if (extra_ies_len) { static const u8 before_qos[] = { @@ -525,35 +591,38 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, offset = 
noffset; } - /* if HT support is only added in TDLS, we need an HT-operation IE */ + /* + * if HT support is only added in TDLS, we need an HT-operation IE. + * add the IE as required by IEEE802.11-2012 9.23.3.2. + */ if (!ap_sta->sta.ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) { - struct ieee80211_chanctx_conf *chanctx_conf = - rcu_dereference(sdata->vif.chanctx_conf); - if (!WARN_ON(!chanctx_conf)) { - pos = skb_put(skb, 2 + - sizeof(struct ieee80211_ht_operation)); - /* send an empty HT operation IE */ - ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap, - &chanctx_conf->def, 0); - } + u16 prot = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | + IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT | + IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; + + pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation)); + ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap, + &sdata->vif.bss_conf.chandef, prot, + true); } ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); /* only include VHT-operation if not on the 2.4GHz band */ - if (band != IEEE80211_BAND_2GHZ && !ap_sta->sta.vht_cap.vht_supported && - sta->sta.vht_cap.vht_supported) { - struct ieee80211_chanctx_conf *chanctx_conf = - rcu_dereference(sdata->vif.chanctx_conf); - if (!WARN_ON(!chanctx_conf)) { - pos = skb_put(skb, 2 + - sizeof(struct ieee80211_vht_operation)); - ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap, - &chanctx_conf->def); - } + if (band != IEEE80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) { + /* + * if both peers support WIDER_BW, we can expand the chandef to + * a wider compatible one, up to 80MHz + */ + if (test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) + ieee80211_tdls_chandef_vht_upgrade(sdata, sta); + + pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation)); + ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap, + &sta->tdls_chandef); } - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); /* add any remaining IEs */ if (extra_ies_len) { @@ -802,7 +871,7 @@ 
ieee80211_tdls_build_mgmt_packet_data(struct ieee80211_sub_if_data *sdata, max(sizeof(struct ieee80211_mgmt), sizeof(struct ieee80211_tdls_data)) + 50 + /* supported rates */ - 7 + /* ext capab */ + 10 + /* ext capab */ 26 + /* max(WMM-info, WMM-param) */ 2 + max(sizeof(struct ieee80211_ht_cap), sizeof(struct ieee80211_ht_operation)) + @@ -953,7 +1022,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, * packet through the AP. */ if ((action_code == WLAN_TDLS_TEARDOWN) && - (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) { + ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { bool try_resend; /* Should we keep skb for possible resend */ /* If not sending directly to peer - no point in keeping skb */ @@ -1001,8 +1070,17 @@ ieee80211_tdls_mgmt_setup(struct wiphy *wiphy, struct net_device *dev, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; + enum ieee80211_smps_mode smps_mode = sdata->u.mgd.driver_smps_mode; int ret; + /* don't support setup with forced SMPS mode that's not off */ + if (smps_mode != IEEE80211_SMPS_AUTOMATIC && + smps_mode != IEEE80211_SMPS_OFF) { + tdls_dbg(sdata, "Aborting TDLS setup due to SMPS mode %d\n", + smps_mode); + return -ENOTSUPP; + } + mutex_lock(&local->mtx); /* we don't support concurrent TDLS peer setups */ @@ -1164,6 +1242,74 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, return ret; } +static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *ctx; + + mutex_lock(&local->chanctx_mtx); + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (conf) { + ctx = container_of(conf, struct ieee80211_chanctx, conf); + ieee80211_recalc_chanctx_chantype(local, ctx); + } + mutex_unlock(&local->chanctx_mtx); +} + 
+static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + bool result = false; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || + !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) || + !sta->sta.ht_cap.ht_supported) + continue; + result = true; + break; + } + rcu_read_unlock(); + + return result; +} + +static void +iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + bool tdls_ht; + u16 protection = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | + IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT | + IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; + u16 opmode; + + /* Nothing to do if the BSS connection uses HT */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + return; + + tdls_ht = (sta && sta->sta.ht_cap.ht_supported) || + iee80211_tdls_have_ht_peers(sdata); + + opmode = sdata->vif.bss_conf.ht_operation_mode; + + if (tdls_ht) + opmode |= protection; + else + opmode &= ~protection; + + if (opmode == sdata->vif.bss_conf.ht_operation_mode) + return; + + sdata->vif.bss_conf.ht_operation_mode = opmode; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT); +} + int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper) { @@ -1189,21 +1335,35 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, return -ENOTSUPP; } + /* protect possible bss_conf changes and avoid concurrency in + * ieee80211_bss_info_change_notify() + */ + sdata_lock(sdata); mutex_lock(&local->mtx); tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer); switch (oper) { case NL80211_TDLS_ENABLE_LINK: - rcu_read_lock(); + if (sdata->vif.csa_active) { + tdls_dbg(sdata, "TDLS: disallow link during CSA\n"); + ret = -EBUSY; + break; + } + + 
iee80211_tdls_recalc_chanctx(sdata); + + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); if (!sta) { - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); ret = -ENOLINK; break; } + iee80211_tdls_recalc_ht_protection(sdata, sta); + set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) || !ether_addr_equal(sdata->u.mgd.tdls_peer, peer)); @@ -1225,6 +1385,12 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, ieee80211_flush_queues(local, sdata, false); ret = sta_info_destroy_addr(sdata, peer); + + mutex_lock(&local->sta_mtx); + iee80211_tdls_recalc_ht_protection(sdata, NULL); + mutex_unlock(&local->sta_mtx); + + iee80211_tdls_recalc_chanctx(sdata); break; default: ret = -ENOTSUPP; @@ -1236,7 +1402,12 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, eth_zero_addr(sdata->u.mgd.tdls_peer); } + if (ret == 0) + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.request_smps_work); + mutex_unlock(&local->mtx); + sdata_unlock(sdata); return ret; } @@ -1639,6 +1810,31 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, return -EINVAL; } + if (!elems.sec_chan_offs) { + chan_type = NL80211_CHAN_HT20; + } else { + switch (elems.sec_chan_offs->sec_chan_offs) { + case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: + chan_type = NL80211_CHAN_HT40PLUS; + break; + case IEEE80211_HT_PARAM_CHA_SEC_BELOW: + chan_type = NL80211_CHAN_HT40MINUS; + break; + default: + chan_type = NL80211_CHAN_HT20; + break; + } + } + + cfg80211_chandef_create(&chandef, chan, chan_type); + + /* we will be active on the TDLS link */ + if (!cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &chandef, + sdata->wdev.iftype)) { + tdls_dbg(sdata, "TDLS chan switch to forbidden channel\n"); + return -EINVAL; + } + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, tf->sa); if (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) 
{ @@ -1659,27 +1855,15 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, goto out; } - if (!sta->sta.ht_cap.ht_supported) { - chan_type = NL80211_CHAN_NO_HT; - } else if (!elems.sec_chan_offs) { - chan_type = NL80211_CHAN_HT20; - } else { - switch (elems.sec_chan_offs->sec_chan_offs) { - case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: - chan_type = NL80211_CHAN_HT40PLUS; - break; - case IEEE80211_HT_PARAM_CHA_SEC_BELOW: - chan_type = NL80211_CHAN_HT40MINUS; - break; - default: - chan_type = NL80211_CHAN_HT20; - break; - } + /* peer should have known better */ + if (!sta->sta.ht_cap.ht_supported && elems.sec_chan_offs && + elems.sec_chan_offs->sec_chan_offs) { + tdls_dbg(sdata, "TDLS chan switch - wide chan unsupported\n"); + ret = -ENOTSUPP; + goto out; } - cfg80211_chandef_create(&chandef, chan, chan_type); params.chandef = &chandef; - params.switch_time = le16_to_cpu(elems.ch_sw_timing->switch_time); params.switch_timeout = le16_to_cpu(elems.ch_sw_timing->switch_timeout); @@ -1703,12 +1887,15 @@ out: return ret; } -void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb) +static void +ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { struct ieee80211_tdls_data *tf = (void *)skb->data; struct wiphy *wiphy = sdata->local->hw.wiphy; + ASSERT_RTNL(); + /* make sure the driver supports it */ if (!(wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH)) return; @@ -1732,3 +1919,47 @@ void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, return; } } + +void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + u16 reason = WLAN_REASON_TDLS_TEARDOWN_UNSPECIFIED; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + continue; + + 
ieee80211_tdls_oper_request(&sdata->vif, sta->sta.addr, + NL80211_TDLS_TEARDOWN, reason, + GFP_ATOMIC); + } + rcu_read_unlock(); +} + +void ieee80211_tdls_chsw_work(struct work_struct *wk) +{ + struct ieee80211_local *local = + container_of(wk, struct ieee80211_local, tdls_chsw_work); + struct ieee80211_sub_if_data *sdata; + struct sk_buff *skb; + struct ieee80211_tdls_data *tf; + + rtnl_lock(); + while ((skb = skb_dequeue(&local->skb_queue_tdls_chsw))) { + tf = (struct ieee80211_tdls_data *)skb->data; + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata) || + sdata->vif.type != NL80211_IFTYPE_STATION || + !ether_addr_equal(tf->da, sdata->vif.addr)) + continue; + + ieee80211_process_tdls_channel_switch(sdata, skb); + break; + } + + kfree_skb(skb); + } + rtnl_unlock(); +} diff --git a/kernel/net/mac80211/trace.h b/kernel/net/mac80211/trace.h index 4c2e76902..56c6d6cfa 100644 --- a/kernel/net/mac80211/trace.h +++ b/kernel/net/mac80211/trace.h @@ -33,11 +33,11 @@ __field(u32, chan_width) \ __field(u32, center_freq1) \ __field(u32, center_freq2) -#define CHANDEF_ASSIGN(c) \ - __entry->control_freq = (c)->chan ? (c)->chan->center_freq : 0; \ - __entry->chan_width = (c)->width; \ - __entry->center_freq1 = (c)->center_freq1; \ - __entry->center_freq2 = (c)->center_freq2; +#define CHANDEF_ASSIGN(c) \ + __entry->control_freq = (c) ? ((c)->chan ? (c)->chan->center_freq : 0) : 0; \ + __entry->chan_width = (c) ? (c)->width : 0; \ + __entry->center_freq1 = (c) ? (c)->center_freq1 : 0; \ + __entry->center_freq2 = (c) ? 
(c)->center_freq2 : 0; #define CHANDEF_PR_FMT " control:%d MHz width:%d center: %d/%d MHz" #define CHANDEF_PR_ARG __entry->control_freq, __entry->chan_width, \ __entry->center_freq1, __entry->center_freq2 @@ -69,6 +69,17 @@ #define CHANCTX_PR_ARG CHANDEF_PR_ARG, MIN_CHANDEF_PR_ARG, \ __entry->rx_chains_static, __entry->rx_chains_dynamic +#define KEY_ENTRY __field(u32, cipher) \ + __field(u8, hw_key_idx) \ + __field(u8, flags) \ + __field(s8, keyidx) +#define KEY_ASSIGN(k) __entry->cipher = (k)->cipher; \ + __entry->flags = (k)->flags; \ + __entry->keyidx = (k)->keyidx; \ + __entry->hw_key_idx = (k)->hw_key_idx; +#define KEY_PR_FMT " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d" +#define KEY_PR_ARG __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx + /* @@ -314,7 +325,6 @@ TRACE_EVENT(drv_config, __field(u32, flags) __field(int, power_level) __field(int, dynamic_ps_timeout) - __field(int, max_sleep_period) __field(u16, listen_interval) __field(u8, long_frame_max_tx_count) __field(u8, short_frame_max_tx_count) @@ -328,7 +338,6 @@ TRACE_EVENT(drv_config, __entry->flags = local->hw.conf.flags; __entry->power_level = local->hw.conf.power_level; __entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout; - __entry->max_sleep_period = local->hw.conf.max_sleep_period; __entry->listen_interval = local->hw.conf.listen_interval; __entry->long_frame_max_tx_count = local->hw.conf.long_frame_max_tx_count; @@ -486,6 +495,36 @@ TRACE_EVENT(drv_configure_filter, ) ); +TRACE_EVENT(drv_config_iface_filter, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + unsigned int filter_flags, + unsigned int changed_flags), + + TP_ARGS(local, sdata, filter_flags, changed_flags), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(unsigned int, filter_flags) + __field(unsigned int, changed_flags) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->filter_flags = filter_flags; + __entry->changed_flags = changed_flags; 
+ ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + " filter_flags: %#x changed_flags: %#x", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->filter_flags, + __entry->changed_flags + ) +); + TRACE_EVENT(drv_set_tim, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set), @@ -522,25 +561,19 @@ TRACE_EVENT(drv_set_key, LOCAL_ENTRY VIF_ENTRY STA_ENTRY - __field(u32, cipher) - __field(u8, hw_key_idx) - __field(u8, flags) - __field(s8, keyidx) + KEY_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; - __entry->cipher = key->cipher; - __entry->flags = key->flags; - __entry->keyidx = key->keyidx; - __entry->hw_key_idx = key->hw_key_idx; + KEY_ASSIGN(key); ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT, - LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT KEY_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, KEY_PR_ARG ) ); @@ -656,28 +689,25 @@ TRACE_EVENT(drv_get_stats, ) ); -TRACE_EVENT(drv_get_tkip_seq, +TRACE_EVENT(drv_get_key_seq, TP_PROTO(struct ieee80211_local *local, - u8 hw_key_idx, u32 *iv32, u16 *iv16), + struct ieee80211_key_conf *key), - TP_ARGS(local, hw_key_idx, iv32, iv16), + TP_ARGS(local, key), TP_STRUCT__entry( LOCAL_ENTRY - __field(u8, hw_key_idx) - __field(u32, iv32) - __field(u16, iv16) + KEY_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; - __entry->hw_key_idx = hw_key_idx; - __entry->iv32 = *iv32; - __entry->iv16 = *iv16; + KEY_ASSIGN(key); ), TP_printk( - LOCAL_PR_FMT, LOCAL_PR_ARG + LOCAL_PR_FMT KEY_PR_FMT, + LOCAL_PR_ARG, KEY_PR_ARG ) ); @@ -942,9 +972,9 @@ TRACE_EVENT(drv_ampdu_action, struct ieee80211_sub_if_data *sdata, enum ieee80211_ampdu_mlme_action action, struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size), + u16 *ssn, u8 buf_size, bool amsdu), - TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size), + TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size, amsdu), TP_STRUCT__entry( LOCAL_ENTRY @@ -953,6 +983,7 @@ TRACE_EVENT(drv_ampdu_action, __field(u16, tid) __field(u16, ssn) __field(u8, 
buf_size) + __field(bool, amsdu) VIF_ENTRY ), @@ -964,12 +995,13 @@ TRACE_EVENT(drv_ampdu_action, __entry->tid = tid; __entry->ssn = ssn ? *ssn : 0; __entry->buf_size = buf_size; + __entry->amsdu = amsdu; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d", + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d amsdu:%d", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action, - __entry->tid, __entry->buf_size + __entry->tid, __entry->buf_size, __entry->amsdu ) ); diff --git a/kernel/net/mac80211/tx.c b/kernel/net/mac80211/tx.c index 5787f15a3..bdc224d50 100644 --- a/kernel/net/mac80211/tx.c +++ b/kernel/net/mac80211/tx.c @@ -37,6 +37,16 @@ /* misc utils */ +static inline void ieee80211_tx_stats(struct net_device *dev, u32 len) +{ + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_packets++; + tstats->tx_bytes += len; + u64_stats_update_end(&tstats->syncp); +} + static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct sk_buff *skb, int group_addr, int next_frag_len) @@ -201,11 +211,11 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) struct ieee80211_if_managed *ifmgd; /* driver doesn't support power save */ - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) + if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return TX_CONTINUE; /* hardware does dynamic power save */ - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) return TX_CONTINUE; /* dynamic power save disabled */ @@ -418,7 +428,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) if (ieee80211_is_probe_req(hdr->frame_control)) return TX_CONTINUE; - if (tx->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL)) info->hw_queue = tx->sdata->vif.cab_queue; /* no stations in PS mode */ @@ -428,7 +438,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data 
*tx) info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; /* device releases frame after DTIM beacon */ - if (!(tx->local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING)) + if (!ieee80211_hw_check(&tx->local->hw, HOST_BROADCAST_PS_BUFFERING)) return TX_CONTINUE; /* buffered in mac80211 */ @@ -597,7 +607,6 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (tx->key) { bool skip_hw = false; - tx->key->tx_rx_count++; /* TODO: add threshold stuff again */ switch (tx->key->conf.cipher) { @@ -677,7 +686,8 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP || tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT || - tx->sdata->vif.type == NL80211_IFTYPE_ADHOC); + tx->sdata->vif.type == NL80211_IFTYPE_ADHOC || + tx->sdata->vif.type == NL80211_IFTYPE_OCB); /* set up RTS protection if desired */ if (len > tx->local->hw.wiphy->rts_threshold) { @@ -747,9 +757,9 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) if (txrc.reported_rate.idx < 0) { txrc.reported_rate = tx->rate; if (tx->sta && ieee80211_is_data(hdr->frame_control)) - tx->sta->last_tx_rate = txrc.reported_rate; + tx->sta->tx_stats.last_rate = txrc.reported_rate; } else if (tx->sta) - tx->sta->last_tx_rate = txrc.reported_rate; + tx->sta->tx_stats.last_rate = txrc.reported_rate; if (ratetbl) return TX_CONTINUE; @@ -814,7 +824,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number); tx->sdata->sequence_number += 0x10; if (tx->sta) - tx->sta->tx_msdu[IEEE80211_NUM_TIDS]++; + tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++; return TX_CONTINUE; } @@ -830,7 +840,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) qc = ieee80211_get_qos_ctl(hdr); tid = *qc & IEEE80211_QOS_CTL_TID_MASK; - tx->sta->tx_msdu[tid]++; + tx->sta->tx_stats.msdu[tid]++; if (!tx->sta->sta.txq[0]) hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); @@ -984,11 +994,10 @@ ieee80211_tx_h_stats(struct 
ieee80211_tx_data *tx) skb_queue_walk(&tx->skbs, skb) { ac = skb_get_queue_mapping(skb); - tx->sta->tx_fragments++; - tx->sta->tx_bytes[ac] += skb->len; + tx->sta->tx_stats.bytes[ac] += skb->len; } if (ac >= 0) - tx->sta->tx_packets[ac]++; + tx->sta->tx_stats.packets[ac]++; return TX_CONTINUE; } @@ -1105,7 +1114,9 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, queued = true; info->control.vif = &tx->sdata->vif; info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; - info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; + info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS | + IEEE80211_TX_CTL_NO_PS_BUFFER | + IEEE80211_TX_STATUS_EOSP; __skb_queue_tail(&tid_tx->pending, skb); if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER) purge_skb = __skb_dequeue(&tid_tx->pending); @@ -1173,8 +1184,8 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) && !ieee80211_is_qos_nullfunc(hdr->frame_control) && - (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) && - !(local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)) { + ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) && + !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) { struct tid_ampdu_tx *tid_tx; qc = ieee80211_get_qos_ctl(hdr); @@ -1207,8 +1218,10 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, if (!tx->sta) info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; - else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) + else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) { info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; + ieee80211_check_fast_xmit(tx->sta); + } info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT; @@ -1417,7 +1430,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, vif = &sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; - } else if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) { + } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { dev_kfree_skb(skb); return 
true; } else @@ -1463,7 +1476,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) CALL_TXH(ieee80211_tx_h_ps_buf); CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); - if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) { @@ -1478,7 +1491,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) /* handlers after fragment must be aware of tx info fragmentation! */ CALL_TXH(ieee80211_tx_h_stats); CALL_TXH(ieee80211_tx_h_encrypt); - if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_calculate_duration); #undef CALL_TXH @@ -1568,7 +1581,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, /* set up hw_queue value early */ if (!(info->flags & IEEE80211_TX_CTL_TX_OFFCHAN) || - !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) + !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; @@ -1595,9 +1608,9 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, } if (skb_cloned(skb) && - (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) || + (!ieee80211_hw_check(&local->hw, SUPPORTS_CLONED_SKBS) || !skb_clone_writable(skb, ETH_HLEN) || - sdata->crypto_tx_tailroom_needed_cnt)) + (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt))) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); @@ -2384,12 +2397,461 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, return ERR_PTR(ret); } +/* + * fast-xmit overview + * + * The core idea of this fast-xmit is to remove per-packet checks by checking + * them out of band. 
ieee80211_check_fast_xmit() implements the out-of-band + * checks that are needed to get the sta->fast_tx pointer assigned, after which + * much less work can be done per packet. For example, fragmentation must be + * disabled or the fast_tx pointer will not be set. All the conditions are seen + * in the code here. + * + * Once assigned, the fast_tx data structure also caches the per-packet 802.11 + * header and other data to aid packet processing in ieee80211_xmit_fast(). + * + * The most difficult part of this is that when any of these assumptions + * change, an external trigger (i.e. a call to ieee80211_clear_fast_xmit(), + * ieee80211_check_fast_xmit() or friends) is required to reset the data, + * since the per-packet code no longer checks the conditions. This is reflected + * by the calls to these functions throughout the rest of the code, and must be + * maintained if any of the TX path checks change. + */ + +void ieee80211_check_fast_xmit(struct sta_info *sta) +{ + struct ieee80211_fast_tx build = {}, *fast_tx = NULL, *old; + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_hdr *hdr = (void *)build.hdr; + struct ieee80211_chanctx_conf *chanctx_conf; + __le16 fc; + + if (!ieee80211_hw_check(&local->hw, SUPPORT_FAST_XMIT)) + return; + + /* Locking here protects both the pointer itself, and against concurrent + * invocations winning data access races to, e.g., the key pointer that + * is used. + * Without it, the invocation of this function right after the key + * pointer changes wouldn't be sufficient, as another CPU could access + * the pointer, then stall, and then do the cache update after the CPU + * that invalidated the key. + * With the locking, such scenarios cannot happen as the check for the + * key and the fast-tx assignment are done atomically, so the CPU that + * modifies the key will either wait or other one will see the key + * cleared/changed already. 
+ */ + spin_lock_bh(&sta->lock); + if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && + !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) && + sdata->vif.type == NL80211_IFTYPE_STATION) + goto out; + + if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + goto out; + + if (test_sta_flag(sta, WLAN_STA_PS_STA) || + test_sta_flag(sta, WLAN_STA_PS_DRIVER) || + test_sta_flag(sta, WLAN_STA_PS_DELIVER) || + test_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT)) + goto out; + + if (sdata->noack_map) + goto out; + + /* fast-xmit doesn't handle fragmentation at all */ + if (local->hw.wiphy->frag_threshold != (u32)-1 && + !local->ops->set_frag_threshold) + goto out; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (!chanctx_conf) { + rcu_read_unlock(); + goto out; + } + build.band = chanctx_conf->def.chan->band; + rcu_read_unlock(); + + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_ADHOC: + /* DA SA BSSID */ + build.da_offs = offsetof(struct ieee80211_hdr, addr1); + build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + memcpy(hdr->addr3, sdata->u.ibss.bssid, ETH_ALEN); + build.hdr_len = 24; + break; + case NL80211_IFTYPE_STATION: + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + /* DA SA BSSID */ + build.da_offs = offsetof(struct ieee80211_hdr, addr1); + build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + memcpy(hdr->addr3, sdata->u.mgd.bssid, ETH_ALEN); + build.hdr_len = 24; + break; + } + + if (sdata->u.mgd.use_4addr) { + /* non-regular ethertype cannot use the fastpath */ + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | + IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + build.da_offs = offsetof(struct ieee80211_hdr, addr3); + build.sa_offs = offsetof(struct ieee80211_hdr, addr4); + build.hdr_len = 30; + break; + } + fc |= cpu_to_le16(IEEE80211_FCTL_TODS); + /* BSSID SA DA */ + 
memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN); + build.da_offs = offsetof(struct ieee80211_hdr, addr3); + build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + build.hdr_len = 24; + break; + case NL80211_IFTYPE_AP_VLAN: + if (sdata->wdev.use_4addr) { + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | + IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + build.da_offs = offsetof(struct ieee80211_hdr, addr3); + build.sa_offs = offsetof(struct ieee80211_hdr, addr4); + build.hdr_len = 30; + break; + } + /* fall through */ + case NL80211_IFTYPE_AP: + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); + /* DA BSSID SA */ + build.da_offs = offsetof(struct ieee80211_hdr, addr1); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + build.sa_offs = offsetof(struct ieee80211_hdr, addr3); + build.hdr_len = 24; + break; + default: + /* not handled on fast-xmit */ + goto out; + } + + if (sta->sta.wme) { + build.hdr_len += 2; + fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); + } + + /* We store the key here so there's no point in using rcu_dereference() + * but that's fine because the code that changes the pointers will call + * this function after doing so. For a single CPU that would be enough, + * for multiple see the comment above. 
+ */ + build.key = rcu_access_pointer(sta->ptk[sta->ptk_idx]); + if (!build.key) + build.key = rcu_access_pointer(sdata->default_unicast_key); + if (build.key) { + bool gen_iv, iv_spc, mmic; + + gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV; + iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE; + mmic = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC; + + /* don't handle software crypto */ + if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + goto out; + + switch (build.key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + /* add fixed key ID */ + if (gen_iv) { + (build.hdr + build.hdr_len)[3] = + 0x20 | (build.key->conf.keyidx << 6); + build.pn_offs = build.hdr_len; + } + if (gen_iv || iv_spc) + build.hdr_len += IEEE80211_CCMP_HDR_LEN; + break; + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + /* add fixed key ID */ + if (gen_iv) { + (build.hdr + build.hdr_len)[3] = + 0x20 | (build.key->conf.keyidx << 6); + build.pn_offs = build.hdr_len; + } + if (gen_iv || iv_spc) + build.hdr_len += IEEE80211_GCMP_HDR_LEN; + break; + case WLAN_CIPHER_SUITE_TKIP: + /* cannot handle MMIC or IV generation in xmit-fast */ + if (mmic || gen_iv) + goto out; + if (iv_spc) + build.hdr_len += IEEE80211_TKIP_IV_LEN; + break; + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + /* cannot handle IV generation in fast-xmit */ + if (gen_iv) + goto out; + if (iv_spc) + build.hdr_len += IEEE80211_WEP_IV_LEN; + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: + WARN(1, + "management cipher suite 0x%x enabled for data\n", + build.key->conf.cipher); + goto out; + default: + /* we don't know how to generate IVs for this at all */ + if (WARN_ON(gen_iv)) + goto out; + /* pure hardware keys are OK, of course */ + if (!(build.key->flags & KEY_FLAG_CIPHER_SCHEME)) + break; + /* cipher 
scheme might require space allocation */ + if (iv_spc && + build.key->conf.iv_len > IEEE80211_FAST_XMIT_MAX_IV) + goto out; + if (iv_spc) + build.hdr_len += build.key->conf.iv_len; + } + + fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); + } + + hdr->frame_control = fc; + + memcpy(build.hdr + build.hdr_len, + rfc1042_header, sizeof(rfc1042_header)); + build.hdr_len += sizeof(rfc1042_header); + + fast_tx = kmemdup(&build, sizeof(build), GFP_ATOMIC); + /* if the kmemdup fails, continue w/o fast_tx */ + if (!fast_tx) + goto out; + + out: + /* we might have raced against another call to this function */ + old = rcu_dereference_protected(sta->fast_tx, + lockdep_is_held(&sta->lock)); + rcu_assign_pointer(sta->fast_tx, fast_tx); + if (old) + kfree_rcu(old, rcu_head); + spin_unlock_bh(&sta->lock); +} + +void ieee80211_check_fast_xmit_all(struct ieee80211_local *local) +{ + struct sta_info *sta; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &local->sta_list, list) + ieee80211_check_fast_xmit(sta); + rcu_read_unlock(); +} + +void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + rcu_read_lock(); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata && + (!sta->sdata->bss || sta->sdata->bss != sdata->bss)) + continue; + ieee80211_check_fast_xmit(sta); + } + + rcu_read_unlock(); +} + +void ieee80211_clear_fast_xmit(struct sta_info *sta) +{ + struct ieee80211_fast_tx *fast_tx; + + spin_lock_bh(&sta->lock); + fast_tx = rcu_dereference_protected(sta->fast_tx, + lockdep_is_held(&sta->lock)); + RCU_INIT_POINTER(sta->fast_tx, NULL); + spin_unlock_bh(&sta->lock); + + if (fast_tx) + kfree_rcu(fast_tx, rcu_head); +} + +static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, + struct net_device *dev, struct sta_info *sta, + struct ieee80211_fast_tx *fast_tx, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + u16 
ethertype = (skb->data[12] << 8) | skb->data[13]; + int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); + int hw_headroom = sdata->local->hw.extra_tx_headroom; + struct ethhdr eth; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; + struct ieee80211_tx_data tx; + ieee80211_tx_result r; + struct tid_ampdu_tx *tid_tx = NULL; + u8 tid = IEEE80211_NUM_TIDS; + + /* control port protocol needs a lot of special handling */ + if (cpu_to_be16(ethertype) == sdata->control_port_protocol) + return false; + + /* only RFC 1042 SNAP */ + if (ethertype < ETH_P_802_3_MIN) + return false; + + /* don't handle TX status request here either */ + if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + return false; + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); + if (tid_tx) { + if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) + return false; + if (tid_tx->timeout) + tid_tx->last_tx = jiffies; + } + } + + /* after this point (skb is modified) we cannot return false */ + + if (skb_shared(skb)) { + struct sk_buff *tmp_skb = skb; + + skb = skb_clone(skb, GFP_ATOMIC); + kfree_skb(tmp_skb); + + if (!skb) + return true; + } + + ieee80211_tx_stats(dev, skb->len + extra_head); + + /* will not be crypto-handled beyond what we do here, so use false + * as the may-encrypt argument for the resize to not account for + * more room than we already have in 'extra_head' + */ + if (unlikely(ieee80211_skb_resize(sdata, skb, + max_t(int, extra_head + hw_headroom - + skb_headroom(skb), 0), + false))) { + kfree_skb(skb); + return true; + } + + memcpy(&eth, skb->data, ETH_HLEN - 2); + hdr = (void *)skb_push(skb, extra_head); + memcpy(skb->data, fast_tx->hdr, fast_tx->hdr_len); + memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN); + memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN); + + 
memset(info, 0, sizeof(*info)); + info->band = fast_tx->band; + info->control.vif = &sdata->vif; + info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | + IEEE80211_TX_CTL_DONTFRAG | + (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + *ieee80211_get_qos_ctl(hdr) = tid; + if (!sta->sta.txq[0]) + hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); + } else { + info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; + hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); + sdata->sequence_number += 0x10; + } + + if (skb_shinfo(skb)->gso_size) + sta->tx_stats.msdu[tid] += + DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size); + else + sta->tx_stats.msdu[tid]++; + + info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; + + __skb_queue_head_init(&tx.skbs); + + tx.flags = IEEE80211_TX_UNICAST; + tx.local = local; + tx.sdata = sdata; + tx.sta = sta; + tx.key = fast_tx->key; + + if (fast_tx->key) + info->control.hw_key = &fast_tx->key->conf; + + if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { + tx.skb = skb; + r = ieee80211_tx_h_rate_ctrl(&tx); + skb = tx.skb; + tx.skb = NULL; + + if (r != TX_CONTINUE) { + if (r != TX_QUEUED) + kfree_skb(skb); + return true; + } + } + + /* statistics normally done by ieee80211_tx_h_stats (but that + * has to consider fragmentation, so is more complex) + */ + sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; + sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; + + if (fast_tx->pn_offs) { + u64 pn; + u8 *crypto_hdr = skb->data + fast_tx->pn_offs; + + switch (fast_tx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn); + crypto_hdr[0] = pn; + crypto_hdr[1] = pn >> 8; + crypto_hdr[4] = pn >> 16; + crypto_hdr[5] = pn >> 24; + crypto_hdr[6] = pn >> 32; + crypto_hdr[7] = pn >> 40; + break; + } + } + + if (sdata->vif.type == 
NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + + __skb_queue_tail(&tx.skbs, skb); + ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false); + return true; +} + void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; + struct sk_buff *next; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -2398,20 +2860,67 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, rcu_read_lock(); - if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { - kfree_skb(skb); - goto out; + if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) + goto out_free; + + if (!IS_ERR_OR_NULL(sta)) { + struct ieee80211_fast_tx *fast_tx; + + fast_tx = rcu_dereference(sta->fast_tx); + + if (fast_tx && + ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb)) + goto out; } - skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); - if (IS_ERR(skb)) - goto out; + if (skb_is_gso(skb)) { + struct sk_buff *segs; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - dev->trans_start = jiffies; + segs = skb_gso_segment(skb, 0); + if (IS_ERR(segs)) { + goto out_free; + } else if (segs) { + consume_skb(skb); + skb = segs; + } + } else { + /* we cannot process non-linear frames on this path */ + if (skb_linearize(skb)) { + kfree_skb(skb); + goto out; + } + + /* the frame could be fragmented, software-encrypted, and other + * things so we cannot really handle checksum offload with it - + * fix it up in software before we handle anything else. 
+ */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (skb_checksum_help(skb)) + goto out_free; + } + } + + next = skb; + while (next) { + skb = next; + next = skb->next; + + skb->prev = NULL; + skb->next = NULL; - ieee80211_xmit(sdata, sta, skb); + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); + if (IS_ERR(skb)) + goto out; + + ieee80211_tx_stats(dev, skb->len); + + ieee80211_xmit(sdata, sta, skb); + } + goto out; + out_free: + kfree_skb(skb); out: rcu_read_unlock(); } @@ -2709,6 +3218,16 @@ static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); } +static u8 __ieee80211_csa_update_counter(struct beacon_data *beacon) +{ + beacon->csa_current_counter--; + + /* the counter should never reach 0 */ + WARN_ON_ONCE(!beacon->csa_current_counter); + + return beacon->csa_current_counter; +} + u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); @@ -2727,11 +3246,7 @@ u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif) if (!beacon) goto unlock; - beacon->csa_current_counter--; - - /* the counter should never reach 0 */ - WARN_ON_ONCE(!beacon->csa_current_counter); - count = beacon->csa_current_counter; + count = __ieee80211_csa_update_counter(beacon); unlock: rcu_read_unlock(); @@ -2831,7 +3346,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (beacon) { if (beacon->csa_counter_offsets[0]) { if (!is_template) - ieee80211_csa_update_counter(vif); + __ieee80211_csa_update_counter(beacon); ieee80211_set_csa(sdata, beacon); } @@ -2877,7 +3392,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (beacon->csa_counter_offsets[0]) { if (!is_template) - ieee80211_csa_update_counter(vif); + __ieee80211_csa_update_counter(beacon); ieee80211_set_csa(sdata, beacon); } @@ -2907,7 +3422,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, * for now we leave it consistent with overall * mac80211's 
behavior. */ - ieee80211_csa_update_counter(vif); + __ieee80211_csa_update_counter(beacon); ieee80211_set_csa(sdata, beacon); } @@ -3001,6 +3516,12 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, { struct ieee80211_mutable_offsets offs = {}; struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false); + struct sk_buff *copy; + struct ieee80211_supported_band *sband; + int shift; + + if (!bcn) + return bcn; if (tim_offset) *tim_offset = offs.tim_offset; @@ -3008,6 +3529,19 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, if (tim_length) *tim_length = offs.tim_length; + if (ieee80211_hw_check(hw, BEACON_TX_STATUS) || + !hw_to_local(hw)->monitors) + return bcn; + + /* send a copy to monitor interfaces */ + copy = skb_copy(bcn, GFP_ATOMIC); + if (!copy) + return bcn; + + shift = ieee80211_vif_get_shift(vif); + sband = hw->wiphy->bands[ieee80211_get_sdata_band(vif_to_sdata(vif))]; + ieee80211_tx_monitor(hw_to_local(hw), copy, sband, 1, shift, false); + return bcn; } EXPORT_SYMBOL(ieee80211_beacon_get_tim); @@ -3305,7 +3839,7 @@ int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid) synchronize_net(); /* Tear down BA sessions so we stop aggregating on this TID */ - if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_LOCAL_REQUEST); @@ -3319,7 +3853,7 @@ int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid) ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID); - if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) + if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ret = 0; diff --git a/kernel/net/mac80211/util.c b/kernel/net/mac80211/util.c index b864ebc6a..33344f5a6 100644 --- a/kernel/net/mac80211/util.c +++ b/kernel/net/mac80211/util.c @@ -4,6 +4,7 @@ * Copyright 2006-2007 Jiri 
Benc * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -47,55 +48,6 @@ struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) } EXPORT_SYMBOL(wiphy_to_ieee80211_hw); -u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, - enum nl80211_iftype type) -{ - __le16 fc = hdr->frame_control; - - /* drop ACK/CTS frames and incorrect hdr len (ctrl) */ - if (len < 16) - return NULL; - - if (ieee80211_is_data(fc)) { - if (len < 24) /* drop incorrect hdr len (data) */ - return NULL; - - if (ieee80211_has_a4(fc)) - return NULL; - if (ieee80211_has_tods(fc)) - return hdr->addr1; - if (ieee80211_has_fromds(fc)) - return hdr->addr2; - - return hdr->addr3; - } - - if (ieee80211_is_mgmt(fc)) { - if (len < 24) /* drop incorrect hdr len (mgmt) */ - return NULL; - return hdr->addr3; - } - - if (ieee80211_is_ctl(fc)) { - if (ieee80211_is_pspoll(fc)) - return hdr->addr1; - - if (ieee80211_is_back_req(fc)) { - switch (type) { - case NL80211_IFTYPE_STATION: - return hdr->addr2; - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - return hdr->addr1; - default: - break; /* fall through to the return */ - } - } - } - - return NULL; -} - void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { struct sk_buff *skb; @@ -564,7 +516,7 @@ ieee80211_get_vif_queues(struct ieee80211_local *local, { unsigned int queues; - if (sdata && local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) { + if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { int ac; queues = 0; @@ -592,7 +544,7 @@ void __ieee80211_flush_queues(struct ieee80211_local *local, * If no queue was set, or if the HW doesn't support * IEEE80211_HW_QUEUE_CONTROL - flush all queues */ - if (!queues || !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) + if (!queues || 
!ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) queues = ieee80211_get_vif_queues(local, sdata); ieee80211_stop_queues_by_reason(&local->hw, queues, @@ -752,7 +704,12 @@ EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) { - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_sub_if_data *sdata; + + if (!vif) + return NULL; + + sdata = vif_to_sdata(vif); if (!ieee80211_sdata_running(sdata) || !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) @@ -1148,13 +1105,13 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, } void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, - bool bss_notify) + bool bss_notify, bool enable_qos) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_queue_params qparam; struct ieee80211_chanctx_conf *chanctx_conf; int ac; - bool use_11b, enable_qos; + bool use_11b; bool is_ocb; /* Use another EDCA parameters if dot11OCBActivated=true */ int aCWmin, aCWmax; @@ -1173,13 +1130,6 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE); rcu_read_unlock(); - /* - * By default disable QoS in STA mode for old access points, which do - * not support 802.11e. New APs will provide proper queue parameters, - * that we will configure later. - */ - enable_qos = (sdata->vif.type != NL80211_IFTYPE_STATION); - is_ocb = (sdata->vif.type == NL80211_IFTYPE_OCB); /* Set defaults according to 802.11-2007 Table 7-37 */ @@ -1691,6 +1641,29 @@ void ieee80211_stop_device(struct ieee80211_local *local) drv_stop(local); } +static void ieee80211_flush_completed_scan(struct ieee80211_local *local, + bool aborted) +{ + /* It's possible that we don't handle the scan completion in + * time during suspend, so if it's still marked as completed + * here, queue the work and flush it to clean things up. 
+ * Instead of calling the worker function directly here, we + * really queue it to avoid potential races with other flows + * scheduling the same work. + */ + if (test_bit(SCAN_COMPLETED, &local->scanning)) { + /* If coming from reconfiguration failure, abort the scan so + * we don't attempt to continue a partial HW scan - which is + * possible otherwise if (e.g.) the 2.4 GHz portion was the + * completed scan, and a 5 GHz portion is still pending. + */ + if (aborted) + set_bit(SCAN_ABORTED, &local->scanning); + ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); + flush_delayed_work(&local->scan_work); + } +} + static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; @@ -1708,7 +1681,9 @@ static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) local->resuming = false; local->suspended = false; - local->started = false; + local->in_reconfig = false; + + ieee80211_flush_completed_scan(local, true); /* scheduled scan clearly can't be running any more, but tell * cfg80211 and clear local state @@ -1748,6 +1723,27 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local, mutex_unlock(&local->chanctx_mtx); } +static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + /* add STAs back */ + mutex_lock(&local->sta_mtx); + list_for_each_entry(sta, &local->sta_list, list) { + enum ieee80211_sta_state state; + + if (!sta->uploaded || sta->sdata != sdata) + continue; + + for (state = IEEE80211_STA_NOTEXIST; + state < sta->sta_state; state++) + WARN_ON(drv_sta_state(local, sta->sdata, sta, state, + state + 1)); + } + mutex_unlock(&local->sta_mtx); +} + int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; @@ -1759,16 +1755,24 @@ int ieee80211_reconfig(struct ieee80211_local *local) struct ieee80211_sub_if_data *sched_scan_sdata; struct 
cfg80211_sched_scan_request *sched_scan_req; bool sched_scan_stopped = false; + bool suspended = local->suspended; /* nothing to do if HW shouldn't run */ if (!local->open_count) goto wake_up; #ifdef CONFIG_PM - if (local->suspended) + if (suspended) local->resuming = true; if (local->wowlan) { + /* + * In the wowlan case, both mac80211 and the device + * are functional when the resume op is called, so + * clear local->suspended so the device could operate + * normally (e.g. pass rx frames). + */ + local->suspended = false; res = drv_resume(local); local->wowlan = false; if (res < 0) { @@ -1781,11 +1785,25 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* * res is 1, which means the driver requested * to go through a regular reset on wakeup. + * restore local->suspended in this case. */ reconfig_due_to_wowlan = true; + local->suspended = true; } #endif + /* + * In case of hw_restart during suspend (without wowlan), + * cancel restart work, as we are reconfiguring the device + * anyway. + * Note that restart_work is scheduled on a frozen workqueue, + * so we can't deadlock in this case. + */ + if (suspended && local->in_reconfig && !reconfig_due_to_wowlan) + cancel_work_sync(&local->restart_work); + + local->started = false; + /* * Upon resume hardware can sometimes be goofy due to * various platform / driver / bus issues, so restarting @@ -1794,7 +1812,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) */ res = drv_start(local); if (res) { - if (local->suspended) + if (suspended) WARN(1, "Hardware became unavailable upon resume. 
This could be a software issue prior to suspend or a hardware issue.\n"); else WARN(1, "Hardware became unavailable during restart.\n"); @@ -1861,50 +1879,11 @@ int ieee80211_reconfig(struct ieee80211_local *local) WARN_ON(drv_add_chanctx(local, ctx)); mutex_unlock(&local->chanctx_mtx); - list_for_each_entry(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) - continue; - ieee80211_assign_chanctx(local, sdata); - } - sdata = rtnl_dereference(local->monitor_sdata); if (sdata && ieee80211_sdata_running(sdata)) ieee80211_assign_chanctx(local, sdata); } - /* add STAs back */ - mutex_lock(&local->sta_mtx); - list_for_each_entry(sta, &local->sta_list, list) { - enum ieee80211_sta_state state; - - if (!sta->uploaded) - continue; - - /* AP-mode stations will be added later */ - if (sta->sdata->vif.type == NL80211_IFTYPE_AP) - continue; - - for (state = IEEE80211_STA_NOTEXIST; - state < sta->sta_state; state++) - WARN_ON(drv_sta_state(local, sta->sdata, sta, state, - state + 1)); - } - mutex_unlock(&local->sta_mtx); - - /* reconfigure tx conf */ - if (hw->queues >= IEEE80211_NUM_ACS) { - list_for_each_entry(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sdata->vif.type == NL80211_IFTYPE_MONITOR || - !ieee80211_sdata_running(sdata)) - continue; - - for (i = 0; i < IEEE80211_NUM_ACS; i++) - drv_conf_tx(local, sdata, i, - &sdata->tx_conf[i]); - } - } - /* reconfigure hardware */ ieee80211_hw_config(local, ~0); @@ -1917,6 +1896,22 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (!ieee80211_sdata_running(sdata)) continue; + ieee80211_assign_chanctx(local, sdata); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_MONITOR: + break; + default: + ieee80211_reconfig_stations(sdata); + /* fall through */ + case NL80211_IFTYPE_AP: /* AP stations are handled later */ + for (i = 0; i < IEEE80211_NUM_ACS; i++) + drv_conf_tx(local, sdata, i, + &sdata->tx_conf[i]); + break; + } + /* 
common change flags for all interface types */ changed = BSS_CHANGED_ERP_CTS_PROT | BSS_CHANGED_ERP_PREAMBLE | @@ -1984,7 +1979,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) } } - ieee80211_recalc_ps(local, -1); + ieee80211_recalc_ps(local); /* * The sta might be in psm against the ap (e.g. because @@ -1999,7 +1994,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (!sdata->u.mgd.associated) continue; - ieee80211_send_nullfunc(local, sdata, 0); + ieee80211_send_nullfunc(local, sdata, false); } } @@ -2029,6 +2024,29 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (ieee80211_sdata_running(sdata)) ieee80211_enable_keys(sdata); + /* Reconfigure sched scan if it was interrupted by FW restart */ + mutex_lock(&local->mtx); + sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, + lockdep_is_held(&local->mtx)); + sched_scan_req = rcu_dereference_protected(local->sched_scan_req, + lockdep_is_held(&local->mtx)); + if (sched_scan_sdata && sched_scan_req) + /* + * Sched scan stopped, but we don't want to report it. Instead, + * we're trying to reschedule. However, if more than one scan + * plan was set, we cannot reschedule since we don't know which + * scan plan was currently running (and some scan plans may have + * already finished). + */ + if (sched_scan_req->n_scan_plans > 1 || + __ieee80211_request_sched_scan_start(sched_scan_sdata, + sched_scan_req)) + sched_scan_stopped = true; + mutex_unlock(&local->mtx); + + if (sched_scan_stopped) + cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy); + wake_up: local->in_reconfig = false; barrier(); @@ -2046,12 +2064,13 @@ int ieee80211_reconfig(struct ieee80211_local *local) * about the sessions, but we and the AP still think they * are active. This is really a workaround though. 
*/ - if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { - ieee80211_sta_tear_down_BA_sessions( - sta, AGG_STOP_LOCAL_REQUEST); + if (!local->resuming) + ieee80211_sta_tear_down_BA_sessions( + sta, AGG_STOP_LOCAL_REQUEST); clear_sta_flag(sta, WLAN_STA_BLOCK_BA); } @@ -2062,36 +2081,14 @@ int ieee80211_reconfig(struct ieee80211_local *local) IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); - /* - * Reconfigure sched scan if it was interrupted by FW restart or - * suspend. - */ - mutex_lock(&local->mtx); - sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, - lockdep_is_held(&local->mtx)); - sched_scan_req = rcu_dereference_protected(local->sched_scan_req, - lockdep_is_held(&local->mtx)); - if (sched_scan_sdata && sched_scan_req) - /* - * Sched scan stopped, but we don't want to report it. Instead, - * we're trying to reschedule. - */ - if (__ieee80211_request_sched_scan_start(sched_scan_sdata, - sched_scan_req)) - sched_scan_stopped = true; - mutex_unlock(&local->mtx); - - if (sched_scan_stopped) - cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy); - /* * If this is for hw restart things are still running. * We may want to change that later, however. */ - if (local->open_count && (!local->suspended || reconfig_due_to_wowlan)) + if (local->open_count && (!suspended || reconfig_due_to_wowlan)) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART); - if (!local->suspended) + if (!suspended) return 0; #ifdef CONFIG_PM @@ -2100,17 +2097,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) mb(); local->resuming = false; - /* It's possible that we don't handle the scan completion in - * time during suspend, so if it's still marked as completed - * here, queue the work and flush it to clean things up. 
- * Instead of calling the worker function directly here, we - * really queue it to avoid potential races with other flows - * scheduling the same work. - */ - if (test_bit(SCAN_COMPLETED, &local->scanning)) { - ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); - flush_delayed_work(&local->scan_work); - } + ieee80211_flush_completed_scan(local, false); if (local->open_count && !reconfig_due_to_wowlan) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND); @@ -2168,7 +2155,13 @@ void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata) chanctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); - if (WARN_ON_ONCE(!chanctx_conf)) + /* + * This function can be called from a work, thus it may be possible + * that the chanctx_conf is removed (due to a disconnection, for + * example). + * So nothing should be done in such case. + */ + if (!chanctx_conf) goto unlock; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); @@ -2305,7 +2298,7 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, - u16 prot_mode) + u16 prot_mode, bool rifs_mode) { struct ieee80211_ht_operation *ht_oper; /* Build HT Information */ @@ -2333,6 +2326,9 @@ u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, chandef->width != NL80211_CHAN_WIDTH_20) ht_oper->ht_param |= IEEE80211_HT_PARAM_CHAN_WIDTH_ANY; + if (rifs_mode) + ht_oper->ht_param |= IEEE80211_HT_PARAM_RIFS_MODE; + ht_oper->operation_mode = cpu_to_le16(prot_mode); ht_oper->stbc_param = 0x0000; @@ -2357,6 +2353,8 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, if (chandef->center_freq2) vht_oper->center_freq_seg2_idx = ieee80211_frequency_to_channel(chandef->center_freq2); + else + vht_oper->center_freq_seg2_idx = 0x00; switch (chandef->width) { case 
NL80211_CHAN_WIDTH_160: @@ -2574,7 +2572,7 @@ int ieee80211_ave_rssi(struct ieee80211_vif *vif) /* non-managed type inferfaces */ return 0; } - return ifmgd->ave_beacon_signal / 16; + return -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal); } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); @@ -2984,6 +2982,13 @@ ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i) if (end > 0) return false; + /* One shot NOA */ + if (data->count[i] == 1) + return false; + + if (data->desc[i].interval == 0) + return false; + /* End time is in the past, check for repetitions */ skip = DIV_ROUND_UP(-end, data->desc[i].interval); if (data->count[i] < 255) { @@ -3331,9 +3336,11 @@ void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata, if (sta) { txqi->txq.sta = &sta->sta; sta->sta.txq[tid] = &txqi->txq; + txqi->txq.tid = tid; txqi->txq.ac = ieee802_1d_to_ac[tid & 7]; } else { sdata->vif.txq = &txqi->txq; + txqi->txq.tid = 0; txqi->txq.ac = IEEE80211_AC_BE; } } diff --git a/kernel/net/mac80211/vht.c b/kernel/net/mac80211/vht.c index 80694d55d..c38b2f07a 100644 --- a/kernel/net/mac80211/vht.c +++ b/kernel/net/mac80211/vht.c @@ -120,6 +120,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; + bool have_80mhz; memset(vht_cap, 0, sizeof(*vht_cap)); @@ -129,6 +130,20 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if (!have_80mhz) + return; + /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. 
Netgear R6300v2 @@ -308,11 +323,15 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; enum ieee80211_sta_rx_bandwidth bw; + enum nl80211_chan_width bss_width = sdata->vif.bss_conf.chandef.width; - bw = ieee80211_chan_width_to_rx_bw(sdata->vif.bss_conf.chandef.width); - bw = min(bw, ieee80211_sta_cap_rx_bw(sta)); + bw = ieee80211_sta_cap_rx_bw(sta); bw = min(bw, sta->cur_max_bandwidth); + /* do not cap the BW of TDLS WIDER_BW peers by the bss */ + if (!test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) + bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); + return bw; } @@ -359,7 +378,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta) u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band, bool nss_only) + enum ieee80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; @@ -382,9 +401,6 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, changed |= IEEE80211_RC_NSS_CHANGED; } - if (nss_only) - return changed; - switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; @@ -411,14 +427,39 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band, bool nss_only) + enum ieee80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; - u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, - band, nss_only); + u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band); if (changed > 0) rate_control_rate_update(local, sband, sta, changed); } + +void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, + u16 
vht_mask[NL80211_VHT_NSS_MAX]) +{ + int i; + u16 mask, cap = le16_to_cpu(vht_cap); + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { + mask = (cap >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + switch (mask) { + case IEEE80211_VHT_MCS_SUPPORT_0_7: + vht_mask[i] = 0x00FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_8: + vht_mask[i] = 0x01FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_9: + vht_mask[i] = 0x03FF; + break; + case IEEE80211_VHT_MCS_NOT_SUPPORTED: + default: + vht_mask[i] = 0; + break; + } + } +} diff --git a/kernel/net/mac80211/wpa.c b/kernel/net/mac80211/wpa.c index 9d63d93c8..d824c3897 100644 --- a/kernel/net/mac80211/wpa.c +++ b/kernel/net/mac80211/wpa.c @@ -174,9 +174,12 @@ mic_fail_no_key: * a driver that supports HW encryption. Send up the key idx only if * the key is set. */ - mac80211_ev_michael_mic_failure(rx->sdata, - rx->key ? rx->key->conf.keyidx : -1, - (void *) skb->data, NULL, GFP_ATOMIC); + cfg80211_michael_mic_failure(rx->sdata->dev, hdr->addr2, + is_multicast_ether_addr(hdr->addr1) ? + NL80211_KEYTYPE_GROUP : + NL80211_KEYTYPE_PAIRWISE, + rx->key ? 
rx->key->conf.keyidx : -1, + NULL, GFP_ATOMIC); return RX_DROP_UNUSABLE; } @@ -444,7 +447,7 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb, hdr = (struct ieee80211_hdr *) pos; pos += hdrlen; - pn64 = atomic64_inc_return(&key->u.ccmp.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; @@ -516,31 +519,34 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, return RX_DROP_UNUSABLE; } - ccmp_hdr2pn(pn, skb->data + hdrlen); + if (!(status->flag & RX_FLAG_PN_VALIDATED)) { + ccmp_hdr2pn(pn, skb->data + hdrlen); - queue = rx->security_idx; + queue = rx->security_idx; - if (memcmp(pn, key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN) <= 0) { - key->u.ccmp.replays++; - return RX_DROP_UNUSABLE; - } + if (memcmp(pn, key->u.ccmp.rx_pn[queue], + IEEE80211_CCMP_PN_LEN) <= 0) { + key->u.ccmp.replays++; + return RX_DROP_UNUSABLE; + } - if (!(status->flag & RX_FLAG_DECRYPTED)) { - u8 aad[2 * AES_BLOCK_SIZE]; - u8 b_0[AES_BLOCK_SIZE]; - /* hardware didn't decrypt/verify MIC */ - ccmp_special_blocks(skb, pn, b_0, aad); + if (!(status->flag & RX_FLAG_DECRYPTED)) { + u8 aad[2 * AES_BLOCK_SIZE]; + u8 b_0[AES_BLOCK_SIZE]; + /* hardware didn't decrypt/verify MIC */ + ccmp_special_blocks(skb, pn, b_0, aad); + + if (ieee80211_aes_ccm_decrypt( + key->u.ccmp.tfm, b_0, aad, + skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN, + data_len, + skb->data + skb->len - mic_len, mic_len)) + return RX_DROP_UNUSABLE; + } - if (ieee80211_aes_ccm_decrypt( - key->u.ccmp.tfm, b_0, aad, - skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN, - data_len, - skb->data + skb->len - mic_len, mic_len)) - return RX_DROP_UNUSABLE; + memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN); } - memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN); - /* Remove CCMP header and MIC */ if (pskb_trim(skb, skb->len - mic_len)) return RX_DROP_UNUSABLE; @@ -670,7 +676,7 @@ static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) 
hdr = (struct ieee80211_hdr *)pos; pos += hdrlen; - pn64 = atomic64_inc_return(&key->u.gcmp.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; @@ -739,31 +745,35 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; } - gcmp_hdr2pn(pn, skb->data + hdrlen); + if (!(status->flag & RX_FLAG_PN_VALIDATED)) { + gcmp_hdr2pn(pn, skb->data + hdrlen); - queue = rx->security_idx; + queue = rx->security_idx; - if (memcmp(pn, key->u.gcmp.rx_pn[queue], IEEE80211_GCMP_PN_LEN) <= 0) { - key->u.gcmp.replays++; - return RX_DROP_UNUSABLE; - } + if (memcmp(pn, key->u.gcmp.rx_pn[queue], + IEEE80211_GCMP_PN_LEN) <= 0) { + key->u.gcmp.replays++; + return RX_DROP_UNUSABLE; + } - if (!(status->flag & RX_FLAG_DECRYPTED)) { - u8 aad[2 * AES_BLOCK_SIZE]; - u8 j_0[AES_BLOCK_SIZE]; - /* hardware didn't decrypt/verify MIC */ - gcmp_special_blocks(skb, pn, j_0, aad); + if (!(status->flag & RX_FLAG_DECRYPTED)) { + u8 aad[2 * AES_BLOCK_SIZE]; + u8 j_0[AES_BLOCK_SIZE]; + /* hardware didn't decrypt/verify MIC */ + gcmp_special_blocks(skb, pn, j_0, aad); + + if (ieee80211_aes_gcm_decrypt( + key->u.gcmp.tfm, j_0, aad, + skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN, + data_len, + skb->data + skb->len - + IEEE80211_GCMP_MIC_LEN)) + return RX_DROP_UNUSABLE; + } - if (ieee80211_aes_gcm_decrypt( - key->u.gcmp.tfm, j_0, aad, - skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN, - data_len, - skb->data + skb->len - IEEE80211_GCMP_MIC_LEN)) - return RX_DROP_UNUSABLE; + memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN); } - memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN); - /* Remove GCMP header and MIC */ if (pskb_trim(skb, skb->len - IEEE80211_GCMP_MIC_LEN)) return RX_DROP_UNUSABLE; @@ -940,7 +950,7 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn); + pn64 = 
atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); @@ -984,7 +994,7 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); @@ -1129,7 +1139,7 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx) mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->u.aes_gmac.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); diff --git a/kernel/net/mac802154/Kconfig b/kernel/net/mac802154/Kconfig index aa462b480..fb45287eb 100644 --- a/kernel/net/mac802154/Kconfig +++ b/kernel/net/mac802154/Kconfig @@ -2,6 +2,7 @@ config MAC802154 tristate "Generic IEEE 802.15.4 Soft Networking Stack (mac802154)" depends on IEEE802154 select CRC_CCITT + select CRYPTO select CRYPTO_AUTHENC select CRYPTO_CCM select CRYPTO_CTR diff --git a/kernel/net/mac802154/Makefile b/kernel/net/mac802154/Makefile index 702d8b466..17a51e838 100644 --- a/kernel/net/mac802154/Makefile +++ b/kernel/net/mac802154/Makefile @@ -1,5 +1,7 @@ obj-$(CONFIG_MAC802154) += mac802154.o mac802154-objs := main.o rx.o tx.o mac_cmd.o mib.o \ - iface.o llsec.o util.o cfg.o + iface.o llsec.o util.o cfg.o trace.o + +CFLAGS_trace.o := -I$(src) ccflags-y += -D__CHECK_ENDIAN__ diff --git a/kernel/net/mac802154/cfg.c b/kernel/net/mac802154/cfg.c index 70be9c799..57b5e9447 100644 --- a/kernel/net/mac802154/cfg.c +++ b/kernel/net/mac802154/cfg.c @@ -44,6 +44,49 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy, ieee802154_if_remove(sdata); } +#ifdef CONFIG_PM +static int ieee802154_suspend(struct wpan_phy *wpan_phy) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + + if (!local->open_count) + goto suspend; + + ieee802154_stop_queue(&local->hw); 
+ synchronize_net(); + + /* stop hardware - this must stop RX */ + ieee802154_stop_device(local); + +suspend: + local->suspended = true; + return 0; +} + +static int ieee802154_resume(struct wpan_phy *wpan_phy) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + int ret; + + /* nothing to do if HW shouldn't run */ + if (!local->open_count) + goto wake_up; + + /* restart hardware */ + ret = drv_start(local); + if (ret) + return ret; + +wake_up: + ieee802154_wake_queue(&local->hw); + local->suspended = false; + return 0; +} +#else +#define ieee802154_suspend NULL +#define ieee802154_resume NULL +#endif + static int ieee802154_add_iface(struct wpan_phy *phy, const char *name, unsigned char name_assign_type, @@ -73,9 +116,9 @@ ieee802154_set_channel(struct wpan_phy *wpan_phy, u8 page, u8 channel) ASSERT_RTNL(); - /* check if phy support this setting */ - if (!(wpan_phy->channels_supported[page] & BIT(channel))) - return -EINVAL; + if (wpan_phy->current_page == page && + wpan_phy->current_channel == channel) + return 0; ret = drv_set_channel(local, page, channel); if (!ret) { @@ -95,9 +138,8 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy, ASSERT_RTNL(); - /* check if phy support this setting */ - if (!(local->hw.flags & IEEE802154_HW_CCA_MODE)) - return -EOPNOTSUPP; + if (wpan_phy_cca_cmp(&wpan_phy->cca, cca)) + return 0; ret = drv_set_cca_mode(local, cca); if (!ret) @@ -106,24 +148,58 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy, return ret; } +static int +ieee802154_set_cca_ed_level(struct wpan_phy *wpan_phy, s32 ed_level) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + int ret; + + ASSERT_RTNL(); + + if (wpan_phy->cca_ed_level == ed_level) + return 0; + + ret = drv_set_cca_ed_level(local, ed_level); + if (!ret) + wpan_phy->cca_ed_level = ed_level; + + return ret; +} + +static int +ieee802154_set_tx_power(struct wpan_phy *wpan_phy, s32 power) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + int ret; + + 
ASSERT_RTNL(); + + if (wpan_phy->transmit_power == power) + return 0; + + ret = drv_set_tx_power(local, power); + if (!ret) + wpan_phy->transmit_power = power; + + return ret; +} + static int ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 pan_id) { + int ret; + ASSERT_RTNL(); - /* TODO - * I am not sure about to check here on broadcast pan_id. - * Broadcast is a valid setting, comment from 802.15.4: - * If this value is 0xffff, the device is not associated. - * - * This could useful to simple deassociate an device. - */ - if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) - return -EINVAL; - - wpan_dev->pan_id = pan_id; - return 0; + if (wpan_dev->pan_id == pan_id) + return 0; + + ret = mac802154_wpan_update_llsec(wpan_dev->netdev); + if (!ret) + wpan_dev->pan_id = pan_id; + + return ret; } static int @@ -131,13 +207,8 @@ ieee802154_set_backoff_exponent(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); - if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS)) - return -EOPNOTSUPP; - wpan_dev->min_be = min_be; wpan_dev->max_be = max_be; return 0; @@ -149,21 +220,6 @@ ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, { ASSERT_RTNL(); - /* TODO - * I am not sure about to check here on broadcast short_addr. - * Broadcast is a valid setting, comment from 802.15.4: - * A value of 0xfffe indicates that the device has - * associated but has not been allocated an address. A - * value of 0xffff indicates that the device does not - * have a short address. - * - * I think we should allow to set these settings but - * don't allow to allow socket communication with it. 
- */ - if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) || - short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST)) - return -EINVAL; - wpan_dev->short_addr = short_addr; return 0; } @@ -173,13 +229,8 @@ ieee802154_set_max_csma_backoffs(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 max_csma_backoffs) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); - if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS)) - return -EOPNOTSUPP; - wpan_dev->csma_retries = max_csma_backoffs; return 0; } @@ -189,13 +240,8 @@ ieee802154_set_max_frame_retries(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, s8 max_frame_retries) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); - if (!(local->hw.flags & IEEE802154_HW_FRAME_RETRIES)) - return -EOPNOTSUPP; - wpan_dev->frame_retries = max_frame_retries; return 0; } @@ -204,28 +250,243 @@ static int ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool mode) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); - if (!(local->hw.flags & IEEE802154_HW_LBT)) - return -EOPNOTSUPP; - wpan_dev->lbt = mode; return 0; } +static int +ieee802154_set_ackreq_default(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, bool ackreq) +{ + ASSERT_RTNL(); + + wpan_dev->ackreq = ackreq; + return 0; +} + +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +static void +ieee802154_get_llsec_table(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_table **table) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + + *table = &sdata->sec.table; +} + +static void +ieee802154_lock_llsec_table(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + + mutex_lock(&sdata->sec_mtx); +} + +static void 
+ieee802154_unlock_llsec_table(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + + mutex_unlock(&sdata->sec_mtx); +} + +static int +ieee802154_set_llsec_params(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_params *params, + int changed) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_set_params(&sdata->sec, params, changed); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_get_llsec_params(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_params *params) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_get_params(&sdata->sec, params); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_key_add(&sdata->sec, id, key); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_key_del(&sdata->sec, id); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int 
+ieee802154_add_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_seclevel_add(&sdata->sec, sl); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_seclevel_del(&sdata->sec, sl); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_device *dev_desc) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_dev_add(&sdata->sec, dev_desc); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le64 extended_addr) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_dev_del(&sdata->sec, extended_addr); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le64 extended_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_devkey_add(&sdata->sec, 
extended_addr, key); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le64 extended_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_devkey_del(&sdata->sec, extended_addr, key); + mutex_unlock(&sdata->sec_mtx); + + return res; +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + const struct cfg802154_ops mac802154_config_ops = { .add_virtual_intf_deprecated = ieee802154_add_iface_deprecated, .del_virtual_intf_deprecated = ieee802154_del_iface_deprecated, + .suspend = ieee802154_suspend, + .resume = ieee802154_resume, .add_virtual_intf = ieee802154_add_iface, .del_virtual_intf = ieee802154_del_iface, .set_channel = ieee802154_set_channel, .set_cca_mode = ieee802154_set_cca_mode, + .set_cca_ed_level = ieee802154_set_cca_ed_level, + .set_tx_power = ieee802154_set_tx_power, .set_pan_id = ieee802154_set_pan_id, .set_short_addr = ieee802154_set_short_addr, .set_backoff_exponent = ieee802154_set_backoff_exponent, .set_max_csma_backoffs = ieee802154_set_max_csma_backoffs, .set_max_frame_retries = ieee802154_set_max_frame_retries, .set_lbt_mode = ieee802154_set_lbt_mode, + .set_ackreq_default = ieee802154_set_ackreq_default, +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + .get_llsec_table = ieee802154_get_llsec_table, + .lock_llsec_table = ieee802154_lock_llsec_table, + .unlock_llsec_table = ieee802154_unlock_llsec_table, + /* TODO above */ + .set_llsec_params = ieee802154_set_llsec_params, + .get_llsec_params = ieee802154_get_llsec_params, + .add_llsec_key = ieee802154_add_llsec_key, + .del_llsec_key = ieee802154_del_llsec_key, + .add_seclevel = ieee802154_add_seclevel, + .del_seclevel = ieee802154_del_seclevel, + .add_device = ieee802154_add_device, + .del_device = 
ieee802154_del_device, + .add_devkey = ieee802154_add_devkey, + .del_devkey = ieee802154_del_devkey, +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; diff --git a/kernel/net/mac802154/driver-ops.h b/kernel/net/mac802154/driver-ops.h index a0533357b..0550f3365 100644 --- a/kernel/net/mac802154/driver-ops.h +++ b/kernel/net/mac802154/driver-ops.h @@ -7,6 +7,7 @@ #include #include "ieee802154_i.h" +#include "trace.h" static inline int drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb) @@ -27,19 +28,25 @@ drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb) static inline int drv_start(struct ieee802154_local *local) { + int ret; + might_sleep(); + trace_802154_drv_start(local); local->started = true; smp_mb(); - - return local->ops->start(&local->hw); + ret = local->ops->start(&local->hw); + trace_802154_drv_return_int(local, ret); + return ret; } static inline void drv_stop(struct ieee802154_local *local) { might_sleep(); + trace_802154_drv_stop(local); local->ops->stop(&local->hw); + trace_802154_drv_return_void(local); /* sync away all work on the tasklet before clearing started */ tasklet_disable(&local->tasklet); @@ -53,13 +60,20 @@ static inline void drv_stop(struct ieee802154_local *local) static inline int drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) { + int ret; + might_sleep(); - return local->ops->set_channel(&local->hw, page, channel); + trace_802154_drv_set_channel(local, page, channel); + ret = local->ops->set_channel(&local->hw, page, channel); + trace_802154_drv_return_int(local, ret); + return ret; } -static inline int drv_set_tx_power(struct ieee802154_local *local, s8 dbm) +static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) { + int ret; + might_sleep(); if (!local->ops->set_txpower) { @@ -67,12 +81,17 @@ static inline int drv_set_tx_power(struct ieee802154_local *local, s8 dbm) return -EOPNOTSUPP; } - return local->ops->set_txpower(&local->hw, dbm); + 
trace_802154_drv_set_tx_power(local, mbm); + ret = local->ops->set_txpower(&local->hw, mbm); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_cca_mode(struct ieee802154_local *local, const struct wpan_phy_cca *cca) { + int ret; + might_sleep(); if (!local->ops->set_cca_mode) { @@ -80,11 +99,16 @@ static inline int drv_set_cca_mode(struct ieee802154_local *local, return -EOPNOTSUPP; } - return local->ops->set_cca_mode(&local->hw, cca); + trace_802154_drv_set_cca_mode(local, cca); + ret = local->ops->set_cca_mode(&local->hw, cca); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) { + int ret; + might_sleep(); if (!local->ops->set_lbt) { @@ -92,12 +116,17 @@ static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) return -EOPNOTSUPP; } - return local->ops->set_lbt(&local->hw, mode); + trace_802154_drv_set_lbt_mode(local, mode); + ret = local->ops->set_lbt(&local->hw, mode); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int -drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level) +drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) { + int ret; + might_sleep(); if (!local->ops->set_cca_ed_level) { @@ -105,12 +134,16 @@ drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level) return -EOPNOTSUPP; } - return local->ops->set_cca_ed_level(&local->hw, ed_level); + trace_802154_drv_set_cca_ed_level(local, mbm); + ret = local->ops->set_cca_ed_level(&local->hw, mbm); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -121,14 +154,18 @@ static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) filt.pan_id = pan_id; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + 
trace_802154_drv_set_pan_id(local, pan_id); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANID_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -139,14 +176,18 @@ drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) filt.ieee_addr = extended_addr; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_extended_addr(local, extended_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_IEEEADDR_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -157,14 +198,18 @@ drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) filt.short_addr = short_addr; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_short_addr(local, short_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_SADDR_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -175,14 +220,19 @@ drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) filt.pan_coord = is_coord; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_pan_coord(local, is_coord); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANC_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be, u8 max_csma_backoffs) { + int ret; + might_sleep(); if (!local->ops->set_csma_params) { @@ -190,13 +240,19 
@@ drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be, return -EOPNOTSUPP; } - return local->ops->set_csma_params(&local->hw, min_be, max_be, + trace_802154_drv_set_csma_params(local, min_be, max_be, + max_csma_backoffs); + ret = local->ops->set_csma_params(&local->hw, min_be, max_be, max_csma_backoffs); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries) { + int ret; + might_sleep(); if (!local->ops->set_frame_retries) { @@ -204,12 +260,17 @@ drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries) return -EOPNOTSUPP; } - return local->ops->set_frame_retries(&local->hw, max_frame_retries); + trace_802154_drv_set_max_frame_retries(local, max_frame_retries); + ret = local->ops->set_frame_retries(&local->hw, max_frame_retries); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) { + int ret; + might_sleep(); if (!local->ops->set_promiscuous_mode) { @@ -217,7 +278,10 @@ drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) return -EOPNOTSUPP; } - return local->ops->set_promiscuous_mode(&local->hw, on); + trace_802154_drv_set_promiscuous_mode(local, on); + ret = local->ops->set_promiscuous_mode(&local->hw, on); + trace_802154_drv_return_int(local, ret); + return ret; } #endif /* __MAC802154_DRIVER_OPS */ diff --git a/kernel/net/mac802154/ieee802154_i.h b/kernel/net/mac802154/ieee802154_i.h index 127ba1838..56ccffa3f 100644 --- a/kernel/net/mac802154/ieee802154_i.h +++ b/kernel/net/mac802154/ieee802154_i.h @@ -56,9 +56,13 @@ struct ieee802154_local { struct hrtimer ifs_timer; bool started; + bool suspended; struct tasklet_struct tasklet; struct sk_buff_head skb_queue; + + struct sk_buff *tx_skb; + struct work_struct tx_work; }; enum { @@ -86,20 +90,14 @@ struct ieee802154_sub_if_data { unsigned long state; char 
name[IFNAMSIZ]; - spinlock_t mib_lock; - /* protects sec from concurrent access by netlink. access by * encrypt/decrypt/header_create safe without additional protection. */ struct mutex sec_mtx; struct mac802154_llsec sec; - /* must be last, dynamically sized area in this! */ - struct ieee802154_vif vif; }; -#define MAC802154_CHAN_NONE 0xff /* No channel is assigned */ - /* utility functions/constants */ extern const void *const mac802154_wpan_phy_privid; /* for wpan_phy privid */ @@ -129,6 +127,8 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata) extern struct ieee802154_mlme_ops mac802154_mlme_wpan; +void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb); +void ieee802154_xmit_worker(struct work_struct *work); netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t @@ -136,12 +136,7 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer); /* MIB callbacks */ -void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val); -__le16 mac802154_dev_get_short_addr(const struct net_device *dev); -__le16 mac802154_dev_get_pan_id(const struct net_device *dev); -void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val); void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan); -u8 mac802154_dev_get_dsn(const struct net_device *dev); int mac802154_get_params(struct net_device *dev, struct ieee802154_llsec_params *params); @@ -176,6 +171,8 @@ void mac802154_get_table(struct net_device *dev, struct ieee802154_llsec_table **t); void mac802154_unlock_table(struct net_device *dev); +int mac802154_wpan_update_llsec(struct net_device *dev); + /* interface handling */ int ieee802154_iface_init(void); void ieee802154_iface_exit(void); @@ -185,5 +182,6 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, unsigned char name_assign_type, enum nl802154_iftype type, 
__le64 extended_addr); void ieee802154_remove_interfaces(struct ieee802154_local *local); +void ieee802154_stop_device(struct ieee802154_local *local); #endif /* __IEEE802154_I_H */ diff --git a/kernel/net/mac802154/iface.c b/kernel/net/mac802154/iface.c index 91b75abbd..7079cd32a 100644 --- a/kernel/net/mac802154/iface.c +++ b/kernel/net/mac802154/iface.c @@ -30,7 +30,7 @@ #include "ieee802154_i.h" #include "driver-ops.h" -static int mac802154_wpan_update_llsec(struct net_device *dev) +int mac802154_wpan_update_llsec(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); @@ -62,9 +62,10 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) (struct sockaddr_ieee802154 *)&ifr->ifr_addr; int err = -ENOIOCTLCMD; - ASSERT_RTNL(); + if (cmd != SIOCGIFADDR && cmd != SIOCSIFADDR) + return err; - spin_lock_bh(&sdata->mib_lock); + rtnl_lock(); switch (cmd) { case SIOCGIFADDR: @@ -89,7 +90,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } case SIOCSIFADDR: if (netif_running(dev)) { - spin_unlock_bh(&sdata->mib_lock); + rtnl_unlock(); return -EBUSY; } @@ -111,7 +112,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } - spin_unlock_bh(&sdata->mib_lock); + rtnl_unlock(); return err; } @@ -124,29 +125,97 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) if (netif_running(dev)) return -EBUSY; + /* lowpan need to be down for update + * SLAAC address after ifup + */ + if (sdata->wpan_dev.lowpan_dev) { + if (netif_running(sdata->wpan_dev.lowpan_dev)) + return -EBUSY; + } + ieee802154_be64_to_le64(&extended_addr, addr->sa_data); - if (!ieee802154_is_valid_extended_addr(extended_addr)) + if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return -EINVAL; memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); sdata->wpan_dev.extended_addr = extended_addr; + /* update 
lowpan interface mac address when + * wpan mac has been changed + */ + if (sdata->wpan_dev.lowpan_dev) + memcpy(sdata->wpan_dev.lowpan_dev->dev_addr, dev->dev_addr, + dev->addr_len); + return mac802154_wpan_update_llsec(dev); } +static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) +{ + struct ieee802154_local *local = sdata->local; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + int ret; + + if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { + ret = drv_set_promiscuous_mode(local, + wpan_dev->promiscuous_mode); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_AFILT) { + ret = drv_set_pan_id(local, wpan_dev->pan_id); + if (ret < 0) + return ret; + + ret = drv_set_extended_addr(local, wpan_dev->extended_addr); + if (ret < 0) + return ret; + + ret = drv_set_short_addr(local, wpan_dev->short_addr); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_LBT) { + ret = drv_set_lbt_mode(local, wpan_dev->lbt); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { + ret = drv_set_csma_params(local, wpan_dev->min_be, + wpan_dev->max_be, + wpan_dev->csma_retries); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { + ret = drv_set_max_frame_retries(local, wpan_dev->frame_retries); + if (ret < 0) + return ret; + } + + return 0; +} + static int mac802154_slave_open(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; - int res = 0; + int res; ASSERT_RTNL(); set_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) { + res = ieee802154_setup_hw(sdata); + if (res) + goto err; + res = drv_start(local); - WARN_ON(res); if (res) goto err; } @@ -218,8 +287,8 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, * exist really an use case if we need to support * multiple node types at the same time. 
*/ - if (sdata->vif.type == NL802154_IFTYPE_NODE && - nsdata->vif.type == NL802154_IFTYPE_NODE) + if (wpan_dev->iftype == NL802154_IFTYPE_NODE && + nsdata->wpan_dev.iftype == NL802154_IFTYPE_NODE) return -EBUSY; /* check all phy mac sublayer settings are the same. @@ -239,67 +308,13 @@ static int mac802154_wpan_open(struct net_device *dev) { int rc; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; - struct wpan_phy *phy = sdata->local->phy; - - rc = ieee802154_check_concurrent_iface(sdata, sdata->vif.type); - if (rc < 0) - return rc; - rc = mac802154_slave_open(dev); + rc = ieee802154_check_concurrent_iface(sdata, wpan_dev->iftype); if (rc < 0) return rc; - mutex_lock(&phy->pib_lock); - - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { - rc = drv_set_promiscuous_mode(local, - wpan_dev->promiscuous_mode); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_AFILT) { - rc = drv_set_pan_id(local, wpan_dev->pan_id); - if (rc < 0) - goto out; - - rc = drv_set_extended_addr(local, wpan_dev->extended_addr); - if (rc < 0) - goto out; - - rc = drv_set_short_addr(local, wpan_dev->short_addr); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_LBT) { - rc = drv_set_lbt_mode(local, wpan_dev->lbt); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { - rc = drv_set_csma_params(local, wpan_dev->min_be, - wpan_dev->max_be, - wpan_dev->csma_retries); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { - rc = drv_set_max_frame_retries(local, wpan_dev->frame_retries); - if (rc < 0) - goto out; - } - - mutex_unlock(&phy->pib_lock); - return 0; - -out: - mutex_unlock(&phy->pib_lock); - return rc; + return mac802154_slave_open(dev); } static int mac802154_slave_close(struct net_device *dev) @@ -309,15 +324,13 @@ static int mac802154_slave_close(struct net_device *dev) 
ASSERT_RTNL(); - hrtimer_cancel(&local->ifs_timer); - netif_stop_queue(dev); local->open_count--; clear_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) - drv_stop(local); + ieee802154_stop_device(local); return 0; } @@ -354,12 +367,11 @@ static int mac802154_set_header_security(struct ieee802154_sub_if_data *sdata, return 0; } -static int mac802154_header_create(struct sk_buff *skb, - struct net_device *dev, - unsigned short type, - const void *daddr, - const void *saddr, - unsigned len) +static int ieee802154_header_create(struct sk_buff *skb, + struct net_device *dev, + const struct ieee802154_addr *daddr, + const struct ieee802154_addr *saddr, + unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); @@ -374,14 +386,12 @@ static int mac802154_header_create(struct sk_buff *skb, hdr.fc.type = cb->type; hdr.fc.security_enabled = cb->secen; hdr.fc.ack_request = cb->ackreq; - hdr.seq = ieee802154_mlme_ops(dev)->get_dsn(dev); + hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; if (mac802154_set_header_security(sdata, &hdr, cb) < 0) return -EINVAL; if (!saddr) { - spin_lock_bh(&sdata->mib_lock); - if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) || wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { @@ -393,8 +403,6 @@ static int mac802154_header_create(struct sk_buff *skb, } hdr.source.pan_id = wpan_dev->pan_id; - - spin_unlock_bh(&sdata->mib_lock); } else { hdr.source = *(const struct ieee802154_addr *)saddr; } @@ -414,24 +422,89 @@ static int mac802154_header_create(struct sk_buff *skb, return hlen; } +static const struct wpan_dev_header_ops ieee802154_header_ops = { + .create = ieee802154_header_create, +}; + +/* This header create functionality assumes a 8 byte array for + * source and destination pointer at maximum. 
To adapt this for + * the 802.15.4 dataframe header we use extended address handling + * here only and intra pan connection. fc fields are mostly fallback + * handling. For provide dev_hard_header for dgram sockets. + */ +static int mac802154_header_create(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, + const void *daddr, + const void *saddr, + unsigned len) +{ + struct ieee802154_hdr hdr; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct ieee802154_mac_cb cb = { }; + int hlen; + + if (!daddr) + return -EINVAL; + + memset(&hdr.fc, 0, sizeof(hdr.fc)); + hdr.fc.type = IEEE802154_FC_TYPE_DATA; + hdr.fc.ack_request = wpan_dev->ackreq; + hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; + + /* TODO currently a workaround to give zero cb block to set + * security parameters defaults according MIB. + */ + if (mac802154_set_header_security(sdata, &hdr, &cb) < 0) + return -EINVAL; + + hdr.dest.pan_id = wpan_dev->pan_id; + hdr.dest.mode = IEEE802154_ADDR_LONG; + ieee802154_be64_to_le64(&hdr.dest.extended_addr, daddr); + + hdr.source.pan_id = hdr.dest.pan_id; + hdr.source.mode = IEEE802154_ADDR_LONG; + + if (!saddr) + hdr.source.extended_addr = wpan_dev->extended_addr; + else + ieee802154_be64_to_le64(&hdr.source.extended_addr, saddr); + + hlen = ieee802154_hdr_push(skb, &hdr); + if (hlen < 0) + return -EINVAL; + + skb_reset_mac_header(skb); + skb->mac_len = hlen; + + if (len > ieee802154_max_payload(&hdr)) + return -EMSGSIZE; + + return hlen; +} + static int mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) { struct ieee802154_hdr hdr; - struct ieee802154_addr *addr = (struct ieee802154_addr *)haddr; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) { pr_debug("malformed packet\n"); return 0; } - *addr = hdr.source; - return sizeof(*addr); + if (hdr.source.mode == IEEE802154_ADDR_LONG) { + ieee802154_le64_to_be64(haddr, 
&hdr.source.extended_addr); + return IEEE802154_EXTENDED_ADDR_LEN; + } + + return 0; } -static struct header_ops mac802154_header_ops = { - .create = mac802154_header_create, - .parse = mac802154_header_parse, +static const struct header_ops mac802154_header_ops = { + .create = mac802154_header_create, + .parse = mac802154_header_parse, }; static const struct net_device_ops mac802154_wpan_ops = { @@ -462,9 +535,29 @@ static void ieee802154_if_setup(struct net_device *dev) dev->addr_len = IEEE802154_EXTENDED_ADDR_LEN; memset(dev->broadcast, 0xff, IEEE802154_EXTENDED_ADDR_LEN); - dev->hard_header_len = MAC802154_FRAME_HARD_HEADER_LEN; - dev->needed_tailroom = 2 + 16; /* FCS + MIC */ - dev->mtu = IEEE802154_MTU; + /* Let hard_header_len set to IEEE802154_MIN_HEADER_LEN. AF_PACKET + * will not send frames without any payload, but ack frames + * has no payload, so substract one that we can send a 3 bytes + * frame. The xmit callback assumes at least a hard header where two + * bytes fc and sequence field are set. + */ + dev->hard_header_len = IEEE802154_MIN_HEADER_LEN - 1; + /* The auth_tag header is for security and places in private payload + * room of mac frame which stucks between payload and FCS field. + */ + dev->needed_tailroom = IEEE802154_MAX_AUTH_TAG_LEN + + IEEE802154_FCS_LEN; + /* The mtu size is the payload without mac header in this case. + * We have a dynamic length header with a minimum header length + * which is hard_header_len. In this case we let mtu to the size + * of maximum payload which is IEEE802154_MTU - IEEE802154_FCS_LEN - + * hard_header_len. The FCS which is set by hardware or ndo_start_xmit + * and the minimum mac header which can be evaluated inside driver + * layer. The rest of mac header will be part of payload if greater + * than hard_header_len. 
+ */ + dev->mtu = IEEE802154_MTU - IEEE802154_FCS_LEN - + dev->hard_header_len; dev->tx_queue_len = 300; dev->flags = IFF_NOARP | IFF_BROADCAST; } @@ -474,20 +567,22 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype type) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; + int ret; + u8 tmp; /* set some type-dependent values */ - sdata->vif.type = type; sdata->wpan_dev.iftype = type; - get_random_bytes(&wpan_dev->bsn, 1); - get_random_bytes(&wpan_dev->dsn, 1); + get_random_bytes(&tmp, sizeof(tmp)); + atomic_set(&wpan_dev->bsn, tmp); + get_random_bytes(&tmp, sizeof(tmp)); + atomic_set(&wpan_dev->dsn, tmp); /* defaults per 802.15.4-2011 */ wpan_dev->min_be = 3; wpan_dev->max_be = 5; wpan_dev->csma_retries = 4; - /* for compatibility, actual default is 3 */ - wpan_dev->frame_retries = -1; + wpan_dev->frame_retries = 3; wpan_dev->pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); @@ -502,11 +597,15 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; wpan_dev->promiscuous_mode = false; + wpan_dev->header_ops = &ieee802154_header_ops; - spin_lock_init(&sdata->mib_lock); mutex_init(&sdata->sec_mtx); mac802154_llsec_init(&sdata->sec); + ret = mac802154_wpan_update_llsec(sdata->dev); + if (ret < 0) + return ret; + break; case NL802154_IFTYPE_MONITOR: sdata->dev->destructor = free_netdev; @@ -531,12 +630,13 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, ASSERT_RTNL(); - ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name, + ndev = alloc_netdev(sizeof(*sdata), name, name_assign_type, ieee802154_if_setup); if (!ndev) return ERR_PTR(-ENOMEM); - ndev->needed_headroom = local->hw.extra_tx_headroom; + ndev->needed_headroom = local->hw.extra_tx_headroom + + IEEE802154_MAX_HEADER_LEN; ret = dev_alloc_name(ndev, ndev->name); if (ret < 0) @@ -547,7 
+647,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, switch (type) { case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; - if (ieee802154_is_valid_extended_addr(extended_addr)) + if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) ieee802154_le64_to_be64(ndev->dev_addr, &extended_addr); else memcpy(ndev->dev_addr, ndev->perm_addr, diff --git a/kernel/net/mac802154/llsec.c b/kernel/net/mac802154/llsec.c index 5b2be1283..a13d02b7c 100644 --- a/kernel/net/mac802154/llsec.c +++ b/kernel/net/mac802154/llsec.c @@ -17,8 +17,9 @@ #include #include #include +#include #include -#include +#include #include "ieee802154_i.h" #include "llsec.h" @@ -54,7 +55,7 @@ void mac802154_llsec_destroy(struct mac802154_llsec *sec) msl = container_of(sl, struct mac802154_llsec_seclevel, level); list_del(&sl->list); - kfree(msl); + kzfree(msl); } list_for_each_entry_safe(dev, dn, &sec->table.devices, list) { @@ -71,7 +72,7 @@ void mac802154_llsec_destroy(struct mac802154_llsec *sec) mkey = container_of(key->key, struct mac802154_llsec_key, key); list_del(&key->list); llsec_key_put(mkey); - kfree(key); + kzfree(key); } } @@ -160,7 +161,7 @@ err_tfm: if (key->tfm[i]) crypto_free_aead(key->tfm[i]); - kfree(key); + kzfree(key); return NULL; } @@ -175,7 +176,7 @@ static void llsec_key_release(struct kref *ref) crypto_free_aead(key->tfm[i]); crypto_free_blkcipher(key->tfm0); - kfree(key); + kzfree(key); } static struct mac802154_llsec_key* @@ -266,7 +267,7 @@ int mac802154_llsec_key_add(struct mac802154_llsec *sec, return 0; fail: - kfree(new); + kzfree(new); return -ENOMEM; } @@ -346,10 +347,10 @@ static void llsec_dev_free(struct mac802154_llsec_device *dev) devkey); list_del(&pos->list); - kfree(devkey); + kzfree(devkey); } - kfree(dev); + kzfree(dev); } int mac802154_llsec_dev_add(struct mac802154_llsec *sec, @@ -400,6 +401,7 @@ int mac802154_llsec_dev_del(struct mac802154_llsec *sec, __le64 device_addr) hash_del_rcu(&pos->bucket_s); 
hash_del_rcu(&pos->bucket_hw); + list_del_rcu(&pos->dev.list); call_rcu(&pos->rcu, llsec_dev_free_rcu); return 0; @@ -649,7 +651,7 @@ llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, u8 iv[16]; unsigned char *data; int authlen, assoclen, datalen, rc; - struct scatterlist src, assoc[2], dst[2]; + struct scatterlist sg; struct aead_request *req; authlen = ieee802154_sechdr_authtag_len(&hdr->sec); @@ -659,34 +661,27 @@ llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, if (!req) return -ENOMEM; - sg_init_table(assoc, 2); - sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len); assoclen = skb->mac_len; data = skb_mac_header(skb) + skb->mac_len; datalen = skb_tail_pointer(skb) - data; - if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) { - sg_set_buf(&assoc[1], data, 0); - } else { - sg_set_buf(&assoc[1], data, datalen); + skb_put(skb, authlen); + + sg_init_one(&sg, skb_mac_header(skb), assoclen + datalen + authlen); + + if (!(hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC)) { assoclen += datalen; datalen = 0; } - sg_init_one(&src, data, datalen); - - sg_init_table(dst, 2); - sg_set_buf(&dst[0], data, datalen); - sg_set_buf(&dst[1], skb_put(skb, authlen), authlen); - aead_request_set_callback(req, 0, NULL, NULL); - aead_request_set_assoc(req, assoc, assoclen); - aead_request_set_crypt(req, &src, dst, datalen, iv); + aead_request_set_crypt(req, &sg, &sg, datalen, iv); + aead_request_set_ad(req, assoclen); rc = crypto_aead_encrypt(req); - kfree(req); + kzfree(req); return rc; } @@ -858,7 +853,7 @@ llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, u8 iv[16]; unsigned char *data; int authlen, datalen, assoclen, rc; - struct scatterlist src, assoc[2]; + struct scatterlist sg; struct aead_request *req; authlen = ieee802154_sechdr_authtag_len(&hdr->sec); @@ -868,31 +863,25 @@ llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, if (!req) return -ENOMEM; - 
sg_init_table(assoc, 2); - sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len); assoclen = skb->mac_len; data = skb_mac_header(skb) + skb->mac_len; datalen = skb_tail_pointer(skb) - data; - if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) { - sg_set_buf(&assoc[1], data, 0); - } else { - sg_set_buf(&assoc[1], data, datalen - authlen); + sg_init_one(&sg, skb_mac_header(skb), assoclen + datalen); + + if (!(hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC)) { assoclen += datalen - authlen; - data += datalen - authlen; datalen = authlen; } - sg_init_one(&src, data, datalen); - aead_request_set_callback(req, 0, NULL, NULL); - aead_request_set_assoc(req, assoc, assoclen); - aead_request_set_crypt(req, &src, &src, datalen, iv); + aead_request_set_crypt(req, &sg, &sg, datalen, iv); + aead_request_set_ad(req, assoclen); rc = crypto_aead_decrypt(req); - kfree(req); + kzfree(req); skb_trim(skb, skb->len - authlen); return rc; @@ -932,7 +921,7 @@ llsec_update_devkey_record(struct mac802154_llsec_device *dev, if (!devkey) list_add_rcu(&next->devkey.list, &dev->dev.keys); else - kfree(next); + kzfree(next); spin_unlock_bh(&dev->lock); } diff --git a/kernel/net/mac802154/mac_cmd.c b/kernel/net/mac802154/mac_cmd.c index bdccb4ecd..8606da459 100644 --- a/kernel/net/mac802154/mac_cmd.c +++ b/kernel/net/mac802154/mac_cmd.c @@ -36,37 +36,30 @@ static int mac802154_mlme_start_req(struct net_device *dev, u8 pan_coord, u8 blx, u8 coord_realign) { - struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); - int rc = 0; + struct ieee802154_llsec_params params; + int changed = 0; ASSERT_RTNL(); BUG_ON(addr->mode != IEEE802154_ADDR_SHORT); - mac802154_dev_set_pan_id(dev, addr->pan_id); - mac802154_dev_set_short_addr(dev, addr->short_addr); + dev->ieee802154_ptr->pan_id = addr->pan_id; + dev->ieee802154_ptr->short_addr = addr->short_addr; mac802154_dev_set_page_channel(dev, page, channel); - if (ops->llsec) { - struct ieee802154_llsec_params params; - int changed = 0; + params.pan_id = 
addr->pan_id; + changed |= IEEE802154_LLSEC_PARAM_PAN_ID; - params.coord_shortaddr = addr->short_addr; - changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR; + params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr); + changed |= IEEE802154_LLSEC_PARAM_HWADDR; - params.pan_id = addr->pan_id; - changed |= IEEE802154_LLSEC_PARAM_PAN_ID; + params.coord_hwaddr = params.hwaddr; + changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR; - params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr); - changed |= IEEE802154_LLSEC_PARAM_HWADDR; + params.coord_shortaddr = addr->short_addr; + changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR; - params.coord_hwaddr = params.hwaddr; - changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR; - - rc = ops->llsec->set_params(dev, &params, changed); - } - - return rc; + return mac802154_set_params(dev, &params, changed); } static int mac802154_set_mac_params(struct net_device *dev, @@ -91,19 +84,19 @@ static int mac802154_set_mac_params(struct net_device *dev, wpan_dev->frame_retries = params->frame_retries; wpan_dev->lbt = params->lbt; - if (local->hw.flags & IEEE802154_HW_TXPOWER) { + if (local->hw.phy->flags & WPAN_PHY_FLAG_TXPOWER) { ret = drv_set_tx_power(local, params->transmit_power); if (ret < 0) return ret; } - if (local->hw.flags & IEEE802154_HW_CCA_MODE) { + if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_MODE) { ret = drv_set_cca_mode(local, &params->cca); if (ret < 0) return ret; } - if (local->hw.flags & IEEE802154_HW_CCA_ED_LEVEL) { + if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { ret = drv_set_cca_ed_level(local, params->cca_ed_level); if (ret < 0) return ret; @@ -151,9 +144,6 @@ static struct ieee802154_llsec_ops mac802154_llsec_ops = { struct ieee802154_mlme_ops mac802154_mlme_wpan = { .start_req = mac802154_mlme_start_req, - .get_pan_id = mac802154_dev_get_pan_id, - .get_short_addr = mac802154_dev_get_short_addr, - .get_dsn = mac802154_dev_get_dsn, .llsec = &mac802154_llsec_ops, diff --git a/kernel/net/mac802154/main.c
b/kernel/net/mac802154/main.c index 08cb32dc8..e8cab5bb8 100644 --- a/kernel/net/mac802154/main.c +++ b/kernel/net/mac802154/main.c @@ -40,7 +40,7 @@ static void ieee802154_tasklet_handler(unsigned long data) * netstack. */ skb->pkt_type = 0; - ieee802154_rx(&local->hw, skb); + ieee802154_rx(local, skb); break; default: WARN(1, "mac802154: Packet is of unknown type %d\n", @@ -58,11 +58,9 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) struct ieee802154_local *local; size_t priv_size; - if (!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed || - !ops->start || !ops->stop || !ops->set_channel) { - pr_err("undefined IEEE802.15.4 device operations\n"); + if (WARN_ON(!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed || + !ops->start || !ops->stop || !ops->set_channel)) return NULL; - } /* Ensure 32-byte alignment of our private data and hw private data. * We use the wpan_phy priv data for both our ieee802154_local and for @@ -107,6 +105,20 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) skb_queue_head_init(&local->skb_queue); + INIT_WORK(&local->tx_work, ieee802154_xmit_worker); + + /* init supported flags with 802.15.4 default ranges */ + phy->supported.max_minbe = 8; + phy->supported.min_maxbe = 3; + phy->supported.max_maxbe = 8; + phy->supported.min_frame_retries = 0; + phy->supported.max_frame_retries = 7; + phy->supported.max_csma_backoffs = 5; + phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE; + + /* always supported */ + phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE); + return &local->hw; } EXPORT_SYMBOL(ieee802154_alloc_hw); @@ -155,6 +167,23 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) ieee802154_setup_wpan_phy_pib(local->phy); + if (!(hw->flags & IEEE802154_HW_CSMA_PARAMS)) { + local->phy->supported.min_csma_backoffs = 4; + local->phy->supported.max_csma_backoffs = 4; + local->phy->supported.min_maxbe = 5; + local->phy->supported.max_maxbe = 5; + 
local->phy->supported.min_minbe = 3; + local->phy->supported.max_minbe = 3; + } + + if (!(hw->flags & IEEE802154_HW_FRAME_RETRIES)) { + local->phy->supported.min_frame_retries = 3; + local->phy->supported.max_frame_retries = 3; + } + + if (hw->flags & IEEE802154_HW_PROMISCUOUS) + local->phy->supported.iftypes |= BIT(NL802154_IFTYPE_MONITOR); + rc = wpan_phy_register(local->phy); if (rc < 0) goto out_wq; diff --git a/kernel/net/mac802154/mib.c b/kernel/net/mac802154/mib.c index 5cf019a57..73f94fbf8 100644 --- a/kernel/net/mac802154/mib.c +++ b/kernel/net/mac802154/mib.c @@ -26,81 +26,22 @@ #include "ieee802154_i.h" #include "driver-ops.h" -void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - sdata->wpan_dev.short_addr = val; - spin_unlock_bh(&sdata->mib_lock); -} - -__le16 mac802154_dev_get_short_addr(const struct net_device *dev) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - __le16 ret; - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - ret = sdata->wpan_dev.short_addr; - spin_unlock_bh(&sdata->mib_lock); - - return ret; -} - -__le16 mac802154_dev_get_pan_id(const struct net_device *dev) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - __le16 ret; - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - ret = sdata->wpan_dev.pan_id; - spin_unlock_bh(&sdata->mib_lock); - - return ret; -} - -void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - sdata->wpan_dev.pan_id = val; - spin_unlock_bh(&sdata->mib_lock); -} - -u8 mac802154_dev_get_dsn(const struct net_device *dev) -{ - struct ieee802154_sub_if_data *sdata = 
IEEE802154_DEV_TO_SUB_IF(dev); - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - return sdata->wpan_dev.dsn++; -} - void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; int res; + ASSERT_RTNL(); + BUG_ON(dev->type != ARPHRD_IEEE802154); res = drv_set_channel(local, page, chan); if (res) { pr_debug("set_channel failed\n"); } else { - mutex_lock(&local->phy->pib_lock); local->phy->current_channel = chan; local->phy->current_page = page; - mutex_unlock(&local->phy->pib_lock); } } diff --git a/kernel/net/mac802154/rx.c b/kernel/net/mac802154/rx.c index c0d67b2b4..42e96729d 100644 --- a/kernel/net/mac802154/rx.c +++ b/kernel/net/mac802154/rx.c @@ -47,8 +47,6 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, pr_debug("getting packet via slave interface %s\n", sdata->dev->name); - spin_lock_bh(&sdata->mib_lock); - span = wpan_dev->pan_id; sshort = wpan_dev->short_addr; @@ -83,15 +81,16 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, skb->pkt_type = PACKET_OTHERHOST; break; default: - spin_unlock_bh(&sdata->mib_lock); pr_debug("invalid dest mode\n"); goto fail; } - spin_unlock_bh(&sdata->mib_lock); - skb->dev = sdata->dev; + /* TODO this should be moved after netif_receive_skb call, otherwise + * wireshark will show a mac header with security fields and the + * payload is already decrypted. 
+ */ rc = mac802154_llsec_decrypt(&sdata->sec, skb); if (rc) { pr_debug("decryption failed: %i\n", rc); @@ -207,8 +206,10 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, } list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->vif.type != NL802154_IFTYPE_NODE || - !netif_running(sdata->dev)) + if (sdata->wpan_dev.iftype != NL802154_IFTYPE_NODE) + continue; + + if (!ieee802154_sdata_running(sdata)) continue; ieee802154_subif_frame(sdata, skb, &hdr); @@ -232,7 +233,7 @@ ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb) skb->protocol = htons(ETH_P_IEEE802154); list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->vif.type != NL802154_IFTYPE_MONITOR) + if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR) continue; if (!ieee802154_sdata_running(sdata)) @@ -249,13 +250,15 @@ ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb) } } -void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb) +void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) { - struct ieee802154_local *local = hw_to_local(hw); u16 crc; WARN_ON_ONCE(softirq_count() == 0); + if (local->suspended) + goto drop; + /* TODO: When a transceiver omits the checksum here, we * add an own calculated one. This is currently an ugly * solution because the monitor needs a crc here. 
@@ -276,8 +279,7 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb) crc = crc_ccitt(0, skb->data, skb->len); if (crc) { rcu_read_unlock(); - kfree_skb(skb); - return; + goto drop; } } /* remove crc */ @@ -286,8 +288,11 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb) __ieee802154_rx_handle_packet(local, skb); rcu_read_unlock(); + + return; +drop: + kfree_skb(skb); } -EXPORT_SYMBOL(ieee802154_rx); void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi) diff --git a/kernel/net/mac802154/trace.c b/kernel/net/mac802154/trace.c new file mode 100644 index 000000000..863e5e6b9 --- /dev/null +++ b/kernel/net/mac802154/trace.c @@ -0,0 +1,9 @@ +#include + +#ifndef __CHECKER__ +#include +#include "driver-ops.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/kernel/net/mac802154/trace.h b/kernel/net/mac802154/trace.h new file mode 100644 index 000000000..6f30e0c93 --- /dev/null +++ b/kernel/net/mac802154/trace.h @@ -0,0 +1,272 @@ +/* Based on net/mac80211/trace.h */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mac802154 + +#if !defined(__MAC802154_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) +#define __MAC802154_DRIVER_TRACE + +#include + +#include +#include "ieee802154_i.h" + +#define MAXNAME 32 +#define LOCAL_ENTRY __array(char, wpan_phy_name, MAXNAME) +#define LOCAL_ASSIGN strlcpy(__entry->wpan_phy_name, \ + wpan_phy_name(local->hw.phy), MAXNAME) +#define LOCAL_PR_FMT "%s" +#define LOCAL_PR_ARG __entry->wpan_phy_name + +#define CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \ + __field(enum nl802154_cca_opts, cca_opt) +#define CCA_ASSIGN \ + do { \ + (__entry->cca_mode) = cca->mode; \ + (__entry->cca_opt) = cca->opt; \ + } while (0) +#define CCA_PR_FMT "cca_mode: %d, cca_opt: %d" +#define CCA_PR_ARG __entry->cca_mode, __entry->cca_opt + +#define BOOL_TO_STR(bo) (bo) ? 
"true" : "false" + +/* Tracing for driver callbacks */ + +DECLARE_EVENT_CLASS(local_only_evt, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local), + TP_STRUCT__entry( + LOCAL_ENTRY + ), + TP_fast_assign( + LOCAL_ASSIGN; + ), + TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) +); + +DEFINE_EVENT(local_only_evt, 802154_drv_return_void, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(802154_drv_return_int, + TP_PROTO(struct ieee802154_local *local, int ret), + TP_ARGS(local, ret), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, ret) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->ret = ret; + ), + TP_printk(LOCAL_PR_FMT ", returned: %d", LOCAL_PR_ARG, + __entry->ret) +); + +DEFINE_EVENT(local_only_evt, 802154_drv_start, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, 802154_drv_stop, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(802154_drv_set_channel, + TP_PROTO(struct ieee802154_local *local, u8 page, u8 channel), + TP_ARGS(local, page, channel), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u8, page) + __field(u8, channel) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->page = page; + __entry->channel = channel; + ), + TP_printk(LOCAL_PR_FMT ", page: %d, channel: %d", LOCAL_PR_ARG, + __entry->page, __entry->channel) +); + +TRACE_EVENT(802154_drv_set_cca_mode, + TP_PROTO(struct ieee802154_local *local, + const struct wpan_phy_cca *cca), + TP_ARGS(local, cca), + TP_STRUCT__entry( + LOCAL_ENTRY + CCA_ENTRY + ), + TP_fast_assign( + LOCAL_ASSIGN; + CCA_ASSIGN; + ), + TP_printk(LOCAL_PR_FMT ", " CCA_PR_FMT, LOCAL_PR_ARG, + CCA_PR_ARG) +); + +TRACE_EVENT(802154_drv_set_cca_ed_level, + TP_PROTO(struct ieee802154_local *local, s32 mbm), + TP_ARGS(local, mbm), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(s32, mbm) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->mbm = mbm; + ), + TP_printk(LOCAL_PR_FMT ", ed level: %d", LOCAL_PR_ARG, + __entry->mbm) +); + 
+TRACE_EVENT(802154_drv_set_tx_power, + TP_PROTO(struct ieee802154_local *local, s32 power), + TP_ARGS(local, power), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(s32, power) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->power = power; + ), + TP_printk(LOCAL_PR_FMT ", mbm: %d", LOCAL_PR_ARG, + __entry->power) +); + +TRACE_EVENT(802154_drv_set_lbt_mode, + TP_PROTO(struct ieee802154_local *local, bool mode), + TP_ARGS(local, mode), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, mode) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->mode = mode; + ), + TP_printk(LOCAL_PR_FMT ", lbt mode: %s", LOCAL_PR_ARG, + BOOL_TO_STR(__entry->mode)) +); + +TRACE_EVENT(802154_drv_set_short_addr, + TP_PROTO(struct ieee802154_local *local, __le16 short_addr), + TP_ARGS(local, short_addr), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(__le16, short_addr) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->short_addr = short_addr; + ), + TP_printk(LOCAL_PR_FMT ", short addr: 0x%04x", LOCAL_PR_ARG, + le16_to_cpu(__entry->short_addr)) +); + +TRACE_EVENT(802154_drv_set_pan_id, + TP_PROTO(struct ieee802154_local *local, __le16 pan_id), + TP_ARGS(local, pan_id), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(__le16, pan_id) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->pan_id = pan_id; + ), + TP_printk(LOCAL_PR_FMT ", pan id: 0x%04x", LOCAL_PR_ARG, + le16_to_cpu(__entry->pan_id)) +); + +TRACE_EVENT(802154_drv_set_extended_addr, + TP_PROTO(struct ieee802154_local *local, __le64 extended_addr), + TP_ARGS(local, extended_addr), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(__le64, extended_addr) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->extended_addr = extended_addr; + ), + TP_printk(LOCAL_PR_FMT ", extended addr: 0x%llx", LOCAL_PR_ARG, + le64_to_cpu(__entry->extended_addr)) +); + +TRACE_EVENT(802154_drv_set_pan_coord, + TP_PROTO(struct ieee802154_local *local, bool is_coord), + TP_ARGS(local, is_coord), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, is_coord) + ), + 
TP_fast_assign( + LOCAL_ASSIGN; + __entry->is_coord = is_coord; + ), + TP_printk(LOCAL_PR_FMT ", is_coord: %s", LOCAL_PR_ARG, + BOOL_TO_STR(__entry->is_coord)) +); + +TRACE_EVENT(802154_drv_set_csma_params, + TP_PROTO(struct ieee802154_local *local, u8 min_be, u8 max_be, + u8 max_csma_backoffs), + TP_ARGS(local, min_be, max_be, max_csma_backoffs), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u8, min_be) + __field(u8, max_be) + __field(u8, max_csma_backoffs) + ), + TP_fast_assign( + LOCAL_ASSIGN, + __entry->min_be = min_be; + __entry->max_be = max_be; + __entry->max_csma_backoffs = max_csma_backoffs; + ), + TP_printk(LOCAL_PR_FMT ", min be: %d, max be: %d, max csma backoffs: %d", + LOCAL_PR_ARG, __entry->min_be, __entry->max_be, + __entry->max_csma_backoffs) +); + +TRACE_EVENT(802154_drv_set_max_frame_retries, + TP_PROTO(struct ieee802154_local *local, s8 max_frame_retries), + TP_ARGS(local, max_frame_retries), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(s8, max_frame_retries) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->max_frame_retries = max_frame_retries; + ), + TP_printk(LOCAL_PR_FMT ", max frame retries: %d", LOCAL_PR_ARG, + __entry->max_frame_retries) +); + +TRACE_EVENT(802154_drv_set_promiscuous_mode, + TP_PROTO(struct ieee802154_local *local, bool on), + TP_ARGS(local, on), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, on) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->on = on; + ), + TP_printk(LOCAL_PR_FMT ", promiscuous mode: %s", LOCAL_PR_ARG, + BOOL_TO_STR(__entry->on)) +); + +#endif /* !__MAC802154_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/kernel/net/mac802154/tx.c b/kernel/net/mac802154/tx.c index c62e95695..3827f359b 100644 --- a/kernel/net/mac802154/tx.c +++ b/kernel/net/mac802154/tx.c @@ -30,23 +30,11 @@ #include "ieee802154_i.h" #include "driver-ops.h" -/* IEEE 802.15.4 transceivers can sleep during the xmit session, so process - * packets through the workqueue. - */ -struct ieee802154_xmit_cb { - struct sk_buff *skb; - struct work_struct work; - struct ieee802154_local *local; -}; - -static struct ieee802154_xmit_cb ieee802154_xmit_cb; - -static void ieee802154_xmit_worker(struct work_struct *work) +void ieee802154_xmit_worker(struct work_struct *work) { - struct ieee802154_xmit_cb *cb = - container_of(work, struct ieee802154_xmit_cb, work); - struct ieee802154_local *local = cb->local; - struct sk_buff *skb = cb->skb; + struct ieee802154_local *local = + container_of(work, struct ieee802154_local, tx_work); + struct sk_buff *skb = local->tx_skb; struct net_device *dev = skb->dev; int res; @@ -89,9 +77,6 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) put_unaligned_le16(crc, skb_put(skb, 2)); } - if (skb_cow_head(skb, local->hw.extra_tx_headroom)) - goto err_tx; - /* Stop the netif queue on each sub_if_data object.
*/ ieee802154_stop_queue(&local->hw); @@ -106,11 +91,8 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; } else { - INIT_WORK(&ieee802154_xmit_cb.work, ieee802154_xmit_worker); - ieee802154_xmit_cb.skb = skb; - ieee802154_xmit_cb.local = local; - - queue_work(local->workqueue, &ieee802154_xmit_cb.work); + local->tx_skb = skb; + queue_work(local->workqueue, &local->tx_work); } return NETDEV_TX_OK; @@ -136,6 +118,10 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int rc; + /* TODO we should move it to wpan_dev_hard_header and dev_hard_header + * functions. The reason is wireshark will show a mac header which is + * with security fields but the payload is not encrypted. + */ rc = mac802154_llsec_encrypt(&sdata->sec, skb); if (rc) { netdev_warn(dev, "encryption failed: %i\n", rc); diff --git a/kernel/net/mac802154/util.c b/kernel/net/mac802154/util.c index 150bf807e..f9fd0957a 100644 --- a/kernel/net/mac802154/util.c +++ b/kernel/net/mac802154/util.c @@ -14,6 +14,7 @@ */ #include "ieee802154_i.h" +#include "driver-ops.h" /* privid for wpan_phys to determine whether they belong to us or not */ const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid; @@ -85,11 +86,17 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, hrtimer_start(&local->ifs_timer, ktime_set(0, hw->phy->sifs_period * NSEC_PER_USEC), HRTIMER_MODE_REL); - - consume_skb(skb); } else { ieee802154_wake_queue(hw); - consume_skb(skb); } + + dev_consume_skb_any(skb); } EXPORT_SYMBOL(ieee802154_xmit_complete); + +void ieee802154_stop_device(struct ieee802154_local *local) +{ + flush_workqueue(local->workqueue); + hrtimer_cancel(&local->ifs_timer); + drv_stop(local); +} diff --git a/kernel/net/mpls/Kconfig b/kernel/net/mpls/Kconfig index 17bde799c..5c467ef97 100644 --- a/kernel/net/mpls/Kconfig 
+++ b/kernel/net/mpls/Kconfig @@ -24,7 +24,13 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" - help + ---help--- Add support for forwarding of mpls packets. +config MPLS_IPTUNNEL + tristate "MPLS: IP over MPLS tunnel support" + depends on LWTUNNEL && MPLS_ROUTING + ---help--- + mpls ip tunnel support. + endif # MPLS diff --git a/kernel/net/mpls/Makefile b/kernel/net/mpls/Makefile index 65bbe68c7..9ca923625 100644 --- a/kernel/net/mpls/Makefile +++ b/kernel/net/mpls/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o +obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o mpls_router-y := af_mpls.o diff --git a/kernel/net/mpls/af_mpls.c b/kernel/net/mpls/af_mpls.c index 1f93a5978..c32fc411a 100644 --- a/kernel/net/mpls/af_mpls.c +++ b/kernel/net/mpls/af_mpls.c @@ -15,24 +15,19 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#endif +#include #include "internal.h" -#define LABEL_NOT_SPECIFIED (1<<20) -#define MAX_NEW_LABELS 2 - -/* This maximum ha length copied from the definition of struct neighbour */ -#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))) - -struct mpls_route { /* next hop label forwarding entry */ - struct net_device __rcu *rt_dev; - struct rcu_head rt_rcu; - u32 rt_label[MAX_NEW_LABELS]; - u8 rt_protocol; /* routing protocol that set this entry */ - u8 rt_labels; - u8 rt_via_alen; - u8 rt_via_table; - u8 rt_via[0]; -}; +/* Maximum number of labels to look ahead at when selecting a path of + * a multipath route + */ +#define MAX_MP_SELECT_LABELS 4 + +#define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) static int zero = 0; static int label_limit = (1 << 20) - 1; @@ -58,24 +53,40 @@ static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->mpls_ptr); } -static bool mpls_output_possible(const struct net_device *dev) +bool mpls_output_possible(const struct net_device *dev) { 
return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); } +EXPORT_SYMBOL_GPL(mpls_output_possible); + +static u8 *__mpls_nh_via(struct mpls_route *rt, struct mpls_nh *nh) +{ + u8 *nh0_via = PTR_ALIGN((u8 *)&rt->rt_nh[rt->rt_nhn], VIA_ALEN_ALIGN); + int nh_index = nh - rt->rt_nh; + + return nh0_via + rt->rt_max_alen * nh_index; +} + +static const u8 *mpls_nh_via(const struct mpls_route *rt, + const struct mpls_nh *nh) +{ + return __mpls_nh_via((struct mpls_route *)rt, (struct mpls_nh *)nh); +} -static unsigned int mpls_rt_header_size(const struct mpls_route *rt) +static unsigned int mpls_nh_header_size(const struct mpls_nh *nh) { /* The size of the layer 2.5 labels to be added for this route */ - return rt->rt_labels * sizeof(struct mpls_shim_hdr); + return nh->nh_labels * sizeof(struct mpls_shim_hdr); } -static unsigned int mpls_dev_mtu(const struct net_device *dev) +unsigned int mpls_dev_mtu(const struct net_device *dev) { /* The amount of data the layer 2 frame can hold */ return dev->mtu; } +EXPORT_SYMBOL_GPL(mpls_dev_mtu); -static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; @@ -85,20 +96,87 @@ static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) return true; } +EXPORT_SYMBOL_GPL(mpls_pkt_too_big); + +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, + struct sk_buff *skb, bool bos) +{ + struct mpls_entry_decoded dec; + struct mpls_shim_hdr *hdr; + bool eli_seen = false; + int label_index; + int nh_index = 0; + u32 hash = 0; + + /* No need to look further into packet if there's only + * one path + */ + if (rt->rt_nhn == 1) + goto out; + + for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos; + label_index++) { + if (!pskb_may_pull(skb, sizeof(*hdr) * label_index)) + break; + + /* Read and decode the current label */ + hdr = mpls_hdr(skb) + label_index; + dec = 
mpls_entry_decode(hdr); + + /* RFC6790 - reserved labels MUST NOT be used as keys + * for the load-balancing function + */ + if (likely(dec.label >= MPLS_LABEL_FIRST_UNRESERVED)) { + hash = jhash_1word(dec.label, hash); + + /* The entropy label follows the entropy label + * indicator, so this means that the entropy + * label was just added to the hash - no need to + * go any deeper either in the label stack or in the + * payload + */ + if (eli_seen) + break; + } else if (dec.label == MPLS_LABEL_ENTROPY) { + eli_seen = true; + } + + bos = dec.bos; + if (bos && pskb_may_pull(skb, sizeof(*hdr) * label_index + + sizeof(struct iphdr))) { + const struct iphdr *v4hdr; + + v4hdr = (const struct iphdr *)(mpls_hdr(skb) + + label_index); + if (v4hdr->version == 4) { + hash = jhash_3words(ntohl(v4hdr->saddr), + ntohl(v4hdr->daddr), + v4hdr->protocol, hash); + } else if (v4hdr->version == 6 && + pskb_may_pull(skb, sizeof(*hdr) * label_index + + sizeof(struct ipv6hdr))) { + const struct ipv6hdr *v6hdr; + + v6hdr = (const struct ipv6hdr *)(mpls_hdr(skb) + + label_index); + + hash = __ipv6_addr_jhash(&v6hdr->saddr, hash); + hash = __ipv6_addr_jhash(&v6hdr->daddr, hash); + hash = jhash_1word(v6hdr->nexthdr, hash); + } + } + } + + nh_index = hash % rt->rt_nhn; +out: + return &rt->rt_nh[nh_index]; +} static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, struct mpls_entry_decoded dec) { - /* RFC4385 and RFC5586 encode other packets in mpls such that - * they don't conflict with the ip version number, making - * decoding by examining the ip version correct in everything - * except for the strangest cases. - * - * The strange cases if we choose to support them will require - * manual configuration. - */ - struct iphdr *hdr4; - bool success = true; + enum mpls_payload_type payload_type; + bool success = false; /* The IPv4 code below accesses through the IPv4 header * checksum, which is 12 bytes into the packet. 
@@ -113,23 +191,32 @@ static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, if (!pskb_may_pull(skb, 12)) return false; - /* Use ip_hdr to find the ip protocol version */ - hdr4 = ip_hdr(skb); - if (hdr4->version == 4) { + payload_type = rt->rt_payload_type; + if (payload_type == MPT_UNSPEC) + payload_type = ip_hdr(skb)->version; + + switch (payload_type) { + case MPT_IPV4: { + struct iphdr *hdr4 = ip_hdr(skb); skb->protocol = htons(ETH_P_IP); csum_replace2(&hdr4->check, htons(hdr4->ttl << 8), htons(dec.ttl << 8)); hdr4->ttl = dec.ttl; + success = true; + break; } - else if (hdr4->version == 6) { + case MPT_IPV6: { struct ipv6hdr *hdr6 = ipv6_hdr(skb); skb->protocol = htons(ETH_P_IPV6); hdr6->hop_limit = dec.ttl; + success = true; + break; } - else - /* version 0 and version 1 are used by pseudo wires */ - success = false; + case MPT_UNSPEC: + break; + } + return success; } @@ -139,6 +226,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct net *net = dev_net(dev); struct mpls_shim_hdr *hdr; struct mpls_route *rt; + struct mpls_nh *nh; struct mpls_entry_decoded dec; struct net_device *out_dev; struct mpls_dev *mdev; @@ -176,8 +264,12 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (!rt) goto drop; + nh = mpls_select_multipath(rt, skb, dec.bos); + if (!nh) + goto drop; + /* Find the output device */ - out_dev = rcu_dereference(rt->rt_dev); + out_dev = rcu_dereference(nh->nh_dev); if (!mpls_output_possible(out_dev)) goto drop; @@ -192,7 +284,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, dec.ttl -= 1; /* Verify the destination can hold the packet */ - new_header_size = mpls_rt_header_size(rt); + new_header_size = mpls_nh_header_size(nh); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) goto drop; @@ -220,13 +312,20 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Push the new labels */ hdr = mpls_hdr(skb); bos = 
dec.bos; - for (i = rt->rt_labels - 1; i >= 0; i--) { - hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos); + for (i = nh->nh_labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(nh->nh_label[i], + dec.ttl, 0, bos); bos = false; } } - err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb); + /* If via wasn't specified then send out using device address */ + if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) + err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, + out_dev->dev_addr, skb); + else + err = neigh_xmit(nh->nh_via_table, out_dev, + mpls_nh_via(rt, nh), skb); if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); @@ -248,25 +347,35 @@ static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { }; struct mpls_route_config { - u32 rc_protocol; - u32 rc_ifindex; - u16 rc_via_table; - u16 rc_via_alen; - u8 rc_via[MAX_VIA_ALEN]; - u32 rc_label; - u32 rc_output_labels; - u32 rc_output_label[MAX_NEW_LABELS]; - u32 rc_nlflags; - struct nl_info rc_nlinfo; + u32 rc_protocol; + u32 rc_ifindex; + u8 rc_via_table; + u8 rc_via_alen; + u8 rc_via[MAX_VIA_ALEN]; + u32 rc_label; + u8 rc_output_labels; + u32 rc_output_label[MAX_NEW_LABELS]; + u32 rc_nlflags; + enum mpls_payload_type rc_payload_type; + struct nl_info rc_nlinfo; + struct rtnexthop *rc_mp; + int rc_mp_len; }; -static struct mpls_route *mpls_rt_alloc(size_t alen) +static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen) { + u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN); struct mpls_route *rt; - rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL); - if (rt) - rt->rt_via_alen = alen; + rt = kzalloc(ALIGN(sizeof(*rt) + num_nh * sizeof(*rt->rt_nh), + VIA_ALEN_ALIGN) + + num_nh * max_alen_aligned, + GFP_KERNEL); + if (rt) { + rt->rt_nhn = num_nh; + rt->rt_max_alen = max_alen_aligned; + } + return rt; } @@ -286,30 +395,27 @@ static void mpls_notify_route(struct net *net, unsigned index, struct mpls_route *rt = new ? new : old; unsigned nlm_flags = (old && new) ? 
NLM_F_REPLACE : 0; /* Ignore reserved labels for now */ - if (rt && (index >= 16)) + if (rt && (index >= MPLS_LABEL_FIRST_UNRESERVED)) rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); } static void mpls_route_update(struct net *net, unsigned index, - struct net_device *dev, struct mpls_route *new, + struct mpls_route *new, const struct nl_info *info) { struct mpls_route __rcu **platform_label; - struct mpls_route *rt, *old = NULL; + struct mpls_route *rt; ASSERT_RTNL(); platform_label = rtnl_dereference(net->mpls.platform_label); rt = rtnl_dereference(platform_label[index]); - if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) { - rcu_assign_pointer(platform_label[index], new); - old = rt; - } + rcu_assign_pointer(platform_label[index], new); - mpls_notify_route(net, index, old, new, info); + mpls_notify_route(net, index, rt, new, info); /* If we removed a route free it now */ - mpls_rt_free(old); + mpls_rt_free(rt); } static unsigned find_free_label(struct net *net) @@ -320,22 +426,300 @@ static unsigned find_free_label(struct net *net) platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; - for (index = 16; index < platform_labels; index++) { + for (index = MPLS_LABEL_FIRST_UNRESERVED; index < platform_labels; + index++) { if (!rtnl_dereference(platform_label[index])) return index; } return LABEL_NOT_SPECIFIED; } +#if IS_ENABLED(CONFIG_INET) +static struct net_device *inet_fib_lookup_dev(struct net *net, + const void *addr) +{ + struct net_device *dev; + struct rtable *rt; + struct in_addr daddr; + + memcpy(&daddr, addr, sizeof(struct in_addr)); + rt = ip_route_output(net, daddr.s_addr, 0, 0, 0); + if (IS_ERR(rt)) + return ERR_CAST(rt); + + dev = rt->dst.dev; + dev_hold(dev); + + ip_rt_put(rt); + + return dev; +} +#else +static struct net_device *inet_fib_lookup_dev(struct net *net, + const void *addr) +{ + return ERR_PTR(-EAFNOSUPPORT); +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static struct 
net_device *inet6_fib_lookup_dev(struct net *net, + const void *addr) +{ + struct net_device *dev; + struct dst_entry *dst; + struct flowi6 fl6; + int err; + + if (!ipv6_stub) + return ERR_PTR(-EAFNOSUPPORT); + + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); + err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6); + if (err) + return ERR_PTR(err); + + dev = dst->dev; + dev_hold(dev); + dst_release(dst); + + return dev; +} +#else +static struct net_device *inet6_fib_lookup_dev(struct net *net, + const void *addr) +{ + return ERR_PTR(-EAFNOSUPPORT); +} +#endif + +static struct net_device *find_outdev(struct net *net, + struct mpls_route *rt, + struct mpls_nh *nh, int oif) +{ + struct net_device *dev = NULL; + + if (!oif) { + switch (nh->nh_via_table) { + case NEIGH_ARP_TABLE: + dev = inet_fib_lookup_dev(net, mpls_nh_via(rt, nh)); + break; + case NEIGH_ND_TABLE: + dev = inet6_fib_lookup_dev(net, mpls_nh_via(rt, nh)); + break; + case NEIGH_LINK_TABLE: + break; + } + } else { + dev = dev_get_by_index(net, oif); + } + + if (!dev) + return ERR_PTR(-ENODEV); + + /* The caller is holding rtnl anyways, so release the dev reference */ + dev_put(dev); + + return dev; +} + +static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, + struct mpls_nh *nh, int oif) +{ + struct net_device *dev = NULL; + int err = -ENODEV; + + dev = find_outdev(net, rt, nh, oif); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + dev = NULL; + goto errout; + } + + /* Ensure this is a supported device */ + err = -EINVAL; + if (!mpls_dev_get(dev)) + goto errout; + + if ((nh->nh_via_table == NEIGH_LINK_TABLE) && + (dev->addr_len != nh->nh_via_alen)) + goto errout; + + RCU_INIT_POINTER(nh->nh_dev, dev); + + return 0; + +errout: + return err; +} + +static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, + struct mpls_route *rt) +{ + struct net *net = cfg->rc_nlinfo.nl_net; + struct mpls_nh *nh = rt->rt_nh; + int err; + int i; + + if (!nh) + return 
-ENOMEM; + + err = -EINVAL; + /* Ensure only a supported number of labels are present */ + if (cfg->rc_output_labels > MAX_NEW_LABELS) + goto errout; + + nh->nh_labels = cfg->rc_output_labels; + for (i = 0; i < nh->nh_labels; i++) + nh->nh_label[i] = cfg->rc_output_label[i]; + + nh->nh_via_table = cfg->rc_via_table; + memcpy(__mpls_nh_via(rt, nh), cfg->rc_via, cfg->rc_via_alen); + nh->nh_via_alen = cfg->rc_via_alen; + + err = mpls_nh_assign_dev(net, rt, nh, cfg->rc_ifindex); + if (err) + goto errout; + + return 0; + +errout: + return err; +} + +static int mpls_nh_build(struct net *net, struct mpls_route *rt, + struct mpls_nh *nh, int oif, + struct nlattr *via, struct nlattr *newdst) +{ + int err = -ENOMEM; + + if (!nh) + goto errout; + + if (newdst) { + err = nla_get_labels(newdst, MAX_NEW_LABELS, + &nh->nh_labels, nh->nh_label); + if (err) + goto errout; + } + + if (via) { + err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table, + __mpls_nh_via(rt, nh)); + if (err) + goto errout; + } else { + nh->nh_via_table = MPLS_NEIGH_TABLE_UNSPEC; + } + + err = mpls_nh_assign_dev(net, rt, nh, oif); + if (err) + goto errout; + + return 0; + +errout: + return err; +} + +static int mpls_count_nexthops(struct rtnexthop *rtnh, int len, + u8 cfg_via_alen, u8 *max_via_alen) +{ + int nhs = 0; + int remaining = len; + + if (!rtnh) { + *max_via_alen = cfg_via_alen; + return 1; + } + + *max_via_alen = 0; + + while (rtnh_ok(rtnh, remaining)) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + int attrlen; + + attrlen = rtnh_attrlen(rtnh); + nla = nla_find(attrs, attrlen, RTA_VIA); + if (nla && nla_len(nla) >= + offsetof(struct rtvia, rtvia_addr)) { + int via_alen = nla_len(nla) - + offsetof(struct rtvia, rtvia_addr); + + if (via_alen <= MAX_VIA_ALEN) + *max_via_alen = max_t(u16, *max_via_alen, + via_alen); + } + + nhs++; + rtnh = rtnh_next(rtnh, &remaining); + } + + /* leftover implies invalid nexthop configuration, discard it */ + return remaining > 0 ? 
0 : nhs; +} + +static int mpls_nh_build_multi(struct mpls_route_config *cfg, + struct mpls_route *rt) +{ + struct rtnexthop *rtnh = cfg->rc_mp; + struct nlattr *nla_via, *nla_newdst; + int remaining = cfg->rc_mp_len; + int nhs = 0; + int err = 0; + + change_nexthops(rt) { + int attrlen; + + nla_via = NULL; + nla_newdst = NULL; + + err = -EINVAL; + if (!rtnh_ok(rtnh, remaining)) + goto errout; + + /* neither weighted multipath nor any flags + * are supported + */ + if (rtnh->rtnh_hops || rtnh->rtnh_flags) + goto errout; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *attrs = rtnh_attrs(rtnh); + + nla_via = nla_find(attrs, attrlen, RTA_VIA); + nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST); + } + + err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh, + rtnh->rtnh_ifindex, nla_via, + nla_newdst); + if (err) + goto errout; + + rtnh = rtnh_next(rtnh, &remaining); + nhs++; + } endfor_nexthops(rt); + + rt->rt_nhn = nhs; + + return 0; + +errout: + return err; +} + static int mpls_route_add(struct mpls_route_config *cfg) { struct mpls_route __rcu **platform_label; struct net *net = cfg->rc_nlinfo.nl_net; - struct net_device *dev = NULL; struct mpls_route *rt, *old; - unsigned index; - int i; int err = -EINVAL; + u8 max_via_alen; + unsigned index; + int nhs; index = cfg->rc_label; @@ -345,33 +729,14 @@ static int mpls_route_add(struct mpls_route_config *cfg) index = find_free_label(net); } - /* The first 16 labels are reserved, and may not be set */ - if (index < 16) + /* Reserved labels may not be set */ + if (index < MPLS_LABEL_FIRST_UNRESERVED) goto errout; /* The full 20 bit range may not be supported. 
*/ if (index >= net->mpls.platform_labels) goto errout; - /* Ensure only a supported number of labels are present */ - if (cfg->rc_output_labels > MAX_NEW_LABELS) - goto errout; - - err = -ENODEV; - dev = dev_get_by_index(net, cfg->rc_ifindex); - if (!dev) - goto errout; - - /* Ensure this is a supported device */ - err = -EINVAL; - if (!mpls_dev_get(dev)) - goto errout; - - err = -EINVAL; - if ((cfg->rc_via_table == NEIGH_LINK_TABLE) && - (dev->addr_len != cfg->rc_via_alen)) - goto errout; - /* Append makes no sense with mpls */ err = -EOPNOTSUPP; if (cfg->rc_nlflags & NLM_F_APPEND) @@ -391,27 +756,34 @@ static int mpls_route_add(struct mpls_route_config *cfg) if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old) goto errout; + err = -EINVAL; + nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len, + cfg->rc_via_alen, &max_via_alen); + if (nhs == 0) + goto errout; + err = -ENOMEM; - rt = mpls_rt_alloc(cfg->rc_via_alen); + rt = mpls_rt_alloc(nhs, max_via_alen); if (!rt) goto errout; - rt->rt_labels = cfg->rc_output_labels; - for (i = 0; i < rt->rt_labels; i++) - rt->rt_label[i] = cfg->rc_output_label[i]; rt->rt_protocol = cfg->rc_protocol; - RCU_INIT_POINTER(rt->rt_dev, dev); - rt->rt_via_table = cfg->rc_via_table; - memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen); + rt->rt_payload_type = cfg->rc_payload_type; - mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo); + if (cfg->rc_mp) + err = mpls_nh_build_multi(cfg, rt); + else + err = mpls_nh_build_from_cfg(cfg, rt); + if (err) + goto freert; + + mpls_route_update(net, index, rt, &cfg->rc_nlinfo); - dev_put(dev); return 0; +freert: + mpls_rt_free(rt); errout: - if (dev) - dev_put(dev); return err; } @@ -423,15 +795,15 @@ static int mpls_route_del(struct mpls_route_config *cfg) index = cfg->rc_label; - /* The first 16 labels are reserved, and may not be removed */ - if (index < 16) + /* Reserved labels may not be removed */ + if (index < MPLS_LABEL_FIRST_UNRESERVED) goto errout; /* The full 20 bit range may not be 
supported */ if (index >= net->mpls.platform_labels) goto errout; - mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo); + mpls_route_update(net, index, NULL, &cfg->rc_nlinfo); err = 0; errout: @@ -528,9 +900,11 @@ static void mpls_ifdown(struct net_device *dev) struct mpls_route *rt = rtnl_dereference(platform_label[index]); if (!rt) continue; - if (rtnl_dereference(rt->rt_dev) != dev) - continue; - rt->rt_dev = NULL; + for_nexthops(rt) { + if (rtnl_dereference(nh->nh_dev) != dev) + continue; + nh->nh_dev = NULL; + } endfor_nexthops(rt); } mdev = mpls_dev_get(dev); @@ -626,9 +1000,10 @@ int nla_put_labels(struct sk_buff *skb, int attrtype, return 0; } +EXPORT_SYMBOL_GPL(nla_put_labels); int nla_get_labels(const struct nlattr *nla, - u32 max_labels, u32 *labels, u32 label[]) + u32 max_labels, u8 *labels, u32 label[]) { unsigned len = nla_len(nla); unsigned nla_labels; @@ -671,6 +1046,49 @@ int nla_get_labels(const struct nlattr *nla, *labels = nla_labels; return 0; } +EXPORT_SYMBOL_GPL(nla_get_labels); + +int nla_get_via(const struct nlattr *nla, u8 *via_alen, + u8 *via_table, u8 via_addr[]) +{ + struct rtvia *via = nla_data(nla); + int err = -EINVAL; + int alen; + + if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) + goto errout; + alen = nla_len(nla) - + offsetof(struct rtvia, rtvia_addr); + if (alen > MAX_VIA_ALEN) + goto errout; + + /* Validate the address family */ + switch (via->rtvia_family) { + case AF_PACKET: + *via_table = NEIGH_LINK_TABLE; + break; + case AF_INET: + *via_table = NEIGH_ARP_TABLE; + if (alen != 4) + goto errout; + break; + case AF_INET6: + *via_table = NEIGH_ND_TABLE; + if (alen != 16) + goto errout; + break; + default: + /* Unsupported address family */ + goto errout; + } + + memcpy(via_addr, via->rtvia_addr, alen); + *via_alen = alen; + err = 0; + +errout: + return err; +} static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct mpls_route_config *cfg) @@ -713,6 +1131,7 @@ static int 
rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->rc_label = LABEL_NOT_SPECIFIED; cfg->rc_protocol = rtm->rtm_protocol; + cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC; cfg->rc_nlflags = nlh->nlmsg_flags; cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->rc_nlinfo.nlh = nlh; @@ -735,48 +1154,28 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, break; case RTA_DST: { - u32 label_count; + u8 label_count; if (nla_get_labels(nla, 1, &label_count, &cfg->rc_label)) goto errout; - /* The first 16 labels are reserved, and may not be set */ - if (cfg->rc_label < 16) + /* Reserved labels may not be set */ + if (cfg->rc_label < MPLS_LABEL_FIRST_UNRESERVED) goto errout; break; } case RTA_VIA: { - struct rtvia *via = nla_data(nla); - if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) - goto errout; - cfg->rc_via_alen = nla_len(nla) - - offsetof(struct rtvia, rtvia_addr); - if (cfg->rc_via_alen > MAX_VIA_ALEN) - goto errout; - - /* Validate the address family */ - switch(via->rtvia_family) { - case AF_PACKET: - cfg->rc_via_table = NEIGH_LINK_TABLE; - break; - case AF_INET: - cfg->rc_via_table = NEIGH_ARP_TABLE; - if (cfg->rc_via_alen != 4) - goto errout; - break; - case AF_INET6: - cfg->rc_via_table = NEIGH_ND_TABLE; - if (cfg->rc_via_alen != 16) - goto errout; - break; - default: - /* Unsupported address family */ + if (nla_get_via(nla, &cfg->rc_via_alen, + &cfg->rc_via_table, cfg->rc_via)) goto errout; - } - - memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen); + break; + } + case RTA_MULTIPATH: + { + cfg->rc_mp = nla_data(nla); + cfg->rc_mp_len = nla_len(nla); break; } default: @@ -837,16 +1236,54 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; - if (rt->rt_labels && - nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label)) - goto nla_put_failure; - if (nla_put_via(skb, rt->rt_via_table, rt->rt_via, rt->rt_via_alen)) - goto 
nla_put_failure; - dev = rtnl_dereference(rt->rt_dev); - if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) - goto nla_put_failure; if (nla_put_labels(skb, RTA_DST, 1, &label)) goto nla_put_failure; + if (rt->rt_nhn == 1) { + const struct mpls_nh *nh = rt->rt_nh; + + if (nh->nh_labels && + nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, + nh->nh_label)) + goto nla_put_failure; + if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && + nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), + nh->nh_via_alen)) + goto nla_put_failure; + dev = rtnl_dereference(nh->nh_dev); + if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) + goto nla_put_failure; + } else { + struct rtnexthop *rtnh; + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + for_nexthops(rt) { + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); + if (!rtnh) + goto nla_put_failure; + + dev = rtnl_dereference(nh->nh_dev); + if (dev) + rtnh->rtnh_ifindex = dev->ifindex; + if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, + nh->nh_labels, + nh->nh_label)) + goto nla_put_failure; + if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && + nla_put_via(skb, nh->nh_via_table, + mpls_nh_via(rt, nh), + nh->nh_via_alen)) + goto nla_put_failure; + + /* length of rtnetlink header + attributes */ + rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; + } endfor_nexthops(rt); + + nla_nest_end(skb, mp); + } nlmsg_end(skb, nlh); return 0; @@ -866,8 +1303,8 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) ASSERT_RTNL(); index = cb->args[0]; - if (index < 16) - index = 16; + if (index < MPLS_LABEL_FIRST_UNRESERVED) + index = MPLS_LABEL_FIRST_UNRESERVED; platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; @@ -891,12 +1328,33 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) - + nla_total_size(2 + rt->rt_via_alen) /* RTA_VIA */ + 
nla_total_size(4); /* RTA_DST */ - if (rt->rt_labels) /* RTA_NEWDST */ - payload += nla_total_size(rt->rt_labels * 4); - if (rt->rt_dev) /* RTA_OIF */ - payload += nla_total_size(4); + + if (rt->rt_nhn == 1) { + struct mpls_nh *nh = rt->rt_nh; + + if (nh->nh_dev) + payload += nla_total_size(4); /* RTA_OIF */ + if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) /* RTA_VIA */ + payload += nla_total_size(2 + nh->nh_via_alen); + if (nh->nh_labels) /* RTA_NEWDST */ + payload += nla_total_size(nh->nh_labels * 4); + } else { + /* each nexthop is packed in an attribute */ + size_t nhsize = 0; + + for_nexthops(rt) { + nhsize += nla_total_size(sizeof(struct rtnexthop)); + /* RTA_VIA */ + if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) + nhsize += nla_total_size(2 + nh->nh_via_alen); + if (nh->nh_labels) + nhsize += nla_total_size(nh->nh_labels * 4); + } endfor_nexthops(rt); + /* nested attribute */ + payload += nla_total_size(nhsize); + } + return payload; } @@ -948,23 +1406,29 @@ static int resize_platform_label_table(struct net *net, size_t limit) /* In case the predefined labels need to be populated */ if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; - rt0 = mpls_rt_alloc(lo->addr_len); + rt0 = mpls_rt_alloc(1, lo->addr_len); if (!rt0) goto nort0; - RCU_INIT_POINTER(rt0->rt_dev, lo); + RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo); rt0->rt_protocol = RTPROT_KERNEL; - rt0->rt_via_table = NEIGH_LINK_TABLE; - memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); + rt0->rt_payload_type = MPT_IPV4; + rt0->rt_nh->nh_via_table = NEIGH_LINK_TABLE; + rt0->rt_nh->nh_via_alen = lo->addr_len; + memcpy(__mpls_nh_via(rt0, rt0->rt_nh), lo->dev_addr, + lo->addr_len); } if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; - rt2 = mpls_rt_alloc(lo->addr_len); + rt2 = mpls_rt_alloc(1, lo->addr_len); if (!rt2) goto nort2; - RCU_INIT_POINTER(rt2->rt_dev, lo); + RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo); rt2->rt_protocol = RTPROT_KERNEL; - 
rt2->rt_via_table = NEIGH_LINK_TABLE; - memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len); + rt2->rt_payload_type = MPT_IPV6; + rt2->rt_nh->nh_via_table = NEIGH_LINK_TABLE; + rt2->rt_nh->nh_via_alen = lo->addr_len; + memcpy(__mpls_nh_via(rt2, rt2->rt_nh), lo->dev_addr, + lo->addr_len); } rtnl_lock(); @@ -974,7 +1438,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) /* Free any labels beyond the new table */ for (index = limit; index < old_limit; index++) - mpls_route_update(net, index, NULL, NULL, NULL); + mpls_route_update(net, index, NULL, NULL); /* Copy over the old labels */ cp_size = size; @@ -1066,8 +1530,10 @@ static int mpls_net_init(struct net *net) table[0].data = net; net->mpls.ctl = register_net_sysctl(net, "net/mpls", table); - if (net->mpls.ctl == NULL) + if (net->mpls.ctl == NULL) { + kfree(table); return -ENOMEM; + } return 0; } diff --git a/kernel/net/mpls/internal.h b/kernel/net/mpls/internal.h index 8cabeb5a1..bde52ce88 100644 --- a/kernel/net/mpls/internal.h +++ b/kernel/net/mpls/internal.h @@ -21,6 +21,76 @@ struct mpls_dev { struct sk_buff; +#define LABEL_NOT_SPECIFIED (1 << 20) +#define MAX_NEW_LABELS 2 + +/* This maximum ha length copied from the definition of struct neighbour */ +#define VIA_ALEN_ALIGN sizeof(unsigned long) +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, VIA_ALEN_ALIGN)) + +enum mpls_payload_type { + MPT_UNSPEC, /* IPv4 or IPv6 */ + MPT_IPV4 = 4, + MPT_IPV6 = 6, + + /* Other types not implemented: + * - Pseudo-wire with or without control word (RFC4385) + * - GAL (RFC5586) + */ +}; + +struct mpls_nh { /* next hop label forwarding entry */ + struct net_device __rcu *nh_dev; + u32 nh_label[MAX_NEW_LABELS]; + u8 nh_labels; + u8 nh_via_alen; + u8 nh_via_table; +}; + +/* The route, nexthops and vias are stored together in the same memory + * block: + * + * +----------------------+ + * | mpls_route | + * +----------------------+ + * | mpls_nh 0 | + * +----------------------+ + * | ... 
| + * +----------------------+ + * | mpls_nh n-1 | + * +----------------------+ + * | alignment padding | + * +----------------------+ + * | via[rt_max_alen] 0 | + * +----------------------+ + * | ... | + * +----------------------+ + * | via[rt_max_alen] n-1 | + * +----------------------+ + */ +struct mpls_route { /* next hop label forwarding entry */ + struct rcu_head rt_rcu; + u8 rt_protocol; + u8 rt_payload_type; + u8 rt_max_alen; + unsigned int rt_nhn; + struct mpls_nh rt_nh[0]; +}; + +#define for_nexthops(rt) { \ + int nhsel; struct mpls_nh *nh; \ + for (nhsel = 0, nh = (rt)->rt_nh; \ + nhsel < (rt)->rt_nhn; \ + nh++, nhsel++) + +#define change_nexthops(rt) { \ + int nhsel; struct mpls_nh *nh; \ + for (nhsel = 0, nh = (struct mpls_nh *)((rt)->rt_nh); \ + nhsel < (rt)->rt_nhn; \ + nh++, nhsel++) + +#define endfor_nexthops(rt) } + static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) { return (struct mpls_shim_hdr *)skb_network_header(skb); @@ -50,7 +120,14 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr * return result; } -int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); -int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]); +int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, + const u32 label[]); +int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels, + u32 label[]); +int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, + u8 via[]); +bool mpls_output_possible(const struct net_device *dev); +unsigned int mpls_dev_mtu(const struct net_device *dev); +bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); #endif /* MPLS_INTERNAL_H */ diff --git a/kernel/net/mpls/mpls_gso.c b/kernel/net/mpls/mpls_gso.c index 809df534a..0183b32da 100644 --- a/kernel/net/mpls/mpls_gso.c +++ b/kernel/net/mpls/mpls_gso.c @@ -62,6 +62,7 @@ out: static struct packet_offload mpls_mc_offload __read_mostly = 
{ .type = cpu_to_be16(ETH_P_MPLS_MC), + .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, @@ -69,6 +70,7 @@ static struct packet_offload mpls_mc_offload __read_mostly = { static struct packet_offload mpls_uc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), + .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, diff --git a/kernel/net/mpls/mpls_iptunnel.c b/kernel/net/mpls/mpls_iptunnel.c new file mode 100644 index 000000000..64afd3d0b --- /dev/null +++ b/kernel/net/mpls/mpls_iptunnel.c @@ -0,0 +1,231 @@ +/* + * mpls tunnels An implementation mpls tunnels using the light weight tunnel + * infrastructure + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { + [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, +}; + +static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) +{ + /* The size of the layer 2.5 labels to be added for this route */ + return en->labels * sizeof(struct mpls_shim_hdr); +} + +int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct mpls_shim_hdr *hdr; + struct net_device *out_dev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = NULL; + struct rt6_info *rt6 = NULL; + int err = 0; + bool bos; + int i; + unsigned int ttl; + + /* Obtain the ttl */ + if (dst->ops->family == AF_INET) { + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + } else if 
(dst->ops->family == AF_INET6) { + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + } else { + goto drop; + } + + skb_orphan(skb); + + /* Find the output device */ + out_dev = dst->dev; + if (!mpls_output_possible(out_dev) || + !dst->lwtstate || skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); + + /* Verify the destination can hold the packet */ + new_header_size = mpls_encap_size(tun_encap_info); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = true; + for (i = tun_encap_info->labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(tun_encap_info->label[i], + ttl, 0, bos); + bos = false; + } + + if (rt) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, + skb); + else if (rt6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, + skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + + return 0; + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int mpls_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; + struct lwtunnel_state *newts; + int tun_encap_info_len; + int ret; + + ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, + mpls_iptunnel_policy); + if (ret < 0) + return ret; + + if (!tb[MPLS_IPTUNNEL_DST]) + return -EINVAL; + + tun_encap_info_len = sizeof(*tun_encap_info); + + newts = 
lwtunnel_state_alloc(tun_encap_info_len); + if (!newts) + return -ENOMEM; + + newts->len = tun_encap_info_len; + tun_encap_info = mpls_lwtunnel_encap(newts); + ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, + &tun_encap_info->labels, tun_encap_info->label); + if (ret) + goto errout; + newts->type = LWTUNNEL_ENCAP_MPLS; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = newts; + + return 0; + +errout: + kfree(newts); + *ts = NULL; + + return ret; +} + +static int mpls_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, + tun_encap_info->label)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + return nla_total_size(tun_encap_info->labels * 4); +} + +static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); + struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); + int l; + + if (a_hdr->labels != b_hdr->labels) + return 1; + + for (l = 0; l < MAX_NEW_LABELS; l++) + if (a_hdr->label[l] != b_hdr->label[l]) + return 1; + return 0; +} + +static const struct lwtunnel_encap_ops mpls_iptun_ops = { + .build_state = mpls_build_state, + .output = mpls_output, + .fill_encap = mpls_fill_encap_info, + .get_encap_size = mpls_encap_nlsize, + .cmp_encap = mpls_encap_cmp, +}; + +static int __init mpls_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_init(mpls_iptunnel_init); + +static void __exit mpls_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_exit(mpls_iptunnel_exit); + 
+MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/net/netfilter/Kconfig b/kernel/net/netfilter/Kconfig index a0f3e6a3c..4692782b5 100644 --- a/kernel/net/netfilter/Kconfig +++ b/kernel/net/netfilter/Kconfig @@ -1,6 +1,14 @@ menu "Core Netfilter Configuration" depends on NET && INET && NETFILTER +config NETFILTER_INGRESS + bool "Netfilter ingress support" + default y + select NET_INGRESS + help + This allows you to classify packets from ingress using the Netfilter + infrastructure. + config NETFILTER_NETLINK tristate @@ -198,7 +206,7 @@ config NF_CONNTRACK_FTP config NF_CONNTRACK_H323 tristate "H.323 protocol support" - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on NETFILTER_ADVANCED help H.323 is a VoIP signalling protocol from ITU-T. As one of the most @@ -346,7 +354,7 @@ config NF_CT_NETLINK_HELPER select NETFILTER_NETLINK depends on NF_CT_NETLINK depends on NETFILTER_NETLINK_QUEUE - depends on NETFILTER_NETLINK_QUEUE_CT + depends on NETFILTER_NETLINK_GLUE_CT depends on NETFILTER_ADVANCED help This option enables the user-space connection tracking helpers @@ -354,13 +362,14 @@ config NF_CT_NETLINK_HELPER If unsure, say `N'. -config NETFILTER_NETLINK_QUEUE_CT - bool "NFQUEUE integration with Connection Tracking" - default n - depends on NETFILTER_NETLINK_QUEUE +config NETFILTER_NETLINK_GLUE_CT + bool "NFQUEUE and NFLOG integration with Connection Tracking" + default n + depends on (NETFILTER_NETLINK_QUEUE || NETFILTER_NETLINK_LOG) && NF_CT_NETLINK help - If this option is enabled, NFQUEUE can include Connection Tracking - information together with the packet is the enqueued via NFNETLINK. + If this option is enabled, NFQUEUE and NFLOG can include + Connection Tracking information together with the packet is + the enqueued via NFNETLINK. config NF_NAT tristate @@ -448,6 +457,11 @@ config NF_TABLES_INET help This option enables support for a mixed IPv4/IPv6 "inet" table. 
+config NF_TABLES_NETDEV + tristate "Netfilter nf_tables netdev tables support" + help + This option enables support for the "netdev" table. + config NFT_EXTHDR tristate "Netfilter nf_tables IPv6 exthdr module" help @@ -710,7 +724,7 @@ config NETFILTER_XT_TARGET_HL config NETFILTER_XT_TARGET_HMARK tristate '"HMARK" target support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED ---help--- This option adds the "HMARK" target. @@ -852,8 +866,10 @@ config NETFILTER_XT_TARGET_REDIRECT config NETFILTER_XT_TARGET_TEE tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK + select NF_DUP_IPV4 + select NF_DUP_IPV6 if IP6_NF_IPTABLES != n ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. @@ -862,11 +878,11 @@ config NETFILTER_XT_TARGET_TPROXY tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 - select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n help This option adds a `TPROXY' target, which is somewhat similar to REDIRECT. 
It can only be used in the mangle table and is useful @@ -902,7 +918,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the @@ -1114,7 +1130,7 @@ config NETFILTER_XT_MATCH_ESP config NETFILTER_XT_MATCH_HASHLIMIT tristate '"hashlimit" match support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED help This option adds a `hashlimit' match. @@ -1356,10 +1372,10 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK - depends on (IPV6 || IPV6=n) - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n select NF_DEFRAG_IPV4 - select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n help This option adds a `socket' match, which can be used to match packets for which a TCP or UDP socket lookup finds a valid socket. 
diff --git a/kernel/net/netfilter/Makefile b/kernel/net/netfilter/Makefile index a87d8b8ec..7638c36b4 100644 --- a/kernel/net/netfilter/Makefile +++ b/kernel/net/netfilter/Makefile @@ -10,8 +10,6 @@ obj-$(CONFIG_NETFILTER) = netfilter.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o -nfnetlink_queue-y := nfnetlink_queue_core.o -nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o @@ -75,6 +73,7 @@ nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o +obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o diff --git a/kernel/net/netfilter/core.c b/kernel/net/netfilter/core.c index f0adf700b..10880c89d 100644 --- a/kernel/net/netfilter/core.c +++ b/kernel/net/netfilter/core.c @@ -40,6 +40,9 @@ EXPORT_SYMBOL(nf_afinfo); const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); +DEFINE_PER_CPU(bool, nf_skb_duplicated); +EXPORT_SYMBOL_GPL(nf_skb_duplicated); + int nf_register_afinfo(const struct nf_afinfo *afinfo) { mutex_lock(&afinfo_mutex); @@ -58,9 +61,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo) } EXPORT_SYMBOL_GPL(nf_unregister_afinfo); -struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; -EXPORT_SYMBOL(nf_hooks); - #ifdef HAVE_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); @@ -68,33 +68,168 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -int nf_register_hook(struct nf_hook_ops *reg) +static struct list_head *nf_find_hook_list(struct net *net, + const struct nf_hook_ops *reg) +{ + struct list_head *hook_list 
= NULL; + + if (reg->pf != NFPROTO_NETDEV) + hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; + else if (reg->hooknum == NF_NETDEV_INGRESS) { +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->dev && dev_net(reg->dev) == net) + hook_list = &reg->dev->nf_hooks_ingress; +#endif + } + return hook_list; +} + +struct nf_hook_entry { + const struct nf_hook_ops *orig_ops; + struct nf_hook_ops ops; +}; + +int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { + struct list_head *hook_list; + struct nf_hook_entry *entry; struct nf_hook_ops *elem; + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->orig_ops = reg; + entry->ops = *reg; + + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) { + kfree(entry); + return -ENOENT; + } + mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + list_for_each_entry(elem, hook_list, list) { if (reg->priority < elem->priority) break; } - list_add_rcu(&reg->list, elem->list.prev); + list_add_rcu(&entry->ops.list, elem->list.prev); mutex_unlock(&nf_hook_mutex); +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + net_inc_ingress_queue(); +#endif #ifdef HAVE_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; } -EXPORT_SYMBOL(nf_register_hook); +EXPORT_SYMBOL(nf_register_net_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { + struct list_head *hook_list; + struct nf_hook_entry *entry; + struct nf_hook_ops *elem; + + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) + return; + mutex_lock(&nf_hook_mutex); - list_del_rcu(&reg->list); + list_for_each_entry(elem, hook_list, list) { + entry = container_of(elem, struct nf_hook_entry, ops); + if (entry->orig_ops == reg) { + list_del_rcu(&entry->ops.list); + break; + } + } mutex_unlock(&nf_hook_mutex); + if (&elem->list == 
hook_list) { + WARN(1, "nf_unregister_net_hook: hook not found!\n"); + return; + } +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + net_dec_ingress_queue(); +#endif #ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); + nf_queue_nf_hook_drop(net, &entry->ops); + /* other cpu might still process nfqueue verdict that used reg */ + synchronize_net(); + kfree(entry); +} +EXPORT_SYMBOL(nf_unregister_net_hook); + +int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = nf_register_net_hook(net, &reg[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + nf_unregister_net_hooks(net, reg, i); + return err; +} +EXPORT_SYMBOL(nf_register_net_hooks); + +void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n) +{ + while (n-- > 0) + nf_unregister_net_hook(net, &reg[n]); +} +EXPORT_SYMBOL(nf_unregister_net_hooks); + +static LIST_HEAD(nf_hook_list); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct net *net, *last; + int ret; + + rtnl_lock(); + for_each_net(net) { + ret = nf_register_net_hook(net, reg); + if (ret && ret != -ENOENT) + goto rollback; + } + list_add_tail(&reg->list, &nf_hook_list); + rtnl_unlock(); + + return 0; +rollback: + last = net; + for_each_net(net) { + if (net == last) + break; + nf_unregister_net_hook(net, reg); + } + rtnl_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_register_hook); + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + struct net *net; + + rtnl_lock(); + list_del(&reg->list); + for_each_net(net) + nf_unregister_net_hook(net, reg); + rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -142,7 +277,7 @@ unsigned int nf_iterate(struct list_head *head, /* Optimization: we don't need to hold module reference here, since function can't sleep. 
--RR */ repeat: - verdict = (*elemp)->hook(*elemp, skb, state); + verdict = (*elemp)->hook((*elemp)->priv, skb, state); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) @@ -172,11 +307,9 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - elem = list_entry_rcu(&nf_hooks[state->pf][state->hook], - struct nf_hook_ops, list); + elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); next_hook: - verdict = nf_iterate(&nf_hooks[state->pf][state->hook], skb, state, - &elem); + verdict = nf_iterate(state->hook_list, skb, state, &elem); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { @@ -188,8 +321,6 @@ next_hook: int err = nf_queue(skb, elem, state, verdict >> NF_VERDICT_QBITS); if (err < 0) { - if (err == -ECANCELED) - goto next_hook; if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) goto next_hook; @@ -223,6 +354,12 @@ int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) } EXPORT_SYMBOL(skb_make_writable); +/* This needs to be compiled in any case to avoid dependencies between the + * nfnetlink_queue code and nf_conntrack. + */ +struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; +EXPORT_SYMBOL_GPL(nfnl_ct_hook); + #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence @@ -260,12 +397,12 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_conntrack_destroy); -struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly; -EXPORT_SYMBOL_GPL(nfq_ct_hook); - -struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly; -EXPORT_SYMBOL_GPL(nfq_ct_nat_hook); - +/* Built-in default zone used e.g. by modules. 
*/ +const struct nf_conntrack_zone nf_ct_zone_dflt = { + .id = NF_CT_DEFAULT_ZONE_ID, + .dir = NF_CT_DEFAULT_ZONE_DIR, +}; +EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); #endif /* CONFIG_NF_CONNTRACK */ #ifdef CONFIG_NF_NAT_NEEDED @@ -273,8 +410,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif +static int nf_register_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + int ret; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) { + ret = nf_register_net_hook(net, elem); + if (ret && ret != -ENOENT) + goto out_undo; + } + rtnl_unlock(); + return 0; + +out_undo: + list_for_each_entry_continue_reverse(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); + return ret; +} + +static void nf_unregister_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); +} + static int __net_init netfilter_net_init(struct net *net) { + int i, h, ret; + + for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&net->nf.hooks[i][h]); + } + #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); @@ -285,11 +460,16 @@ static int __net_init netfilter_net_init(struct net *net) return -ENOMEM; } #endif - return 0; + ret = nf_register_hook_list(net); + if (ret) + remove_proc_entry("netfilter", net->proc_net); + + return ret; } static void __net_exit netfilter_net_exit(struct net *net) { + nf_unregister_hook_list(net); remove_proc_entry("netfilter", net->proc_net); } @@ -300,12 +480,7 @@ static struct pernet_operations netfilter_net_ops = { int __init netfilter_init(void) { - int i, h, ret; - - for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { - for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&nf_hooks[i][h]); - } + int ret; ret = register_pernet_subsys(&netfilter_net_ops); 
if (ret < 0) diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h b/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h index 6f024a8a1..b0bc475f6 100644 --- a/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -33,7 +33,7 @@ #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype MTYPE -#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id)) +#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) @@ -41,7 +41,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct mtype *map = set->data; init_timer(&map->gc); - map->gc.data = (unsigned long) set; + map->gc.data = (unsigned long)set; map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -67,12 +67,9 @@ mtype_destroy(struct ip_set *set) del_timer_sync(&map->gc); ip_set_free(map->members); - if (set->dsize) { - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set); - ip_set_free(map->extensions); - } - kfree(map); + if (set->dsize && set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set); + ip_set_free(map); set->data = NULL; } @@ -92,16 +89,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) { const struct mtype *map = set->data; struct nlattr *nested; + size_t memsize = sizeof(*map) + map->memsize; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + - map->memsize + - set->dsize * map->elements))) + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -144,10 +139,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (ret == 
IPSET_ADD_FAILED) { if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(x, set))) + ip_set_timeout_expired(ext_timeout(x, set))) { ret = 0; - else if (!(flags & IPSET_FLAG_EXIST)) + } else if (!(flags & IPSET_FLAG_EXIST)) { + set_bit(e->id, map->members); return -IPSET_ERR_EXIST; + } /* Element is re-added, cleanup extensions */ ip_set_ext_destroy(set, x); } @@ -165,6 +162,10 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_comment(ext_comment(x, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(x, set), ext); + + /* Activate element */ + set_bit(e->id, map->members); + return 0; } @@ -203,10 +204,13 @@ mtype_list(const struct ip_set *set, struct nlattr *adt, *nested; void *x; u32 id, first = cb->args[IPSET_CB_ARG0]; + int ret = 0; adt = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!adt) return -EMSGSIZE; + /* Extensions may be replaced */ + rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < map->elements; cb->args[IPSET_CB_ARG0]++) { id = cb->args[IPSET_CB_ARG0]; @@ -214,7 +218,7 @@ mtype_list(const struct ip_set *set, if (!test_bit(id, map->members) || (SET_WITH_TIMEOUT(set) && #ifdef IP_SET_BITMAP_STORED_TIMEOUT - mtype_is_filled((const struct mtype_elem *) x) && + mtype_is_filled((const struct mtype_elem *)x) && #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; @@ -222,14 +226,16 @@ mtype_list(const struct ip_set *set, if (!nested) { if (id == first) { nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + + goto nla_put_failure; } if (mtype_do_list(skb, map, id, set->dsize)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, x, - mtype_is_filled((const struct mtype_elem *) x))) + mtype_is_filled((const struct mtype_elem *)x))) goto nla_put_failure; ipset_nest_end(skb, nested); } @@ -238,29 +244,32 @@ mtype_list(const struct ip_set *set, /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto 
out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(id == first)) { cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; } ipset_nest_end(skb, adt); - return 0; +out: + rcu_read_unlock(); + return ret; } static void mtype_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct mtype *map = set->data; void *x; u32 id; /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); + * but adding/deleting new entries is locked out + */ + spin_lock_bh(&set->lock); for (id = 0; id < map->elements; id++) if (mtype_gc_test(id, map, set->dsize)) { x = get_ext(set, map, id); @@ -269,7 +278,7 @@ mtype_gc(unsigned long ul_set) ip_set_ext_destroy(set, x); } } - read_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c b/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c index 55b083ec5..4783efff0 100644 --- a/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -36,11 +36,11 @@ IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); #define MTYPE bitmap_ip +#define HOST_MASK 32 /* Type structure */ struct bitmap_ip { void *members; /* the set members */ - void *extensions; /* data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -48,6 +48,8 @@ struct bitmap_ip { size_t memsize; /* members size */ u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ + unsigned char extensions[0] /* data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ @@ -58,7 +60,7 @@ struct 
bitmap_ip_adt_elem { static inline u32 ip_to_id(const struct bitmap_ip *m, u32 ip) { - return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; + return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts; } /* Common functions */ @@ -80,7 +82,7 @@ static inline int bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, u32 flags, size_t dsize) { - return !!test_and_set_bit(e->id, map->members); + return !!test_bit(e->id, map->members); } static inline int @@ -137,20 +139,17 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -174,11 +173,12 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); - } else + } else { ip_to = ip; + } if (ip_to > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; @@ -189,8 +189,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -225,13 +225,6 @@ 
init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -277,16 +270,17 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr >= 32) + if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); - } else + } else { return -IPSET_ERR_PROTOCOL; + } if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); - if (netmask > 32) + if (netmask > HOST_MASK) return -IPSET_ERR_INVALID_NETMASK; first_ip &= ip_set_hostmask(netmask); @@ -316,13 +310,13 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], pr_debug("hosts %u, elements %llu\n", hosts, (unsigned long long)elements); - map = kzalloc(sizeof(*map), GFP_KERNEL); + set->dsize = ip_set_elem_len(set, tb, 0, 0); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = bitmap_bytes(0, elements - 1); set->variant = &bitmap_ip; - set->dsize = ip_set_elem_len(set, tb, 0); if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { kfree(map); @@ -360,7 +354,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -377,6 +372,7 @@ bitmap_ip_init(void) static void __exit 
bitmap_ip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_ip_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 86104744b..29dde2083 100644 --- a/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -36,6 +36,7 @@ IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); #define MTYPE bitmap_ipmac +#define HOST_MASK 32 #define IP_SET_BITMAP_STORED_TIMEOUT enum { @@ -46,24 +47,26 @@ enum { /* Type structure */ struct bitmap_ipmac { void *members; /* the set members */ - void *extensions; /* MAC + data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ + unsigned char extensions[0] /* MAC + data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ struct bitmap_ipmac_adt_elem { + unsigned char ether[ETH_ALEN] __aligned(2); u16 id; - unsigned char *ether; + u16 add_mac; }; struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; unsigned char filled; -} __attribute__ ((aligned)); +} __aligned(__alignof__(u64)); static inline u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) @@ -71,11 +74,11 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip) return ip - m->first_ip; } -static inline struct bitmap_ipmac_elem * -get_elem(void *extensions, u16 id, size_t dsize) -{ - return (struct bitmap_ipmac_elem *)(extensions + id * dsize); -} +#define get_elem(extensions, id, dsize) \ + (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) + +#define get_const_elem(extensions, id, dsize) \ + (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) /* Common functions */ @@ -87,10 +90,9 @@ bitmap_ipmac_do_test(const struct 
bitmap_ipmac_adt_elem *e, if (!test_bit(e->id, map->members)) return 0; - elem = get_elem(map->extensions, e->id, dsize); - if (elem->filled == MAC_FILLED) - return e->ether == NULL || - ether_addr_equal(e->ether, elem->ether); + elem = get_const_elem(map->extensions, e->id, dsize); + if (e->add_mac && elem->filled == MAC_FILLED) + return ether_addr_equal(e->ether, elem->ether); /* Trigger kernel to fill out the ethernet address */ return -EAGAIN; } @@ -102,7 +104,7 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) if (!test_bit(id, map->members)) return 0; - elem = get_elem(map->extensions, id, dsize); + elem = get_const_elem(map->extensions, id, dsize); /* Timer not started for the incomplete elements */ return elem->filled == MAC_FILLED; } @@ -130,8 +132,9 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, - * possibly by the kernel */ - if (e->ether) + * possibly by the kernel + */ + if (e->add_mac) ip_set_timeout_set(timeout, t); else *timeout = t; @@ -146,28 +149,35 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac_elem *elem; elem = get_elem(map->extensions, e->id, dsize); - if (test_and_set_bit(e->id, map->members)) { + if (test_bit(e->id, map->members)) { if (elem->filled == MAC_FILLED) { - if (e->ether && (flags & IPSET_FLAG_EXIST)) - memcpy(elem->ether, e->ether, ETH_ALEN); + if (e->add_mac && + (flags & IPSET_FLAG_EXIST) && + !ether_addr_equal(e->ether, elem->ether)) { + /* memcpy isn't atomic */ + clear_bit(e->id, map->members); + smp_mb__after_atomic(); + ether_addr_copy(elem->ether, e->ether); + } return IPSET_ADD_FAILED; - } else if (!e->ether) + } else if (!e->add_mac) /* Already added without ethernet address */ return IPSET_ADD_FAILED; /* Fill the MAC address and trigger the timer activation */ - memcpy(elem->ether, e->ether, ETH_ALEN); + 
clear_bit(e->id, map->members); + smp_mb__after_atomic(); + ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; - } else if (e->ether) { + } else if (e->add_mac) { /* We can store MAC too */ - memcpy(elem->ether, e->ether, ETH_ALEN); + ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return 0; - } else { - elem->filled = MAC_UNSET; - /* MAC is not stored yet, don't start timer */ - return IPSET_ADD_STORE_PLAIN_TIMEOUT; } + elem->filled = MAC_UNSET; + /* MAC is not stored yet, don't start timer */ + return IPSET_ADD_STORE_PLAIN_TIMEOUT; } static inline int @@ -182,7 +192,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { const struct bitmap_ipmac_elem *elem = - get_elem(map->extensions, id, dsize); + get_const_elem(map->extensions, id, dsize); return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip + id)) || @@ -204,7 +214,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ipmac_adt_elem e = { .id = 0 }; + struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; @@ -222,7 +232,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, return -EINVAL; e.id = ip_to_id(map, ip); - e.ether = eth_hdr(skb)->h_source; + memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -238,20 +248,17 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - 
!ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -259,11 +266,10 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; e.id = ip_to_id(map, ip); - if (tb[IPSET_ATTR_ETHER]) - e.ether = nla_data(tb[IPSET_ATTR_ETHER]); - else - e.ether = NULL; - + if (tb[IPSET_ATTR_ETHER]) { + memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + e.add_mac = 1; + } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; @@ -294,13 +300,6 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -343,25 +342,27 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr >= 32) + if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); - } else + } else { return -IPSET_ERR_PROTOCOL; + } elements = (u64)last_ip - first_ip + 1; if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; - map = kzalloc(sizeof(*map), GFP_KERNEL); + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct bitmap_ipmac_elem), + __alignof__(struct bitmap_ipmac_elem)); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return 
-ENOMEM; map->memsize = bitmap_bytes(0, elements - 1); set->variant = &bitmap_ipmac; - set->dsize = ip_set_elem_len(set, tb, - sizeof(struct bitmap_ipmac_elem)); if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); return -ENOMEM; @@ -397,7 +398,8 @@ static struct ip_set_type bitmap_ipmac_type = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -414,6 +416,7 @@ bitmap_ipmac_init(void) static void __exit bitmap_ipmac_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_ipmac_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_port.c b/kernel/net/netfilter/ipset/ip_set_bitmap_port.c index 005dd3644..7f0c73335 100644 --- a/kernel/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/kernel/net/netfilter/ipset/ip_set_bitmap_port.c @@ -35,12 +35,13 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { void *members; /* the set members */ - void *extensions; /* data extensions */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ + unsigned char extensions[0] /* data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ @@ -73,7 +74,7 @@ static inline int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { - return !!test_and_set_bit(e->id, map->members); + return !!test_bit(e->id, map->members); } static inline int @@ -136,19 +137,13 @@ bitmap_port_uadt(struct ip_set *set, 
struct nlattr *tb[], u16 port_to; int ret = 0; - if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) + return -IPSET_ERR_PROTOCOL; + port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; @@ -168,8 +163,9 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], if (port < map->first_port) return -IPSET_ERR_BITMAP_RANGE; } - } else + } else { port_to = port; + } if (port_to > map->last_port) return -IPSET_ERR_BITMAP_RANGE; @@ -180,8 +176,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -214,13 +210,6 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * map->elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_port = first_port; map->last_port = last_port; set->timeout = IPSET_NO_TIMEOUT; @@ -237,6 +226,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], { struct bitmap_port *map; u16 first_port, last_port; + u32 elements; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -253,14 +243,15 @@ 
bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], last_port = tmp; } - map = kzalloc(sizeof(*map), GFP_KERNEL); + elements = last_port - first_port + 1; + set->dsize = ip_set_elem_len(set, tb, 0, 0); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; - map->elements = last_port - first_port + 1; + map->elements = elements; map->memsize = bitmap_bytes(0, map->elements); set->variant = &bitmap_port; - set->dsize = ip_set_elem_len(set, tb, 0); if (!init_map_port(set, map, first_port, last_port)) { kfree(map); return -ENOMEM; @@ -294,7 +285,8 @@ static struct ip_set_type bitmap_port_type = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -311,6 +303,7 @@ bitmap_port_init(void) static void __exit bitmap_port_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_port_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_core.c b/kernel/net/netfilter/ipset/ip_set_core.c index d259da3ce..54f3d7cb2 100644 --- a/kernel/net/netfilter/ipset/ip_set_core.c +++ b/kernel/net/netfilter/ipset/ip_set_core.c @@ -32,8 +32,10 @@ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ struct ip_set_net { struct ip_set * __rcu *ip_set_list; /* all individual sets */ ip_set_id_t ip_set_max; /* max number of sets */ - int is_deleted; /* deleted by ip_set_net_exit */ + bool is_deleted; /* deleted by ip_set_net_exit */ + bool is_destroyed; /* all sets are destroyed */ }; + static int ip_set_net_id __read_mostly; static inline struct ip_set_net *ip_set_pernet(struct net *net) @@ -42,7 +44,7 @@ static inline struct ip_set_net *ip_set_pernet(struct net *net) } 
#define IP_SET_INC 64 -#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) +#define STRNCMP(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) static unsigned int max_sets; @@ -59,8 +61,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] -/* - * The set types are implemented in modules and registered set types +/* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is * serialized by ip_set_type_mutex. */ @@ -85,7 +86,7 @@ find_set_type(const char *name, u8 family, u8 revision) struct ip_set_type *type; list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name) && + if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC) && revision >= type->revision_min && @@ -130,9 +131,10 @@ __find_set_type_get(const char *name, u8 family, u8 revision, goto unlock; } /* Make sure the type is already loaded - * but we don't support the revision */ + * but we don't support the revision + */ list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name)) { + if (STRNCMP(type->name, name)) { err = -IPSET_ERR_FIND_TYPE; goto unlock; } @@ -166,7 +168,7 @@ __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, *min = 255; *max = 0; rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name) && + if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC)) { found = true; @@ -208,15 +210,15 @@ ip_set_type_register(struct ip_set_type *type) pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", type->name, family_name(type->family), type->revision_min); - ret = -EINVAL; - goto unlock; + ip_set_type_unlock(); + return -EINVAL; } list_add_rcu(&type->list, &ip_set_type_list); pr_debug("type %s, family %s, revision %u:%u registered.\n", type->name, 
family_name(type->family), type->revision_min, type->revision_max); -unlock: ip_set_type_unlock(); + return ret; } EXPORT_SYMBOL_GPL(ip_set_type_register); @@ -230,12 +232,12 @@ ip_set_type_unregister(struct ip_set_type *type) pr_warn("ip_set type %s, family %s with revision min %u not registered\n", type->name, family_name(type->family), type->revision_min); - goto unlock; + ip_set_type_unlock(); + return; } list_del_rcu(&type->list); pr_debug("type %s, family %s with revision min %u unregistered.\n", type->name, family_name(type->family), type->revision_min); -unlock: ip_set_type_unlock(); synchronize_rcu(); @@ -289,7 +291,7 @@ static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) { - struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; @@ -306,7 +308,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) { - struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; @@ -317,7 +319,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) return -IPSET_ERR_PROTOCOL; memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), - sizeof(struct in6_addr)); + sizeof(struct in6_addr)); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); @@ -362,25 +364,27 @@ add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) } size_t -ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) +ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, + size_t align) { enum ip_set_ext_id id; - size_t offset = 0; u32 cadt_flags = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) set->flags |= IPSET_CREATE_FLAG_FORCEADD; + if (!align) 
+ align = 1; for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; - offset += ALIGN(len + offset, ip_set_extensions[id].align); - set->offset[id] = offset; + len = ALIGN(len, ip_set_extensions[id].align); + set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; - offset += ip_set_extensions[id].len; + len += ip_set_extensions[id].len; } - return len + offset; + return ALIGN(len, align); } EXPORT_SYMBOL_GPL(ip_set_elem_len); @@ -389,13 +393,22 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { u64 fullmark; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + return -IPSET_ERR_PROTOCOL; + if (tb[IPSET_ATTR_TIMEOUT]) { - if (!(set->extensions & IPSET_EXT_TIMEOUT)) + if (!SET_WITH_TIMEOUT(set)) return -IPSET_ERR_TIMEOUT; ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { - if (!(set->extensions & IPSET_EXT_COUNTER)) + if (!SET_WITH_COUNTER(set)) return -IPSET_ERR_COUNTER; if (tb[IPSET_ATTR_BYTES]) ext->bytes = be64_to_cpu(nla_get_be64( @@ -405,25 +418,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], tb[IPSET_ATTR_PACKETS])); } if (tb[IPSET_ATTR_COMMENT]) { - if (!(set->extensions & IPSET_EXT_COMMENT)) + if (!SET_WITH_COMMENT(set)) return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } if (tb[IPSET_ATTR_SKBMARK]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); ext->skbmark = fullmark >> 32; ext->skbmarkmask = fullmark & 0xffffffff; } if 
(tb[IPSET_ATTR_SKBPRIO]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbprio = be32_to_cpu(nla_get_be32( tb[IPSET_ATTR_SKBPRIO])); } if (tb[IPSET_ATTR_SKBQUEUE]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbqueue = be16_to_cpu(nla_get_be16( tb[IPSET_ATTR_SKBQUEUE])); @@ -432,8 +445,32 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_set_get_extensions); -/* - * Creating/destroying/renaming/swapping affect the existence and +int +ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, + const void *e, bool active) +{ + if (SET_WITH_TIMEOUT(set)) { + unsigned long *timeout = ext_timeout(e, set); + + if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(active ? ip_set_timeout_get(timeout) + : *timeout))) + return -EMSGSIZE; + } + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(e, set))) + return -EMSGSIZE; + if (SET_WITH_COMMENT(set) && + ip_set_put_comment(skb, ext_comment(e, set))) + return -EMSGSIZE; + if (SET_WITH_SKBINFO(set) && + ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) + return -EMSGSIZE; + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_put_extensions); + +/* Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace * only and serialized by the nfnl mutex indirectly from nfnetlink. * @@ -460,8 +497,7 @@ __ip_set_put(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } -/* - * Add, del and test set entries from kernel. +/* Add, del and test set entries from kernel. * * The set behind the index must exist and must be referenced * so it can't be destroyed (or changed) under our foot. @@ -485,27 +521,26 @@ int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_rcu_get( - dev_net(par->in ? 
par->in : par->out), index); + struct ip_set *set = ip_set_rcu_get(par->net, index); int ret = 0; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set->variant->kadt(set, skb, par, IPSET_ADD, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); ret = 1; } else { /* --return-nomatch: invert matched element */ @@ -524,20 +559,19 @@ int ip_set_add(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_rcu_get( - dev_net(par->in ? par->in : par->out), index); + struct ip_set *set = ip_set_rcu_get(par->net, index); int ret; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); return ret; } @@ -547,27 +581,25 @@ int ip_set_del(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_rcu_get( - dev_net(par->in ? 
par->in : par->out), index); + struct ip_set *set = ip_set_rcu_get(par->net, index); int ret = 0; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); return ret; } EXPORT_SYMBOL_GPL(ip_set_del); -/* - * Find set by name, reference it once. The reference makes sure the +/* Find set by name, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * */ @@ -581,7 +613,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) rcu_read_lock(); for (i = 0; i < inst->ip_set_max; i++) { s = rcu_dereference(inst->ip_set_list)[i]; - if (s != NULL && STREQ(s->name, name)) { + if (s && STRNCMP(s->name, name)) { __ip_set_get(s); index = i; *set = s; @@ -594,8 +626,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) } EXPORT_SYMBOL_GPL(ip_set_get_byname); -/* - * If the given set pointer points to a valid set, decrement +/* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * @@ -608,7 +639,7 @@ __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) rcu_read_lock(); set = rcu_dereference(inst->ip_set_list)[index]; - if (set != NULL) + if (set) __ip_set_put(set); rcu_read_unlock(); } @@ -622,8 +653,7 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_put_byindex); -/* - * Get the name of a set behind a set index. +/* Get the name of a set behind a set index. * We assume the set is referenced, so it does exist and * can't be destroyed. The set cannot be renamed due to * the referencing either. 
@@ -634,7 +664,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index) { const struct ip_set *set = ip_set_rcu_get(net, index); - BUG_ON(set == NULL); + BUG_ON(!set); BUG_ON(set->ref == 0); /* Referenced, so it's safe */ @@ -642,13 +672,11 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_name_byindex); -/* - * Routines to call by external subsystems, which do not +/* Routines to call by external subsystems, which do not * call nfnl_lock for us. */ -/* - * Find set by index, reference it once. The reference makes sure the +/* Find set by index, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * * The nfnl mutex is used in the function. @@ -674,8 +702,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); -/* - * If the given set pointer points to a valid set, decrement +/* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * @@ -690,15 +717,14 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index) nfnl_lock(NFNL_SUBSYS_IPSET); if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ set = ip_set(inst, index); - if (set != NULL) + if (set) __ip_set_put(set); } nfnl_unlock(NFNL_SUBSYS_IPSET); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); -/* - * Communication protocol with userspace over netlink. +/* Communication protocol with userspace over netlink. * * The commands are serialized by the nfnl mutex. 
*/ @@ -725,7 +751,7 @@ start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), sizeof(*nfmsg), flags); - if (nlh == NULL) + if (!nlh) return NULL; nfmsg = nlmsg_data(nlh); @@ -758,7 +784,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); - if (set != NULL && STREQ(set->name, name)) { + if (set && STRNCMP(set->name, name)) { *id = i; break; } @@ -784,10 +810,10 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, *index = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s == NULL) { + if (!s) { if (*index == IPSET_INVALID_ID) *index = i; - } else if (STREQ(name, s->name)) { + } else if (STRNCMP(name, s->name)) { /* Name clash */ *set = s; return -EEXIST; @@ -816,18 +842,18 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; - struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; u32 flags = flag_exist(nlh); int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_TYPENAME] == NULL || - attr[IPSET_ATTR_REVISION] == NULL || - attr[IPSET_ATTR_FAMILY] == NULL || - (attr[IPSET_ATTR_DATA] != NULL && + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_TYPENAME] || + !attr[IPSET_ATTR_REVISION] || + !attr[IPSET_ATTR_FAMILY] || + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])))) return -IPSET_ERR_PROTOCOL; @@ -838,33 +864,29 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", name, typename, family_name(family), revision); - /* - * First, and without any locks, 
allocate and initialize + /* First, and without any locks, allocate and initialize * a normal base set structure. */ - set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); + set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; - rwlock_init(&set->lock); + spin_lock_init(&set->lock); strlcpy(set->name, name, IPSET_MAXNAMELEN); set->family = family; set->revision = revision; - /* - * Next, check that we know the type, and take + /* Next, check that we know the type, and take * a reference on the type, to make sure it stays available * while constructing our new set. * * After referencing the type, we try to create the type * specific part of the set without holding any locks. */ - ret = find_set_type_get(typename, family, revision, &(set->type)); + ret = find_set_type_get(typename, family, revision, &set->type); if (ret) goto out; - /* - * Without holding any locks, create private part. - */ + /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy)) { @@ -878,8 +900,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, /* BTW, ret==0 here. */ - /* - * Here, we have a valid, constructed set and we are protected + /* Here, we have a valid, constructed set and we are protected * by the nfnl mutex. Find the first free index in ip_set_list * and check clashing. 
*/ @@ -887,7 +908,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, if (ret == -EEXIST) { /* If this is the same set and requested, ignore error */ if ((flags & IPSET_FLAG_EXIST) && - STREQ(set->type->name, clash->type->name) && + STRNCMP(set->type->name, clash->type->name) && set->type->family == clash->type->family && set->type->revision_min == clash->type->revision_min && set->type->revision_max == clash->type->revision_max && @@ -902,7 +923,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, /* Wraparound */ goto cleanup; - list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL); + list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ @@ -916,12 +937,11 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, inst->ip_set_max = i; kfree(tmp); ret = 0; - } else if (ret) + } else if (ret) { goto cleanup; + } - /* - * Finally! Add our shiny new set to the list, and be done. - */ + /* Finally! Add our shiny new set to the list, and be done. 
*/ pr_debug("create: '%s' created with index %u!\n", set->name, index); ip_set(inst, index) = set; @@ -946,12 +966,9 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { }; static void -ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index) +ip_set_destroy_set(struct ip_set *set) { - struct ip_set *set = ip_set(inst, index); - pr_debug("set: %s\n", set->name); - ip_set(inst, index) = NULL; /* Must call it without holding any lock */ set->variant->destroy(set); @@ -986,30 +1003,36 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL && s->ref) { + if (s && s->ref) { ret = -IPSET_ERR_BUSY; goto out; } } + inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL) - ip_set_destroy_set(inst, i); + if (s) { + ip_set(inst, i) = NULL; + ip_set_destroy_set(s); + } } + /* Modified by ip_set_destroy() only, which is serialized */ + inst->is_destroyed = false; } else { s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); - if (s == NULL) { + if (!s) { ret = -ENOENT; goto out; } else if (s->ref) { ret = -IPSET_ERR_BUSY; goto out; } + ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); - ip_set_destroy_set(inst, i); + ip_set_destroy_set(s); } return 0; out: @@ -1024,9 +1047,9 @@ ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set->variant->flush(set); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); } static int @@ -1044,12 +1067,12 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL) + if (s) ip_set_flush_set(s); } } else { s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (s == NULL) + if (!s) return -ENOENT; ip_set_flush_set(s); @@ 
-1081,12 +1104,12 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_SETNAME2] == NULL)) + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; read_lock_bh(&ip_set_ref_lock); @@ -1098,7 +1121,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL && STREQ(s->name, name2)) { + if (s && STRNCMP(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } @@ -1130,23 +1153,24 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, char from_name[IPSET_MAXNAMELEN]; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_SETNAME2] == NULL)) + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &from_id); - if (from == NULL) + if (!from) return -ENOENT; to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id); - if (to == NULL) + if (!to) return -IPSET_ERR_EXIST_SETNAME2; /* Features must not change. - * Not an artificial restriction anymore, as we must prevent - * possible loops created by swapping in setlist type of sets. */ + * Not an artifical restriction anymore, as we must prevent + * possible loops created by swapping in setlist type of sets. 
+ */ if (!(from->type->features == to->type->features && from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; @@ -1177,12 +1201,16 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, static int ip_set_dump_done(struct netlink_callback *cb) { - struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; if (cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", - ip_set(inst, cb->args[IPSET_CB_INDEX])->name); - __ip_set_put_byindex(inst, - (ip_set_id_t) cb->args[IPSET_CB_INDEX]); + struct ip_set_net *inst = + (struct ip_set_net *)cb->args[IPSET_CB_NET]; + ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; + struct ip_set *set = ip_set(inst, index); + + if (set->variant->uref) + set->variant->uref(set, cb, false); + pr_debug("release set %s\n", set->name); + __ip_set_put_byindex(inst, index); } return 0; } @@ -1204,7 +1232,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; ip_set_id_t index; @@ -1213,27 +1241,23 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); - /* cb->args[IPSET_CB_NET]: net namespace - * [IPSET_CB_DUMP]: dump single set/all sets - * [IPSET_CB_INDEX]: set index - * [IPSET_CB_ARG0]: type specific - */ - if (cda[IPSET_ATTR_SETNAME]) { struct ip_set *set; set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); - if (set == NULL) + if (!set) return -ENOENT; dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; - } else + } else { dump_type = DUMP_ALL; + } if (cda[IPSET_ATTR_FLAGS]) { u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); + dump_type |= (f << 16); } cb->args[IPSET_CB_NET] = (unsigned long)inst; @@ -1251,6 +1275,7 @@ 
ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type, dump_flags; + bool is_destroyed; int ret = 0; if (!cb->args[IPSET_CB_DUMP]) { @@ -1258,7 +1283,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) if (ret < 0) { nlh = nlmsg_hdr(cb->skb); /* We have to create and send the error message - * manually :-( */ + * manually :-( + */ if (nlh->nlmsg_flags & NLM_F_ACK) netlink_ack(cb->skb, nlh, ret); return ret; @@ -1276,13 +1302,21 @@ dump_last: pr_debug("dump type, flag: %u %u index: %ld\n", dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { - index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; + index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; + write_lock_bh(&ip_set_ref_lock); set = ip_set(inst, index); - if (set == NULL) { + is_destroyed = inst->is_destroyed; + if (!set || is_destroyed) { + write_unlock_bh(&ip_set_ref_lock); if (dump_type == DUMP_ONE) { ret = -ENOENT; goto out; } + if (is_destroyed) { + /* All sets are just being destroyed */ + ret = 0; + goto out; + } continue; } /* When dumping all sets, we must dump "sorted" @@ -1290,14 +1324,17 @@ dump_last: */ if (dump_type != DUMP_ONE && ((dump_type == DUMP_ALL) == - !!(set->type->features & IPSET_DUMP_LAST))) + !!(set->type->features & IPSET_DUMP_LAST))) { + write_unlock_bh(&ip_set_ref_lock); continue; + } pr_debug("List set: %s\n", set->name); if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); - __ip_set_get(set); + set->ref++; } + write_unlock_bh(&ip_set_ref_lock); nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, IPSET_CMD_LIST); @@ -1325,11 +1362,13 @@ dump_last: goto release_refcount; if (dump_flags & IPSET_FLAG_LIST_HEADER) goto next_set; + if (set->variant->uref) + 
set->variant->uref(set, cb, true); /* Fall through and add elements */ default: - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->list(set, skb, cb); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); if (!cb->args[IPSET_CB_ARG0]) /* Set is done, proceed with next one */ goto next_set; @@ -1341,6 +1380,8 @@ dump_last: dump_type = DUMP_LAST; cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); cb->args[IPSET_CB_INDEX] = 0; + if (set && set->variant->uref) + set->variant->uref(set, cb, false); goto dump_last; } goto out; @@ -1355,7 +1396,10 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", ip_set(inst, index)->name); + set = ip_set(inst, index); + if (set->variant->uref) + set->variant->uref(set, cb, false); + pr_debug("release set %s\n", set->name); __ip_set_put_byindex(inst, index); cb->args[IPSET_CB_ARG0] = 0; } @@ -1407,9 +1451,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); retried = true; } while (ret == -EAGAIN && set->variant->resize && @@ -1425,12 +1469,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, size_t payload = min(SIZE_MAX, sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *cmdattr; u32 *errline; skb2 = nlmsg_new(payload, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); @@ -1447,7 +1491,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, *errline = lineno; - 
netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1462,25 +1507,25 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || + !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] != NULL && + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] != NULL && + (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || - attr[IPSET_ATTR_LINENO] == NULL)))) + !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; @@ -1517,25 +1562,25 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || + !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] != NULL && + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] != NULL && + (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || - attr[IPSET_ATTR_LINENO] == NULL)))) + !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = 
find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; @@ -1572,26 +1617,26 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_DATA] == NULL || + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_DATA] || !flag_nested(attr[IPSET_ATTR_DATA]))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy)) return -IPSET_ERR_PROTOCOL; - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) ret = 1; @@ -1613,15 +1658,15 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL)) + !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1670,8 +1715,8 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_TYPENAME] == NULL || - attr[IPSET_ATTR_FAMILY] == NULL)) + !attr[IPSET_ATTR_TYPENAME] || + !attr[IPSET_ATTR_FAMILY])) return -IPSET_ERR_PROTOCOL; family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); @@ -1681,7 +1726,7 @@ ip_set_type(struct sock *ctnl, 
struct sk_buff *skb, return ret; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1726,11 +1771,11 @@ ip_set_protocol(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh2; int ret = 0; - if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL)) + if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1858,7 +1903,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) ret = -EFAULT; goto done; } - op = (unsigned int *) data; + op = (unsigned int *)data; if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ @@ -1970,10 +2015,11 @@ ip_set_net_init(struct net *net) if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; - list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL); + list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; - inst->is_deleted = 0; + inst->is_deleted = false; + inst->is_destroyed = false; rcu_assign_pointer(inst->ip_set_list, list); return 0; } @@ -1986,12 +2032,14 @@ ip_set_net_exit(struct net *net) struct ip_set *set = NULL; ip_set_id_t i; - inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ + inst->is_deleted = true; /* flag for ip_set_nfnl_put */ for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); - if (set != NULL) - ip_set_destroy_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + ip_set_destroy_set(set); + } } kfree(rcu_dereference_protected(inst->ip_set_list, 1)); } @@ -2003,11 +2051,11 @@ static struct pernet_operations ip_set_net_ops = { .size = sizeof(struct ip_set_net) }; - static int __init ip_set_init(void) { int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + if 
(ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); return ret; diff --git a/kernel/net/netfilter/ipset/ip_set_getport.c b/kernel/net/netfilter/ipset/ip_set_getport.c index 29fb01ddf..42c3e3ba1 100644 --- a/kernel/net/netfilter/ipset/ip_set_getport.c +++ b/kernel/net/netfilter/ipset/ip_set_getport.c @@ -30,7 +30,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct tcphdr *th; th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph); - if (th == NULL) + if (!th) /* No choice either */ return false; @@ -42,7 +42,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const sctp_sctphdr_t *sh; sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh); - if (sh == NULL) + if (!sh) /* No choice either */ return false; @@ -55,7 +55,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct udphdr *uh; uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph); - if (uh == NULL) + if (!uh) /* No choice either */ return false; @@ -67,7 +67,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct icmphdr *ic; ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); - if (ic == NULL) + if (!ic) return false; *port = (__force __be16)htons((ic->type << 8) | ic->code); @@ -78,7 +78,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct icmp6hdr *ic; ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); - if (ic == NULL) + if (!ic) return false; *port = (__force __be16) @@ -98,7 +98,7 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src, __be16 *port, u8 *proto) { const struct iphdr *iph = ip_hdr(skb); - unsigned int protooff = ip_hdrlen(skb); + unsigned int protooff = skb_network_offset(skb) + ip_hdrlen(skb); int protocol = iph->protocol; /* See comments at tcp_match in ip_tables.c */ @@ -116,7 +116,8 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src, return false; 
default: /* Other protocols doesn't have ports, - so we can match fragments */ + * so we can match fragments. + */ *proto = protocol; return true; } @@ -135,7 +136,9 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, __be16 frag_off = 0; nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + protoff = ipv6_skip_exthdr(skb, + skb_network_offset(skb) + + sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) return false; diff --git a/kernel/net/netfilter/ipset/ip_set_hash_gen.h b/kernel/net/netfilter/ipset/ip_set_hash_gen.h index 974ff386d..e5336ab36 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_gen.h +++ b/kernel/net/netfilter/ipset/ip_set_hash_gen.h @@ -10,19 +10,19 @@ #include #include +#include #include -#ifndef rcu_dereference_bh -#define rcu_dereference_bh(p) rcu_dereference(p) -#endif + +#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) +#define ipset_dereference_protected(p, set) \ + __ipset_dereference_protected(p, spin_is_locked(&(set)->lock)) #define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) /* Hashing which uses arrays to resolve clashing. The hash table is resized * (doubled) when searching becomes too long. * Internally jhash is used with the assumption that the size of the - * stored data is a multiple of sizeof(u32). If storage supports timeout, - * the timeout field must be the last one in the data structure - that field - * is ignored when computing the hash key. + * stored data is a multiple of sizeof(u32). 
* * Readers and resizing * @@ -35,7 +35,9 @@ /* Number of elements to store in an initial array block */ #define AHASH_INIT_SIZE 4 /* Max number of elements to store in an array block */ -#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE) +#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE) +/* Max number of elements in the array block when tuned */ +#define AHASH_MAX_TUNED 64 /* Max number of elements can be tuned */ #ifdef IP_SET_HASH_WITH_MULTI @@ -53,8 +55,9 @@ tune_ahash_max(u8 curr, u32 multi) /* Currently, at listing one hash bucket must fit into a message. * Therefore we have a hard limit here. */ - return n > curr && n <= 64 ? n : curr; + return n > curr && n <= AHASH_MAX_TUNED ? n : curr; } + #define TUNE_AHASH_MAX(h, multi) \ ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) #else @@ -64,18 +67,24 @@ tune_ahash_max(u8 curr, u32 multi) /* A hash bucket */ struct hbucket { - void *value; /* the array of the values */ + struct rcu_head rcu; /* for call_rcu_bh */ + /* Which positions are used in the array */ + DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ u8 pos; /* position of the first free entry */ + unsigned char value[0] /* the array of the values */ + __aligned(__alignof__(u64)); }; /* The hash table: the table size stored here in order to make resizing easy */ struct htable { + atomic_t ref; /* References for resizing */ + atomic_t uref; /* References for dumping */ u8 htable_bits; /* size of hash table == 2^htable_bits */ - struct hbucket bucket[0]; /* hashtable buckets */ + struct hbucket __rcu *bucket[0]; /* hashtable buckets */ }; -#define hbucket(h, i) (&((h)->bucket[i])) +#define hbucket(h, i) ((h)->bucket[i]) #ifndef IPSET_NET_COUNT #define IPSET_NET_COUNT 1 @@ -83,8 +92,8 @@ struct htable { /* Book-keeping of the prefixes added to the set */ struct net_prefixes { - u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */ - u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */ + u32 nets[IPSET_NET_COUNT]; /* 
number of elements for this cidr */ + u8 cidr[IPSET_NET_COUNT]; /* the cidr value */ }; /* Compute the hash table size */ @@ -97,11 +106,11 @@ htable_size(u8 hbits) if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket) + if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; - return hsize * sizeof(struct hbucket) + sizeof(struct htable); + return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } /* Compute htable_bits from the user input parameter hashsize */ @@ -110,6 +119,7 @@ htable_bits(u32 hashsize) { /* Assume that hashsize == 2^htable_bits */ u8 bits = fls(hashsize - 1); + if (jhash_size(bits) != hashsize) /* Round up to the first 2^n value */ bits = fls(hashsize); @@ -117,30 +127,6 @@ htable_bits(u32 hashsize) return bits; } -static int -hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) -{ - if (n->pos >= n->size) { - void *tmp; - - if (n->size >= ahash_max) - /* Trigger rehashing */ - return -EAGAIN; - - tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, - GFP_ATOMIC); - if (!tmp) - return -ENOMEM; - if (n->size) { - memcpy(tmp, n->value, n->size * dsize); - kfree(n->value); - } - n->value = tmp; - n->size += AHASH_INIT_SIZE; - } - return 0; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -149,23 +135,31 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #endif /* cidr + 1 is stored in net_prefixes to support /0 */ -#define SCIDR(cidr, i) (__CIDR(cidr, i) + 1) +#define NCIDR_PUT(cidr) ((cidr) + 1) +#define NCIDR_GET(cidr) ((cidr) - 1) #ifdef IP_SET_HASH_WITH_NETS_PACKED /* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */ -#define GCIDR(cidr, i) (__CIDR(cidr, i) + 1) -#define NCIDR(cidr) (cidr) +#define DCIDR_PUT(cidr) ((cidr) - 1) +#define DCIDR_GET(cidr, i) (__CIDR(cidr, i) + 1) #else -#define GCIDR(cidr, i) (__CIDR(cidr, i)) -#define 
NCIDR(cidr) (cidr - 1) +#define DCIDR_PUT(cidr) (cidr) +#define DCIDR_GET(cidr, i) __CIDR(cidr, i) #endif +#define INIT_CIDR(cidr, host_mask) \ + DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask)) + #define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) #ifdef IP_SET_HASH_WITH_NET0 +/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) (SET_HOST_MASK(family) + 1) +#define CIDR_POS(c) ((c) - 1) #else +/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) SET_HOST_MASK(family) +#define CIDR_POS(c) ((c) - 2) #endif #else @@ -180,6 +174,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_data_equal #undef mtype_do_data_match #undef mtype_data_set_flags +#undef mtype_data_reset_elem #undef mtype_data_reset_flags #undef mtype_data_netmask #undef mtype_data_list @@ -193,7 +188,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_ahash_memsize #undef mtype_flush #undef mtype_destroy -#undef mtype_gc_init #undef mtype_same_set #undef mtype_kadt #undef mtype_uadt @@ -203,6 +197,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_del #undef mtype_test_cidrs #undef mtype_test +#undef mtype_uref #undef mtype_expire #undef mtype_resize #undef mtype_head @@ -227,6 +222,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_data_list IPSET_TOKEN(MTYPE, _data_list) #define mtype_data_next IPSET_TOKEN(MTYPE, _data_next) #define mtype_elem IPSET_TOKEN(MTYPE, _elem) + #define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy) #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) #define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr) @@ -234,7 +230,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize) #define mtype_flush IPSET_TOKEN(MTYPE, _flush) #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) -#define mtype_gc_init 
IPSET_TOKEN(MTYPE, _gc_init) #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) @@ -244,23 +239,36 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) #define mtype_test IPSET_TOKEN(MTYPE, _test) +#define mtype_uref IPSET_TOKEN(MTYPE, _uref) #define mtype_expire IPSET_TOKEN(MTYPE, _expire) #define mtype_resize IPSET_TOKEN(MTYPE, _resize) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) +#ifndef MTYPE +#error "MTYPE is not defined!" +#endif + +#ifndef HOST_MASK +#error "HOST_MASK is not defined!" +#endif + #ifndef HKEY_DATALEN #define HKEY_DATALEN sizeof(struct mtype_elem) #endif #define HKEY(data, initval, htable_bits) \ -(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \ +(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval) \ & jhash_mask(htable_bits)) #ifndef htype +#ifndef HTYPE +#error "HTYPE is not defined!" +#endif /* HTYPE */ #define htype HTYPE /* The generic hash structure */ @@ -280,18 +288,16 @@ struct htype { #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif -#ifdef IP_SET_HASH_WITH_RBTREE - struct rb_root rbtree; -#endif #ifdef IP_SET_HASH_WITH_NETS struct net_prefixes nets[0]; /* book-keeping of prefixes */ #endif }; -#endif +#endif /* htype */ #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different - * sized networks */ + * sized networks. cidr == real cidr + 1 to support /0. 
+ */ static void mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) { @@ -299,12 +305,12 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) /* Add in increasing prefix order, so larger cidr first */ for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) { - if (j != -1) + if (j != -1) { continue; - else if (h->nets[i].cidr[n] < cidr) + } else if (h->nets[i].cidr[n] < cidr) { j = i; - else if (h->nets[i].cidr[n] == cidr) { - h->nets[cidr - 1].nets[n]++; + } else if (h->nets[i].cidr[n] == cidr) { + h->nets[CIDR_POS(cidr)].nets[n]++; return; } } @@ -313,7 +319,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; } h->nets[i].cidr[n] = cidr; - h->nets[cidr - 1].nets[n] = 1; + h->nets[CIDR_POS(cidr)].nets[n] = 1; } static void @@ -322,15 +328,15 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) u8 i, j, net_end = nets_length - 1; for (i = 0; i < nets_length; i++) { - if (h->nets[i].cidr[n] != cidr) - continue; - h->nets[cidr -1].nets[n]--; - if (h->nets[cidr -1].nets[n] > 0) - return; + if (h->nets[i].cidr[n] != cidr) + continue; + h->nets[CIDR_POS(cidr)].nets[n]--; + if (h->nets[CIDR_POS(cidr)].nets[n] > 0) + return; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) - h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; + h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; h->nets[j].cidr[n] = 0; - return; + return; } } #endif @@ -341,15 +347,18 @@ mtype_ahash_memsize(const struct htype *h, const struct htable *t, u8 nets_length, size_t dsize) { u32 i; - size_t memsize = sizeof(*h) - + sizeof(*t) + struct hbucket *n; + size_t memsize = sizeof(*h) + sizeof(*t); + #ifdef IP_SET_HASH_WITH_NETS - + sizeof(struct net_prefixes) * nets_length + memsize += sizeof(struct net_prefixes) * nets_length; #endif - + jhash_size(t->htable_bits) * sizeof(struct hbucket); - - for (i = 0; i < jhash_size(t->htable_bits); i++) - memsize += t->bucket[i].size * dsize; + for (i = 0; i < 
jhash_size(t->htable_bits); i++) { + n = rcu_dereference_bh(hbucket(t, i)); + if (!n) + continue; + memsize += sizeof(struct hbucket) + n->size * dsize; + } return memsize; } @@ -364,7 +373,8 @@ mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) int i; for (i = 0; i < n->pos; i++) - ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); + if (test_bit(i, n->used)) + ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); } /* Flush a hash type of set: destroy all elements */ @@ -376,16 +386,16 @@ mtype_flush(struct ip_set *set) struct hbucket *n; u32 i; - t = rcu_dereference_bh_nfnl(h->table); + t = ipset_dereference_protected(h->table, set); for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - if (n->size) { - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set, n); - n->size = n->pos = 0; - /* FIXME: use slab cache */ - kfree(n->value); - } + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); } #ifdef IP_SET_HASH_WITH_NETS memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family)); @@ -401,13 +411,13 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - if (n->size) { - if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) - mtype_ext_cleanup(set, n); - /* FIXME: use slab cache */ - kfree(n->value); - } + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + kfree(n); } ip_set_free(t); @@ -419,13 +429,11 @@ mtype_destroy(struct ip_set *set) { struct htype *h = set->data; - if (set->extensions & IPSET_EXT_TIMEOUT) + if (SET_WITH_TIMEOUT(set)) del_timer_sync(&h->gc); - 
mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true); -#ifdef IP_SET_HASH_WITH_RBTREE - rbtree_destroy(&h->rbtree); -#endif + mtype_ahash_destroy(set, + __ipset_dereference_protected(h->table, 1), true); kfree(h); set->data = NULL; @@ -437,7 +445,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct htype *h = set->data; init_timer(&h->gc); - h->gc.data = (unsigned long) set; + h->gc.data = (unsigned long)set; h->gc.function = gc; h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&h->gc); @@ -468,63 +476,78 @@ static void mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) { struct htable *t; - struct hbucket *n; + struct hbucket *n, *tmp; struct mtype_elem *data; - u32 i; - int j; + u32 i, j, d; #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + t = ipset_dereference_protected(h->table, set); for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - for (j = 0; j < n->pos; j++) { + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + for (j = 0, d = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) { + d++; + continue; + } data = ahash_data(n, j, dsize); if (ip_set_timeout_expired(ext_timeout(data, set))) { pr_debug("expired %u/%u\n", i, j); + clear_bit(j, n->used); + smp_mb__after_atomic(); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) - mtype_del_cidr(h, SCIDR(data->cidr, k), - nets_length, k); + mtype_del_cidr(h, + NCIDR_PUT(DCIDR_GET(data->cidr, + k)), + nets_length, k); #endif ip_set_ext_destroy(set, data); - if (j != n->pos - 1) - /* Not last one */ - memcpy(data, - ahash_data(n, n->pos - 1, dsize), - dsize); - n->pos--; h->elements--; + d++; } } - if (n->pos + AHASH_INIT_SIZE < n->size) { - void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) - * dsize, - GFP_ATOMIC); + if (d >= AHASH_INIT_SIZE) { + if (d >= n->size) { + rcu_assign_pointer(hbucket(t, i), NULL); + 
kfree_rcu(n, rcu); + continue; + } + tmp = kzalloc(sizeof(*tmp) + + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); if (!tmp) /* Still try to delete expired elements */ continue; - n->size -= AHASH_INIT_SIZE; - memcpy(tmp, n->value, n->size * dsize); - kfree(n->value); - n->value = tmp; + tmp->size = n->size - AHASH_INIT_SIZE; + for (j = 0, d = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + memcpy(tmp->value + d * dsize, data, dsize); + set_bit(d, tmp->used); + d++; + } + tmp->pos = d; + rcu_assign_pointer(hbucket(t, i), tmp); + kfree_rcu(n, rcu); } } - rcu_read_unlock_bh(); } static void mtype_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct htype *h = set->data; pr_debug("called\n"); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); mtype_expire(set, h, NLEN(set->family), set->dsize); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&h->gc); @@ -532,93 +555,152 @@ mtype_gc(unsigned long ul_set) /* Resize a hash: create a new hash table with doubling the hashsize * and inserting the elements to it. Repeat until we succeed or - * fail due to memory pressures. */ + * fail due to memory pressures. 
+ */ static int mtype_resize(struct ip_set *set, bool retried) { struct htype *h = set->data; - struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table); - u8 htable_bits = orig->htable_bits; + struct htable *t, *orig; + u8 htable_bits; + size_t dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; + struct mtype_elem *tmp; #endif struct mtype_elem *data; struct mtype_elem *d; struct hbucket *n, *m; - u32 i, j; + u32 i, j, key; int ret; - /* Try to cleanup once */ - if (SET_WITH_TIMEOUT(set) && !retried) { - i = h->elements; - write_lock_bh(&set->lock); - mtype_expire(set, set->data, NLEN(set->family), set->dsize); - write_unlock_bh(&set->lock); - if (h->elements < i) - return 0; - } +#ifdef IP_SET_HASH_WITH_NETS + tmp = kmalloc(dsize, GFP_KERNEL); + if (!tmp) + return -ENOMEM; +#endif + rcu_read_lock_bh(); + orig = rcu_dereference_bh_nfnl(h->table); + htable_bits = orig->htable_bits; + rcu_read_unlock_bh(); retry: ret = 0; htable_bits++; - pr_debug("attempt to resize set %s from %u to %u, t %p\n", - set->name, orig->htable_bits, htable_bits, orig); if (!htable_bits) { /* In case we have plenty of memory :-) */ pr_warn("Cannot increase the hashsize of set %s further\n", set->name); - return -IPSET_ERR_HASH_FULL; + ret = -IPSET_ERR_HASH_FULL; + goto out; + } + t = ip_set_alloc(htable_size(htable_bits)); + if (!t) { + ret = -ENOMEM; + goto out; } - t = ip_set_alloc(sizeof(*t) - + jhash_size(htable_bits) * sizeof(struct hbucket)); - if (!t) - return -ENOMEM; t->htable_bits = htable_bits; - read_lock_bh(&set->lock); + spin_lock_bh(&set->lock); + orig = __ipset_dereference_protected(h->table, 1); + /* There can't be another parallel resizing, but dumping is possible */ + atomic_set(&orig->ref, 1); + atomic_inc(&orig->uref); + pr_debug("attempt to resize set %s from %u to %u, t %p\n", + set->name, orig->htable_bits, htable_bits, orig); for (i = 0; i < jhash_size(orig->htable_bits); i++) { - n = hbucket(orig, i); + n = __ipset_dereference_protected(hbucket(orig, 
i), 1); + if (!n) + continue; for (j = 0; j < n->pos; j++) { - data = ahash_data(n, j, set->dsize); + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); #ifdef IP_SET_HASH_WITH_NETS + /* We have readers running parallel with us, + * so the live data cannot be modified. + */ flags = 0; + memcpy(tmp, data, dsize); + data = tmp; mtype_data_reset_flags(data, &flags); #endif - m = hbucket(t, HKEY(data, h->initval, htable_bits)); - ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize); - if (ret < 0) { -#ifdef IP_SET_HASH_WITH_NETS - mtype_data_reset_flags(data, &flags); -#endif - read_unlock_bh(&set->lock); - mtype_ahash_destroy(set, t, false); - if (ret == -EAGAIN) - goto retry; - return ret; + key = HKEY(data, h->initval, htable_bits); + m = __ipset_dereference_protected(hbucket(t, key), 1); + if (!m) { + m = kzalloc(sizeof(*m) + + AHASH_INIT_SIZE * dsize, + GFP_ATOMIC); + if (!m) { + ret = -ENOMEM; + goto cleanup; + } + m->size = AHASH_INIT_SIZE; + RCU_INIT_POINTER(hbucket(t, key), m); + } else if (m->pos >= m->size) { + struct hbucket *ht; + + if (m->size >= AHASH_MAX(h)) { + ret = -EAGAIN; + } else { + ht = kzalloc(sizeof(*ht) + + (m->size + AHASH_INIT_SIZE) + * dsize, + GFP_ATOMIC); + if (!ht) + ret = -ENOMEM; + } + if (ret < 0) + goto cleanup; + memcpy(ht, m, sizeof(struct hbucket) + + m->size * dsize); + ht->size = m->size + AHASH_INIT_SIZE; + kfree(m); + m = ht; + RCU_INIT_POINTER(hbucket(t, key), ht); } - d = ahash_data(m, m->pos++, set->dsize); - memcpy(d, data, set->dsize); + d = ahash_data(m, m->pos, dsize); + memcpy(d, data, dsize); + set_bit(m->pos++, m->used); #ifdef IP_SET_HASH_WITH_NETS mtype_data_reset_flags(d, &flags); #endif } } - rcu_assign_pointer(h->table, t); - read_unlock_bh(&set->lock); + + spin_unlock_bh(&set->lock); /* Give time to other readers of the set */ synchronize_rcu_bh(); pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); - mtype_ahash_destroy(set, orig, 
false); + /* If there's nobody else dumping the table, destroy it */ + if (atomic_dec_and_test(&orig->uref)) { + pr_debug("Table destroy by resize %p\n", orig); + mtype_ahash_destroy(set, orig, false); + } - return 0; +out: +#ifdef IP_SET_HASH_WITH_NETS + kfree(tmp); +#endif + return ret; + +cleanup: + atomic_set(&orig->ref, 0); + atomic_dec(&orig->uref); + spin_unlock_bh(&set->lock); + mtype_ahash_destroy(set, t, false); + if (ret == -EAGAIN) + goto retry; + goto out; } /* Add an element to a hash and update the internal counters when succeeded, - * otherwise report the proper error code. */ + * otherwise report the proper error code. + */ static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) @@ -627,17 +709,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct htable *t; const struct mtype_elem *d = value; struct mtype_elem *data; - struct hbucket *n; - int i, ret = 0; - int j = AHASH_MAX(h) + 1; + struct hbucket *n, *old = ERR_PTR(-ENOENT); + int i, j = -1; bool flag_exist = flags & IPSET_FLAG_EXIST; + bool deleted = false, forceadd = false, reuse = false; u32 key, multi = 0; - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + if (h->elements >= h->maxelem) { + if (SET_WITH_TIMEOUT(set)) + /* FIXME: when set is full, we slow down here */ + mtype_expire(set, h, NLEN(set->family), set->dsize); + if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) + forceadd = true; + } + + t = ipset_dereference_protected(h->table, set); key = HKEY(value, h->initval, t->htable_bits); - n = hbucket(t, key); + n = __ipset_dereference_protected(hbucket(t, key), 1); + if (!n) { + if (forceadd) { + if (net_ratelimit()) + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; + } else if (h->elements >= h->maxelem) { + goto set_full; + } + old = NULL; + n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, + GFP_ATOMIC); + if 
(!n) + return -ENOMEM; + n->size = AHASH_INIT_SIZE; + goto copy_elem; + } for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) { + /* Reuse first deleted entry */ + if (j == -1) { + deleted = reuse = true; + j = i; + } + continue; + } data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi)) { if (flag_exist || @@ -645,85 +759,94 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_timeout_expired(ext_timeout(data, set)))) { /* Just the extensions could be overwritten */ j = i; - goto reuse_slot; - } else { - ret = -IPSET_ERR_EXIST; - goto out; + goto overwrite_extensions; } + return -IPSET_ERR_EXIST; } /* Reuse first timed out entry */ if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set)) && - j != AHASH_MAX(h) + 1) + j == -1) { j = i; + reuse = true; + } } - if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set) && n->pos) { - /* Choosing the first entry in the array to replace */ - j = 0; - goto reuse_slot; - } - if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) - /* FIXME: when set is full, we slow down here */ - mtype_expire(set, h, NLEN(set->family), set->dsize); - - if (h->elements >= h->maxelem) { - if (net_ratelimit()) - pr_warn("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - -reuse_slot: - if (j != AHASH_MAX(h) + 1) { - /* Fill out reused slot */ + if (reuse || forceadd) { data = ahash_data(n, j, set->dsize); + if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS - for (i = 0; i < IPSET_NET_COUNT; i++) { - mtype_del_cidr(h, SCIDR(data->cidr, i), - NLEN(set->family), i); - mtype_add_cidr(h, SCIDR(d->cidr, i), - NLEN(set->family), i); - } + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_del_cidr(h, + NCIDR_PUT(DCIDR_GET(data->cidr, i)), + NLEN(set->family), i); #endif - ip_set_ext_destroy(set, data); - } else { - /* Use/create a new slot */ + ip_set_ext_destroy(set, data); + h->elements--; + } + goto copy_data; + } + 
if (h->elements >= h->maxelem) + goto set_full; + /* Create a new slot */ + if (n->pos >= n->size) { TUNE_AHASH_MAX(h, multi); - ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize); - if (ret != 0) { - if (ret == -EAGAIN) - mtype_data_next(&h->next, d); - goto out; + if (n->size >= AHASH_MAX(h)) { + /* Trigger rehashing */ + mtype_data_next(&h->next, d); + return -EAGAIN; } - data = ahash_data(n, n->pos++, set->dsize); + old = n; + n = kzalloc(sizeof(*n) + + (old->size + AHASH_INIT_SIZE) * set->dsize, + GFP_ATOMIC); + if (!n) + return -ENOMEM; + memcpy(n, old, sizeof(struct hbucket) + + old->size * set->dsize); + n->size = old->size + AHASH_INIT_SIZE; + } + +copy_elem: + j = n->pos++; + data = ahash_data(n, j, set->dsize); +copy_data: + h->elements++; #ifdef IP_SET_HASH_WITH_NETS - for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_add_cidr(h, SCIDR(d->cidr, i), NLEN(set->family), - i); + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), + NLEN(set->family), i); #endif - h->elements++; - } memcpy(data, d, sizeof(struct mtype_elem)); +overwrite_extensions: #ifdef IP_SET_HASH_WITH_NETS mtype_data_set_flags(data, flags); #endif - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(data, set), ext->timeout); if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(data, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(data, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(data, set), ext); + /* Must come last for the case when timed out entry is reused */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(data, set), ext->timeout); + smp_mb__before_atomic(); + set_bit(j, n->used); + if (old != ERR_PTR(-ENOENT)) { + rcu_assign_pointer(hbucket(t, key), n); + if (old) + kfree_rcu(old, rcu); + } -out: - rcu_read_unlock_bh(); - return ret; + return 0; +set_full: + if (net_ratelimit()) + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return 
-IPSET_ERR_HASH_FULL; } -/* Delete an element from the hash: swap it with the last element - * and free up space if possible. +/* Delete an element from the hash and free up space if possible. */ static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -734,55 +857,70 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n; - int i, ret = -IPSET_ERR_EXIST; -#ifdef IP_SET_HASH_WITH_NETS - u8 j; -#endif + int i, j, k, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; + size_t dsize = set->dsize; - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + t = ipset_dereference_protected(h->table, set); key = HKEY(value, h->initval, t->htable_bits); - n = hbucket(t, key); - for (i = 0; i < n->pos; i++) { - data = ahash_data(n, i, set->dsize); + n = __ipset_dereference_protected(hbucket(t, key), 1); + if (!n) + goto out; + for (i = 0, k = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) { + k++; + continue; + } + data = ahash_data(n, i, dsize); if (!mtype_data_equal(data, d, &multi)) continue; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set))) goto out; - if (i != n->pos - 1) - /* Not last one */ - memcpy(data, ahash_data(n, n->pos - 1, set->dsize), - set->dsize); - n->pos--; + ret = 0; + clear_bit(i, n->used); + smp_mb__after_atomic(); + if (i + 1 == n->pos) + n->pos--; h->elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) - mtype_del_cidr(h, SCIDR(d->cidr, j), NLEN(set->family), - j); + mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), + NLEN(set->family), j); #endif ip_set_ext_destroy(set, data); - if (n->pos + AHASH_INIT_SIZE < n->size) { - void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) - * set->dsize, - GFP_ATOMIC); - if (!tmp) { - ret = 0; + + for (; i < n->pos; i++) { + if (!test_bit(i, n->used)) + k++; + } + if (n->pos == 0 && k == 0) { + rcu_assign_pointer(hbucket(t, key), NULL); + 
kfree_rcu(n, rcu); + } else if (k >= AHASH_INIT_SIZE) { + struct hbucket *tmp = kzalloc(sizeof(*tmp) + + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); + if (!tmp) goto out; + tmp->size = n->size - AHASH_INIT_SIZE; + for (j = 0, k = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + memcpy(tmp->value + k * dsize, data, dsize); + set_bit(j, tmp->used); + k++; } - n->size -= AHASH_INIT_SIZE; - memcpy(tmp, n->value, n->size * set->dsize); - kfree(n->value); - n->value = tmp; + tmp->pos = k; + rcu_assign_pointer(hbucket(t, key), tmp); + kfree_rcu(n, rcu); } - ret = 0; goto out; } out: - rcu_read_unlock_bh(); return ret; } @@ -801,7 +939,8 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, #ifdef IP_SET_HASH_WITH_NETS /* Special test function which takes into account the different network - * sizes added to the set */ + * sizes added to the set + */ static int mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, const struct ip_set_ext *ext, @@ -824,16 +963,21 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) { #if IPSET_NET_COUNT == 2 mtype_data_reset_elem(d, &orig); - mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]), false); + mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false); for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi; k++) { - mtype_data_netmask(d, NCIDR(h->nets[k].cidr[1]), true); + mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]), + true); #else - mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0])); + mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0])); #endif key = HKEY(d, h->initval, t->htable_bits); - n = hbucket(t, key); + n = rcu_dereference_bh(hbucket(t, key)); + if (!n) + continue; for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; data = ahash_data(n, i, set->dsize); if (!mtype_data_equal(data, d, &multi)) continue; @@ -871,13 +1015,13 @@ mtype_test(struct 
ip_set *set, void *value, const struct ip_set_ext *ext, int i, ret = 0; u32 key, multi = 0; - rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test an IP address and not a network address, - * try all possible network sizes */ + * try all possible network sizes + */ for (i = 0; i < IPSET_NET_COUNT; i++) - if (GCIDR(d->cidr, i) != SET_HOST_MASK(set->family)) + if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family)) break; if (i == IPSET_NET_COUNT) { ret = mtype_test_cidrs(set, d, ext, mext, flags); @@ -886,8 +1030,14 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, #endif key = HKEY(d, h->initval, t->htable_bits); - n = hbucket(t, key); + n = rcu_dereference_bh(hbucket(t, key)); + if (!n) { + ret = 0; + goto out; + } for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi) && !(SET_WITH_TIMEOUT(set) && @@ -897,7 +1047,6 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, } } out: - rcu_read_unlock_bh(); return ret; } @@ -909,15 +1058,19 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) const struct htable *t; struct nlattr *nested; size_t memsize; + u8 htable_bits; + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize); + htable_bits = t->htable_bits; + rcu_read_unlock_bh(); nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, - htonl(jhash_size(t->htable_bits))) || + htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; #ifdef IP_SET_HASH_WITH_NETMASK @@ -941,32 +1094,63 @@ nla_put_failure: return -EMSGSIZE; } +/* Make possible to run dumping parallel with resizing */ +static void +mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start) +{ + struct htype 
*h = set->data; + struct htable *t; + + if (start) { + rcu_read_lock_bh(); + t = rcu_dereference_bh_nfnl(h->table); + atomic_inc(&t->uref); + cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; + rcu_read_unlock_bh(); + } else if (cb->args[IPSET_CB_PRIVATE]) { + t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + /* Resizing didn't destroy the hash table */ + pr_debug("Table destroy by dump: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + cb->args[IPSET_CB_PRIVATE] = 0; + } +} + /* Reply a LIST/SAVE request: dump the elements of the specified set */ static int mtype_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { - const struct htype *h = set->data; - const struct htable *t = rcu_dereference_bh_nfnl(h->table); + const struct htable *t; struct nlattr *atd, *nested; const struct hbucket *n; const struct mtype_elem *e; u32 first = cb->args[IPSET_CB_ARG0]; /* We assume that one hash bucket fills into one page */ void *incomplete; - int i; + int i, ret = 0; atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; + pr_debug("list hash set %s\n", set->name); + t = (const struct htable *)cb->args[IPSET_CB_PRIVATE]; + /* Expire may replace a hbucket with another one */ + rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); cb->args[IPSET_CB_ARG0]++) { incomplete = skb_tail_pointer(skb); - n = hbucket(t, cb->args[IPSET_CB_ARG0]); + n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0])); pr_debug("cb->arg bucket: %lu, t %p n %p\n", cb->args[IPSET_CB_ARG0], t, n); + if (!n) + continue; for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; e = ahash_data(n, i, set->dsize); if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) @@ -977,9 +1161,10 @@ mtype_list(const struct ip_set *set, if (!nested) { if (cb->args[IPSET_CB_ARG0] == first) { nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto 
nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + goto nla_put_failure; } if (mtype_data_list(skb, e)) goto nla_put_failure; @@ -992,7 +1177,7 @@ mtype_list(const struct ip_set *set, /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: nlmsg_trim(skb, incomplete); @@ -1000,20 +1185,24 @@ nla_put_failure: pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n", set->name); cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; + } else { + ipset_nest_end(skb, atd); } - ipset_nest_end(skb, atd); - return 0; +out: + rcu_read_unlock(); + return ret; } static int IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt); + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt); static int IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); + enum ipset_adt adt, u32 *lineno, u32 flags, + bool retried); static const struct ip_set_type_variant mtype_variant = { .kadt = mtype_kadt, @@ -1027,6 +1216,7 @@ static const struct ip_set_type_variant mtype_variant = { .flush = mtype_flush, .head = mtype_head, .list = mtype_list, + .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, }; @@ -1045,7 +1235,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u8 netmask; #endif size_t hsize; - struct HTYPE *h; + struct htype *h; struct htable *t; #ifndef IP_SET_PROTO_UNDEF @@ -1064,12 +1254,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || -#ifdef IP_SET_HASH_WITH_MARKMASK - !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || -#endif !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, 
IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; +#ifdef IP_SET_HASH_WITH_MARKMASK + /* Separated condition in order to avoid directive in argument list */ + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK))) + return -IPSET_ERR_PROTOCOL; +#endif if (tb[IPSET_ATTR_HASHSIZE]) { hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); @@ -1092,7 +1284,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_MARKMASK if (tb[IPSET_ATTR_MARKMASK]) { - markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK])); if (markmask == 0) return -IPSET_ERR_INVALID_MARKMASK; @@ -1137,12 +1329,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif set->variant = &IPSET_TOKEN(HTYPE, 4_variant); set->dsize = ip_set_elem_len(set, tb, - sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); + sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)), + __alignof__(struct IPSET_TOKEN(HTYPE, 4_elem))); #ifndef IP_SET_PROTO_UNDEF } else { set->variant = &IPSET_TOKEN(HTYPE, 6_variant); set->dsize = ip_set_elem_len(set, tb, - sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); + sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)), + __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem))); } #endif if (tb[IPSET_ATTR_TIMEOUT]) { @@ -1165,3 +1359,5 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, return 0; } #endif /* IP_SET_EMIT_CREATE */ + +#undef HKEY_DATALEN diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ip.c b/kernel/net/netfilter/ipset/ip_set_hash_ip.c index 76959d79e..9d6bf19f7 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ip.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ip.c @@ -56,15 +56,15 @@ hash_ip4_data_equal(const struct hash_ip4_elem *e1, return e1->ip == e2->ip; } -static inline bool +static bool hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) goto nla_put_failure; - return 0; + return false; 
nla_put_failure: - return 1; + return true; } static inline void @@ -74,7 +74,6 @@ hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) } #define MTYPE hash_ip4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -109,20 +108,17 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, hosts; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -145,7 +141,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -162,8 +158,8 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -196,10 +192,10 @@ hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -208,12 +204,9 @@ hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) } #undef MTYPE -#undef PF 
#undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ip6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE @@ -247,22 +240,25 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -301,7 +297,8 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -318,6 +315,7 @@ hash_ip_init(void) static void __exit hash_ip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ip_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c 
b/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c index 7abf9788c..a0695a2ab 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -63,10 +63,10 @@ hash_ipmark4_data_list(struct sk_buff *skb, if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -76,10 +76,8 @@ hash_ipmark4_data_next(struct hash_ipmark4_elem *next, next->ip = d->ip; } -#define MTYPE hash_ipmark4 -#define PF 4 -#define HOST_MASK 32 -#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem) +#define MTYPE hash_ipmark4 +#define HOST_MASK 32 #include "ip_set_hash_gen.h" static int @@ -110,25 +108,22 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip, ip_to = 0; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST || @@ -147,7 +142,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr 
*tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -160,8 +155,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -191,10 +186,10 @@ hash_ipmark6_data_list(struct sk_buff *skb, if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -204,18 +199,13 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ipmark6 -#define PF 6 #define HOST_MASK 128 -#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem) -#define IP_SET_EMIT_CREATE +#define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" - static int hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -243,27 +233,30 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if 
(unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; - e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST) { @@ -274,10 +267,8 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; - return ret; + return 0; } static struct ip_set_type hash_ipmark_type __read_mostly = { @@ -307,7 +298,8 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -324,6 +316,7 @@ hash_ipmark_init(void) static void __exit hash_ipmark_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipmark_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipport.c b/kernel/net/netfilter/ipset/ip_set_hash_ipport.c index dcbcceb9a..9d84b3dff 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ipport.c @@ -69,10 +69,10 @@ hash_ipport4_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; 
nla_put_failure: - return 1; + return true; } static inline void @@ -83,10 +83,8 @@ hash_ipport4_data_next(struct hash_ipport4_elem *next, next->port = d->port; } -#define MTYPE hash_ipport4 -#define PF 4 -#define HOST_MASK 32 -#define HKEY_DATALEN sizeof(struct hash_ipport4_elem) +#define MTYPE hash_ipport4 +#define HOST_MASK 32 #include "ip_set_hash_gen.h" static int @@ -118,29 +116,23 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -148,8 +140,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -171,7 +164,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } else if 
(tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -195,8 +188,8 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } } return ret; @@ -231,10 +224,10 @@ hash_ipport6_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -245,15 +238,11 @@ hash_ipport6_data_next(struct hash_ipport4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ipport6 -#define PF 6 #define HOST_MASK 128 -#define HKEY_DATALEN sizeof(struct hash_ipport6_elem) -#define IP_SET_EMIT_CREATE +#define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int @@ -285,31 +274,31 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = 
nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -317,8 +306,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; @@ -341,8 +331,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -376,7 +366,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -393,6 +384,7 @@ hash_ipport_init(void) static void __exit hash_ipport_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipport_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c b/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c index 7ef93fc88..215b7b942 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -63,17 +63,17 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, static bool 
hash_ipportip4_data_list(struct sk_buff *skb, - const struct hash_ipportip4_elem *data) + const struct hash_ipportip4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -86,7 +86,6 @@ hash_ipportip4_data_next(struct hash_ipportip4_elem *next, /* Common functions */ #define MTYPE hash_ipportip4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -120,22 +119,19 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -143,10 +139,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if 
(tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -154,8 +147,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -177,7 +171,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -201,8 +195,8 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } } return ret; @@ -240,10 +234,10 @@ hash_ipportip6_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -254,11 +248,9 @@ hash_ipportip6_data_next(struct hash_ipportip4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_ipportip6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -293,24 +285,27 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - 
tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -318,10 +313,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -329,8 +321,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; @@ -353,8 +346,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -388,7 +381,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -405,6 +399,7 @@ hash_ipportip_init(void) static void __exit 
hash_ipportip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipportip_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c b/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c index b6012ad92..9ca719625 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -114,10 +114,10 @@ hash_ipportnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -130,7 +130,6 @@ hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, } #define MTYPE hash_ipportnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -142,7 +141,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -174,23 +173,20 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) 
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -205,10 +201,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr = cidr - 1; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -216,14 +209,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -249,7 +244,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -270,8 +265,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); + } if (retried) ip = ntohl(h->next.ip); @@ -294,8 +290,8 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = ip2_last + 1; } } @@ -367,10 +363,10 @@ hash_ipportnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; 
nla_put_failure: - return 1; + return true; } static inline void @@ -381,11 +377,9 @@ hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_ipportnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -398,7 +392,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -429,27 +423,28 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); 
+ if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -466,10 +461,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip2, e.cidr + 1); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -477,14 +469,16 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -508,8 +502,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -547,7 +541,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -564,6 +559,7 @@ hash_ipportnet_init(void) static void __exit hash_ipportnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipportnet_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_mac.c b/kernel/net/netfilter/ipset/ip_set_hash_mac.c index 65690b52a..f1e7d2c0f 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_mac.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_mac.c @@ -52,7 +52,12 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1, 
static inline bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { - return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether); + if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) + goto nla_put_failure; + return false; + +nla_put_failure: + return true; } static inline void @@ -62,7 +67,6 @@ hash_mac4_data_next(struct hash_mac4_elem *next, } #define MTYPE hash_mac4 -#define PF 4 #define HOST_MASK 32 #define IP_SET_EMIT_CREATE #define IP_SET_PROTO_UNDEF @@ -85,10 +89,10 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, return 0; if (skb_mac_header(skb) < skb->head || - (skb_mac_header(skb) + ETH_HLEN) > skb->data) + (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); @@ -103,22 +107,16 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - if (unlikely(!tb[IPSET_ATTR_ETHER] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_ETHER])) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER])); if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) return -IPSET_ERR_HASH_ELEM; @@ -149,7 +147,8 @@ static struct ip_set_type hash_mac_type __read_mostly = { [IPSET_ATTR_LINENO] = 
{ .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -166,6 +165,7 @@ hash_mac_init(void) static void __exit hash_mac_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_mac_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_net.c b/kernel/net/netfilter/ipset/ip_set_hash_net.c index 6b3ac10ac..3e4bffdc1 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_net.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_net.c @@ -95,10 +95,10 @@ hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -109,7 +109,6 @@ hash_net4_data_next(struct hash_net4_elem *next, } #define MTYPE hash_net4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -121,7 +120,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -147,21 +146,18 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, last; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, 
IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -173,6 +169,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -180,7 +177,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt, set) ? -ret: + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 
0 : ret; } @@ -202,8 +199,8 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip = last + 1; } return ret; @@ -264,10 +261,10 @@ hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -277,11 +274,9 @@ hash_net6_data_next(struct hash_net4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_net6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -294,7 +289,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -318,36 +313,34 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = 
ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - - if (!e.cidr || e.cidr > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr || e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip, e.cidr); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -383,7 +376,8 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -400,6 +394,7 @@ hash_net_init(void) static void __exit hash_net_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_net_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netiface.c b/kernel/net/netfilter/ipset/ip_set_hash_netiface.c index 380ef5148..43d8c9896 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_netiface.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -37,88 +36,13 @@ MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); -/* Interface name rbtree */ - -struct iface_node { - struct rb_node node; - char iface[IFNAMSIZ]; -}; - -#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface) - 
-static void -rbtree_destroy(struct rb_root *root) -{ - struct iface_node *node, *next; - - rbtree_postorder_for_each_entry_safe(node, next, root, node) - kfree(node); - - *root = RB_ROOT; -} - -static int -iface_test(struct rb_root *root, const char **iface) -{ - struct rb_node *n = root->rb_node; - - while (n) { - const char *d = iface_data(n); - int res = strcmp(*iface, d); - - if (res < 0) - n = n->rb_left; - else if (res > 0) - n = n->rb_right; - else { - *iface = d; - return 1; - } - } - return 0; -} - -static int -iface_add(struct rb_root *root, const char **iface) -{ - struct rb_node **n = &(root->rb_node), *p = NULL; - struct iface_node *d; - - while (*n) { - char *ifname = iface_data(*n); - int res = strcmp(*iface, ifname); - - p = *n; - if (res < 0) - n = &((*n)->rb_left); - else if (res > 0) - n = &((*n)->rb_right); - else { - *iface = ifname; - return 0; - } - } - - d = kzalloc(sizeof(*d), GFP_ATOMIC); - if (!d) - return -ENOMEM; - strcpy(d->iface, *iface); - - rb_link_node(&d->node, p, n); - rb_insert_color(&d->node, root); - - *iface = d->iface; - return 0; -} - /* Type specific function prefix */ #define HTYPE hash_netiface #define IP_SET_HASH_WITH_NETS -#define IP_SET_HASH_WITH_RBTREE #define IP_SET_HASH_WITH_MULTI #define IP_SET_HASH_WITH_NET0 -#define STREQ(a, b) (strcmp(a, b) == 0) +#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ) /* IPv4 variant */ @@ -137,7 +61,7 @@ struct hash_netiface4_elem { u8 cidr; u8 nomatch; u8 elem; - const char *iface; + char iface[IFNAMSIZ]; }; /* Common functions */ @@ -151,7 +75,7 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - ip1->iface == ip2->iface; + strcmp(ip1->iface, ip2->iface) == 0; } static inline int @@ -193,10 +117,10 @@ hash_netiface4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + 
return true; } static inline void @@ -207,7 +131,6 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next, } #define MTYPE hash_netiface4 -#define PF 4 #define HOST_MASK 32 #define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) #include "ip_set_hash_gen.h" @@ -220,7 +143,7 @@ static const char *get_physindev_name(const struct sk_buff *skb) return dev ? dev->name : NULL; } -static const char *get_phyoutdev_name(const struct sk_buff *skb) +static const char *get_physoutdev_name(const struct sk_buff *skb) { struct net_device *dev = nf_bridge_get_physoutdev(skb); @@ -236,11 +159,10 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - int ret; if (e.cidr == 0) return -EINVAL; @@ -250,35 +172,25 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= ip_set_netmask(e.cidr); -#define IFACE(dir) (par->dir ? par->dir->name : NULL) +#define IFACE(dir) (par->dir ? par->dir->name : "") #define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - e.iface = SRCDIR ? get_physindev_name(skb) : - get_phyoutdev_name(skb); + const char *eiface = SRCDIR ? get_physindev_name(skb) : + get_physoutdev_name(skb); - if (!e.iface) + if (!eiface) return -EINVAL; + STRLCPY(e.iface, eiface); e.physdev = 1; -#else - e.iface = NULL; #endif - } else - e.iface = SRCDIR ? IFACE(in) : IFACE(out); + } else { + STRLCPY(e.iface, SRCDIR ? 
IFACE(in) : IFACE(out)); + } - if (!e.iface) + if (strlen(e.iface) == 0) return -EINVAL; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; - return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -291,25 +203,21 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; - char iface[IFNAMSIZ]; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -318,21 +226,11 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - e.iface = iface; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; + nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { 
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) @@ -353,8 +251,9 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr); + } if (retried) ip = ntohl(h->next.ip); @@ -365,8 +264,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip = last + 1; } return ret; @@ -388,7 +287,7 @@ struct hash_netiface6_elem { u8 cidr; u8 nomatch; u8 elem; - const char *iface; + char iface[IFNAMSIZ]; }; /* Common functions */ @@ -402,7 +301,7 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - ip1->iface == ip2->iface; + strcmp(ip1->iface, ip2->iface) == 0; } static inline int @@ -444,10 +343,10 @@ hash_netiface6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -457,12 +356,9 @@ hash_netiface6_data_next(struct hash_netiface4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_netiface6 -#define PF 6 #define HOST_MASK 128 #define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) #define IP_SET_EMIT_CREATE @@ -476,11 +372,10 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - int ret; if (e.cidr == 0) return -EINVAL; @@ -492,85 +387,64 @@ 
hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - e.iface = SRCDIR ? get_physindev_name(skb) : - get_phyoutdev_name(skb); - if (!e.iface) - return -EINVAL; + const char *eiface = SRCDIR ? get_physindev_name(skb) : + get_physoutdev_name(skb); + if (!eiface) + return -EINVAL; + STRLCPY(e.iface, eiface); e.physdev = 1; -#else - e.iface = NULL; #endif - } else - e.iface = SRCDIR ? IFACE(in) : IFACE(out); + } else { + STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + } - if (!e.iface) + if (strlen(e.iface) == 0) return -EINVAL; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - char iface[IFNAMSIZ]; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if 
(tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (e.cidr > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + ip6_netmask(&e.ip, e.cidr); - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - e.iface = iface; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; + nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) @@ -613,7 +487,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -630,6 +505,7 @@ hash_netiface_init(void) static void __exit hash_netiface_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netiface_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netnet.c b/kernel/net/netfilter/ipset/ip_set_hash_netnet.c index ea8772afb..a93dfebff 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_netnet.c @@ -57,8 +57,8 @@ struct hash_netnet4_elem { 
static inline bool hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, - const struct hash_netnet4_elem *ip2, - u32 *multi) + const struct hash_netnet4_elem *ip2, + u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp; @@ -84,7 +84,7 @@ hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) static inline void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, - struct hash_netnet4_elem *orig) + struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -103,7 +103,7 @@ hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) static bool hash_netnet4_data_list(struct sk_buff *skb, - const struct hash_netnet4_elem *data) + const struct hash_netnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -122,28 +122,34 @@ nla_put_failure: static inline void hash_netnet4_data_next(struct hash_netnet4_elem *next, - const struct hash_netnet4_elem *d) + const struct hash_netnet4_elem *d) { next->ipcmp = d->ipcmp; } #define MTYPE hash_netnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netnet4_init(struct hash_netnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; @@ -157,7 +163,7 
@@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -165,45 +171,43 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; - u8 cidr, cidr2; int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + hash_netnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > HOST_MASK) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[0] = cidr; } if (tb[IPSET_ATTR_CIDR2]) { - cidr2 = 
nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!cidr2 || cidr2 > HOST_MASK) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[1] = cidr2; } if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -226,8 +230,9 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); + } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { @@ -238,28 +243,27 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); + } if (retried) ip = ntohl(h->next.ip[0]); while (!after(ip, ip_to)) { e.ip[0] = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &cidr); - e.cidr[0] = cidr; + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); ip2 = (retried && ip == ntohl(h->next.ip[0])) ? 
ntohl(h->next.ip[1]) : ip2_from; while (!after(ip2, ip2_to)) { e.ip[1] = htonl(ip2); - last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2); - e.cidr[1] = cidr2; + last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = last2 + 1; } ip = last + 1; @@ -283,8 +287,8 @@ struct hash_netnet6_elem { static inline bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, - const struct hash_netnet6_elem *ip2, - u32 *multi) + const struct hash_netnet6_elem *ip2, + u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && @@ -311,7 +315,7 @@ hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) static inline void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, - struct hash_netnet6_elem *orig) + struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -330,7 +334,7 @@ hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) static bool hash_netnet6_data_list(struct sk_buff *skb, - const struct hash_netnet6_elem *data) + const struct hash_netnet6_elem *data) { u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; @@ -349,34 +353,39 @@ nla_put_failure: static inline void hash_netnet6_data_next(struct hash_netnet4_elem *next, - const struct hash_netnet6_elem *d) + const struct hash_netnet6_elem *d) { } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netnet6_init(struct hash_netnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) - e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK; + e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6); @@ -388,50 +397,53 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + hash_netnet6_init(&e); if 
(unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || - ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - if (tb[IPSET_ATTR_CIDR2]) + if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - - if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || - e.cidr[1] > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -470,7 +482,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, 
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -487,6 +500,7 @@ hash_netnet_init(void) static void __exit hash_netnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netnet_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netport.c b/kernel/net/netfilter/ipset/ip_set_hash_netport.c index c0ddb58d1..731813e0f 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_netport.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_netport.c @@ -110,10 +110,10 @@ hash_netport4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -125,7 +125,6 @@ hash_netport4_data_next(struct hash_netport4_elem *next, } #define MTYPE hash_netport4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -137,7 +136,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -167,23 +166,20 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || 
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -194,10 +190,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr = cidr - 1; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -205,8 +198,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -215,6 +209,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -240,8 +235,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); + } if (retried) ip = ntohl(h->next.ip); @@ -257,8 +253,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } ip = last + 1; } @@ -326,10 +322,10 @@ hash_netport6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, 
IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -340,11 +336,9 @@ hash_netport6_data_next(struct hash_netport4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netport6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -357,7 +351,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -387,25 +381,22 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -417,10 +408,7 @@ hash_netport6_uadt(struct ip_set 
*set, struct nlattr *tb[], } ip6_netmask(&e.ip, e.cidr + 1); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -428,14 +416,16 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -459,8 +449,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -495,7 +485,8 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -512,6 +503,7 @@ hash_netport_init(void) static void __exit hash_netport_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netport_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c b/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c index bfaa94c7b..9a14c2378 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -54,7 +54,7 @@ struct hash_netportnet4_elem { u16 ccmp; }; u16 padding; - u8 nomatch:1; + u8 nomatch; u8 proto; }; @@ -62,8 +62,8 @@ struct hash_netportnet4_elem { static inline bool 
hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, - const struct hash_netportnet4_elem *ip2, - u32 *multi) + const struct hash_netportnet4_elem *ip2, + u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp && @@ -91,7 +91,7 @@ hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) static inline void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, - struct hash_netportnet4_elem *orig) + struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -111,7 +111,7 @@ hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, static bool hash_netportnet4_data_list(struct sk_buff *skb, - const struct hash_netportnet4_elem *data) + const struct hash_netportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -124,37 +124,43 @@ hash_netportnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, - const struct hash_netportnet4_elem *d) + const struct hash_netportnet4_elem *d) { next->ipcmp = d->ipcmp; next->port = d->port; } #define MTYPE hash_netportnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netportnet4_init(struct hash_netportnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], 
HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; @@ -172,7 +178,7 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -181,49 +187,43 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; bool with_ports = false; - u8 cidr, cidr2; int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + hash_netportnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], 
&ip2_from) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > HOST_MASK) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[0] = cidr; } if (tb[IPSET_ATTR_CIDR2]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!cidr || cidr > HOST_MASK) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[1] = cidr; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -231,14 +231,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -262,8 +264,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); + } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { @@ -281,16 +284,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); + } if (retried) ip = ntohl(h->next.ip[0]); while (!after(ip, ip_to)) { e.ip[0] = htonl(ip); - ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr); - e.cidr[0] = cidr; + ip_last = 
ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { @@ -301,13 +304,12 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], while (!after(ip2, ip2_to)) { e.ip[1] = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, - &cidr2); - e.cidr[1] = cidr2; + &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = ip2_last + 1; } } @@ -326,7 +328,7 @@ struct hash_netportnet6_elem { u16 ccmp; }; u16 padding; - u8 nomatch:1; + u8 nomatch; u8 proto; }; @@ -334,8 +336,8 @@ struct hash_netportnet6_elem { static inline bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, - const struct hash_netportnet6_elem *ip2, - u32 *multi) + const struct hash_netportnet6_elem *ip2, + u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && @@ -364,7 +366,7 @@ hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) static inline void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, - struct hash_netportnet6_elem *orig) + struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -384,7 +386,7 @@ hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, static bool hash_netportnet6_data_list(struct sk_buff *skb, - const struct hash_netportnet6_elem *data) + const struct hash_netportnet6_elem *data) { u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; @@ -397,41 +399,46 @@ hash_netportnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void hash_netportnet6_data_next(struct hash_netportnet4_elem *next, - const struct hash_netportnet6_elem *d) + const struct hash_netportnet6_elem *d) { next->port = d->port; } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netportnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netportnet6_init(struct hash_netportnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; @@ -449,7 +456,7 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -459,47 +466,46 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; - 
e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + hash_netportnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || - ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - if (tb[IPSET_ATTR_CIDR2]) + if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - - if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || - e.cidr[1] > HOST_MASK)) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return 
-IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -507,14 +513,16 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -538,8 +546,8 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -577,7 +585,8 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -594,6 +603,7 @@ hash_netportnet_init(void) static void __exit hash_netportnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netportnet_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_list_set.c b/kernel/net/netfilter/ipset/ip_set_list_set.c index f8f682806..bbede95c9 100644 --- a/kernel/net/netfilter/ipset/ip_set_list_set.c +++ b/kernel/net/netfilter/ipset/ip_set_list_set.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -27,8 +28,10 @@ MODULE_ALIAS("ip_set_list:set"); /* Member elements */ struct set_elem { + struct rcu_head rcu; + struct list_head list; ip_set_id_t id; -}; +} __aligned(__alignof__(u64)); struct set_adt_elem { ip_set_id_t id; @@ -41,12 +44,9 @@ struct list_set { u32 size; /* size of set list array */ 
struct timer_list gc; /* garbage collection */ struct net *net; /* namespace */ - struct set_elem members[0]; /* the set members */ + struct list_head members; /* the set members */ }; -#define list_set_elem(set, map, id) \ - (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize) - static int list_set_ktest(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -54,17 +54,14 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i, cmdflags = opt->cmdflags; + u32 cmdflags = opt->cmdflags; int ret; /* Don't lookup sub-counters at all */ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -91,13 +88,9 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -115,13 +108,9 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -138,110 +127,65 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { 
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + int ret = -EINVAL; + rcu_read_lock(); switch (adt) { case IPSET_TEST: - return list_set_ktest(set, skb, par, opt, &ext); + ret = list_set_ktest(set, skb, par, opt, &ext); + break; case IPSET_ADD: - return list_set_kadd(set, skb, par, opt, &ext); + ret = list_set_kadd(set, skb, par, opt, &ext); + break; case IPSET_DEL: - return list_set_kdel(set, skb, par, opt, &ext); + ret = list_set_kdel(set, skb, par, opt, &ext); + break; default: break; } - return -EINVAL; -} - -static bool -id_eq(const struct ip_set *set, u32 i, ip_set_id_t id) -{ - const struct list_set *map = set->data; - const struct set_elem *e; - - if (i >= map->size) - return 0; + rcu_read_unlock(); - e = list_set_elem(set, map, i); - return !!(e->id == id && - !(SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set)))); + return ret; } -static int -list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, - const struct ip_set_ext *ext) -{ - struct list_set *map = set->data; - struct set_elem *e = list_set_elem(set, map, i); +/* Userspace interfaces: we are protected by the nfnl mutex */ - if (e->id != IPSET_INVALID_ID) { - if (i == map->size - 1) { - /* Last element replaced: e.g. 
add new,before,last */ - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - } else { - struct set_elem *x = list_set_elem(set, map, - map->size - 1); - - /* Last element pushed off */ - if (x->id != IPSET_INVALID_ID) { - ip_set_put_byindex(map->net, x->id); - ip_set_ext_destroy(set, x); - } - memmove(list_set_elem(set, map, i + 1), e, - set->dsize * (map->size - (i + 1))); - /* Extensions must be initialized to zero */ - memset(e, 0, set->dsize); - } - } - - e->id = d->id; - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(e, set), ext->timeout); - if (SET_WITH_COUNTER(set)) - ip_set_init_counter(ext_counter(e, set), ext); - if (SET_WITH_COMMENT(set)) - ip_set_init_comment(ext_comment(e, set), ext); - if (SET_WITH_SKBINFO(set)) - ip_set_init_skbinfo(ext_skbinfo(e, set), ext); - return 0; -} - -static int -list_set_del(struct ip_set *set, u32 i) +static void +__list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; - struct set_elem *e = list_set_elem(set, map, i); ip_set_put_byindex(map->net, e->id); + /* We may call it, because we don't have a to be destroyed + * extension which is used by the kernel. 
+ */ ip_set_ext_destroy(set, e); + kfree_rcu(e, rcu); +} - if (i < map->size - 1) - memmove(e, list_set_elem(set, map, i + 1), - set->dsize * (map->size - (i + 1))); +static inline void +list_set_del(struct ip_set *set, struct set_elem *e) +{ + list_del_rcu(&e->list); + __list_set_del(set, e); +} - /* Last element */ - e = list_set_elem(set, map, map->size - 1); - e->id = IPSET_INVALID_ID; - return 0; +static inline void +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) +{ + list_replace_rcu(&old->list, &e->list); + __list_set_del(set, old); } static void set_cleanup_entries(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e; - u32 i = 0; + struct set_elem *e, *n; - while (i < map->size) { - e = list_set_elem(set, map, i); - if (e->id != IPSET_INVALID_ID && - ip_set_timeout_expired(ext_timeout(e, set))) - list_set_del(set, i); - /* Check element moved to position i in next loop */ - else - i++; - } + list_for_each_entry_safe(e, n, &map->members, list) + if (ip_set_timeout_expired(ext_timeout(e, set))) + list_set_del(set, e); } static int @@ -250,31 +194,46 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; - u32 i; + struct set_elem *e, *next, *prev = NULL; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; - else if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - else if (e->id != d->id) + else if (e->id != d->id) { + prev = e; continue; + } - if (d->before == 0) - return 1; - else if (d->before > 0) - ret = id_eq(set, i + 1, d->refid); - else - ret = i > 0 && id_eq(set, i - 1, d->refid); + if (d->before == 0) { + ret = 1; + } else if (d->before > 0) { + next 
= list_next_entry(e, list); + ret = !list_is_last(&e->list, &map->members) && + next->id == d->refid; + } else { + ret = prev && prev->id == d->refid; + } return ret; } return 0; } +static void +list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext, + struct set_elem *e) +{ + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); + /* Update timeout last */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, set), ext->timeout); +} static int list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -282,60 +241,78 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; + struct set_elem *e, *n, *prev, *next; bool flag_exist = flags & IPSET_FLAG_EXIST; - u32 i, ret = 0; if (SET_WITH_TIMEOUT(set)) set_cleanup_entries(set); - /* Check already added element */ - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - goto insert; - else if (e->id != d->id) + /* Find where to add the new entry */ + n = prev = next = NULL; + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - - if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) || - (d->before < 0 && - (i == 0 || !id_eq(set, i - 1, d->refid)))) - /* Before/after doesn't match */ + else if (d->id == e->id) + n = e; + else if (d->before == 0 || e->id != d->refid) + continue; + else if (d->before > 0) + next = e; + else + prev = e; + } + /* Re-add already existing element */ + if (n) { + if ((d->before > 0 && !next) || + (d->before < 0 && !prev)) return -IPSET_ERR_REF_EXIST; if (!flag_exist) - /* Can't re-add */ return -IPSET_ERR_EXIST; /* Update 
extensions */ - ip_set_ext_destroy(set, e); + ip_set_ext_destroy(set, n); + list_set_init_extensions(set, ext, n); - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(e, set), ext->timeout); - if (SET_WITH_COUNTER(set)) - ip_set_init_counter(ext_counter(e, set), ext); - if (SET_WITH_COMMENT(set)) - ip_set_init_comment(ext_comment(e, set), ext); - if (SET_WITH_SKBINFO(set)) - ip_set_init_skbinfo(ext_skbinfo(e, set), ext); /* Set is already added to the list */ ip_set_put_byindex(map->net, d->id); return 0; } -insert: - ret = -IPSET_ERR_LIST_FULL; - for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - ret = d->before != 0 ? -IPSET_ERR_REF_EXIST - : list_set_add(set, i, d, ext); - else if (e->id != d->refid) - continue; - else if (d->before > 0) - ret = list_set_add(set, i, d, ext); - else if (i + 1 < map->size) - ret = list_set_add(set, i + 1, d, ext); + /* Add new entry */ + if (d->before == 0) { + /* Append */ + n = list_empty(&map->members) ? NULL : + list_last_entry(&map->members, struct set_elem, list); + } else if (d->before > 0) { + /* Insert after next element */ + if (!list_is_last(&next->list, &map->members)) + n = list_next_entry(next, list); + } else { + /* Insert before prev element */ + if (prev->list.prev != &map->members) + n = list_prev_entry(prev, list); } + /* Can we replace a timed out entry? 
*/ + if (n && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(n, set)))) + n = NULL; + + e = kzalloc(set->dsize, GFP_ATOMIC); + if (!e) + return -ENOMEM; + e->id = d->id; + INIT_LIST_HEAD(&e->list); + list_set_init_extensions(set, ext, e); + if (n) + list_set_replace(set, e, n); + else if (next) + list_add_tail_rcu(&e->list, &next->list); + else if (prev) + list_add_rcu(&e->list, &prev->list); + else + list_add_tail_rcu(&e->list, &map->members); - return ret; + return 0; } static int @@ -344,32 +321,30 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; - u32 i; - - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return d->before != 0 ? -IPSET_ERR_REF_EXIST - : -IPSET_ERR_EXIST; - else if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + struct set_elem *e, *next, *prev = NULL; + + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - else if (e->id != d->id) + else if (e->id != d->id) { + prev = e; continue; + } - if (d->before == 0) - return list_set_del(set, i); - else if (d->before > 0) { - if (!id_eq(set, i + 1, d->refid)) + if (d->before > 0) { + next = list_next_entry(e, list); + if (list_is_last(&e->list, &map->members) || + next->id != d->refid) return -IPSET_ERR_REF_EXIST; - return list_set_del(set, i); - } else if (i == 0 || !id_eq(set, i - 1, d->refid)) - return -IPSET_ERR_REF_EXIST; - else - return list_set_del(set, i); + } else if (d->before < 0) { + if (!prev || prev->id != d->refid) + return -IPSET_ERR_REF_EXIST; + } + list_set_del(set, e); + return 0; } - return -IPSET_ERR_EXIST; + return d->before != 0 ? 
-IPSET_ERR_REF_EXIST : -IPSET_ERR_EXIST; } static int @@ -383,19 +358,13 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set *s; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_NAME] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_NAME] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -410,6 +379,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + e.before = f & IPSET_FLAG_BEFORE; } @@ -447,27 +417,26 @@ static void list_set_flush(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e; - u32 i; - - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id != IPSET_INVALID_ID) { - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - e->id = IPSET_INVALID_ID; - } - } + struct set_elem *e, *n; + + list_for_each_entry_safe(e, n, &map->members, list) + list_set_del(set, e); } static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; + struct set_elem *e, *n; if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); - list_set_flush(set); + list_for_each_entry_safe(e, n, &map->members, list) { + list_del(&e->list); + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + kfree(e); + } kfree(map); set->data = NULL; @@ -478,6 +447,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) { const 
struct list_set *map = set->data; struct nlattr *nested; + struct set_elem *e; + u32 n = 0; + + list_for_each_entry(e, &map->members, list) + n++; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) @@ -485,7 +459,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->size * set->dsize))) + htonl(sizeof(*map) + n * set->dsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -502,18 +476,22 @@ list_set_list(const struct ip_set *set, { const struct list_set *map = set->data; struct nlattr *atd, *nested; - u32 i, first = cb->args[IPSET_CB_ARG0]; - const struct set_elem *e; + u32 i = 0, first = cb->args[IPSET_CB_ARG0]; + struct set_elem *e; + int ret = 0; atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; - for (; cb->args[IPSET_CB_ARG0] < map->size; - cb->args[IPSET_CB_ARG0]++) { - i = cb->args[IPSET_CB_ARG0]; - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - goto finish; + list_for_each_entry(e, &map->members, list) { + if (i == first) + break; + i++; + } + + rcu_read_lock(); + list_for_each_entry_from(e, &map->members, list) { + i++; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -521,9 +499,10 @@ list_set_list(const struct ip_set *set, if (!nested) { if (i == first) { nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + goto nla_put_failure; } if (nla_put_string(skb, IPSET_ATTR_NAME, ip_set_name_byindex(map->net, e->id))) @@ -532,20 +511,23 @@ list_set_list(const struct ip_set *set, goto nla_put_failure; ipset_nest_end(skb, nested); } -finish: + ipset_nest_end(skb, atd); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: 
nla_nest_cancel(skb, nested); if (unlikely(i == first)) { cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; } + cb->args[IPSET_CB_ARG0] = i - 1; ipset_nest_end(skb, atd); - return 0; +out: + rcu_read_unlock(); + return ret; } static bool @@ -577,12 +559,12 @@ static const struct ip_set_type_variant set_variant = { static void list_set_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct list_set *map = set->data; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set_cleanup_entries(set); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -594,7 +576,7 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct list_set *map = set->data; init_timer(&map->gc); - map->gc.data = (unsigned long) set; + map->gc.data = (unsigned long)set; map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -606,24 +588,16 @@ static bool init_list_set(struct net *net, struct ip_set *set, u32 size) { struct list_set *map; - struct set_elem *e; - u32 i; - map = kzalloc(sizeof(*map) + - min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize, - GFP_KERNEL); + map = kzalloc(sizeof(*map), GFP_KERNEL); if (!map) return false; map->size = size; map->net = net; + INIT_LIST_HEAD(&map->members); set->data = map; - for (i = 0; i < size; i++) { - e = list_set_elem(set, map, i); - e->id = IPSET_INVALID_ID; - } - return true; } @@ -644,7 +618,8 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], size = IP_SET_LIST_MIN_SIZE; set->variant = &set_variant; - set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem)); + set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), + __alignof__(struct set_elem)); if (!init_list_set(net, set, size)) return -ENOMEM; if (tb[IPSET_ATTR_TIMEOUT]) { @@ -678,7 
+653,8 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -695,6 +671,7 @@ list_set_init(void) static void __exit list_set_fini(void) { + rcu_barrier(); ip_set_type_unregister(&list_set_type); } diff --git a/kernel/net/netfilter/ipset/pfxlen.c b/kernel/net/netfilter/ipset/pfxlen.c index 04d15fdc9..1c8a42c10 100644 --- a/kernel/net/netfilter/ipset/pfxlen.c +++ b/kernel/net/netfilter/ipset/pfxlen.c @@ -1,9 +1,7 @@ #include #include -/* - * Prefixlen maps for fast conversions, by Jan Engelhardt. - */ +/* Prefixlen maps for fast conversions, by Jan Engelhardt. */ #define E(a, b, c, d) \ {.ip6 = { \ @@ -11,8 +9,7 @@ htonl(c), htonl(d), \ } } -/* - * This table works for both IPv4 and IPv6; +/* This table works for both IPv4 and IPv6; * just use prefixlen_netmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_netmask_map[] = { @@ -149,13 +146,12 @@ const union nf_inet_addr ip_set_netmask_map[] = { EXPORT_SYMBOL_GPL(ip_set_netmask_map); #undef E -#define E(a, b, c, d) \ - {.ip6 = { (__force __be32) a, (__force __be32) b, \ - (__force __be32) c, (__force __be32) d, \ +#define E(a, b, c, d) \ + {.ip6 = { (__force __be32)a, (__force __be32)b, \ + (__force __be32)c, (__force __be32)d, \ } } -/* - * This table works for both IPv4 and IPv6; +/* This table works for both IPv4 and IPv6; * just use prefixlen_hostmask_map[prefixlength].ip. 
*/ const union nf_inet_addr ip_set_hostmask_map[] = { diff --git a/kernel/net/netfilter/ipvs/Kconfig b/kernel/net/netfilter/ipvs/Kconfig index 3b6929dec..b32fb0dbe 100644 --- a/kernel/net/netfilter/ipvs/Kconfig +++ b/kernel/net/netfilter/ipvs/Kconfig @@ -162,6 +162,17 @@ config IP_VS_FO If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_OVF + tristate "weighted overflow scheduling" + ---help--- + The weighted overflow scheduling algorithm directs network + connections to the server with the highest weight that is + currently available and overflows to the next when active + connections exceed the node's weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_LBLC tristate "locality-based least-connection scheduling" ---help--- diff --git a/kernel/net/netfilter/ipvs/Makefile b/kernel/net/netfilter/ipvs/Makefile index 38b2723b2..67f3f4389 100644 --- a/kernel/net/netfilter/ipvs/Makefile +++ b/kernel/net/netfilter/ipvs/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o +obj-$(CONFIG_IP_VS_OVF) += ip_vs_ovf.o obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o diff --git a/kernel/net/netfilter/ipvs/ip_vs_app.c b/kernel/net/netfilter/ipvs/ip_vs_app.c index dfd7b65b3..0328f7250 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_app.c +++ b/kernel/net/netfilter/ipvs/ip_vs_app.c @@ -75,7 +75,7 @@ static void ip_vs_app_inc_rcu_free(struct rcu_head *head) * Allocate/initialize app incarnation and register it in proto apps. 
*/ static int -ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, +ip_vs_app_inc_new(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port) { struct ip_vs_protocol *pp; @@ -107,7 +107,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, } } - ret = pp->register_app(net, inc); + ret = pp->register_app(ipvs, inc); if (ret) goto out; @@ -127,7 +127,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, * Release app incarnation */ static void -ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) +ip_vs_app_inc_release(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_protocol *pp; @@ -135,7 +135,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) return; if (pp->unregister_app) - pp->unregister_app(net, inc); + pp->unregister_app(ipvs, inc); IP_VS_DBG(9, "%s App %s:%u unregistered\n", pp->name, inc->name, ntohs(inc->port)); @@ -175,14 +175,14 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc) * Register an application incarnation in protocol applications */ int -register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, +register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port) { int result; mutex_lock(&__ip_vs_app_mutex); - result = ip_vs_app_inc_new(net, app, proto, port); + result = ip_vs_app_inc_new(ipvs, app, proto, port); mutex_unlock(&__ip_vs_app_mutex); @@ -191,15 +191,11 @@ register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, /* Register application for netns */ -struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app) +struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *a; int err = 0; - if (!ipvs) - return ERR_PTR(-ENOENT); - mutex_lock(&__ip_vs_app_mutex); list_for_each_entry(a, &ipvs->app_list, a_list) { @@ -230,21 +226,17 @@ out_unlock: * We are 
sure there are no app incarnations attached to services * Caller should use synchronize_rcu() or rcu_barrier() */ -void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) +void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *a, *anxt, *inc, *nxt; - if (!ipvs) - return; - mutex_lock(&__ip_vs_app_mutex); list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { if (app && strcmp(app->name, a->name)) continue; list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { - ip_vs_app_inc_release(net, inc); + ip_vs_app_inc_release(ipvs, inc); } list_del(&a->a_list); @@ -611,17 +603,19 @@ static const struct file_operations ip_vs_app_fops = { }; #endif -int __net_init ip_vs_app_net_init(struct net *net) +int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; INIT_LIST_HEAD(&ipvs->app_list); proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); return 0; } -void __net_exit ip_vs_app_net_cleanup(struct net *net) +void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { - unregister_ip_vs_app(net, NULL /* all */); + struct net *net = ipvs->net; + + unregister_ip_vs_app(ipvs, NULL /* all */); remove_proc_entry("ip_vs_app", net->proc_net); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_conn.c b/kernel/net/netfilter/ipvs/ip_vs_conn.c index b0f7b626b..85ca189bd 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_conn.c +++ b/kernel/net/netfilter/ipvs/ip_vs_conn.c @@ -108,7 +108,7 @@ static inline void ct_write_unlock_bh(unsigned int key) /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto, +static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { @@ -116,11 +116,11 @@ static unsigned int ip_vs_conn_hashkey(struct net *net, int af, 
unsigned int pro if (af == AF_INET6) return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)net>>8)) & ip_vs_conn_tab_mask; + ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; #endif return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)net>>8)) & ip_vs_conn_tab_mask; + ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, @@ -141,14 +141,14 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, port = p->vport; } - return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); } static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, &cp->caddr, cp->cport, NULL, 0, &p); if (cp->pe) { @@ -279,7 +279,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && p->protocol == cp->protocol && - ip_vs_conn_net_eq(cp, p->net)) { + cp->ipvs == p->ipvs) { if (!__ip_vs_conn_get(cp)) continue; /* HIT */ @@ -314,33 +314,34 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) } static int -ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, +ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs, + int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph, - int inverse, struct ip_vs_conn_param *p) + struct ip_vs_conn_param *p) { __be16 _ports[2], *pptr; - struct net *net = skb_net(skb); pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) return 1; - if (likely(!inverse)) - ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, + if (likely(!ip_vs_iph_inverse(iph))) + 
ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], p); else - ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, + ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr, pptr[1], &iph->saddr, pptr[0], p); return 0; } struct ip_vs_conn * -ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, int inverse) +ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, + const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) return NULL; return ip_vs_conn_in_get(&p); @@ -359,7 +360,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (unlikely(p->pe_data && p->pe->ct_match)) { - if (!ip_vs_conn_net_eq(cp, p->net)) + if (cp->ipvs != p->ipvs) continue; if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { if (__ip_vs_conn_get(cp)) @@ -377,7 +378,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) p->vport == cp->vport && p->cport == cp->cport && cp->flags & IP_VS_CONN_F_TEMPLATE && p->protocol == cp->protocol && - ip_vs_conn_net_eq(cp, p->net)) { + cp->ipvs == p->ipvs) { if (__ip_vs_conn_get(cp)) goto out; } @@ -418,7 +419,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && p->protocol == cp->protocol && - ip_vs_conn_net_eq(cp, p->net)) { + cp->ipvs == p->ipvs) { if (!__ip_vs_conn_get(cp)) continue; /* HIT */ @@ -439,12 +440,13 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) } struct ip_vs_conn * -ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, int inverse) +ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, + const struct sk_buff 
*skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) return NULL; return ip_vs_conn_out_get(&p); @@ -638,7 +640,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) * so we can make the assumption that the svc_af is the same as the * dest_af */ - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr, + dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark, cp->flags); if (dest) { @@ -668,7 +670,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) #endif ip_vs_bind_xmit(cp); - pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); + pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol); if (pd && atomic_read(&pd->appcnt)) ip_vs_bind_app(cp, pd->pp); } @@ -746,7 +748,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs, int ip_vs_check_template(struct ip_vs_conn *ct) { struct ip_vs_dest *dest = ct->dest; - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); + struct netns_ipvs *ipvs = ct->ipvs; /* * Checking the dest server status. @@ -800,8 +802,7 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) static void ip_vs_conn_expire(unsigned long data) { struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - struct net *net = ip_vs_conn_net(cp); - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = cp->ipvs; /* * do I control anybody? 
@@ -847,7 +848,7 @@ static void ip_vs_conn_expire(unsigned long data) cp->timeout = 60*HZ; if (ipvs->sync_state & IP_VS_STATE_MASTER) - ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); + ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); ip_vs_conn_put(cp); } @@ -875,8 +876,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; - struct netns_ipvs *ipvs = net_ipvs(p->net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, + struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs, p->protocol); cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); @@ -887,7 +888,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, INIT_HLIST_NODE(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - ip_vs_conn_net_set(cp, p->net); + cp->ipvs = ipvs; cp->af = p->af; cp->daf = dest_af; cp->protocol = p->protocol; @@ -1061,7 +1062,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) size_t len = 0; char dbuf[IP_VS_ADDRSTRLEN]; - if (!ip_vs_conn_net_eq(cp, net)) + if (!net_eq(cp->ipvs->net, net)) return 0; if (cp->pe_data) { pe_data[0] = ' '; @@ -1146,7 +1147,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) const struct ip_vs_conn *cp = v; struct net *net = seq_file_net(seq); - if (!ip_vs_conn_net_eq(cp, net)) + if (!net_eq(cp->ipvs->net, net)) return 0; #ifdef CONFIG_IP_VS_IPV6 @@ -1240,7 +1241,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) } /* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(struct net *net) +void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { int idx; struct ip_vs_conn *cp, *cp_c; @@ -1256,7 +1257,7 @@ void ip_vs_random_dropentry(struct net *net) if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; - if (!ip_vs_conn_net_eq(cp, net)) + if (cp->ipvs != ipvs) continue; if 
(cp->protocol == IPPROTO_TCP) { switch(cp->state) { @@ -1308,18 +1309,17 @@ void ip_vs_random_dropentry(struct net *net) /* * Flush all the connection entries in the ip_vs_conn_tab */ -static void ip_vs_conn_flush(struct net *net) +static void ip_vs_conn_flush(struct netns_ipvs *ipvs) { int idx; struct ip_vs_conn *cp, *cp_c; - struct netns_ipvs *ipvs = net_ipvs(net); flush_again: rcu_read_lock(); for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (!ip_vs_conn_net_eq(cp, net)) + if (cp->ipvs != ipvs) continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); @@ -1345,23 +1345,22 @@ flush_again: /* * per netns init and exit */ -int __net_init ip_vs_conn_net_init(struct net *net) +int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); - atomic_set(&ipvs->conn_count, 0); - proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops); - proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops); + proc_create("ip_vs_conn", 0, ipvs->net->proc_net, &ip_vs_conn_fops); + proc_create("ip_vs_conn_sync", 0, ipvs->net->proc_net, + &ip_vs_conn_sync_fops); return 0; } -void __net_exit ip_vs_conn_net_cleanup(struct net *net) +void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) { /* flush all the connection entries first */ - ip_vs_conn_flush(net); - remove_proc_entry("ip_vs_conn", net->proc_net); - remove_proc_entry("ip_vs_conn_sync", net->proc_net); + ip_vs_conn_flush(ipvs); + remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); + remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net); } int __init ip_vs_conn_init(void) diff --git a/kernel/net/netfilter/ipvs/ip_vs_core.c b/kernel/net/netfilter/ipvs/ip_vs_core.c index 5d2b806a8..f57b4dcdb 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_core.c +++ b/kernel/net/netfilter/ipvs/ip_vs_core.c @@ -112,7 +112,7 @@ static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff 
*skb) { struct ip_vs_dest *dest = cp->dest; - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; @@ -146,7 +146,7 @@ static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; @@ -179,7 +179,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_cpu_stats *s; s = this_cpu_ptr(cp->dest->stats.cpustats); @@ -215,7 +215,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { - ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, + ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, vport, p); p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) @@ -245,20 +245,30 @@ ip_vs_sched_persist(struct ip_vs_service *svc, const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ + const union nf_inet_addr *src_addr, *dst_addr; + + if (likely(!ip_vs_iph_inverse(iph))) { + src_addr = &iph->saddr; + dst_addr = &iph->daddr; + } else { + src_addr = &iph->daddr; + dst_addr = &iph->saddr; + } + /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, + ipv6_addr_prefix(&snet.in6, &src_addr->in6, (__force __u32) svc->netmask); else #endif - snet.ip = iph->saddr.ip & svc->netmask; + snet.ip = src_addr->ip & svc->netmask; 
IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), - IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port), + IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* @@ -276,7 +286,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, */ { int protocol = iph->protocol; - const union nf_inet_addr *vaddr = &iph->daddr; + const union nf_inet_addr *vaddr = dst_addr; __be16 vport = 0; if (dst_port == svc->port) { @@ -319,7 +329,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * return *ignored=0 i.e. ICMP and NF_DROP */ sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); @@ -360,8 +376,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, - src_port, &iph->daddr, dst_port, &param); + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, + src_port, dst_addr, dst_port, &param); cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest, skb->mark); @@ -412,7 +428,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_conn *cp = NULL; struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; - __be16 _ports[2], *pptr; + __be16 _ports[2], *pptr, cport, vport; + const void *caddr, *vaddr; unsigned int flags; *ignored = 1; @@ -423,14 +440,26 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, if (pptr == NULL) return NULL; + if (likely(!ip_vs_iph_inverse(iph))) { + cport = pptr[0]; + caddr = &iph->saddr; + vport = pptr[1]; + vaddr = &iph->daddr; + } else { + cport = pptr[1]; + caddr =
&iph->daddr; + vport = pptr[0]; + vaddr = &iph->saddr; + } + /* * FTPDATA needs this check when using local real server. * Never schedule Active FTPDATA connections from real server. * For LVS-NAT they must be already created. For other methods * with persistence the connection is created on SYN+ACK. */ - if (pptr[0] == FTPDATA) { - IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + if (cport == FTPDATA) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, "Not scheduling FTPDATA"); return NULL; } @@ -438,19 +467,25 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, /* * Do not schedule replies from local real server. */ - if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - (cp = pp->conn_in_get(svc->af, skb, iph, 1))) { - IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, - "Not scheduling reply for existing connection"); - __ip_vs_conn_put(cp); - return NULL; + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { + iph->hdr_flags ^= IP_VS_HDR_INVERSE; + cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph); + iph->hdr_flags ^= IP_VS_HDR_INVERSE; + + if (cp) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, + "Not scheduling reply for existing" + " connection"); + __ip_vs_conn_put(cp); + return NULL; + } } /* * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored, + return ip_vs_sched_persist(svc, skb, cport, vport, ignored, iph); *ignored = 0; @@ -458,7 +493,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, /* * Non-persistent service */ - if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->fwmark && vport != svc->port) { if (!svc->port) pr_err("Schedule: port zero only supported " "in persistent services, " @@ -467,7 +502,13 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, } sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = 
sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; @@ -483,11 +524,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, - &iph->saddr, pptr[0], &iph->daddr, - pptr[1], &p); + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, &p); cp = ip_vs_conn_new(&p, dest->af, &dest->addr, - dest->port ? dest->port : pptr[1], + dest->port ? dest->port : vport, flags, dest, skb->mark); if (!cp) { *ignored = -1; @@ -507,6 +547,15 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, return cp; } +static inline int ip_vs_addr_is_unicast(struct net *net, int af, + union nf_inet_addr *addr) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; +#endif + return (inet_addr_type(net, addr->ip) == RTN_UNICAST); +} /* * Pass or drop the packet. @@ -516,33 +565,21 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) { - __be16 _ports[2], *pptr; -#ifdef CONFIG_SYSCTL - struct net *net; - struct netns_ipvs *ipvs; - int unicast; -#endif + __be16 _ports[2], *pptr, dport; + struct netns_ipvs *ipvs = svc->ipvs; + struct net *net = ipvs->net; pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); - if (pptr == NULL) { + if (!pptr) return NF_DROP; - } - -#ifdef CONFIG_SYSCTL - net = skb_net(skb); - -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST; - else -#endif - unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST); + dport = likely(!ip_vs_iph_inverse(iph)) ? 
pptr[1] : pptr[0]; /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ - ipvs = net_ipvs(net); - if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { + if (sysctl_cache_bypass(ipvs) && svc->fwmark && + !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && + ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { int ret; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && @@ -554,7 +591,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, @@ -578,7 +615,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_put(cp); return ret; } -#endif /* * When the virtual ftp service is presented, packets destined @@ -586,9 +622,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) + if (svc->port == FTPPORT && dport != FTPPORT) return NF_ACCEPT; + if (unlikely(ip_vs_iph_icmp(iph))) + return NF_DROP; + /* * Notify the client that the destination is unreachable, and * release the socket buffer. 
@@ -598,11 +637,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { - if (!skb->dev) { - struct net *net_ = dev_net(skb_dst(skb)->dev); - - skb->dev = net_->loopback_dev; - } + if (!skb->dev) + skb->dev = net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else #endif @@ -613,15 +649,13 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, #ifdef CONFIG_SYSCTL -static int sysctl_snat_reroute(struct sk_buff *skb) +static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); return ipvs->sysctl_snat_reroute; } -static int sysctl_nat_icmp_send(struct net *net) +static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); return ipvs->sysctl_nat_icmp_send; } @@ -632,8 +666,8 @@ static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) #else -static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } -static int sysctl_nat_icmp_send(struct net *net) { return 0; } +static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } +static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } #endif @@ -652,12 +686,13 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) return IP_DEFRAG_VS_OUT; } -static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, + struct sk_buff *skb, u_int32_t user) { int err; local_bh_disable(); - err = ip_defrag(skb, user); + err = ip_defrag(ipvs->net, skb, user); local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); @@ -665,10 +700,10 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) return err; } -static int ip_vs_route_me_harder(int af, struct sk_buff *skb, - unsigned int hooknum) +static int 
ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, + struct sk_buff *skb, unsigned int hooknum) { - if (!sysctl_snat_reroute(skb)) + if (!sysctl_snat_reroute(ipvs)) return 0; /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ if (NF_INET_LOCAL_IN == hooknum) @@ -678,12 +713,12 @@ static int ip_vs_route_me_harder(int af, struct sk_buff *skb, struct dst_entry *dst = skb_dst(skb); if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && - ip6_route_me_harder(skb) != 0) + ip6_route_me_harder(ipvs->net, skb) != 0) return 1; } else #endif if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && - ip_route_me_harder(skb, RTN_LOCAL) != 0) + ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0) return 1; return 0; @@ -836,7 +871,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); - if (ip_vs_route_me_harder(af, skb, hooknum)) + if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto out; /* do the statistics and put it back */ @@ -860,8 +895,8 @@ out: * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. 
*/ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related, - unsigned int hooknum) +static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; @@ -876,7 +911,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { - if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -922,10 +957,10 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking outgoing ICMP for"); - ip_vs_fill_ip4hdr(cih, &ciph); - ciph.len += offset; + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); + /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, &ciph, 1); + cp = pp->conn_out_get(ipvs, AF_INET, skb, &ciph); if (!cp) return NF_ACCEPT; @@ -935,16 +970,16 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, - unsigned int hooknum, struct ip_vs_iphdr *ipvsh) +static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum, + struct ip_vs_iphdr *ipvsh) { struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; union nf_inet_addr snet; - unsigned int writable; + unsigned int offset; *related = 1; ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); @@ -972,31 +1007,23 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, ic->icmp6_type, ntohs(icmpv6_id(ic)), &ipvsh->saddr, &ipvsh->daddr); - /* Now find the contained IP header */ - ciph.len = ipvsh->len + 
sizeof(_icmph); - ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); - if (ip6h == NULL) + if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), + true, &ciph)) return NF_ACCEPT; /* The packet looks wrong, ignore */ - ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */ - ciph.daddr.in6 = ip6h->daddr; - /* skip possible IPv6 exthdrs of contained IPv6 packet */ - ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); - if (ciph.protocol < 0) - return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ pp = ip_vs_proto_get(ciph.protocol); if (!pp) return NF_ACCEPT; /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1); + cp = pp->conn_out_get(ipvs, AF_INET6, skb, &ciph); if (!cp) return NF_ACCEPT; snet.in6 = ciph.saddr.in6; - writable = ciph.len; + offset = ciph.len; return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, - pp, writable, sizeof(struct ipv6hdr), + pp, offset, sizeof(struct ipv6hdr), hooknum); } #endif @@ -1081,7 +1108,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, { struct ip_vs_protocol *pp = pd->pp; - IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); + IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); if (!skb_make_writable(skb, iph->len)) goto drop; @@ -1115,10 +1142,10 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * if it came from this machine itself. So re-compute * the routing information. */ - if (ip_vs_route_me_harder(af, skb, hooknum)) + if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto drop; - IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); + IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); @@ -1143,13 +1170,13 @@ drop: * Check if outgoing packet belongs to the established ip_vs_conn. 
*/ static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) { - struct net *net = NULL; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; + struct sock *sk; EnterFunction(11); @@ -1157,29 +1184,27 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (skb->ipvs_property) return NF_ACCEPT; + sk = skb_to_full_sk(skb); /* Bad... Do not break raw sockets */ - if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(skb->sk); - if (inet && sk->sk_family == PF_INET && inet->nodefrag) + if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) return NF_ACCEPT; } if (unlikely(!skb_dst(skb))) return NF_ACCEPT; - net = skb_net(skb); - if (!net_ipvs(net)->enable) + if (!ipvs->enable) return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, &iph); + ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; - int verdict = ip_vs_out_icmp_v6(skb, &related, + int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, hooknum, &iph); if (related) @@ -1189,13 +1214,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; - int verdict = ip_vs_out_icmp(skb, &related, hooknum); + int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); if (related) return verdict; } - pd = ip_vs_proto_data_get(net, iph.protocol); + pd = ip_vs_proto_data_get(ipvs, iph.protocol); if (unlikely(!pd)) return NF_ACCEPT; pp = pd->pp; @@ -1205,21 +1230,21 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (af == AF_INET) #endif if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, + if 
(ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; - ip_vs_fill_ip4hdr(skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); } /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(af, skb, &iph, 0); + cp = pp->conn_out_get(ipvs, af, skb, &iph); if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); - if (sysctl_nat_icmp_send(net) && + if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { @@ -1229,7 +1254,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) sizeof(_ports), _ports, &iph); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, + if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, pptr[0])) { /* * Notify the real server: there is no @@ -1246,7 +1271,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (!skb->dev) - skb->dev = net->loopback_dev; + skb->dev = ipvs->net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, @@ -1260,7 +1285,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) } } } - IP_VS_DBG_PKT(12, af, pp, skb, 0, + IP_VS_DBG_PKT(12, af, pp, skb, iph.off, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; } @@ -1271,10 +1296,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) * Check if packet is reply for established ip_vs_conn. 
*/ static unsigned int -ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_reply4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); } /* @@ -1282,10 +1307,10 @@ ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_reply4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1296,10 +1321,10 @@ ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_reply6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET6); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); } /* @@ -1307,14 +1332,51 @@ ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. 
*/ static unsigned int -ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_reply6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET6); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); } #endif +static unsigned int +ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_protocol *pp = pd->pp; + + if (!iph->fragoffs) { + /* No (second) fragments need to enter here, as nf_defrag_ipv6 + * replayed fragment zero will already have created the cp + */ + + /* Schedule and create new connection entry into cpp */ + if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) + return 0; + } + + if (unlikely(!*cpp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, af, pp, skb, iph->off, + "ip_vs_in: packet continues traversal as normal"); + if (iph->fragoffs) { + /* Fragment that couldn't be mapped to a conn entry + * is missing module nf_defrag_ipv6 + */ + IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); + IP_VS_DBG_PKT(7, af, pp, skb, iph->off, + "unhandled fragment"); + } + *verdict = NF_ACCEPT; + return 0; + } + + return 1; +} + /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, @@ -1322,9 +1384,9 @@ ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, * Currently handles error types - unreachable, quench, ttl exceeded. 
*/ static int -ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) +ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, + unsigned int hooknum) { - struct net *net = NULL; struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ @@ -1333,13 +1395,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; - bool ipip; + bool ipip, new_cp = false; *related = 1; /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { - if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1373,8 +1435,6 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - net = skb_net(skb); - /* Special case for errors for IPIP packets */ ipip = false; if (cih->protocol == IPPROTO_IPIP) { @@ -1390,7 +1450,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ipip = true; } - pd = ip_vs_proto_data_get(net, cih->protocol); + pd = ip_vs_proto_data_get(ipvs, cih->protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; @@ -1404,15 +1464,24 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) "Checking incoming ICMP for"); offset2 = offset; - ip_vs_fill_ip4hdr(cih, &ciph); - ciph.len += offset; + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph); offset = ciph.len; + /* The embedded headers contain source and dest in reverse order. * For IPIP this is error for request, not for reply. */ - cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 
0 : 1); - if (!cp) - return NF_ACCEPT; + cp = pp->conn_in_get(ipvs, AF_INET, skb, &ciph); + + if (!cp) { + int v; + + if (!sysctl_schedule_icmp(ipvs)) + return NF_ACCEPT; + + if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) + return v; + new_cp = true; + } verdict = NF_DROP; @@ -1443,7 +1512,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); - ipv4_update_pmtu(skb, dev_net(skb->dev), + ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0, 0, 0); /* Client uses PMTUD? */ if (!(frag_off & htons(IP_DF))) @@ -1489,23 +1558,26 @@ ignore_ipip: verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); out: - __ip_vs_conn_put(cp); + if (likely(!new_cp)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); return verdict; } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, - unsigned int hooknum, struct ip_vs_iphdr *iph) +static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum, + struct ip_vs_iphdr *iph) { - struct net *net = NULL; - struct ipv6hdr _ip6h, *ip6h; struct icmp6hdr _icmph, *ic; struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; - unsigned int offs_ciph, writable, verdict; + unsigned int offset, verdict; + bool new_cp = false; *related = 1; @@ -1534,21 +1606,11 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, ic->icmp6_type, ntohs(icmpv6_id(ic)), &iph->saddr, &iph->daddr); - /* Now find the contained IP header */ - ciph.len = iph->len + sizeof(_icmph); - offs_ciph = ciph.len; /* Save ip header offset */ - ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); - if (ip6h == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - ciph.saddr.in6 = ip6h->saddr; /* 
conn_in_get() handles reverse order */ - ciph.daddr.in6 = ip6h->daddr; - /* skip possible IPv6 exthdrs of contained IPv6 packet */ - ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); - if (ciph.protocol < 0) - return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ - - net = skb_net(skb); - pd = ip_vs_proto_data_get(net, ciph.protocol); + offset = iph->len + sizeof(_icmph); + if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) + return NF_ACCEPT; + + pd = ip_vs_proto_data_get(ipvs, ciph.protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; @@ -1557,36 +1619,49 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, if (ciph.fragoffs) return NF_ACCEPT; - IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph, + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, "Checking incoming ICMPv6 for"); /* The embedded headers contain source and dest in reverse order * if not from localhost */ - cp = pp->conn_in_get(AF_INET6, skb, &ciph, - (hooknum == NF_INET_LOCAL_OUT) ? 
0 : 1); + cp = pp->conn_in_get(ipvs, AF_INET6, skb, &ciph); + + if (!cp) { + int v; + + if (!sysctl_schedule_icmp(ipvs)) + return NF_ACCEPT; + + if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) + return v; + + new_cp = true; + } - if (!cp) - return NF_ACCEPT; /* VS/TUN, VS/DR and LOCALNODE just let it go */ if ((hooknum == NF_INET_LOCAL_OUT) && (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { - __ip_vs_conn_put(cp); - return NF_ACCEPT; + verdict = NF_ACCEPT; + goto out; } /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); /* Need to mangle contained IPv6 header in ICMPv6 packet */ - writable = ciph.len; + offset = ciph.len; if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || IPPROTO_SCTP == ciph.protocol) - writable += 2 * sizeof(__u16); /* Also mangle ports */ + offset += 2 * sizeof(__u16); /* Also mangle ports */ - verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); - __ip_vs_conn_put(cp); +out: + if (likely(!new_cp)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); return verdict; } @@ -1598,16 +1673,15 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, * and send it on its way... */ static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) { - struct net *net; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, pkts; - struct netns_ipvs *ipvs; int conn_reuse_mode; + struct sock *sk; /* Already marked as IPVS request or reply? 
*/ if (skb->ipvs_property) @@ -1621,7 +1695,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely((skb->pkt_type != PACKET_HOST && hooknum != NF_INET_LOCAL_OUT) || !skb_dst(skb))) { - ip_vs_fill_iph_skb(af, skb, &iph); + ip_vs_fill_iph_skb(af, skb, false, &iph); IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" " ignored in hook %u\n", skb->pkt_type, iph.protocol, @@ -1629,20 +1703,17 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) return NF_ACCEPT; } /* ipvs enabled in this netns ? */ - net = skb_net(skb); - ipvs = net_ipvs(net); if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, &iph); + ip_vs_fill_iph_skb(af, skb, false, &iph); /* Bad... Do not break raw sockets */ - if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + sk = skb_to_full_sk(skb); + if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(skb->sk); - if (inet && sk->sk_family == PF_INET && inet->nodefrag) + if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) return NF_ACCEPT; } @@ -1650,8 +1721,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; - int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, - &iph); + int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, + hooknum, &iph); if (related) return verdict; @@ -1660,21 +1731,30 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; - int verdict = ip_vs_in_icmp(skb, &related, hooknum); + int verdict = ip_vs_in_icmp(ipvs, skb, &related, + hooknum); if (related) return verdict; } /* Protocol supported? 
*/ - pd = ip_vs_proto_data_get(net, iph.protocol); - if (unlikely(!pd)) + pd = ip_vs_proto_data_get(ipvs, iph.protocol); + if (unlikely(!pd)) { + /* The only way we'll see this packet again is if it's + * encapsulated, so mark it with ipvs_property=1 so we + * skip it if we're ignoring tunneled packets + */ + if (sysctl_ignore_tunneled(ipvs)) + skb->ipvs_property = 1; + return NF_ACCEPT; + } pp = pd->pp; /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(af, skb, &iph, 0); + cp = pp->conn_in_get(ipvs, af, skb, &iph); conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); if (conn_reuse_mode && !iph.fragoffs && @@ -1688,32 +1768,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) cp = NULL; } - if (unlikely(!cp) && !iph.fragoffs) { - /* No (second) fragments need to enter here, as nf_defrag_ipv6 - * replayed fragment zero will already have created the cp - */ + if (unlikely(!cp)) { int v; - /* Schedule and create new connection entry into &cp */ - if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph)) + if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) return v; } - if (unlikely(!cp)) { - /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, af, pp, skb, 0, - "ip_vs_in: packet continues traversal as normal"); - if (iph.fragoffs) { - /* Fragment that couldn't be mapped to a conn entry - * is missing module nf_defrag_ipv6 - */ - IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); - IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment"); - } - return NF_ACCEPT; - } + IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); - IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ @@ -1753,7 +1816,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) pkts = atomic_add_return(1, &cp->in_pkts); if (ipvs->sync_state & IP_VS_STATE_MASTER) - 
ip_vs_sync_conn(net, cp, pkts); + ip_vs_sync_conn(ipvs, cp, pkts); ip_vs_conn_put(cp); return ret; @@ -1764,10 +1827,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_remote_request4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); } /* @@ -1775,10 +1838,10 @@ ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_request4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1788,10 +1851,10 @@ ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_remote_request6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET6); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); } /* @@ -1799,10 +1862,10 @@ ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_request6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET6); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); } #endif @@ -1818,46 +1881,40 @@ ip_vs_local_request6(const 
struct nf_hook_ops *ops, struct sk_buff *skb, * and send them to ip_vs_in_icmp. */ static unsigned int -ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int r; - struct net *net; - struct netns_ipvs *ipvs; + struct netns_ipvs *ipvs = net_ipvs(state->net); if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; /* ipvs enabled in this netns ? */ - net = skb_net(skb); - ipvs = net_ipvs(net); if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp(skb, &r, ops->hooknum); + return ip_vs_in_icmp(ipvs, skb, &r, state->hook); } #ifdef CONFIG_IP_VS_IPV6 static unsigned int -ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int r; - struct net *net; - struct netns_ipvs *ipvs; + struct netns_ipvs *ipvs = net_ipvs(state->net); struct ip_vs_iphdr iphdr; - ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); + ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); if (iphdr.protocol != IPPROTO_ICMPV6) return NF_ACCEPT; /* ipvs enabled in this netns ? */ - net = skb_net(skb); - ipvs = net_ipvs(net); if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr); + return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); } #endif @@ -1866,7 +1923,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, @@ -1876,7 +1932,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { * applied to IPVS. 
*/ { .hook = ip_vs_remote_request4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, @@ -1884,7 +1939,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_local_reply4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, @@ -1892,7 +1946,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After mangle, schedule and forward local requests */ { .hook = ip_vs_local_request4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, @@ -1901,7 +1954,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 99, @@ -1909,7 +1961,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, @@ -1918,7 +1969,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, @@ -1928,7 +1978,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { * applied to IPVS. 
*/ { .hook = ip_vs_remote_request6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, @@ -1936,7 +1985,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_local_reply6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, @@ -1944,7 +1992,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After mangle, schedule and forward local requests */ { .hook = ip_vs_local_request6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, @@ -1953,7 +2000,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp_v6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, @@ -1961,7 +2007,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, @@ -1987,22 +2032,22 @@ static int __net_init __ip_vs_init(struct net *net) atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; - if (ip_vs_estimator_net_init(net) < 0) + if (ip_vs_estimator_net_init(ipvs) < 0) goto estimator_fail; - if (ip_vs_control_net_init(net) < 0) + if (ip_vs_control_net_init(ipvs) < 0) goto control_fail; - if (ip_vs_protocol_net_init(net) < 0) + if (ip_vs_protocol_net_init(ipvs) < 0) goto protocol_fail; - if (ip_vs_app_net_init(net) < 0) + if (ip_vs_app_net_init(ipvs) < 0) goto app_fail; - if (ip_vs_conn_net_init(net) < 0) + if (ip_vs_conn_net_init(ipvs) < 0) goto conn_fail; - if (ip_vs_sync_net_init(net) < 0) + if (ip_vs_sync_net_init(ipvs) < 0) goto sync_fail; printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", @@ -2013,15 +2058,15 @@ static int 
__net_init __ip_vs_init(struct net *net) */ sync_fail: - ip_vs_conn_net_cleanup(net); + ip_vs_conn_net_cleanup(ipvs); conn_fail: - ip_vs_app_net_cleanup(net); + ip_vs_app_net_cleanup(ipvs); app_fail: - ip_vs_protocol_net_cleanup(net); + ip_vs_protocol_net_cleanup(ipvs); protocol_fail: - ip_vs_control_net_cleanup(net); + ip_vs_control_net_cleanup(ipvs); control_fail: - ip_vs_estimator_net_cleanup(net); + ip_vs_estimator_net_cleanup(ipvs); estimator_fail: net->ipvs = NULL; return -ENOMEM; @@ -2029,22 +2074,25 @@ estimator_fail: static void __net_exit __ip_vs_cleanup(struct net *net) { - ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */ - ip_vs_conn_net_cleanup(net); - ip_vs_app_net_cleanup(net); - ip_vs_protocol_net_cleanup(net); - ip_vs_control_net_cleanup(net); - ip_vs_estimator_net_cleanup(net); - IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ + ip_vs_conn_net_cleanup(ipvs); + ip_vs_app_net_cleanup(ipvs); + ip_vs_protocol_net_cleanup(ipvs); + ip_vs_control_net_cleanup(ipvs); + ip_vs_estimator_net_cleanup(ipvs); + IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); net->ipvs = NULL; } static void __net_exit __ip_vs_dev_cleanup(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); EnterFunction(2); - net_ipvs(net)->enable = 0; /* Disable packet reception */ + ipvs->enable = 0; /* Disable packet reception */ smp_wmb(); - ip_vs_sync_net_cleanup(net); + ip_vs_sync_net_cleanup(ipvs); LeaveFunction(2); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_ctl.c b/kernel/net/netfilter/ipvs/ip_vs_ctl.c index 285eae3a1..e7c1b052c 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_ctl.c +++ b/kernel/net/netfilter/ipvs/ip_vs_ctl.c @@ -228,7 +228,7 @@ static void defense_work_handler(struct work_struct *work) update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) - ip_vs_random_dropentry(ipvs->net); + ip_vs_random_dropentry(ipvs); 
schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); } #endif @@ -263,7 +263,7 @@ static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; * Returns hash value for virtual service */ static inline unsigned int -ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, +ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { register unsigned int porth = ntohs(port); @@ -276,7 +276,7 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, addr->ip6[2]^addr->ip6[3]; #endif ahash = ntohl(addr_fold); - ahash ^= ((size_t) net >> 8); + ahash ^= ((size_t) ipvs >> 8); return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & IP_VS_SVC_TAB_MASK; @@ -285,9 +285,9 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) +static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) { - return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; + return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* @@ -309,14 +309,14 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* * Hash it by in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, + hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ - hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); + hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } @@ -357,21 +357,21 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) * Get service by {netns, proto,addr,port} in the service table. 
*/ static inline struct ip_vs_service * -__ip_vs_service_find(struct net *net, int af, __u16 protocol, +__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); + hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) && (svc->protocol == protocol) - && net_eq(svc->net, net)) { + && (svc->ipvs == ipvs)) { /* HIT */ return svc; } @@ -385,17 +385,17 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol, * Get service by {fwmark} in the service table. */ static inline struct ip_vs_service * -__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) +__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(net, fwmark); + hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af - && net_eq(svc->net, net)) { + && (svc->ipvs == ipvs)) { /* HIT */ return svc; } @@ -406,17 +406,16 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) /* Find service, called under RCU lock */ struct ip_vs_service * -ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, +ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; - struct netns_ipvs *ipvs = net_ipvs(net); /* * Check the table hashed by fwmark first */ if (fwmark) { - svc = __ip_vs_svc_fwm_find(net, af, fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); if (svc) goto out; } @@ -425,7 +424,7 @@ 
ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, * Check the table hashed by * for "full" addressed entries */ - svc = __ip_vs_service_find(net, af, protocol, vaddr, vport); + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP @@ -435,7 +434,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT); + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); } if (svc == NULL @@ -443,7 +442,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_find(net, af, protocol, vaddr, 0); + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); } out: @@ -543,10 +542,9 @@ static void ip_vs_rs_unhash(struct ip_vs_dest *dest) } /* Check if real service by is present */ -bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, +bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { - struct netns_ipvs *ipvs = net_ipvs(net); unsigned int hash; struct ip_vs_dest *dest; @@ -601,7 +599,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, * on the backup. * Called under RCU lock, no refcnt is returned. 
*/ -struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, +struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, @@ -612,7 +610,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) @@ -660,7 +658,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; /* * Find the destination in trash @@ -715,10 +713,9 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest) * are expired, and the refcnt of each destination in the trash must * be 0, so we simply release them here. 
*/ -static void ip_vs_trash_cleanup(struct net *net) +static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; - struct netns_ipvs *ipvs = net_ipvs(net); del_timer_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ @@ -788,7 +785,7 @@ static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_service *old_svc; struct ip_vs_scheduler *sched; int conn_flags; @@ -842,15 +839,16 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); - sched = rcu_dereference_protected(svc->scheduler, 1); if (add) { - ip_vs_start_estimator(svc->net, &dest->stats); + ip_vs_start_estimator(svc->ipvs, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; - if (sched->add_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { - if (sched->upd_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->upd_dest) sched->upd_dest(svc, dest); } } @@ -873,12 +871,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && - !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6)) + !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) return -EINVAL; } else #endif { - atype = inet_addr_type(svc->net, udest->addr.ip); + atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } @@ -1035,12 +1033,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete a destination (must be already unlinked from the service) */ -static void 
__ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, +static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, bool cleanup) { - struct netns_ipvs *ipvs = net_ipvs(net); - - ip_vs_stop_estimator(net, &dest->stats); + ip_vs_stop_estimator(ipvs, &dest->stats); /* * Remove it from the d-linked list with the real services. @@ -1078,13 +1074,13 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, svc->num_dests--; if (dest->af != svc->af) - net_ipvs(svc->net)->mixed_address_family_dests--; + svc->ipvs->mixed_address_family_dests--; if (svcupd) { struct ip_vs_scheduler *sched; sched = rcu_dereference_protected(svc->scheduler, 1); - if (sched->del_dest) + if (sched && sched->del_dest) sched->del_dest(svc, dest); } } @@ -1119,7 +1115,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete the destination */ - __ip_vs_del_dest(svc->net, dest, false); + __ip_vs_del_dest(svc->ipvs, dest, false); LeaveFunction(2); @@ -1128,8 +1124,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) static void ip_vs_dest_trash_expire(unsigned long data) { - struct net *net = (struct net *) data; - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = (struct netns_ipvs *)data; struct ip_vs_dest *dest, *next; unsigned long now = jiffies; @@ -1162,24 +1157,26 @@ static void ip_vs_dest_trash_expire(unsigned long data) * Add a service into the service hash table */ static int -ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, +ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { int ret = 0, i; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; - struct netns_ipvs *ipvs = net_ipvs(net); /* increase the module use count */ ip_vs_use_count_inc(); /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == 
NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - ret = -ENOENT; - goto out_err; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_err; + } } if (u->pe_name && *u->pe_name) { @@ -1233,17 +1230,19 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, svc->flags = u->flags; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; - svc->net = net; + svc->ipvs = ipvs; INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + } /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); @@ -1255,7 +1254,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); - ip_vs_start_estimator(net, &svc->stats); + ip_vs_start_estimator(ipvs, &svc->stats); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) @@ -1291,17 +1290,20 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { - struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; /* * Lookup the scheduler, by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - return -ENOENT; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return 
-ENOENT; + } } old_sched = sched; @@ -1329,14 +1331,20 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { + if (old_sched) { + ip_vs_unbind_scheduler(svc, old_sched); + RCU_INIT_POINTER(svc->scheduler, NULL); + /* Wait all svc->sched_data users */ + synchronize_rcu(); + } /* Bind the new scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) { - old_sched = sched; - goto out; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + ip_vs_scheduler_put(sched); + goto out; + } } - /* Unbind the old scheduler on success */ - ip_vs_unbind_scheduler(svc, old_sched); } /* @@ -1366,7 +1374,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; pr_info("%s: enter\n", __func__); @@ -1374,7 +1382,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) if (svc->af == AF_INET) ipvs->num_services--; - ip_vs_stop_estimator(svc->net, &svc->stats); + ip_vs_stop_estimator(svc->ipvs, &svc->stats); /* Unbind scheduler */ old_sched = rcu_dereference_protected(svc->scheduler, 1); @@ -1390,7 +1398,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(svc->net, dest, cleanup); + __ip_vs_del_dest(svc->ipvs, dest, cleanup); } /* @@ -1441,7 +1449,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(struct net *net, bool cleanup) +static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { int idx; struct ip_vs_service *svc; @@ -1453,7 +1461,7 @@ static int ip_vs_flush(struct net *net, bool cleanup) for(idx = 0; 
idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } @@ -1464,7 +1472,7 @@ static int ip_vs_flush(struct net *net, bool cleanup) for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } @@ -1476,12 +1484,12 @@ static int ip_vs_flush(struct net *net, bool cleanup) * Delete service by {netns} in the service table. * Called by __ip_vs_cleanup() */ -void ip_vs_service_net_cleanup(struct net *net) +void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs) { EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(net, true); + ip_vs_flush(ipvs, true); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } @@ -1525,7 +1533,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net)) { + if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); @@ -1534,7 +1542,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, } hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net)) { + if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); @@ -1568,26 +1576,26 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) return 0; } -static int ip_vs_zero_all(struct net *net) +static int ip_vs_zero_all(struct netns_ipvs *ipvs) { int idx; struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net)) + if 
(svc->ipvs == ipvs) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } - ip_vs_zero_stats(&net_ipvs(net)->tot_stats); + ip_vs_zero_stats(&ipvs->tot_stats); return 0; } @@ -1600,7 +1608,7 @@ static int proc_do_defense_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int rc; @@ -1611,7 +1619,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, /* Restore the correct value */ *valp = val; } else { - update_defense_level(net_ipvs(net)); + update_defense_level(ipvs); } } return rc; @@ -1829,6 +1837,18 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "schedule_icmp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ignore_tunneled", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -1874,6 +1894,7 @@ static inline const char *ip_vs_fwd_name(unsigned int flags) static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; int idx; struct ip_vs_service *svc; @@ -1881,7 +1902,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net) && pos-- == 0) { + if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; return svc; @@ -1893,7 +1914,7 @@ static struct ip_vs_service 
*ip_vs_info_array(struct seq_file *seq, loff_t pos) for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net) && pos-- == 0) { + if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; return svc; @@ -1982,6 +2003,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -1990,18 +2012,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), - sched->name); + sched_name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - sched->name, + sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", - svc->fwmark, sched->name, + svc->fwmark, sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } @@ -2180,7 +2202,7 @@ static const struct file_operations ip_vs_stats_percpu_fops = { /* * Set timeout values for tcp tcpfin udp in the timeout_table. 
*/ -static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) +static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; @@ -2193,13 +2215,13 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { - pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { - pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } @@ -2207,7 +2229,7 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { - pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } @@ -2319,24 +2341,34 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - mutex_lock(&ipvs->sync_mutex); - if (cmd == IP_VS_SO_SET_STARTDAEMON) - ret = start_sync_thread(net, dm->state, dm->mcast_ifn, - dm->syncid); - else - ret = stop_sync_thread(net, dm->state); - mutex_unlock(&ipvs->sync_mutex); + if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ipvs_sync_daemon_cfg cfg; + + memset(&cfg, 0, sizeof(cfg)); + strlcpy(cfg.mcast_ifn, dm->mcast_ifn, + sizeof(cfg.mcast_ifn)); + cfg.syncid = dm->syncid; + rtnl_lock(); + mutex_lock(&ipvs->sync_mutex); + ret = start_sync_thread(ipvs, &cfg, dm->state); + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + } else { + mutex_lock(&ipvs->sync_mutex); + ret = stop_sync_thread(ipvs, dm->state); + mutex_unlock(&ipvs->sync_mutex); + } goto 
out_dec; } mutex_lock(&__ip_vs_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ - ret = ip_vs_flush(net, false); + ret = ip_vs_flush(ipvs, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); + ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); goto out_unlock; } @@ -2351,7 +2383,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(net); + ret = ip_vs_zero_all(ipvs); goto out_unlock; } } @@ -2369,10 +2401,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Lookup the exact service by or fwmark */ rcu_read_lock(); if (usvc.fwmark == 0) - svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, + svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD @@ -2386,7 +2418,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (svc != NULL) ret = -EEXIST; else - ret = ip_vs_add_service(net, &usvc, &svc); + ret = ip_vs_add_service(ipvs, &usvc, &svc); break; case IP_VS_SO_SET_EDIT: ret = ip_vs_edit_service(svc, &usvc); @@ -2427,13 +2459,15 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; struct ip_vs_kstats kstats; + char *sched_name; sched = rcu_dereference_protected(src->scheduler, 1); + sched_name = sched ? 
sched->name : "none"; dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2443,7 +2477,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) } static inline int -__ip_vs_get_service_entries(struct net *net, +__ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { @@ -2455,7 +2489,7 @@ __ip_vs_get_service_entries(struct net *net, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || !net_eq(svc->net, net)) + if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) @@ -2474,7 +2508,7 @@ __ip_vs_get_service_entries(struct net *net, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || !net_eq(svc->net, net)) + if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) @@ -2494,7 +2528,7 @@ out: } static inline int -__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, +__ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; @@ -2503,9 +2537,9 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, rcu_read_lock(); if (get->fwmark) - svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); else - svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, + svc = 
__ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, get->port); rcu_read_unlock(); @@ -2550,7 +2584,7 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, } static inline void -__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) +__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; @@ -2559,12 +2593,12 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) memset(u, 0, sizeof (*u)); #ifdef CONFIG_IP_VS_PROTO_TCP - pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP - pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); u->udp_timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif @@ -2627,15 +2661,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ipvs->master_syncid; + d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ipvs->backup_syncid; + d[1].syncid = ipvs->bcfg.syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; @@ -2683,7 +2717,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_service_entries(net, get, user); + ret = __ip_vs_get_service_entries(ipvs, get, user); } 
break; @@ -2697,9 +2731,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) addr.ip = entry->addr; rcu_read_lock(); if (entry->fwmark) - svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); else - svc = __ip_vs_service_find(net, AF_INET, + svc = __ip_vs_service_find(ipvs, AF_INET, entry->protocol, &addr, entry->port); rcu_read_unlock(); @@ -2725,7 +2759,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_dest_entries(net, get, user); + ret = __ip_vs_get_dest_entries(ipvs, get, user); } break; @@ -2733,7 +2767,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(net, &t); + __ip_vs_get_timeouts(ipvs, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } @@ -2790,6 +2824,11 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, .len = IP_VS_IFNAME_MAXLEN }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, + [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ @@ -2892,6 +2931,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; struct ip_vs_kstats kstats; + char *sched_name; nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) @@ -2910,8 +2950,9 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, } sched = rcu_dereference_protected(svc->scheduler, 1); + sched_name = sched ? 
sched->name : "none"; pe = rcu_dereference_protected(svc->pe, 1); - if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || @@ -2961,12 +3002,13 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; - struct net *net = skb_sknet(skb); + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start || !net_eq(svc->net, net)) + if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2977,7 +3019,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start || !net_eq(svc->net, net)) + if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2993,7 +3035,7 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_parse_service(struct net *net, +static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, struct ip_vs_service **ret_svc) @@ -3038,9 +3080,9 @@ static int ip_vs_genl_parse_service(struct net *net, rcu_read_lock(); if (usvc->fwmark) - svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); else - svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, + svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, &usvc->addr, usvc->port); rcu_read_unlock(); *ret_svc = svc; @@ -3078,14 +3120,14 
@@ static int ip_vs_genl_parse_service(struct net *net, return 0; } -static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, +static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, struct nlattr *nla) { struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; int ret; - ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc); + ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, 0, &svc); return ret ? ERR_PTR(ret) : svc; } @@ -3160,7 +3202,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; - struct net *net = skb_sknet(skb); + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); @@ -3170,7 +3213,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, goto out_err; - svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); + svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc) || svc == NULL) goto out_err; @@ -3246,7 +3289,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, } static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid) + struct ipvs_sync_daemon_cfg *c) { struct nlattr *nl_daemon; @@ -3255,9 +3298,23 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, return -EMSGSIZE; if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || - nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || - nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || + nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) goto nla_put_failure; +#ifdef CONFIG_IP_VS_IPV6 + if (c->mcast_af == 
AF_INET6) { + if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, + &c->mcast_group.in6)) + goto nla_put_failure; + } else +#endif + if (c->mcast_af == AF_INET && + nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, + c->mcast_group.ip)) + goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; @@ -3268,7 +3325,7 @@ nla_put_failure: } static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid, + struct ipvs_sync_daemon_cfg *c, struct netlink_callback *cb) { void *hdr; @@ -3278,7 +3335,7 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, if (!hdr) return -EMSGSIZE; - if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + if (ip_vs_genl_fill_daemon(skb, state, c)) goto nla_put_failure; genlmsg_end(skb, hdr); @@ -3292,14 +3349,13 @@ nla_put_failure: static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb_sknet(skb); + struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ipvs->master_mcast_ifn, - ipvs->master_syncid, cb) < 0) + &ipvs->mcfg, cb) < 0) goto nla_put_failure; cb->args[0] = 1; @@ -3307,8 +3363,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ipvs->backup_mcast_ifn, - ipvs->backup_syncid, cb) < 0) + &ipvs->bcfg, cb) < 0) goto nla_put_failure; cb->args[1] = 1; @@ -3320,39 +3375,90 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) +static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { + struct ipvs_sync_daemon_cfg c; + struct nlattr *a; + int ret; + + memset(&c, 0, sizeof(c)); if (!(attrs[IPVS_DAEMON_ATTR_STATE] && 
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; + strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + sizeof(c.mcast_ifn)); + c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); + + a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; + if (a) + c.sync_maxlen = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; + if (a) { + c.mcast_af = AF_INET; + c.mcast_group.ip = nla_get_in_addr(a); + if (!ipv4_is_multicast(c.mcast_group.ip)) + return -EINVAL; + } else { + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; + if (a) { +#ifdef CONFIG_IP_VS_IPV6 + int addr_type; + + c.mcast_af = AF_INET6; + c.mcast_group.in6 = nla_get_in6_addr(a); + addr_type = ipv6_addr_type(&c.mcast_group.in6); + if (!(addr_type & IPV6_ADDR_MULTICAST)) + return -EINVAL; +#else + return -EAFNOSUPPORT; +#endif + } + } + + a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; + if (a) + c.mcast_port = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; + if (a) + c.mcast_ttl = nla_get_u8(a); /* The synchronization protocol is incompatible with mixed family * services */ - if (net_ipvs(net)->mixed_address_family_dests > 0) + if (ipvs->mixed_address_family_dests > 0) return -EINVAL; - return start_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); + rtnl_lock(); + mutex_lock(&ipvs->sync_mutex); + ret = start_sync_thread(ipvs, &c, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + return ret; } -static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) +static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { + int ret; + if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; - return stop_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_lock(&ipvs->sync_mutex); + ret = stop_sync_thread(ipvs, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + 
mutex_unlock(&ipvs->sync_mutex); + return ret; } -static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) +static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(net, &t); + __ip_vs_get_timeouts(ipvs, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); @@ -3364,38 +3470,33 @@ static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); - return ip_vs_set_timeout(net, &t); + return ip_vs_set_timeout(ipvs, &t); } static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { - int ret = 0, cmd; - struct net *net; - struct netns_ipvs *ipvs; + int ret = -EINVAL, cmd; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); - net = skb_sknet(skb); - ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; - mutex_lock(&ipvs->sync_mutex); if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], - ip_vs_daemon_policy)) { - ret = -EINVAL; + ip_vs_daemon_policy)) goto out; - } if (cmd == IPVS_CMD_NEW_DAEMON) - ret = ip_vs_genl_new_daemon(net, daemon_attrs); + ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); else - ret = ip_vs_genl_del_daemon(net, daemon_attrs); -out: - mutex_unlock(&ipvs->sync_mutex); + ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); } + +out: return ret; } @@ -3406,22 +3507,22 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) struct ip_vs_dest_user_kern udest; int ret = 0, cmd; int need_full_svc = 0, need_full_dest = 0; - struct net *net; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); - net = skb_sknet(skb); cmd = 
info->genlhdr->cmd; mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(net, false); + ret = ip_vs_flush(ipvs, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(net, info->attrs); + ret = ip_vs_genl_set_config(ipvs, info->attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(net); + ret = ip_vs_zero_all(ipvs); goto out; } @@ -3431,7 +3532,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = 1; - ret = ip_vs_genl_parse_service(net, &usvc, + ret = ip_vs_genl_parse_service(ipvs, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); if (ret) @@ -3470,7 +3571,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) /* The synchronization protocol is incompatible * with mixed family services */ - if (net_ipvs(net)->sync_state) { + if (ipvs->sync_state) { ret = -EINVAL; goto out; } @@ -3490,7 +3591,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) - ret = ip_vs_add_service(net, &usvc, &svc); + ret = ip_vs_add_service(ipvs, &usvc, &svc); else ret = -EEXIST; break; @@ -3528,9 +3629,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; - struct net *net; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); - net = skb_sknet(skb); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) @@ -3559,7 +3660,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_service *svc; - svc = ip_vs_genl_find_service(net, + svc = ip_vs_genl_find_service(ipvs, info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { ret = PTR_ERR(svc); @@ -3580,7 +3681,7 @@ static int 
ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(net, &t); + __ip_vs_get_timeouts(ipvs, &t); #ifdef CONFIG_IP_VS_PROTO_TCP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout) || @@ -3735,10 +3836,10 @@ static void ip_vs_genl_unregister(void) * per netns intit/exit func. */ #ifdef CONFIG_SYSCTL -static int __net_init ip_vs_control_net_init_sysctl(struct net *net) +static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { + struct net *net = ipvs->net; int idx; - struct netns_ipvs *ipvs = net_ipvs(net); struct ctl_table *tbl; atomic_set(&ipvs->dropentry, 0); @@ -3757,6 +3858,10 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) } else tbl = vs_vars; /* Initialize sysctl defaults */ + for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { + if (tbl[idx].proc_handler == proc_do_defense_mode) + tbl[idx].extra2 = ipvs; + } idx = 0; ipvs->sysctl_amemthresh = 1024; tbl[idx++].data = &ipvs->sysctl_amemthresh; @@ -3798,7 +3903,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_conn_reuse_mode = 1; tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; - + tbl[idx++].data = &ipvs->sysctl_schedule_icmp; + tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { @@ -3806,7 +3912,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) kfree(tbl); return -ENOMEM; } - ip_vs_start_estimator(net, &ipvs->tot_stats); + ip_vs_start_estimator(ipvs, &ipvs->tot_stats); ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); @@ -3815,14 +3921,14 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) return 0; } -static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) +static void __net_exit 
ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(net, &ipvs->tot_stats); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); @@ -3830,8 +3936,8 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) #else -static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } -static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } +static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } #endif @@ -3839,10 +3945,10 @@ static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, }; -int __net_init ip_vs_control_net_init(struct net *net) +int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { + struct net *net = ipvs->net; int i, idx; - struct netns_ipvs *ipvs = net_ipvs(net); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -3851,7 +3957,7 @@ int __net_init ip_vs_control_net_init(struct net *net) INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, - (unsigned long) net); + (unsigned long) ipvs); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); @@ -3873,7 +3979,7 @@ int __net_init ip_vs_control_net_init(struct net *net) proc_create("ip_vs_stats_percpu", 0, net->proc_net, &ip_vs_stats_percpu_fops); - if (ip_vs_control_net_init_sysctl(net)) + if (ip_vs_control_net_init_sysctl(ipvs)) goto err; return 0; @@ -3883,12 +3989,12 @@ err: return -ENOMEM; } -void __net_exit ip_vs_control_net_cleanup(struct net *net) +void __net_exit 
ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; - ip_vs_trash_cleanup(net); - ip_vs_control_net_cleanup_sysctl(net); + ip_vs_trash_cleanup(ipvs); + ip_vs_control_net_cleanup_sysctl(ipvs); remove_proc_entry("ip_vs_stats_percpu", net->proc_net); remove_proc_entry("ip_vs_stats", net->proc_net); remove_proc_entry("ip_vs", net->proc_net); diff --git a/kernel/net/netfilter/ipvs/ip_vs_est.c b/kernel/net/netfilter/ipvs/ip_vs_est.c index ef0eb0a8d..457c6c193 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_est.c +++ b/kernel/net/netfilter/ipvs/ip_vs_est.c @@ -102,10 +102,8 @@ static void estimation_timer(unsigned long arg) struct ip_vs_estimator *e; struct ip_vs_stats *s; u64 rate; - struct net *net = (struct net *)arg; - struct netns_ipvs *ipvs; + struct netns_ipvs *ipvs = (struct netns_ipvs *)arg; - ipvs = net_ipvs(net); spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); @@ -140,9 +138,8 @@ static void estimation_timer(unsigned long arg) mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } -void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) +void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; INIT_LIST_HEAD(&est->list); @@ -152,9 +149,8 @@ void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) spin_unlock_bh(&ipvs->est_lock); } -void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) +void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; spin_lock_bh(&ipvs->est_lock); @@ -192,18 +188,16 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) dst->outbps = (e->outbps + 0xF) >> 5; } -int __net_init ip_vs_estimator_net_init(struct net *net) 
+int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); - INIT_LIST_HEAD(&ipvs->est_list); spin_lock_init(&ipvs->est_lock); - setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); + setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)ipvs); mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } -void __net_exit ip_vs_estimator_net_cleanup(struct net *net) +void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { - del_timer_sync(&net_ipvs(net)->est_timer); + del_timer_sync(&ipvs->est_timer); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_ftp.c b/kernel/net/netfilter/ipvs/ip_vs_ftp.c index 5d3daae98..d30c327bb 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_ftp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_ftp.c @@ -181,7 +181,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; - struct net *net; *diff = 0; @@ -223,14 +222,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, */ { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + ip_vs_conn_fill_param(cp->ipvs, AF_INET, iph->protocol, &from, port, &cp->caddr, 0, &p); n_cp = ip_vs_conn_out_get(&p); } if (!n_cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), + ip_vs_conn_fill_param(cp->ipvs, AF_INET, IPPROTO_TCP, &cp->caddr, 0, &cp->vaddr, port, &p); /* As above, this is ipv4 only */ @@ -289,9 +288,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * would be adjusted twice. 
*/ - net = skb_net(skb); cp->app_data = NULL; - ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return ret; } @@ -320,7 +318,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; - struct net *net; /* no diff required for incoming packets */ *diff = 0; @@ -392,7 +389,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + ip_vs_conn_fill_param(cp->ipvs, AF_INET, iph->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); @@ -413,8 +410,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Move tunnel to listen state */ - net = skb_net(skb); - ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return 1; @@ -447,14 +443,14 @@ static int __net_init __ip_vs_ftp_init(struct net *net) if (!ipvs) return -ENOENT; - app = register_ip_vs_app(net, &ip_vs_ftp); + app = register_ip_vs_app(ipvs, &ip_vs_ftp); if (IS_ERR(app)) return PTR_ERR(app); for (i = 0; i < ports_count; i++) { if (!ports[i]) continue; - ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]); + ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; pr_info("%s: loaded support on port[%d] = %d\n", @@ -463,7 +459,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net) return 0; err_unreg: - unregister_ip_vs_app(net, &ip_vs_ftp); + unregister_ip_vs_app(ipvs, &ip_vs_ftp); return ret; } /* @@ -471,7 +467,12 @@ err_unreg: */ static void __ip_vs_ftp_exit(struct net *net) { - unregister_ip_vs_app(net, &ip_vs_ftp); + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return; + + unregister_ip_vs_app(ipvs, &ip_vs_ftp); } static struct pernet_operations ip_vs_ftp_ops = { diff --git a/kernel/net/netfilter/ipvs/ip_vs_lblc.c 
b/kernel/net/netfilter/ipvs/ip_vs_lblc.c index 127f14046..cccf4d637 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_lblc.c +++ b/kernel/net/netfilter/ipvs/ip_vs_lblc.c @@ -250,8 +250,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc) static int sysctl_lblc_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL - struct netns_ipvs *ipvs = net_ipvs(svc->net); - return ipvs->sysctl_lblc_expiration; + return svc->ipvs->sysctl_lblc_expiration; #else return DEFAULT_EXPIRATION; #endif diff --git a/kernel/net/netfilter/ipvs/ip_vs_lblcr.c b/kernel/net/netfilter/ipvs/ip_vs_lblcr.c index 2229d2d8b..796d70e47 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/kernel/net/netfilter/ipvs/ip_vs_lblcr.c @@ -415,8 +415,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc) static int sysctl_lblcr_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL - struct netns_ipvs *ipvs = net_ipvs(svc->net); - return ipvs->sysctl_lblcr_expiration; + return svc->ipvs->sysctl_lblcr_expiration; #else return DEFAULT_EXPIRATION; #endif diff --git a/kernel/net/netfilter/ipvs/ip_vs_nfct.c b/kernel/net/netfilter/ipvs/ip_vs_nfct.c index 5882bbfd1..30434fb13 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_nfct.c +++ b/kernel/net/netfilter/ipvs/ip_vs_nfct.c @@ -161,7 +161,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, /* RS->CLIENT */ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, + ip_vs_conn_fill_param(net_ipvs(net), exp->tuple.src.l3num, orig->dst.protonum, &orig->src.u3, orig->src.u.tcp.port, &orig->dst.u3, orig->dst.u.tcp.port, &p); cp = ip_vs_conn_out_get(&p); @@ -274,8 +274,7 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) " for conn " FMT_CONN "\n", __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); - h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, - &tuple); + h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); if (h) { ct = 
nf_ct_tuplehash_to_ctrack(h); /* Show what happens instead of calling nf_ct_kill() */ diff --git a/kernel/net/netfilter/ipvs/ip_vs_ovf.c b/kernel/net/netfilter/ipvs/ip_vs_ovf.c new file mode 100644 index 000000000..f7d62c3b7 --- /dev/null +++ b/kernel/net/netfilter/ipvs/ip_vs_ovf.c @@ -0,0 +1,86 @@ +/* + * IPVS: Overflow-Connection Scheduling module + * + * Authors: Raducu Deaconu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Scheduler implements "overflow" loadbalancing according to number of active + * connections , will keep all conections to the node with the highest weight + * and overflow to the next node if the number of connections exceeds the node's + * weight. + * Note that this scheduler might not be suitable for UDP because it only uses + * active connections + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* OVF Connection scheduling */ +static struct ip_vs_dest * +ip_vs_ovf_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *h = NULL; + int hw = 0, w; + + IP_VS_DBG(6, "ip_vs_ovf_schedule(): Scheduling...\n"); + /* select the node with highest weight, go to next in line if active + * connections exceed weight + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + w = atomic_read(&dest->weight); + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->activeconns) > w || + w == 0) + continue; + if (!h || w > hw) { + h = dest; + hw = w; + } + } + + if (h) { + IP_VS_DBG_BUF(6, "OVF: server %s:%u active %d w %d\n", + IP_VS_DBG_ADDR(h->af, &h->addr), + ntohs(h->port), + atomic_read(&h->activeconns), + atomic_read(&h->weight)); + return h; + } + + ip_vs_scheduler_err(svc, 
"no destination available"); + return NULL; +} + +static struct ip_vs_scheduler ip_vs_ovf_scheduler = { + .name = "ovf", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_ovf_scheduler.n_list), + .schedule = ip_vs_ovf_schedule, +}; + +static int __init ip_vs_ovf_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_ovf_scheduler); +} + +static void __exit ip_vs_ovf_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_ovf_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_ovf_init); +module_exit(ip_vs_ovf_cleanup); +MODULE_LICENSE("GPL"); diff --git a/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c b/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c index bed5f7042..1b8d594e4 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -70,7 +70,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) const char *dptr; int retc; - ip_vs_fill_iph_skb(p->af, skb, &iph); + ip_vs_fill_iph_skb(p->af, skb, false, &iph); /* Only useful with UDP */ if (iph.protocol != IPPROTO_UDP) diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto.c b/kernel/net/netfilter/ipvs/ip_vs_proto.c index 939f7fbe9..8ae480715 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto.c @@ -63,9 +63,8 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) * register an ipvs protocols netns related data */ static int -register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp) { - struct netns_ipvs *ipvs = net_ipvs(net); unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); @@ -79,7 +78,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) atomic_set(&pd->appcnt, 0); /* Init app counter */ if (pp->init_netns != NULL) { - int ret = pp->init_netns(net, pd); + int 
ret = pp->init_netns(ipvs, pd); if (ret) { /* unlink an free proto data */ ipvs->proto_data_table[hash] = pd->next; @@ -116,9 +115,8 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) * unregister an ipvs protocols netns data */ static int -unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data **pd_p; unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); @@ -127,7 +125,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) if (*pd_p == pd) { *pd_p = pd->next; if (pd->pp->exit_netns != NULL) - pd->pp->exit_netns(net, pd); + pd->pp->exit_netns(ipvs, pd); kfree(pd); return 0; } @@ -156,8 +154,8 @@ EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ -static struct ip_vs_proto_data * -__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; unsigned int hash = IP_VS_PROTO_HASH(proto); @@ -169,14 +167,6 @@ __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) return NULL; } - -struct ip_vs_proto_data * -ip_vs_proto_data_get(struct net *net, unsigned short proto) -{ - struct netns_ipvs *ipvs = net_ipvs(net); - - return __ipvs_proto_data_get(ipvs, proto); -} EXPORT_SYMBOL(ip_vs_proto_data_get); /* @@ -317,7 +307,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, /* * per network name-space init */ -int __net_init ip_vs_protocol_net_init(struct net *net) +int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs) { int i, ret; static struct ip_vs_protocol *protos[] = { @@ -339,27 +329,26 @@ int __net_init ip_vs_protocol_net_init(struct net *net) }; for (i = 0; i < ARRAY_SIZE(protos); i++) { - ret = register_ip_vs_proto_netns(net, protos[i]); + ret = 
register_ip_vs_proto_netns(ipvs, protos[i]); if (ret < 0) goto cleanup; } return 0; cleanup: - ip_vs_protocol_net_cleanup(net); + ip_vs_protocol_net_cleanup(ipvs); return ret; } -void __net_exit ip_vs_protocol_net_cleanup(struct net *net) +void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd; int i; /* unregister all the ipvs proto data for this netns */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pd = ipvs->proto_data_table[i]) != NULL) - unregister_ip_vs_proto_netns(net, pd); + unregister_ip_vs_proto_netns(ipvs, pd); } } diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 5de3dd312..5320d3997 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -41,30 +41,28 @@ struct isakmp_hdr { #define PORT_ISAKMP 500 static void -ah_esp_conn_fill_param_proto(struct net *net, int af, - const struct ip_vs_iphdr *iph, int inverse, +ah_esp_conn_fill_param_proto(struct netns_ipvs *ipvs, int af, + const struct ip_vs_iphdr *iph, struct ip_vs_conn_param *p) { - if (likely(!inverse)) - ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + if (likely(!ip_vs_iph_inverse(iph))) + ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->saddr, htons(PORT_ISAKMP), &iph->daddr, htons(PORT_ISAKMP), p); else - ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->daddr, htons(PORT_ISAKMP), &iph->saddr, htons(PORT_ISAKMP), p); } static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, - int inverse) +ah_esp_conn_in_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; - struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(ipvs, 
af, iph, &p); cp = ip_vs_conn_in_get(&p); if (!cp) { /* @@ -73,7 +71,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, */ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " "%s%s %s->%s\n", - inverse ? "ICMP+" : "", + ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); @@ -84,19 +82,18 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, static struct ip_vs_conn * -ah_esp_conn_out_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, int inverse) +ah_esp_conn_out_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; - struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", - inverse ? "ICMP+" : "", + ip_vs_iph_icmp(iph) ? 
"ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); @@ -107,7 +104,8 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +ah_esp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c index 5b84c0b56..010ddeec1 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -9,35 +9,44 @@ #include static int -sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { - struct net *net; struct ip_vs_service *svc; - struct netns_ipvs *ipvs; sctp_chunkhdr_t _schunkh, *sch; sctp_sctphdr_t *sh, _sctph; - - sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); - if (sh == NULL) { - *verdict = NF_DROP; - return 0; + __be16 _ports[2], *ports = NULL; + + if (likely(!ip_vs_iph_icmp(iph))) { + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (sh) { + sch = skb_header_pointer( + skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(_schunkh), &_schunkh); + if (sch && (sch->type == SCTP_CID_INIT || + sysctl_sloppy_sctp(ipvs))) + ports = &sh->source; + } + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); } - sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), - sizeof(_schunkh), &_schunkh); - if (sch == NULL) { + if (!ports) { *verdict = NF_DROP; return 0; } - net = skb_net(skb); - ipvs = net_ipvs(net); rcu_read_lock(); - if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) && - (svc = 
ip_vs_service_find(net, af, skb->mark, iph->protocol, - &iph->daddr, sh->dest))) { + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { @@ -474,14 +483,13 @@ static inline __u16 sctp_app_hashkey(__be16 port) & SCTP_APP_TAB_MASK; } -static int sctp_register_app(struct net *net, struct ip_vs_app *inc) +static int sctp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); hash = sctp_app_hashkey(port); @@ -498,9 +506,9 @@ out: return ret; } -static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) +static void sctp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); @@ -508,7 +516,7 @@ static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) static int sctp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; @@ -549,10 +557,8 @@ out: * timeouts is netns related now. 
* --------------------------------------------- */ -static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_sctp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); @@ -561,7 +567,7 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) return 0; } -static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) +static void __ip_vs_sctp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c index 8e92beb0c..d7024b2ed 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -32,27 +32,47 @@ #include static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { - struct net *net; struct ip_vs_service *svc; struct tcphdr _tcph, *th; - struct netns_ipvs *ipvs; + __be16 _ports[2], *ports = NULL; - th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); - if (th == NULL) { + /* In the event of icmp, we're only guaranteed to have the first 8 + * bytes of the transport header, so we only check the rest of the + * TCP packet for non-ICMP packets + */ + if (likely(!ip_vs_iph_icmp(iph))) { + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th) { + if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) + return 1; + ports = &th->source; + } + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); + } + + if (!ports) { *verdict = NF_DROP; return 0; } - net = skb_net(skb); 
- ipvs = net_ipvs(net); + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ rcu_read_lock(); - if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst && - (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, - &iph->daddr, th->dest))) { + + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + + if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { @@ -571,14 +591,13 @@ static inline __u16 tcp_app_hashkey(__be16 port) } -static int tcp_register_app(struct net *net, struct ip_vs_app *inc) +static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); hash = tcp_app_hashkey(port); @@ -597,9 +616,9 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) static void -tcp_unregister_app(struct net *net, struct ip_vs_app *inc) +tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); @@ -609,7 +628,7 @@ tcp_unregister_app(struct net *net, struct ip_vs_app *inc) static int tcp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; @@ -653,9 +672,9 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* * Set LISTEN timeout. 
(ip_vs_conn_put will setup timer) */ -void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP); spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; @@ -668,10 +687,8 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) * timeouts is netns related now. * --------------------------------------------- */ -static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); @@ -681,7 +698,7 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) return 0; } -static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) +static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c index b62a3c0ff..e494e9a88 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -29,28 +29,42 @@ #include static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { - struct net *net; struct ip_vs_service *svc; struct udphdr _udph, *uh; + __be16 _ports[2], *ports = NULL; - /* IPv6 fragments, only first fragment will hit this */ - uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); - if (uh == NULL) { + if (likely(!ip_vs_iph_icmp(iph))) { + 
/* IPv6 fragments, only first fragment will hit this */ + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (uh) + ports = &uh->source; + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); + } + + if (!ports) { *verdict = NF_DROP; return 0; } - net = skb_net(skb); + rcu_read_lock(); - svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, - &iph->daddr, uh->dest); + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + if (svc) { int ignored; - if (ip_vs_todrop(net_ipvs(net))) { + if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( @@ -348,14 +362,13 @@ static inline __u16 udp_app_hashkey(__be16 port) } -static int udp_register_app(struct net *net, struct ip_vs_app *inc) +static int udp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); hash = udp_app_hashkey(port); @@ -374,9 +387,9 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc) static void -udp_unregister_app(struct net *net, struct ip_vs_app *inc) +udp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); @@ -385,7 +398,7 @@ udp_unregister_app(struct net *net, struct ip_vs_app *inc) static int udp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct netns_ipvs *ipvs = cp->ipvs; int hash; struct 
ip_vs_app *inc; int result = 0; @@ -456,10 +469,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; } -static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); @@ -468,7 +479,7 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) return 0; } -static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) +static void __udp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_sched.c b/kernel/net/netfilter/ipvs/ip_vs_sched.c index 199760c71..a2ff7d746 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_sched.c +++ b/kernel/net/netfilter/ipvs/ip_vs_sched.c @@ -74,7 +74,7 @@ void ip_vs_unbind_scheduler(struct ip_vs_service *svc, if (sched->done_service) sched->done_service(svc); - /* svc->scheduler can not be set to NULL */ + /* svc->scheduler can be set to NULL only by caller */ } @@ -137,7 +137,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) { - if (scheduler && scheduler->module) + if (scheduler) module_put(scheduler->module); } @@ -147,21 +147,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { - struct ip_vs_scheduler *sched; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? 
sched->name : "none"; - sched = rcu_dereference(svc->scheduler); if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", - sched->name, svc->fwmark, svc->fwmark, msg); + sched_name, svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } diff --git a/kernel/net/netfilter/ipvs/ip_vs_sh.c b/kernel/net/netfilter/ipvs/ip_vs_sh.c index 98a13433b..1e373a5e4 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_sh.c +++ b/kernel/net/netfilter/ipvs/ip_vs_sh.c @@ -280,35 +280,29 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, static inline __be16 ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { - __be16 port; - struct tcphdr _tcph, *th; - struct udphdr _udph, *uh; - sctp_sctphdr_t _sctph, *sh; + __be16 _ports[2], *ports; + /* At this point we know that we have a valid packet of some kind. + * Because ICMP packets are only guaranteed to have the first 8 + * bytes, let's just grab the ports. Fortunately they're in the + * same position for all three of the protocols we care about. 
+ */ switch (iph->protocol) { case IPPROTO_TCP: - th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); - if (unlikely(th == NULL)) - return 0; - port = th->source; - break; case IPPROTO_UDP: - uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); - if (unlikely(uh == NULL)) - return 0; - port = uh->source; - break; case IPPROTO_SCTP: - sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); - if (unlikely(sh == NULL)) + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) return 0; - port = sh->source; - break; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; default: - port = 0; + return 0; } - - return port; } @@ -322,6 +316,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_dest *dest; struct ip_vs_sh_state *s; __be16 port = 0; + const union nf_inet_addr *hash_addr; + + hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); @@ -331,9 +328,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, s = (struct ip_vs_sh_state *) svc->sched_data; if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) - dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); + dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port); else - dest = ip_vs_sh_get(svc, s, &iph->saddr, port); + dest = ip_vs_sh_get(svc, s, hash_addr, port); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); @@ -341,7 +338,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, } IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph->saddr), + IP_VS_DBG_ADDR(svc->af, hash_addr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); diff --git a/kernel/net/netfilter/ipvs/ip_vs_sync.c b/kernel/net/netfilter/ipvs/ip_vs_sync.c index 19b9cce6c..803001a45 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_sync.c +++ 
b/kernel/net/netfilter/ipvs/ip_vs_sync.c @@ -193,7 +193,7 @@ union ip_vs_sync_conn { #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) struct ip_vs_sync_thread_data { - struct net *net; + struct netns_ipvs *ipvs; struct socket *sock; char *buf; int id; @@ -262,6 +262,11 @@ struct ip_vs_sync_mesg { /* ip_vs_sync_conn entries start here */ }; +union ipvs_sockaddr { + struct sockaddr_in in; + struct sockaddr_in6 in6; +}; + struct ip_vs_sync_buff { struct list_head list; unsigned long firstuse; @@ -320,26 +325,28 @@ sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) * Create a new sync buffer for Version 1 proto. */ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; - sb->mesg->syncid = ipvs->master_syncid; + sb->mesg->syncid = ipvs->mcfg.syncid; sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); - sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)sb->mesg + len; sb->firstuse = jiffies; return sb; @@ -402,7 +409,7 @@ select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) * Create a new sync buffer for Version 0 proto. 
*/ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; struct ip_vs_sync_mesg_v0 *mesg; @@ -410,17 +417,19 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; - mesg->syncid = ipvs->master_syncid; + mesg->syncid = ipvs->mcfg.syncid; mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); - sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)mesg + len; sb->firstuse = jiffies; return sb; } @@ -524,16 +533,15 @@ set: * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. */ -static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, +static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg_v0 *m; struct ip_vs_sync_conn_v0 *s; struct ip_vs_sync_buff *buff; struct ipvs_master_sync_state *ms; int id; - int len; + unsigned int len; if (unlikely(cp->af != AF_INET)) return; @@ -553,17 +561,19 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, id = select_master_thread_id(ipvs, cp); ms = &ipvs->ms[id]; buff = ms->sync_buff; + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? 
FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; if (buff) { m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; /* Send buffer if it is for v1 */ - if (!m->nr_conns) { + if (buff->head + len > buff->end || !m->nr_conns) { sb_queue_tail(ipvs, ms); ms->sync_buff = NULL; buff = NULL; } } if (!buff) { - buff = ip_vs_sync_buff_create_v0(ipvs); + buff = ip_vs_sync_buff_create_v0(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -572,8 +582,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, ms->sync_buff = buff; } - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; s = (struct ip_vs_sync_conn_v0 *) buff->head; @@ -597,12 +605,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, m->nr_conns++; m->size = htons(ntohs(m->size) + len); buff->head += len; - - /* check if there is a space for next one */ - if (buff->head + FULL_CONN_SIZE > buff->end) { - sb_queue_tail(ipvs, ms); - ms->sync_buff = NULL; - } spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ @@ -612,7 +614,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, pkts = atomic_add_return(1, &cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); - ip_vs_sync_conn(net, cp->control, pkts); + ip_vs_sync_conn(ipvs, cp, pkts); } } @@ -621,9 +623,8 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, * Called by ip_vs_in. 
* Sending Version 1 messages */ -void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) +void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m; union ip_vs_sync_conn *s; struct ip_vs_sync_buff *buff; @@ -634,7 +635,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) /* Handle old version of the protocol */ if (sysctl_sync_ver(ipvs) == 0) { - ip_vs_sync_conn_v0(net, cp, pkts); + ip_vs_sync_conn_v0(ipvs, cp, pkts); return; } /* Do not sync ONE PACKET */ @@ -694,7 +695,7 @@ sloop: } if (!buff) { - buff = ip_vs_sync_buff_create(ipvs); + buff = ip_vs_sync_buff_create(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -781,21 +782,21 @@ control: * fill_param used by version 1 */ static inline int -ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, +ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, struct ip_vs_conn_param *p, __u8 *pe_data, unsigned int pe_data_len, __u8 *pe_name, unsigned int pe_name_len) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) - ip_vs_conn_fill_param(net, af, sc->v6.protocol, + ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, (const union nf_inet_addr *)&sc->v6.caddr, sc->v6.cport, (const union nf_inet_addr *)&sc->v6.vaddr, sc->v6.vport, p); else #endif - ip_vs_conn_fill_param(net, af, sc->v4.protocol, + ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, (const union nf_inet_addr *)&sc->v4.caddr, sc->v4.cport, (const union nf_inet_addr *)&sc->v4.vaddr, @@ -834,7 +835,7 @@ ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, * Param: ... * timeout is in sec. 
*/ -static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, +static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, unsigned int flags, unsigned int state, unsigned int protocol, unsigned int type, const union nf_inet_addr *daddr, __be16 dport, @@ -843,7 +844,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, { struct ip_vs_dest *dest; struct ip_vs_conn *cp; - struct netns_ipvs *ipvs = net_ipvs(net); if (!(flags & IP_VS_CONN_F_TEMPLATE)) { cp = ip_vs_conn_in_get(param); @@ -901,7 +901,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, * with synchronization, so we can make the assumption that * the svc_af is the same as the dest_af */ - dest = ip_vs_find_dest(net, type, type, daddr, dport, + dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark, flags); @@ -938,7 +938,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, } else { struct ip_vs_proto_data *pd; - pd = ip_vs_proto_data_get(net, protocol); + pd = ip_vs_proto_data_get(ipvs, protocol); if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) cp->timeout = pd->timeout_table[state]; else @@ -950,7 +950,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, /* * Process received multicast message for Version 0 */ -static void ip_vs_process_message_v0(struct net *net, const char *buffer, +static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, const size_t buflen) { struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; @@ -1006,14 +1006,14 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer, } } - ip_vs_conn_fill_param(net, AF_INET, s->protocol, + ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, (const union nf_inet_addr *)&s->caddr, s->cport, (const union nf_inet_addr *)&s->vaddr, s->vport, ¶m); /* Send timeout as Zero */ - 
ip_vs_proc_conn(net, ¶m, flags, state, s->protocol, AF_INET, + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, (union nf_inet_addr *)&s->daddr, s->dport, 0, 0, opt); } @@ -1064,7 +1064,7 @@ static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, /* * Process a Version 1 sync. connection */ -static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) +static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) { struct ip_vs_sync_conn_options opt; union ip_vs_sync_conn *s; @@ -1168,21 +1168,21 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) state = 0; } } - if (ip_vs_conn_fill_param_sync(net, af, s, ¶m, pe_data, + if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, pe_data_len, pe_name, pe_name_len)) { retc = 50; goto out; } /* If only IPv4, just silent skip IPv6 */ if (af == AF_INET) - ip_vs_proc_conn(net, ¶m, flags, state, s->v4.protocol, af, + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, ntohl(s->v4.timeout), ntohl(s->v4.fwmark), (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) ); #ifdef CONFIG_IP_VS_IPV6 else - ip_vs_proc_conn(net, ¶m, flags, state, s->v6.protocol, af, + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, ntohl(s->v6.timeout), ntohl(s->v6.fwmark), (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) @@ -1201,10 +1201,9 @@ out: * ip_vs_conn entries. 
* Handles Version 0 & 1 */ -static void ip_vs_process_message(struct net *net, __u8 *buffer, +static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, const size_t buflen) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; __u8 *p, *msg_end; int i, nr_conns; @@ -1219,7 +1218,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, return; } /* SyncID sanity check */ - if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { + if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); return; } @@ -1254,7 +1253,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, return; } /* Process a single sync_conn */ - retc = ip_vs_proc_sync_conn(net, p, msg_end); + retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); if (retc < 0) { IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", retc); @@ -1265,7 +1264,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, } } else { /* Old type of message */ - ip_vs_process_message_v0(net, buffer, buflen); + ip_vs_process_message_v0(ipvs, buffer, buflen); return; } } @@ -1303,6 +1302,14 @@ static void set_mcast_loop(struct sock *sk, u_char loop) /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ lock_sock(sk); inet->mc_loop = loop ? 1 : 0; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_LOOP */ + np->mc_loop = loop ? 
1 : 0; + } +#endif release_sock(sk); } @@ -1316,6 +1323,33 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ lock_sock(sk); inet->mc_ttl = ttl; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_HOPS */ + np->mcast_hops = ttl; + } +#endif + release_sock(sk); +} + +/* Control fragmentation of messages */ +static void set_mcast_pmtudisc(struct sock *sk, int val) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ + lock_sock(sk); + inet->pmtudisc = val; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MTU_DISCOVER */ + np->pmtudisc = val; + } +#endif release_sock(sk); } @@ -1338,44 +1372,15 @@ static int set_mcast_if(struct sock *sk, char *ifname) lock_sock(sk); inet->mc_index = dev->ifindex; /* inet->mc_addr = 0; */ - release_sock(sk); - - return 0; -} - +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. 
- */ -static int set_sync_mesg_maxlen(struct net *net, int sync_state) -{ - struct netns_ipvs *ipvs = net_ipvs(net); - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); - if (!dev) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + - SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", ipvs->send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); - if (!dev) - return -ENODEV; - - ipvs->recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", ipvs->recv_mesg_maxlen); + /* IPV6_MULTICAST_IF */ + np->mcast_oif = dev->ifindex; } +#endif + release_sock(sk); return 0; } @@ -1405,15 +1410,34 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) mreq.imr_ifindex = dev->ifindex; - rtnl_lock(); lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); release_sock(sk); - rtnl_unlock(); return ret; } +#ifdef CONFIG_IP_VS_IPV6 +static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, + char *ifname) +{ + struct net *net = sock_net(sk); + struct net_device *dev; + int ret; + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); + release_sock(sk); + + return ret; +} +#endif static int bind_mcastif_addr(struct socket *sock, char *ifname) { @@ -1442,53 +1466,69 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); } +static void 
get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, + struct ipvs_sync_daemon_cfg *c, int id) +{ + if (AF_INET6 == c->mcast_af) { + sa->in6 = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_port = htons(c->mcast_port + id), + }; + sa->in6.sin6_addr = c->mcast_group.in6; + *salen = sizeof(sa->in6); + } else { + sa->in = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_port = htons(c->mcast_port + id), + }; + sa->in.sin_addr = c->mcast_group.in; + *salen = sizeof(sa->in); + } +} + /* * Set up sending multicast socket over UDP */ -static struct socket *make_send_sock(struct net *net, int id) +static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) { - struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; - /* First create a socket move it to right name space later */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + /* First create a socket */ + result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. 
- */ - sk_change_net(sock->sk, net); - result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); + result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); goto error; } set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); + set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); + /* Allow fragmentation if MTU changes */ + set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); result = sysctl_sync_sock_size(ipvs); if (result > 0) set_sock_size(sock->sk, 1, result); - result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); + if (AF_INET == ipvs->mcfg.mcast_af) + result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); + else + result = 0; if (result < 0) { pr_err("Error binding address of the mcast interface\n"); goto error; } + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr), 0); + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; @@ -1497,7 +1537,7 @@ static struct socket *make_send_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1505,47 +1545,42 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct net *net, int id) +static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) { - struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { 
pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = SK_CAN_REUSE; result = sysctl_sync_sock_size(ipvs); if (result > 0) set_sock_size(sock->sk, 0, result); - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr)); + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; } /* join the multicast group */ - result = join_mcast_group(sock->sk, - (struct in_addr *) &mcast_addr.sin_addr, - ipvs->backup_mcast_ifn); +#ifdef CONFIG_IP_VS_IPV6 + if (ipvs->bcfg.mcast_af == AF_INET6) + result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, + ipvs->bcfg.mcast_ifn); + else +#endif + result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, + ipvs->bcfg.mcast_ifn); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; @@ -1554,7 +1589,7 @@ static struct socket *make_receive_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1646,14 +1681,14 @@ next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; - struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct netns_ipvs *ipvs = tinfo->ipvs; struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; struct sock *sk = tinfo->sock->sk; struct ip_vs_sync_buff *sb; pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->master_mcast_ifn, 
ipvs->master_syncid, tinfo->id); + ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); for (;;) { sb = next_sync_buff(ipvs, ms); @@ -1692,7 +1727,7 @@ done: ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo); return 0; @@ -1702,12 +1737,12 @@ done: static int sync_thread_backup(void *data) { struct ip_vs_sync_thread_data *tinfo = data; - struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct netns_ipvs *ipvs = tinfo->ipvs; int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); + ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1717,19 +1752,19 @@ static int sync_thread_backup(void *data) /* do we have data now? */ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { len = ip_vs_receive(tinfo->sock, tinfo->buf, - ipvs->recv_mesg_maxlen); + ipvs->bcfg.sync_maxlen); if (len <= 0) { if (len != -EAGAIN) pr_err("receiving message error\n"); break; } - ip_vs_process_message(tinfo->net, tinfo->buf, len); + ip_vs_process_message(ipvs, tinfo->buf, len); } } /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); @@ -1737,16 +1772,18 @@ static int sync_thread_backup(void *data) } -int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) +int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, + int state) { struct ip_vs_sync_thread_data *tinfo; struct task_struct **array = NULL, *task; struct socket *sock; - struct netns_ipvs *ipvs = net_ipvs(net); + struct net_device *dev; char *name; int (*threadfn)(void *data); - int id, count; + int id, count, hlen; int result = -ENOMEM; + u16 mtu, min_mtu; IP_VS_DBG(7, "%s(): pid %d\n", __func__, 
task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", @@ -1758,22 +1795,46 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) } else count = ipvs->threads_mask + 1; + if (c->mcast_af == AF_UNSPEC) { + c->mcast_af = AF_INET; + c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); + } + if (!c->mcast_port) + c->mcast_port = IP_VS_SYNC_PORT; + if (!c->mcast_ttl) + c->mcast_ttl = 1; + + dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); + if (!dev) { + pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); + return -ENODEV; + } + hlen = (AF_INET6 == c->mcast_af) ? + sizeof(struct ipv6hdr) + sizeof(struct udphdr) : + sizeof(struct iphdr) + sizeof(struct udphdr); + mtu = (state == IP_VS_STATE_BACKUP) ? + clamp(dev->mtu, 1500U, 65535U) : 1500U; + min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; + + if (c->sync_maxlen) + c->sync_maxlen = clamp_t(unsigned int, + c->sync_maxlen, min_mtu, + 65535 - hlen); + else + c->sync_maxlen = mtu - hlen; + if (state == IP_VS_STATE_MASTER) { if (ipvs->ms) return -EEXIST; - strlcpy(ipvs->master_mcast_ifn, mcast_ifn, - sizeof(ipvs->master_mcast_ifn)); - ipvs->master_syncid = syncid; + ipvs->mcfg = *c; name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; } else if (state == IP_VS_STATE_BACKUP) { if (ipvs->backup_threads) return -EEXIST; - strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, - sizeof(ipvs->backup_mcast_ifn)); - ipvs->backup_syncid = syncid; + ipvs->bcfg = *c; name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; } else { @@ -1801,14 +1862,13 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) if (!array) goto out; } - set_sync_mesg_maxlen(net, state); tinfo = NULL; for (id = 0; id < count; id++) { if (state == IP_VS_STATE_MASTER) - sock = make_send_sock(net, id); + sock = make_send_sock(ipvs, id); else - sock = make_receive_sock(net, id); + sock = make_receive_sock(ipvs, id); if (IS_ERR(sock)) { result = PTR_ERR(sock); goto outtinfo; @@ 
-1816,10 +1876,10 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) goto outsocket; - tinfo->net = net; + tinfo->ipvs = ipvs; tinfo->sock = sock; if (state == IP_VS_STATE_BACKUP) { - tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, GFP_KERNEL); if (!tinfo->buf) goto outtinfo; @@ -1854,11 +1914,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) return 0; outsocket: - sk_release_kernel(sock->sk); + sock_release(sock); outtinfo: if (tinfo) { - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); } @@ -1880,9 +1940,8 @@ out: } -int stop_sync_thread(struct net *net, int state) +int stop_sync_thread(struct netns_ipvs *ipvs, int state) { - struct netns_ipvs *ipvs = net_ipvs(net); struct task_struct **array; int id; int retc = -EINVAL; @@ -1948,27 +2007,24 @@ int stop_sync_thread(struct net *net, int state) /* * Initialize data struct for each netns */ -int __net_init ip_vs_sync_net_init(struct net *net) +int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); - __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); spin_lock_init(&ipvs->sync_lock); spin_lock_init(&ipvs->sync_buff_lock); return 0; } -void ip_vs_sync_net_cleanup(struct net *net) +void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) { int retc; - struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); - retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); if (retc && retc != -ESRCH) pr_err("Failed to stop Master Daemon\n"); - retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); if (retc && retc != -ESRCH) pr_err("Failed to stop Backup Daemon\n"); mutex_unlock(&ipvs->sync_mutex); diff --git 
a/kernel/net/netfilter/ipvs/ip_vs_xmit.c b/kernel/net/netfilter/ipvs/ip_vs_xmit.c index 19986ec5f..3264cb49b 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_xmit.c +++ b/kernel/net/netfilter/ipvs/ip_vs_xmit.c @@ -130,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; - fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? FLOWI_FLAG_KNOWN_NH : 0; @@ -213,19 +212,20 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } -static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, +static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, + int rt_mode, struct ip_vs_iphdr *ipvsh, struct sk_buff *skb, int mtu) { #ifdef CONFIG_IP_VS_IPV6 if (skb_af == AF_INET6) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = ipvs->net; if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { if (!skb->dev) skb->dev = net->loopback_dev; /* only send ICMP too big on first fragment */ - if (!ipvsh->fragoffs) + if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh)) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); @@ -234,8 +234,6 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, } else #endif { - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); - /* If we're going to tunnel the packet and pmtu discovery * is disabled, we'll just fragment it anyway */ @@ -243,7 +241,8 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, return true; if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len > mtu && !skb_is_gso(skb))) { + skb->len > mtu && !skb_is_gso(skb) && + !ip_vs_iph_icmp(ipvsh))) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG(1, "frag needed for %pI4\n", @@ -257,11 +256,12 @@ static inline bool 
ensure_mtu_is_adequate(int skb_af, int rt_mode, /* Get route to destination or remote server */ static int -__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + struct ip_vs_dest *dest, __be32 daddr, int rt_mode, __be32 *ret_saddr, struct ip_vs_iphdr *ipvsh) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = ipvs->net; struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ int mtu; @@ -337,7 +337,7 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, maybe_update_pmtu(skb_af, skb, mtu); } - if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) + if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; skb_dst_drop(skb); @@ -364,13 +364,16 @@ err_unreach: #ifdef CONFIG_IP_VS_IPV6 static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, - struct in6_addr *ret_saddr, int do_xfrm) + struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) { struct dst_entry *dst; struct flowi6 fl6 = { .daddr = *daddr, }; + if (rt_mode & IP_VS_RT_MODE_KNOWN_NH) + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; + dst = ip6_route_output(net, NULL, &fl6); if (dst->error) goto out_err; @@ -400,11 +403,12 @@ out_err: * Get route to destination or remote server */ static int -__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = ipvs->net; struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ struct dst_entry *dst; @@ -427,7 +431,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, } dst = 
__ip_vs_route_output_v6(net, &dest->addr.in6, &dest_dst->dst_saddr.in6, - do_xfrm); + do_xfrm, rt_mode); if (!dst) { __ip_vs_dst_set(dest, NULL, NULL, 0); spin_unlock_bh(&dest->dst_lock); @@ -435,7 +439,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, goto err_unreach; } rt = (struct rt6_info *) dst; - cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", @@ -446,7 +450,8 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, *ret_saddr = dest_dst->dst_saddr.in6; } else { noref = 0; - dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, + rt_mode); if (!dst) goto err_unreach; rt = (struct rt6_info *) dst; @@ -481,7 +486,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, maybe_update_pmtu(skb_af, skb, mtu); } - if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) + if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; skb_dst_drop(skb); @@ -501,6 +506,13 @@ err_put: return -1; err_unreach: + /* The ip6_link_failure function requires the dev field to be set + * in order to get the net (further for the sake of fwmark + * reflection). + */ + if (!skb->dev) + skb->dev = skb_dst(skb)->dev; + dst_link_failure(skb); return -1; } @@ -519,10 +531,27 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset(skb); skb_forward_csum(skb); + if (!skb->sk) + skb_sender_cpu_clear(skb); } return ret; } +/* In the event of a remote destination, it's possible that we would have + * matches against an old socket (particularly a TIME-WAIT socket). This + * causes havoc down the line (ip_local_out et. al. 
expect regular sockets + * and invalid memory accesses will happen) so simply drop the association + * in this case. +*/ +static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb) +{ + /* If dev is set, the packet came from the LOCAL_IN callback and + * not from a local TCP socket. + */ + if (skb->dev) + skb_orphan(skb); +} + /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, struct ip_vs_conn *cp, int local) @@ -534,12 +563,23 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 1); + + /* Remove the early_demux association unless it's bound for the + * exact same port and address on this host after translation. + */ + if (!local || cp->vport != cp->dport || + !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr)) + ip_vs_drop_early_demux_sk(skb); + if (!local) { skb_forward_csum(skb); - NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, - NULL, skb_dst(skb)->dev, dst_output_sk); + if (!skb->sk) + skb_sender_cpu_clear(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output); } else ret = NF_ACCEPT; + return ret; } @@ -553,9 +593,12 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) ip_vs_notrack(skb); if (!local) { + ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); - NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, - NULL, skb_dst(skb)->dev, dst_output_sk); + if (!skb->sk) + skb_sender_cpu_clear(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output); } else ret = NF_ACCEPT; return ret; @@ -588,7 +631,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr, + if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, NULL, 
ipvsh) < 0) goto tx_error; @@ -615,10 +658,13 @@ int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { + struct ipv6hdr *iph = ipv6_hdr(skb); + EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL, + if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, + &iph->daddr, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; @@ -665,7 +711,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } was_input = rt_is_input_route(skb_rtable(skb)); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR, NULL, ipvsh); @@ -682,7 +728,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, + IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off, "ip_vs_nat_xmit(): " "stopping DNAT to local address"); goto tx_error; @@ -692,8 +738,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? 
*/ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { - IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " - "stopping DNAT to loopback address"); + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off, + "ip_vs_nat_xmit(): stopping DNAT to loopback " + "address"); goto tx_error; } @@ -710,7 +757,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); - IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -753,7 +800,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -771,7 +819,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); goto tx_error; @@ -781,8 +829,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? 
*/ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { - IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); goto tx_error; @@ -800,7 +848,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; ipv6_hdr(skb)->daddr = cp->daddr.in6; - IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -841,6 +889,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, struct ipv6hdr *old_ipv6h = NULL; #endif + ip_vs_drop_early_demux_sk(skb); + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) @@ -924,8 +974,8 @@ int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct net *net = skb_net(skb); - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = cp->ipvs; + struct net *net = ipvs->net; struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ @@ -941,7 +991,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT | @@ -999,7 +1049,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip_local_out(skb); + ip_local_out(net, skb->sk, skb); else if (ret 
== NF_DROP) kfree_skb(skb); rcu_read_unlock(); @@ -1035,7 +1085,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1090,7 +1141,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip6_local_out(skb); + ip6_local_out(cp->ipvs->net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); rcu_read_unlock(); @@ -1122,7 +1173,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); @@ -1161,10 +1212,12 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL); + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH); if (local < 0) goto tx_error; if (local) { @@ -1229,7 +1282,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, NULL, iph); if (local < 0) goto tx_error; @@ -1321,8 +1374,8 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : 
IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, - NULL, ipvsh, 0, rt_mode); + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; rt = (struct rt6_info *) skb_dst(skb); @@ -1346,7 +1399,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); diff --git a/kernel/net/netfilter/nf_conntrack_core.c b/kernel/net/netfilter/nf_conntrack_core.c index 13fad8668..3cb3cb831 100644 --- a/kernel/net/netfilter/nf_conntrack_core.c +++ b/kernel/net/netfilter/nf_conntrack_core.c @@ -126,7 +126,7 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); unsigned int nf_conntrack_hash_rnd __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) { unsigned int n; @@ -135,7 +135,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) * three bytes manually. 
*/ n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } @@ -151,15 +151,15 @@ static u32 hash_bucket(u32 hash, const struct net *net) } static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - u16 zone, unsigned int size) + unsigned int size) { - return __hash_bucket(hash_conntrack_raw(tuple, zone), size); + return __hash_bucket(hash_conntrack_raw(tuple), size); } -static inline u_int32_t hash_conntrack(const struct net *net, u16 zone, +static inline u_int32_t hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, zone, net->ct.htable_size); + return __hash_conntrack(tuple, net->ct.htable_size); } bool @@ -168,6 +168,7 @@ nf_ct_get_tuple(const struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, + struct net *net, struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) @@ -181,12 +182,13 @@ nf_ct_get_tuple(const struct sk_buff *skb, tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; - return l4proto->pkt_to_tuple(skb, dataoff, tuple); + return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); } EXPORT_SYMBOL_GPL(nf_ct_get_tuple); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, - u_int16_t l3num, struct nf_conntrack_tuple *tuple) + u_int16_t l3num, + struct net *net, struct nf_conntrack_tuple *tuple) { struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -205,7 +207,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, l4proto = __nf_ct_l4proto_find(l3num, protonum); - ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple, + ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, l3proto, 
l4proto); rcu_read_unlock(); @@ -287,6 +289,40 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) spin_unlock(&pcpu->lock); } +/* Released via destroy_conntrack() */ +struct nf_conn *nf_ct_tmpl_alloc(struct net *net, + const struct nf_conntrack_zone *zone, + gfp_t flags) +{ + struct nf_conn *tmpl; + + tmpl = kzalloc(sizeof(*tmpl), flags); + if (tmpl == NULL) + return NULL; + + tmpl->status = IPS_TEMPLATE; + write_pnet(&tmpl->ct_net, net); + + if (nf_ct_zone_add(tmpl, flags, zone) < 0) + goto out_free; + + atomic_set(&tmpl->ct_general.use, 0); + + return tmpl; +out_free: + kfree(tmpl); + return NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); + +void nf_ct_tmpl_free(struct nf_conn *tmpl) +{ + nf_ct_ext_destroy(tmpl); + nf_ct_ext_free(tmpl); + kfree(tmpl); +} +EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); + static void destroy_conntrack(struct nf_conntrack *nfct) { @@ -298,6 +334,10 @@ destroy_conntrack(struct nf_conntrack *nfct) NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); + if (unlikely(nf_ct_is_template(ct))) { + nf_ct_tmpl_free(ct); + return; + } rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto && l4proto->destroy) @@ -329,7 +369,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; - u16 zone = nf_ct_zone(ct); unsigned int sequence; nf_ct_helper_destroy(ct); @@ -337,9 +376,9 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) local_bh_disable(); do { sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_conntrack(net, zone, + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -387,8 +426,8 @@ static void death_by_timeout(unsigned long ul_conntrack) static inline 
bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, - const struct nf_conntrack_tuple *tuple, - u16 zone) + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -396,8 +435,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, * so we need to check that the conntrack is confirmed */ return nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(ct) == zone && - nf_ct_is_confirmed(ct); + nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && + nf_ct_is_confirmed(ct); } /* @@ -406,7 +445,7 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, * and recheck nf_ct_tuple_equal(tuple, &h->tuple) */ static struct nf_conntrack_tuple_hash * -____nf_conntrack_find(struct net *net, u16 zone, +____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; @@ -442,7 +481,7 @@ begin: /* Find a connection corresponding to a tuple. 
*/ static struct nf_conntrack_tuple_hash * -__nf_conntrack_find_get(struct net *net, u16 zone, +__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; @@ -469,11 +508,11 @@ begin: } struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(struct net *net, u16 zone, +nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, zone)); + hash_conntrack_raw(tuple)); } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -492,11 +531,11 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, int nf_conntrack_hash_check_insert(struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - u16 zone; unsigned int sequence; zone = nf_ct_zone(ct); @@ -504,9 +543,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) local_bh_disable(); do { sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_conntrack(net, zone, + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -514,12 +553,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - zone == 
nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; add_timer(&ct->timeout); @@ -540,32 +581,11 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); -/* deletion from this larval template list happens via nf_ct_put() */ -void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) -{ - struct ct_pcpu *pcpu; - - __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); - nf_conntrack_get(&tmpl->ct_general); - - /* add this conntrack to the (per cpu) tmpl list */ - local_bh_disable(); - tmpl->cpu = smp_processor_id(); - pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); - - spin_lock(&pcpu->lock); - /* Overload tuple linked list to put us in template list. */ - hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &pcpu->tmpl); - spin_unlock_bh(&pcpu->lock); -} -EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); - /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) { + const struct nf_conntrack_zone *zone; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -574,7 +594,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; - u16 zone; unsigned int sequence; ct = nf_ct_get(skb, &ctinfo); @@ -595,7 +614,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = hash_bucket(hash, net); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -627,12 +646,14 @@ __nf_conntrack_confirm(struct sk_buff *skb) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 
&h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; /* Timer relative to confirmation time, not original @@ -685,11 +706,14 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { struct net *net = nf_ct_net(ignored_conntrack); + const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nf_conn *ct; - u16 zone = nf_ct_zone(ignored_conntrack); - unsigned int hash = hash_conntrack(net, zone, tuple); + unsigned int hash; + + zone = nf_ct_zone(ignored_conntrack); + hash = hash_conntrack(net, tuple); /* Disable BHs the entire time since we need to disable them at * least once for the stats anyway. 
@@ -699,7 +723,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(ct) == zone) { + nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { NF_CT_STAT_INC(net, found); rcu_read_unlock_bh(); return 1; @@ -788,7 +812,8 @@ void init_nf_conntrack_hash_rnd(void) } static struct nf_conn * -__nf_conntrack_alloc(struct net *net, u16 zone, +__nf_conntrack_alloc(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) @@ -798,7 +823,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone, if (unlikely(!nf_conntrack_hash_rnd)) { init_nf_conntrack_hash_rnd(); /* recompute the hash as nf_conntrack_hash_rnd is initialized */ - hash = hash_conntrack_raw(orig, zone); + hash = hash_conntrack_raw(orig); } /* We don't want any race condition at early drop stage */ @@ -818,10 +843,9 @@ __nf_conntrack_alloc(struct net *net, u16 zone, * SLAB_DESTROY_BY_RCU. */ ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); - if (ct == NULL) { - atomic_dec(&net->ct.count); - return ERR_PTR(-ENOMEM); - } + if (ct == NULL) + goto out; + spin_lock_init(&ct->lock); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; @@ -835,31 +859,24 @@ __nf_conntrack_alloc(struct net *net, u16 zone, memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - offsetof(struct nf_conn, __nfct_init_offset[0])); -#ifdef CONFIG_NF_CONNTRACK_ZONES - if (zone) { - struct nf_conntrack_zone *nf_ct_zone; - nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC); - if (!nf_ct_zone) - goto out_free; - nf_ct_zone->id = zone; - } -#endif + if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0) + goto out_free; + /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. 
*/ atomic_set(&ct->ct_general.use, 0); return ct; - -#ifdef CONFIG_NF_CONNTRACK_ZONES out_free: - atomic_dec(&net->ct.count); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); +out: + atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); -#endif } -struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, +struct nf_conn *nf_conntrack_alloc(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp) @@ -901,8 +918,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_tuple repl_tuple; struct nf_conntrack_ecache *ecache; struct nf_conntrack_expect *exp = NULL; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; + struct nf_conntrack_zone tmp; unsigned int *timeouts; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { @@ -910,6 +928,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, return NULL; } + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) @@ -921,10 +940,13 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } timeout_ext = tmpl ? 
nf_ct_timeout_find(tmpl) : NULL; - if (timeout_ext) - timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); - else + if (timeout_ext) { + timeouts = nf_ct_timeout_data(timeout_ext); + if (unlikely(!timeouts)) + timeouts = l4proto->get_timeouts(net); + } else { timeouts = l4proto->get_timeouts(net); + } if (!l4proto->new(ct, skb, dataoff, timeouts)) { nf_conntrack_free(ct); @@ -933,7 +955,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } if (timeout_ext) - nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC); + nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), + GFP_ATOMIC); nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); @@ -1004,21 +1027,23 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, int *set_reply, enum ip_conntrack_info *ctinfo) { + const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_zone tmp; struct nf_conn *ct; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), - dataoff, l3num, protonum, &tuple, l3proto, + dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { pr_debug("resolve_normal_ct: Can't get tuple\n"); return NULL; } /* look for tuple match */ - hash = hash_conntrack_raw(&tuple, zone); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); + hash = hash_conntrack_raw(&tuple); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, @@ -1522,10 +1547,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) sz = nr_slots * sizeof(struct hlist_nulls_head); hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); - if (!hash) { - printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); + if (!hash) hash = vzalloc(sz); - } if (hash && nulls) for (i = 0; i < nr_slots; i++) @@ -1576,8 +1599,7 @@ int nf_conntrack_set_hashsize(const char *val, struct 
kernel_param *kp) struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct), - hashsize); + bucket = __hash_conntrack(&h->tuple, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } @@ -1751,7 +1773,6 @@ int nf_conntrack_init_net(struct net *net) spin_lock_init(&pcpu->lock); INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL); } net->ct.stat = alloc_percpu(struct ip_conntrack_stat); diff --git a/kernel/net/netfilter/nf_conntrack_expect.c b/kernel/net/netfilter/nf_conntrack_expect.c index 7a17070c5..acf5c7b3f 100644 --- a/kernel/net/netfilter/nf_conntrack_expect.c +++ b/kernel/net/netfilter/nf_conntrack_expect.c @@ -88,7 +88,8 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple } struct nf_conntrack_expect * -__nf_ct_expect_find(struct net *net, u16 zone, +__nf_ct_expect_find(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; @@ -100,7 +101,7 @@ __nf_ct_expect_find(struct net *net, u16 zone, h = nf_ct_expect_dst_hash(tuple); hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone(i->master) == zone) + nf_ct_zone_equal_any(i->master, zone)) return i; } return NULL; @@ -109,7 +110,8 @@ EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. 
*/ struct nf_conntrack_expect * -nf_ct_expect_find_get(struct net *net, u16 zone, +nf_ct_expect_find_get(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; @@ -127,7 +129,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * -nf_ct_find_expectation(struct net *net, u16 zone, +nf_ct_find_expectation(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i, *exp = NULL; @@ -140,7 +143,7 @@ nf_ct_find_expectation(struct net *net, u16 zone, hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone(i->master) == zone) { + nf_ct_zone_equal_any(i->master, zone)) { exp = i; break; } @@ -219,16 +222,17 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } - return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); + return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && + nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { return a->master == b->master && a->class == b->class && - nf_ct_tuple_equal(&a->tuple, &b->tuple) && - nf_ct_tuple_mask_equal(&a->mask, &b->mask) && - nf_ct_zone(a->master) == nf_ct_zone(b->master); + nf_ct_tuple_equal(&a->tuple, &b->tuple) && + nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } /* Generally a bad idea to call this: could have matched already. 
*/ diff --git a/kernel/net/netfilter/nf_conntrack_h323_main.c b/kernel/net/netfilter/nf_conntrack_h323_main.c index 1d69f5b97..9511af04d 100644 --- a/kernel/net/netfilter/nf_conntrack_h323_main.c +++ b/kernel/net/netfilter/nf_conntrack_h323_main.c @@ -779,8 +779,8 @@ static int callforward_do_filter(struct net *net, flowi6_to_flowi(&fl1), false)) { if (!afinfo->route(net, (struct dst_entry **)&rt2, flowi6_to_flowi(&fl2), false)) { - if (ipv6_addr_equal(rt6_nexthop(rt1), - rt6_nexthop(rt2)) && + if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr), + rt6_nexthop(rt2, &fl2.daddr)) && rt1->dst.dev == rt2->dst.dev) ret = 1; dst_release(&rt2->dst); diff --git a/kernel/net/netfilter/nf_conntrack_labels.c b/kernel/net/netfilter/nf_conntrack_labels.c index bb53f120e..3ce5c314e 100644 --- a/kernel/net/netfilter/nf_conntrack_labels.c +++ b/kernel/net/netfilter/nf_conntrack_labels.c @@ -14,6 +14,8 @@ #include #include +static spinlock_t nf_connlabels_lock; + static unsigned int label_bits(const struct nf_conn_labels *l) { unsigned int longs = l->words; @@ -48,7 +50,6 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit) } EXPORT_SYMBOL_GPL(nf_connlabel_set); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static void replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; @@ -89,7 +90,35 @@ int nf_connlabels_replace(struct nf_conn *ct, return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_replace); -#endif + +int nf_connlabels_get(struct net *net, unsigned int n_bits) +{ + size_t words; + + if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE)) + return -ERANGE; + + words = BITS_TO_LONGS(n_bits); + + spin_lock(&nf_connlabels_lock); + net->ct.labels_used++; + if (words > net->ct.label_words) + net->ct.label_words = words; + spin_unlock(&nf_connlabels_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabels_get); + +void nf_connlabels_put(struct net *net) +{ + spin_lock(&nf_connlabels_lock); + net->ct.labels_used--; + if (net->ct.labels_used == 0) + net->ct.label_words = 0; + 
spin_unlock(&nf_connlabels_lock); +} +EXPORT_SYMBOL_GPL(nf_connlabels_put); static struct nf_ct_ext_type labels_extend __read_mostly = { .len = sizeof(struct nf_conn_labels), @@ -99,6 +128,7 @@ static struct nf_ct_ext_type labels_extend __read_mostly = { int nf_conntrack_labels_init(void) { + spin_lock_init(&nf_connlabels_lock); return nf_ct_extend_register(&labels_extend); } diff --git a/kernel/net/netfilter/nf_conntrack_netlink.c b/kernel/net/netfilter/nf_conntrack_netlink.c index d1c23940a..9f5272968 100644 --- a/kernel/net/netfilter/nf_conntrack_netlink.c +++ b/kernel/net/netfilter/nf_conntrack_netlink.c @@ -127,6 +127,20 @@ ctnetlink_dump_tuples(struct sk_buff *skb, return ret; } +static inline int +ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, + const struct nf_conntrack_zone *zone, int dir) +{ + if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir) + return 0; + if (nla_put_be16(skb, attrtype, htons(zone->id))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + static inline int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { @@ -458,6 +472,7 @@ static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; @@ -473,11 +488,16 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -485,10 +505,13 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, 
u32 seq, u32 type, goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct) && - nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0 || @@ -598,7 +621,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES - + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) @@ -609,6 +632,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) static int ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) { + const struct nf_conntrack_zone *zone; struct net *net; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -655,11 +679,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) nfmsg->res_id = 0; rcu_read_lock(); + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -667,10 +696,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, 
nest_parms); - if (nf_ct_zone(ct) && - nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) @@ -920,15 +952,54 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr, return ret; } +static int +ctnetlink_parse_zone(const struct nlattr *attr, + struct nf_conntrack_zone *zone) +{ + nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (attr) + zone->id = ntohs(nla_get_be16(attr)); +#else + if (attr) + return -EOPNOTSUPP; +#endif + return 0; +} + +static int +ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type, + struct nf_conntrack_zone *zone) +{ + int ret; + + if (zone->id != NF_CT_DEFAULT_ZONE_ID) + return -EINVAL; + + ret = ctnetlink_parse_zone(attr, zone); + if (ret < 0) + return ret; + + if (type == CTA_TUPLE_REPLY) + zone->dir = NF_CT_ZONE_DIR_REPL; + else + zone->dir = NF_CT_ZONE_DIR_ORIG; + + return 0; +} + static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_IP] = { .type = NLA_NESTED }, [CTA_TUPLE_PROTO] = { .type = NLA_NESTED }, + [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; static int ctnetlink_parse_tuple(const struct nlattr * const cda[], struct nf_conntrack_tuple *tuple, - enum ctattr_type type, u_int8_t l3num) + enum ctattr_type type, u_int8_t l3num, + struct nf_conntrack_zone *zone) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -955,6 +1026,16 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (err < 0) return err; + if (tb[CTA_TUPLE_ZONE]) { + if (!zone) + return -EINVAL; + + err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE], + type, zone); + if (err < 0) + return err; + } + /* orig and expect tuples get DIR_ORIGINAL */ if (type == CTA_TUPLE_REPLY) tuple->dst.dir = IP_CT_DIR_REPLY; @@ -964,21 +1045,6 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], return 0; } -static int 
-ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone) -{ - if (attr) -#ifdef CONFIG_NF_CONNTRACK_ZONES - *zone = ntohs(nla_get_be16(attr)); -#else - return -EOPNOTSUPP; -#endif - else - *zone = 0; - - return 0; -} - static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, @@ -1058,7 +1124,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conn *ct; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1066,9 +1132,11 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, + u3, &zone); else if (cda[CTA_TUPLE_REPLY]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, + u3, &zone); else { return ctnetlink_flush_conntrack(net, cda, NETLINK_CB(skb).portid, @@ -1078,7 +1146,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - h = nf_conntrack_find_get(net, zone, &tuple); + h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; @@ -1112,7 +1180,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, struct sk_buff *skb2 = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { @@ -1138,16 +1206,18 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, + u3, &zone); else if (cda[CTA_TUPLE_REPLY]) - err = 
ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, + u3, &zone); else return -EINVAL; if (err < 0) return err; - h = nf_conntrack_find_get(net, zone, &tuple); + h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; @@ -1645,7 +1715,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct, } static struct nf_conn * -ctnetlink_create_conntrack(struct net *net, u16 zone, +ctnetlink_create_conntrack(struct net *net, + const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], struct nf_conntrack_tuple *otuple, struct nf_conntrack_tuple *rtuple, @@ -1761,7 +1832,8 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, struct nf_conntrack_tuple_hash *master_h; struct nf_conn *master_ct; - err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, + u3, NULL); if (err < 0) goto err2; @@ -1804,7 +1876,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1812,21 +1884,23 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) { - err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, + u3, &zone); if (err < 0) return err; } if (cda[CTA_TUPLE_REPLY]) { - err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, + u3, &zone); if (err < 0) return err; } if (cda[CTA_TUPLE_ORIG]) - h = nf_conntrack_find_get(net, zone, &otuple); + h = nf_conntrack_find_get(net, &zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = nf_conntrack_find_get(net, zone, &rtuple); + h = nf_conntrack_find_get(net, &zone, &rtuple); if (h == NULL) { err 
= -ENOENT; @@ -1836,7 +1910,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) return -EINVAL; - ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, + ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple, &rtuple, u3); if (IS_ERR(ct)) return PTR_ERR(ct); @@ -2059,9 +2133,9 @@ ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); -#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT +#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT static size_t -ctnetlink_nfqueue_build_size(const struct nf_conn *ct) +ctnetlink_glue_build_size(const struct nf_conn *ct) { return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ @@ -2082,23 +2156,40 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES - + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) ; } -static int -ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) +static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conn *ct; + + ct = nf_ct_get(skb, ctinfo); + if (ct && nf_ct_is_untracked(ct)) + ct = NULL; + + return ct; +} + +static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; rcu_read_lock(); + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = 
nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -2106,12 +2197,14 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct)) { - if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) - goto nla_put_failure; - } + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) + goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) goto nla_put_failure; @@ -2154,7 +2247,32 @@ nla_put_failure: } static int -ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) +ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + u_int16_t ct_attr, u_int16_t ct_info_attr) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, ct_attr | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (__ctnetlink_glue_build(skb, ct) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + if (nla_put_be32(skb, ct_info_attr, htonl(ctinfo))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static int +ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) { int err; @@ -2194,7 +2312,7 @@ ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) } static int -ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) +ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) { struct nlattr *cda[CTA_MAX+1]; int ret; @@ -2204,31 +2322,31 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) return ret; spin_lock_bh(&nf_conntrack_expect_lock); - ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); 
spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } -static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, - const struct nf_conn *ct, - struct nf_conntrack_tuple *tuple, - struct nf_conntrack_tuple *mask) +static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda, + const struct nf_conn *ct, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) { int err; err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, - nf_ct_l3num(ct)); + nf_ct_l3num(ct), NULL); if (err < 0) return err; return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, - nf_ct_l3num(ct)); + nf_ct_l3num(ct), NULL); } static int -ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, - u32 portid, u32 report) +ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report) { struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; @@ -2240,8 +2358,8 @@ ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; - err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda, - ct, &tuple, &mask); + err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda, + ct, &tuple, &mask); if (err < 0) return err; @@ -2268,14 +2386,24 @@ ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, return 0; } -static struct nfq_ct_hook ctnetlink_nfqueue_hook = { - .build_size = ctnetlink_nfqueue_build_size, - .build = ctnetlink_nfqueue_build, - .parse = ctnetlink_nfqueue_parse, - .attach_expect = ctnetlink_nfqueue_attach_expect, - .seq_adjust = nf_ct_tcp_seqadj_set, +static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int diff) +{ + if (!(ct->status & IPS_NAT_MASK)) + return; + + nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff); +} + +static struct nfnl_ct_hook ctnetlink_glue_hook = { + .get_ct = ctnetlink_glue_get_ct, + .build_size = 
ctnetlink_glue_build_size, + .build = ctnetlink_glue_build, + .parse = ctnetlink_glue_parse, + .attach_expect = ctnetlink_glue_attach_expect, + .seq_adjust = ctnetlink_glue_seqadj, }; -#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ +#endif /* CONFIG_NETFILTER_NETLINK_GLUE_CT */ /*********************************************************************** * EXPECT @@ -2612,23 +2740,22 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - u16 zone = 0; + struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, .done = ctnetlink_exp_done, }; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, + u3, NULL); if (err < 0) return err; - if (cda[CTA_EXPECT_ZONE]) { - err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); - if (err < 0) - return err; - } + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; - h = nf_conntrack_find_get(net, zone, &tuple); + h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; @@ -2652,7 +2779,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, struct sk_buff *skb2; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { @@ -2672,16 +2799,18 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_EXPECT_TUPLE]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); else if (cda[CTA_EXPECT_MASTER]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, + u3, NULL); else return -EINVAL; if (err < 0) return err; - exp = nf_ct_expect_find_get(net, zone, 
&tuple); + exp = nf_ct_expect_find_get(net, &zone, &tuple); if (!exp) return -ENOENT; @@ -2732,8 +2861,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct hlist_node *next; u_int8_t u3 = nfmsg->nfgen_family; + struct nf_conntrack_zone zone; unsigned int i; - u16 zone; int err; if (cda[CTA_EXPECT_TUPLE]) { @@ -2742,12 +2871,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; /* bump usage count to 2 */ - exp = nf_ct_expect_find_get(net, zone, &tuple); + exp = nf_ct_expect_find_get(net, &zone, &tuple); if (!exp) return -ENOENT; @@ -2849,7 +2979,8 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, return -EINVAL; err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, - &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3); + &nat_tuple, CTA_EXPECT_NAT_TUPLE, + u3, NULL); if (err < 0) return err; @@ -2937,7 +3068,8 @@ err_out: } static int -ctnetlink_create_expect(struct net *net, u16 zone, +ctnetlink_create_expect(struct net *net, + const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], u_int8_t u3, u32 portid, int report) { @@ -2949,13 +3081,16 @@ ctnetlink_create_expect(struct net *net, u16 zone, int err; /* caller guarantees that those three CTA_EXPECT_* exist */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, + u3, NULL); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, + u3, NULL); if (err < 0) return err; @@ -2995,11 +3130,6 @@ 
ctnetlink_create_expect(struct net *net, u16 zone, } err = nf_ct_expect_related_report(exp, portid, report); - if (err < 0) - goto err_exp; - - return 0; -err_exp: nf_ct_expect_put(exp); err_ct: nf_ct_put(ct); @@ -3016,7 +3146,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_expect *exp; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; if (!cda[CTA_EXPECT_TUPLE] @@ -3028,19 +3158,18 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; spin_lock_bh(&nf_conntrack_expect_lock); - exp = __nf_ct_expect_find(net, zone, &tuple); - + exp = __nf_ct_expect_find(net, &zone, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - err = ctnetlink_create_expect(net, zone, cda, - u3, + err = ctnetlink_create_expect(net, &zone, cda, u3, NETLINK_CB(skb).portid, nlmsg_report(nlh)); } @@ -3258,9 +3387,9 @@ static int __init ctnetlink_init(void) pr_err("ctnetlink_init: cannot register pernet operations\n"); goto err_unreg_exp_subsys; } -#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT +#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT /* setup interaction between nf_queue and nf_conntrack_netlink. 
*/ - RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook); + RCU_INIT_POINTER(nfnl_ct_hook, &ctnetlink_glue_hook); #endif return 0; @@ -3279,8 +3408,8 @@ static void __exit ctnetlink_exit(void) unregister_pernet_subsys(&ctnetlink_net_ops); nfnetlink_subsys_unregister(&ctnl_exp_subsys); nfnetlink_subsys_unregister(&ctnl_subsys); -#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT - RCU_INIT_POINTER(nfq_ct_hook, NULL); +#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT + RCU_INIT_POINTER(nfnl_ct_hook, NULL); #endif } diff --git a/kernel/net/netfilter/nf_conntrack_pptp.c b/kernel/net/netfilter/nf_conntrack_pptp.c index 825c3e3f8..5588c7ae1 100644 --- a/kernel/net/netfilter/nf_conntrack_pptp.c +++ b/kernel/net/netfilter/nf_conntrack_pptp.c @@ -143,13 +143,14 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, const struct nf_conntrack_tuple *t) { const struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_zone *zone; struct nf_conntrack_expect *exp; struct nf_conn *sibling; - u16 zone = nf_ct_zone(ct); pr_debug("trying to timeout ct or exp for tuple "); nf_ct_dump_tuple(t); + zone = nf_ct_zone(ct); h = nf_conntrack_find_get(net, zone, t); if (h) { sibling = nf_ct_tuplehash_to_ctrack(h); diff --git a/kernel/net/netfilter/nf_conntrack_proto_dccp.c b/kernel/net/netfilter/nf_conntrack_proto_dccp.c index 6dd995c7c..fce1b1cca 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_dccp.c +++ b/kernel/net/netfilter/nf_conntrack_proto_dccp.c @@ -398,7 +398,7 @@ static inline struct dccp_net *dccp_pernet(struct net *net) } static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { struct dccp_hdr _hdr, *dh; diff --git a/kernel/net/netfilter/nf_conntrack_proto_generic.c b/kernel/net/netfilter/nf_conntrack_proto_generic.c index 60865f110..86dc752e5 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_generic.c +++ 
b/kernel/net/netfilter/nf_conntrack_proto_generic.c @@ -45,7 +45,7 @@ static inline struct nf_generic_net *generic_pernet(struct net *net) static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { tuple->src.u.all = 0; tuple->dst.u.all = 0; @@ -90,7 +90,13 @@ static int generic_packet(struct nf_conn *ct, static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { - return nf_generic_should_process(nf_ct_protonum(ct)); + bool ret; + + ret = nf_generic_should_process(nf_ct_protonum(ct)); + if (!ret) + pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n", + nf_ct_protonum(ct)); + return ret; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) diff --git a/kernel/net/netfilter/nf_conntrack_proto_gre.c b/kernel/net/netfilter/nf_conntrack_proto_gre.c index 7648674f2..a96451a7a 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_gre.c +++ b/kernel/net/netfilter/nf_conntrack_proto_gre.c @@ -190,9 +190,8 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, /* gre hdr info to tuple */ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { - struct net *net = dev_net(skb->dev ? 
skb->dev : skb_dst(skb)->dev); const struct gre_hdr_pptp *pgrehdr; struct gre_hdr_pptp _pgrehdr; __be16 srckey; diff --git a/kernel/net/netfilter/nf_conntrack_proto_sctp.c b/kernel/net/netfilter/nf_conntrack_proto_sctp.c index b45da90fa..9578a7c37 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_sctp.c +++ b/kernel/net/netfilter/nf_conntrack_proto_sctp.c @@ -42,6 +42,8 @@ static const char *const sctp_conntrack_names[] = { "SHUTDOWN_SENT", "SHUTDOWN_RECD", "SHUTDOWN_ACK_SENT", + "HEARTBEAT_SENT", + "HEARTBEAT_ACKED", }; #define SECS * HZ @@ -57,6 +59,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, + [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, }; #define sNO SCTP_CONNTRACK_NONE @@ -67,6 +71,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT +#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED #define sIV SCTP_CONNTRACK_MAX /* @@ -88,6 +94,10 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. +HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. +HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to + that of the HEARTBEAT chunk. Secondary connection is + established. */ /* TODO @@ -97,36 +107,40 @@ CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - - Multi Homing support. + - Full Multi Homing support. 
*/ /* SCTP conntrack state transitions */ -static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ 
{sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} } }; @@ -142,7 +156,7 @@ static inline struct sctp_net *sctp_pernet(struct net *net) } static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct sctphdr *hp; struct sctphdr _hdr; @@ -278,9 +292,16 @@ static int sctp_new_state(enum ip_conntrack_dir dir, pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; + case SCTP_CID_HEARTBEAT: + pr_debug("SCTP_CID_HEARTBEAT"); + i = 9; + break; + case SCTP_CID_HEARTBEAT_ACK: + pr_debug("SCTP_CID_HEARTBEAT_ACK"); + i = 10; + break; default: - /* Other chunks like DATA, SACK, HEARTBEAT and - its ACK do not cause a change in state */ + /* Other chunks like DATA or SACK do not change the state 
*/ pr_debug("Unknown chunk type, Will stay in %s\n", sctp_conntrack_names[cur_state]); return cur_state; @@ -329,6 +350,8 @@ static int sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + !test_bit(SCTP_CID_HEARTBEAT, map) && + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { pr_debug("Verification tag check failed\n"); goto out; @@ -357,6 +380,16 @@ static int sctp_packet(struct nf_conn *ct, /* Sec 8.5.1 (D) */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_HEARTBEAT || + sch->type == SCTP_CID_HEARTBEAT_ACK) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting vtag %x for dir %d\n", + sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out_unlock; + } } old_state = ct->proto.sctp.state; @@ -466,6 +499,10 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Sec 8.5.1 (A) */ return false; } + } else if (sch->type == SCTP_CID_HEARTBEAT) { + pr_debug("Setting vtag %x for secondary conntrack\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ @@ -610,6 +647,8 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ @@ -658,6 +697,18 @@ static struct ctl_table sctp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = 
"nf_conntrack_sctp_timeout_heartbeat_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { } }; @@ -730,6 +781,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; + pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT]; + pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED]; #endif return 0; } diff --git a/kernel/net/netfilter/nf_conntrack_proto_tcp.c b/kernel/net/netfilter/nf_conntrack_proto_tcp.c index 70383de72..278f3b935 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_tcp.c +++ b/kernel/net/netfilter/nf_conntrack_proto_tcp.c @@ -277,7 +277,7 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net) } static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct tcphdr *hp; struct tcphdr _hdr; diff --git a/kernel/net/netfilter/nf_conntrack_proto_udp.c b/kernel/net/netfilter/nf_conntrack_proto_udp.c index 6957281ff..478f92f83 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_udp.c +++ b/kernel/net/netfilter/nf_conntrack_proto_udp.c @@ -38,6 +38,7 @@ static inline struct nf_udp_net *udp_pernet(struct net *net) static bool udp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct udphdr *hp; diff --git a/kernel/net/netfilter/nf_conntrack_proto_udplite.c b/kernel/net/netfilter/nf_conntrack_proto_udplite.c index c5903d164..1ac8ee13a 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_udplite.c +++ 
b/kernel/net/netfilter/nf_conntrack_proto_udplite.c @@ -48,6 +48,7 @@ static inline struct udplite_net *udplite_pernet(struct net *net) static bool udplite_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct udphdr *hp; diff --git a/kernel/net/netfilter/nf_conntrack_seqadj.c b/kernel/net/netfilter/nf_conntrack_seqadj.c index ce3e840c8..dff0f0cc5 100644 --- a/kernel/net/netfilter/nf_conntrack_seqadj.c +++ b/kernel/net/netfilter/nf_conntrack_seqadj.c @@ -103,9 +103,9 @@ static void nf_ct_sack_block_adjust(struct sk_buff *skb, ntohl(sack->end_seq), ntohl(new_end_seq)); inet_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); + sack->start_seq, new_start_seq, false); inet_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); + sack->end_seq, new_end_seq, false); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); @@ -193,8 +193,9 @@ int nf_ct_seq_adjust(struct sk_buff *skb, newseq = htonl(ntohl(tcph->seq) + seqoff); newack = htonl(ntohl(tcph->ack_seq) - ackoff); - inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); - inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, + false); pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), diff --git a/kernel/net/netfilter/nf_conntrack_standalone.c b/kernel/net/netfilter/nf_conntrack_standalone.c index fc823fa5d..1fb3cacc0 100644 --- a/kernel/net/netfilter/nf_conntrack_standalone.c +++ b/kernel/net/netfilter/nf_conntrack_standalone.c @@ -140,6 +140,35 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) } #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES +static void ct_show_zone(struct seq_file *s, const 
struct nf_conn *ct, + int dir) +{ + const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + + if (zone->dir != dir) + return; + switch (zone->dir) { + case NF_CT_DEFAULT_ZONE_DIR: + seq_printf(s, "zone=%u ", zone->id); + break; + case NF_CT_ZONE_DIR_ORIG: + seq_printf(s, "zone-orig=%u ", zone->id); + break; + case NF_CT_ZONE_DIR_REPL: + seq_printf(s, "zone-reply=%u ", zone->id); + break; + default: + break; + } +} +#else +static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, + int dir) +{ +} +#endif + #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { @@ -202,6 +231,8 @@ static int ct_seq_show(struct seq_file *s, void *v) print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l3proto, l4proto); + ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); + if (seq_has_overflowed(s)) goto release; @@ -214,6 +245,8 @@ static int ct_seq_show(struct seq_file *s, void *v) print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l3proto, l4proto); + ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; @@ -228,11 +261,7 @@ static int ct_seq_show(struct seq_file *s, void *v) #endif ct_show_secctx(s, ct); - -#ifdef CONFIG_NF_CONNTRACK_ZONES - seq_printf(s, "zone=%u ", nf_ct_zone(ct)); -#endif - + ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); diff --git a/kernel/net/netfilter/nf_internals.h b/kernel/net/netfilter/nf_internals.h index ea7f36784..065522564 100644 --- a/kernel/net/netfilter/nf_internals.h +++ b/kernel/net/netfilter/nf_internals.h @@ -19,6 +19,7 @@ unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, /* nf_queue.c */ int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum); +void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops); int __init netfilter_queue_init(void); /* 
nf_log.c */ diff --git a/kernel/net/netfilter/nf_log.c b/kernel/net/netfilter/nf_log.c index 675d12c69..a5d41dfa9 100644 --- a/kernel/net/netfilter/nf_log.c +++ b/kernel/net/netfilter/nf_log.c @@ -107,12 +107,17 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { + const struct nf_logger *log; int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < NFPROTO_NUMPROTO; i++) - RCU_INIT_POINTER(loggers[i][logger->type], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = nft_log_dereference(loggers[i][logger->type]); + if (log == logger) + RCU_INIT_POINTER(loggers[i][logger->type], NULL); + } mutex_unlock(&nf_log_mutex); + synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); diff --git a/kernel/net/netfilter/nf_nat_core.c b/kernel/net/netfilter/nf_nat_core.c index 4e0b47831..06a9f4577 100644 --- a/kernel/net/netfilter/nf_nat_core.c +++ b/kernel/net/netfilter/nf_nat_core.c @@ -83,7 +83,7 @@ out: rcu_read_unlock(); } -int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) +int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) { struct flowi fl; unsigned int hh_len; @@ -99,7 +99,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); + dst = xfrm_lookup(net, dst, &fl, skb->sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -118,14 +118,13 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, u16 zone, - const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) { unsigned int hash; /* Original src, to ensure we map it consistently if poss. 
*/ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ nf_conntrack_hash_rnd); return reciprocal_scale(hash, net->ct.nat_htable_size); } @@ -185,20 +184,22 @@ same_src(const struct nf_conn *ct, /* Only called for SRC manip */ static int -find_appropriate_src(struct net *net, u16 zone, +find_appropriate_src(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_nat_l3proto *l3proto, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(net, zone, tuple); + unsigned int h = hash_by_src(net, tuple); const struct nf_conn_nat *nat; const struct nf_conn *ct; hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { ct = nat->ct; - if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { + if (same_src(ct, tuple) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -218,7 +219,8 @@ find_appropriate_src(struct net *net, u16 zone, * the ip with the lowest src-ip/dst-ip/proto usage. */ static void -find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, +find_best_ips_proto(const struct nf_conntrack_zone *zone, + struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) @@ -258,7 +260,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? 
- 0 : (__force u32)tuple->dst.u3.all[max] ^ zone); + 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { @@ -297,10 +299,12 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { + const struct nf_conntrack_zone *zone; const struct nf_nat_l3proto *l3proto; const struct nf_nat_l4proto *l4proto; struct net *net = nf_ct_net(ct); - u16 zone = nf_ct_zone(ct); + + zone = nf_ct_zone(ct); rcu_read_lock(); l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); @@ -420,7 +424,7 @@ nf_nat_setup_info(struct nf_conn *ct, if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; - srchash = hash_by_src(net, nf_ct_zone(ct), + srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate extension aera */ diff --git a/kernel/net/netfilter/nf_nat_proto_dccp.c b/kernel/net/netfilter/nf_nat_proto_dccp.c index b8067b53f..15c47b246 100644 --- a/kernel/net/netfilter/nf_nat_proto_dccp.c +++ b/kernel/net/netfilter/nf_nat_proto_dccp.c @@ -69,7 +69,7 @@ dccp_manip_pkt(struct sk_buff *skb, l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, - 0); + false); return true; } diff --git a/kernel/net/netfilter/nf_nat_proto_tcp.c b/kernel/net/netfilter/nf_nat_proto_tcp.c index 37f5505f4..4f8820fc5 100644 --- a/kernel/net/netfilter/nf_nat_proto_tcp.c +++ b/kernel/net/netfilter/nf_nat_proto_tcp.c @@ -70,7 +70,7 @@ tcp_manip_pkt(struct sk_buff *skb, return true; l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); - inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); + inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); return true; } diff --git a/kernel/net/netfilter/nf_nat_proto_udp.c b/kernel/net/netfilter/nf_nat_proto_udp.c index b0ede2f0d..b1e627227 100644 --- 
a/kernel/net/netfilter/nf_nat_proto_udp.c +++ b/kernel/net/netfilter/nf_nat_proto_udp.c @@ -57,7 +57,7 @@ udp_manip_pkt(struct sk_buff *skb, l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, - 0); + false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } diff --git a/kernel/net/netfilter/nf_nat_proto_udplite.c b/kernel/net/netfilter/nf_nat_proto_udplite.c index 368f14e01..58340c97b 100644 --- a/kernel/net/netfilter/nf_nat_proto_udplite.c +++ b/kernel/net/netfilter/nf_nat_proto_udplite.c @@ -56,7 +56,7 @@ udplite_manip_pkt(struct sk_buff *skb, } l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); - inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; diff --git a/kernel/net/netfilter/nf_nat_redirect.c b/kernel/net/netfilter/nf_nat_redirect.c index 97b75f9bf..d43869879 100644 --- a/kernel/net/netfilter/nf_nat_redirect.c +++ b/kernel/net/netfilter/nf_nat_redirect.c @@ -55,7 +55,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, rcu_read_lock(); indev = __in_dev_get_rcu(skb->dev); - if (indev != NULL) { + if (indev && indev->ifa_list) { ifa = indev->ifa_list; newdst = ifa->ifa_local; } diff --git a/kernel/net/netfilter/nf_queue.c b/kernel/net/netfilter/nf_queue.c index 2e88032cd..5baa8e24e 100644 --- a/kernel/net/netfilter/nf_queue.c +++ b/kernel/net/netfilter/nf_queue.c @@ -69,19 +69,14 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(physdev); } #endif - /* Drop reference to owner of hook which queued us. 
*/ - module_put(entry->elem->owner); } EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); /* Bump dev refs so they don't vanish while packet is out */ -bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) +void nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; - if (!try_module_get(entry->elem->owner)) - return false; - if (state->in) dev_hold(state->in); if (state->out) @@ -100,11 +95,20 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) dev_hold(physdev); } #endif - - return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); +void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) +{ + const struct nf_queue_handler *qh; + + rcu_read_lock(); + qh = rcu_dereference(queue_handler); + if (qh) + qh->nf_hook_drop(net, ops); + rcu_read_unlock(); +} + /* * Any packet that leaves via this function must come back * through nf_reinject(). @@ -120,22 +124,20 @@ int nf_queue(struct sk_buff *skb, const struct nf_queue_handler *qh; /* QUEUE == DROP if no one is waiting, to be safe. 
*/ - rcu_read_lock(); - qh = rcu_dereference(queue_handler); if (!qh) { status = -ESRCH; - goto err_unlock; + goto err; } afinfo = nf_get_afinfo(state->pf); if (!afinfo) - goto err_unlock; + goto err; entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); if (!entry) { status = -ENOMEM; - goto err_unlock; + goto err; } *entry = (struct nf_queue_entry) { @@ -145,16 +147,11 @@ int nf_queue(struct sk_buff *skb, .size = sizeof(*entry) + afinfo->route_key_size, }; - if (!nf_queue_entry_get_refs(entry)) { - status = -ECANCELED; - goto err_unlock; - } + nf_queue_entry_get_refs(entry); skb_dst_force(skb); afinfo->saveroute(skb, entry); status = qh->outfn(entry, queuenum); - rcu_read_unlock(); - if (status < 0) { nf_queue_entry_release_refs(entry); goto err; @@ -162,8 +159,6 @@ int nf_queue(struct sk_buff *skb, return 0; -err_unlock: - rcu_read_unlock(); err: kfree(entry); return status; @@ -176,19 +171,15 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) const struct nf_afinfo *afinfo; int err; - rcu_read_lock(); - nf_queue_entry_release_refs(entry); /* Continue traversal iff userspace said ok... 
*/ - if (verdict == NF_REPEAT) { - elem = list_entry(elem->list.prev, struct nf_hook_ops, list); - verdict = NF_ACCEPT; - } + if (verdict == NF_REPEAT) + verdict = elem->hook(elem->priv, skb, &entry->state); if (verdict == NF_ACCEPT) { afinfo = nf_get_afinfo(entry->state.pf); - if (!afinfo || afinfo->reroute(skb, entry) < 0) + if (!afinfo || afinfo->reroute(entry->state.net, skb, entry) < 0) verdict = NF_DROP; } @@ -196,7 +187,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(&nf_hooks[entry->state.pf][entry->state.hook], + verdict = nf_iterate(entry->state.hook_list, skb, &entry->state, &elem); } @@ -204,15 +195,13 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) case NF_ACCEPT: case NF_STOP: local_bh_disable(); - entry->state.okfn(entry->state.sk, skb); + entry->state.okfn(entry->state.net, entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: err = nf_queue(skb, elem, &entry->state, verdict >> NF_VERDICT_QBITS); if (err < 0) { - if (err == -ECANCELED) - goto next_hook; if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) goto next_hook; @@ -224,7 +213,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) default: kfree_skb(skb); } - rcu_read_unlock(); + kfree(entry); } EXPORT_SYMBOL(nf_reinject); diff --git a/kernel/net/netfilter/nf_synproxy_core.c b/kernel/net/netfilter/nf_synproxy_core.c index 52e20c9a4..c8a4a48bc 100644 --- a/kernel/net/netfilter/nf_synproxy_core.c +++ b/kernel/net/netfilter/nf_synproxy_core.c @@ -11,15 +11,18 @@ #include #include #include +#include #include #include #include #include + #include #include #include #include +#include int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); @@ -185,7 +188,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; - u32 *ptr, old; + __be32 *ptr, old; if (synproxy->tsoff == 0) 
return 1; @@ -213,18 +216,18 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, if (op[0] == TCPOPT_TIMESTAMP && op[1] == TCPOLEN_TIMESTAMP) { if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - ptr = (u32 *)&op[2]; + ptr = (__be32 *)&op[2]; old = *ptr; *ptr = htonl(ntohl(*ptr) - synproxy->tsoff); } else { - ptr = (u32 *)&op[6]; + ptr = (__be32 *)&op[6]; old = *ptr; *ptr = htonl(ntohl(*ptr) + synproxy->tsoff); } inet_proto_csum_replace4(&th->check, skb, - old, *ptr, 0); + old, *ptr, false); return 1; } optoff += op[1]; @@ -348,23 +351,20 @@ static void __net_exit synproxy_proc_exit(struct net *net) static int __net_init synproxy_net_init(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); - struct nf_conntrack_tuple t; struct nf_conn *ct; int err = -ENOMEM; - memset(&t, 0, sizeof(t)); - ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); + ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL); + if (!ct) goto err1; - } if (!nfct_seqadj_ext_add(ct)) goto err2; if (!nfct_synproxy_ext_add(ct)) goto err2; - nf_conntrack_tmpl_insert(net, ct); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_get(&ct->ct_general); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); @@ -380,7 +380,7 @@ static int __net_init synproxy_net_init(struct net *net) err3: free_percpu(snet->stats); err2: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err1: return err; } diff --git a/kernel/net/netfilter/nf_tables_api.c b/kernel/net/netfilter/nf_tables_api.c index 34ded0931..2cb429d34 100644 --- a/kernel/net/netfilter/nf_tables_api.c +++ b/kernel/net/netfilter/nf_tables_api.c @@ -89,6 +89,7 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) } static void nft_ctx_init(struct nft_ctx *ctx, + struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, struct nft_af_info *afi, @@ -96,7 +97,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, struct nft_chain *chain, const struct 
nlattr * const *nla) { - ctx->net = sock_net(skb->sk); + ctx->net = net; ctx->afi = afi; ctx->table = table; ctx->chain = chain; @@ -127,13 +128,50 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } +int nft_register_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) +{ + struct net *net = read_pnet(&basechain->pnet); + + if (basechain->flags & NFT_BASECHAIN_DISABLED) + return 0; + + return nf_register_net_hooks(net, basechain->ops, hook_nops); +} +EXPORT_SYMBOL_GPL(nft_register_basechain); + +void nft_unregister_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) +{ + struct net *net = read_pnet(&basechain->pnet); + + if (basechain->flags & NFT_BASECHAIN_DISABLED) + return; + + nf_unregister_net_hooks(net, basechain->ops, hook_nops); +} +EXPORT_SYMBOL_GPL(nft_unregister_basechain); + +static int nf_tables_register_hooks(const struct nft_table *table, + struct nft_chain *chain, + unsigned int hook_nops) +{ + if (table->flags & NFT_TABLE_F_DORMANT || + !(chain->flags & NFT_BASE_CHAIN)) + return 0; + + return nft_register_basechain(nft_base_chain(chain), hook_nops); +} + static void nf_tables_unregister_hooks(const struct nft_table *table, - const struct nft_chain *chain, + struct nft_chain *chain, unsigned int hook_nops) { - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops); + if (table->flags & NFT_TABLE_F_DORMANT || + !(chain->flags & NFT_BASE_CHAIN)) + return; + + nft_unregister_basechain(nft_base_chain(chain), hook_nops); } /* Internal table flags */ @@ -560,7 +598,7 @@ static int nf_tables_table_enable(const struct nft_af_info *afi, if (!(chain->flags & NFT_BASE_CHAIN)) continue; - err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); + err = nft_register_basechain(nft_base_chain(chain), afi->nops); if (err < 0) goto err; @@ -575,20 +613,20 @@ err: if (i-- <= 0) break; - 
nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); + nft_unregister_basechain(nft_base_chain(chain), afi->nops); } return err; } static void nf_tables_table_disable(const struct nft_af_info *afi, - struct nft_table *table) + struct nft_table *table) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { if (chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, - afi->nops); + nft_unregister_basechain(nft_base_chain(chain), + afi->nops); } } @@ -635,15 +673,14 @@ err: return ret; } -static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nlattr *name; struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; u32 flags = 0; struct nft_ctx ctx; @@ -669,7 +706,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); return nf_tables_updtable(&ctx); } @@ -679,30 +716,32 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, return -EINVAL; } + err = -EAFNOSUPPORT; if (!try_module_get(afi->owner)) - return -EAFNOSUPPORT; + goto err1; err = -ENOMEM; table = kzalloc(sizeof(*table), GFP_KERNEL); if (table == NULL) - goto err1; + goto err2; nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); table->flags = flags; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) - goto err2; + goto err3; 
list_add_tail_rcu(&table->list, &afi->tables); return 0; -err2: +err3: kfree(table); -err1: +err2: module_put(afi->owner); +err1: return err; } @@ -771,18 +810,17 @@ out: return err; } -static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_deltable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; - nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla); if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) return nft_flush(&ctx, family); @@ -881,6 +919,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 }, [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFTA_HOOK_DEV] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, }; static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) @@ -954,6 +994,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) goto nla_put_failure; + if (basechain->dev_name[0] && + nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name)) + goto nla_put_failure; nla_nest_end(skb, nest); if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -1165,16 +1208,20 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) BUG_ON(chain->use > 0); if (chain->flags & NFT_BASE_CHAIN) { - module_put(nft_base_chain(chain)->type->owner); - free_percpu(nft_base_chain(chain)->stats); - kfree(nft_base_chain(chain)); + struct nft_base_chain *basechain = nft_base_chain(chain); + + module_put(basechain->type->owner); + 
free_percpu(basechain->stats); + if (basechain->ops[0].dev != NULL) + dev_put(basechain->ops[0].dev); + kfree(basechain); } else { kfree(chain); } } -static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -1184,8 +1231,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, struct nft_chain *chain; struct nft_base_chain *basechain = NULL; struct nlattr *ha[NFTA_HOOK_MAX + 1]; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct net_device *dev = NULL; u8 policy = NF_ACCEPT; u64 handle = 0; unsigned int i; @@ -1264,7 +1311,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(stats); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, sizeof(struct nft_trans_chain)); if (trans == NULL) { @@ -1325,17 +1372,43 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return -ENOENT; hookfn = type->hooks[hooknum]; + if (afi->flags & NFT_AF_NEEDS_DEV) { + char ifname[IFNAMSIZ]; + + if (!ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + + nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); + dev = dev_get_by_name(net, ifname); + if (!dev) { + module_put(type->owner); + return -ENOENT; + } + } else if (ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); if (basechain == NULL) { module_put(type->owner); + if (dev != NULL) + dev_put(dev); return -ENOMEM; } + if (dev != NULL) + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); if 
(IS_ERR(stats)) { module_put(type->owner); kfree(basechain); + if (dev != NULL) + dev_put(dev); return PTR_ERR(stats); } basechain->stats = stats; @@ -1344,6 +1417,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (stats == NULL) { module_put(type->owner); kfree(basechain); + if (dev != NULL) + dev_put(dev); return -ENOMEM; } rcu_assign_pointer(basechain->stats, stats); @@ -1356,11 +1431,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, for (i = 0; i < afi->nops; i++) { ops = &basechain->ops[i]; ops->pf = family; - ops->owner = afi->owner; ops->hooknum = hooknum; ops->priority = priority; ops->priv = chain; ops->hook = afi->hooks[ops->hooknum]; + ops->dev = dev; if (hookfn) ops->hook = hookfn; if (afi->hook_ops_init) @@ -1380,14 +1455,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, chain->table = table; nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) { - err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); - if (err < 0) - goto err1; - } + err = nf_tables_register_hooks(table, chain, afi->nops); + if (err < 0) + goto err1; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); if (err < 0) goto err2; @@ -1402,15 +1474,14 @@ err1: return err; } -static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; @@ -1432,7 +1503,7 @@ static int nf_tables_delchain(struct sock *nlsk, 
struct sk_buff *skb, if (chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); return nft_delchain(&ctx); } @@ -1936,13 +2007,12 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, static struct nft_expr_info *info; -static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; @@ -2001,7 +2071,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(old_rule); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); n = 0; size = 0; @@ -2102,13 +2172,12 @@ err1: return err; } -static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain = NULL; struct nft_rule *rule; @@ -2131,7 +2200,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(chain); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { @@ -2270,12 +2339,11 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_setattr(struct nft_ctx 
*ctx, +static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi = NULL; struct nft_table *table = NULL; @@ -2297,7 +2365,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, return -ENOENT; } - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -2549,6 +2617,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_ctx ctx; struct sk_buff *skb2; @@ -2556,7 +2625,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -2619,14 +2688,13 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, return 0; } -static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_set_ops *ops; struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; @@ -2724,7 +2792,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(table)) return PTR_ERR(table); - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) { @@ -2808,8 +2876,8 @@ 
static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set nft_set_destroy(set); } -static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -2822,7 +2890,7 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -2950,7 +3018,7 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, +static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2959,7 +3027,6 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); if (IS_ERR(afi)) @@ -2971,7 +3038,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, if (!trans && (table->flags & NFT_TABLE_INACTIVE)) return -ENOENT; - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -3061,6 +3128,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; @@ -3076,8 +3144,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, 
struct netlink_callback *cb) if (err < 0) return err; - err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, - false); + err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, + (void *)nla, false); if (err < 0) return err; @@ -3138,11 +3206,12 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false); if (err < 0) return err; @@ -3454,11 +3523,10 @@ err1: return err; } -static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3467,7 +3535,7 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, true); if (err < 0) return err; @@ -3549,8 +3617,8 @@ err1: return err; } -static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nlattr *attr; @@ -3561,7 +3629,7 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, 
false); if (err < 0) return err; @@ -3956,7 +4024,8 @@ static int nf_tables_abort(struct sk_buff *skb) struct nft_trans *trans, *next; struct nft_trans_elem *te; - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, + list) { switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { diff --git a/kernel/net/netfilter/nf_tables_core.c b/kernel/net/netfilter/nf_tables_core.c index f153b0707..f3695a497 100644 --- a/kernel/net/netfilter/nf_tables_core.c +++ b/kernel/net/netfilter/nf_tables_core.c @@ -48,9 +48,7 @@ static void __nft_trace_packet(const struct nft_pktinfo *pkt, const struct nft_chain *chain, int rulenum, enum nft_trace type) { - struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - - nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, + nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", chain->table->name, chain->name, comments[type], rulenum); @@ -111,10 +109,10 @@ struct nft_jumpstack { }; unsigned int -nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) +nft_do_chain(struct nft_pktinfo *pkt, void *priv) { - const struct nft_chain *chain = ops->priv, *basechain = chain; - const struct net *net = read_pnet(&nft_base_chain(basechain)->pnet); + const struct nft_chain *chain = priv, *basechain = chain; + const struct net *net = pkt->net; const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_regs regs; diff --git a/kernel/net/netfilter/nf_tables_netdev.c b/kernel/net/netfilter/nf_tables_netdev.c new file mode 100644 index 000000000..edb3502f2 --- /dev/null +++ b/kernel/net/netfilter/nf_tables_netdev.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published 
by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void +nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct iphdr *iph, _iph; + u32 len, thoff; + + nft_set_pktinfo(pkt, skb, state); + + iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), + &_iph); + if (!iph) + return; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return; + + len = ntohs(iph->tot_len); + thoff = iph->ihl * 4; + if (skb->len < len) + return; + else if (len < thoff) + return; + + pkt->tprot = iph->protocol; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; +} + +static inline void +__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *ip6h, _ip6h; + unsigned int thoff = 0; + unsigned short frag_off; + int protohdr; + u32 pkt_len; + + ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), + &_ip6h); + if (!ip6h) + return; + + if (ip6h->version != 6) + return; + + pkt_len = ntohs(ip6h->payload_len); + if (pkt_len + sizeof(*ip6h) > skb->len) + return; + + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + if (protohdr < 0) + return; + + pkt->tprot = protohdr; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = frag_off; +#endif +} + +static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + nft_set_pktinfo(pkt, skb, state); + __nft_netdev_set_pktinfo_ipv6(pkt, skb, state); +} + +static unsigned int +nft_do_chain_netdev(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + switch (skb->protocol) { + case htons(ETH_P_IP): + nft_netdev_set_pktinfo_ipv4(&pkt, skb, state); + break; + case htons(ETH_P_IPV6): + 
nft_netdev_set_pktinfo_ipv6(&pkt, skb, state); + break; + default: + nft_set_pktinfo(&pkt, skb, state); + break; + } + + return nft_do_chain(&pkt, priv); +} + +static struct nft_af_info nft_af_netdev __read_mostly = { + .family = NFPROTO_NETDEV, + .nhooks = NF_NETDEV_NUMHOOKS, + .owner = THIS_MODULE, + .flags = NFT_AF_NEEDS_DEV, + .nops = 1, + .hooks = { + [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + }, +}; + +static int nf_tables_netdev_init_net(struct net *net) +{ + net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.netdev == NULL) + return -ENOMEM; + + memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev)); + + if (nft_register_afinfo(net, net->nft.netdev) < 0) + goto err; + + return 0; +err: + kfree(net->nft.netdev); + return -ENOMEM; +} + +static void nf_tables_netdev_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.netdev); + kfree(net->nft.netdev); +} + +static struct pernet_operations nf_tables_netdev_net_ops = { + .init = nf_tables_netdev_init_net, + .exit = nf_tables_netdev_exit_net, +}; + +static const struct nf_chain_type nft_filter_chain_netdev = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_NETDEV, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_NETDEV_INGRESS), +}; + +static void nft_netdev_event(unsigned long event, struct nft_af_info *afi, + struct net_device *dev, struct nft_table *table, + struct nft_base_chain *basechain) +{ + switch (event) { + case NETDEV_REGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + BUG_ON(!(basechain->flags & NFT_BASECHAIN_DISABLED)); + + dev_hold(dev); + basechain->ops[0].dev = dev; + basechain->flags &= ~NFT_BASECHAIN_DISABLED; + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nft_register_basechain(basechain, afi->nops); + break; + case NETDEV_UNREGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + BUG_ON(basechain->flags & NFT_BASECHAIN_DISABLED); + + if (!(table->flags & NFT_TABLE_F_DORMANT)) + 
nft_unregister_basechain(basechain, afi->nops); + + dev_put(basechain->ops[0].dev); + basechain->ops[0].dev = NULL; + basechain->flags |= NFT_BASECHAIN_DISABLED; + break; + case NETDEV_CHANGENAME: + if (dev->ifindex != basechain->ops[0].dev->ifindex) + return; + + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + break; + } +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nft_af_info *afi; + struct nft_table *table; + struct nft_chain *chain; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) { + if (afi->family != NFPROTO_NETDEV) + continue; + + list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry(chain, &table->chains, list) { + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + nft_netdev_event(event, afi, dev, table, + nft_base_chain(chain)); + } + } + } + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + + return NOTIFY_DONE; +} + +static struct notifier_block nf_tables_netdev_notifier = { + .notifier_call = nf_tables_netdev_event, +}; + +static int __init nf_tables_netdev_init(void) +{ + int ret; + + nft_register_chain_type(&nft_filter_chain_netdev); + ret = register_pernet_subsys(&nf_tables_netdev_net_ops); + if (ret < 0) + nft_unregister_chain_type(&nft_filter_chain_netdev); + + register_netdevice_notifier(&nf_tables_netdev_notifier); + + return ret; +} + +static void __exit nf_tables_netdev_exit(void) +{ + unregister_netdevice_notifier(&nf_tables_netdev_notifier); + unregister_pernet_subsys(&nf_tables_netdev_net_ops); + nft_unregister_chain_type(&nft_filter_chain_netdev); +} + +module_init(nf_tables_netdev_init); +module_exit(nf_tables_netdev_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */ diff --git a/kernel/net/netfilter/nfnetlink.c b/kernel/net/netfilter/nfnetlink.c index 8b117c90e..77afe913d 100644 
--- a/kernel/net/netfilter/nfnetlink.c +++ b/kernel/net/netfilter/nfnetlink.c @@ -64,7 +64,7 @@ void nfnl_unlock(__u8 subsys_id) EXPORT_SYMBOL_GPL(nfnl_unlock); #ifdef CONFIG_PROVE_LOCKING -int lockdep_nfnl_is_held(u8 subsys_id) +bool lockdep_nfnl_is_held(u8 subsys_id) { return lockdep_is_held(&table[subsys_id].mutex); } @@ -269,6 +269,12 @@ static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) } } +enum { + NFNL_BATCH_FAILURE = (1 << 0), + NFNL_BATCH_DONE = (1 << 1), + NFNL_BATCH_REPLAY = (1 << 2), +}; + static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, u_int16_t subsys_id) { @@ -276,19 +282,19 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; - bool success = true, done = false; static LIST_HEAD(err_list); + u32 status; int err; if (subsys_id >= NFNL_SUBSYS_COUNT) return netlink_ack(skb, nlh, -EINVAL); replay: + status = 0; + skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) return netlink_ack(oskb, nlh, -ENOMEM); - skb->sk = oskb->sk; - nfnl_lock(subsys_id); ss = rcu_dereference_protected(table[subsys_id].subsys, lockdep_is_held(&table[subsys_id].mutex)); @@ -336,10 +342,10 @@ replay: if (type == NFNL_MSG_BATCH_BEGIN) { /* Malformed: Batch begin twice */ nfnl_err_reset(&err_list); - success = false; + status |= NFNL_BATCH_FAILURE; goto done; } else if (type == NFNL_MSG_BATCH_END) { - done = true; + status |= NFNL_BATCH_DONE; goto done; } else if (type < NLMSG_MIN_TYPE) { err = -EINVAL; @@ -373,7 +379,7 @@ replay: goto ack; if (nc->call_batch) { - err = nc->call_batch(net->nfnl, skb, nlh, + err = nc->call_batch(net, net->nfnl, skb, nlh, (const struct nlattr **)cda); } @@ -382,11 +388,8 @@ replay: * original skb. 
*/ if (err == -EAGAIN) { - nfnl_err_reset(&err_list); - ss->abort(oskb); - nfnl_unlock(subsys_id); - kfree_skb(skb); - goto replay; + status |= NFNL_BATCH_REPLAY; + goto next; } } ack: @@ -402,7 +405,7 @@ ack: */ nfnl_err_reset(&err_list); netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); - success = false; + status |= NFNL_BATCH_FAILURE; goto done; } /* We don't stop processing the batch on errors, thus, @@ -410,19 +413,26 @@ ack: * triggers. */ if (err) - success = false; + status |= NFNL_BATCH_FAILURE; } - +next: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } done: - if (success && done) + if (status & NFNL_BATCH_REPLAY) { + ss->abort(oskb); + nfnl_err_reset(&err_list); + nfnl_unlock(subsys_id); + kfree_skb(skb); + goto replay; + } else if (status == NFNL_BATCH_DONE) { ss->commit(oskb); - else + } else { ss->abort(oskb); + } nfnl_err_deliver(&err_list, oskb); nfnl_unlock(subsys_id); @@ -432,6 +442,7 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); + u_int16_t res_id; int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || @@ -456,7 +467,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); - nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); + /* Work around old nft using host byte order */ + if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) + res_id = NFNL_SUBSYS_NFTABLES; + else + res_id = ntohs(nfgenmsg->res_id); + nfnetlink_rcv_batch(skb, nlh, res_id); } else { netlink_rcv_skb(skb, &nfnetlink_rcv_msg); } @@ -474,7 +490,7 @@ static int nfnetlink_bind(struct net *net, int group) type = nfnl_group2type[group]; rcu_read_lock(); - ss = nfnetlink_get_subsys(type); + ss = nfnetlink_get_subsys(type << 8); rcu_read_unlock(); if (!ss) request_module("nfnetlink-subsys-%d", type); diff --git a/kernel/net/netfilter/nfnetlink_acct.c b/kernel/net/netfilter/nfnetlink_acct.c index c18af2f63..fefbf5f0b 100644 --- 
a/kernel/net/netfilter/nfnetlink_acct.c +++ b/kernel/net/netfilter/nfnetlink_acct.c @@ -27,8 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); -static LIST_HEAD(nfnl_acct_list); - struct nf_acct { atomic64_t pkts; atomic64_t bytes; @@ -53,6 +51,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { struct nf_acct *nfacct, *matching = NULL; + struct net *net = sock_net(nfnl); char *acct_name; unsigned int size = 0; u32 flags = 0; @@ -64,7 +63,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, if (strlen(acct_name) == 0) return -EINVAL; - list_for_each_entry(nfacct, &nfnl_acct_list, head) { + list_for_each_entry(nfacct, &net->nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -124,7 +123,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } atomic_set(&nfacct->refcnt, 1); - list_add_tail_rcu(&nfacct->head, &nfnl_acct_list); + list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list); return 0; } @@ -185,6 +184,7 @@ nla_put_failure: static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct nf_acct *cur, *last; const struct nfacct_filter *filter = cb->data; @@ -196,7 +196,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { if (last) { if (cur != last) continue; @@ -257,6 +257,7 @@ static int nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { + struct net *net = sock_net(nfnl); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; @@ -283,7 +284,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, return 
-EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &nfnl_acct_list, head) { + list_for_each_entry(cur, &net->nfnl_acct_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) @@ -336,19 +337,20 @@ static int nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { + struct net *net = sock_net(nfnl); char *acct_name; struct nf_acct *cur; int ret = -ENOENT; if (!tb[NFACCT_NAME]) { - list_for_each_entry(cur, &nfnl_acct_list, head) + list_for_each_entry(cur, &net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; } acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &nfnl_acct_list, head) { + list_for_each_entry(cur, &net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -394,12 +396,12 @@ static const struct nfnetlink_subsystem nfnl_acct_subsys = { MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); -struct nf_acct *nfnl_acct_find_get(const char *acct_name) +struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) { struct nf_acct *cur, *acct = NULL; rcu_read_lock(); - list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; @@ -422,7 +424,9 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get); void nfnl_acct_put(struct nf_acct *acct) { - atomic_dec(&acct->refcnt); + if (atomic_dec_and_test(&acct->refcnt)) + kfree_rcu(acct, rcu_head); + module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(nfnl_acct_put); @@ -478,34 +482,59 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) } EXPORT_SYMBOL_GPL(nfnl_acct_overquota); +static int __net_init nfnl_acct_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->nfnl_acct_list); + + return 0; +} + +static void __net_exit nfnl_acct_net_exit(struct net *net) +{ + struct nf_acct *cur, *tmp; + + 
list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) { + list_del_rcu(&cur->head); + + if (atomic_dec_and_test(&cur->refcnt)) + kfree_rcu(cur, rcu_head); + } +} + +static struct pernet_operations nfnl_acct_ops = { + .init = nfnl_acct_net_init, + .exit = nfnl_acct_net_exit, +}; + static int __init nfnl_acct_init(void) { int ret; + ret = register_pernet_subsys(&nfnl_acct_ops); + if (ret < 0) { + pr_err("nfnl_acct_init: failed to register pernet ops\n"); + goto err_out; + } + pr_info("nfnl_acct: registering with nfnetlink.\n"); ret = nfnetlink_subsys_register(&nfnl_acct_subsys); if (ret < 0) { pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); - goto err_out; + goto cleanup_pernet; } return 0; + +cleanup_pernet: + unregister_pernet_subsys(&nfnl_acct_ops); err_out: return ret; } static void __exit nfnl_acct_exit(void) { - struct nf_acct *cur, *tmp; - pr_info("nfnl_acct: unregistering from nfnetlink.\n"); nfnetlink_subsys_unregister(&nfnl_acct_subsys); - - list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) { - list_del_rcu(&cur->head); - /* We are sure that our objects have no clients at this point, - * it's safe to release them all without checking refcnt. 
*/ - kfree_rcu(cur, rcu_head); - } + unregister_pernet_subsys(&nfnl_acct_ops); } module_init(nfnl_acct_init); diff --git a/kernel/net/netfilter/nfnetlink_cttimeout.c b/kernel/net/netfilter/nfnetlink_cttimeout.c index 476accd17..c7a2d0e1c 100644 --- a/kernel/net/netfilter/nfnetlink_cttimeout.c +++ b/kernel/net/netfilter/nfnetlink_cttimeout.c @@ -291,6 +291,34 @@ cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb, return ret; } +static void untimeout(struct nf_conntrack_tuple_hash *i, + struct ctnl_timeout *timeout) +{ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); + struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); + + if (timeout_ext && (!timeout || timeout_ext->timeout == timeout)) + RCU_INIT_POINTER(timeout_ext->timeout, NULL); +} + +static void ctnl_untimeout(struct ctnl_timeout *timeout) +{ + struct nf_conntrack_tuple_hash *h; + const struct hlist_nulls_node *nn; + int i; + + local_bh_disable(); + for (i = 0; i < init_net.ct.htable_size; i++) { + spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + if (i < init_net.ct.htable_size) { + hlist_nulls_for_each_entry(h, nn, &init_net.ct.hash[i], hnnode) + untimeout(h, timeout); + } + spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + } + local_bh_enable(); +} + /* try to delete object, fail if it is still in use. */ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) { @@ -301,6 +329,7 @@ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_l4proto_put(timeout->l4proto); + ctnl_untimeout(timeout); kfree_rcu(timeout, rcu_head); } else { /* still in use, restore reference counter. */ @@ -567,6 +596,10 @@ static void __exit cttimeout_exit(void) pr_info("cttimeout: unregistering from nfnetlink.\n"); nfnetlink_subsys_unregister(&cttimeout_subsys); + + /* Make sure no conntrack objects refer to custom timeouts anymore. 
*/ + ctnl_untimeout(NULL); + list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) { list_del_rcu(&cur->head); /* We are sure that our objects have no clients at this point, @@ -579,6 +612,7 @@ static void __exit cttimeout_exit(void) RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + rcu_barrier(); } module_init(cttimeout_init); diff --git a/kernel/net/netfilter/nfnetlink_log.c b/kernel/net/netfilter/nfnetlink_log.c index 4ef1fae84..740cce468 100644 --- a/kernel/net/netfilter/nfnetlink_log.c +++ b/kernel/net/netfilter/nfnetlink_log.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -401,7 +402,9 @@ __build_packet_message(struct nfnl_log_net *log, unsigned int hooknum, const struct net_device *indev, const struct net_device *outdev, - const char *prefix, unsigned int plen) + const char *prefix, unsigned int plen, + const struct nfnl_ct_hook *nfnl_ct, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; @@ -538,9 +541,9 @@ __build_packet_message(struct nfnl_log_net *log, if (skb->tstamp.tv64) { struct nfulnl_msg_packet_timestamp ts; - struct timeval tv = ktime_to_timeval(skb->tstamp); - ts.sec = cpu_to_be64(tv.tv_sec); - ts.usec = cpu_to_be64(tv.tv_usec); + struct timespec64 kts = ktime_to_timespec64(skb->tstamp); + ts.sec = cpu_to_be64(kts.tv_sec); + ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; @@ -575,6 +578,10 @@ __build_packet_message(struct nfnl_log_net *log, htonl(atomic_inc_return(&log->global_seq)))) goto nla_put_failure; + if (ct && nfnl_ct->build(inst->skb, ct, ctinfo, + NFULA_CT, NFULA_CT_INFO) < 0) + goto nla_put_failure; + if (data_len) { struct nlattr *nla; int size = nla_attr_size(data_len); @@ -598,8 +605,6 @@ nla_put_failure: return -1; } -#define RCV_SKB_FAIL(err) do { 
netlink_ack(skb, nlh, (err)); return; } while (0) - static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_ULOG, .u = { @@ -622,12 +627,16 @@ nfulnl_log_packet(struct net *net, const struct nf_loginfo *li_user, const char *prefix) { - unsigned int size, data_len; + size_t size; + unsigned int data_len; struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen; struct nfnl_log_net *log = nfnl_log_pernet(net); + const struct nfnl_ct_hook *nfnl_ct = NULL; + struct nf_conn *ct = NULL; + enum ip_conntrack_info uninitialized_var(ctinfo); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; @@ -673,6 +682,14 @@ nfulnl_log_packet(struct net *net, size += nla_total_size(sizeof(u_int32_t)); if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) size += nla_total_size(sizeof(u_int32_t)); + if (inst->flags & NFULNL_CFG_F_CONNTRACK) { + nfnl_ct = rcu_dereference(nfnl_ct_hook); + if (nfnl_ct != NULL) { + ct = nfnl_ct->get_ct(skb, &ctinfo); + if (ct != NULL) + size += nfnl_ct->build_size(ct); + } + } qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ @@ -717,7 +734,8 @@ nfulnl_log_packet(struct net *net, inst->qlen++; __build_packet_message(log, inst, skb, data_len, pf, - hooknum, in, out, prefix, plen); + hooknum, in, out, prefix, plen, + nfnl_ct, ct, ctinfo); if (inst->qlen >= qthreshold) __nfulnl_flush(inst); @@ -807,6 +825,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, struct net *net = sock_net(ctnl); struct nfnl_log_net *log = nfnl_log_pernet(net); int ret = 0; + u16 flags = 0; if (nfula[NFULA_CFG_CMD]) { u_int8_t pf = nfmsg->nfgen_family; @@ -828,6 +847,28 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out_put; } + /* Check if we support these flags in first place, dependencies should + * be there too not to break atomicity. 
+ */ + if (nfula[NFULA_CFG_FLAGS]) { + flags = ntohs(nla_get_be16(nfula[NFULA_CFG_FLAGS])); + + if ((flags & NFULNL_CFG_F_CONNTRACK) && + !rcu_access_pointer(nfnl_ct_hook)) { +#ifdef CONFIG_MODULES + nfnl_unlock(NFNL_SUBSYS_ULOG); + request_module("ip_conntrack_netlink"); + nfnl_lock(NFNL_SUBSYS_ULOG); + if (rcu_access_pointer(nfnl_ct_hook)) { + ret = -EAGAIN; + goto out_put; + } +#endif + ret = -EOPNOTSUPP; + goto out_put; + } + } + if (cmd != NULL) { switch (cmd->command) { case NFULNL_CFG_CMD_BIND: @@ -856,16 +897,15 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -ENOTSUPP; break; } + } else if (!inst) { + ret = -ENODEV; + goto out; } if (nfula[NFULA_CFG_MODE]) { - struct nfulnl_msg_config_mode *params; - params = nla_data(nfula[NFULA_CFG_MODE]); + struct nfulnl_msg_config_mode *params = + nla_data(nfula[NFULA_CFG_MODE]); - if (!inst) { - ret = -ENODEV; - goto out; - } nfulnl_set_mode(inst, params->copy_mode, ntohl(params->copy_range)); } @@ -873,42 +913,23 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, if (nfula[NFULA_CFG_TIMEOUT]) { __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]); - if (!inst) { - ret = -ENODEV; - goto out; - } nfulnl_set_timeout(inst, ntohl(timeout)); } if (nfula[NFULA_CFG_NLBUFSIZ]) { __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]); - if (!inst) { - ret = -ENODEV; - goto out; - } nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); } if (nfula[NFULA_CFG_QTHRESH]) { __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]); - if (!inst) { - ret = -ENODEV; - goto out; - } nfulnl_set_qthresh(inst, ntohl(qthresh)); } - if (nfula[NFULA_CFG_FLAGS]) { - __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]); - - if (!inst) { - ret = -ENODEV; - goto out; - } - nfulnl_set_flags(inst, ntohs(flags)); - } + if (nfula[NFULA_CFG_FLAGS]) + nfulnl_set_flags(inst, flags); out_put: instance_put(inst); diff --git a/kernel/net/netfilter/nfnetlink_queue.c b/kernel/net/netfilter/nfnetlink_queue.c new file mode 100644 index 
000000000..861c66152 --- /dev/null +++ b/kernel/net/netfilter/nfnetlink_queue.c @@ -0,0 +1,1444 @@ +/* + * This is a module which is used for queueing packets and communicating with + * userspace via nfnetlink. + * + * (C) 2005 by Harald Welte + * (C) 2007 by Patrick McHardy + * + * Based on the old ipv4-only ip_queue.c: + * (C) 2000-2002 James Morris + * (C) 2003-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +#include "../bridge/br_private.h" +#endif + +#define NFQNL_QMAX_DEFAULT 1024 + +/* We're using struct nlattr which has 16bit nla_len. Note that nla_len + * includes the header length. Thus, the maximum packet length that we + * support is 65531 bytes. We send truncated packets if the specified length + * is larger than that. Userspace can check for presence of NFQA_CAP_LEN + * attribute to detect truncation. + */ +#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) + +struct nfqnl_instance { + struct hlist_node hlist; /* global list of queues */ + struct rcu_head rcu; + + u32 peer_portid; + unsigned int queue_maxlen; + unsigned int copy_range; + unsigned int queue_dropped; + unsigned int queue_user_dropped; + + + u_int16_t queue_num; /* number of this queue */ + u_int8_t copy_mode; + u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ +/* + * Following fields are dirtied for each queued packet, + * keep them in same cache line if possible. 
+ */ + spinlock_t lock; + unsigned int queue_total; + unsigned int id_sequence; /* 'sequence' of pkt ids */ + struct list_head queue_list; /* packets in queue */ +}; + +typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); + +static int nfnl_queue_net_id __read_mostly; + +#define INSTANCE_BUCKETS 16 +struct nfnl_queue_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; +}; + +static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) +{ + return net_generic(net, nfnl_queue_net_id); +} + +static inline u_int8_t instance_hashfn(u_int16_t queue_num) +{ + return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; +} + +static struct nfqnl_instance * +instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) +{ + struct hlist_head *head; + struct nfqnl_instance *inst; + + head = &q->instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry_rcu(inst, head, hlist) { + if (inst->queue_num == queue_num) + return inst; + } + return NULL; +} + +static struct nfqnl_instance * +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) +{ + struct nfqnl_instance *inst; + unsigned int h; + int err; + + spin_lock(&q->instances_lock); + if (instance_lookup(q, queue_num)) { + err = -EEXIST; + goto out_unlock; + } + + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) { + err = -ENOMEM; + goto out_unlock; + } + + inst->queue_num = queue_num; + inst->peer_portid = portid; + inst->queue_maxlen = NFQNL_QMAX_DEFAULT; + inst->copy_range = NFQNL_MAX_COPY_RANGE; + inst->copy_mode = NFQNL_COPY_NONE; + spin_lock_init(&inst->lock); + INIT_LIST_HEAD(&inst->queue_list); + + if (!try_module_get(THIS_MODULE)) { + err = -EAGAIN; + goto out_free; + } + + h = instance_hashfn(queue_num); + hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); + + spin_unlock(&q->instances_lock); + + return inst; + +out_free: + kfree(inst); +out_unlock: + spin_unlock(&q->instances_lock); + return ERR_PTR(err); +} + +static 
void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, + unsigned long data); + +static void +instance_destroy_rcu(struct rcu_head *head) +{ + struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, + rcu); + + nfqnl_flush(inst, NULL, 0); + kfree(inst); + module_put(THIS_MODULE); +} + +static void +__instance_destroy(struct nfqnl_instance *inst) +{ + hlist_del_rcu(&inst->hlist); + call_rcu(&inst->rcu, instance_destroy_rcu); +} + +static void +instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) +{ + spin_lock(&q->instances_lock); + __instance_destroy(inst); + spin_unlock(&q->instances_lock); +} + +static inline void +__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ + list_add_tail(&entry->list, &queue->queue_list); + queue->queue_total++; +} + +static void +__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ + list_del(&entry->list); + queue->queue_total--; +} + +static struct nf_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) +{ + struct nf_queue_entry *entry = NULL, *i; + + spin_lock_bh(&queue->lock); + + list_for_each_entry(i, &queue->queue_list, list) { + if (i->id == id) { + entry = i; + break; + } + } + + if (entry) + __dequeue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + + return entry; +} + +static void +nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + spin_lock_bh(&queue->lock); + list_for_each_entry_safe(entry, next, &queue->queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue->queue_total--; + nf_reinject(entry, NF_DROP); + } + } + spin_unlock_bh(&queue->lock); +} + +static int +nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, + bool csum_verify) +{ + __u32 flags = 0; + + if (packet->ip_summed == CHECKSUM_PARTIAL) + flags = NFQA_SKB_CSUMNOTREADY; + else if (csum_verify) + 
flags = NFQA_SKB_CSUM_NOTVERIFIED; + + if (skb_is_gso(packet)) + flags |= NFQA_SKB_GSO; + + return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; +} + +static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) +{ + const struct cred *cred; + + if (!sk_fullsock(sk)) + return 0; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + cred = sk->sk_socket->file->f_cred; + if (nla_put_be32(skb, NFQA_UID, + htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) + goto nla_put_failure; + if (nla_put_be32(skb, NFQA_GID, + htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) + goto nla_put_failure; + } + read_unlock_bh(&sk->sk_callback_lock); + return 0; + +nla_put_failure: + read_unlock_bh(&sk->sk_callback_lock); + return -1; +} + +static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) +{ + u32 seclen = 0; +#if IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (!skb || !sk_fullsock(skb->sk)) + return 0; + + read_lock_bh(&skb->sk->sk_callback_lock); + + if (skb->secmark) + security_secid_to_secctx(skb->secmark, secdata, &seclen); + + read_unlock_bh(&skb->sk->sk_callback_lock); +#endif + return seclen; +} + +static struct sk_buff * +nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + struct nf_queue_entry *entry, + __be32 **packet_id_ptr) +{ + size_t size; + size_t data_len = 0, cap_len = 0, rem_len = 0; + unsigned int hlen = 0; + struct sk_buff *skb; + struct nlattr *nla; + struct nfqnl_msg_packet_hdr *pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct sk_buff *entskb = entry->skb; + struct net_device *indev; + struct net_device *outdev; + struct nf_conn *ct = NULL; + enum ip_conntrack_info uninitialized_var(ctinfo); + struct nfnl_ct_hook *nfnl_ct; + bool csum_verify; + char *secdata = NULL; + u32 seclen = 0; + + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + 
nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#endif + + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ + + nla_total_size(sizeof(u_int32_t)); /* cap_len */ + + if (entskb->tstamp.tv64) + size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + + if (entry->state.hook <= NF_INET_FORWARD || + (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) + csum_verify = !skb_csum_unnecessary(entskb); + else + csum_verify = false; + + outdev = entry->state.out; + + switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { + case NFQNL_COPY_META: + case NFQNL_COPY_NONE: + break; + + case NFQNL_COPY_PACKET: + if (!(queue->flags & NFQA_CFG_F_GSO) && + entskb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(entskb)) + return NULL; + + data_len = ACCESS_ONCE(queue->copy_range); + if (data_len > entskb->len) + data_len = entskb->len; + + hlen = skb_zerocopy_headlen(entskb); + hlen = min_t(unsigned int, hlen, data_len); + size += sizeof(struct nlattr) + hlen; + cap_len = entskb->len; + rem_len = data_len - hlen; + break; + } + + nfnl_ct = rcu_dereference(nfnl_ct_hook); + + if (queue->flags & NFQA_CFG_F_CONNTRACK) { + if (nfnl_ct != NULL) { + ct = nfnl_ct->get_ct(entskb, &ctinfo); + if (ct != NULL) + size += nfnl_ct->build_size(ct); + } + } + + if (queue->flags & NFQA_CFG_F_UID_GID) { + size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + + nla_total_size(sizeof(u_int32_t))); /* gid */ + } + + if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { + seclen = nfqnl_get_sk_secctx(entskb, &secdata); + if (seclen) + size += nla_total_size(seclen); + } + + skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, + GFP_ATOMIC); + if (!skb) { + skb_tx_error(entskb); + return NULL; + } + + 
nlh = nlmsg_put(skb, 0, 0, + NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + sizeof(struct nfgenmsg), 0); + if (!nlh) { + skb_tx_error(entskb); + kfree_skb(skb); + return NULL; + } + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = entry->state.pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(queue->queue_num); + + nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); + pmsg = nla_data(nla); + pmsg->hw_protocol = entskb->protocol; + pmsg->hook = entry->state.hook; + *packet_id_ptr = &pmsg->packet_id; + + indev = entry->state.in; + if (indev) { +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) + goto nla_put_failure; +#else + if (entry->state.pf == PF_BRIDGE) { + /* Case 1: indev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)) || + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ + nla_put_be32(skb, NFQA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex))) + goto nla_put_failure; + } else { + int physinif; + + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, + htonl(indev->ifindex))) + goto nla_put_failure; + + physinif = nf_bridge_get_physinif(entskb); + if (physinif && + nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(physinif))) + goto nla_put_failure; + } +#endif + } + + if (outdev) { +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) + goto nla_put_failure; +#else + if (entry->state.pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)) || + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed 
by __nf_queue */ + nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, + htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) + goto nla_put_failure; + } else { + int physoutif; + + /* Case 2: outdev is bridge group, we need to look for + * physical output device (when called from ipv4) */ + if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, + htonl(outdev->ifindex))) + goto nla_put_failure; + + physoutif = nf_bridge_get_physoutif(entskb); + if (physoutif && + nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(physoutif))) + goto nla_put_failure; + } +#endif + } + + if (entskb->mark && + nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) + goto nla_put_failure; + + if (indev && entskb->dev && + entskb->mac_header != entskb->network_header) { + struct nfqnl_msg_packet_hw phw; + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(entskb, phw.hw_addr); + if (len) { + phw.hw_addrlen = htons(len); + if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) + goto nla_put_failure; + } + } + + if (entskb->tstamp.tv64) { + struct nfqnl_msg_packet_timestamp ts; + struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); + + ts.sec = cpu_to_be64(kts.tv_sec); + ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); + + if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) + goto nla_put_failure; + } + + if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && + nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) + goto nla_put_failure; + + if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) + goto nla_put_failure; + + if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) + goto nla_put_failure; + + if (cap_len > data_len && + nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) + goto nla_put_failure; + + if (nfqnl_put_packet_info(skb, entskb, csum_verify)) + goto nla_put_failure; + + if (data_len) { + struct nlattr *nla; + + if (skb_tailroom(skb) < sizeof(*nla) + hlen) + goto nla_put_failure; + + nla = (struct nlattr *)skb_put(skb, sizeof(*nla)); + nla->nla_type = NFQA_PAYLOAD; +
nla->nla_len = nla_attr_size(data_len); + + if (skb_zerocopy(skb, entskb, data_len, hlen)) + goto nla_put_failure; + } + + nlh->nlmsg_len = skb->len; + return skb; + +nla_put_failure: + skb_tx_error(entskb); + kfree_skb(skb); + net_err_ratelimited("nf_queue: error creating packet message\n"); + return NULL; +} + +static int +__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, + struct nf_queue_entry *entry) +{ + struct sk_buff *nskb; + int err = -ENOBUFS; + __be32 *packet_id_ptr; + int failopen = 0; + + nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); + if (nskb == NULL) { + err = -ENOMEM; + goto err_out; + } + spin_lock_bh(&queue->lock); + + if (queue->queue_total >= queue->queue_maxlen) { + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { + failopen = 1; + err = 0; + } else { + queue->queue_dropped++; + net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", + queue->queue_total); + } + goto err_out_free_nskb; + } + entry->id = ++queue->id_sequence; + *packet_id_ptr = htonl(entry->id); + + /* nfnetlink_unicast will either free the nskb or add it to a socket */ + err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); + if (err < 0) { + queue->queue_user_dropped++; + goto err_out_unlock; + } + + __enqueue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + return 0; + +err_out_free_nskb: + kfree_skb(nskb); +err_out_unlock: + spin_unlock_bh(&queue->lock); + if (failopen) + nf_reinject(entry, NF_ACCEPT); +err_out: + return err; +} + +static struct nf_queue_entry * +nf_queue_entry_dup(struct nf_queue_entry *e) +{ + struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); + if (entry) + nf_queue_entry_get_refs(entry); + return entry; +} + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +/* When called from bridge netfilter, skb->data must point to MAC header + * before calling skb_gso_segment(). Else, original MAC header is lost + * and segmented skbs will be sent to wrong destination. 
+ */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + +static void free_entry(struct nf_queue_entry *entry) +{ + nf_queue_entry_release_refs(entry); + kfree(entry); +} + +static int +__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, + struct sk_buff *skb, struct nf_queue_entry *entry) +{ + int ret = -ENOMEM; + struct nf_queue_entry *entry_seg; + + nf_bridge_adjust_segmented_data(skb); + + if (skb->next == NULL) { /* last packet, no need to copy entry */ + struct sk_buff *gso_skb = entry->skb; + entry->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry); + if (ret) + entry->skb = gso_skb; + return ret; + } + + skb->next = NULL; + + entry_seg = nf_queue_entry_dup(entry); + if (entry_seg) { + entry_seg->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry_seg); + if (ret) + free_entry(entry_seg); + } + return ret; +} + +static int +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + unsigned int queued; + struct nfqnl_instance *queue; + struct sk_buff *skb, *segs; + int err = -ENOBUFS; + struct net *net = entry->state.net; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + /* rcu_read_lock()ed by nf_hook_slow() */ + queue = instance_lookup(q, queuenum); + if (!queue) + return -ESRCH; + + if (queue->copy_mode == NFQNL_COPY_NONE) + return -EINVAL; + + skb = entry->skb; + + switch (entry->state.pf) { + case NFPROTO_IPV4: + skb->protocol = htons(ETH_P_IP); + break; + case NFPROTO_IPV6: + skb->protocol = htons(ETH_P_IPV6); + break; + } + + if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) + return 
__nfqnl_enqueue_packet(net, queue, entry); + + nf_bridge_adjust_skb_data(skb); + segs = skb_gso_segment(skb, 0); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ESRCH to + * mean 'ignore this hook'. + */ + if (IS_ERR_OR_NULL(segs)) + goto out_err; + queued = 0; + err = 0; + do { + struct sk_buff *nskb = segs->next; + if (err == 0) + err = __nfqnl_enqueue_packet_gso(net, queue, + segs, entry); + if (err == 0) + queued++; + else + kfree_skb(segs); + segs = nskb; + } while (segs); + + if (queued) { + if (err) /* some segments are already queued */ + free_entry(entry); + kfree_skb(skb); + return 0; + } + out_err: + nf_bridge_adjust_segmented_data(skb); + return err; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) +{ + struct sk_buff *nskb; + + if (diff < 0) { + if (pskb_trim(e->skb, data_len)) + return -ENOMEM; + } else if (diff > 0) { + if (data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); + if (!nskb) { + printk(KERN_WARNING "nf_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + kfree_skb(e->skb); + e->skb = nskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(e->skb, data_len)) + return -ENOMEM; + skb_copy_to_linear_data(e->skb, data, data_len); + e->skb->ip_summed = CHECKSUM_NONE; + return 0; +} + +static int +nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status = 0; + + spin_lock_bh(&queue->lock); + switch (mode) { + case NFQNL_COPY_NONE: + case NFQNL_COPY_META: + queue->copy_mode = mode; + queue->copy_range = 0; + break; + + case NFQNL_COPY_PACKET: + queue->copy_mode = mode; + if (range == 0 || range > NFQNL_MAX_COPY_RANGE) + queue->copy_range = NFQNL_MAX_COPY_RANGE; + else + queue->copy_range = range; + break; + + default: + status = -EINVAL; + + } + 
spin_unlock_bh(&queue->lock); + + return status; +} + +static int +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) +{ + if (entry->state.in) + if (entry->state.in->ifindex == ifindex) + return 1; + if (entry->state.out) + if (entry->state.out->ifindex == ifindex) + return 1; +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (entry->skb->nf_bridge) { + int physinif, physoutif; + + physinif = nf_bridge_get_physinif(entry->skb); + physoutif = nf_bridge_get_physoutif(entry->skb); + + if (physinif == ifindex || physoutif == ifindex) + return 1; + } +#endif + return 0; +} + +/* drop all packets with either indev or outdev == ifindex from all queue + * instances */ +static void +nfqnl_dev_drop(struct net *net, int ifindex) +{ + int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + rcu_read_lock(); + + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_rcu(inst, head, hlist) + nfqnl_flush(inst, dev_cmp, ifindex); + } + + rcu_read_unlock(); +} + +static int +nfqnl_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + nfqnl_dev_drop(dev_net(dev), dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_dev_notifier = { + .notifier_call = nfqnl_rcv_dev_event, +}; + +static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr) +{ + return entry->elem == (struct nf_hook_ops *)ops_ptr; +} + +static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook) +{ + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + int i; + + rcu_read_lock(); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_rcu(inst, head, hlist) + nfqnl_flush(inst, nf_hook_cmp, 
(unsigned long)hook); + } + rcu_read_unlock(); +} + +static int +nfqnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { + int i; + + /* destroy all instances for this portid */ + spin_lock(&q->instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *t2; + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_safe(inst, t2, head, hlist) { + if (n->portid == inst->peer_portid) + __instance_destroy(inst); + } + } + spin_unlock(&q->instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_rtnl_notifier = { + .notifier_call = nfqnl_rcv_nl_event, +}; + +static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { + [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, + [NFQA_MARK] = { .type = NLA_U32 }, + [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, + [NFQA_CT] = { .type = NLA_UNSPEC }, + [NFQA_EXP] = { .type = NLA_UNSPEC }, +}; + +static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { + [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, + [NFQA_MARK] = { .type = NLA_U32 }, +}; + +static struct nfqnl_instance * +verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) +{ + struct nfqnl_instance *queue; + + queue = instance_lookup(q, queue_num); + if (!queue) + return ERR_PTR(-ENODEV); + + if (queue->peer_portid != nlportid) + return ERR_PTR(-EPERM); + + return queue; +} + +static struct nfqnl_msg_verdict_hdr* +verdicthdr_get(const struct nlattr * const nfqa[]) +{ + struct nfqnl_msg_verdict_hdr *vhdr; + unsigned int verdict; + + if (!nfqa[NFQA_VERDICT_HDR]) + return NULL; + + vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); + verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; + if (verdict > NF_MAX_VERDICT || 
verdict == NF_STOLEN) + return NULL; + return vhdr; +} + +static int nfq_id_after(unsigned int id, unsigned int max) +{ + return (int)(id - max) > 0; +} + +static int +nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_queue_entry *entry, *tmp; + unsigned int verdict, maxid; + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + LIST_HEAD(batch_list); + u16 queue_num = ntohs(nfmsg->res_id); + + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); + if (IS_ERR(queue)) + return PTR_ERR(queue); + + vhdr = verdicthdr_get(nfqa); + if (!vhdr) + return -EINVAL; + + verdict = ntohl(vhdr->verdict); + maxid = ntohl(vhdr->id); + + spin_lock_bh(&queue->lock); + + list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { + if (nfq_id_after(entry->id, maxid)) + break; + __dequeue_entry(queue, entry); + list_add_tail(&entry->list, &batch_list); + } + + spin_unlock_bh(&queue->lock); + + if (list_empty(&batch_list)) + return -ENOENT; + + list_for_each_entry_safe(entry, tmp, &batch_list, list) { + if (nfqa[NFQA_MARK]) + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + nf_reinject(entry, verdict); + } + return 0; +} + +static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[], + struct nf_queue_entry *entry, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conn *ct; + + ct = nfnl_ct->get_ct(entry->skb, ctinfo); + if (ct == NULL) + return NULL; + + if (nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0) + return NULL; + + if (nfqa[NFQA_EXP]) + nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct, + NETLINK_CB(entry->skb).portid, + nlmsg_report(nlh)); + return ct; +} + +static int +nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, + const 
struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + unsigned int verdict; + struct nf_queue_entry *entry; + enum ip_conntrack_info uninitialized_var(ctinfo); + struct nfnl_ct_hook *nfnl_ct; + struct nf_conn *ct = NULL; + + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + /* Only the peer bound to this queue may issue verdicts: always run + * the portid check (-ENODEV if unbound, -EPERM on mismatch). */ + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); + if (IS_ERR(queue)) + return PTR_ERR(queue); + + vhdr = verdicthdr_get(nfqa); + if (!vhdr) + return -EINVAL; + + verdict = ntohl(vhdr->verdict); + + entry = find_dequeue_entry(queue, ntohl(vhdr->id)); + if (entry == NULL) + return -ENOENT; + + /* rcu lock already held from nfnl->call_rcu. */ + nfnl_ct = rcu_dereference(nfnl_ct_hook); + + if (nfqa[NFQA_CT]) { + if (nfnl_ct != NULL) + ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo); + } + + if (nfqa[NFQA_PAYLOAD]) { + u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); + int diff = payload_len - entry->skb->len; + + if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), + payload_len, entry, diff) < 0) + verdict = NF_DROP; + + if (ct && diff) + nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff); + } + + if (nfqa[NFQA_MARK]) + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + + nf_reinject(entry, verdict); + return 0; +} + +static int +nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + return -ENOTSUPP; +} + +static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { + [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, + [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, +}; + +static const struct nf_queue_handler nfqh = { + .outfn = &nfqnl_enqueue_packet, + .nf_hook_drop =
&nfqnl_nf_hook_drop, +}; + +static int +nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + struct nfqnl_instance *queue; + struct nfqnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + int ret = 0; + + if (nfqa[NFQA_CFG_CMD]) { + cmd = nla_data(nfqa[NFQA_CFG_CMD]); + + /* Obsolete commands without queue context */ + switch (cmd->command) { + case NFQNL_CFG_CMD_PF_BIND: return 0; + case NFQNL_CFG_CMD_PF_UNBIND: return 0; + } + } + + rcu_read_lock(); + queue = instance_lookup(q, queue_num); + if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { + ret = -EPERM; + goto err_out_unlock; + } + + if (cmd != NULL) { + switch (cmd->command) { + case NFQNL_CFG_CMD_BIND: + if (queue) { + ret = -EBUSY; + goto err_out_unlock; + } + queue = instance_create(q, queue_num, + NETLINK_CB(skb).portid); + if (IS_ERR(queue)) { + ret = PTR_ERR(queue); + goto err_out_unlock; + } + break; + case NFQNL_CFG_CMD_UNBIND: + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + instance_destroy(q, queue); + break; + case NFQNL_CFG_CMD_PF_BIND: + case NFQNL_CFG_CMD_PF_UNBIND: + break; + default: + ret = -ENOTSUPP; + break; + } + } + + if (nfqa[NFQA_CFG_PARAMS]) { + struct nfqnl_msg_config_params *params; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + params = nla_data(nfqa[NFQA_CFG_PARAMS]); + nfqnl_set_mode(queue, params->copy_mode, + ntohl(params->copy_range)); + } + + if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { + __be32 *queue_maxlen; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); + spin_lock_bh(&queue->lock); + queue->queue_maxlen = ntohl(*queue_maxlen); + spin_unlock_bh(&queue->lock); + } + + if (nfqa[NFQA_CFG_FLAGS]) { + __u32 flags, mask; + + if (!queue) { + ret = 
-ENODEV; + goto err_out_unlock; + } + + if (!nfqa[NFQA_CFG_MASK]) { + /* A mask is needed to specify which flags are being + * changed. + */ + ret = -EINVAL; + goto err_out_unlock; + } + + flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); + mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); + + if (flags >= NFQA_CFG_F_MAX) { + ret = -EOPNOTSUPP; + goto err_out_unlock; + } +#if !IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (flags & mask & NFQA_CFG_F_SECCTX) { + ret = -EOPNOTSUPP; + goto err_out_unlock; + } +#endif + spin_lock_bh(&queue->lock); + queue->flags &= ~mask; + queue->flags |= flags & mask; + spin_unlock_bh(&queue->lock); + } + +err_out_unlock: + rcu_read_unlock(); + return ret; +} + +static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { + [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp, + .attr_count = NFQA_MAX, }, + [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_policy }, + [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, + .attr_count = NFQA_CFG_MAX, + .policy = nfqa_cfg_policy }, + [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_batch_policy }, +}; + +static const struct nfnetlink_subsystem nfqnl_subsys = { + .name = "nf_queue", + .subsys_id = NFNL_SUBSYS_QUEUE, + .cb_count = NFQNL_MSG_MAX, + .cb = nfqnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + struct seq_net_private p; + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + struct net *net; + struct nfnl_queue_net *q; + + if (!st) + return NULL; + + net = seq_file_net(seq); + q = nfnl_queue_pernet(net); + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&q->instance_table[st->bucket])) + return q->instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + 
struct iter_state *st = seq->private; + struct net *net = seq_file_net(seq); + + h = h->next; + while (!h) { + struct nfnl_queue_net *q; + + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + q = nfnl_queue_pernet(net); + h = q->instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *s, loff_t *pos) + __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) +{ + spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); + return get_idx(s, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) + __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) +{ + spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfqnl_instance *inst = v; + + seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", + inst->queue_num, + inst->peer_portid, inst->queue_total, + inst->copy_mode, inst->copy_range, + inst->queue_dropped, inst->queue_user_dropped, + inst->id_sequence, 1); + return 0; +} + +static const struct seq_operations nfqnl_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqnl_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &nfqnl_seq_ops, + sizeof(struct iter_state)); +} + +static const struct file_operations nfqnl_file_ops = { + .owner = THIS_MODULE, + .open = nfqnl_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif /* PROC_FS */ + +static int __net_init nfnl_queue_net_init(struct net *net) +{ + unsigned int i; + struct nfnl_queue_net *q = 
nfnl_queue_pernet(net); + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&q->instance_table[i]); + + spin_lock_init(&q->instances_lock); + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_queue", 0440, + net->nf.proc_netfilter, &nfqnl_file_ops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit nfnl_queue_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); +#endif +} + +static struct pernet_operations nfnl_queue_net_ops = { + .init = nfnl_queue_net_init, + .exit = nfnl_queue_net_exit, + .id = &nfnl_queue_net_id, + .size = sizeof(struct nfnl_queue_net), +}; + +static int __init nfnetlink_queue_init(void) +{ + int status; + + status = register_pernet_subsys(&nfnl_queue_net_ops); + if (status < 0) { + pr_err("nf_queue: failed to register pernet ops\n"); + goto out; + } + + netlink_register_notifier(&nfqnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) { + pr_err("nf_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + register_netdevice_notifier(&nfqnl_dev_notifier); + nf_register_queue_handler(&nfqh); + return status; + +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_queue_net_ops); +out: + return status; +} + +static void __exit nfnetlink_queue_fini(void) +{ + nf_unregister_queue_handler(); + unregister_netdevice_notifier(&nfqnl_dev_notifier); + nfnetlink_subsys_unregister(&nfqnl_subsys); + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_queue_net_ops); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + +MODULE_DESCRIPTION("netfilter packet queue handler"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); + +module_init(nfnetlink_queue_init); +module_exit(nfnetlink_queue_fini); diff --git a/kernel/net/netfilter/nfnetlink_queue_core.c 
b/kernel/net/netfilter/nfnetlink_queue_core.c deleted file mode 100644 index 11c7682fa..000000000 --- a/kernel/net/netfilter/nfnetlink_queue_core.c +++ /dev/null @@ -1,1362 +0,0 @@ -/* - * This is a module which is used for queueing packets and communicating with - * userspace via nfnetlink. - * - * (C) 2005 by Harald Welte - * (C) 2007 by Patrick McHardy - * - * Based on the old ipv4-only ip_queue.c: - * (C) 2000-2002 James Morris - * (C) 2003-2005 Netfilter Core Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -#include "../bridge/br_private.h" -#endif - -#define NFQNL_QMAX_DEFAULT 1024 - -/* We're using struct nlattr which has 16bit nla_len. Note that nla_len - * includes the header length. Thus, the maximum packet length that we - * support is 65531 bytes. We send truncated packets if the specified length - * is larger than that. Userspace can check for presence of NFQA_CAP_LEN - * attribute to detect truncation. - */ -#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) - -struct nfqnl_instance { - struct hlist_node hlist; /* global list of queues */ - struct rcu_head rcu; - - u32 peer_portid; - unsigned int queue_maxlen; - unsigned int copy_range; - unsigned int queue_dropped; - unsigned int queue_user_dropped; - - - u_int16_t queue_num; /* number of this queue */ - u_int8_t copy_mode; - u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ -/* - * Following fields are dirtied for each queued packet, - * keep them in same cache line if possible. 
- */ - spinlock_t lock; - unsigned int queue_total; - unsigned int id_sequence; /* 'sequence' of pkt ids */ - struct list_head queue_list; /* packets in queue */ -}; - -typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); - -static int nfnl_queue_net_id __read_mostly; - -#define INSTANCE_BUCKETS 16 -struct nfnl_queue_net { - spinlock_t instances_lock; - struct hlist_head instance_table[INSTANCE_BUCKETS]; -}; - -static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) -{ - return net_generic(net, nfnl_queue_net_id); -} - -static inline u_int8_t instance_hashfn(u_int16_t queue_num) -{ - return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; -} - -static struct nfqnl_instance * -instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) -{ - struct hlist_head *head; - struct nfqnl_instance *inst; - - head = &q->instance_table[instance_hashfn(queue_num)]; - hlist_for_each_entry_rcu(inst, head, hlist) { - if (inst->queue_num == queue_num) - return inst; - } - return NULL; -} - -static struct nfqnl_instance * -instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) -{ - struct nfqnl_instance *inst; - unsigned int h; - int err; - - spin_lock(&q->instances_lock); - if (instance_lookup(q, queue_num)) { - err = -EEXIST; - goto out_unlock; - } - - inst = kzalloc(sizeof(*inst), GFP_ATOMIC); - if (!inst) { - err = -ENOMEM; - goto out_unlock; - } - - inst->queue_num = queue_num; - inst->peer_portid = portid; - inst->queue_maxlen = NFQNL_QMAX_DEFAULT; - inst->copy_range = NFQNL_MAX_COPY_RANGE; - inst->copy_mode = NFQNL_COPY_NONE; - spin_lock_init(&inst->lock); - INIT_LIST_HEAD(&inst->queue_list); - - if (!try_module_get(THIS_MODULE)) { - err = -EAGAIN; - goto out_free; - } - - h = instance_hashfn(queue_num); - hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); - - spin_unlock(&q->instances_lock); - - return inst; - -out_free: - kfree(inst); -out_unlock: - spin_unlock(&q->instances_lock); - return ERR_PTR(err); -} - -static 
void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, - unsigned long data); - -static void -instance_destroy_rcu(struct rcu_head *head) -{ - struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, - rcu); - - nfqnl_flush(inst, NULL, 0); - kfree(inst); - module_put(THIS_MODULE); -} - -static void -__instance_destroy(struct nfqnl_instance *inst) -{ - hlist_del_rcu(&inst->hlist); - call_rcu(&inst->rcu, instance_destroy_rcu); -} - -static void -instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) -{ - spin_lock(&q->instances_lock); - __instance_destroy(inst); - spin_unlock(&q->instances_lock); -} - -static inline void -__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) -{ - list_add_tail(&entry->list, &queue->queue_list); - queue->queue_total++; -} - -static void -__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) -{ - list_del(&entry->list); - queue->queue_total--; -} - -static struct nf_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) -{ - struct nf_queue_entry *entry = NULL, *i; - - spin_lock_bh(&queue->lock); - - list_for_each_entry(i, &queue->queue_list, list) { - if (i->id == id) { - entry = i; - break; - } - } - - if (entry) - __dequeue_entry(queue, entry); - - spin_unlock_bh(&queue->lock); - - return entry; -} - -static void -nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) -{ - struct nf_queue_entry *entry, *next; - - spin_lock_bh(&queue->lock); - list_for_each_entry_safe(entry, next, &queue->queue_list, list) { - if (!cmpfn || cmpfn(entry, data)) { - list_del(&entry->list); - queue->queue_total--; - nf_reinject(entry, NF_DROP); - } - } - spin_unlock_bh(&queue->lock); -} - -static int -nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, - bool csum_verify) -{ - __u32 flags = 0; - - if (packet->ip_summed == CHECKSUM_PARTIAL) - flags = NFQA_SKB_CSUMNOTREADY; - else if (csum_verify) - 
flags = NFQA_SKB_CSUM_NOTVERIFIED; - - if (skb_is_gso(packet)) - flags |= NFQA_SKB_GSO; - - return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; -} - -static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) -{ - const struct cred *cred; - - if (!sk_fullsock(sk)) - return 0; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - cred = sk->sk_socket->file->f_cred; - if (nla_put_be32(skb, NFQA_UID, - htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) - goto nla_put_failure; - if (nla_put_be32(skb, NFQA_GID, - htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) - goto nla_put_failure; - } - read_unlock_bh(&sk->sk_callback_lock); - return 0; - -nla_put_failure: - read_unlock_bh(&sk->sk_callback_lock); - return -1; -} - -static struct sk_buff * -nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, - struct nf_queue_entry *entry, - __be32 **packet_id_ptr) -{ - size_t size; - size_t data_len = 0, cap_len = 0; - unsigned int hlen = 0; - struct sk_buff *skb; - struct nlattr *nla; - struct nfqnl_msg_packet_hdr *pmsg; - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - struct sk_buff *entskb = entry->skb; - struct net_device *indev; - struct net_device *outdev; - struct nf_conn *ct = NULL; - enum ip_conntrack_info uninitialized_var(ctinfo); - bool csum_verify; - - size = nlmsg_total_size(sizeof(struct nfgenmsg)) - + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) - + nla_total_size(sizeof(u_int32_t)) /* ifindex */ - + nla_total_size(sizeof(u_int32_t)) /* ifindex */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - + nla_total_size(sizeof(u_int32_t)) /* ifindex */ - + nla_total_size(sizeof(u_int32_t)) /* ifindex */ -#endif - + nla_total_size(sizeof(u_int32_t)) /* mark */ - + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) - + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ - + nla_total_size(sizeof(u_int32_t)); /* cap_len */ - - if (entskb->tstamp.tv64) - size += 
nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); - - if (entry->state.hook <= NF_INET_FORWARD || - (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) - csum_verify = !skb_csum_unnecessary(entskb); - else - csum_verify = false; - - outdev = entry->state.out; - - switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { - case NFQNL_COPY_META: - case NFQNL_COPY_NONE: - break; - - case NFQNL_COPY_PACKET: - if (!(queue->flags & NFQA_CFG_F_GSO) && - entskb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(entskb)) - return NULL; - - data_len = ACCESS_ONCE(queue->copy_range); - if (data_len > entskb->len) - data_len = entskb->len; - - hlen = skb_zerocopy_headlen(entskb); - hlen = min_t(unsigned int, hlen, data_len); - size += sizeof(struct nlattr) + hlen; - cap_len = entskb->len; - break; - } - - if (queue->flags & NFQA_CFG_F_CONNTRACK) - ct = nfqnl_ct_get(entskb, &size, &ctinfo); - - if (queue->flags & NFQA_CFG_F_UID_GID) { - size += (nla_total_size(sizeof(u_int32_t)) /* uid */ - + nla_total_size(sizeof(u_int32_t))); /* gid */ - } - - skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, - GFP_ATOMIC); - if (!skb) { - skb_tx_error(entskb); - return NULL; - } - - nlh = nlmsg_put(skb, 0, 0, - NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, - sizeof(struct nfgenmsg), 0); - if (!nlh) { - skb_tx_error(entskb); - kfree_skb(skb); - return NULL; - } - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = entry->state.pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(queue->queue_num); - - nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); - pmsg = nla_data(nla); - pmsg->hw_protocol = entskb->protocol; - pmsg->hook = entry->state.hook; - *packet_id_ptr = &pmsg->packet_id; - - indev = entry->state.in; - if (indev) { -#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) - goto nla_put_failure; -#else - if (entry->state.pf == PF_BRIDGE) { - /* Case 1: indev is physical input 
device, we need to - * look for bridge group (when called from - * netfilter_bridge) */ - if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, - htonl(indev->ifindex)) || - /* this is the bridge group "brX" */ - /* rcu_read_lock()ed by __nf_queue */ - nla_put_be32(skb, NFQA_IFINDEX_INDEV, - htonl(br_port_get_rcu(indev)->br->dev->ifindex))) - goto nla_put_failure; - } else { - int physinif; - - /* Case 2: indev is bridge group, we need to look for - * physical device (when called from ipv4) */ - if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, - htonl(indev->ifindex))) - goto nla_put_failure; - - physinif = nf_bridge_get_physinif(entskb); - if (physinif && - nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, - htonl(physinif))) - goto nla_put_failure; - } -#endif - } - - if (outdev) { -#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) - goto nla_put_failure; -#else - if (entry->state.pf == PF_BRIDGE) { - /* Case 1: outdev is physical output device, we need to - * look for bridge group (when called from - * netfilter_bridge) */ - if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, - htonl(outdev->ifindex)) || - /* this is the bridge group "brX" */ - /* rcu_read_lock()ed by __nf_queue */ - nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, - htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) - goto nla_put_failure; - } else { - int physoutif; - - /* Case 2: outdev is bridge group, we need to look for - * physical output device (when called from ipv4) */ - if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, - htonl(outdev->ifindex))) - goto nla_put_failure; - - physoutif = nf_bridge_get_physoutif(entskb); - if (physoutif && - nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, - htonl(physoutif))) - goto nla_put_failure; - } -#endif - } - - if (entskb->mark && - nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) - goto nla_put_failure; - - if (indev && entskb->dev && - entskb->mac_header != entskb->network_header) { - struct nfqnl_msg_packet_hw phw; - int len; - - 
memset(&phw, 0, sizeof(phw)); - len = dev_parse_header(entskb, phw.hw_addr); - if (len) { - phw.hw_addrlen = htons(len); - if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) - goto nla_put_failure; - } - } - - if (entskb->tstamp.tv64) { - struct nfqnl_msg_packet_timestamp ts; - struct timeval tv = ktime_to_timeval(entskb->tstamp); - ts.sec = cpu_to_be64(tv.tv_sec); - ts.usec = cpu_to_be64(tv.tv_usec); - - if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) - goto nla_put_failure; - } - - if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && - nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) - goto nla_put_failure; - - if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) - goto nla_put_failure; - - if (cap_len > data_len && - nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) - goto nla_put_failure; - - if (nfqnl_put_packet_info(skb, entskb, csum_verify)) - goto nla_put_failure; - - if (data_len) { - struct nlattr *nla; - - if (skb_tailroom(skb) < sizeof(*nla) + hlen) - goto nla_put_failure; - - nla = (struct nlattr *)skb_put(skb, sizeof(*nla)); - nla->nla_type = NFQA_PAYLOAD; - nla->nla_len = nla_attr_size(data_len); - - if (skb_zerocopy(skb, entskb, data_len, hlen)) - goto nla_put_failure; - } - - nlh->nlmsg_len = skb->len; - return skb; - -nla_put_failure: - skb_tx_error(entskb); - kfree_skb(skb); - net_err_ratelimited("nf_queue: error creating packet message\n"); - return NULL; -} - -static int -__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, - struct nf_queue_entry *entry) -{ - struct sk_buff *nskb; - int err = -ENOBUFS; - __be32 *packet_id_ptr; - int failopen = 0; - - nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); - if (nskb == NULL) { - err = -ENOMEM; - goto err_out; - } - spin_lock_bh(&queue->lock); - - if (queue->queue_total >= queue->queue_maxlen) { - if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { - failopen = 1; - err = 0; - } else { - queue->queue_dropped++; - net_warn_ratelimited("nf_queue: full at %d entries, dropping 
packets(s)\n", - queue->queue_total); - } - goto err_out_free_nskb; - } - entry->id = ++queue->id_sequence; - *packet_id_ptr = htonl(entry->id); - - /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); - if (err < 0) { - queue->queue_user_dropped++; - goto err_out_unlock; - } - - __enqueue_entry(queue, entry); - - spin_unlock_bh(&queue->lock); - return 0; - -err_out_free_nskb: - kfree_skb(nskb); -err_out_unlock: - spin_unlock_bh(&queue->lock); - if (failopen) - nf_reinject(entry, NF_ACCEPT); -err_out: - return err; -} - -static struct nf_queue_entry * -nf_queue_entry_dup(struct nf_queue_entry *e) -{ - struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) { - if (nf_queue_entry_get_refs(entry)) - return entry; - kfree(entry); - } - return NULL; -} - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -/* When called from bridge netfilter, skb->data must point to MAC header - * before calling skb_gso_segment(). Else, original MAC header is lost - * and segmented skbs will be sent to wrong destination. 
- */ -static void nf_bridge_adjust_skb_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_push(skb, skb->network_header - skb->mac_header); -} - -static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_pull(skb, skb->network_header - skb->mac_header); -} -#else -#define nf_bridge_adjust_skb_data(s) do {} while (0) -#define nf_bridge_adjust_segmented_data(s) do {} while (0) -#endif - -static void free_entry(struct nf_queue_entry *entry) -{ - nf_queue_entry_release_refs(entry); - kfree(entry); -} - -static int -__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, - struct sk_buff *skb, struct nf_queue_entry *entry) -{ - int ret = -ENOMEM; - struct nf_queue_entry *entry_seg; - - nf_bridge_adjust_segmented_data(skb); - - if (skb->next == NULL) { /* last packet, no need to copy entry */ - struct sk_buff *gso_skb = entry->skb; - entry->skb = skb; - ret = __nfqnl_enqueue_packet(net, queue, entry); - if (ret) - entry->skb = gso_skb; - return ret; - } - - skb->next = NULL; - - entry_seg = nf_queue_entry_dup(entry); - if (entry_seg) { - entry_seg->skb = skb; - ret = __nfqnl_enqueue_packet(net, queue, entry_seg); - if (ret) - free_entry(entry_seg); - } - return ret; -} - -static int -nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) -{ - unsigned int queued; - struct nfqnl_instance *queue; - struct sk_buff *skb, *segs; - int err = -ENOBUFS; - struct net *net = dev_net(entry->state.in ? 
- entry->state.in : entry->state.out); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - /* rcu_read_lock()ed by nf_hook_slow() */ - queue = instance_lookup(q, queuenum); - if (!queue) - return -ESRCH; - - if (queue->copy_mode == NFQNL_COPY_NONE) - return -EINVAL; - - skb = entry->skb; - - switch (entry->state.pf) { - case NFPROTO_IPV4: - skb->protocol = htons(ETH_P_IP); - break; - case NFPROTO_IPV6: - skb->protocol = htons(ETH_P_IPV6); - break; - } - - if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) - return __nfqnl_enqueue_packet(net, queue, entry); - - nf_bridge_adjust_skb_data(skb); - segs = skb_gso_segment(skb, 0); - /* Does not use PTR_ERR to limit the number of error codes that can be - * returned by nf_queue. For instance, callers rely on -ECANCELED to - * mean 'ignore this hook'. - */ - if (IS_ERR_OR_NULL(segs)) - goto out_err; - queued = 0; - err = 0; - do { - struct sk_buff *nskb = segs->next; - if (err == 0) - err = __nfqnl_enqueue_packet_gso(net, queue, - segs, entry); - if (err == 0) - queued++; - else - kfree_skb(segs); - segs = nskb; - } while (segs); - - if (queued) { - if (err) /* some segments are already queued */ - free_entry(entry); - kfree_skb(skb); - return 0; - } - out_err: - nf_bridge_adjust_segmented_data(skb); - return err; -} - -static int -nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) -{ - struct sk_buff *nskb; - - if (diff < 0) { - if (pskb_trim(e->skb, data_len)) - return -ENOMEM; - } else if (diff > 0) { - if (data_len > 0xFFFF) - return -EINVAL; - if (diff > skb_tailroom(e->skb)) { - nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), - diff, GFP_ATOMIC); - if (!nskb) { - printk(KERN_WARNING "nf_queue: OOM " - "in mangle, dropping packet\n"); - return -ENOMEM; - } - kfree_skb(e->skb); - e->skb = nskb; - } - skb_put(e->skb, diff); - } - if (!skb_make_writable(e->skb, data_len)) - return -ENOMEM; - skb_copy_to_linear_data(e->skb, data, data_len); - e->skb->ip_summed = CHECKSUM_NONE; - 
return 0; -} - -static int -nfqnl_set_mode(struct nfqnl_instance *queue, - unsigned char mode, unsigned int range) -{ - int status = 0; - - spin_lock_bh(&queue->lock); - switch (mode) { - case NFQNL_COPY_NONE: - case NFQNL_COPY_META: - queue->copy_mode = mode; - queue->copy_range = 0; - break; - - case NFQNL_COPY_PACKET: - queue->copy_mode = mode; - if (range == 0 || range > NFQNL_MAX_COPY_RANGE) - queue->copy_range = NFQNL_MAX_COPY_RANGE; - else - queue->copy_range = range; - break; - - default: - status = -EINVAL; - - } - spin_unlock_bh(&queue->lock); - - return status; -} - -static int -dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) -{ - if (entry->state.in) - if (entry->state.in->ifindex == ifindex) - return 1; - if (entry->state.out) - if (entry->state.out->ifindex == ifindex) - return 1; -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->skb->nf_bridge) { - int physinif, physoutif; - - physinif = nf_bridge_get_physinif(entry->skb); - physoutif = nf_bridge_get_physoutif(entry->skb); - - if (physinif == ifindex || physoutif == ifindex) - return 1; - } -#endif - return 0; -} - -/* drop all packets with either indev or outdev == ifindex from all queue - * instances */ -static void -nfqnl_dev_drop(struct net *net, int ifindex) -{ - int i; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - rcu_read_lock(); - - for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct nfqnl_instance *inst; - struct hlist_head *head = &q->instance_table[i]; - - hlist_for_each_entry_rcu(inst, head, hlist) - nfqnl_flush(inst, dev_cmp, ifindex); - } - - rcu_read_unlock(); -} - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - -static int -nfqnl_rcv_dev_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - /* Drop any packets associated with the downed device */ - if (event == NETDEV_DOWN) - nfqnl_dev_drop(dev_net(dev), dev->ifindex); - return NOTIFY_DONE; 
-} - -static struct notifier_block nfqnl_dev_notifier = { - .notifier_call = nfqnl_rcv_dev_event, -}; - -static int -nfqnl_rcv_nl_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct netlink_notify *n = ptr; - struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); - - if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { - int i; - - /* destroy all instances for this portid */ - spin_lock(&q->instances_lock); - for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *t2; - struct nfqnl_instance *inst; - struct hlist_head *head = &q->instance_table[i]; - - hlist_for_each_entry_safe(inst, t2, head, hlist) { - if (n->portid == inst->peer_portid) - __instance_destroy(inst); - } - } - spin_unlock(&q->instances_lock); - } - return NOTIFY_DONE; -} - -static struct notifier_block nfqnl_rtnl_notifier = { - .notifier_call = nfqnl_rcv_nl_event, -}; - -static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { - [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, - [NFQA_MARK] = { .type = NLA_U32 }, - [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, - [NFQA_CT] = { .type = NLA_UNSPEC }, - [NFQA_EXP] = { .type = NLA_UNSPEC }, -}; - -static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { - [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, - [NFQA_MARK] = { .type = NLA_U32 }, -}; - -static struct nfqnl_instance * -verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) -{ - struct nfqnl_instance *queue; - - queue = instance_lookup(q, queue_num); - if (!queue) - return ERR_PTR(-ENODEV); - - if (queue->peer_portid != nlportid) - return ERR_PTR(-EPERM); - - return queue; -} - -static struct nfqnl_msg_verdict_hdr* -verdicthdr_get(const struct nlattr * const nfqa[]) -{ - struct nfqnl_msg_verdict_hdr *vhdr; - unsigned int verdict; - - if (!nfqa[NFQA_VERDICT_HDR]) - return NULL; - - vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); - verdict = ntohl(vhdr->verdict) & 
NF_VERDICT_MASK; - if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) - return NULL; - return vhdr; -} - -static int nfq_id_after(unsigned int id, unsigned int max) -{ - return (int)(id - max) > 0; -} - -static int -nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) -{ - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nf_queue_entry *entry, *tmp; - unsigned int verdict, maxid; - struct nfqnl_msg_verdict_hdr *vhdr; - struct nfqnl_instance *queue; - LIST_HEAD(batch_list); - u16 queue_num = ntohs(nfmsg->res_id); - - struct net *net = sock_net(ctnl); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - queue = verdict_instance_lookup(q, queue_num, - NETLINK_CB(skb).portid); - if (IS_ERR(queue)) - return PTR_ERR(queue); - - vhdr = verdicthdr_get(nfqa); - if (!vhdr) - return -EINVAL; - - verdict = ntohl(vhdr->verdict); - maxid = ntohl(vhdr->id); - - spin_lock_bh(&queue->lock); - - list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { - if (nfq_id_after(entry->id, maxid)) - break; - __dequeue_entry(queue, entry); - list_add_tail(&entry->list, &batch_list); - } - - spin_unlock_bh(&queue->lock); - - if (list_empty(&batch_list)) - return -ENOENT; - - list_for_each_entry_safe(entry, tmp, &batch_list, list) { - if (nfqa[NFQA_MARK]) - entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); - nf_reinject(entry, verdict); - } - return 0; -} - -static int -nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) -{ - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); - - struct nfqnl_msg_verdict_hdr *vhdr; - struct nfqnl_instance *queue; - unsigned int verdict; - struct nf_queue_entry *entry; - enum ip_conntrack_info uninitialized_var(ctinfo); - struct nf_conn *ct = NULL; - - struct net *net = sock_net(ctnl); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - queue = 
instance_lookup(q, queue_num); - if (!queue) - queue = verdict_instance_lookup(q, queue_num, - NETLINK_CB(skb).portid); - if (IS_ERR(queue)) - return PTR_ERR(queue); - - vhdr = verdicthdr_get(nfqa); - if (!vhdr) - return -EINVAL; - - verdict = ntohl(vhdr->verdict); - - entry = find_dequeue_entry(queue, ntohl(vhdr->id)); - if (entry == NULL) - return -ENOENT; - - if (nfqa[NFQA_CT]) { - ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo); - if (ct && nfqa[NFQA_EXP]) { - nfqnl_attach_expect(ct, nfqa[NFQA_EXP], - NETLINK_CB(skb).portid, - nlmsg_report(nlh)); - } - } - - if (nfqa[NFQA_PAYLOAD]) { - u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); - int diff = payload_len - entry->skb->len; - - if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), - payload_len, entry, diff) < 0) - verdict = NF_DROP; - - if (ct) - nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff); - } - - if (nfqa[NFQA_MARK]) - entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); - - nf_reinject(entry, verdict); - return 0; -} - -static int -nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) -{ - return -ENOTSUPP; -} - -static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { - [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, - [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, -}; - -static const struct nf_queue_handler nfqh = { - .outfn = &nfqnl_enqueue_packet, -}; - -static int -nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) -{ - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); - struct nfqnl_instance *queue; - struct nfqnl_msg_config_cmd *cmd = NULL; - struct net *net = sock_net(ctnl); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - int ret = 0; - - if (nfqa[NFQA_CFG_CMD]) { - cmd = nla_data(nfqa[NFQA_CFG_CMD]); - - /* Obsolete commands without queue context 
*/ - switch (cmd->command) { - case NFQNL_CFG_CMD_PF_BIND: return 0; - case NFQNL_CFG_CMD_PF_UNBIND: return 0; - } - } - - rcu_read_lock(); - queue = instance_lookup(q, queue_num); - if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { - ret = -EPERM; - goto err_out_unlock; - } - - if (cmd != NULL) { - switch (cmd->command) { - case NFQNL_CFG_CMD_BIND: - if (queue) { - ret = -EBUSY; - goto err_out_unlock; - } - queue = instance_create(q, queue_num, - NETLINK_CB(skb).portid); - if (IS_ERR(queue)) { - ret = PTR_ERR(queue); - goto err_out_unlock; - } - break; - case NFQNL_CFG_CMD_UNBIND: - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - instance_destroy(q, queue); - break; - case NFQNL_CFG_CMD_PF_BIND: - case NFQNL_CFG_CMD_PF_UNBIND: - break; - default: - ret = -ENOTSUPP; - break; - } - } - - if (nfqa[NFQA_CFG_PARAMS]) { - struct nfqnl_msg_config_params *params; - - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - params = nla_data(nfqa[NFQA_CFG_PARAMS]); - nfqnl_set_mode(queue, params->copy_mode, - ntohl(params->copy_range)); - } - - if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { - __be32 *queue_maxlen; - - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); - spin_lock_bh(&queue->lock); - queue->queue_maxlen = ntohl(*queue_maxlen); - spin_unlock_bh(&queue->lock); - } - - if (nfqa[NFQA_CFG_FLAGS]) { - __u32 flags, mask; - - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - - if (!nfqa[NFQA_CFG_MASK]) { - /* A mask is needed to specify which flags are being - * changed. 
- */ - ret = -EINVAL; - goto err_out_unlock; - } - - flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); - mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); - - if (flags >= NFQA_CFG_F_MAX) { - ret = -EOPNOTSUPP; - goto err_out_unlock; - } - - spin_lock_bh(&queue->lock); - queue->flags &= ~mask; - queue->flags |= flags & mask; - spin_unlock_bh(&queue->lock); - } - -err_out_unlock: - rcu_read_unlock(); - return ret; -} - -static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { - [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, }, - [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_policy }, - [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, - .policy = nfqa_cfg_policy }, - [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_batch_policy }, -}; - -static const struct nfnetlink_subsystem nfqnl_subsys = { - .name = "nf_queue", - .subsys_id = NFNL_SUBSYS_QUEUE, - .cb_count = NFQNL_MSG_MAX, - .cb = nfqnl_cb, -}; - -#ifdef CONFIG_PROC_FS -struct iter_state { - struct seq_net_private p; - unsigned int bucket; -}; - -static struct hlist_node *get_first(struct seq_file *seq) -{ - struct iter_state *st = seq->private; - struct net *net; - struct nfnl_queue_net *q; - - if (!st) - return NULL; - - net = seq_file_net(seq); - q = nfnl_queue_pernet(net); - for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&q->instance_table[st->bucket])) - return q->instance_table[st->bucket].first; - } - return NULL; -} - -static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) -{ - struct iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - - h = h->next; - while (!h) { - struct nfnl_queue_net *q; - - if (++st->bucket >= INSTANCE_BUCKETS) - return NULL; - - q = nfnl_queue_pernet(net); - h = q->instance_table[st->bucket].first; - } - 
return h; -} - -static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_node *head; - head = get_first(seq); - - if (head) - while (pos && (head = get_next(seq, head))) - pos--; - return pos ? NULL : head; -} - -static void *seq_start(struct seq_file *s, loff_t *pos) - __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) -{ - spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); - return get_idx(s, *pos); -} - -static void *seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - return get_next(s, v); -} - -static void seq_stop(struct seq_file *s, void *v) - __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) -{ - spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); -} - -static int seq_show(struct seq_file *s, void *v) -{ - const struct nfqnl_instance *inst = v; - - seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", - inst->queue_num, - inst->peer_portid, inst->queue_total, - inst->copy_mode, inst->copy_range, - inst->queue_dropped, inst->queue_user_dropped, - inst->id_sequence, 1); - return seq_has_overflowed(s); -} - -static const struct seq_operations nfqnl_seq_ops = { - .start = seq_start, - .next = seq_next, - .stop = seq_stop, - .show = seq_show, -}; - -static int nfqnl_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfqnl_seq_ops, - sizeof(struct iter_state)); -} - -static const struct file_operations nfqnl_file_ops = { - .owner = THIS_MODULE, - .open = nfqnl_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -#endif /* PROC_FS */ - -static int __net_init nfnl_queue_net_init(struct net *net) -{ - unsigned int i; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&q->instance_table[i]); - - spin_lock_init(&q->instances_lock); - -#ifdef CONFIG_PROC_FS - if (!proc_create("nfnetlink_queue", 0440, - net->nf.proc_netfilter, &nfqnl_file_ops)) - 
return -ENOMEM; -#endif - return 0; -} - -static void __net_exit nfnl_queue_net_exit(struct net *net) -{ -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); -#endif -} - -static struct pernet_operations nfnl_queue_net_ops = { - .init = nfnl_queue_net_init, - .exit = nfnl_queue_net_exit, - .id = &nfnl_queue_net_id, - .size = sizeof(struct nfnl_queue_net), -}; - -static int __init nfnetlink_queue_init(void) -{ - int status; - - status = register_pernet_subsys(&nfnl_queue_net_ops); - if (status < 0) { - pr_err("nf_queue: failed to register pernet ops\n"); - goto out; - } - - netlink_register_notifier(&nfqnl_rtnl_notifier); - status = nfnetlink_subsys_register(&nfqnl_subsys); - if (status < 0) { - pr_err("nf_queue: failed to create netlink socket\n"); - goto cleanup_netlink_notifier; - } - - register_netdevice_notifier(&nfqnl_dev_notifier); - nf_register_queue_handler(&nfqh); - return status; - -cleanup_netlink_notifier: - netlink_unregister_notifier(&nfqnl_rtnl_notifier); -out: - return status; -} - -static void __exit nfnetlink_queue_fini(void) -{ - nf_unregister_queue_handler(); - unregister_netdevice_notifier(&nfqnl_dev_notifier); - nfnetlink_subsys_unregister(&nfqnl_subsys); - netlink_unregister_notifier(&nfqnl_rtnl_notifier); - unregister_pernet_subsys(&nfnl_queue_net_ops); - - rcu_barrier(); /* Wait for completion of call_rcu()'s */ -} - -MODULE_DESCRIPTION("netfilter packet queue handler"); -MODULE_AUTHOR("Harald Welte "); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); - -module_init(nfnetlink_queue_init); -module_exit(nfnetlink_queue_fini); diff --git a/kernel/net/netfilter/nfnetlink_queue_ct.c b/kernel/net/netfilter/nfnetlink_queue_ct.c deleted file mode 100644 index 96cac50e0..000000000 --- a/kernel/net/netfilter/nfnetlink_queue_ct.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * (C) 2012 by Pablo Neira Ayuso - * - * This program is free software; you can redistribute it and/or modify - * it under the terms 
of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include -#include -#include -#include -#include -#include - -struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size, - enum ip_conntrack_info *ctinfo) -{ - struct nfq_ct_hook *nfq_ct; - struct nf_conn *ct; - - /* rcu_read_lock()ed by __nf_queue already. */ - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return NULL; - - ct = nf_ct_get(entskb, ctinfo); - if (ct) { - if (!nf_ct_is_untracked(ct)) - *size += nfq_ct->build_size(ct); - else - ct = NULL; - } - return ct; -} - -struct nf_conn * -nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr, - enum ip_conntrack_info *ctinfo) -{ - struct nfq_ct_hook *nfq_ct; - struct nf_conn *ct; - - /* rcu_read_lock()ed by __nf_queue already. */ - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return NULL; - - ct = nf_ct_get(skb, ctinfo); - if (ct && !nf_ct_is_untracked(ct)) - nfq_ct->parse(attr, ct); - - return ct; -} - -int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - struct nfq_ct_hook *nfq_ct; - struct nlattr *nest_parms; - u_int32_t tmp; - - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return 0; - - nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED); - if (!nest_parms) - goto nla_put_failure; - - if (nfq_ct->build(skb, ct) < 0) - goto nla_put_failure; - - nla_nest_end(skb, nest_parms); - - tmp = ctinfo; - if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp))) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - -void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, int diff) -{ - struct nfq_ct_hook *nfq_ct; - - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return; - - if ((ct->status & IPS_NAT_MASK) && diff) - nfq_ct->seq_adjust(skb, ct, ctinfo, diff); -} - -int nfqnl_attach_expect(struct nf_conn *ct, const struct 
nlattr *attr, - u32 portid, u32 report) -{ - struct nfq_ct_hook *nfq_ct; - - if (nf_ct_is_untracked(ct)) - return 0; - - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return -EOPNOTSUPP; - - return nfq_ct->attach_expect(attr, ct, portid, report); -} diff --git a/kernel/net/netfilter/nft_compat.c b/kernel/net/netfilter/nft_compat.c index 7f29cfc76..9c8fab001 100644 --- a/kernel/net/netfilter/nft_compat.c +++ b/kernel/net/netfilter/nft_compat.c @@ -161,6 +161,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void target_compat_from_user(struct xt_target *t, void *in, void *out) @@ -377,6 +378,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void match_compat_from_user(struct xt_match *m, void *in, void *out) @@ -617,6 +619,13 @@ struct nft_xt { static struct nft_expr_type nft_match_type; +static bool nft_match_cmp(const struct xt_match *match, + const char *name, u32 rev, u32 family) +{ + return strcmp(match->name, name) == 0 && match->revision == rev && + (match->family == NFPROTO_UNSPEC || match->family == family); +} + static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -624,7 +633,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_match; struct xt_match *match; char *mt_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || @@ -639,8 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_match, &nft_match_list, head) { struct xt_match *match = nft_match->ops.data; - if (strcmp(match->name, mt_name) == 0 && - match->revision == rev && match->family == family) { + if (nft_match_cmp(match, mt_name, rev, family)) { if (!try_module_get(match->me)) return 
ERR_PTR(-ENOENT); @@ -691,6 +699,13 @@ static LIST_HEAD(nft_target_list); static struct nft_expr_type nft_target_type; +static bool nft_target_cmp(const struct xt_target *tg, + const char *name, u32 rev, u32 family) +{ + return strcmp(tg->name, name) == 0 && tg->revision == rev && + (tg->family == NFPROTO_UNSPEC || tg->family == family); +} + static const struct nft_expr_ops * nft_target_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -698,7 +713,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_target; struct xt_target *target; char *tg_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || @@ -713,8 +728,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_target, &nft_target_list, head) { struct xt_target *target = nft_target->ops.data; - if (strcmp(target->name, tg_name) == 0 && - target->revision == rev && target->family == family) { + if (nft_target_cmp(target, tg_name, rev, family)) { if (!try_module_get(target->me)) return ERR_PTR(-ENOENT); diff --git a/kernel/net/netfilter/nft_counter.c b/kernel/net/netfilter/nft_counter.c index 175912392..c7808fc19 100644 --- a/kernel/net/netfilter/nft_counter.c +++ b/kernel/net/netfilter/nft_counter.c @@ -18,39 +18,66 @@ #include struct nft_counter { - seqlock_t lock; u64 bytes; u64 packets; }; +struct nft_counter_percpu { + struct nft_counter counter; + struct u64_stats_sync syncp; +}; + +struct nft_counter_percpu_priv { + struct nft_counter_percpu __percpu *counter; +}; + static void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_counter *priv = nft_expr_priv(expr); + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter_percpu *this_cpu; + + local_bh_disable(); + this_cpu = this_cpu_ptr(priv->counter); + u64_stats_update_begin(&this_cpu->syncp); + this_cpu->counter.bytes += pkt->skb->len; + 
this_cpu->counter.packets++; + u64_stats_update_end(&this_cpu->syncp); + local_bh_enable(); +} - write_seqlock_bh(&priv->lock); - priv->bytes += pkt->skb->len; - priv->packets++; - write_sequnlock_bh(&priv->lock); +static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, + struct nft_counter *total) +{ + const struct nft_counter_percpu *cpu_stats; + u64 bytes, packets; + unsigned int seq; + int cpu; + + memset(total, 0, sizeof(*total)); + for_each_possible_cpu(cpu) { + cpu_stats = per_cpu_ptr(counter, cpu); + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + bytes = cpu_stats->counter.bytes; + packets = cpu_stats->counter.packets; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + + total->packets += packets; + total->bytes += bytes; + } } static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) { - struct nft_counter *priv = nft_expr_priv(expr); - unsigned int seq; - u64 bytes; - u64 packets; + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter total; - do { - seq = read_seqbegin(&priv->lock); - bytes = priv->bytes; - packets = priv->packets; - } while (read_seqretry(&priv->lock, seq)); + nft_counter_fetch(priv->counter, &total); - if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes))) - goto nla_put_failure; - if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets))) + if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) || + nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets))) goto nla_put_failure; return 0; @@ -67,24 +94,71 @@ static int nft_counter_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nft_counter *priv = nft_expr_priv(expr); + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter_percpu __percpu *cpu_stats; + struct nft_counter_percpu *this_cpu; + + cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); + if 
(cpu_stats == NULL) + return ENOMEM; + + preempt_disable(); + this_cpu = this_cpu_ptr(cpu_stats); + if (tb[NFTA_COUNTER_PACKETS]) { + this_cpu->counter.packets = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + } + if (tb[NFTA_COUNTER_BYTES]) { + this_cpu->counter.bytes = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + } + preempt_enable(); + priv->counter = cpu_stats; + return 0; +} - if (tb[NFTA_COUNTER_PACKETS]) - priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); - if (tb[NFTA_COUNTER_BYTES]) - priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); +static void nft_counter_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - seqlock_init(&priv->lock); + free_percpu(priv->counter); +} + +static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(src); + struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); + struct nft_counter_percpu __percpu *cpu_stats; + struct nft_counter_percpu *this_cpu; + struct nft_counter total; + + nft_counter_fetch(priv->counter, &total); + + cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu, + GFP_ATOMIC); + if (cpu_stats == NULL) + return ENOMEM; + + preempt_disable(); + this_cpu = this_cpu_ptr(cpu_stats); + this_cpu->counter.packets = total.packets; + this_cpu->counter.bytes = total.bytes; + preempt_enable(); + + priv_clone->counter = cpu_stats; return 0; } static struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_counter)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)), .eval = nft_counter_eval, .init = nft_counter_init, + .destroy = nft_counter_destroy, .dump = nft_counter_dump, + .clone = nft_counter_clone, }; static struct nft_expr_type nft_counter_type __read_mostly = { diff --git 
a/kernel/net/netfilter/nft_ct.c b/kernel/net/netfilter/nft_ct.c index 8cbca3432..939921532 100644 --- a/kernel/net/netfilter/nft_ct.c +++ b/kernel/net/netfilter/nft_ct.c @@ -366,6 +366,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; switch (priv->key) { + case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: case NFT_CT_SRC: case NFT_CT_DST: diff --git a/kernel/net/netfilter/nft_dynset.c b/kernel/net/netfilter/nft_dynset.c index 513a8ef60..9dec3bd1b 100644 --- a/kernel/net/netfilter/nft_dynset.c +++ b/kernel/net/netfilter/nft_dynset.c @@ -50,8 +50,9 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, } ext = nft_set_elem_ext(set, elem); - if (priv->expr != NULL) - nft_expr_clone(nft_set_ext_expr(ext), priv->expr); + if (priv->expr != NULL && + nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0) + return NULL; return elem; } diff --git a/kernel/net/netfilter/nft_limit.c b/kernel/net/netfilter/nft_limit.c index 435c1ccd6..5d67938f8 100644 --- a/kernel/net/netfilter/nft_limit.c +++ b/kernel/net/netfilter/nft_limit.c @@ -20,63 +20,79 @@ static DEFINE_SPINLOCK(limit_lock); struct nft_limit { + u64 last; u64 tokens; + u64 tokens_max; u64 rate; - u64 unit; - unsigned long stamp; + u64 nsecs; + u32 burst; }; -static void nft_limit_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) { - struct nft_limit *priv = nft_expr_priv(expr); + u64 now, tokens; + s64 delta; spin_lock_bh(&limit_lock); - if (time_after_eq(jiffies, priv->stamp)) { - priv->tokens = priv->rate; - priv->stamp = jiffies + priv->unit * HZ; - } - - if (priv->tokens >= 1) { - priv->tokens--; + now = ktime_get_ns(); + tokens = limit->tokens + now - limit->last; + if (tokens > limit->tokens_max) + tokens = limit->tokens_max; + + limit->last = now; + delta = tokens - cost; + if (delta >= 0) { + limit->tokens = delta; 
spin_unlock_bh(&limit_lock); - return; + return false; } + limit->tokens = tokens; spin_unlock_bh(&limit_lock); - - regs->verdict.code = NFT_BREAK; + return true; } -static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { - [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, - [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, -}; - -static int nft_limit_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, +static int nft_limit_init(struct nft_limit *limit, const struct nlattr * const tb[]) { - struct nft_limit *priv = nft_expr_priv(expr); + u64 unit; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; - priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); - priv->unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); - priv->stamp = jiffies + priv->unit * HZ; - priv->tokens = priv->rate; + limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); + limit->nsecs = unit * NSEC_PER_SEC; + if (limit->rate == 0 || limit->nsecs < unit) + return -EOVERFLOW; + limit->tokens = limit->tokens_max = limit->nsecs; + + if (tb[NFTA_LIMIT_BURST]) { + u64 rate; + + limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); + + rate = limit->rate + limit->burst; + if (rate < limit->rate) + return -EOVERFLOW; + + limit->rate = rate; + } + limit->last = ktime_get_ns(); + return 0; } -static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, + enum nft_limit_type type) { - const struct nft_limit *priv = nft_expr_priv(expr); + u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); + u64 rate = limit->rate - limit->burst; - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate))) - goto nla_put_failure; - if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit))) + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || + nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || + 
nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || + nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type))) goto nla_put_failure; return 0; @@ -84,18 +100,114 @@ nla_put_failure: return -1; } +struct nft_limit_pkts { + struct nft_limit limit; + u64 cost; +}; + +static void nft_limit_pkts_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit_pkts *priv = nft_expr_priv(expr); + + if (nft_limit_eval(&priv->limit, priv->cost)) + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { + [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, + [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, + [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, + [NFTA_LIMIT_TYPE] = { .type = NLA_U32 }, +}; + +static int nft_limit_pkts_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_limit_pkts *priv = nft_expr_priv(expr); + int err; + + err = nft_limit_init(&priv->limit, tb); + if (err < 0) + return err; + + priv->cost = div_u64(priv->limit.nsecs, priv->limit.rate); + return 0; +} + +static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_limit_pkts *priv = nft_expr_priv(expr); + + return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); +} + static struct nft_expr_type nft_limit_type; -static const struct nft_expr_ops nft_limit_ops = { +static const struct nft_expr_ops nft_limit_pkts_ops = { + .type = &nft_limit_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)), + .eval = nft_limit_pkts_eval, + .init = nft_limit_pkts_init, + .dump = nft_limit_pkts_dump, +}; + +static void nft_limit_pkt_bytes_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit *priv = nft_expr_priv(expr); + u64 cost = div_u64(priv->nsecs * pkt->skb->len, priv->rate); + + if (nft_limit_eval(priv, cost)) + regs->verdict.code = NFT_BREAK; +} + +static int 
nft_limit_pkt_bytes_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_init(priv, tb); +} + +static int nft_limit_pkt_bytes_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); +} + +static const struct nft_expr_ops nft_limit_pkt_bytes_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), - .eval = nft_limit_eval, - .init = nft_limit_init, - .dump = nft_limit_dump, + .eval = nft_limit_pkt_bytes_eval, + .init = nft_limit_pkt_bytes_init, + .dump = nft_limit_pkt_bytes_dump, }; +static const struct nft_expr_ops * +nft_limit_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_LIMIT_TYPE] == NULL) + return &nft_limit_pkts_ops; + + switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { + case NFT_LIMIT_PKTS: + return &nft_limit_pkts_ops; + case NFT_LIMIT_PKT_BYTES: + return &nft_limit_pkt_bytes_ops; + } + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_limit_type __read_mostly = { .name = "limit", - .ops = &nft_limit_ops, + .select_ops = nft_limit_select_ops, .policy = nft_limit_policy, .maxattr = NFTA_LIMIT_MAX, .flags = NFT_EXPR_STATEFUL, diff --git a/kernel/net/netfilter/nft_log.c b/kernel/net/netfilter/nft_log.c index a13d6a386..319c22b4b 100644 --- a/kernel/net/netfilter/nft_log.c +++ b/kernel/net/netfilter/nft_log.c @@ -31,9 +31,8 @@ static void nft_log_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_log *priv = nft_expr_priv(expr); - struct net *net = dev_net(pkt->in ? 
pkt->in : pkt->out); - nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, + nf_log_packet(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, pkt->out, &priv->loginfo, "%s", priv->prefix); } diff --git a/kernel/net/netfilter/nft_meta.c b/kernel/net/netfilter/nft_meta.c index 52561e1c3..9dfaf4d55 100644 --- a/kernel/net/netfilter/nft_meta.c +++ b/kernel/net/netfilter/nft_meta.c @@ -31,6 +31,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = pkt->in, *out = pkt->out; + struct sock *sk; u32 *dest = &regs->data[priv->dreg]; switch (priv->key) { @@ -42,7 +43,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, *(__be16 *)dest = skb->protocol; break; case NFT_META_NFPROTO: - *dest = pkt->ops->pf; + *dest = pkt->pf; break; case NFT_META_L4PROTO: *dest = pkt->tprot; @@ -86,33 +87,35 @@ void nft_meta_get_eval(const struct nft_expr *expr, *(u16 *)dest = out->type; break; case NFT_META_SKUID: - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) goto err; - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket == NULL || - skb->sk->sk_socket->file == NULL) { - read_unlock_bh(&skb->sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket == NULL || + sk->sk_socket->file == NULL) { + read_unlock_bh(&sk->sk_callback_lock); goto err; } *dest = from_kuid_munged(&init_user_ns, - skb->sk->sk_socket->file->f_cred->fsuid); - read_unlock_bh(&skb->sk->sk_callback_lock); + sk->sk_socket->file->f_cred->fsuid); + read_unlock_bh(&sk->sk_callback_lock); break; case NFT_META_SKGID: - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) goto err; - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket == NULL || - skb->sk->sk_socket->file == NULL) { - read_unlock_bh(&skb->sk->sk_callback_lock); + 
read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket == NULL || + sk->sk_socket->file == NULL) { + read_unlock_bh(&sk->sk_callback_lock); goto err; } *dest = from_kgid_munged(&init_user_ns, - skb->sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); + sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&sk->sk_callback_lock); break; #ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: { @@ -135,7 +138,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; } - switch (pkt->ops->pf) { + switch (pkt->pf) { case NFPROTO_IPV4: if (ipv4_is_multicast(ip_hdr(skb)->daddr)) *dest = PACKET_MULTICAST; @@ -166,11 +169,14 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; *dest = out->group; break; +#ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) goto err; - *dest = skb->sk->sk_classid; + *dest = sk->sk_classid; break; +#endif default: WARN_ON(1); goto err; @@ -246,7 +252,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_CPU: case NFT_META_IIFGROUP: case NFT_META_OIFGROUP: +#ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: +#endif len = sizeof(u32); break; case NFT_META_IIFNAME: diff --git a/kernel/net/netfilter/nft_payload.c b/kernel/net/netfilter/nft_payload.c index 94fb3b27a..09b4b07eb 100644 --- a/kernel/net/netfilter/nft_payload.c +++ b/kernel/net/netfilter/nft_payload.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -17,6 +18,53 @@ #include #include +/* add vlan header into the user buffer for if tag was removed by offloads */ +static bool +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +{ + int mac_off = skb_mac_header(skb) - skb->data; + u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d; + struct vlan_ethhdr veth; + + vlanh = (u8 *) &veth; + if (offset < ETH_HLEN) { + u8 ethlen = min_t(u8, len, ETH_HLEN - offset); + + if 
(skb_copy_bits(skb, mac_off, &veth, ETH_HLEN)) + return false; + + veth.h_vlan_proto = skb->vlan_proto; + + memcpy(dst_u8, vlanh + offset, ethlen); + + len -= ethlen; + if (len == 0) + return true; + + dst_u8 += ethlen; + offset = ETH_HLEN; + } else if (offset >= VLAN_ETH_HLEN) { + offset -= VLAN_HLEN; + goto skip; + } + + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + veth.h_vlan_encapsulated_proto = skb->protocol; + + vlanh += offset; + + vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset); + memcpy(dst_u8, vlanh, vlan_len); + + len -= vlan_len; + if (!len) + return true; + + dst_u8 += vlan_len; + skip: + return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; +} + static void nft_payload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -26,10 +74,18 @@ static void nft_payload_eval(const struct nft_expr *expr, u32 *dest = &regs->data[priv->dreg]; int offset; + dest[priv->len / NFT_REG32_SIZE] = 0; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; + + if (skb_vlan_tag_present(skb)) { + if (!nft_payload_copy_vlan(dest, skb, + priv->offset, priv->len)) + goto err; + return; + } offset = skb_mac_header(skb) - skb->data; break; case NFT_PAYLOAD_NETWORK_HEADER: @@ -43,7 +99,6 @@ static void nft_payload_eval(const struct nft_expr *expr, } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; if (skb_copy_bits(skb, offset, dest, priv->len) < 0) goto err; return; diff --git a/kernel/net/netfilter/nft_queue.c b/kernel/net/netfilter/nft_queue.c index 96805d21d..61d216eb7 100644 --- a/kernel/net/netfilter/nft_queue.c +++ b/kernel/net/netfilter/nft_queue.c @@ -42,7 +42,7 @@ static void nft_queue_eval(const struct nft_expr *expr, queue = priv->queuenum + cpu % priv->queues_total; } else { queue = nfqueue_hash(pkt->skb, queue, - priv->queues_total, pkt->ops->pf, + priv->queues_total, pkt->pf, jhash_initval); } } diff --git a/kernel/net/netfilter/nft_reject_inet.c 
b/kernel/net/netfilter/nft_reject_inet.c index 635dbba93..759ca5248 100644 --- a/kernel/net/netfilter/nft_reject_inet.c +++ b/kernel/net/netfilter/nft_reject_inet.c @@ -22,38 +22,37 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); - switch (pkt->ops->pf) { + switch (pkt->pf) { case NFPROTO_IPV4: switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: nf_send_unreach(pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset(pkt->skb, pkt->ops->hooknum); + nf_send_reset(pkt->net, pkt->skb, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach(pkt->skb, nft_reject_icmp_code(priv->icmp_code), - pkt->ops->hooknum); + pkt->hook); break; } break; case NFPROTO_IPV6: switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach6(net, pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code, + pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + nf_send_reset6(pkt->net, pkt->skb, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: - nf_send_unreach6(net, pkt->skb, + nf_send_unreach6(pkt->net, pkt->skb, nft_reject_icmpv6_code(priv->icmp_code), - pkt->ops->hooknum); + pkt->hook); break; } break; diff --git a/kernel/net/netfilter/x_tables.c b/kernel/net/netfilter/x_tables.c index 51a459c3c..d4aaad747 100644 --- a/kernel/net/netfilter/x_tables.c +++ b/kernel/net/netfilter/x_tables.c @@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_IPV6] = "ip6", }; -/* Allow this many total (re)entries. */ -static const unsigned int xt_jumpstack_multiplier = 2; - /* Registration hooks for targets. 
*/ int xt_register_target(struct xt_target *target) { @@ -658,35 +655,23 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user); struct xt_table_info *xt_alloc_table_info(unsigned int size) { - struct xt_table_info *newinfo; - int cpu; + struct xt_table_info *info = NULL; + size_t sz = sizeof(*info) + size; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); - if (!newinfo) - return NULL; - - newinfo->size = size; - - for_each_possible_cpu(cpu) { - if (size <= PAGE_SIZE) - newinfo->entries[cpu] = kmalloc_node(size, - GFP_KERNEL, - cpu_to_node(cpu)); - else - newinfo->entries[cpu] = vmalloc_node(size, - cpu_to_node(cpu)); - - if (newinfo->entries[cpu] == NULL) { - xt_free_table_info(newinfo); + if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!info) { + info = vmalloc(sz); + if (!info) return NULL; - } } - - return newinfo; + memset(info, 0, sizeof(*info)); + info->size = size; + return info; } EXPORT_SYMBOL(xt_alloc_table_info); @@ -694,18 +679,13 @@ void xt_free_table_info(struct xt_table_info *info) { int cpu; - for_each_possible_cpu(cpu) - kvfree(info->entries[cpu]); - if (info->jumpstack != NULL) { for_each_possible_cpu(cpu) kvfree(info->jumpstack[cpu]); kvfree(info->jumpstack); } - free_percpu(info->stackptr); - - kfree(info); + kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); @@ -747,15 +727,14 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock); DEFINE_PER_CPU(seqcount_t, xt_recseq); EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); +struct static_key xt_tee_enabled __read_mostly; +EXPORT_SYMBOL_GPL(xt_tee_enabled); + static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; int cpu; - i->stackptr = alloc_percpu(unsigned int); - if (i->stackptr == NULL) - return -ENOMEM; - size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) i->jumpstack = vzalloc(size); @@ 
-764,8 +743,21 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) if (i->jumpstack == NULL) return -ENOMEM; - i->stacksize *= xt_jumpstack_multiplier; - size = sizeof(void *) * i->stacksize; + /* ruleset without jumps -- no stack needed */ + if (i->stacksize == 0) + return 0; + + /* Jumpstack needs to be able to record two full callchains, one + * from the first rule set traversal, plus one table reentrancy + * via -j TEE without clobbering the callchain that brought us to + * TEE target. + * + * This is done by allocating two jumpstacks per cpu, on reentry + * the upper half of the stack is used. + * + * see the jumpstack setup in ipt_do_table() for more details. + */ + size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { if (size > PAGE_SIZE) i->jumpstack[cpu] = vmalloc_node(size, @@ -947,11 +939,9 @@ static int xt_table_seq_show(struct seq_file *seq, void *v) { struct xt_table *table = list_entry(v, struct xt_table, list); - if (strlen(table->name)) { + if (*table->name) seq_printf(seq, "%s\n", table->name); - return seq_has_overflowed(seq); - } else - return 0; + return 0; } static const struct seq_operations xt_table_seq_ops = { @@ -1087,10 +1077,8 @@ static int xt_match_seq_show(struct seq_file *seq, void *v) if (trav->curr == trav->head) return 0; match = list_entry(trav->curr, struct xt_match, list); - if (*match->name == '\0') - return 0; - seq_printf(seq, "%s\n", match->name); - return seq_has_overflowed(seq); + if (*match->name) + seq_printf(seq, "%s\n", match->name); } return 0; } @@ -1142,10 +1130,8 @@ static int xt_target_seq_show(struct seq_file *seq, void *v) if (trav->curr == trav->head) return 0; target = list_entry(trav->curr, struct xt_target, list); - if (*target->name == '\0') - return 0; - seq_printf(seq, "%s\n", target->name); - return seq_has_overflowed(seq); + if (*target->name) + seq_printf(seq, "%s\n", target->name); } return 0; } @@ -1207,7 +1193,6 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table 
*table, nf_hookfn *fn) if (!(hook_mask & 1)) continue; ops[i].hook = fn; - ops[i].owner = table->me; ops[i].pf = table->af; ops[i].hooknum = hooknum; ops[i].priority = table->priority; diff --git a/kernel/net/netfilter/xt_CT.c b/kernel/net/netfilter/xt_CT.c index 75747aecd..e7ac07e53 100644 --- a/kernel/net/netfilter/xt_CT.c +++ b/kernel/net/netfilter/xt_CT.c @@ -171,6 +171,9 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, if (timeout_ext == NULL) ret = -ENOMEM; + rcu_read_unlock(); + return ret; + err_put_timeout: __xt_ct_tg_timeout_put(timeout); out: @@ -181,10 +184,23 @@ out: #endif } +static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info) +{ + switch (info->flags & (XT_CT_ZONE_DIR_ORIG | + XT_CT_ZONE_DIR_REPL)) { + case XT_CT_ZONE_DIR_ORIG: + return NF_CT_ZONE_DIR_ORIG; + case XT_CT_ZONE_DIR_REPL: + return NF_CT_ZONE_DIR_REPL; + default: + return NF_CT_DEFAULT_ZONE_DIR; + } +} + static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { - struct nf_conntrack_tuple t; + struct nf_conntrack_zone zone; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -194,7 +210,9 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, } #ifndef CONFIG_NF_CONNTRACK_ZONES - if (info->zone) + if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG | + XT_CT_ZONE_DIR_REPL | + XT_CT_ZONE_MARK)) goto err1; #endif @@ -202,11 +220,17 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (ret < 0) goto err1; - memset(&t, 0, sizeof(t)); - ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL); - ret = PTR_ERR(ct); - if (IS_ERR(ct)) + memset(&zone, 0, sizeof(zone)); + zone.id = info->zone; + zone.dir = xt_ct_flags_to_dir(info); + if (info->flags & XT_CT_ZONE_MARK) + zone.flags |= NF_CT_FLAG_MARK; + + ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL); + if (!ct) { + ret = -ENOMEM; goto err2; + } ret = 0; if ((info->ct_events || info->exp_events) && @@ -227,14 +251,14 @@ static int 
xt_ct_tg_check(const struct xt_tgchk_param *par, if (ret < 0) goto err3; } - - nf_conntrack_tmpl_insert(par->net, ct); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_get(&ct->ct_general); out: info->ct = ct; return 0; err3: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err2: nf_ct_l3proto_module_put(par->family); err1: @@ -297,8 +321,10 @@ static void xt_ct_destroy_timeout(struct nf_conn *ct) if (timeout_put) { timeout_ext = nf_ct_timeout_find(ct); - if (timeout_ext) + if (timeout_ext) { timeout_put(timeout_ext->timeout); + RCU_INIT_POINTER(timeout_ext->timeout, NULL); + } } rcu_read_unlock(); #endif diff --git a/kernel/net/netfilter/xt_IDLETIMER.c b/kernel/net/netfilter/xt_IDLETIMER.c index f407ebc13..29d2c31f4 100644 --- a/kernel/net/netfilter/xt_IDLETIMER.c +++ b/kernel/net/netfilter/xt_IDLETIMER.c @@ -126,6 +126,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) goto out; } + sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; diff --git a/kernel/net/netfilter/xt_LOG.c b/kernel/net/netfilter/xt_LOG.c index c13b79440..1763ab82b 100644 --- a/kernel/net/netfilter/xt_LOG.c +++ b/kernel/net/netfilter/xt_LOG.c @@ -33,7 +33,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_log_info *loginfo = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; diff --git a/kernel/net/netfilter/xt_NFLOG.c b/kernel/net/netfilter/xt_NFLOG.c index fb7497c92..a1fa2c800 100644 --- a/kernel/net/netfilter/xt_NFLOG.c +++ b/kernel/net/netfilter/xt_NFLOG.c @@ -26,7 +26,7 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_nflog_info *info = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? 
par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; diff --git a/kernel/net/netfilter/xt_TCPMSS.c b/kernel/net/netfilter/xt_TCPMSS.c index e762de5ee..b7c43def0 100644 --- a/kernel/net/netfilter/xt_TCPMSS.c +++ b/kernel/net/netfilter/xt_TCPMSS.c @@ -108,7 +108,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); if (dst_mtu(skb_dst(skb)) <= minlen) { @@ -144,7 +144,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, inet_proto_csum_replace2(&tcph->check, skb, htons(oldmss), htons(newmss), - 0); + false); return 0; } } @@ -185,18 +185,18 @@ tcpmss_mangle_packet(struct sk_buff *skb, memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, - htons(len), htons(len + TCPOLEN_MSS), 1); + htons(len), htons(len + TCPOLEN_MSS), true); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; opt[3] = newmss & 0x00ff; - inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0); + inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; inet_proto_csum_replace2(&tcph->check, skb, - oldval, ((__be16 *)tcph)[6], 0); + oldval, ((__be16 *)tcph)[6], false); return TCPOLEN_MSS; } @@ -277,6 +277,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) "FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; @@ -299,6 +302,9 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par) "FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; diff --git 
a/kernel/net/netfilter/xt_TCPOPTSTRIP.c b/kernel/net/netfilter/xt_TCPOPTSTRIP.c index 625fa1d63..eb92bffff 100644 --- a/kernel/net/netfilter/xt_TCPOPTSTRIP.c +++ b/kernel/net/netfilter/xt_TCPOPTSTRIP.c @@ -80,7 +80,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, n <<= 8; } inet_proto_csum_replace2(&tcph->check, skb, htons(o), - htons(n), 0); + htons(n), false); } memset(opt + i, TCPOPT_NOP, optl); } diff --git a/kernel/net/netfilter/xt_TEE.c b/kernel/net/netfilter/xt_TEE.c index 292934d23..3eff7b67c 100644 --- a/kernel/net/netfilter/xt_TEE.c +++ b/kernel/net/netfilter/xt_TEE.c @@ -10,26 +10,15 @@ * modify it under the terms of the GNU General Public License * version 2 or later, as published by the Free Software Foundation. */ -#include #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include #include -#if IS_ENABLED(CONFIG_NF_CONNTRACK) -# define WITH_CONNTRACK 1 -# include -#endif - struct xt_tee_priv { struct notifier_block notifier; struct xt_tee_tginfo *tginfo; @@ -37,162 +26,27 @@ struct xt_tee_priv { }; static const union nf_inet_addr tee_zero_address; -static DEFINE_PER_CPU(bool, tee_active); - -static struct net *pick_net(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_NS - const struct dst_entry *dst; - - if (skb->dev != NULL) - return dev_net(skb->dev); - dst = skb_dst(skb); - if (dst != NULL && dst->dev != NULL) - return dev_net(dst->dev); -#endif - return &init_net; -} - -static bool -tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info) -{ - const struct iphdr *iph = ip_hdr(skb); - struct net *net = pick_net(skb); - struct rtable *rt; - struct flowi4 fl4; - - memset(&fl4, 0, sizeof(fl4)); - if (info->priv) { - if (info->priv->oif == -1) - return false; - fl4.flowi4_oif = info->priv->oif; - } - fl4.daddr = info->gw.ip; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; - rt 
= ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return false; - - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - skb->dev = rt->dst.dev; - skb->protocol = htons(ETH_P_IP); - return true; -} static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - struct iphdr *iph; + int oif = info->priv ? info->priv->oif : 0; - if (__this_cpu_read(tee_active)) - return XT_CONTINUE; - /* - * Copy the skb, and route the copy. Will later return %XT_CONTINUE for - * the original skb, which should continue on its way as if nothing has - * happened. The copy should be independently delivered to the TEE - * --gateway. - */ - skb = pskb_copy(skb, GFP_ATOMIC); - if (skb == NULL) - return XT_CONTINUE; - -#ifdef WITH_CONNTRACK - /* Avoid counting cloned packets towards the original connection. */ - nf_conntrack_put(skb->nfct); - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); -#endif - /* - * If we are in PREROUTING/INPUT, the checksum must be recalculated - * since the length could have changed as a result of defragmentation. - * - * We also decrease the TTL to mitigate potential TEE loops - * between two hosts. - * - * Set %IP_DF so that the original source is notified of a potentially - * decreased MTU on the clone route. IPv6 does this too. 
- */ - iph = ip_hdr(skb); - iph->frag_off |= htons(IP_DF); - if (par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_IN) - --iph->ttl; - ip_send_check(iph); + nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif); - if (tee_tg_route4(skb, info)) { - __this_cpu_write(tee_active, true); - ip_local_out(skb); - __this_cpu_write(tee_active, false); - } else { - kfree_skb(skb); - } return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_IPV6) -static bool -tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) -{ - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct net *net = pick_net(skb); - struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - if (info->priv) { - if (info->priv->oif == -1) - return false; - fl6.flowi6_oif = info->priv->oif; - } - fl6.daddr = info->gw.in6; - fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | - (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - dst_release(dst); - return false; - } - skb_dst_drop(skb); - skb_dst_set(skb, dst); - skb->dev = dst->dev; - skb->protocol = htons(ETH_P_IPV6); - return true; -} - +#if IS_ENABLED(CONFIG_NF_DUP_IPV6) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; + int oif = info->priv ? 
info->priv->oif : 0; - if (__this_cpu_read(tee_active)) - return XT_CONTINUE; - skb = pskb_copy(skb, GFP_ATOMIC); - if (skb == NULL) - return XT_CONTINUE; + nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif); -#ifdef WITH_CONNTRACK - nf_conntrack_put(skb->nfct); - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); -#endif - if (par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_IN) { - struct ipv6hdr *iph = ipv6_hdr(skb); - --iph->hop_limit; - } - if (tee_tg_route6(skb, info)) { - __this_cpu_write(tee_active, true); - ip6_local_out(skb); - __this_cpu_write(tee_active, false); - } else { - kfree_skb(skb); - } return XT_CONTINUE; } #endif @@ -251,6 +105,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par) } else info->priv = NULL; + static_key_slow_inc(&xt_tee_enabled); return 0; } @@ -262,6 +117,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par) unregister_netdevice_notifier(&info->priv->notifier); kfree(info->priv); } + static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { @@ -275,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_NF_DUP_IPV6) { .name = "TEE", .revision = 1, diff --git a/kernel/net/netfilter/xt_TPROXY.c b/kernel/net/netfilter/xt_TPROXY.c index cca96cec1..3ab591e73 100644 --- a/kernel/net/netfilter/xt_TPROXY.c +++ b/kernel/net/netfilter/xt_TPROXY.c @@ -250,8 +250,8 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, * no such listener is found, or NULL if the TCP header is incomplete. 
*/ static struct sock * -tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, - struct sock *sk) +tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, + __be32 laddr, __be16 lport, struct sock *sk) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr _hdr, *hp; @@ -267,13 +267,12 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } @@ -291,7 +290,7 @@ nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) } static unsigned int -tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, +tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) { const struct iphdr *iph = ip_hdr(skb); @@ -306,7 +305,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + sk = nf_tproxy_get_sock_v4(net, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NFT_LOOKUP_ESTABLISHED); @@ -318,11 +317,11 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) /* reopening a TIME_WAIT connection needs special handling */ - sk = tproxy_handle_time_wait4(skb, laddr, lport, sk); + sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk); else if (!sk) /* no, there's no established connection, check if * there's a listener on 
the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + sk = nf_tproxy_get_sock_v4(net, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NFT_LOOKUP_LISTENER); @@ -352,7 +351,7 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info *tgi = par->targinfo; - return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); + return tproxy_tg4(par->net, skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); } static unsigned int @@ -360,7 +359,7 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; - return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); + return tproxy_tg4(par->net, skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); } #ifdef XT_TPROXY_HAVE_IPV6 @@ -430,15 +429,14 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk2 = nf_tproxy_get_sock_v6(par->net, tproto, &iph->saddr, tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), hp->source, tgi->lport ? 
tgi->lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } @@ -474,7 +472,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk = nf_tproxy_get_sock_v6(par->net, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, par->in, NFT_LOOKUP_ESTABLISHED); @@ -489,7 +487,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk = nf_tproxy_get_sock_v6(par->net, tproto, &iph->saddr, laddr, hp->source, lport, par->in, NFT_LOOKUP_LISTENER); diff --git a/kernel/net/netfilter/xt_addrtype.c b/kernel/net/netfilter/xt_addrtype.c index fab6eea1b..11d609199 100644 --- a/kernel/net/netfilter/xt_addrtype.c +++ b/kernel/net/netfilter/xt_addrtype.c @@ -73,7 +73,7 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; - if (rt->rt6i_flags & RTF_ANYCAST) + if (ipv6_anycast_destination((struct dst_entry *)rt, addr)) ret |= XT_ADDRTYPE_ANYCAST; dst_release(&rt->dst); @@ -125,7 +125,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev, static bool addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? 
par->in : par->out); + struct net *net = par->net; const struct xt_addrtype_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; @@ -143,7 +143,7 @@ addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) static bool addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; const struct xt_addrtype_info_v1 *info = par->matchinfo; const struct iphdr *iph; const struct net_device *dev = NULL; diff --git a/kernel/net/netfilter/xt_connlabel.c b/kernel/net/netfilter/xt_connlabel.c index 9f8719df2..bb9cbeb18 100644 --- a/kernel/net/netfilter/xt_connlabel.c +++ b/kernel/net/netfilter/xt_connlabel.c @@ -42,10 +42,6 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) XT_CONNLABEL_OP_SET; struct xt_connlabel_mtinfo *info = par->matchinfo; int ret; - size_t words; - - if (info->bit > XT_CONNLABEL_MAXBIT) - return -ERANGE; if (info->options & ~options) { pr_err("Unknown options in mask %x\n", info->options); @@ -59,19 +55,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) return ret; } - par->net->ct.labels_used++; - words = BITS_TO_LONGS(info->bit+1); - if (words > par->net->ct.label_words) - par->net->ct.label_words = words; - + ret = nf_connlabels_get(par->net, info->bit + 1); + if (ret < 0) + nf_ct_l3proto_module_put(par->family); return ret; } static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) { - par->net->ct.labels_used--; - if (par->net->ct.labels_used == 0) - par->net->ct.label_words = 0; + nf_connlabels_put(par->net); nf_ct_l3proto_module_put(par->family); } diff --git a/kernel/net/netfilter/xt_connlimit.c b/kernel/net/netfilter/xt_connlimit.c index 29ba6218a..99bbc8298 100644 --- a/kernel/net/netfilter/xt_connlimit.c +++ b/kernel/net/netfilter/xt_connlimit.c @@ -134,7 +134,7 @@ static bool add_hlist(struct hlist_head *head, static unsigned int check_hlist(struct net 
*net, struct hlist_head *head, const struct nf_conntrack_tuple *tuple, - u16 zone, + const struct nf_conntrack_zone *zone, bool *addit) { const struct nf_conntrack_tuple_hash *found; @@ -201,7 +201,7 @@ static unsigned int count_tree(struct net *net, struct rb_root *root, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, const union nf_inet_addr *mask, - u8 family, u16 zone) + u8 family, const struct nf_conntrack_zone *zone) { struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; @@ -290,7 +290,8 @@ static int count_them(struct net *net, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, const union nf_inet_addr *mask, - u_int8_t family, u16 zone) + u_int8_t family, + const struct nf_conntrack_zone *zone) { struct rb_root *root; int count; @@ -316,22 +317,22 @@ static int count_them(struct net *net, static bool connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? 
par->in : par->out); + struct net *net = par->net; const struct xt_connlimit_info *info = par->matchinfo; union nf_inet_addr addr; struct nf_conntrack_tuple tuple; const struct nf_conntrack_tuple *tuple_ptr = &tuple; + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int connections; - u16 zone = NF_CT_DEFAULT_ZONE; ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) { tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; zone = nf_ct_zone(ct); } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - par->family, &tuple)) { + par->family, net, &tuple)) { goto hotdrop; } diff --git a/kernel/net/netfilter/xt_ipvs.c b/kernel/net/netfilter/xt_ipvs.c index 8d47c3780..71a9d95e0 100644 --- a/kernel/net/netfilter/xt_ipvs.c +++ b/kernel/net/netfilter/xt_ipvs.c @@ -48,6 +48,7 @@ static bool ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_ipvs_mtinfo *data = par->matchinfo; + struct netns_ipvs *ipvs = net_ipvs(par->net); /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. 
*/ const u_int8_t family = par->family; struct ip_vs_iphdr iph; @@ -67,7 +68,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) goto out; } - ip_vs_fill_iph_skb(family, skb, &iph); + ip_vs_fill_iph_skb(family, skb, true, &iph); if (data->bitmask & XT_IPVS_PROTO) if ((iph.protocol == data->l4proto) ^ @@ -85,7 +86,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */); + cp = pp->conn_out_get(ipvs, family, skb, &iph); if (unlikely(cp == NULL)) { match = false; goto out; diff --git a/kernel/net/netfilter/xt_mark.c b/kernel/net/netfilter/xt_mark.c index 233452387..ebd41dc50 100644 --- a/kernel/net/netfilter/xt_mark.c +++ b/kernel/net/netfilter/xt_mark.c @@ -23,6 +23,7 @@ MODULE_ALIAS("ipt_mark"); MODULE_ALIAS("ip6t_mark"); MODULE_ALIAS("ipt_MARK"); MODULE_ALIAS("ip6t_MARK"); +MODULE_ALIAS("arpt_MARK"); static unsigned int mark_tg(struct sk_buff *skb, const struct xt_action_param *par) diff --git a/kernel/net/netfilter/xt_nfacct.c b/kernel/net/netfilter/xt_nfacct.c index 8c646ed9c..3048a7e3a 100644 --- a/kernel/net/netfilter/xt_nfacct.c +++ b/kernel/net/netfilter/xt_nfacct.c @@ -37,7 +37,7 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par) struct xt_nfacct_match_info *info = par->matchinfo; struct nf_acct *nfacct; - nfacct = nfnl_acct_find_get(info->name); + nfacct = nfnl_acct_find_get(par->net, info->name); if (nfacct == NULL) { pr_info("xt_nfacct: accounting object with name `%s' " "does not exists\n", info->name); diff --git a/kernel/net/netfilter/xt_osf.c b/kernel/net/netfilter/xt_osf.c index 0778855ea..df8801e02 100644 --- a/kernel/net/netfilter/xt_osf.c +++ b/kernel/net/netfilter/xt_osf.c @@ -200,7 +200,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) unsigned char opts[MAX_IPOPTLEN]; const struct xt_osf_finger *kf; const struct xt_osf_user_finger *f; - struct net *net = 
dev_net(p->in ? p->in : p->out); + struct net *net = p->net; if (!info) return false; diff --git a/kernel/net/netfilter/xt_owner.c b/kernel/net/netfilter/xt_owner.c index ca2e577ed..1302b475a 100644 --- a/kernel/net/netfilter/xt_owner.c +++ b/kernel/net/netfilter/xt_owner.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -33,8 +34,9 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_owner_match_info *info = par->matchinfo; const struct file *filp; + struct sock *sk = skb_to_full_sk(skb); - if (skb->sk == NULL || skb->sk->sk_socket == NULL) + if (sk == NULL || sk->sk_socket == NULL) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* @@ -43,7 +45,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) */ return false; - filp = skb->sk->sk_socket->file; + filp = sk->sk_socket->file; if (filp == NULL) return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; diff --git a/kernel/net/netfilter/xt_recent.c b/kernel/net/netfilter/xt_recent.c index 45e1b30e4..d725a2774 100644 --- a/kernel/net/netfilter/xt_recent.c +++ b/kernel/net/netfilter/xt_recent.c @@ -237,7 +237,7 @@ static void recent_table_flush(struct recent_table *t) static bool recent_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; struct recent_net *recent_net = recent_pernet(net); const struct xt_recent_mtinfo_v1 *info = par->matchinfo; struct recent_table *t; diff --git a/kernel/net/netfilter/xt_set.c b/kernel/net/netfilter/xt_set.c index 89045982e..5669e5b45 100644 --- a/kernel/net/netfilter/xt_set.c +++ b/kernel/net/netfilter/xt_set.c @@ -9,14 +9,16 @@ */ /* Kernel module which implements the set match and SET target - * for netfilter/iptables. */ + * for netfilter/iptables. 
+ */ #include #include #include -#include +#include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -52,6 +54,7 @@ static bool set_match_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v0 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.u.compat.dim, info->match_set.u.compat.flags, 0, UINT_MAX); @@ -68,10 +71,10 @@ compat_flags(struct xt_set_info_v0 *info) info->u.compat.dim = IPSET_DIM_ZERO; if (info->u.flags[0] & IPSET_MATCH_INV) info->u.compat.flags |= IPSET_INV_MATCH; - for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) { + for (i = 0; i < IPSET_DIM_MAX - 1 && info->u.flags[i]; i++) { info->u.compat.dim++; if (info->u.flags[i] & IPSET_SRC) - info->u.compat.flags |= (1<u.compat.dim); + info->u.compat.flags |= (1 << info->u.compat.dim); } } @@ -88,7 +91,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par) info->match_set.index); return -ENOENT; } - if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { + if (info->match_set.u.flags[IPSET_DIM_MAX - 1] != 0) { pr_warn("Protocol error: set match dimension is over the limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; @@ -114,6 +117,7 @@ static bool set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v1 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, 0, UINT_MAX); @@ -178,9 +182,10 @@ static bool set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v3 *info = par->matchinfo; + int ret; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, info->flags, UINT_MAX); - int ret; if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) @@ -224,9 +229,10 @@ static bool set_match_v4(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v4 *info = par->matchinfo; + int 
ret; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, info->flags, UINT_MAX); - int ret; if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) @@ -252,6 +258,7 @@ static unsigned int set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v0 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim, info->add_set.u.compat.flags, 0, UINT_MAX); ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim, @@ -290,8 +297,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) return -ENOENT; } } - if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || - info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { + if (info->add_set.u.flags[IPSET_DIM_MAX - 1] != 0 || + info->del_set.u.flags[IPSET_DIM_MAX - 1] != 0) { pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); @@ -324,6 +331,7 @@ static unsigned int set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v1 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, 0, UINT_MAX); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -392,6 +400,7 @@ static unsigned int set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v2 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -399,8 +408,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; if 
(info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -418,6 +427,8 @@ static unsigned int set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v3 *info = par->targinfo; + int ret; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -425,12 +436,10 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) ADT_OPT(map_opt, par->family, info->map_set.dim, info->map_set.flags, 0, UINT_MAX); - int ret; - /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -456,7 +465,6 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } - static int set_target_v3_checkentry(const struct xt_tgchk_param *par) { @@ -496,8 +504,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) !(par->hook_mask & (1 << NF_INET_FORWARD | 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { - pr_warn("mapping of prio or/and queue is allowed only" - "from OUTPUT/FORWARD/POSTROUTING chains\n"); + pr_warn("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); return -EINVAL; } index = ip_set_nfnl_get_byindex(par->net, @@ -518,8 +525,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (info->add_set.dim > IPSET_DIM_MAX || info->del_set.dim > IPSET_DIM_MAX || info->map_set.dim > IPSET_DIM_MAX) { - pr_warn("Protocol error: SET target dimension " - "is over the limit!\n"); + pr_warn("Protocol 
error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) @@ -545,7 +551,6 @@ set_target_v3_destroy(const struct xt_tgdtor_param *par) ip_set_nfnl_put(par->net, info->map_set.index); } - static struct xt_match set_matches[] __read_mostly = { { .name = "set", diff --git a/kernel/net/netfilter/xt_socket.c b/kernel/net/netfilter/xt_socket.c index e092cb046..2ec08f04b 100644 --- a/kernel/net/netfilter/xt_socket.c +++ b/kernel/net/netfilter/xt_socket.c @@ -143,7 +143,8 @@ static bool xt_socket_sk_is_transparent(struct sock *sk) } } -static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb, +static struct sock *xt_socket_lookup_slow_v4(struct net *net, + const struct sk_buff *skb, const struct net_device *indev) { const struct iphdr *iph = ip_hdr(skb); @@ -197,7 +198,7 @@ static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb, } #endif - return xt_socket_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr, + return xt_socket_get_sock_v4(net, protocol, saddr, daddr, sport, dport, indev); } @@ -205,10 +206,11 @@ static bool socket_match(const struct sk_buff *skb, struct xt_action_param *par, const struct xt_socket_mtinfo1 *info) { + struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; if (!sk) - sk = xt_socket_lookup_slow_v4(skb, par->in); + sk = xt_socket_lookup_slow_v4(par->net, skb, par->in); if (sk) { bool wildcard; bool transparent = true; @@ -226,6 +228,10 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, if (info->flags & XT_SOCKET_TRANSPARENT) transparent = xt_socket_sk_is_transparent(sk); + if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && + transparent) + pskb->mark = sk->sk_mark; + if (sk != skb->sk) sock_gen_put(sk); @@ -247,7 +253,7 @@ socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) } static bool -socket_mt4_v1_v2(const struct 
sk_buff *skb, struct xt_action_param *par) +socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) { return socket_match(skb, par, par->matchinfo); } @@ -330,7 +336,8 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol, return NULL; } -static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb, +static struct sock *xt_socket_lookup_slow_v6(struct net *net, + const struct sk_buff *skb, const struct net_device *indev) { __be16 uninitialized_var(dport), uninitialized_var(sport); @@ -366,18 +373,19 @@ static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb, return NULL; } - return xt_socket_get_sock_v6(dev_net(skb->dev), tproto, saddr, daddr, + return xt_socket_get_sock_v6(net, tproto, saddr, daddr, sport, dport, indev); } static bool -socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; if (!sk) - sk = xt_socket_lookup_slow_v6(skb, par->in); + sk = xt_socket_lookup_slow_v6(par->net, skb, par->in); if (sk) { bool wildcard; bool transparent = true; @@ -395,6 +403,10 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) if (info->flags & XT_SOCKET_TRANSPARENT) transparent = xt_socket_sk_is_transparent(sk); + if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && + transparent) + pskb->mark = sk->sk_mark; + if (sk != skb->sk) sock_gen_put(sk); @@ -428,6 +440,19 @@ static int socket_mt_v2_check(const struct xt_mtchk_param *par) return 0; } +static int socket_mt_v3_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo3 *info = + (struct xt_socket_mtinfo3 *)par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V3) { + pr_info("unknown flags 0x%x\n", + info->flags & ~XT_SOCKET_FLAGS_V3); + return -EINVAL; + } + return 0; 
+} + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", @@ -442,7 +467,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV4, - .match = socket_mt4_v1_v2, + .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -454,7 +479,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV6, - .match = socket_mt6_v1_v2, + .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -466,7 +491,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 2, .family = NFPROTO_IPV4, - .match = socket_mt4_v1_v2, + .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -478,13 +503,37 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 2, .family = NFPROTO_IPV6, - .match = socket_mt6_v1_v2, + .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, +#endif + { + .name = "socket", + .revision = 3, + .family = NFPROTO_IPV4, + .match = socket_mt4_v1_v2_v3, + .checkentry = socket_mt_v3_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 3, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1_v2_v3, + .checkentry = socket_mt_v3_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, #endif }; diff --git 
a/kernel/net/netlink/af_netlink.c b/kernel/net/netlink/af_netlink.c index 980121e75..59651af8c 100644 --- a/kernel/net/netlink/af_netlink.c +++ b/kernel/net/netlink/af_netlink.c @@ -76,17 +76,19 @@ struct listeners { }; /* state bits */ -#define NETLINK_CONGESTED 0x0 +#define NETLINK_S_CONGESTED 0x0 /* flags */ -#define NETLINK_KERNEL_SOCKET 0x1 -#define NETLINK_RECV_PKTINFO 0x2 -#define NETLINK_BROADCAST_SEND_ERROR 0x4 -#define NETLINK_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_KERNEL_SOCKET 0x1 +#define NETLINK_F_RECV_PKTINFO 0x2 +#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_F_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_LISTEN_ALL_NSID 0x10 +#define NETLINK_F_CAP_ACK 0x20 static inline int netlink_is_kernel(struct sock *sk) { - return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; + return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; } struct netlink_table *nl_table __read_mostly; @@ -175,7 +177,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt) out: spin_unlock(&netlink_tap_lock); - if (found && nt->module) + if (found) module_put(nt->module); return found ? 
0 : -ENODEV; @@ -278,8 +280,9 @@ static void netlink_overrun(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); - if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { - if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { + if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(NETLINK_S_CONGESTED, + &nlk_sk(sk)->state)) { sk->sk_err = ENOBUFS; sk->sk_error_report(sk); } @@ -292,8 +295,8 @@ static void netlink_rcv_wake(struct sock *sk) struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(NETLINK_CONGESTED, &nlk->state); - if (!test_bit(NETLINK_CONGESTED, &nlk->state)) + clear_bit(NETLINK_S_CONGESTED, &nlk->state); + if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } @@ -608,16 +611,6 @@ netlink_current_frame(const struct netlink_ring *ring, return netlink_lookup_frame(ring, ring->head, status); } -static struct nl_mmap_hdr * -netlink_previous_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - unsigned int prev; - - prev = ring->head ? ring->head - 1 : ring->frame_max; - return netlink_lookup_frame(ring, prev, status); -} - static void netlink_increment_head(struct netlink_ring *ring) { ring->head = ring->head != ring->frame_max ? 
ring->head + 1 : 0; @@ -625,11 +618,11 @@ static void netlink_increment_head(struct netlink_ring *ring) static void netlink_forward_ring(struct netlink_ring *ring) { - unsigned int head = ring->head, pos = head; + unsigned int head = ring->head; const struct nl_mmap_hdr *hdr; do { - hdr = __netlink_lookup_frame(ring, pos); + hdr = __netlink_lookup_frame(ring, ring->head); if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) break; if (hdr->nm_status != NL_MMAP_STATUS_SKIP) @@ -638,6 +631,21 @@ static void netlink_forward_ring(struct netlink_ring *ring) } while (ring->head != head); } +static bool netlink_has_valid_frame(struct netlink_ring *ring) +{ + unsigned int head = ring->head, pos = head; + const struct nl_mmap_hdr *hdr; + + do { + hdr = __netlink_lookup_frame(ring, pos); + if (hdr->nm_status == NL_MMAP_STATUS_VALID) + return true; + pos = pos != 0 ? pos - 1 : ring->frame_max; + } while (pos != head); + + return false; +} + static bool netlink_dump_space(struct netlink_sock *nlk) { struct netlink_ring *ring = &nlk->rx_ring; @@ -683,13 +691,19 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, mask = datagram_poll(file, sock, wait); - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - netlink_forward_ring(&nlk->rx_ring); - if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) - mask |= POLLIN | POLLRDNORM; + /* We could already have received frames in the normal receive + * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, + * so if mask contains pollin/etc already, there's no point + * walking the ring. 
+ */ + if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + if (netlink_has_valid_frame(&nlk->rx_ring)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); } - spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (nlk->tx_ring.pg_vec) { @@ -1118,6 +1132,7 @@ static int netlink_insert(struct sock *sk, u32 portid) if (err == -EEXIST) err = -EADDRINUSE; sock_put(sk); + goto err; } /* We need to ensure that the socket is hashed and visible. */ @@ -1157,14 +1172,15 @@ static struct proto netlink_proto = { }; static int __netlink_create(struct net *net, struct socket *sock, - struct mutex *cb_mutex, int protocol) + struct mutex *cb_mutex, int protocol, + int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; - sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; @@ -1226,7 +1242,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, if (err < 0) goto out; - err = __netlink_create(net, sock, cb_mutex, protocol); + err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; @@ -1336,20 +1352,24 @@ static int netlink_autobind(struct socket *sock) struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; - static s32 rover = -4097; + s32 rover = -4096; + bool ok; retry: cond_resched(); rcu_read_lock(); - if (__netlink_lookup(table, portid, net)) { + ok = !__netlink_lookup(table, portid, net); + rcu_read_unlock(); + if (!ok) { /* Bind collision, search negative portid values. 
*/ - portid = rover--; - if (rover > -4097) + if (rover == -4096) + /* rover will be in range [S32_MIN, -4097] */ + rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN); + else if (rover >= -4096) rover = -4097; - rcu_read_unlock(); + portid = rover--; goto retry; } - rcu_read_unlock(); err = netlink_insert(sk, portid); if (err == -EADDRINUSE) @@ -1708,7 +1728,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !netlink_skb_is_mmaped(skb)) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { @@ -1723,7 +1743,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); @@ -1856,15 +1876,16 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) +struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, + unsigned int ldiff, u32 dst_portid, + gfp_t gfp_mask) { #ifdef CONFIG_NETLINK_MMAP + unsigned int maxlen, linear_size; struct sock *sk = NULL; struct sk_buff *skb; struct netlink_ring *ring; struct nl_mmap_hdr *hdr; - unsigned int maxlen; sk = netlink_getsockbyportid(ssk, dst_portid); if (IS_ERR(sk)) @@ -1875,7 +1896,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, if (ring->pg_vec == NULL) goto out_put; - if (ring->frame_size - NL_MMAP_HDRLEN < size) + /* We need to account the full linear size needed as a ring + * slot cannot have non-linear parts. 
+ */ + linear_size = size + ldiff; + if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) goto out_put; skb = alloc_skb_head(gfp_mask); @@ -1889,13 +1914,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, /* check again under lock */ maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < size) + if (maxlen < linear_size) goto out_free; netlink_forward_ring(ring); hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); if (hdr == NULL) goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); atomic_inc(&ring->pending); @@ -1921,7 +1947,7 @@ out: #endif return alloc_skb(size, gfp_mask); } -EXPORT_SYMBOL_GPL(netlink_alloc_skb); +EXPORT_SYMBOL_GPL(__netlink_alloc_skb); int netlink_has_listeners(struct sock *sk, unsigned int group) { @@ -1947,7 +1973,7 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !test_bit(NETLINK_CONGESTED, &nlk->state)) { + !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); @@ -1983,8 +2009,17 @@ static void do_one_broadcast(struct sock *sk, !test_bit(p->group - 1, nlk->groups)) return; - if (!net_eq(sock_net(sk), p->net)) - return; + if (!net_eq(sock_net(sk), p->net)) { + if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) + return; + + if (!peernet_has_id(sock_net(sk), p->net)) + return; + + if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, + CAP_NET_BROADCAST)) + return; + } if (p->failure) { netlink_overrun(sk); @@ -2008,23 +2043,33 @@ static void do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. 
*/ p->failure = 1; - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; - } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + goto out; + } + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if (sk_filter(sk, p->skb2)) { + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + goto out; + } + NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); + NETLINK_CB(p->skb2).nsid_is_set = true; + val = netlink_broadcast_deliver(sk, p->skb2); + if (val < 0) { netlink_overrun(sk); - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } +out: sock_put(sk); } @@ -2071,7 +2116,7 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid consume_skb(info.skb2); if (info.delivered) { - if (info.congested && (allocation & __GFP_WAIT)) + if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } @@ -2109,7 +2154,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) !test_bit(p->group - 1, nlk->groups)) goto out; - if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) { + if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) { ret = 1; goto out; } @@ -2128,7 +2173,7 @@ out: * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the - * NETLINK_RECV_NO_ENOBUFS socket option. + * NETLINK_NO_ENOBUFS socket option. 
*/ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { @@ -2188,9 +2233,9 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETLINK_PKTINFO: if (val) - nlk->flags |= NETLINK_RECV_PKTINFO; + nlk->flags |= NETLINK_F_RECV_PKTINFO; else - nlk->flags &= ~NETLINK_RECV_PKTINFO; + nlk->flags &= ~NETLINK_F_RECV_PKTINFO; err = 0; break; case NETLINK_ADD_MEMBERSHIP: @@ -2219,18 +2264,18 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, } case NETLINK_BROADCAST_ERROR: if (val) - nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR; else - nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR; err = 0; break; case NETLINK_NO_ENOBUFS: if (val) { - nlk->flags |= NETLINK_RECV_NO_ENOBUFS; - clear_bit(NETLINK_CONGESTED, &nlk->state); + nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS; + clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } else { - nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS; } err = 0; break; @@ -2253,6 +2298,23 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; } #endif /* CONFIG_NETLINK_MMAP */ + case NETLINK_LISTEN_ALL_NSID: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) + return -EPERM; + + if (val) + nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; + else + nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; + err = 0; + break; + case NETLINK_CAP_ACK: + if (val) + nlk->flags |= NETLINK_F_CAP_ACK; + else + nlk->flags &= ~NETLINK_F_CAP_ACK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2279,7 +2341,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 
1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2289,7 +2351,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; + val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2299,7 +2361,39 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; + case NETLINK_LIST_MEMBERSHIPS: { + int pos, idx, shift; + + err = 0; + netlink_lock_table(); + for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { + if (len - pos < sizeof(u32)) + break; + + idx = pos / sizeof(unsigned long); + shift = (pos % sizeof(unsigned long)) * 8; + if (put_user((u32)(nlk->groups[idx] >> shift), + (u32 __user *)(optval + pos))) { + err = -EFAULT; + break; + } + } + if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) + err = -EFAULT; + netlink_unlock_table(); + break; + } + case NETLINK_CAP_ACK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_CAP_ACK ? 
1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2319,6 +2413,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } +static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + if (!NETLINK_CB(skb).nsid_is_set) + return; + + put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), + &NETLINK_CB(skb).nsid); +} + static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -2367,7 +2471,7 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) * sendmsg(), but that's what we've got... */ if (netlink_tx_is_mmaped(sk) && - msg->msg_iter.type == ITER_IOVEC && + iter_is_iovec(&msg->msg_iter) && msg->msg_iter.nr_segs == 1 && msg->msg_iter.iov->iov_base == NULL) { err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, @@ -2473,8 +2577,10 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(*addr); } - if (nlk->flags & NETLINK_RECV_PKTINFO) + if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); + if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); @@ -2528,17 +2634,10 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - /* - * We have to just have a reference on the net from sk, but don't - * get_net it. Besides, we cannot get and then put the net here. - * So we create one inside init_net and the move it to net. 
- */ - - if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; - sk_change_net(sk, net); if (!cfg || cfg->groups < 32) groups = 32; @@ -2557,7 +2656,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, goto out_sock_release; nlk = nlk_sk(sk); - nlk->flags |= NETLINK_KERNEL_SOCKET; + nlk->flags |= NETLINK_F_KERNEL_SOCKET; netlink_table_grab(); if (!nl_table[unit].registered) { @@ -2594,7 +2693,10 @@ EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { - sk_release_kernel(sk); + if (sk == NULL || sk->sk_socket == NULL) + return; + + sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); @@ -2683,6 +2785,7 @@ static int netlink_dump(struct sock *sk) struct sk_buff *skb = NULL; struct nlmsghdr *nlh; int len, err = -ENOBUFS; + int alloc_min_size; int alloc_size; mutex_lock(nlk->cb_mutex); @@ -2691,9 +2794,6 @@ static int netlink_dump(struct sock *sk) goto errout_skb; } - cb = &nlk->cb; - alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); - if (!netlink_rx_is_mmaped(sk) && atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; @@ -2703,23 +2803,35 @@ static int netlink_dump(struct sock *sk) * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. 
*/ - if (alloc_size < nlk->max_recvmsg_len) { - skb = netlink_alloc_skb(sk, - nlk->max_recvmsg_len, - nlk->portid, + cb = &nlk->cb; + alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); + + if (alloc_min_size < nlk->max_recvmsg_len) { + alloc_size = nlk->max_recvmsg_len; + skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - /* available room should be exact amount to avoid MSG_TRUNC */ - if (skb) - skb_reserve(skb, skb_tailroom(skb) - - nlk->max_recvmsg_len); } - if (!skb) + if (!skb) { + alloc_size = alloc_min_size; skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL); + } if (!skb) goto errout_skb; + + /* Trim skb to allocated size. User is expected to provide buffer as + * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at + * netlink_recvmsg())). dump will pack as many smaller messages as + * could fit within the allocated skb. skb is typically allocated + * with larger space than required (could be as much as near 2x the + * requested size with align to next power of 2 approach). Allowing + * dump to use the excess space makes it difficult for a user to have a + * reasonable static buffer based on the expected largest dump of a + * single netdev. The outcome is MSG_TRUNC error. + */ + skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); @@ -2841,9 +2953,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); + struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); - /* error messages get the original request appened */ - if (err) + /* Error messages get the original request appened, unless the user + * requests to cap the error message. 
+ */ + if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), @@ -2866,7 +2981,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = err; - memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh)); + memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); diff --git a/kernel/net/netlink/genetlink.c b/kernel/net/netlink/genetlink.c index 2ed5f9647..bc0e504f3 100644 --- a/kernel/net/netlink/genetlink.c +++ b/kernel/net/netlink/genetlink.c @@ -39,7 +39,7 @@ void genl_unlock(void) EXPORT_SYMBOL(genl_unlock); #ifdef CONFIG_LOCKDEP -int lockdep_genl_is_held(void) +bool lockdep_genl_is_held(void) { return lockdep_is_held(&genl_mutex); } @@ -1136,19 +1136,19 @@ int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb, } EXPORT_SYMBOL(genlmsg_multicast_allns); -void genl_notify(struct genl_family *family, - struct sk_buff *skb, struct net *net, u32 portid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) +void genl_notify(struct genl_family *family, struct sk_buff *skb, + struct genl_info *info, u32 group, gfp_t flags) { + struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; int report = 0; - if (nlh) - report = nlmsg_report(nlh); + if (info->nlhdr) + report = nlmsg_report(info->nlhdr); if (WARN_ON_ONCE(group >= family->n_mcgrps)) return; group = family->mcgrp_offset + group; - nlmsg_notify(sk, skb, portid, group, report, flags); + nlmsg_notify(sk, skb, info->snd_portid, group, report, flags); } EXPORT_SYMBOL(genl_notify); diff --git a/kernel/net/netrom/af_netrom.c b/kernel/net/netrom/af_netrom.c index b987fd56c..ed212ffc1 100644 --- a/kernel/net/netrom/af_netrom.c +++ b/kernel/net/netrom/af_netrom.c @@ -433,7 +433,7 @@ static 
int nr_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto); + sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern); if (sk == NULL) return -ENOMEM; @@ -476,7 +476,7 @@ static struct sock *nr_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; diff --git a/kernel/net/netrom/nr_route.c b/kernel/net/netrom/nr_route.c index 96b64d2f6..d72a4f155 100644 --- a/kernel/net/netrom/nr_route.c +++ b/kernel/net/netrom/nr_route.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/net/nfc/af_nfc.c b/kernel/net/nfc/af_nfc.c index 2277276f5..54e40fa47 100644 --- a/kernel/net/nfc/af_nfc.c +++ b/kernel/net/nfc/af_nfc.c @@ -40,7 +40,7 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto, read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { - rc = proto_tab[proto]->create(net, sock, proto_tab[proto]); + rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern); module_put(proto_tab[proto]->owner); } read_unlock(&proto_tab_lock); diff --git a/kernel/net/nfc/core.c b/kernel/net/nfc/core.c index cff3f1614..1fe3d3b36 100644 --- a/kernel/net/nfc/core.c +++ b/kernel/net/nfc/core.c @@ -449,7 +449,7 @@ error: * @dev: The nfc device that found the target * @target_idx: index of the target that must be deactivated */ -int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx) +int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode) { int rc = 0; @@ -476,7 +476,7 @@ int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx) if (dev->ops->check_presence) del_timer_sync(&dev->check_pres_timer); - 
dev->ops->deactivate_target(dev, dev->active_target); + dev->ops->deactivate_target(dev, dev->active_target, mode); dev->active_target = NULL; error: diff --git a/kernel/net/nfc/digital_core.c b/kernel/net/nfc/digital_core.c index 009bcf317..23c2a118a 100644 --- a/kernel/net/nfc/digital_core.c +++ b/kernel/net/nfc/digital_core.c @@ -631,7 +631,8 @@ static int digital_activate_target(struct nfc_dev *nfc_dev, } static void digital_deactivate_target(struct nfc_dev *nfc_dev, - struct nfc_target *target) + struct nfc_target *target, + u8 mode) { struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev); diff --git a/kernel/net/nfc/hci/core.c b/kernel/net/nfc/hci/core.c index 6e061da22..2b0f0ac49 100644 --- a/kernel/net/nfc/hci/core.c +++ b/kernel/net/nfc/hci/core.c @@ -678,7 +678,8 @@ static int hci_activate_target(struct nfc_dev *nfc_dev, } static void hci_deactivate_target(struct nfc_dev *nfc_dev, - struct nfc_target *target) + struct nfc_target *target, + u8 mode) { } diff --git a/kernel/net/nfc/hci/llc.c b/kernel/net/nfc/hci/llc.c index 1b90c0531..1399a03fa 100644 --- a/kernel/net/nfc/hci/llc.c +++ b/kernel/net/nfc/hci/llc.c @@ -144,11 +144,13 @@ inline int nfc_llc_start(struct nfc_llc *llc) { return llc->ops->start(llc); } +EXPORT_SYMBOL(nfc_llc_start); inline int nfc_llc_stop(struct nfc_llc *llc) { return llc->ops->stop(llc); } +EXPORT_SYMBOL(nfc_llc_stop); inline void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb) { diff --git a/kernel/net/nfc/llcp.h b/kernel/net/nfc/llcp.h index de1789e3c..1f68724d4 100644 --- a/kernel/net/nfc/llcp.h +++ b/kernel/net/nfc/llcp.h @@ -225,7 +225,7 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, struct sk_buff *skb, u8 direction); /* Sock API */ -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp); +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern); void nfc_llcp_sock_free(struct nfc_llcp_sock *sock); void nfc_llcp_accept_unlink(struct sock *sk); 
void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/kernel/net/nfc/llcp_core.c b/kernel/net/nfc/llcp_core.c index b18f07ccb..98876274a 100644 --- a/kernel/net/nfc/llcp_core.c +++ b/kernel/net/nfc/llcp_core.c @@ -934,7 +934,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, sock->ssap = ssap; } - new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC); + new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0); if (new_sk == NULL) { reason = LLCP_DM_REJ; release_sock(&sock->sk); diff --git a/kernel/net/nfc/llcp_sock.c b/kernel/net/nfc/llcp_sock.c index 9578bd6a4..ecf0a0196 100644 --- a/kernel/net/nfc/llcp_sock.c +++ b/kernel/net/nfc/llcp_sock.c @@ -572,7 +572,7 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); pr_debug("mask 0x%x\n", mask); @@ -942,12 +942,12 @@ static void llcp_sock_destruct(struct sock *sk) } } -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp) +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; - sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto); + sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern); if (!sk) return NULL; @@ -993,7 +993,7 @@ void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) } static int llcp_sock_create(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -1009,7 +1009,7 @@ static int llcp_sock_create(struct net *net, struct socket *sock, else sock->ops = &llcp_sock_ops; - sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC); + sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) return 
-ENOMEM; diff --git a/kernel/net/nfc/nci/Kconfig b/kernel/net/nfc/nci/Kconfig index a4f1e42e3..85d4819ab 100644 --- a/kernel/net/nfc/nci/Kconfig +++ b/kernel/net/nfc/nci/Kconfig @@ -12,10 +12,17 @@ config NFC_NCI config NFC_NCI_SPI depends on NFC_NCI && SPI select CRC_CCITT - bool "NCI over SPI protocol support" + tristate "NCI over SPI protocol support" default n help NCI (NFC Controller Interface) is a communication protocol between an NFC Controller (NFCC) and a Device Host (DH). Say yes if you use an NCI driver that requires SPI link layer. + +config NFC_NCI_UART + depends on NFC_NCI && TTY + tristate "NCI over UART protocol support" + default n + help + Say yes if you use an NCI driver that requires UART link layer. diff --git a/kernel/net/nfc/nci/Makefile b/kernel/net/nfc/nci/Makefile index 7ed894926..0ca31d9bf 100644 --- a/kernel/net/nfc/nci/Makefile +++ b/kernel/net/nfc/nci/Makefile @@ -6,4 +6,8 @@ obj-$(CONFIG_NFC_NCI) += nci.o nci-objs := core.o data.o lib.o ntf.o rsp.o hci.o -nci-$(CONFIG_NFC_NCI_SPI) += spi.o +nci_spi-y += spi.o +obj-$(CONFIG_NFC_NCI_SPI) += nci_spi.o + +nci_uart-y += uart.o +obj-$(CONFIG_NFC_NCI_UART) += nci_uart.o diff --git a/kernel/net/nfc/nci/core.c b/kernel/net/nfc/nci/core.c index 49ff32106..10c99a578 100644 --- a/kernel/net/nfc/nci/core.c +++ b/kernel/net/nfc/nci/core.c @@ -28,6 +28,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include +#include #include #include #include @@ -63,6 +64,19 @@ struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev, return NULL; } +int nci_get_conn_info_by_id(struct nci_dev *ndev, u8 id) +{ + struct nci_conn_info *conn_info; + + list_for_each_entry(conn_info, &ndev->conn_info_list, list) { + if (conn_info->id == id) + return conn_info->conn_id; + } + + return -EINVAL; +} +EXPORT_SYMBOL(nci_get_conn_info_by_id); + /* ---- NCI requests ---- */ void nci_req_complete(struct nci_dev *ndev, int result) @@ -73,6 +87,7 @@ void nci_req_complete(struct nci_dev *ndev, int 
result) complete(&ndev->req_completion); } } +EXPORT_SYMBOL(nci_req_complete); static void nci_req_cancel(struct nci_dev *ndev, int err) { @@ -323,6 +338,60 @@ static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt) sizeof(struct nci_rf_deactivate_cmd), &cmd); } +struct nci_cmd_param { + __u16 opcode; + size_t len; + __u8 *payload; +}; + +static void nci_generic_req(struct nci_dev *ndev, unsigned long opt) +{ + struct nci_cmd_param *param = + (struct nci_cmd_param *)opt; + + nci_send_cmd(ndev, param->opcode, param->len, param->payload); +} + +int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) +{ + struct nci_cmd_param param; + + param.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY, oid); + param.len = len; + param.payload = payload; + + return __nci_request(ndev, nci_generic_req, (unsigned long)&param, + msecs_to_jiffies(NCI_CMD_TIMEOUT)); +} +EXPORT_SYMBOL(nci_prop_cmd); + +int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload) +{ + struct nci_cmd_param param; + + param.opcode = opcode; + param.len = len; + param.payload = payload; + + return __nci_request(ndev, nci_generic_req, (unsigned long)&param, + msecs_to_jiffies(NCI_CMD_TIMEOUT)); +} +EXPORT_SYMBOL(nci_core_cmd); + +int nci_core_reset(struct nci_dev *ndev) +{ + return __nci_request(ndev, nci_reset_req, 0, + msecs_to_jiffies(NCI_RESET_TIMEOUT)); +} +EXPORT_SYMBOL(nci_core_reset); + +int nci_core_init(struct nci_dev *ndev) +{ + return __nci_request(ndev, nci_init_req, 0, + msecs_to_jiffies(NCI_INIT_TIMEOUT)); +} +EXPORT_SYMBOL(nci_core_init); + static int nci_open_device(struct nci_dev *ndev) { int rc = 0; @@ -343,17 +412,26 @@ static int nci_open_device(struct nci_dev *ndev) set_bit(NCI_INIT, &ndev->flags); - rc = __nci_request(ndev, nci_reset_req, 0, - msecs_to_jiffies(NCI_RESET_TIMEOUT)); + if (ndev->ops->init) + rc = ndev->ops->init(ndev); + + if (!rc) { + rc = __nci_request(ndev, nci_reset_req, 0, + msecs_to_jiffies(NCI_RESET_TIMEOUT)); + } - if 
(ndev->ops->setup) - ndev->ops->setup(ndev); + if (!rc && ndev->ops->setup) { + rc = ndev->ops->setup(ndev); + } if (!rc) { rc = __nci_request(ndev, nci_init_req, 0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); } + if (!rc && ndev->ops->post_setup) + rc = ndev->ops->post_setup(ndev); + if (!rc) { rc = __nci_request(ndev, nci_init_complete_req, 0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); @@ -407,6 +485,12 @@ static int nci_close_device(struct nci_dev *ndev) set_bit(NCI_INIT, &ndev->flags); __nci_request(ndev, nci_reset_req, 0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); + + /* After this point our queues are empty + * and no works are scheduled. + */ + ndev->ops->close(ndev); + clear_bit(NCI_INIT, &ndev->flags); del_timer_sync(&ndev->cmd_timer); @@ -414,10 +498,6 @@ static int nci_close_device(struct nci_dev *ndev) /* Flush cmd wq */ flush_workqueue(ndev->cmd_wq); - /* After this point our queues are empty - * and no works are scheduled. */ - ndev->ops->close(ndev); - /* Clear flags */ ndev->flags = 0; @@ -486,7 +566,7 @@ static void nci_nfcee_discover_req(struct nci_dev *ndev, unsigned long opt) int nci_nfcee_discover(struct nci_dev *ndev, u8 action) { - return nci_request(ndev, nci_nfcee_discover_req, action, + return __nci_request(ndev, nci_nfcee_discover_req, action, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_nfcee_discover); @@ -507,8 +587,9 @@ int nci_nfcee_mode_set(struct nci_dev *ndev, u8 nfcee_id, u8 nfcee_mode) cmd.nfcee_id = nfcee_id; cmd.nfcee_mode = nfcee_mode; - return nci_request(ndev, nci_nfcee_mode_set_req, (unsigned long)&cmd, - msecs_to_jiffies(NCI_CMD_TIMEOUT)); + return __nci_request(ndev, nci_nfcee_mode_set_req, + (unsigned long)&cmd, + msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_nfcee_mode_set); @@ -534,12 +615,19 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, if (!cmd) return -ENOMEM; + if (!number_destination_params) + return -EINVAL; + cmd->destination_type = destination_type; cmd->number_destination_params = 
number_destination_params; memcpy(cmd->params, params, params_len); data.cmd = cmd; - ndev->cur_id = params->value[DEST_SPEC_PARAMS_ID_INDEX]; + + if (params->length > 0) + ndev->cur_id = params->value[DEST_SPEC_PARAMS_ID_INDEX]; + else + ndev->cur_id = 0; r = __nci_request(ndev, nci_core_conn_create_req, (unsigned long)&data, @@ -558,8 +646,8 @@ static void nci_core_conn_close_req(struct nci_dev *ndev, unsigned long opt) int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id) { - return nci_request(ndev, nci_core_conn_close_req, conn_id, - msecs_to_jiffies(NCI_CMD_TIMEOUT)); + return __nci_request(ndev, nci_core_conn_close_req, conn_id, + msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_core_conn_close); @@ -747,9 +835,11 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, } static void nci_deactivate_target(struct nfc_dev *nfc_dev, - struct nfc_target *target) + struct nfc_target *target, + __u8 mode) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); + u8 nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE; pr_debug("entry\n"); @@ -760,9 +850,14 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev, ndev->target_active_prot = 0; + switch (mode) { + case NFC_TARGET_MODE_SLEEP: + nci_mode = NCI_DEACTIVATE_TYPE_SLEEP_MODE; + break; + } + if (atomic_read(&ndev->state) == NCI_POLL_ACTIVE) { - nci_request(ndev, nci_rf_deactivate_req, - NCI_DEACTIVATE_TYPE_SLEEP_MODE, + nci_request(ndev, nci_rf_deactivate_req, nci_mode, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } } @@ -796,7 +891,7 @@ static int nci_dep_link_down(struct nfc_dev *nfc_dev) pr_debug("entry\n"); if (nfc_dev->rf_mode == NFC_RF_INITIATOR) { - nci_deactivate_target(nfc_dev, NULL); + nci_deactivate_target(nfc_dev, NULL, NCI_DEACTIVATE_TYPE_IDLE_MODE); } else { if (atomic_read(&ndev->state) == NCI_LISTEN_ACTIVE || atomic_read(&ndev->state) == NCI_DISCOVERY) { @@ -961,6 +1056,14 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops, return NULL; ndev->ops = ops; + + if (ops->n_prop_ops > 
NCI_MAX_PROPRIETARY_CMD) { + pr_err("Too many proprietary commands: %zd\n", + ops->n_prop_ops); + ops->prop_ops = NULL; + ops->n_prop_ops = 0; + } + ndev->tx_headroom = tx_headroom; ndev->tx_tailroom = tx_tailroom; init_completion(&ndev->req_completion); @@ -1115,7 +1218,7 @@ int nci_recv_frame(struct nci_dev *ndev, struct sk_buff *skb) } EXPORT_SYMBOL(nci_recv_frame); -static int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb) +int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb) { pr_debug("len %d\n", skb->len); @@ -1133,6 +1236,7 @@ static int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb) return ndev->ops->send(ndev, skb); } +EXPORT_SYMBOL(nci_send_frame); /* Send NCI command */ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) @@ -1164,6 +1268,81 @@ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) return 0; } +EXPORT_SYMBOL(nci_send_cmd); + +/* Proprietary commands API */ +static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops, + size_t n_ops, + __u16 opcode) +{ + size_t i; + struct nci_driver_ops *op; + + if (!ops || !n_ops) + return NULL; + + for (i = 0; i < n_ops; i++) { + op = &ops[i]; + if (op->opcode == opcode) + return op; + } + + return NULL; +} + +static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, + struct sk_buff *skb, struct nci_driver_ops *ops, + size_t n_ops) +{ + struct nci_driver_ops *op; + + op = ops_cmd_lookup(ops, n_ops, rsp_opcode); + if (!op || !op->rsp) + return -ENOTSUPP; + + return op->rsp(ndev, skb); +} + +static int nci_op_ntf_packet(struct nci_dev *ndev, __u16 ntf_opcode, + struct sk_buff *skb, struct nci_driver_ops *ops, + size_t n_ops) +{ + struct nci_driver_ops *op; + + op = ops_cmd_lookup(ops, n_ops, ntf_opcode); + if (!op || !op->ntf) + return -ENOTSUPP; + + return op->ntf(ndev, skb); +} + +int nci_prop_rsp_packet(struct nci_dev *ndev, __u16 opcode, + struct sk_buff *skb) +{ + return nci_op_rsp_packet(ndev, 
opcode, skb, ndev->ops->prop_ops, + ndev->ops->n_prop_ops); +} + +int nci_prop_ntf_packet(struct nci_dev *ndev, __u16 opcode, + struct sk_buff *skb) +{ + return nci_op_ntf_packet(ndev, opcode, skb, ndev->ops->prop_ops, + ndev->ops->n_prop_ops); +} + +int nci_core_rsp_packet(struct nci_dev *ndev, __u16 opcode, + struct sk_buff *skb) +{ + return nci_op_rsp_packet(ndev, opcode, skb, ndev->ops->core_ops, + ndev->ops->n_core_ops); +} + +int nci_core_ntf_packet(struct nci_dev *ndev, __u16 opcode, + struct sk_buff *skb) +{ + return nci_op_ntf_packet(ndev, opcode, skb, ndev->ops->core_ops, + ndev->ops->n_core_ops); +} /* ---- NCI TX Data worker thread ---- */ diff --git a/kernel/net/nfc/nci/data.c b/kernel/net/nfc/nci/data.c index 566466d90..dbd242544 100644 --- a/kernel/net/nfc/nci/data.c +++ b/kernel/net/nfc/nci/data.c @@ -90,6 +90,18 @@ static inline void nci_push_data_hdr(struct nci_dev *ndev, nci_pbf_set((__u8 *)hdr, pbf); } +int nci_conn_max_data_pkt_payload_size(struct nci_dev *ndev, __u8 conn_id) +{ + struct nci_conn_info *conn_info; + + conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); + if (!conn_info) + return -EPROTO; + + return conn_info->max_pkt_payload_len; +} +EXPORT_SYMBOL(nci_conn_max_data_pkt_payload_size); + static int nci_queue_tx_data_frags(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb) { @@ -203,6 +215,7 @@ free_exit: exit: return rc; } +EXPORT_SYMBOL(nci_send_data); /* ----------------- NCI RX Data ----------------- */ diff --git a/kernel/net/nfc/nci/hci.c b/kernel/net/nfc/nci/hci.c index b33fed6d1..2aedac15c 100644 --- a/kernel/net/nfc/nci/hci.c +++ b/kernel/net/nfc/nci/hci.c @@ -70,6 +70,7 @@ struct nci_hcp_packet { #define NCI_HCI_ANY_SET_PARAMETER 0x01 #define NCI_HCI_ANY_GET_PARAMETER 0x02 #define NCI_HCI_ANY_CLOSE_PIPE 0x04 +#define NCI_HCI_ADM_CLEAR_ALL_PIPE 0x14 #define NCI_HFP_NO_CHAINING 0x80 @@ -78,6 +79,8 @@ struct nci_hcp_packet { #define NCI_EVT_HOT_PLUG 0x03 #define NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY 0x01 +#define 
NCI_HCI_ADM_CREATE_PIPE 0x10 +#define NCI_HCI_ADM_DELETE_PIPE 0x11 /* HCP headers */ #define NCI_HCI_HCP_PACKET_HEADER_LEN 1 @@ -101,6 +104,20 @@ struct nci_hcp_packet { #define NCI_HCP_MSG_GET_CMD(header) (header & 0x3f) #define NCI_HCP_MSG_GET_PIPE(header) (header & 0x7f) +static int nci_hci_result_to_errno(u8 result) +{ + switch (result) { + case NCI_HCI_ANY_OK: + return 0; + case NCI_HCI_ANY_E_REG_PAR_UNKNOWN: + return -EOPNOTSUPP; + case NCI_HCI_ANY_E_TIMEOUT: + return -ETIME; + default: + return -1; + } +} + /* HCI core */ static void nci_hci_reset_pipes(struct nci_hci_dev *hdev) { @@ -146,18 +163,18 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, if (!conn_info) return -EPROTO; - skb = nci_skb_alloc(ndev, 2 + conn_info->max_pkt_payload_len + + i = 0; + skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + NCI_DATA_HDR_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; - skb_reserve(skb, 2 + NCI_DATA_HDR_SIZE); + skb_reserve(skb, NCI_DATA_HDR_SIZE + 2); *skb_push(skb, 1) = data_type; - i = 0; - len = conn_info->max_pkt_payload_len; - do { + len = conn_info->max_pkt_payload_len; + /* If last packet add NCI_HFP_NO_CHAINING */ if (i + conn_info->max_pkt_payload_len - (skb->len + 1) >= data_len) { @@ -177,9 +194,15 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, return r; i += len; + if (i < data_len) { - skb_trim(skb, 0); - skb_pull(skb, len); + skb = nci_skb_alloc(ndev, + conn_info->max_pkt_payload_len + + NCI_DATA_HDR_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, NCI_DATA_HDR_SIZE + 1); } } while (i < data_len); @@ -212,7 +235,8 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, const u8 *param, size_t param_len, struct sk_buff **skb) { - struct nci_conn_info *conn_info; + struct nci_hcp_message *message; + struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; @@ -232,14 +256,34 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, r = 
nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); - - if (r == NCI_STATUS_OK && skb) - *skb = conn_info->rx_skb; + if (r == NCI_STATUS_OK) { + message = (struct nci_hcp_message *)conn_info->rx_skb->data; + r = nci_hci_result_to_errno( + NCI_HCP_MSG_GET_CMD(message->header)); + skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); + + if (!r && skb) + *skb = conn_info->rx_skb; + } return r; } EXPORT_SYMBOL(nci_hci_send_cmd); +int nci_hci_clear_all_pipes(struct nci_dev *ndev) +{ + int r; + + r = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, + NCI_HCI_ADM_CLEAR_ALL_PIPE, NULL, 0, NULL); + if (r < 0) + return r; + + nci_hci_reset_pipes(ndev->hci_dev); + return r; +} +EXPORT_SYMBOL(nci_hci_clear_all_pipes); + static void nci_hci_event_received(struct nci_dev *ndev, u8 pipe, u8 event, struct sk_buff *skb) { @@ -328,9 +372,6 @@ static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, struct nci_conn_info *conn_info; u8 status = result; - if (result != NCI_HCI_ANY_OK) - goto exit; - conn_info = ndev->hci_dev->conn_info; if (!conn_info) { status = NCI_STATUS_REJECTED; @@ -340,7 +381,7 @@ static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, conn_info->rx_skb = skb; exit: - nci_req_complete(ndev, status); + nci_req_complete(ndev, NCI_STATUS_OK); } /* Receive hcp message for pipe, with type and cmd. 
@@ -366,7 +407,7 @@ static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe, break; } - nci_req_complete(ndev, 0); + nci_req_complete(ndev, NCI_STATUS_OK); } static void nci_hci_msg_rx_work(struct work_struct *work) @@ -378,7 +419,7 @@ static void nci_hci_msg_rx_work(struct work_struct *work) u8 pipe, type, instruction; while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) { - pipe = skb->data[0]; + pipe = NCI_HCP_MSG_GET_PIPE(skb->data[0]); skb_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN); message = (struct nci_hcp_message *)skb->data; type = NCI_HCP_MSG_GET_TYPE(message->header); @@ -395,7 +436,7 @@ void nci_hci_data_received_cb(void *context, { struct nci_dev *ndev = (struct nci_dev *)context; struct nci_hcp_packet *packet; - u8 pipe, type, instruction; + u8 pipe, type; struct sk_buff *hcp_skb; struct sk_buff *frag_skb; int msg_len; @@ -415,7 +456,7 @@ void nci_hci_data_received_cb(void *context, /* it's the last fragment. Does it need re-aggregation? */ if (skb_queue_len(&ndev->hci_dev->rx_hcp_frags)) { - pipe = packet->header & NCI_HCI_FRAGMENT; + pipe = NCI_HCP_MSG_GET_PIPE(packet->header); skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); msg_len = 0; @@ -434,7 +475,7 @@ void nci_hci_data_received_cb(void *context, *skb_put(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN) = pipe; skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) { - msg_len = frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN; + msg_len = frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN; memcpy(skb_put(hcp_skb, msg_len), frag_skb->data + NCI_HCI_HCP_PACKET_HEADER_LEN, msg_len); } @@ -452,11 +493,10 @@ void nci_hci_data_received_cb(void *context, packet = (struct nci_hcp_packet *)hcp_skb->data; type = NCI_HCP_MSG_GET_TYPE(packet->message.header); if (type == NCI_HCI_HCP_RESPONSE) { - pipe = packet->header; - instruction = NCI_HCP_MSG_GET_CMD(packet->message.header); - skb_pull(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN + - NCI_HCI_HCP_MESSAGE_HEADER_LEN); - nci_hci_hcp_message_rx(ndev, pipe, type, 
instruction, hcp_skb); + pipe = NCI_HCP_MSG_GET_PIPE(packet->header); + skb_pull(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN); + nci_hci_hcp_message_rx(ndev, pipe, type, + NCI_STATUS_OK, hcp_skb); } else { skb_queue_tail(&ndev->hci_dev->msg_rx_queue, hcp_skb); schedule_work(&ndev->hci_dev->msg_rx_work); @@ -485,9 +525,47 @@ int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) } EXPORT_SYMBOL(nci_hci_open_pipe); +static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, + u8 dest_gate, int *result) +{ + u8 pipe; + struct sk_buff *skb; + struct nci_hci_create_pipe_params params; + struct nci_hci_create_pipe_resp *resp; + + pr_debug("gate=%d\n", dest_gate); + + params.src_gate = NCI_HCI_ADMIN_GATE; + params.dest_host = dest_host; + params.dest_gate = dest_gate; + + *result = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, + NCI_HCI_ADM_CREATE_PIPE, + (u8 *)&params, sizeof(params), &skb); + if (*result < 0) + return NCI_HCI_INVALID_PIPE; + + resp = (struct nci_hci_create_pipe_resp *)skb->data; + pipe = resp->pipe; + kfree_skb(skb); + + pr_debug("pipe created=%d\n", pipe); + + return pipe; +} + +static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) +{ + pr_debug("\n"); + + return nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, + NCI_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL); +} + int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, const u8 *param, size_t param_len) { + struct nci_hcp_message *message; struct nci_conn_info *conn_info; struct nci_data data; int r; @@ -520,6 +598,12 @@ int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); + if (r == NCI_STATUS_OK) { + message = (struct nci_hcp_message *)conn_info->rx_skb->data; + r = nci_hci_result_to_errno( + NCI_HCP_MSG_GET_CMD(message->header)); + skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); + } kfree(tmp); return r; @@ -529,6 +613,7 @@ EXPORT_SYMBOL(nci_hci_set_param); int
nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, struct sk_buff **skb) { + struct nci_hcp_message *message; struct nci_conn_info *conn_info; struct nci_data data; int r; @@ -553,8 +638,15 @@ int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); - if (r == NCI_STATUS_OK) - *skb = conn_info->rx_skb; + if (r == NCI_STATUS_OK) { + message = (struct nci_hcp_message *)conn_info->rx_skb->data; + r = nci_hci_result_to_errno( + NCI_HCP_MSG_GET_CMD(message->header)); + skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); + + if (!r && skb) + *skb = conn_info->rx_skb; + } return r; } @@ -563,6 +655,7 @@ EXPORT_SYMBOL(nci_hci_get_param); int nci_hci_connect_gate(struct nci_dev *ndev, u8 dest_host, u8 dest_gate, u8 pipe) { + bool pipe_created = false; int r; if (pipe == NCI_HCI_DO_NOT_OPEN_PIPE) @@ -581,12 +674,26 @@ int nci_hci_connect_gate(struct nci_dev *ndev, case NCI_HCI_ADMIN_GATE: pipe = NCI_HCI_ADMIN_PIPE; break; + default: + pipe = nci_hci_create_pipe(ndev, dest_host, dest_gate, &r); + if (pipe < 0) + return r; + pipe_created = true; + break; } open_pipe: r = nci_hci_open_pipe(ndev, pipe); - if (r < 0) + if (r < 0) { + if (pipe_created) { + if (nci_hci_delete_pipe(ndev, pipe) < 0) { + /* TODO: Cannot clean by deleting pipe... 
+ * -> inconsistent state + */ + } + } return r; + } ndev->hci_dev->pipes[pipe].gate = dest_gate; ndev->hci_dev->pipes[pipe].host = dest_host; @@ -639,23 +746,24 @@ int nci_hci_dev_session_init(struct nci_dev *ndev) ndev->hci_dev->init_data.gates[0].gate, ndev->hci_dev->init_data.gates[0].pipe); if (r < 0) - goto exit; + return r; r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb); if (r < 0) - goto exit; + return r; if (skb->len && skb->len == strlen(ndev->hci_dev->init_data.session_id) && - memcmp(ndev->hci_dev->init_data.session_id, - skb->data, skb->len) == 0 && + !memcmp(ndev->hci_dev->init_data.session_id, skb->data, skb->len) && ndev->ops->hci_load_session) { /* Restore gate<->pipe table from some proprietary location. */ r = ndev->ops->hci_load_session(ndev); + } else { + r = nci_hci_clear_all_pipes(ndev); if (r < 0) goto exit; - } else { + r = nci_hci_dev_connect_gates(ndev, ndev->hci_dev->init_data.gate_count, ndev->hci_dev->init_data.gates); @@ -667,8 +775,6 @@ int nci_hci_dev_session_init(struct nci_dev *ndev) ndev->hci_dev->init_data.session_id, strlen(ndev->hci_dev->init_data.session_id)); } - if (r == 0) - goto exit; exit: kfree_skb(skb); diff --git a/kernel/net/nfc/nci/ntf.c b/kernel/net/nfc/nci/ntf.c index 321807107..2ada2b39e 100644 --- a/kernel/net/nfc/nci/ntf.c +++ b/kernel/net/nfc/nci/ntf.c @@ -758,6 +758,15 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) /* strip the nci control header */ skb_pull(skb, NCI_CTRL_HDR_SIZE); + if (nci_opcode_gid(ntf_opcode) == NCI_GID_PROPRIETARY) { + if (nci_prop_ntf_packet(ndev, ntf_opcode, skb) == -ENOTSUPP) { + pr_err("unsupported ntf opcode 0x%x\n", + ntf_opcode); + } + + goto end; + } + switch (ntf_opcode) { case NCI_OP_CORE_CONN_CREDITS_NTF: nci_core_conn_credits_ntf_packet(ndev, skb); @@ -796,5 +805,7 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) break; } + nci_core_ntf_packet(ndev, ntf_opcode, skb); +end: kfree_skb(skb); } 
diff --git a/kernel/net/nfc/nci/rsp.c b/kernel/net/nfc/nci/rsp.c index 02486bc2c..9b6eb913d 100644 --- a/kernel/net/nfc/nci/rsp.c +++ b/kernel/net/nfc/nci/rsp.c @@ -296,6 +296,15 @@ void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) /* strip the nci control header */ skb_pull(skb, NCI_CTRL_HDR_SIZE); + if (nci_opcode_gid(rsp_opcode) == NCI_GID_PROPRIETARY) { + if (nci_prop_rsp_packet(ndev, rsp_opcode, skb) == -ENOTSUPP) { + pr_err("unsupported rsp opcode 0x%x\n", + rsp_opcode); + } + + goto end; + } + switch (rsp_opcode) { case NCI_OP_CORE_RESET_RSP: nci_core_reset_rsp_packet(ndev, skb); @@ -346,6 +355,8 @@ void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) break; } + nci_core_rsp_packet(ndev, rsp_opcode, skb); +end: kfree_skb(skb); /* trigger the next cmd */ diff --git a/kernel/net/nfc/nci/spi.c b/kernel/net/nfc/nci/spi.c index ec250e777..d904cd2f1 100644 --- a/kernel/net/nfc/nci/spi.c +++ b/kernel/net/nfc/nci/spi.c @@ -18,6 +18,8 @@ #define pr_fmt(fmt) "nci_spi: %s: " fmt, __func__ +#include + #include #include #include @@ -56,6 +58,7 @@ static int __nci_spi_send(struct nci_spi *nspi, struct sk_buff *skb, } t.cs_change = cs_change; t.delay_usecs = nspi->xfer_udelay; + t.speed_hz = nspi->xfer_speed_hz; spi_message_init(&m); spi_message_add_tail(&t, &m); @@ -142,7 +145,8 @@ struct nci_spi *nci_spi_allocate_spi(struct spi_device *spi, nspi->acknowledge_mode = acknowledge_mode; nspi->xfer_udelay = delay; - + /* Use controller max SPI speed by default */ + nspi->xfer_speed_hz = 0; nspi->spi = spi; nspi->ndev = ndev; init_completion(&nspi->req_completion); @@ -195,12 +199,14 @@ static struct sk_buff *__nci_spi_read(struct nci_spi *nspi) tx.tx_buf = req; tx.len = 2; tx.cs_change = 0; + tx.speed_hz = nspi->xfer_speed_hz; spi_message_add_tail(&tx, &m); memset(&rx, 0, sizeof(struct spi_transfer)); rx.rx_buf = resp_hdr; rx.len = 2; rx.cs_change = 1; + rx.speed_hz = nspi->xfer_speed_hz; spi_message_add_tail(&rx, &m); ret = spi_sync(nspi->spi, &m); @@ 
-224,6 +230,7 @@ static struct sk_buff *__nci_spi_read(struct nci_spi *nspi) rx.len = rx_len; rx.cs_change = 0; rx.delay_usecs = nspi->xfer_udelay; + rx.speed_hz = nspi->xfer_speed_hz; spi_message_add_tail(&rx, &m); ret = spi_sync(nspi->spi, &m); @@ -320,3 +327,5 @@ done: return skb; } EXPORT_SYMBOL_GPL(nci_spi_read); + +MODULE_LICENSE("GPL"); diff --git a/kernel/net/nfc/nci/uart.c b/kernel/net/nfc/nci/uart.c new file mode 100644 index 000000000..21d887567 --- /dev/null +++ b/kernel/net/nfc/nci/uart.c @@ -0,0 +1,494 @@ +/* + * Copyright (C) 2015, Marvell International Ltd. + * + * This software file (the "File") is distributed by Marvell International + * Ltd. under the terms of the GNU General Public License Version 2, June 1991 + * (the "License"). You may use, redistribute and/or modify this File in + * accordance with the terms and conditions of the License, a copy of which + * is available on the worldwide web at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt. + * + * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE + * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY DISCLAIMED. The License provides additional details about + * this warranty disclaimer. + */ + +/* Inspired (hugely) by HCI LDISC implementation in Bluetooth. 
+ * + * Copyright (C) 2000-2001 Qualcomm Incorporated + * Copyright (C) 2002-2003 Maxim Krasnyansky + * Copyright (C) 2004-2005 Marcel Holtmann + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* TX states */ +#define NCI_UART_SENDING 1 +#define NCI_UART_TX_WAKEUP 2 + +static struct nci_uart *nci_uart_drivers[NCI_UART_DRIVER_MAX]; + +static inline struct sk_buff *nci_uart_dequeue(struct nci_uart *nu) +{ + struct sk_buff *skb = nu->tx_skb; + + if (!skb) + skb = skb_dequeue(&nu->tx_q); + else + nu->tx_skb = NULL; + + return skb; +} + +static inline int nci_uart_queue_empty(struct nci_uart *nu) +{ + if (nu->tx_skb) + return 0; + + return skb_queue_empty(&nu->tx_q); +} + +static int nci_uart_tx_wakeup(struct nci_uart *nu) +{ + if (test_and_set_bit(NCI_UART_SENDING, &nu->tx_state)) { + set_bit(NCI_UART_TX_WAKEUP, &nu->tx_state); + return 0; + } + + schedule_work(&nu->write_work); + + return 0; +} + +static void nci_uart_write_work(struct work_struct *work) +{ + struct nci_uart *nu = container_of(work, struct nci_uart, write_work); + struct tty_struct *tty = nu->tty; + struct sk_buff *skb; + +restart: + clear_bit(NCI_UART_TX_WAKEUP, &nu->tx_state); + + if (nu->ops.tx_start) + nu->ops.tx_start(nu); + + while ((skb = nci_uart_dequeue(nu))) { + int len; + + set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); + len = tty->ops->write(tty, skb->data, skb->len); + skb_pull(skb, len); + if (skb->len) { + nu->tx_skb = skb; + break; + } + kfree_skb(skb); + } + + if (test_bit(NCI_UART_TX_WAKEUP, &nu->tx_state)) + goto restart; + + if (nu->ops.tx_done && nci_uart_queue_empty(nu)) + nu->ops.tx_done(nu); + + clear_bit(NCI_UART_SENDING, &nu->tx_state); +} + +static int nci_uart_set_driver(struct tty_struct *tty, unsigned int driver) +{ + struct nci_uart *nu = NULL; + int ret; + + if (driver >= NCI_UART_DRIVER_MAX) + return -EINVAL; + + if 
(!nci_uart_drivers[driver]) + return -ENOENT; + + nu = kzalloc(sizeof(*nu), GFP_KERNEL); + if (!nu) + return -ENOMEM; + + memcpy(nu, nci_uart_drivers[driver], sizeof(struct nci_uart)); + nu->tty = tty; + tty->disc_data = nu; + skb_queue_head_init(&nu->tx_q); + INIT_WORK(&nu->write_work, nci_uart_write_work); + spin_lock_init(&nu->rx_lock); + + ret = nu->ops.open(nu); + if (ret) { + tty->disc_data = NULL; + kfree(nu); + } else if (!try_module_get(nu->owner)) { + nu->ops.close(nu); + tty->disc_data = NULL; + kfree(nu); + return -ENOENT; + } + return ret; +} + +/* ------ LDISC part ------ */ + +/* nci_uart_tty_open + * + * Called when line discipline changed to NCI_UART. + * + * Arguments: + * tty pointer to tty info structure + * Return Value: + * 0 if success, otherwise error code + */ +static int nci_uart_tty_open(struct tty_struct *tty) +{ + /* Error if the tty has no write op instead of leaving an exploitable + * hole + */ + if (!tty->ops->write) + return -EOPNOTSUPP; + + tty->disc_data = NULL; + tty->receive_room = 65536; + + /* Flush any pending characters in the driver and line discipline. */ + + /* FIXME: why is this needed. Note don't use ldisc_ref here as the + * open path is before the ldisc is referencable. + */ + + if (tty->ldisc->ops->flush_buffer) + tty->ldisc->ops->flush_buffer(tty); + tty_driver_flush_buffer(tty); + + return 0; +} + +/* nci_uart_tty_close() + * + * Called when the line discipline is changed to something + * else, the tty is closed, or the tty detects a hangup. 
+ */ +static void nci_uart_tty_close(struct tty_struct *tty) +{ + struct nci_uart *nu = (void *)tty->disc_data; + + /* Detach from the tty */ + tty->disc_data = NULL; + + if (!nu) + return; + + if (nu->tx_skb) + kfree_skb(nu->tx_skb); + if (nu->rx_skb) + kfree_skb(nu->rx_skb); + + skb_queue_purge(&nu->tx_q); + + nu->ops.close(nu); + nu->tty = NULL; + module_put(nu->owner); + + cancel_work_sync(&nu->write_work); + + kfree(nu); +} + +/* nci_uart_tty_wakeup() + * + * Callback for transmit wakeup. Called when low level + * device driver can accept more send data. + * + * Arguments: tty pointer to associated tty instance data + * Return Value: None + */ +static void nci_uart_tty_wakeup(struct tty_struct *tty) +{ + struct nci_uart *nu = (void *)tty->disc_data; + + if (!nu) + return; + + clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); + + if (tty != nu->tty) + return; + + nci_uart_tx_wakeup(nu); +} + +/* nci_uart_tty_receive() + * + * Called by tty low level driver when receive data is + * available. + * + * Arguments: tty pointer to tty instance data + * data pointer to received data + * flags pointer to flags for data + * count count of received data in bytes + * + * Return Value: None + */ +static void nci_uart_tty_receive(struct tty_struct *tty, const u8 *data, + char *flags, int count) +{ + struct nci_uart *nu = (void *)tty->disc_data; + + if (!nu || tty != nu->tty) + return; + + spin_lock(&nu->rx_lock); + nu->ops.recv_buf(nu, (void *)data, flags, count); + spin_unlock(&nu->rx_lock); + + tty_unthrottle(tty); +} + +/* nci_uart_tty_ioctl() + * + * Process IOCTL system call for the tty device.
+ * + * Arguments: + * + * tty pointer to tty instance data + * file pointer to open file object for device + * cmd IOCTL command code + * arg argument for IOCTL call (cmd dependent) + * + * Return Value: Command dependent + */ +static int nci_uart_tty_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct nci_uart *nu = (void *)tty->disc_data; + int err = 0; + + switch (cmd) { + case NCIUARTSETDRIVER: + if (!nu) + return nci_uart_set_driver(tty, (unsigned int)arg); + else + return -EBUSY; + break; + default: + err = n_tty_ioctl_helper(tty, file, cmd, arg); + break; + } + + return err; +} + +/* We don't provide read/write/poll interface for user space. */ +static ssize_t nci_uart_tty_read(struct tty_struct *tty, struct file *file, + unsigned char __user *buf, size_t nr) +{ + return 0; +} + +static ssize_t nci_uart_tty_write(struct tty_struct *tty, struct file *file, + const unsigned char *data, size_t count) +{ + return 0; +} + +static unsigned int nci_uart_tty_poll(struct tty_struct *tty, + struct file *filp, poll_table *wait) +{ + return 0; +} + +static int nci_uart_send(struct nci_uart *nu, struct sk_buff *skb) +{ + /* Queue TX packet */ + skb_queue_tail(&nu->tx_q, skb); + + /* Try to start TX (if possible) */ + nci_uart_tx_wakeup(nu); + + return 0; +} + +/* -- Default recv_buf handler -- + * + * This handler supposes that NCI frames are sent over UART link without any + * framing. It reads NCI header, retrieve the packet size and once all packet + * bytes are received it passes it to nci_uart driver for processing. + */ +static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data, + char *flags, int count) +{ + int chunk_len; + + if (!nu->ndev) { + nfc_err(nu->tty->dev, + "receive data from tty but no NCI dev is attached yet, drop buffer\n"); + return 0; + } + + /* Decode all incoming data in packets + * and enqueue then for processing. 
+ */ + while (count > 0) { + /* If this is the first data of a packet, allocate a buffer */ + if (!nu->rx_skb) { + nu->rx_packet_len = -1; + nu->rx_skb = nci_skb_alloc(nu->ndev, + NCI_MAX_PACKET_SIZE, + GFP_KERNEL); + if (!nu->rx_skb) + return -ENOMEM; + } + + /* Eat byte after byte till full packet header is received */ + if (nu->rx_skb->len < NCI_CTRL_HDR_SIZE) { + *skb_put(nu->rx_skb, 1) = *data++; + --count; + continue; + } + + /* Header was received but packet len was not read */ + if (nu->rx_packet_len < 0) + nu->rx_packet_len = NCI_CTRL_HDR_SIZE + + nci_plen(nu->rx_skb->data); + + /* Compute how many bytes are missing and how many bytes can + * be consumed. + */ + chunk_len = nu->rx_packet_len - nu->rx_skb->len; + if (count < chunk_len) + chunk_len = count; + memcpy(skb_put(nu->rx_skb, chunk_len), data, chunk_len); + data += chunk_len; + count -= chunk_len; + + /* Check if packet is fully received */ + if (nu->rx_packet_len == nu->rx_skb->len) { + /* Pass RX packet to driver */ + if (nu->ops.recv(nu, nu->rx_skb) != 0) + nfc_err(nu->tty->dev, "corrupted RX packet\n"); + /* Next packet will be a new one */ + nu->rx_skb = NULL; + } + } + + return 0; +} + +/* -- Default recv handler -- */ +static int nci_uart_default_recv(struct nci_uart *nu, struct sk_buff *skb) +{ + return nci_recv_frame(nu->ndev, skb); +} + +int nci_uart_register(struct nci_uart *nu) +{ + if (!nu || !nu->ops.open || + !nu->ops.recv || !nu->ops.close) + return -EINVAL; + + /* Set the send callback */ + nu->ops.send = nci_uart_send; + + /* Install default handlers if not overridden */ + if (!nu->ops.recv_buf) + nu->ops.recv_buf = nci_uart_default_recv_buf; + if (!nu->ops.recv) + nu->ops.recv = nci_uart_default_recv; + + /* Add this driver in the driver list */ + if (nci_uart_drivers[nu->driver]) { + pr_err("driver %d is already registered\n", nu->driver); + return -EBUSY; + } + nci_uart_drivers[nu->driver] = nu; + + pr_info("NCI uart driver '%s [%d]' registered\n", nu->name, nu->driver); + +
return 0; +} +EXPORT_SYMBOL_GPL(nci_uart_register); + +void nci_uart_unregister(struct nci_uart *nu) +{ + pr_info("NCI uart driver '%s [%d]' unregistered\n", nu->name, + nu->driver); + + /* Remove this driver from the driver list */ + nci_uart_drivers[nu->driver] = NULL; +} +EXPORT_SYMBOL_GPL(nci_uart_unregister); + +void nci_uart_set_config(struct nci_uart *nu, int baudrate, int flow_ctrl) +{ + struct ktermios new_termios; + + if (!nu->tty) + return; + + down_read(&nu->tty->termios_rwsem); + new_termios = nu->tty->termios; + up_read(&nu->tty->termios_rwsem); + tty_termios_encode_baud_rate(&new_termios, baudrate, baudrate); + + if (flow_ctrl) + new_termios.c_cflag |= CRTSCTS; + else + new_termios.c_cflag &= ~CRTSCTS; + + tty_set_termios(nu->tty, &new_termios); +} +EXPORT_SYMBOL_GPL(nci_uart_set_config); + +static struct tty_ldisc_ops nci_uart_ldisc = { + .magic = TTY_LDISC_MAGIC, + .owner = THIS_MODULE, + .name = "n_nci", + .open = nci_uart_tty_open, + .close = nci_uart_tty_close, + .read = nci_uart_tty_read, + .write = nci_uart_tty_write, + .poll = nci_uart_tty_poll, + .receive_buf = nci_uart_tty_receive, + .write_wakeup = nci_uart_tty_wakeup, + .ioctl = nci_uart_tty_ioctl, +}; + +static int __init nci_uart_init(void) +{ + memset(nci_uart_drivers, 0, sizeof(nci_uart_drivers)); + return tty_register_ldisc(N_NCI, &nci_uart_ldisc); +} + +static void __exit nci_uart_exit(void) +{ + tty_unregister_ldisc(N_NCI); +} + +module_init(nci_uart_init); +module_exit(nci_uart_exit); + +MODULE_AUTHOR("Marvell International Ltd."); +MODULE_DESCRIPTION("NFC NCI UART driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_LDISC(N_NCI); diff --git a/kernel/net/nfc/netlink.c b/kernel/net/nfc/netlink.c index 376303671..f58c1fba1 100644 --- a/kernel/net/nfc/netlink.c +++ b/kernel/net/nfc/netlink.c @@ -5,6 +5,12 @@ * Lauro Ramos Venancio * Aloisio Almeida Jr * + * Vendor commands implementation based on net/wireless/nl80211.c + * which is: + * + * Copyright 2006-2010 Johannes Berg + * Copyright 
2013-2014 Intel Mobile Communications GmbH + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -57,6 +63,8 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, + [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, + }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { @@ -877,7 +885,7 @@ static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); - nfc_deactivate_target(dev, target_idx); + nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); rc = nfc_activate_target(dev, target_idx, protocol); nfc_put_device(dev); @@ -1101,10 +1109,8 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); - if (!dev) { - rc = -ENODEV; - goto exit; - } + if (!dev) + return -ENODEV; device_lock(&dev->dev); @@ -1489,6 +1495,131 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) return nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); } +static int nfc_genl_vendor_cmd(struct sk_buff *skb, + struct genl_info *info) +{ + struct nfc_dev *dev; + struct nfc_vendor_cmd *cmd; + u32 dev_idx, vid, subcmd; + u8 *data; + size_t data_len; + int i, err; + + if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || + !info->attrs[NFC_ATTR_VENDOR_ID] || + !info->attrs[NFC_ATTR_VENDOR_SUBCMD]) + return -EINVAL; + + dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]); + subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); + + dev = 
nfc_get_device(dev_idx); + if (!dev || !dev->vendor_cmds || !dev->n_vendor_cmds) + return -ENODEV; + + if (info->attrs[NFC_ATTR_VENDOR_DATA]) { + data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); + data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); + if (data_len == 0) + return -EINVAL; + } else { + data = NULL; + data_len = 0; + } + + for (i = 0; i < dev->n_vendor_cmds; i++) { + cmd = &dev->vendor_cmds[i]; + + if (cmd->vendor_id != vid || cmd->subcmd != subcmd) + continue; + + dev->cur_cmd_info = info; + err = cmd->doit(dev, data, data_len); + dev->cur_cmd_info = NULL; + return err; + } + + return -EOPNOTSUPP; +} + +/* message building helper */ +static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq, + int flags, u8 cmd) +{ + /* since there is no private header just add the generic one */ + return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd); +} + +static struct sk_buff * +__nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen, + u32 portid, u32 seq, + enum nfc_attrs attr, + u32 oui, u32 subcmd, gfp_t gfp) +{ + struct sk_buff *skb; + void *hdr; + + skb = nlmsg_new(approxlen + 100, gfp); + if (!skb) + return NULL; + + hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR); + if (!hdr) { + kfree_skb(skb); + return NULL; + } + + if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx)) + goto nla_put_failure; + if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui)) + goto nla_put_failure; + if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd)) + goto nla_put_failure; + + ((void **)skb->cb)[0] = dev; + ((void **)skb->cb)[1] = hdr; + + return skb; + +nla_put_failure: + kfree_skb(skb); + return NULL; +} + +struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, + enum nfc_attrs attr, + u32 oui, u32 subcmd, + int approxlen) +{ + if (WARN_ON(!dev->cur_cmd_info)) + return NULL; + + return __nfc_alloc_vendor_cmd_skb(dev, approxlen, + dev->cur_cmd_info->snd_portid, + dev->cur_cmd_info->snd_seq, attr, + oui, subcmd, 
GFP_KERNEL); +} +EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb); + +int nfc_vendor_cmd_reply(struct sk_buff *skb) +{ + struct nfc_dev *dev = ((void **)skb->cb)[0]; + void *hdr = ((void **)skb->cb)[1]; + + /* clear CB data for netlink core to own from now on */ + memset(skb->cb, 0, sizeof(skb->cb)); + + if (WARN_ON(!dev->cur_cmd_info)) { + kfree_skb(skb); + return -EINVAL; + } + + genlmsg_end(skb, hdr); + return genlmsg_reply(skb, dev->cur_cmd_info); +} +EXPORT_SYMBOL(nfc_vendor_cmd_reply); + static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, @@ -1579,6 +1710,11 @@ static const struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_activate_target, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_VENDOR, + .doit = nfc_genl_vendor_cmd, + .policy = nfc_genl_policy, + }, }; diff --git a/kernel/net/nfc/nfc.h b/kernel/net/nfc/nfc.h index a8ce80b47..c20b784ad 100644 --- a/kernel/net/nfc/nfc.h +++ b/kernel/net/nfc/nfc.h @@ -25,12 +25,15 @@ #include #include +#define NFC_TARGET_MODE_IDLE 0 +#define NFC_TARGET_MODE_SLEEP 1 + struct nfc_protocol { int id; struct proto *proto; struct module *owner; int (*create)(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto); + const struct nfc_protocol *nfc_proto, int kern); }; struct nfc_rawsock { @@ -147,7 +150,7 @@ int nfc_dep_link_down(struct nfc_dev *dev); int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol); -int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx); +int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode); int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb, data_exchange_cb_t cb, void *cb_context); diff --git a/kernel/net/nfc/rawsock.c b/kernel/net/nfc/rawsock.c index 82b4e8024..e386e6c90 100644 --- a/kernel/net/nfc/rawsock.c +++ b/kernel/net/nfc/rawsock.c @@ -321,7 +321,8 @@ static void rawsock_destruct(struct sock *sk) if (sk->sk_state == TCP_ESTABLISHED) { 
nfc_deactivate_target(nfc_rawsock(sk)->dev, - nfc_rawsock(sk)->target_idx); + nfc_rawsock(sk)->target_idx, + NFC_TARGET_MODE_IDLE); nfc_put_device(nfc_rawsock(sk)->dev); } @@ -334,7 +335,7 @@ static void rawsock_destruct(struct sock *sk) } static int rawsock_create(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -348,7 +349,7 @@ static int rawsock_create(struct net *net, struct socket *sock, else sock->ops = &rawsock_ops; - sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto); + sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) return -ENOMEM; diff --git a/kernel/net/openvswitch/Kconfig b/kernel/net/openvswitch/Kconfig index ed6b0f8dd..d143aa9f6 100644 --- a/kernel/net/openvswitch/Kconfig +++ b/kernel/net/openvswitch/Kconfig @@ -5,6 +5,8 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET + depends on !NF_CONNTRACK || \ + (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) select LIBCRC32C select MPLS select NET_MPLS_GSO @@ -34,7 +36,7 @@ config OPENVSWITCH config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH - depends on NET_IPGRE_DEMUX + depends on NET_IPGRE default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create GRE diff --git a/kernel/net/openvswitch/Makefile b/kernel/net/openvswitch/Makefile index 91b947841..60f809085 100644 --- a/kernel/net/openvswitch/Makefile +++ b/kernel/net/openvswitch/Makefile @@ -15,6 +15,10 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +ifneq ($(CONFIG_NF_CONNTRACK),) +openvswitch-y += conntrack.o +endif + +obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o -obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/kernel/net/openvswitch/actions.c b/kernel/net/openvswitch/actions.c index b491c1c29..c88d0f2d3 100644 
--- a/kernel/net/openvswitch/actions.c +++ b/kernel/net/openvswitch/actions.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -29,8 +30,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -38,6 +41,7 @@ #include "datapath.h" #include "flow.h" +#include "conntrack.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, @@ -52,6 +56,20 @@ struct deferred_action { struct sw_flow_key pkt_key; }; +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + __u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 l2_data[MAX_L2_LEN]; +}; + +static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); + #define DEFERRED_ACTION_FIFO_SIZE 10 struct action_fifo { int head; @@ -185,10 +203,6 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, return 0; } -/* 'KEY' must not have any bits set outside of the 'MASK' */ -#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) -#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK)) - static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, const __be32 *mpls_lse, const __be32 *mask) { @@ -201,7 +215,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, return err; stack = (__be32 *)skb_mpls_header(skb); - lse = MASKED(*stack, *mpls_lse, *mask); + lse = OVS_MASKED(*stack, *mpls_lse, *mask); if (skb->ip_summed == CHECKSUM_COMPLETE) { __be32 diff[] = { ~(*stack), lse }; @@ -244,9 +258,9 @@ static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_) const u16 *src = (const u16 *)src_; const u16 *mask = (const u16 *)mask_; - SET_MASKED(dst[0], src[0], mask[0]); - SET_MASKED(dst[1], src[1], mask[1]); - SET_MASKED(dst[2], src[2], mask[2]); + OVS_SET_MASKED(dst[0], src[0], mask[0]); + OVS_SET_MASKED(dst[1], src[1], 
mask[1]); + OVS_SET_MASKED(dst[2], src[2], mask[2]); } static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, @@ -273,28 +287,36 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, - __be32 *addr, __be32 new_addr) +static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, + __be32 addr, __be32 new_addr) { int transport_len = skb->len - skb_transport_offset(skb); + if (nh->frag_off & htons(IP_OFFSET)) + return; + if (nh->protocol == IPPROTO_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, - *addr, new_addr, 1); + addr, new_addr, true); } else if (nh->protocol == IPPROTO_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&uh->check, skb, - *addr, new_addr, 1); + addr, new_addr, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } } } +} +static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, + __be32 *addr, __be32 new_addr) +{ + update_ip_l4_checksum(skb, nh, *addr, new_addr); csum_replace4(&nh->check, *addr, new_addr); skb_clear_hash(skb); *addr = new_addr; @@ -308,14 +330,14 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, if (l4_proto == NEXTHDR_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, - addr, new_addr, 1); + addr, new_addr, true); } else if (l4_proto == NEXTHDR_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace16(&uh->check, skb, - addr, new_addr, 1); + addr, new_addr, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } @@ -323,17 +345,17 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, } else if (l4_proto 
== NEXTHDR_ICMP) { if (likely(transport_len >= sizeof(struct icmp6hdr))) inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum, - skb, addr, new_addr, 1); + skb, addr, new_addr, true); } } static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4], const __be32 mask[4], __be32 masked[4]) { - masked[0] = MASKED(old[0], addr[0], mask[0]); - masked[1] = MASKED(old[1], addr[1], mask[1]); - masked[2] = MASKED(old[2], addr[2], mask[2]); - masked[3] = MASKED(old[3], addr[3], mask[3]); + masked[0] = OVS_MASKED(old[0], addr[0], mask[0]); + masked[1] = OVS_MASKED(old[1], addr[1], mask[1]); + masked[2] = OVS_MASKED(old[2], addr[2], mask[2]); + masked[3] = OVS_MASKED(old[3], addr[3], mask[3]); } static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, @@ -350,15 +372,15 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) { /* Bits 21-24 are always unmasked, so this retains their values. */ - SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); + OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); + OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); + OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); } static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, u8 mask) { - new_ttl = MASKED(nh->ttl, new_ttl, mask); + new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask); csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); nh->ttl = new_ttl; @@ -384,7 +406,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, * makes sense to check if the value actually changed. 
*/ if (mask->ipv4_src) { - new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); + new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); if (unlikely(new_addr != nh->saddr)) { set_ip_addr(skb, nh, &nh->saddr, new_addr); @@ -392,7 +414,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, } } if (mask->ipv4_dst) { - new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); + new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); if (unlikely(new_addr != nh->daddr)) { set_ip_addr(skb, nh, &nh->daddr, new_addr); @@ -480,7 +502,8 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); } if (mask->ipv6_hlimit) { - SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit); + OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, + mask->ipv6_hlimit); flow_key->ip.ttl = nh->hop_limit; } return 0; @@ -490,7 +513,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) { - inet_proto_csum_replace2(check, skb, *port, new_port, 0); + inet_proto_csum_replace2(check, skb, *port, new_port, false); *port = new_port; } @@ -509,8 +532,8 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, uh = udp_hdr(skb); /* Either of the masks is non-zero, so do not bother checking them. 
*/ - src = MASKED(uh->source, key->udp_src, mask->udp_src); - dst = MASKED(uh->dest, key->udp_dst, mask->udp_dst); + src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src); + dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst); if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { if (likely(src != uh->source)) { @@ -550,12 +573,12 @@ static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key, return err; th = tcp_hdr(skb); - src = MASKED(th->source, key->tcp_src, mask->tcp_src); + src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src); if (likely(src != th->source)) { set_tp_port(skb, &th->source, src, &th->check); flow_key->tp.src = src; } - dst = MASKED(th->dest, key->tcp_dst, mask->tcp_dst); + dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst); if (likely(dst != th->dest)) { set_tp_port(skb, &th->dest, dst, &th->check); flow_key->tp.dst = dst; @@ -582,8 +605,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, old_csum = sh->checksum; old_correct_csum = sctp_compute_cksum(skb, sctphoff); - sh->source = MASKED(sh->source, key->sctp_src, mask->sctp_src); - sh->dest = MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); + sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src); + sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); new_csum = sctp_compute_cksum(skb, sctphoff); @@ -597,28 +620,162 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); + struct vport *vport = data->vport; + + if (skb_cow_head(skb, data->l2_len) < 0) { + kfree_skb(skb); + return -ENOMEM; + } + + __skb_dst_copy(skb, data->dst); + *OVS_CB(skb) = data->cb; + skb->inner_protocol = data->inner_protocol; + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = 
data->vlan_proto; + + /* Reconstruct the MAC header. */ + skb_push(skb, data->l2_len); + memcpy(skb->data, &data->l2_data, data->l2_len); + ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_reset_mac_header(skb); + + ovs_vport_send(vport, skb); + return 0; +} + +static unsigned int +ovs_dst_get_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops ovs_dst_ops = { + .family = AF_UNSPEC, + .mtu = ovs_dst_get_mtu, +}; + +/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is + * ovs_vport_output(), which is called once per fragmented packet. + */ +static void prepare_frag(struct vport *vport, struct sk_buff *skb) +{ + unsigned int hlen = skb_network_offset(skb); + struct ovs_frag_data *data; + + data = this_cpu_ptr(&ovs_frag_data_storage); + data->dst = skb->_skb_refdst; + data->vport = vport; + data->cb = *OVS_CB(skb); + data->inner_protocol = skb->inner_protocol; + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + data->l2_len = hlen; + memcpy(&data->l2_data, skb->data, hlen); + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + skb_pull(skb, hlen); +} + +static void ovs_fragment(struct net *net, struct vport *vport, + struct sk_buff *skb, u16 mru, __be16 ethertype) +{ + if (skb_network_offset(skb) > MAX_L2_LEN) { + OVS_NLERR(1, "L2 header too long to fragment"); + goto err; + } + + if (ethertype == htons(ETH_P_IP)) { + struct dst_entry ovs_dst; + unsigned long orig_dst; + + prepare_frag(vport, skb); + dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_dst); + IPCB(skb)->frag_max_size = mru; + + ip_do_fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else if (ethertype == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + unsigned long orig_dst; + struct rt6_info ovs_rt; + + if (!v6ops) { + goto err; + } + + 
prepare_frag(vport, skb); + memset(&ovs_rt, 0, sizeof(ovs_rt)); + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_rt.dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_rt.dst); + IP6CB(skb)->frag_max_size = mru; + + v6ops->fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else { + WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", + ovs_vport_name(vport), ntohs(ethertype), mru, + vport->dev->mtu); + goto err; + } + + return; +err: + kfree_skb(skb); +} + +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, + struct sw_flow_key *key) { struct vport *vport = ovs_vport_rcu(dp, out_port); - if (likely(vport)) - ovs_vport_send(vport, skb); - else + if (likely(vport)) { + u16 mru = OVS_CB(skb)->mru; + + if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { + ovs_vport_send(vport, skb); + } else if (mru <= vport->dev->mtu) { + struct net *net = read_pnet(&dp->net); + __be16 ethertype = key->eth.type; + + if (!is_flow_key_valid(key)) { + if (eth_p_mpls(skb->protocol)) + ethertype = skb->inner_protocol; + else + ethertype = vlan_get_protocol(skb); + } + + ovs_fragment(net, vport, skb, mru, ethertype); + } else { + kfree_skb(skb); + } + } else { kfree_skb(skb); + } } static int output_userspace(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) { - struct ovs_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.userdata = NULL; - upcall.portid = 0; - upcall.egress_tun_info = NULL; + upcall.mru = OVS_CB(skb)->mru; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -639,11 +796,18 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, if 
(vport) { int err; - err = ovs_vport_get_egress_tun_info(vport, skb, - &info); + err = dev_fill_metadata_dst(vport->dev, skb); if (!err) - upcall.egress_tun_info = &info; + upcall.egress_tun_info = skb_tunnel_info(skb); } + + break; + } + + case OVS_USERSPACE_ATTR_ACTIONS: { + /* Include actions. */ + upcall.actions = actions; + upcall.actions_len = actions_len; break; } @@ -654,7 +818,8 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, } static int sample(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) { const struct nlattr *acts_list = NULL; const struct nlattr *a; @@ -662,9 +827,12 @@ static int sample(struct datapath *dp, struct sk_buff *skb, for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { + u32 probability; + switch (nla_type(a)) { case OVS_SAMPLE_ATTR_PROBABILITY: - if (prandom_u32() >= nla_get_u32(a)) + probability = nla_get_u32(a); + if (!probability || prandom_u32() > probability) return 0; break; @@ -688,7 +856,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, */ if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && nla_is_last(a, rem))) - return output_userspace(dp, skb, key, a); + return output_userspace(dp, skb, key, a, actions, actions_len); skb = skb_clone(skb, GFP_ATOMIC); if (!skb) @@ -726,7 +894,11 @@ static int execute_set_action(struct sk_buff *skb, { /* Only tunnel set execution is supported without a mask. 
*/ if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { - OVS_CB(skb)->egress_tun_info = nla_data(a); + struct ovs_tunnel_info *tun = nla_data(a); + + skb_dst_drop(skb); + dst_hold((struct dst_entry *)tun->tun_dst); + skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); return 0; } @@ -744,12 +916,13 @@ static int execute_masked_set_action(struct sk_buff *skb, switch (nla_type(a)) { case OVS_KEY_ATTR_PRIORITY: - SET_MASKED(skb->priority, nla_get_u32(a), *get_mask(a, u32 *)); + OVS_SET_MASKED(skb->priority, nla_get_u32(a), + *get_mask(a, u32 *)); flow_key->phy.priority = skb->priority; break; case OVS_KEY_ATTR_SKB_MARK: - SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); + OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); flow_key->phy.skb_mark = skb->mark; break; @@ -792,6 +965,13 @@ static int execute_masked_set_action(struct sk_buff *skb, err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, __be32 *)); break; + + case OVS_KEY_ATTR_CT_STATE: + case OVS_KEY_ATTR_CT_ZONE: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABELS: + err = -EINVAL; + break; } return err; @@ -861,7 +1041,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); if (out_skb) - do_output(dp, out_skb, prev_port); + do_output(dp, out_skb, prev_port, key); prev_port = -1; } @@ -872,7 +1052,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, key, a); + output_userspace(dp, skb, key, a, attr, len); break; case OVS_ACTION_ATTR_HASH: @@ -916,7 +1096,22 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, key, a); + err = sample(dp, skb, key, a, attr, len); + break; + + case OVS_ACTION_ATTR_CT: + if (!is_flow_key_valid(key)) { + err = ovs_flow_key_update(skb, key); + if (err) + return err; + } + + err = 
ovs_ct_execute(ovs_dp_get_net(dp), skb, key, + nla_data(a)); + + /* Hide stolen IP fragments from user space. */ + if (err) + return err == -EINPROGRESS ? 0 : err; break; } @@ -927,7 +1122,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } if (prev_port != -1) - do_output(dp, skb, prev_port); + do_output(dp, skb, prev_port, key); else consume_skb(skb); @@ -969,7 +1164,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, int err; this_cpu_inc(exec_actions_level); - OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/kernel/net/openvswitch/conntrack.c b/kernel/net/openvswitch/conntrack.c new file mode 100644 index 000000000..e004067ec --- /dev/null +++ b/kernel/net/openvswitch/conntrack.c @@ -0,0 +1,790 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "conntrack.h" +#include "flow.h" +#include "flow_netlink.h" + +struct ovs_ct_len_tbl { + size_t maxlen; + size_t minlen; +}; + +/* Metadata mark for masked write to conntrack mark */ +struct md_mark { + u32 value; + u32 mask; +}; + +/* Metadata label for masked write to conntrack label. */ +struct md_labels { + struct ovs_key_ct_labels value; + struct ovs_key_ct_labels mask; +}; + +/* Conntrack action context for execution. 
*/ +struct ovs_conntrack_info { + struct nf_conntrack_helper *helper; + struct nf_conntrack_zone zone; + struct nf_conn *ct; + u8 commit : 1; + u16 family; + struct md_mark mark; + struct md_labels labels; +}; + +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); + +static u16 key_to_nfproto(const struct sw_flow_key *key) +{ + switch (ntohs(key->eth.type)) { + case ETH_P_IP: + return NFPROTO_IPV4; + case ETH_P_IPV6: + return NFPROTO_IPV6; + default: + return NFPROTO_UNSPEC; + } +} + +/* Map SKB connection state into the values used by flow definition. */ +static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) +{ + u8 ct_state = OVS_CS_F_TRACKED; + + switch (ctinfo) { + case IP_CT_ESTABLISHED_REPLY: + case IP_CT_RELATED_REPLY: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_REPLY_DIR; + break; + default: + break; + } + + switch (ctinfo) { + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + ct_state |= OVS_CS_F_ESTABLISHED; + break; + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + ct_state |= OVS_CS_F_RELATED; + break; + case IP_CT_NEW: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_NEW; + break; + default: + break; + } + + return ct_state; +} + +static u32 ovs_ct_get_mark(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + return ct ? ct->mark : 0; +#else + return 0; +#endif +} + +static void ovs_ct_get_labels(const struct nf_conn *ct, + struct ovs_key_ct_labels *labels) +{ + struct nf_conn_labels *cl = ct ? 
nf_ct_labels_find(ct) : NULL; + + if (cl) { + size_t len = cl->words * sizeof(long); + + if (len > OVS_CT_LABELS_LEN) + len = OVS_CT_LABELS_LEN; + else if (len < OVS_CT_LABELS_LEN) + memset(labels, 0, OVS_CT_LABELS_LEN); + memcpy(labels, cl->bits, len); + } else { + memset(labels, 0, OVS_CT_LABELS_LEN); + } +} + +static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, + const struct nf_conntrack_zone *zone, + const struct nf_conn *ct) +{ + key->ct.state = state; + key->ct.zone = zone->id; + key->ct.mark = ovs_ct_get_mark(ct); + ovs_ct_get_labels(ct, &key->ct.labels); +} + +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. + */ +static void ovs_ct_update_key(const struct sk_buff *skb, + const struct ovs_conntrack_info *info, + struct sw_flow_key *key, bool post_ct) +{ + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u8 state = 0; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + state = ovs_ct_get_state(ctinfo); + if (!nf_ct_is_confirmed(ct)) + state |= OVS_CS_F_NEW; + if (ct->master) + state |= OVS_CS_F_RELATED; + zone = nf_ct_zone(ct); + } else if (post_ct) { + state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; + if (info) + zone = &info->zone; + } + __ovs_ct_update_key(key, state, zone, ct); +} + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) +{ + ovs_ct_update_key(skb, NULL, key, false); +} + +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) +{ + if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + nla_put(skb, OVS_KEY_ATTR_CT_LABELS, 
sizeof(key->ct.labels), + &key->ct.labels)) + return -EMSGSIZE; + + return 0; +} + +static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, + u32 ct_mark, u32 mask) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u32 new_mark; + + + /* The connection could be invalid, in which case set_mark is no-op. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + new_mark = ct_mark | (ct->mark & ~(mask)); + if (ct->mark != new_mark) { + ct->mark = new_mark; + nf_conntrack_event_cache(IPCT_MARK, ct); + key->ct.mark = new_mark; + } + + return 0; +#else + return -ENOTSUPP; +#endif +} + +static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn_labels *cl; + struct nf_conn *ct; + int err; + + /* The connection could be invalid, in which case set_label is no-op.*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + cl = nf_ct_labels_find(ct); + if (!cl) { + nf_ct_labels_ext_add(ct); + cl = nf_ct_labels_find(ct); + } + if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN) + return -ENOSPC; + + err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, + OVS_CT_LABELS_LEN / sizeof(u32)); + if (err) + return err; + + ovs_ct_get_labels(ct, &key->ct.labels); + return 0; +} + +/* 'skb' should already be pulled to nh_ofs. 
*/ +static int ovs_ct_helper(struct sk_buff *skb, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + int ofs; + + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + protoff = ofs; + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + return helper->help(skb, protoff, ct, ctinfo); +} + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. 
+ */ +static int handle_fragments(struct net *net, struct sw_flow_key *key, + u16 zone, struct sk_buff *skb) +{ + struct ovs_skb_cb ovs_cb = *OVS_CB(skb); + + if (key->eth.type == htons(ETH_P_IP)) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + int err; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + err = ip_defrag(net, skb, user); + if (err) + return err; + + ovs_cb.mru = IPCB(skb)->frag_max_size; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + } else if (key->eth.type == htons(ETH_P_IPV6)) { + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + struct sk_buff *reasm; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + reasm = nf_ct_frag6_gather(net, skb, user); + if (!reasm) + return -EINPROGRESS; + + if (skb == reasm) { + kfree_skb(skb); + return -EINVAL; + } + + /* Don't free 'skb' even though it is one of the original + * fragments, as we're going to morph it into the head. + */ + skb_get(skb); + nf_ct_frag6_consume_orig(reasm); + + key->ip.proto = ipv6_hdr(reasm)->nexthdr; + skb_morph(skb, reasm); + skb->next = reasm->next; + consume_skb(reasm); + ovs_cb.mru = IP6CB(skb)->frag_max_size; +#endif + } else { + kfree_skb(skb); + return -EPFNOSUPPORT; + } + + key->ip.frag = OVS_FRAG_TYPE_NONE; + skb_clear_hash(skb); + skb->ignore_df = 1; + *OVS_CB(skb) = ovs_cb; + + return 0; +} + +static struct nf_conntrack_expect * +ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, + u16 proto, const struct sk_buff *skb) +{ + struct nf_conntrack_tuple tuple; + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) + return NULL; + return __nf_ct_expect_find(net, zone, &tuple); +} + +/* Determine whether skb->nfct is equal to the result of conntrack lookup. 
*/ +static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, + const struct ovs_conntrack_info *info) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) + return false; + if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) + return false; + if (info->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != info->helper) + return false; + } + + return true; +} + +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + /* If we are recirculating packets to match on conntrack fields and + * committing with a separate conntrack action, then we don't need to + * actually run the packet through conntrack twice unless it's for a + * different zone. + */ + if (!skb_nfct_cached(net, skb, info)) { + struct nf_conn *tmpl = info->ct; + + /* Associate skb with specified zone. */ + if (tmpl) { + if (skb->nfct) + nf_conntrack_put(skb->nfct); + nf_conntrack_get(&tmpl->ct_general); + skb->nfct = &tmpl->ct_general; + skb->nfctinfo = IP_CT_NEW; + } + + if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, + skb) != NF_ACCEPT) + return -ENOENT; + + if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + WARN_ONCE(1, "helper rejected packet"); + return -EINVAL; + } + } + + ovs_ct_update_key(skb, info, key, true); + + return 0; +} + +/* Lookup connection and read fields into key. 
*/ +static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nf_conntrack_expect *exp; + + exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); + if (exp) { + u8 state; + + state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; + __ovs_ct_update_key(key, state, &info->zone, exp->master); + } else { + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + } + + return 0; +} + +/* Lookup connection and confirm if unconfirmed. */ +static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + u8 state; + int err; + + state = key->ct.state; + if (key->ct.zone == info->zone.id && + ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) { + /* Previous lookup has shown that this connection is already + * tracked and committed. Skip committing. + */ + return 0; + } + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + if (nf_conntrack_confirm(skb) != NF_ACCEPT) + return -EINVAL; + + return 0; +} + +static bool labels_nonzero(const struct ovs_key_ct_labels *labels) +{ + size_t i; + + for (i = 0; i < sizeof(*labels); i++) + if (labels->ct_labels[i]) + return true; + + return false; +} + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. + */ +int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + int nh_ofs; + int err; + + /* The conntrack module expects to be working at L3. 
*/
+	nh_ofs = skb_network_offset(skb);
+	skb_pull(skb, nh_ofs);
+
+	if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
+		err = handle_fragments(net, key, info->zone.id, skb);
+		if (err)
+			return err;
+	}
+
+	if (info->commit)
+		err = ovs_ct_commit(net, key, info, skb);
+	else
+		err = ovs_ct_lookup(net, key, info, skb);
+	if (err)
+		goto err;
+
+	if (info->mark.mask) {
+		err = ovs_ct_set_mark(skb, key, info->mark.value,
+				      info->mark.mask);
+		if (err)
+			goto err;
+	}
+	if (labels_nonzero(&info->labels.mask))
+		err = ovs_ct_set_labels(skb, key, &info->labels.value,
+					&info->labels.mask);
+err:
+	skb_push(skb, nh_ofs);
+	if (err)
+		kfree_skb(skb);
+	return err;
+}
+
+/* Look up the conntrack helper 'name' for info->family / key->ip.proto,
+ * attach it to the template 'info->ct' and record it in 'info->helper'.
+ *
+ * Returns 0 on success, -EINVAL if no such helper is found, or -ENOMEM if
+ * the helper extension cannot be added (the module reference taken by the
+ * lookup is dropped again in that case).
+ */
+static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
+			     const struct sw_flow_key *key, bool log)
+{
+	struct nf_conntrack_helper *helper;
+	struct nf_conn_help *help;
+
+	helper = nf_conntrack_helper_try_module_get(name, info->family,
+						    key->ip.proto);
+	if (!helper) {
+		OVS_NLERR(log, "Unknown helper \"%s\"", name);
+		return -EINVAL;
+	}
+
+	help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
+	if (!help) {
+		module_put(helper->me);
+		return -ENOMEM;
+	}
+
+	rcu_assign_pointer(help->helper, helper);
+	info->helper = helper;
+	return 0;
+}
+
+/* Permitted payload length range for each OVS_CT_ATTR_* attribute, indexed
+ * by attribute type.  The table has exactly OVS_CT_ATTR_MAX + 1 entries, so
+ * an index must be checked against OVS_CT_ATTR_MAX before it is used.
+ */
+static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
+	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0, .maxlen = 0 },
+	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),
+				    .maxlen = sizeof(u16) },
+	[OVS_CT_ATTR_MARK]	= { .minlen = sizeof(struct md_mark),
+				    .maxlen = sizeof(struct md_mark) },
+	[OVS_CT_ATTR_LABELS]	= { .minlen = sizeof(struct md_labels),
+				    .maxlen = sizeof(struct md_labels) },
+	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
+				    .maxlen = NF_CT_HELPER_NAME_LEN }
+};
+
+/* Parse the nested OVS_CT_ATTR_* attributes in 'attr' into 'info'.
+ *
+ * Each attribute's type is validated against OVS_CT_ATTR_MAX and its length
+ * against ovs_ct_attr_lens[] before the payload is read.  If a helper name
+ * is supplied, '*helper' is pointed at the NUL-terminated name inside the
+ * attribute payload (no copy is made).
+ *
+ * Returns 0 on success or -EINVAL on any malformed, unknown or unsupported
+ * attribute, or if trailing bytes remain after the last attribute.
+ */
+static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
+		    const char **helper, bool log)
+{
+	struct nlattr *a;
+	int rem;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+		int maxlen;
+		int minlen;
+
+		/* 'type' is taken from the netlink attribute and has not been
+		 * validated yet: check it before indexing ovs_ct_attr_lens[],
+		 * otherwise an out-of-range type reads past the table.
+		 */
+		if (type > OVS_CT_ATTR_MAX) {
+			OVS_NLERR(log,
+				  "Unknown conntrack attr (type=%d, max=%d)",
+				  type, OVS_CT_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		maxlen = ovs_ct_attr_lens[type].maxlen;
+		minlen = ovs_ct_attr_lens[type].minlen;
+		if (nla_len(a) < minlen || nla_len(a) > maxlen) {
+			OVS_NLERR(log,
+				  "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)",
+				  type, nla_len(a), maxlen);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_CT_ATTR_COMMIT:
+			info->commit = true;
+			break;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+		case OVS_CT_ATTR_ZONE:
+			info->zone.id = nla_get_u16(a);
+			break;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+		case OVS_CT_ATTR_MARK: {
+			struct md_mark *mark = nla_data(a);
+
+			if (!mark->mask) {
+				OVS_NLERR(log, "ct_mark mask cannot be 0");
+				return -EINVAL;
+			}
+			info->mark = *mark;
+			break;
+		}
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+		case OVS_CT_ATTR_LABELS: {
+			struct md_labels *labels = nla_data(a);
+
+			if (!labels_nonzero(&labels->mask)) {
+				OVS_NLERR(log, "ct_labels mask cannot be 0");
+				return -EINVAL;
+			}
+			info->labels = *labels;
+			break;
+		}
+#endif
+		case OVS_CT_ATTR_HELPER:
+			*helper = nla_data(a);
+			/* The name must be NUL-terminated within the payload. */
+			if (!memchr(*helper, '\0', nla_len(a))) {
+				OVS_NLERR(log, "Invalid conntrack helper");
+				return -EINVAL;
+			}
+			break;
+		default:
+			OVS_NLERR(log, "Unknown conntrack attr (%d)",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Report whether the conntrack flow-key attribute 'attr' may be used.
+ * CT_STATE is always accepted; CT_ZONE and CT_MARK only when the matching
+ * CONFIG_NF_CONNTRACK_* option is enabled; CT_LABELS additionally requires
+ * that ovs_net->xt_label is set for 'net'.
+ */
+bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr)
+{
+	if (attr == OVS_KEY_ATTR_CT_STATE)
+		return true;
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+	    attr == OVS_KEY_ATTR_CT_ZONE)
+		return true;
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+	    attr == OVS_KEY_ATTR_CT_MARK)
+		return true;
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+	    attr == OVS_KEY_ATTR_CT_LABELS) {
+		struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+		return ovs_net->xt_label;
+	}
+
+	return false;
+}
+
+int ovs_ct_copy_action(struct net
*net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + struct ovs_conntrack_info ct_info; + const char *helper = NULL; + u16 family; + int err; + + family = key_to_nfproto(key); + if (family == NFPROTO_UNSPEC) { + OVS_NLERR(log, "ct family unspecified"); + return -EINVAL; + } + + memset(&ct_info, 0, sizeof(ct_info)); + ct_info.family = family; + + nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); + + err = parse_ct(attr, &ct_info, &helper, log); + if (err) + return err; + + /* Set up template for tracking connections in specific zones. */ + ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); + if (!ct_info.ct) { + OVS_NLERR(log, "Failed to allocate conntrack template"); + return -ENOMEM; + } + + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); + + if (helper) { + err = ovs_ct_add_helper(&ct_info, helper, key, log); + if (err) + goto err_free_ct; + } + + err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, + sizeof(ct_info), log); + if (err) + goto err_free_ct; + + return 0; +err_free_ct: + __ovs_ct_free_action(&ct_info); + return err; +} + +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CT); + if (!start) + return -EMSGSIZE; + + if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask && + nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), + &ct_info->mark)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + labels_nonzero(&ct_info->labels.mask) && + nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels), + &ct_info->labels)) + return -EMSGSIZE; + if 
(ct_info->helper) { + if (nla_put_string(skb, OVS_CT_ATTR_HELPER, + ct_info->helper->name)) + return -EMSGSIZE; + } + + nla_nest_end(skb, start); + + return 0; +} + +void ovs_ct_free_action(const struct nlattr *a) +{ + struct ovs_conntrack_info *ct_info = nla_data(a); + + __ovs_ct_free_action(ct_info); +} + +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) +{ + if (ct_info->helper) + module_put(ct_info->helper->me); + if (ct_info->ct) + nf_ct_put(ct_info->ct); +} + +void ovs_ct_init(struct net *net) +{ + unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (nf_connlabels_get(net, n_bits)) { + ovs_net->xt_label = false; + OVS_NLERR(true, "Failed to set connlabel length"); + } else { + ovs_net->xt_label = true; + } +} + +void ovs_ct_exit(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (ovs_net->xt_label) + nf_connlabels_put(net); +} diff --git a/kernel/net/openvswitch/conntrack.h b/kernel/net/openvswitch/conntrack.h new file mode 100644 index 000000000..a7544f405 --- /dev/null +++ b/kernel/net/openvswitch/conntrack.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */
+
+#ifndef OVS_CONNTRACK_H
+#define OVS_CONNTRACK_H 1
+
+#include "flow.h"
+
+struct ovs_conntrack_info;
+enum ovs_key_attr;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+void ovs_ct_init(struct net *);
+void ovs_ct_exit(struct net *);
+bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
+int ovs_ct_copy_action(struct net *, const struct nlattr *,
+		       const struct sw_flow_key *, struct sw_flow_actions **,
+		       bool log);
+int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *);
+
+int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
+		   const struct ovs_conntrack_info *);
+
+void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
+int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb);
+void ovs_ct_free_action(const struct nlattr *a);
+
+#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
+			   OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
+			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+#else
+#include <linux/errno.h>
+
+/* Conntrack is not built in: the entry points below are no-ops or fail with
+ * -ENOTSUPP.  The ovs_ct_execute() stub still frees the skb before failing.
+ */
+static inline void ovs_ct_init(struct net *net) { }
+
+static inline void ovs_ct_exit(struct net *net) { }
+
+static inline bool ovs_ct_verify(struct net *net, int attr)
+{
+	return false;
+}
+
+static inline int ovs_ct_copy_action(struct net *net, const struct nlattr *nla,
+				     const struct sw_flow_key *key,
+				     struct sw_flow_actions **acts, bool log)
+{
+	return -ENOTSUPP;
+}
+
+static inline int ovs_ct_action_to_attr(const struct ovs_conntrack_info *info,
+					struct sk_buff *skb)
+{
+	return -ENOTSUPP;
+}
+
+static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
+				 struct sw_flow_key *key,
+				 const struct ovs_conntrack_info *info)
+{
+	kfree_skb(skb);
+	return -ENOTSUPP;
+}
+
+static inline void ovs_ct_fill_key(const struct sk_buff *skb,
+				   struct sw_flow_key *key)
+{
+	key->ct.state = 0;
+	key->ct.zone = 0;
+	key->ct.mark = 0;
+	memset(&key->ct.labels, 0, sizeof(key->ct.labels));
+}
+
+static inline int ovs_ct_put_key(const struct sw_flow_key *key,
+				 struct sk_buff *skb)
+{
+
return 0; +} + +static inline void ovs_ct_free_action(const struct nlattr *a) { } + +#define CT_SUPPORTED_MASK 0 +#endif /* CONFIG_NF_CONNTRACK */ +#endif /* ovs_conntrack.h */ diff --git a/kernel/net/openvswitch/datapath.c b/kernel/net/openvswitch/datapath.c index 27e14962b..deadfdab1 100644 --- a/kernel/net/openvswitch/datapath.c +++ b/kernel/net/openvswitch/datapath.c @@ -91,8 +91,7 @@ static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, static void ovs_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info) { - genl_notify(family, skb, genl_info_net(info), info->snd_portid, - 0, info->nlhdr, GFP_KERNEL); + genl_notify(family, skb, info, 0, GFP_KERNEL); } /** @@ -176,7 +175,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); - return vport->ops->get_name(vport); + return ovs_vport_name(vport); } static int get_dpifindex(const struct datapath *dp) @@ -188,7 +187,7 @@ static int get_dpifindex(const struct datapath *dp) local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) - ifindex = netdev_vport_priv(local)->dev->ifindex; + ifindex = local->dev->ifindex; else ifindex = 0; @@ -272,10 +271,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_upcall_info upcall; int error; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.userdata = NULL; upcall.portid = ovs_vport_find_upcall_portid(p, skb); - upcall.egress_tun_info = NULL; + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall); if (unlikely(error)) kfree_skb(skb); @@ -337,12 +336,10 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; - struct ovs_skb_cb ovs_cb; int err; - ovs_cb = *OVS_CB(skb); + 
BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); - *OVS_CB(skb) = ovs_cb; if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) @@ -360,7 +357,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, /* Queue all of the segments. */ skb = segs; do { - *OVS_CB(skb) = ovs_cb; if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; @@ -397,9 +393,27 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, if (upcall_info->egress_tun_info) size += nla_total_size(ovs_tun_key_attr_size()); + /* OVS_PACKET_ATTR_ACTIONS */ + if (upcall_info->actions_len) + size += nla_total_size(upcall_info->actions_len); + + /* OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) + size += nla_total_size(sizeof(upcall_info->mru)); + return size; } +static void pad_packet(struct datapath *dp, struct sk_buff *skb) +{ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(skb->len) - skb->len; + + if (plen > 0) + memset(skb_put(skb, plen), 0, plen); + } +} + static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info) @@ -472,12 +486,33 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (upcall_info->egress_tun_info) { nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); - err = ovs_nla_put_egress_tunnel_key(user_skb, - upcall_info->egress_tun_info); + err = ovs_nla_put_tunnel_info(user_skb, + upcall_info->egress_tun_info); BUG_ON(err); nla_nest_end(user_skb, nla); } + if (upcall_info->actions_len) { + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); + err = ovs_nla_put_actions(upcall_info->actions, + upcall_info->actions_len, + user_skb); + if (!err) + nla_nest_end(user_skb, nla); + else + nla_nest_cancel(user_skb, nla); + } + + /* Add OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) { + if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, + 
upcall_info->mru)) { + err = -ENOBUFS; + goto out; + } + pad_packet(dp, user_skb); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { @@ -491,12 +526,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ - if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { - size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; - - if (plen > 0) - memset(skb_put(user_skb, plen), 0, plen); - } + pad_packet(dp, user_skb); ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; @@ -513,6 +543,7 @@ out: static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; @@ -521,6 +552,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct ethhdr *eth; struct vport *input_vport; + u16 mru = 0; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; @@ -545,34 +577,40 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) /* Normally, setting the skb 'protocol' field would be handled by a * call to eth_type_trans(), but it assumes there's a sending * device, which we may not have. */ - if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(eth->h_proto)) packet->protocol = eth->h_proto; else packet->protocol = htons(ETH_P_802_2); + /* Set packet's mru */ + if (a[OVS_PACKET_ATTR_MRU]) { + mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); + packet->ignore_df = 1; + } + OVS_CB(packet)->mru = mru; + /* Build an sw_flow for sending this packet. 
*/ flow = ovs_flow_alloc(); err = PTR_ERR(flow); if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, - &flow->key, log); + err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], + packet, &flow->key, log); if (err) goto err_flow_free; - err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], + err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts, log); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); - OVS_CB(packet)->egress_tun_info = NULL; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); - dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; @@ -584,6 +622,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!input_vport) goto err_unlock; + packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); @@ -610,6 +649,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, }; static const struct genl_ops dp_packet_genl_ops[] = { @@ -699,7 +739,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, /* OVS_FLOW_ATTR_ACTIONS */ if (should_fill_actions(ufid_flags)) - len += nla_total_size(acts->actions_len); + len += nla_total_size(acts->orig_len); return len + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ @@ -866,6 +906,7 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header 
*ovs_header = info->userhdr; struct sw_flow *flow = NULL, *new_flow; @@ -901,7 +942,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Extract key. */ ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; @@ -915,8 +956,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_kfree_flow; /* Validate actions. */ - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - &acts, log); + error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], + &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); goto err_kfree_flow; @@ -930,7 +971,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; @@ -1004,7 +1045,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_unlock(); - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } @@ -1016,7 +1057,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); err_kfree_flow: ovs_flow_free(new_flow, false); error: @@ -1024,7 +1065,8 @@ error: } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. 
*/ -static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, +static struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask, bool log) @@ -1034,7 +1076,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, int error; ovs_flow_mask_key(&masked_key, key, true, mask); - error = ovs_nla_copy_actions(a, &masked_key, &acts, log); + error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, "Actions may not be safe on all matching packets"); @@ -1046,6 +1088,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct sw_flow_key key; @@ -1070,15 +1113,15 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; /* Validate actions. 
*/ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, - log); + acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, + &mask, log); if (IS_ERR(acts)) { error = PTR_ERR(acts); goto error; @@ -1094,7 +1137,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; @@ -1129,7 +1172,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) info, OVS_FLOW_CMD_NEW, false, ufid_flags); - if (unlikely(IS_ERR(reply))) { + if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_unlock_ovs; } @@ -1143,7 +1186,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); if (old_acts) - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); return 0; @@ -1151,7 +1194,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); error: return error; } @@ -1160,6 +1203,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow; @@ -1174,7 +1218,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { OVS_NLERR(log, @@ -1218,6 +1262,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct 
ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow = NULL; @@ -1232,8 +1277,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, - log); + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + NULL, log); if (unlikely(err)) return err; } @@ -1786,7 +1831,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, - vport->ops->get_name(vport))) + ovs_vport_name(vport))) goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); @@ -2189,6 +2234,7 @@ static int __net_init ovs_init_net(struct net *net) INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + ovs_ct_init(net); return 0; } @@ -2205,13 +2251,10 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, struct vport *vport; hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) continue; - netdev_vport = netdev_vport_priv(vport); - if (dev_net(netdev_vport->dev) == dnet) + if (dev_net(vport->dev) == dnet) list_add(&vport->detach_list, head); } } @@ -2226,6 +2269,7 @@ static void __net_exit ovs_exit_net(struct net *dnet) struct net *net; LIST_HEAD(head); + ovs_ct_exit(dnet); ovs_lock(); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); diff --git a/kernel/net/openvswitch/datapath.h b/kernel/net/openvswitch/datapath.h index 4ec4a480b..67bdecd9f 100644 --- a/kernel/net/openvswitch/datapath.h +++ 
b/kernel/net/openvswitch/datapath.h @@ -25,10 +25,11 @@ #include #include #include +#include +#include "conntrack.h" #include "flow.h" #include "flow_table.h" -#include "vport.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 @@ -92,14 +93,14 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB - * @egress_tun_key: Tunnel information about this packet on egress path. - * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. + * @mru: The maximum received fragment size; 0 if the packet is not + * fragmented. */ struct ovs_skb_cb { - struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; + u16 mru; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -112,12 +113,16 @@ struct ovs_skb_cb { * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. + * @mru: If not zero, Maximum received IP fragment size. */ struct dp_upcall_info { - const struct ovs_tunnel_info *egress_tun_info; + struct ip_tunnel_info *egress_tun_info; const struct nlattr *userdata; + const struct nlattr *actions; + int actions_len; u32 portid; u8 cmd; + u16 mru; }; /** @@ -128,7 +133,9 @@ struct dp_upcall_info { struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; - struct vport_net vport_net; + + /* Module reference for configuring conntrack. */ + bool xt_label; }; extern int ovs_net_id; @@ -197,6 +204,10 @@ void ovs_dp_notify_wq(struct work_struct *work); int action_fifos_init(void); void action_fifos_exit(void); +/* 'KEY' must not have any bits set outside of the 'MASK' */ +#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) +#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) + #define OVS_NLERR(logging_allowed, fmt, ...)
\ do { \ if (logging_allowed && net_ratelimit()) \ diff --git a/kernel/net/openvswitch/dp_notify.c b/kernel/net/openvswitch/dp_notify.c index 2c631fe76..653d073ba 100644 --- a/kernel/net/openvswitch/dp_notify.c +++ b/kernel/net/openvswitch/dp_notify.c @@ -58,13 +58,10 @@ void ovs_dp_notify_wq(struct work_struct *work) struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - - if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) + if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; - netdev_vport = netdev_vport_priv(vport); - if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)) + if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) dp_detach_port_notify(vport); } } diff --git a/kernel/net/openvswitch/flow.c b/kernel/net/openvswitch/flow.c index 2dacc7b5a..0ea128eee 100644 --- a/kernel/net/openvswitch/flow.c +++ b/kernel/net/openvswitch/flow.c @@ -46,9 +46,11 @@ #include #include +#include "conntrack.h" #include "datapath.h" #include "flow.h" #include "flow_netlink.h" +#include "vport.h" u64 ovs_flow_used_time(unsigned long flow_jiffies) { @@ -271,8 +273,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ipv6.addr.dst = nh->daddr; payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); - if (unlikely(payload_ofs < 0)) - return -EINVAL; if (frag_off) { if (frag_off & htons(~0x7)) @@ -283,6 +283,13 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_NONE; } + /* Delayed handling of error in ipv6_skip_exthdr() as it + * always sets frag_off to a valid value which may be + * used to set key->ip.frag above. 
+ */ + if (unlikely(payload_ofs < 0)) + return -EPROTO; + nh_len = payload_ofs - nh_ofs; skb_set_transport_header(skb, nh_ofs + nh_len); key->ip.proto = nexthdr; @@ -332,7 +339,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) proto = *(__be16 *) skb->data; __skb_pull(skb, sizeof(__be16)); - if (ntohs(proto) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(proto)) return proto; if (skb->len < sizeof(struct llc_snap_hdr)) @@ -349,7 +356,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) __skb_pull(skb, sizeof(struct llc_snap_hdr)); - if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(llc->ethertype)) return llc->ethertype; return htons(ETH_P_802_2); @@ -622,12 +629,16 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { - memset(&key->ip, 0, sizeof(key->ip)); - memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); - if (nh_len == -EINVAL) { + switch (nh_len) { + case -EINVAL: + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); + /* fall-through */ + case -EPROTO: skb->transport_header = skb->network_header; error = 0; - } else { + break; + default: error = nh_len; } return error; @@ -682,24 +693,27 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. 
*/ if (tun_info) { - memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); + key->tun_proto = ip_tunnel_info_af(tun_info); + memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); - if (tun_info->options) { + if (tun_info->options_len) { BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * 8)) - 1 > sizeof(key->tun_opts)); - memcpy(TUN_METADATA_OPTS(key, tun_info->options_len), - tun_info->options, tun_info->options_len); + + ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len), + tun_info); key->tun_opts_len = tun_info->options_len; } else { key->tun_opts_len = 0; } } else { + key->tun_proto = 0; key->tun_opts_len = 0; memset(&key->tun_key, 0, sizeof(key->tun_key)); } @@ -707,13 +721,14 @@ int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; + ovs_ct_fill_key(skb, key); key->ovs_flow_hash = 0; key->recirc_id = 0; return key_extract(skb, key); } -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log) { @@ -722,7 +737,7 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr, memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(attr, key, log); + err = ovs_nla_get_flow_metadata(net, attr, key, log); if (err) return err; diff --git a/kernel/net/openvswitch/flow.h b/kernel/net/openvswitch/flow.h index a076e445c..1d055c559 100644 --- a/kernel/net/openvswitch/flow.h +++ b/kernel/net/openvswitch/flow.h @@ -32,31 +32,11 @@ #include #include #include +#include +#include struct sk_buff; -/* Used to memset ovs_key_ipv4_tunnel padding. 
*/ -#define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) - -struct ovs_key_ipv4_tunnel { - __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; - __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; - __be16 tp_src; - __be16 tp_dst; -} __packed __aligned(4); /* Minimize padding. */ - -struct ovs_tunnel_info { - struct ovs_key_ipv4_tunnel tunnel; - const void *options; - u8 options_len; -}; - /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. @@ -66,54 +46,9 @@ struct ovs_tunnel_info { #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) -static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - tun_info->tunnel.tun_id = tun_id; - tun_info->tunnel.ipv4_src = saddr; - tun_info->tunnel.ipv4_dst = daddr; - tun_info->tunnel.ipv4_tos = tos; - tun_info->tunnel.ipv4_ttl = ttl; - tun_info->tunnel.tun_flags = tun_flags; - - /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of - * the upper tunnel are used. - * E.g: GRE over IPSEC, the tp_src and tp_port are zero. - */ - tun_info->tunnel.tp_src = tp_src; - tun_info->tunnel.tp_dst = tp_dst; - - /* Clear struct padding. 
*/ - if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) - memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, - 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); - - tun_info->options = opts; - tun_info->options_len = opts_len; -} - -static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - const struct iphdr *iph, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, - iph->tos, iph->ttl, - tp_src, tp_dst, - tun_id, tun_flags, - opts, opts_len); -} +struct ovs_tunnel_info { + struct metadata_dst *tun_dst; +}; #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ @@ -122,12 +57,13 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, struct sw_flow_key { u8 tun_opts[255]; u8 tun_opts_len; - struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ + struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u8 tun_proto; /* Protocol of encapsulating tunnel. */ u32 ovs_flow_hash; /* Datapath computed hash value. */ u32 recirc_id; /* Recirculation ID. */ struct { @@ -176,6 +112,14 @@ struct sw_flow_key { } nd; } ipv6; }; + struct { + /* Connection tracking fields. */ + u16 zone; + u32 mark; + u8 state; + struct ovs_key_ct_labels labels; + } ct; + } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. 
*/ struct sw_flow_key_range { @@ -209,6 +153,7 @@ struct sw_flow_id { struct sw_flow_actions { struct rcu_head rcu; + size_t orig_len; /* From flow_cmd_new netlink actions size */ u32 actions_len; struct nlattr actions[]; }; @@ -273,11 +218,11 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log); diff --git a/kernel/net/openvswitch/flow_netlink.c b/kernel/net/openvswitch/flow_netlink.c index c691b1a1e..d1bd4a45c 100644 --- a/kernel/net/openvswitch/flow_netlink.c +++ b/kernel/net/openvswitch/flow_netlink.c @@ -47,9 +47,9 @@ #include #include #include +#include #include "flow_netlink.h" -#include "vport-vxlan.h" struct ovs_len_tbl { int len; @@ -57,6 +57,7 @@ struct ovs_len_tbl { }; #define OVS_ATTR_NESTED -1 +#define OVS_ATTR_VARIABLE -2 static void update_range(struct sw_flow_match *match, size_t offset, size_t size, bool is_mask) @@ -261,8 +262,8 @@ size_t ovs_tun_key_attr_size(void) * updating this function. 
*/ return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ @@ -281,7 +282,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -290,6 +291,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_STATE */ + + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -300,6 +305,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(28); /* OVS_KEY_ATTR_ND */ } +static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, +}; + static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, @@ -311,8 +320,11 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_OAM] = { .len 
= 0 }, - [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED }, - [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, + .next = ovs_vxlan_ext_key_lens }, + [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -339,8 +351,19 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, + [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, }; +static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) +{ + return expected_len == attr_len || + expected_len == OVS_ATTR_NESTED || + expected_len == OVS_ATTR_VARIABLE; +} + static bool is_all_zero(const u8 *fp, size_t size) { int i; @@ -380,7 +403,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } expected_len = ovs_key_lens[type].len; - if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(nla), expected_len)) { OVS_NLERR(log, "Key %d has unexpected len %d expected %d", type, nla_len(nla), expected_len); return -EINVAL; @@ -465,29 +488,50 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a, return 0; } -static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 }, -}; - -static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, +static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, 
bool log) { - struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; + struct nlattr *a; + int rem; unsigned long opt_key_offset; - struct ovs_vxlan_opts opts; - int err; + struct vxlan_metadata opts; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy); - if (err < 0) - return err; - memset(&opts, 0, sizeof(opts)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); - if (tb[OVS_VXLAN_EXT_GBP]) - opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]); + if (type > OVS_VXLAN_EXT_MAX) { + OVS_NLERR(log, "VXLAN extension %d out of range max %d", + type, OVS_VXLAN_EXT_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_vxlan_ext_key_lens[type].len)) { + OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", + type, nla_len(a), + ovs_vxlan_ext_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_VXLAN_EXT_GBP: + opts.gbp = nla_get_u32(a); + break; + default: + OVS_NLERR(log, "Unknown VXLAN extension attribute %d", + type); + return -EINVAL; + } + } + if (rem) { + OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", + rem); + return -EINVAL; + } if (!is_mask) SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); @@ -500,15 +544,15 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, return 0; } -static int ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask, - bool log) +static int ip_tun_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) { - struct nlattr *a; - int rem; - bool ttl = false; + bool ttl = false, ipv4 = false, ipv6 = false; __be16 tun_flags = 0; int opts_type = 0; + struct nlattr *a; + int rem; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -520,8 +564,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type].len != nla_len(a) && - 
ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(a), + ovs_tunnel_key_lens[type].len)) { OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", type, nla_len(a), ovs_tunnel_key_lens[type].len); return -EINVAL; @@ -534,19 +578,31 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_KEY; break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, nla_get_in_addr(a), is_mask); + ipv4 = true; break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, nla_get_in_addr(a), is_mask); + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_DST: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; break; case OVS_TUNNEL_KEY_ATTR_TOS: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + SW_FLOW_KEY_PUT(match, tun_key.tos, nla_get_u8(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TTL: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + SW_FLOW_KEY_PUT(match, tun_key.ttl, nla_get_u8(a), is_mask); ttl = true; break; @@ -594,28 +650,46 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, opts_type = type; break; default: - OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d", + OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); return -EINVAL; } } SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + if (is_mask) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); + else + SW_FLOW_KEY_PUT(match, tun_proto, ipv6 ? 
AF_INET6 : AF_INET, + false); if (rem > 0) { - OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.", + OVS_NLERR(log, "IP tunnel attribute has %d unknown bytes.", rem); return -EINVAL; } + if (ipv4 && ipv6) { + OVS_NLERR(log, "Mixed IPv4 and IPv6 tunnel attributes"); + return -EINVAL; + } + if (!is_mask) { - if (!match->key->tun_key.ipv4_dst) { + if (!ipv4 && !ipv6) { + OVS_NLERR(log, "IP tunnel dst address not specified"); + return -EINVAL; + } + if (ipv4 && !match->key->tun_key.u.ipv4.dst) { OVS_NLERR(log, "IPv4 tunnel dst address is zero"); return -EINVAL; } + if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) { + OVS_NLERR(log, "IPv6 tunnel dst address is zero"); + return -EINVAL; + } if (!ttl) { - OVS_NLERR(log, "IPv4 tunnel TTL not specified."); + OVS_NLERR(log, "IP tunnel TTL not specified."); return -EINVAL; } } @@ -626,7 +700,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, static int vxlan_opt_to_nlattr(struct sk_buff *skb, const void *tun_opts, int swkey_tun_opts_len) { - const struct ovs_vxlan_opts *opts = tun_opts; + const struct vxlan_metadata *opts = tun_opts; struct nlattr *nla; nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); @@ -640,25 +714,40 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, return 0; } -static int __ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, - const void *tun_opts, int swkey_tun_opts_len) +static int __ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) { if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (output->ipv4_src && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, - output->ipv4_src)) - return -EMSGSIZE; - if (output->ipv4_dst && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, - output->ipv4_dst)) - return -EMSGSIZE; - if (output->ipv4_tos && - 
nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + switch (tun_proto) { + case AF_INET: + if (output->u.ipv4.src && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, + output->u.ipv4.src)) + return -EMSGSIZE; + if (output->u.ipv4.dst && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, + output->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&output->u.ipv6.src) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_SRC, + &output->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&output->u.ipv6.dst) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_DST, + &output->u.ipv6.dst)) + return -EMSGSIZE; + break; + } + if (output->tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) @@ -675,7 +764,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; - if (tun_opts) { + if (swkey_tun_opts_len) { if (output->tun_flags & TUNNEL_GENEVE_OPT && nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, swkey_tun_opts_len, tun_opts)) @@ -688,9 +777,10 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, - const void *tun_opts, int swkey_tun_opts_len) +static int ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) { struct nlattr *nla; int err; @@ -699,7 +789,8 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, if (!nla) return -EMSGSIZE; - err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + err = __ip_tun_to_nlattr(skb, output, 
tun_opts, swkey_tun_opts_len, + tun_proto); if (err) return err; @@ -707,17 +798,18 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ovs_tunnel_info *egress_tun_info) +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, - egress_tun_info->options, - egress_tun_info->options_len); + return __ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info)); } -static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 *attrs, const struct nlattr **a, + bool is_mask, bool log) { if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); @@ -763,21 +855,58 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); } if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask, log) < 0) + if (ip_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask, log) < 0) return -EINVAL; *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } + + if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { + u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]); + + if (ct_state & ~CT_SUPPORTED_MASK) { + OVS_NLERR(log, "ct_state flags %08x unsupported", + ct_state); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { + u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); + + SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); + *attrs &= ~(1ULL << 
OVS_KEY_ATTR_CT_ZONE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) { + u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); + + SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_LABELS) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABELS)) { + const struct ovs_key_ct_labels *cl; + + cl = nla_data(a[OVS_KEY_ATTR_CT_LABELS]); + SW_FLOW_KEY_MEMCPY(match, ct.labels, cl->ct_labels, + sizeof(*cl), is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); + } return 0; } -static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 attrs, const struct nlattr **a, + bool is_mask, bool log) { int err; - err = metadata_from_nlattrs(match, &attrs, a, is_mask, log); + err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log); if (err) return err; @@ -816,7 +945,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, if (is_mask) { /* Always exact match EtherType. 
*/ eth_type = htons(0xffff); - } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + } else if (!eth_proto_is_802_3(eth_type)) { OVS_NLERR(log, "EtherType %x is less than min %x", ntohs(eth_type), ETH_P_802_3_MIN); return -EINVAL; @@ -1012,10 +1141,16 @@ static void nlattr_set(struct nlattr *attr, u8 val, /* The nlattr stream should already have been validated */ nla_for_each_nested(nla, attr, rem) { - if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED) - nlattr_set(nla, val, tbl[nla_type(nla)].next); - else + if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) { + if (tbl[nla_type(nla)].next) + tbl = tbl[nla_type(nla)].next; + nlattr_set(nla, val, tbl); + } else { memset(nla_data(nla), val, nla_len(nla)); + } + + if (nla_type(nla) == OVS_KEY_ATTR_CT_STATE) + *(u32 *)nla_data(nla) &= CT_SUPPORTED_MASK; } } @@ -1029,6 +1164,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * mask. In case the 'mask' is NULL, the flow is treated as exact match * flow. Otherwise, it is treated as a wildcarded flow, except the mask * does not include any don't care bit. + * @net: Used to determine per-namespace field support. * @match: receives the extracted flow match information. * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. The fields should of the packet that triggered the creation @@ -1039,7 +1175,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. 
*/ -int ovs_nla_get_match(struct sw_flow_match *match, +int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, const struct nlattr *nla_key, const struct nlattr *nla_mask, bool log) @@ -1089,7 +1225,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, key_attrs, a, false, log); + err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); if (err) return err; @@ -1116,7 +1252,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, /* The userspace does not send tunnel attributes that * are 0, but we should not wildcard them nonetheless. */ - if (match->key->tun_key.ipv4_dst) + if (match->key->tun_proto) SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); @@ -1169,7 +1305,8 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log); + err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, + log); if (err) goto free_newmask; } @@ -1250,7 +1387,7 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) * extracted from the packet itself. 
*/ -int ovs_nla_get_flow_metadata(const struct nlattr *attr, +int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, struct sw_flow_key *key, bool log) { @@ -1266,9 +1403,10 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, memset(&match, 0, sizeof(match)); match.key = key; + memset(&key->ct, 0, sizeof(key->ct)); key->phy.in_port = DP_MAX_PORTS; - return metadata_from_nlattrs(&match, &attrs, a, false, log); + return metadata_from_nlattrs(net, &match, &attrs, a, false, log); } static int __ovs_nla_put_key(const struct sw_flow_key *swkey, @@ -1287,14 +1425,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask)) { + if ((swkey->tun_proto || is_mask)) { const void *opts = NULL; if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); - if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, - swkey->tun_opts_len)) + if (ip_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len, swkey->tun_proto)) goto nla_put_failure; } @@ -1314,6 +1452,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; + if (ovs_ct_put_key(output, skb)) + goto nla_put_failure; + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); if (!nla) goto nla_put_failure; @@ -1548,11 +1689,51 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) return sfa; } +static void ovs_nla_free_set_action(const struct nlattr *a) +{ + const struct nlattr *ovs_key = nla_data(a); + struct ovs_tunnel_info *ovs_tun; + + switch (nla_type(ovs_key)) { + case OVS_KEY_ATTR_TUNNEL_INFO: + ovs_tun = nla_data(ovs_key); + dst_release((struct dst_entry *)ovs_tun->tun_dst); + break; + } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + const 
struct nlattr *a; + int rem; + + if (!sf_acts) + return; + + nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + switch (nla_type(a)) { + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; + case OVS_ACTION_ATTR_CT: + ovs_ct_free_action(a); + break; + } + } + + kfree(sf_acts); +} + +static void __ovs_nla_free_flow_actions(struct rcu_head *head) +{ + ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu)); +} + /* Schedules 'sf_acts' to be freed after the next RCU grace period. * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts) { - kfree_rcu(sf_acts, rcu); + call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions); } static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, @@ -1582,6 +1763,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); acts->actions_len = (*sfa)->actions_len; + acts->orig_len = (*sfa)->orig_len; kfree(*sfa); *sfa = acts; @@ -1609,8 +1791,8 @@ static struct nlattr *__add_action(struct sw_flow_actions **sfa, return a; } -static int add_action(struct sw_flow_actions **sfa, int attrtype, - void *data, int len, bool log) +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data, + int len, bool log) { struct nlattr *a; @@ -1625,7 +1807,7 @@ static inline int add_nested_action_start(struct sw_flow_actions **sfa, int used = (*sfa)->actions_len; int err; - err = add_action(sfa, attrtype, NULL, 0, log); + err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log); if (err) return err; @@ -1641,12 +1823,12 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr 
*attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log); -static int validate_and_copy_sample(const struct nlattr *attr, +static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) @@ -1678,15 +1860,15 @@ static int validate_and_copy_sample(const struct nlattr *attr, start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); if (start < 0) return start; - err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, - nla_data(probability), sizeof(u32), log); + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, + nla_data(probability), sizeof(u32), log); if (err) return err; st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); if (st_acts < 0) return st_acts; - err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, + err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa, eth_type, vlan_tci, log); if (err) return err; @@ -1746,12 +1928,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; - struct ovs_tunnel_info *tun_info; + struct metadata_dst *tun_dst; + struct ip_tunnel_info *tun_info; + struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; ovs_match_init(&match, &key, NULL); - opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); + opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) return opts_type; @@ -1771,27 +1955,33 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; + tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL); + if (!tun_dst) + return -ENOMEM; + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info) + key.tun_opts_len, log); - if (IS_ERR(a)) + sizeof(*ovs_tun), log); + if (IS_ERR(a)) { + 
dst_release((struct dst_entry *)tun_dst); return PTR_ERR(a); + } - tun_info = nla_data(a); - tun_info->tunnel = key.tun_key; - tun_info->options_len = key.tun_opts_len; + ovs_tun = nla_data(a); + ovs_tun->tun_dst = tun_dst; - if (tun_info->options_len) { - /* We need to store the options in the action itself since - * everything else will go away after flow setup. We can append - * it to tun_info and then point there. - */ - memcpy((tun_info + 1), - TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len); - tun_info->options = (tun_info + 1); - } else { - tun_info->options = NULL; - } + tun_info = &tun_dst->u.tun_info; + tun_info->mode = IP_TUNNEL_INFO_TX; + if (key.tun_proto == AF_INET6) + tun_info->mode |= IP_TUNNEL_INFO_IPV6; + tun_info->key = key.tun_key; + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + ip_tunnel_info_opts_set(tun_info, + TUN_METADATA_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); add_nested_action_end(*sfa, start); return err; @@ -1829,8 +2019,7 @@ static int validate_set(const struct nlattr *a, key_len /= 2; if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type].len != key_len && - ovs_key_lens[key_type].len != OVS_ATTR_NESTED)) + !check_attr_len(key_len, ovs_key_lens[key_type].len)) return -EINVAL; if (masked && !validate_masked(nla_data(ovs_key), key_len)) @@ -1843,6 +2032,8 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABELS: case OVS_KEY_ATTR_ETHERNET: break; @@ -2008,7 +2199,7 @@ static int copy_action(const struct nlattr *from, return 0; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool 
log) @@ -2032,7 +2223,8 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_SET] = (u32)-1, [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, - [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), + [OVS_ACTION_ATTR_CT] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2139,13 +2331,20 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa, + err = validate_and_copy_sample(net, a, key, depth, sfa, eth_type, vlan_tci, log); if (err) return err; skip_copy = true; break; + case OVS_ACTION_ATTR_CT: + err = ovs_ct_copy_action(net, a, key, sfa, log); + if (err) + return err; + skip_copy = true; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -2164,7 +2363,7 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, } /* 'key' must be the masked key. 
*/ -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log) { @@ -2174,10 +2373,11 @@ int ovs_nla_copy_actions(const struct nlattr *attr, if (IS_ERR(*sfa)) return PTR_ERR(*sfa); - err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, + (*sfa)->orig_len = nla_len(attr); + err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) - kfree(*sfa); + ovs_nla_free_flow_actions(*sfa); return err; } @@ -2227,16 +2427,17 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) switch (key_type) { case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, - tun_info->options_len ? 
- tun_info->options : NULL, - tun_info->options_len); + err = ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info)); if (err) return err; nla_nest_end(skb, start); @@ -2298,6 +2499,13 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) if (err) return err; break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_action_to_attr(nla_data(a), skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/kernel/net/openvswitch/flow_netlink.h b/kernel/net/openvswitch/flow_netlink.h index 5c3d75bff..47dd142ec 100644 --- a/kernel/net/openvswitch/flow_netlink.h +++ b/kernel/net/openvswitch/flow_netlink.h @@ -45,29 +45,34 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *, - bool log); +int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, + struct sw_flow_key *, bool log); int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); -int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, - const struct nlattr *mask, bool log); -int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ovs_tunnel_info *); +int ovs_nla_get_match(struct net *, struct sw_flow_match *, + const struct nlattr *key, const struct nlattr *mask, + bool log); + +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, const struct sw_flow_key *key, bool log); u32 
ovs_nla_get_ufid_flags(const struct nlattr *attr); -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log); +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len, bool log); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); void ovs_nla_free_flow_actions(struct sw_flow_actions *); +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); #endif /* flow_netlink.h */ diff --git a/kernel/net/openvswitch/flow_table.c b/kernel/net/openvswitch/flow_table.c index aa349514e..d073fff82 100644 --- a/kernel/net/openvswitch/flow_table.c +++ b/kernel/net/openvswitch/flow_table.c @@ -18,6 +18,7 @@ #include "flow.h" #include "datapath.h" +#include "flow_netlink.h" #include #include #include @@ -92,7 +93,8 @@ struct sw_flow *ovs_flow_alloc(void) /* Initialize the default stat node. */ stats = kmem_cache_alloc_node(flow_stats_cache, - GFP_KERNEL | __GFP_ZERO, 0); + GFP_KERNEL | __GFP_ZERO, + node_online(0) ? 
0 : NUMA_NO_NODE); if (!stats) goto err; @@ -144,7 +146,8 @@ static void flow_free(struct sw_flow *flow) if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); - kfree((struct sw_flow_actions __force *)flow->sf_acts); + if (flow->sf_acts) + ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); for_each_node(node) if (flow->stats[node]) kmem_cache_free(flow_stats_cache, @@ -425,7 +428,7 @@ static u32 flow_hash(const struct sw_flow_key *key, static int flow_key_start(const struct sw_flow_key *key) { - if (key->tun_key.ipv4_dst) + if (key->tun_proto) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), @@ -753,7 +756,7 @@ int ovs_flow_init(void) BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) - + (num_possible_nodes() + + (nr_node_ids * sizeof(struct flow_stats *)), 0, 0, NULL); if (flow_cache == NULL) diff --git a/kernel/net/openvswitch/vport-geneve.c b/kernel/net/openvswitch/vport-geneve.c index bf02fd580..e41cd12d9 100644 --- a/kernel/net/openvswitch/vport-geneve.c +++ b/kernel/net/openvswitch/vport-geneve.c @@ -26,113 +26,42 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" static struct vport_ops ovs_geneve_vport_ops; - /** * struct geneve_port - Keeps track of open UDP ports - * @gs: The socket created for this port number. - * @name: vport name. + * @dst_port: destination port. */ struct geneve_port { - struct geneve_sock *gs; - char name[IFNAMSIZ]; + u16 port_no; }; -static LIST_HEAD(geneve_ports); - static inline struct geneve_port *geneve_vport(const struct vport *vport) { return vport_priv(vport); } -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - -/* Convert 64 bit tunnel ID to 24 bit VNI. 
*/ -static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) -{ -#ifdef __BIG_ENDIAN - vni[0] = (__force __u8)(tun_id >> 16); - vni[1] = (__force __u8)(tun_id >> 8); - vni[2] = (__force __u8)tun_id; -#else - vni[0] = (__force __u8)((__force u64)tun_id >> 40); - vni[1] = (__force __u8)((__force u64)tun_id >> 48); - vni[2] = (__force __u8)((__force u64)tun_id >> 56); -#endif -} - -/* Convert 24 bit VNI to 64 bit tunnel ID. */ -static __be64 vni_to_tunnel_id(const __u8 *vni) -{ -#ifdef __BIG_ENDIAN - return (vni[0] << 16) | (vni[1] << 8) | vni[2]; -#else - return (__force __be64)(((__force u64)vni[0] << 40) | - ((__force u64)vni[1] << 48) | - ((__force u64)vni[2] << 56)); -#endif -} - -static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) -{ - struct vport *vport = gs->rcv_data; - struct genevehdr *geneveh = geneve_hdr(skb); - int opts_len; - struct ovs_tunnel_info tun_info; - __be64 key; - __be16 flags; - - opts_len = geneveh->opt_len * 4; - - flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | - (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | - (geneveh->oam ? TUNNEL_OAM : 0) | - (geneveh->critical ? 
TUNNEL_CRIT_OPT : 0); - - key = vni_to_tunnel_id(geneveh->vni); - - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, - geneveh->options, opts_len); - - ovs_vport_receive(vport, skb, &tun_info); -} - static int geneve_get_options(const struct vport *vport, struct sk_buff *skb) { struct geneve_port *geneve_port = geneve_vport(vport); - struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk); - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no)) return -EMSGSIZE; return 0; } -static void geneve_tnl_destroy(struct vport *vport) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - - geneve_sock_release(geneve_port->gs); - - ovs_vport_deferred_free(vport); -} - static struct vport *geneve_tnl_create(const struct vport_parms *parms) { struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; struct geneve_port *geneve_port; - struct geneve_sock *gs; + struct net_device *dev; struct vport *vport; struct nlattr *a; - int err; u16 dst_port; + int err; if (!options) { err = -EINVAL; @@ -154,106 +83,40 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return vport; geneve_port = geneve_vport(vport); - strncpy(geneve_port->name, parms->name, IFNAMSIZ); + geneve_port->port_no = dst_port; - gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); - if (IS_ERR(gs)) { + rtnl_lock(); + dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); + if (IS_ERR(dev)) { + rtnl_unlock(); ovs_vport_free(vport); - return (void *)gs; + return ERR_CAST(dev); } - geneve_port->gs = gs; + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; error: return ERR_PTR(err); } -static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +static struct vport *geneve_create(const struct vport_parms *parms) { - const struct 
ovs_key_ipv4_tunnel *tun_key; - struct ovs_tunnel_info *tun_info; - struct net *net = ovs_dp_get_net(vport->dp); - struct geneve_port *geneve_port = geneve_vport(vport); - __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; - __be16 sport; - struct rtable *rt; - struct flowi4 fl; - u8 vni[3], opts_len, *opts; - __be16 df; - int err; - - tun_info = OVS_CB(skb)->egress_tun_info; - if (unlikely(!tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &tun_info->tunnel; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - tunnel_id_to_vni(tun_key->tun_id, vni); - skb->ignore_df = 1; - - if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) { - opts = (u8 *)tun_info->options; - opts_len = tun_info->options_len; - } else { - opts = NULL; - opts_len = 0; - } - - err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, - tun_key->ipv4_dst, tun_key->ipv4_tos, - tun_key->ipv4_ttl, df, sport, dport, - tun_key->tun_flags, vni, opts_len, opts, - !!(tun_key->tun_flags & TUNNEL_CSUM), false); - if (err < 0) - ip_rt_put(rt); - return err; - -error: - kfree_skb(skb); - return err; -} - -static const char *geneve_get_name(const struct vport *vport) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - - return geneve_port->name; -} + struct vport *vport; -static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - struct net *net = ovs_dp_get_net(vport->dp); - __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; - __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + vport = geneve_tnl_create(parms); + if (IS_ERR(vport)) + return vport; - /* Get tp_src and tp_dst, refert to geneve_build_header(). 
- */ - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, sport, dport); + return ovs_netdev_link(vport, parms->name); } static struct vport_ops ovs_geneve_vport_ops = { .type = OVS_VPORT_TYPE_GENEVE, - .create = geneve_tnl_create, - .destroy = geneve_tnl_destroy, - .get_name = geneve_get_name, + .create = geneve_create, + .destroy = ovs_netdev_tunnel_destroy, .get_options = geneve_get_options, - .send = geneve_tnl_send, - .owner = THIS_MODULE, - .get_egress_tun_info = geneve_get_egress_tun_info, + .send = dev_queue_xmit, }; static int __init ovs_geneve_tnl_init(void) diff --git a/kernel/net/openvswitch/vport-gre.c b/kernel/net/openvswitch/vport-gre.c index f17ac9642..7f8897f33 100644 --- a/kernel/net/openvswitch/vport-gre.c +++ b/kernel/net/openvswitch/vport-gre.c @@ -45,254 +45,50 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" static struct vport_ops ovs_gre_vport_ops; -/* Returns the least-significant 32 bits of a __be64. 
*/ -static __be32 be64_get_low32(__be64 x) +static struct vport *gre_tnl_create(const struct vport_parms *parms) { -#ifdef __BIG_ENDIAN - return (__force __be32)x; -#else - return (__force __be32)((__force u64)x >> 32); -#endif -} - -static __be16 filter_tnl_flags(__be16 flags) -{ - return flags & (TUNNEL_CSUM | TUNNEL_KEY); -} - -static struct sk_buff *__build_header(struct sk_buff *skb, - int tunnel_hlen) -{ - struct tnl_ptk_info tpi; - const struct ovs_key_ipv4_tunnel *tun_key; - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - - skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); - if (IS_ERR(skb)) - return skb; - - tpi.flags = filter_tnl_flags(tun_key->tun_flags); - tpi.proto = htons(ETH_P_TEB); - tpi.key = be64_get_low32(tun_key->tun_id); - tpi.seq = 0; - gre_build_header(skb, &tpi, tunnel_hlen); - - return skb; -} - -static __be64 key_to_tunnel_id(__be32 key, __be32 seq) -{ -#ifdef __BIG_ENDIAN - return (__force __be64)((__force u64)seq << 32 | (__force u32)key); -#else - return (__force __be64)((__force u64)key << 32 | (__force u32)seq); -#endif -} - -/* Called with rcu_read_lock and BH disabled. */ -static int gre_rcv(struct sk_buff *skb, - const struct tnl_ptk_info *tpi) -{ - struct ovs_tunnel_info tun_info; - struct ovs_net *ovs_net; - struct vport *vport; - __be64 key; - - ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); - vport = rcu_dereference(ovs_net->vport_net.gre_vport); - if (unlikely(!vport)) - return PACKET_REJECT; - - key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, - filter_tnl_flags(tpi->flags), NULL, 0); - - ovs_vport_receive(vport, skb, &tun_info); - return PACKET_RCVD; -} - -/* Called with rcu_read_lock and BH disabled. 
*/ -static int gre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) -{ - struct ovs_net *ovs_net; + struct net *net = ovs_dp_get_net(parms->dp); + struct net_device *dev; struct vport *vport; - ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); - vport = rcu_dereference(ovs_net->vport_net.gre_vport); - - if (unlikely(!vport)) - return PACKET_REJECT; - else - return PACKET_RCVD; -} - -static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - const struct ovs_key_ipv4_tunnel *tun_key; - struct flowi4 fl; - struct rtable *rt; - int min_headroom; - int tunnel_hlen; - __be16 df; - int err; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto err_free_skb; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto err_free_skb; - } - - tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + tunnel_hlen + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) { - err = -ENOMEM; - goto err_free_rt; - } - - /* Push Tunnel header. 
*/ - skb = __build_header(skb, tunnel_hlen); - if (IS_ERR(skb)) { - err = PTR_ERR(skb); - skb = NULL; - goto err_free_rt; + vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + rtnl_lock(); + dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); } - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->ignore_df = 1; - - return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, - tun_key->ipv4_dst, IPPROTO_GRE, - tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false); -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - return err; -} - -static struct gre_cisco_protocol gre_protocol = { - .handler = gre_rcv, - .err_handler = gre_err, - .priority = 1, -}; - -static int gre_ports; -static int gre_init(void) -{ - int err; - - gre_ports++; - if (gre_ports > 1) - return 0; - - err = gre_cisco_register(&gre_protocol); - if (err) - pr_warn("cannot register gre protocol handler\n"); - - return err; -} - -static void gre_exit(void) -{ - gre_ports--; - if (gre_ports > 0) - return; - - gre_cisco_unregister(&gre_protocol); -} + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); -static const char *gre_get_name(const struct vport *vport) -{ - return vport_priv(vport); + return vport; } static struct vport *gre_create(const struct vport_parms *parms) { - struct net *net = ovs_dp_get_net(parms->dp); - struct ovs_net *ovs_net; struct vport *vport; - int err; - err = gre_init(); - if (err) - return ERR_PTR(err); - - ovs_net = net_generic(net, ovs_net_id); - if (ovsl_dereference(ovs_net->vport_net.gre_vport)) { - vport = ERR_PTR(-EEXIST); - goto error; - } - - vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); + vport = gre_tnl_create(parms); if (IS_ERR(vport)) - goto error; - - strncpy(vport_priv(vport), parms->name, IFNAMSIZ); - rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport); - return vport; + 
return vport; -error: - gre_exit(); - return vport; -} - -static void gre_tnl_destroy(struct vport *vport) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct ovs_net *ovs_net; - - ovs_net = net_generic(net, ovs_net_id); - - RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL); - ovs_vport_deferred_free(vport); - gre_exit(); -} - -static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_GRE, skb->mark, 0, 0); + return ovs_netdev_link(vport, parms->name); } static struct vport_ops ovs_gre_vport_ops = { .type = OVS_VPORT_TYPE_GRE, .create = gre_create, - .destroy = gre_tnl_destroy, - .get_name = gre_get_name, - .send = gre_tnl_send, - .get_egress_tun_info = gre_get_egress_tun_info, - .owner = THIS_MODULE, + .send = dev_queue_xmit, + .destroy = ovs_netdev_tunnel_destroy, }; static int __init ovs_gre_tnl_init(void) diff --git a/kernel/net/openvswitch/vport-internal_dev.c b/kernel/net/openvswitch/vport-internal_dev.c index 6a55f7105..ec76398a7 100644 --- a/kernel/net/openvswitch/vport-internal_dev.c +++ b/kernel/net/openvswitch/vport-internal_dev.c @@ -43,35 +43,26 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev) return netdev_priv(netdev); } -/* This function is only called by the kernel network layer.*/ -static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, - struct rtnl_link_stats64 *stats) -{ - struct vport *vport = ovs_internal_dev_get_vport(netdev); - struct ovs_vport_stats vport_stats; - - ovs_vport_get_stats(vport, &vport_stats); - - /* The tx and rx stats need to be swapped because the - * switch and host OS have opposite perspectives. 
*/ - stats->rx_packets = vport_stats.tx_packets; - stats->tx_packets = vport_stats.rx_packets; - stats->rx_bytes = vport_stats.tx_bytes; - stats->tx_bytes = vport_stats.rx_bytes; - stats->rx_errors = vport_stats.tx_errors; - stats->tx_errors = vport_stats.rx_errors; - stats->rx_dropped = vport_stats.tx_dropped; - stats->tx_dropped = vport_stats.rx_dropped; - - return stats; -} - /* Called with rcu_read_lock_bh. */ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { + int len, err; + + len = skb->len; rcu_read_lock(); - ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); + err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); + + if (likely(!err)) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + netdev->stats.tx_errors++; + } return 0; } @@ -115,13 +106,45 @@ static void internal_dev_destructor(struct net_device *dev) free_netdev(dev); } +static struct rtnl_link_stats64 * +internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + stats->rx_errors = dev->stats.rx_errors; + stats->tx_errors = dev->stats.tx_errors; + stats->tx_dropped = dev->stats.tx_dropped; + stats->rx_dropped = dev->stats.rx_dropped; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *percpu_stats; + struct pcpu_sw_netstats local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + + stats->rx_bytes += local_stats.rx_bytes; + stats->rx_packets += local_stats.rx_packets; + stats->tx_bytes += local_stats.tx_bytes; + stats->tx_packets += local_stats.tx_packets; + } + + return stats; +} + static 
const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, - .ndo_get_stats64 = internal_dev_get_stats, + .ndo_get_stats64 = internal_get_stats, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -135,7 +158,7 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -156,49 +179,51 @@ static void do_setup(struct net_device *netdev) static struct vport *internal_dev_create(const struct vport_parms *parms) { struct vport *vport; - struct netdev_vport *netdev_vport; struct internal_dev *internal_dev; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_internal_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); goto error; } - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_UNKNOWN, - do_setup); - if (!netdev_vport->dev) { + vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, NET_NAME_UNKNOWN, do_setup); + if (!vport->dev) { err = -ENOMEM; goto error_free_vport; } + vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!vport->dev->tstats) { + err = -ENOMEM; + goto error_free_netdev; + } - dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); - internal_dev = internal_dev_priv(netdev_vport->dev); + dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = 
vport; /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) - netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + vport->dev->features |= NETIF_F_NETNS_LOCAL; rtnl_lock(); - err = register_netdevice(netdev_vport->dev); + err = register_netdevice(vport->dev); if (err) - goto error_free_netdev; + goto error_unlock; - dev_set_promiscuity(netdev_vport->dev, 1); + dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); - netif_start_queue(netdev_vport->dev); + netif_start_queue(vport->dev); return vport; -error_free_netdev: +error_unlock: rtnl_unlock(); - free_netdev(netdev_vport->dev); + free_percpu(vport->dev->tstats); +error_free_netdev: + free_netdev(vport->dev); error_free_vport: ovs_vport_free(vport); error: @@ -207,49 +232,49 @@ error: static void internal_dev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - - netif_stop_queue(netdev_vport->dev); + netif_stop_queue(vport->dev); rtnl_lock(); - dev_set_promiscuity(netdev_vport->dev, -1); + dev_set_promiscuity(vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. 
*/ - unregister_netdevice(netdev_vport->dev); - + unregister_netdevice(vport->dev); + free_percpu(vport->dev->tstats); rtnl_unlock(); } -static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +static netdev_tx_t internal_dev_recv(struct sk_buff *skb) { - struct net_device *netdev = netdev_vport_priv(vport)->dev; - int len; + struct net_device *netdev = skb->dev; + struct pcpu_sw_netstats *stats; if (unlikely(!(netdev->flags & IFF_UP))) { kfree_skb(skb); - return 0; + netdev->stats.rx_dropped++; + return NETDEV_TX_OK; } - len = skb->len; - skb_dst_drop(skb); nf_reset(skb); secpath_reset(skb); - skb->dev = netdev; skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, netdev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - netif_rx(skb); + stats = this_cpu_ptr(netdev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); - return len; + netif_rx(skb); + return NETDEV_TX_OK; } static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, - .get_name = ovs_netdev_get_name, .send = internal_dev_recv, }; diff --git a/kernel/net/openvswitch/vport-netdev.c b/kernel/net/openvswitch/vport-netdev.c index 33e6d6e29..6b0190b98 100644 --- a/kernel/net/openvswitch/vport-netdev.c +++ b/kernel/net/openvswitch/vport-netdev.c @@ -26,18 +26,24 @@ #include #include #include +#include -#include +#include +#include #include "datapath.h" +#include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static struct vport_ops ovs_netdev_vport_ops; /* Must be called with rcu_read_lock. 
*/ -static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +static void netdev_port_receive(struct sk_buff *skb) { + struct vport *vport; + + vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; @@ -53,10 +59,8 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - - ovs_vport_receive(vport, skb, NULL); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; - error: kfree_skb(skb); } @@ -65,15 +69,11 @@ error: static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; - struct vport *vport; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; - vport = ovs_netdev_get_vport(skb->dev); - - netdev_port_receive(vport, skb); - + netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } @@ -83,139 +83,116 @@ static struct net_device *get_dpdev(const struct datapath *dp) local = ovs_vport_ovsl(dp, OVSP_LOCAL); BUG_ON(!local); - return netdev_vport_priv(local)->dev; + return local->dev; } -static struct vport *netdev_create(const struct vport_parms *parms) +struct vport *ovs_netdev_link(struct vport *vport, const char *name) { - struct vport *vport; - struct netdev_vport *netdev_vport; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_netdev_vport_ops, parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto error; - } - - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); - if (!netdev_vport->dev) { + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); + if (!vport->dev) { err = -ENODEV; goto error_free_vport; } - if (netdev_vport->dev->flags & IFF_LOOPBACK || - netdev_vport->dev->type != ARPHRD_ETHER || - ovs_is_internal_dev(netdev_vport->dev)) { + if (vport->dev->flags & IFF_LOOPBACK || + vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(vport->dev)) { err 
= -EINVAL; goto error_put; } rtnl_lock(); - err = netdev_master_upper_dev_link(netdev_vport->dev, + err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp)); if (err) goto error_unlock; - err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, + err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; - dev_disable_lro(netdev_vport->dev); - dev_set_promiscuity(netdev_vport->dev, 1); - netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + dev_disable_lro(vport->dev); + dev_set_promiscuity(vport->dev, 1); + vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: - netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); + netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: - dev_put(netdev_vport->dev); + dev_put(vport->dev); error_free_vport: ovs_vport_free(vport); -error: return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(ovs_netdev_link); -static void free_port_rcu(struct rcu_head *rcu) +static struct vport *netdev_create(const struct vport_parms *parms) { - struct netdev_vport *netdev_vport = container_of(rcu, - struct netdev_vport, rcu); + struct vport *vport; + + vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; - dev_put(netdev_vport->dev); - ovs_vport_free(vport_from_priv(netdev_vport)); + return ovs_netdev_link(vport, parms->name); } -void ovs_netdev_detach_dev(struct vport *vport) +static void vport_netdev_free(struct rcu_head *rcu) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + struct vport *vport = container_of(rcu, struct vport, rcu); + if (vport->dev) + dev_put(vport->dev); + ovs_vport_free(vport); +} + +void ovs_netdev_detach_dev(struct vport *vport) +{ ASSERT_RTNL(); - netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; - netdev_rx_handler_unregister(netdev_vport->dev); - 
netdev_upper_dev_unlink(netdev_vport->dev, - netdev_master_upper_dev_get(netdev_vport->dev)); - dev_set_promiscuity(netdev_vport->dev, -1); + vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(vport->dev); + netdev_upper_dev_unlink(vport->dev, + netdev_master_upper_dev_get(vport->dev)); + dev_set_promiscuity(vport->dev, -1); } +EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev); static void netdev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - rtnl_lock(); - if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH) + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) ovs_netdev_detach_dev(vport); rtnl_unlock(); - call_rcu(&netdev_vport->rcu, free_port_rcu); -} - -const char *ovs_netdev_get_name(const struct vport *vport) -{ - const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - return netdev_vport->dev->name; -} - -static unsigned int packet_length(const struct sk_buff *skb) -{ - unsigned int length = skb->len - ETH_HLEN; - - if (skb->protocol == htons(ETH_P_8021Q)) - length -= VLAN_HLEN; - - return length; + call_rcu(&vport->rcu, vport_netdev_free); } -static int netdev_send(struct vport *vport, struct sk_buff *skb) +void ovs_netdev_tunnel_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - int mtu = netdev_vport->dev->mtu; - int len; - - if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { - net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - netdev_vport->dev->name, - packet_length(skb), mtu); - goto drop; - } - - skb->dev = netdev_vport->dev; - len = skb->len; - dev_queue_xmit(skb); + rtnl_lock(); + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); - return len; + /* We can be invoked by both explicit vport deletion and + * underlying netdev deregistration; delete the link only + * if it's not already shutting down. 
+ */ + if (vport->dev->reg_state == NETREG_REGISTERED) + rtnl_delete_link(vport->dev); + dev_put(vport->dev); + vport->dev = NULL; + rtnl_unlock(); -drop: - kfree_skb(skb); - return 0; + call_rcu(&vport->rcu, vport_netdev_free); } +EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) @@ -231,8 +208,7 @@ static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, - .get_name = ovs_netdev_get_name, - .send = netdev_send, + .send = dev_queue_xmit, }; int __init ovs_netdev_init(void) diff --git a/kernel/net/openvswitch/vport-netdev.h b/kernel/net/openvswitch/vport-netdev.h index 6f7038e79..19e29c12a 100644 --- a/kernel/net/openvswitch/vport-netdev.h +++ b/kernel/net/openvswitch/vport-netdev.h @@ -26,22 +26,11 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct netdev_vport { - struct rcu_head rcu; - - struct net_device *dev; -}; - -static inline struct netdev_vport * -netdev_vport_priv(const struct vport *vport) -{ - return vport_priv(vport); -} - -const char *ovs_netdev_get_name(const struct vport *); +struct vport *ovs_netdev_link(struct vport *vport, const char *name); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); void ovs_netdev_exit(void); +void ovs_netdev_tunnel_destroy(struct vport *vport); #endif /* vport_netdev.h */ diff --git a/kernel/net/openvswitch/vport-vxlan.c b/kernel/net/openvswitch/vport-vxlan.c index 6d39766e7..d933cb89e 100644 --- a/kernel/net/openvswitch/vport-vxlan.c +++ b/kernel/net/openvswitch/vport-vxlan.c @@ -17,94 +17,37 @@ * 02110-1301, USA */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include +#include +#include +#include #include - -#include -#include #include #include #include -#include -#include -#include -#include -#include #include #include "datapath.h" 
#include "vport.h" -#include "vport-vxlan.h" - -/** - * struct vxlan_port - Keeps track of open UDP ports - * @vs: vxlan_sock created for the port. - * @name: vport name. - */ -struct vxlan_port { - struct vxlan_sock *vs; - char name[IFNAMSIZ]; - u32 exts; /* VXLAN_F_* in */ -}; - -static struct vport_ops ovs_vxlan_vport_ops; - -static inline struct vxlan_port *vxlan_vport(const struct vport *vport) -{ - return vport_priv(vport); -} +#include "vport-netdev.h" -/* Called with rcu_read_lock and BH disabled. */ -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md) -{ - struct ovs_tunnel_info tun_info; - struct vxlan_port *vxlan_port; - struct vport *vport = vs->data; - struct iphdr *iph; - struct ovs_vxlan_opts opts = { - .gbp = md->gbp, - }; - __be64 key; - __be16 flags; - - flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0); - vxlan_port = vxlan_vport(vport); - if (vxlan_port->exts & VXLAN_F_GBP && md->gbp) - flags |= TUNNEL_VXLAN_OPT; - - /* Save outer tunnel values */ - iph = ip_hdr(skb); - key = cpu_to_be64(ntohl(md->vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, &opts, sizeof(opts)); - - ovs_vport_receive(vport, skb, &tun_info); -} +static struct vport_ops ovs_vxlan_netdev_vport_ops; static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) { - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + __be16 dst_port = vxlan->cfg.dst_port; if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) return -EMSGSIZE; - if (vxlan_port->exts) { + if (vxlan->flags & VXLAN_F_GBP) { struct nlattr *exts; exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); if (!exts) return -EMSGSIZE; - if (vxlan_port->exts & VXLAN_F_GBP && + if (vxlan->flags & VXLAN_F_GBP && nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) return 
-EMSGSIZE; @@ -114,23 +57,14 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) return 0; } -static void vxlan_tnl_destroy(struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - - vxlan_sock_release(vxlan_port->vs); - - ovs_vport_deferred_free(vport); -} - -static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = { +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, }; -static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, + struct vxlan_config *conf) { - struct nlattr *exts[OVS_VXLAN_EXT_MAX+1]; - struct vxlan_port *vxlan_port; + struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; int err; if (nla_len(attr) < sizeof(struct nlattr)) @@ -140,10 +74,8 @@ static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) if (err < 0) return err; - vxlan_port = vxlan_vport(vport); - if (exts[OVS_VXLAN_EXT_GBP]) - vxlan_port->exts |= VXLAN_F_GBP; + conf->flags |= VXLAN_F_GBP; return 0; } @@ -152,166 +84,84 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) { struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; - struct vxlan_port *vxlan_port; - struct vxlan_sock *vs; + struct net_device *dev; struct vport *vport; struct nlattr *a; - u16 dst_port; int err; + struct vxlan_config conf = { + .no_share = true, + .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, + }; if (!options) { err = -EINVAL; goto error; } + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); if (a && nla_len(a) == sizeof(u16)) { - dst_port = nla_get_u16(a); + conf.dst_port = htons(nla_get_u16(a)); } else { /* Require destination port from userspace. 
*/ err = -EINVAL; goto error; } - vport = ovs_vport_alloc(sizeof(struct vxlan_port), - &ovs_vxlan_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; - vxlan_port = vxlan_vport(vport); - strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); if (a) { - err = vxlan_configure_exts(vport, a); + err = vxlan_configure_exts(vport, a, &conf); if (err) { ovs_vport_free(vport); goto error; } } - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, - vxlan_port->exts); - if (IS_ERR(vs)) { + rtnl_lock(); + dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); + if (IS_ERR(dev)) { + rtnl_unlock(); ovs_vport_free(vport); - return (void *)vs; + return ERR_CAST(dev); } - vxlan_port->vs = vs; + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; - error: return ERR_PTR(err); } -static int vxlan_ext_gbp(struct sk_buff *skb) +static struct vport *vxlan_create(const struct vport_parms *parms) { - const struct ovs_tunnel_info *tun_info; - const struct ovs_vxlan_opts *opts; - - tun_info = OVS_CB(skb)->egress_tun_info; - opts = tun_info->options; - - if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT && - tun_info->options_len >= sizeof(*opts)) - return opts->gbp; - else - return 0; -} - -static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - struct sock *sk = vxlan_port->vs->sock->sk; - __be16 dst_port = inet_sk(sk)->inet_sport; - const struct ovs_key_ipv4_tunnel *tun_key; - struct vxlan_metadata md = {0}; - struct rtable *rt; - struct flowi4 fl; - __be16 src_port; - __be16 df; - int err; - u32 vxflags; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, 
IPPROTO_UDP); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->ignore_df = 1; - - src_port = udp_flow_src_port(net, skb, 0, 0, true); - md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8); - md.gbp = vxlan_ext_gbp(skb); - vxflags = vxlan_port->exts | - (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0); - - err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, tun_key->ipv4_ttl, df, - src_port, dst_port, - &md, false, vxflags); - if (err < 0) - ip_rt_put(rt); - return err; -error: - kfree_skb(skb); - return err; -} - -static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; - __be16 src_port; - int port_min; - int port_max; - - inet_get_local_port_range(net, &port_min, &port_max); - src_port = udp_flow_src_port(net, skb, 0, 0, true); + struct vport *vport; - return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, - src_port, dst_port); -} + vport = vxlan_tnl_create(parms); + if (IS_ERR(vport)) + return vport; -static const char *vxlan_get_name(const struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - return vxlan_port->name; + return ovs_netdev_link(vport, parms->name); } -static struct vport_ops ovs_vxlan_vport_ops = { - .type = OVS_VPORT_TYPE_VXLAN, - .create = vxlan_tnl_create, - .destroy = vxlan_tnl_destroy, - .get_name = vxlan_get_name, - .get_options = vxlan_get_options, - .send = vxlan_tnl_send, - .get_egress_tun_info = vxlan_get_egress_tun_info, - .owner = THIS_MODULE, +static struct vport_ops ovs_vxlan_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_create, + .destroy = 
ovs_netdev_tunnel_destroy, + .get_options = vxlan_get_options, + .send = dev_queue_xmit, }; static int __init ovs_vxlan_tnl_init(void) { - return ovs_vport_ops_register(&ovs_vxlan_vport_ops); + return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); } static void __exit ovs_vxlan_tnl_exit(void) { - ovs_vport_ops_unregister(&ovs_vxlan_vport_ops); + ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); } module_init(ovs_vxlan_tnl_init); diff --git a/kernel/net/openvswitch/vport-vxlan.h b/kernel/net/openvswitch/vport-vxlan.h deleted file mode 100644 index 4b08233e7..000000000 --- a/kernel/net/openvswitch/vport-vxlan.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef VPORT_VXLAN_H -#define VPORT_VXLAN_H 1 - -#include -#include - -struct ovs_vxlan_opts { - __u32 gbp; -}; - -#endif diff --git a/kernel/net/openvswitch/vport.c b/kernel/net/openvswitch/vport.c index 067a3fff1..31cbc8c5c 100644 --- a/kernel/net/openvswitch/vport.c +++ b/kernel/net/openvswitch/vport.c @@ -34,9 +34,6 @@ #include "vport.h" #include "vport-internal_dev.h" -static void ovs_vport_record_error(struct vport *, - enum vport_err_type err_type); - static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. 
*/ @@ -74,7 +71,7 @@ static struct hlist_head *hash_bucket(const struct net *net, const char *name) return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } -int ovs_vport_ops_register(struct vport_ops *ops) +int __ovs_vport_ops_register(struct vport_ops *ops) { int err = -EEXIST; struct vport_ops *o; @@ -90,7 +87,7 @@ errout: ovs_unlock(); return err; } -EXPORT_SYMBOL_GPL(ovs_vport_ops_register); +EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); void ovs_vport_ops_unregister(struct vport_ops *ops) { @@ -113,7 +110,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node) - if (!strcmp(name, vport->ops->get_name(vport)) && + if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -157,12 +154,6 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, return ERR_PTR(-EINVAL); } - vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!vport->percpu_stats) { - kfree(vport); - return ERR_PTR(-ENOMEM); - } - return vport; } EXPORT_SYMBOL_GPL(ovs_vport_alloc); @@ -183,7 +174,6 @@ void ovs_vport_free(struct vport *vport) * it is safe to use raw dereference. */ kfree(rcu_dereference_raw(vport->upcall_portids)); - free_percpu(vport->percpu_stats); kfree(vport); } EXPORT_SYMBOL_GPL(ovs_vport_free); @@ -226,7 +216,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms) } bucket = hash_bucket(ovs_dp_get_net(vport->dp), - vport->ops->get_name(vport)); + ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } @@ -266,8 +256,8 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) * * @vport: vport to delete. * - * Detaches @vport from its datapath and destroys it. It is possible to fail - * for reasons such as lack of memory. ovs_mutex must be held. + * Detaches @vport from its datapath and destroys it. ovs_mutex must + * be held. 
*/ void ovs_vport_del(struct vport *vport) { @@ -290,41 +280,19 @@ void ovs_vport_del(struct vport *vport) */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { - int i; - - memset(stats, 0, sizeof(*stats)); - - /* We potentially have 2 sources of stats that need to be combined: - * those we have collected (split into err_stats and percpu_stats) from - * set_stats() and device error stats from netdev->get_stats() (for - * errors that happen downstream and therefore aren't reported through - * our vport_record_error() function). - * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). - * netdev-stats can be directly read over netlink-ioctl. - */ - - stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors); - stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors); - stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped); - stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped); - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *percpu_stats; - struct pcpu_sw_netstats local_stats; - unsigned int start; - - percpu_stats = per_cpu_ptr(vport->percpu_stats, i); - - do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); - local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); - - stats->rx_bytes += local_stats.rx_bytes; - stats->rx_packets += local_stats.rx_packets; - stats->tx_bytes += local_stats.tx_bytes; - stats->tx_packets += local_stats.tx_packets; - } + const struct rtnl_link_stats64 *dev_stats; + struct rtnl_link_stats64 temp; + + dev_stats = dev_get_stats(vport->dev, &temp); + stats->rx_errors = dev_stats->rx_errors; + stats->tx_errors = dev_stats->tx_errors; + stats->tx_dropped = dev_stats->tx_dropped; + stats->rx_dropped = dev_stats->rx_dropped; + + stats->rx_bytes = dev_stats->rx_bytes; + stats->rx_packets = dev_stats->rx_packets; + stats->tx_bytes = dev_stats->tx_bytes; + stats->tx_packets = 
dev_stats->tx_packets; } /** @@ -468,94 +436,34 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ -void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ovs_tunnel_info *tun_info) +int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + const struct ip_tunnel_info *tun_info) { - struct pcpu_sw_netstats *stats; struct sw_flow_key key; int error; - stats = this_cpu_ptr(vport->percpu_stats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len + - (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - u64_stats_update_end(&stats->syncp); - OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->egress_tun_info = NULL; + OVS_CB(skb)->mru = 0; + if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { + u32 mark; + + mark = skb->mark; + skb_scrub_packet(skb, true); + skb->mark = mark; + tun_info = NULL; + } + /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); - return; + return error; } ovs_dp_process_packet(skb, &key); + return 0; } EXPORT_SYMBOL_GPL(ovs_vport_receive); -/** - * ovs_vport_send - send a packet on a device - * - * @vport: vport on which to send the packet - * @skb: skb to send - * - * Sends the given packet and returns the length of data sent. Either ovs - * lock or rcu_read_lock must be held. 
- */ -int ovs_vport_send(struct vport *vport, struct sk_buff *skb) -{ - int sent = vport->ops->send(vport, skb); - - if (likely(sent > 0)) { - struct pcpu_sw_netstats *stats; - - stats = this_cpu_ptr(vport->percpu_stats); - - u64_stats_update_begin(&stats->syncp); - stats->tx_packets++; - stats->tx_bytes += sent; - u64_stats_update_end(&stats->syncp); - } else if (sent < 0) { - ovs_vport_record_error(vport, VPORT_E_TX_ERROR); - } else { - ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); - } - return sent; -} - -/** - * ovs_vport_record_error - indicate device error to generic stats layer - * - * @vport: vport that encountered the error - * @err_type: one of enum vport_err_type types to indicate the error type - * - * If using the vport generic stats layer indicate that an error of the given - * type has occurred. - */ -static void ovs_vport_record_error(struct vport *vport, - enum vport_err_type err_type) -{ - switch (err_type) { - case VPORT_E_RX_DROPPED: - atomic_long_inc(&vport->err_stats.rx_dropped); - break; - - case VPORT_E_RX_ERROR: - atomic_long_inc(&vport->err_stats.rx_errors); - break; - - case VPORT_E_TX_DROPPED: - atomic_long_inc(&vport->err_stats.tx_dropped); - break; - - case VPORT_E_TX_ERROR: - atomic_long_inc(&vport->err_stats.tx_errors); - break; - } - -} - static void free_vport_rcu(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); @@ -572,56 +480,32 @@ void ovs_vport_deferred_free(struct vport *vport) } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, - struct net *net, - const struct ovs_tunnel_info *tun_info, - u8 ipproto, - u32 skb_mark, - __be16 tp_src, - __be16 tp_dst) +static unsigned int packet_length(const struct sk_buff *skb) { - const struct ovs_key_ipv4_tunnel *tun_key; - struct rtable *rt; - struct flowi4 fl; - - if (unlikely(!tun_info)) - return -EINVAL; - - tun_key = &tun_info->tunnel; - - /* Route lookup to get srouce IP address. 
- * The process may need to be changed if the corresponding process - * in vports ops changed. - */ - rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto); - if (IS_ERR(rt)) - return PTR_ERR(rt); + unsigned int length = skb->len - ETH_HLEN; - ip_rt_put(rt); + if (skb->protocol == htons(ETH_P_8021Q)) + length -= VLAN_HLEN; - /* Generate egress_tun_info based on tun_info, - * saddr, tp_src and tp_dst - */ - __ovs_flow_tun_info_init(egress_tun_info, - fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags, - tun_info->options, - tun_info->options_len); - - return 0; + return length; } -EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); -int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info) +void ovs_vport_send(struct vport *vport, struct sk_buff *skb) { - /* get_egress_tun_info() is only implemented on tunnel ports. */ - if (unlikely(!vport->ops->get_egress_tun_info)) - return -EINVAL; + int mtu = vport->dev->mtu; + + if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { + net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", + vport->dev->name, + packet_length(skb), mtu); + vport->dev->stats.tx_errors++; + goto drop; + } + + skb->dev = vport->dev; + vport->ops->send(skb); + return; - return vport->ops->get_egress_tun_info(vport, skb, info); +drop: + kfree_skb(skb); } diff --git a/kernel/net/openvswitch/vport.h b/kernel/net/openvswitch/vport.h index bc85331a6..8ea3a9698 100644 --- a/kernel/net/openvswitch/vport.h +++ b/kernel/net/openvswitch/vport.h @@ -35,10 +35,6 @@ struct vport_parms; /* The following definitions are for users of the vport subsytem: */ -struct vport_net { - struct vport __rcu *gre_vport; -}; - int ovs_vport_init(void); void ovs_vport_exit(void); @@ -56,26 +52,6 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); int ovs_vport_get_upcall_portids(const struct vport *, 
struct sk_buff *); u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); -int ovs_vport_send(struct vport *, struct sk_buff *); - -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, - struct net *net, - const struct ovs_tunnel_info *tun_info, - u8 ipproto, - u32 skb_mark, - __be16 tp_src, - __be16 tp_dst); -int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info); - -/* The following definitions are for implementers of vport devices: */ - -struct vport_err_stats { - atomic_long_t rx_dropped; - atomic_long_t rx_errors; - atomic_long_t tx_dropped; - atomic_long_t tx_errors; -}; /** * struct vport_portids - array of netlink portids of a vport. * must be protected by rcu. @@ -101,12 +77,10 @@ struct vport_portids { * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. - * @percpu_stats: Points to per-CPU statistics used and maintained by vport - * @err_stats: Points to error statistics used and maintained by vport * @detach_list: list used for detaching vport in net-exit call. */ struct vport { - struct rcu_head rcu; + struct net_device *dev; struct datapath *dp; struct vport_portids __rcu *upcall_portids; u16 port_no; @@ -115,10 +89,8 @@ struct vport { struct hlist_node dp_hash_node; const struct vport_ops *ops; - struct pcpu_sw_netstats __percpu *percpu_stats; - - struct vport_err_stats err_stats; struct list_head detach_list; + struct rcu_head rcu; }; /** @@ -155,11 +127,8 @@ struct vport_parms { * @get_options: Appends vport-specific attributes for the configuration of an * existing vport to a &struct sk_buff. May be %NULL for a vport that does not * have any configuration. - * @get_name: Get the device's name. - * @send: Send a packet on the device. Returns the length of the packet sent, + * @send: Send a packet on the device. 
* zero for dropped packets or negative for error. - * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for - * a packet. */ struct vport_ops { enum ovs_vport_type type; @@ -171,24 +140,11 @@ struct vport_ops { int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - /* Called with rcu_read_lock or ovs_mutex. */ - const char *(*get_name)(const struct vport *); - - int (*send)(struct vport *, struct sk_buff *); - int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct ovs_tunnel_info *); - + netdev_tx_t (*send) (struct sk_buff *skb); struct module *owner; struct list_head list; }; -enum vport_err_type { - VPORT_E_RX_DROPPED, - VPORT_E_RX_ERROR, - VPORT_E_TX_DROPPED, - VPORT_E_TX_ERROR, -}; - struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); @@ -225,8 +181,8 @@ static inline struct vport *vport_from_priv(void *priv) return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } -void ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ovs_tunnel_info *); +int ovs_vport_receive(struct vport *, struct sk_buff *, + const struct ip_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -235,11 +191,22 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); } -int ovs_vport_ops_register(struct vport_ops *ops); +static inline const char *ovs_vport_name(struct vport *vport) +{ + return vport->dev->name; +} + +int __ovs_vport_ops_register(struct vport_ops *ops); +#define ovs_vport_ops_register(ops) \ + ({ \ + (ops)->owner = THIS_MODULE; \ + __ovs_vport_ops_register(ops); \ + }) + void ovs_vport_ops_unregister(struct vport_ops *ops); static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, - const struct ovs_key_ipv4_tunnel 
*key, + const struct ip_tunnel_key *key, u32 mark, struct flowi4 *fl, u8 protocol) @@ -247,13 +214,16 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, struct rtable *rt; memset(fl, 0, sizeof(*fl)); - fl->daddr = key->ipv4_dst; - fl->saddr = key->ipv4_src; - fl->flowi4_tos = RT_TOS(key->ipv4_tos); + fl->daddr = key->u.ipv4.dst; + fl->saddr = key->u.ipv4.src; + fl->flowi4_tos = RT_TOS(key->tos); fl->flowi4_mark = mark; fl->flowi4_proto = protocol; rt = ip_route_output_key(net, fl); return rt; } + +void ovs_vport_send(struct vport *vport, struct sk_buff *skb); + #endif /* vport.h */ diff --git a/kernel/net/packet/af_packet.c b/kernel/net/packet/af_packet.c index f9f259247..2c23bae0e 100644 --- a/kernel/net/packet/af_packet.c +++ b/kernel/net/packet/af_packet.c @@ -93,6 +93,7 @@ #ifdef CONFIG_INET #include #endif +#include #include "internal.h" @@ -230,6 +231,8 @@ struct packet_skb_cb { } sa; }; +#define vio_le() virtio_legacy_is_little_endian() + #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) @@ -519,13 +522,11 @@ static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, - int tx_ring, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; - pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : - GET_PBDQC_FROM_RB(&po->rx_ring); + pkc = GET_PBDQC_FROM_RB(&po->rx_ring); spin_lock_bh(&rb_queue->lock); pkc->delete_blk_timer = 1; @@ -544,15 +545,11 @@ static void prb_init_blk_timer(struct packet_sock *po, pkc->retire_blk_timer.expires = jiffies; } -static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) +static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; - if (tx_ring) - BUG(); - - pkc = tx_ring ? 
GET_PBDQC_FROM_RB(&po->tx_ring) : - GET_PBDQC_FROM_RB(&po->rx_ring); + pkc = GET_PBDQC_FROM_RB(&po->rx_ring); prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); } @@ -608,7 +605,7 @@ static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, - union tpacket_req_u *req_u, int tx_ring) + union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; @@ -635,7 +632,7 @@ static void init_prb_bdqc(struct packet_sock *po, p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); - prb_setup_retire_blk_timer(po, tx_ring); + prb_setup_retire_blk_timer(po); prb_open_block(p1, pbd); } @@ -1235,27 +1232,81 @@ static void packet_free_pending(struct packet_sock *po) free_percpu(po->tx_ring.pending_refcnt); } -static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +#define ROOM_POW_OFF 2 +#define ROOM_NONE 0x0 +#define ROOM_LOW 0x1 +#define ROOM_NORMAL 0x2 + +static bool __tpacket_has_room(struct packet_sock *po, int pow_off) +{ + int idx, len; + + len = po->rx_ring.frame_max + 1; + idx = po->rx_ring.head; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} + +static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) +{ + int idx, len; + + len = po->rx_ring.prb_bdqc.knum_blocks; + idx = po->rx_ring.prb_bdqc.kactive_blk_num; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} + +static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { struct sock *sk = &po->sk; - bool has_room; + int ret = ROOM_NONE; + + if (po->prot_hook.func != tpacket_rcv) { + int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - (skb ? 
skb->truesize : 0); + if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + return ROOM_NORMAL; + else if (avail > 0) + return ROOM_LOW; + else + return ROOM_NONE; + } - if (po->prot_hook.func != tpacket_rcv) - return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) - <= sk->sk_rcvbuf; + if (po->tp_version == TPACKET_V3) { + if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_v3_has_room(po, 0)) + ret = ROOM_LOW; + } else { + if (__tpacket_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_has_room(po, 0)) + ret = ROOM_LOW; + } - spin_lock(&sk->sk_receive_queue.lock); - if (po->tp_version == TPACKET_V3) - has_room = prb_lookup_block(po, &po->rx_ring, - po->rx_ring.prb_bdqc.kactive_blk_num, - TP_STATUS_KERNEL); - else - has_room = packet_lookup_frame(po, &po->rx_ring, - po->rx_ring.head, - TP_STATUS_KERNEL); - spin_unlock(&sk->sk_receive_queue.lock); + return ret; +} - return has_room; +static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +{ + int ret; + bool has_room; + + spin_lock_bh(&po->sk.sk_receive_queue.lock); + ret = __packet_rcv_has_room(po, skb); + has_room = ret == ROOM_NORMAL; + if (po->pressure == has_room) + po->pressure = !has_room; + spin_unlock_bh(&po->sk.sk_receive_queue.lock); + + return ret; } static void packet_sock_destruct(struct sock *sk) @@ -1273,6 +1324,20 @@ static void packet_sock_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } +static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) +{ + u32 rxhash; + int i, count = 0; + + rxhash = skb_get_hash(skb); + for (i = 0; i < ROLLOVER_HLEN; i++) + if (po->rollover->history[i] == rxhash) + count++; + + po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + return count > (ROLLOVER_HLEN >> 1); +} + static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) @@ -1305,22 +1370,40 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f, static 
unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, - unsigned int idx, unsigned int skip, + unsigned int idx, bool try_self, unsigned int num) { - unsigned int i, j; + struct packet_sock *po, *po_next, *po_skip = NULL; + unsigned int i, j, room = ROOM_NONE; - i = j = min_t(int, f->next[idx], num - 1); + po = pkt_sk(f->arr[idx]); + + if (try_self) { + room = packet_rcv_has_room(po, skb); + if (room == ROOM_NORMAL || + (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) + return idx; + po_skip = po; + } + + i = j = min_t(int, po->rollover->sock, num - 1); do { - if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { + po_next = pkt_sk(f->arr[i]); + if (po_next != po_skip && !po_next->pressure && + packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) - f->next[idx] = i; + po->rollover->sock = i; + atomic_long_inc(&po->rollover->num); + if (room == ROOM_LOW) + atomic_long_inc(&po->rollover->num_huge); return i; } + if (++i == num) i = 0; } while (i != j); + atomic_long_inc(&po->rollover->num_failed); return idx; } @@ -1331,6 +1414,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f, return skb_get_queue_mapping(skb) % num; } +static unsigned int fanout_demux_bpf(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) +{ + struct bpf_prog *prog; + unsigned int ret = 0; + + rcu_read_lock(); + prog = rcu_dereference(f->bpf_prog); + if (prog) + ret = bpf_prog_run_clear_cb(prog, skb) % num; + rcu_read_unlock(); + + return ret; +} + static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); @@ -1341,17 +1440,17 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); + struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; - if (!net_eq(dev_net(dev), read_pnet(&f->net)) || - !num) { + if (!net_eq(dev_net(dev), net) || !num) { 
kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { - skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); + skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } @@ -1373,17 +1472,18 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: - idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); + idx = fanout_demux_rollover(f, skb, 0, false, num); + break; + case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: + idx = fanout_demux_bpf(f, skb, num); break; } - po = pkt_sk(f->arr[idx]); - if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && - unlikely(!packet_rcv_has_room(po, skb))) { - idx = fanout_demux_rollover(f, skb, idx, idx, num); - po = pkt_sk(f->arr[idx]); - } + if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) + idx = fanout_demux_rollover(f, skb, idx, true, num); + po = pkt_sk(f->arr[idx]); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } @@ -1420,10 +1520,107 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { - if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout) - return true; + if (sk->sk_family != PF_PACKET) + return false; - return false; + return ptype->af_packet_priv == pkt_sk(sk)->fanout; +} + +static void fanout_init_data(struct packet_fanout *f) +{ + switch (f->type) { + case PACKET_FANOUT_LB: + atomic_set(&f->rr_cur, 0); + break; + case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: + RCU_INIT_POINTER(f->bpf_prog, NULL); + break; + } +} + +static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) +{ + struct bpf_prog *old; + + spin_lock(&f->lock); + old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); + rcu_assign_pointer(f->bpf_prog, new); + spin_unlock(&f->lock); + + if (old) { + synchronize_net(); + 
bpf_prog_destroy(old); + } +} + +static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, + unsigned int len) +{ + struct bpf_prog *new; + struct sock_fprog fprog; + int ret; + + if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) + return -EPERM; + if (len != sizeof(fprog)) + return -EINVAL; + if (copy_from_user(&fprog, data, len)) + return -EFAULT; + + ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); + if (ret) + return ret; + + __fanout_set_data_bpf(po->fanout, new); + return 0; +} + +static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, + unsigned int len) +{ + struct bpf_prog *new; + u32 fd; + + if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) + return -EPERM; + if (len != sizeof(fd)) + return -EINVAL; + if (copy_from_user(&fd, data, len)) + return -EFAULT; + + new = bpf_prog_get(fd); + if (IS_ERR(new)) + return PTR_ERR(new); + if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(new); + return -EINVAL; + } + + __fanout_set_data_bpf(po->fanout, new); + return 0; +} + +static int fanout_set_data(struct packet_sock *po, char __user *data, + unsigned int len) +{ + switch (po->fanout->type) { + case PACKET_FANOUT_CBPF: + return fanout_set_data_cbpf(po, data, len); + case PACKET_FANOUT_EBPF: + return fanout_set_data_ebpf(po, data, len); + default: + return -EINVAL; + }; +} + +static void fanout_release_data(struct packet_fanout *f) +{ + switch (f->type) { + case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: + __fanout_set_data_bpf(f, NULL); + }; } static int fanout_add(struct sock *sk, u16 id, u16 type_flags) @@ -1443,6 +1640,8 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: + case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: break; default: return -EINVAL; @@ -1454,6 +1653,16 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (po->fanout) return -EALREADY; + if (type == PACKET_FANOUT_ROLLOVER || + (type_flags 
& PACKET_FANOUT_FLAG_ROLLOVER)) { + po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); + if (!po->rollover) + return -ENOMEM; + atomic_long_set(&po->rollover->num, 0); + atomic_long_set(&po->rollover->num_huge, 0); + atomic_long_set(&po->rollover->num_failed, 0); + } + mutex_lock(&fanout_mutex); match = NULL; list_for_each_entry(f, &fanout_list, list) { @@ -1475,10 +1684,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->id = id; match->type = type; match->flags = flags; - atomic_set(&match->rr_cur, 0); INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); atomic_set(&match->sk_ref, 0); + fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; @@ -1502,6 +1711,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } out: mutex_unlock(&fanout_mutex); + if (err) { + kfree(po->rollover); + po->rollover = NULL; + } return err; } @@ -1520,9 +1733,27 @@ static void fanout_release(struct sock *sk) if (atomic_dec_and_test(&f->sk_ref)) { list_del(&f->list); dev_remove_pack(&f->prot_hook); + fanout_release_data(f); kfree(f); } mutex_unlock(&fanout_mutex); + + if (po->rollover) + kfree_rcu(po->rollover, rcu); +} + +static bool packet_extra_vlan_len_allowed(const struct net_device *dev, + struct sk_buff *skb) +{ + /* Earlier code assumed this would be a VLAN pkt, double-check + * this now that we have the actual packet in hand. We can only + * do this check on Ethernet devices. + */ + if (unlikely(dev->type != ARPHRD_ETHER)) + return false; + + skb_reset_mac_header(skb); + return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)); } static const struct proto_ops packet_ops; @@ -1686,18 +1917,10 @@ retry: goto retry; } - if (len > (dev->mtu + dev->hard_header_len + extra_len)) { - /* Earlier code assumed this would be a VLAN pkt, - * double-check this now that we have the actual - * packet in hand. 
- */ - struct ethhdr *ehdr; - skb_reset_mac_header(skb); - ehdr = eth_hdr(skb); - if (ehdr->h_proto != htons(ETH_P_8021Q)) { - err = -EMSGSIZE; - goto out_unlock; - } + if (len > (dev->mtu + dev->hard_header_len + extra_len) && + !packet_extra_vlan_len_allowed(dev, skb)) { + err = -EMSGSIZE; + goto out_unlock; } skb->protocol = proto; @@ -1723,16 +1946,16 @@ out_free: return err; } -static unsigned int run_filter(const struct sk_buff *skb, - const struct sock *sk, - unsigned int res) +static unsigned int run_filter(struct sk_buff *skb, + const struct sock *sk, + unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) - res = SK_RUN_FILTER(filter, skb); + res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; @@ -2107,8 +2330,8 @@ static void tpacket_destruct_skb(struct sk_buff *skb) static bool ll_header_truncated(const struct net_device *dev, int len) { /* net device doesn't like empty head */ - if (unlikely(len <= dev->hard_header_len)) { - net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n", + if (unlikely(len < dev->hard_header_len)) { + net_warn_ratelimited("%s: packet size is too short (%d < %d)\n", current->comm, len, dev->hard_header_len); return true; } @@ -2116,6 +2339,15 @@ static bool ll_header_truncated(const struct net_device *dev, int len) return false; } +static void tpacket_set_protocol(const struct net_device *dev, + struct sk_buff *skb) +{ + if (dev->type == ARPHRD_ETHER) { + skb_reset_mac_header(skb); + skb->protocol = eth_hdr(skb)->h_proto; + } +} + static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, int size_max, __be16 proto, unsigned char *addr, int hlen) @@ -2152,8 +2384,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb_reserve(skb, hlen); skb_reset_network_header(skb); - if (!packet_use_direct_xmit(po)) - skb_probe_transport_header(skb, 0); if 
(unlikely(po->tp_tx_has_off)) { int off_min, off_max, off; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); @@ -2199,6 +2429,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, dev->hard_header_len); if (unlikely(err)) return err; + if (!skb->protocol) + tpacket_set_protocol(dev, skb); data += dev->hard_header_len; to_write -= dev->hard_header_len; @@ -2233,6 +2465,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, len = ((to_write > len_max) ? len_max : to_write); } + skb_probe_transport_header(skb, 0); + return tp_len; } @@ -2277,12 +2511,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; - reserve = dev->hard_header_len + VLAN_HLEN; + if (po->sk.sk_socket->type == SOCK_RAW) + reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); - if (size_max > dev->mtu + reserve) - size_max = dev->mtu + reserve; + if (size_max > dev->mtu + reserve + VLAN_HLEN) + size_max = dev->mtu + reserve + VLAN_HLEN; do { ph = packet_current_frame(po, &po->tx_ring, @@ -2309,18 +2544,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, addr, hlen); if (likely(tp_len >= 0) && - tp_len > dev->mtu + dev->hard_header_len) { - struct ethhdr *ehdr; - /* Earlier code assumed this would be a VLAN pkt, - * double-check this now that we have the actual - * packet in hand. 
- */ + tp_len > dev->mtu + reserve && + !packet_extra_vlan_len_allowed(dev, skb)) + tp_len = -EMSGSIZE; - skb_reset_mac_header(skb); - ehdr = eth_hdr(skb); - if (ehdr->h_proto != htons(ETH_P_8021Q)) - tp_len = -EMSGSIZE; - } if (unlikely(tp_len < 0)) { if (po->tp_loss) { __packet_set_status(po, ph, @@ -2414,6 +2641,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) __be16 proto; unsigned char *addr; int err, reserve = 0; + struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; int vnet_hdr_len; @@ -2449,6 +2677,13 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; + sockc.mark = sk->sk_mark; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + goto out_unlock; + } + if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (po->has_vnet_hdr) { @@ -2466,15 +2701,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - (__virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 > - __virtio16_to_cpu(false, vnet_hdr.hdr_len))) - vnet_hdr.hdr_len = __cpu_to_virtio16(false, - __virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2); + (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 > + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len))) + vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(), + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2); err = -EINVAL; - if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len) + if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len) goto out_unlock; if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { @@ -2517,7 +2752,7 @@ static int packet_snd(struct socket *sock, 
struct msghdr *msg, size_t len) hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, - __virtio16_to_cpu(false, vnet_hdr.hdr_len), + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len), msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; @@ -2541,31 +2776,23 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); - if (!gso_type && (len > dev->mtu + reserve + extra_len)) { - /* Earlier code assumed this would be a VLAN pkt, - * double-check this now that we have the actual - * packet in hand. - */ - struct ethhdr *ehdr; - skb_reset_mac_header(skb); - ehdr = eth_hdr(skb); - if (ehdr->h_proto != htons(ETH_P_8021Q)) { - err = -EMSGSIZE; - goto out_free; - } + if (!gso_type && (len > dev->mtu + reserve + extra_len) && + !packet_extra_vlan_len_allowed(dev, skb)) { + err = -EMSGSIZE; + goto out_free; } skb->protocol = proto; skb->dev = dev; skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = sockc.mark; packet_pick_tx_queue(dev, skb); if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start); - u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset); + u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start); + u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset); if (!skb_partial_csum_set(skb, s, o)) { err = -EINVAL; goto out_free; @@ -2573,7 +2800,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) } skb_shinfo(skb)->gso_size = - __virtio16_to_cpu(false, vnet_hdr.gso_size); + __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size); skb_shinfo(skb)->gso_type = gso_type; /* Header must be checked, and gso_segs computed. 
*/ @@ -2583,8 +2810,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) len += vnet_hdr_len; } - if (!packet_use_direct_xmit(po)) - skb_probe_transport_header(skb, reserve); + skb_probe_transport_header(skb, reserve); + if (unlikely(extra_len == 4)) skb->no_fcs = 1; @@ -2687,22 +2914,40 @@ static int packet_release(struct socket *sock) * Attach a packet hook. */ -static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) +static int packet_do_bind(struct sock *sk, const char *name, int ifindex, + __be16 proto) { struct packet_sock *po = pkt_sk(sk); struct net_device *dev_curr; __be16 proto_curr; bool need_rehook; + struct net_device *dev = NULL; + int ret = 0; + bool unlisted = false; - if (po->fanout) { - if (dev) - dev_put(dev); - + if (po->fanout) return -EINVAL; - } lock_sock(sk); spin_lock(&po->bind_lock); + rcu_read_lock(); + + if (name) { + dev = dev_get_by_name_rcu(sock_net(sk), name); + if (!dev) { + ret = -ENODEV; + goto out_unlock; + } + } else if (ifindex) { + dev = dev_get_by_index_rcu(sock_net(sk), ifindex); + if (!dev) { + ret = -ENODEV; + goto out_unlock; + } + } + + if (dev) + dev_hold(dev); proto_curr = po->prot_hook.type; dev_curr = po->prot_hook.dev; @@ -2710,14 +2955,29 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) need_rehook = proto_curr != proto || dev_curr != dev; if (need_rehook) { - unregister_prot_hook(sk, true); + if (po->running) { + rcu_read_unlock(); + __unregister_prot_hook(sk, true); + rcu_read_lock(); + dev_curr = po->prot_hook.dev; + if (dev) + unlisted = !dev_get_by_index_rcu(sock_net(sk), + dev->ifindex); + } po->num = proto; po->prot_hook.type = proto; - po->prot_hook.dev = dev; - po->ifindex = dev ? dev->ifindex : 0; - packet_cached_dev_assign(po, dev); + if (unlikely(unlisted)) { + dev_put(dev); + po->prot_hook.dev = NULL; + po->ifindex = -1; + packet_cached_dev_reset(po); + } else { + po->prot_hook.dev = dev; + po->ifindex = dev ? 
dev->ifindex : 0; + packet_cached_dev_assign(po, dev); + } } if (dev_curr) dev_put(dev_curr); @@ -2725,7 +2985,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) if (proto == 0 || !need_rehook) goto out_unlock; - if (!dev || (dev->flags & IFF_UP)) { + if (!unlisted && (!dev || (dev->flags & IFF_UP))) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; @@ -2734,9 +2994,10 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) } out_unlock: + rcu_read_unlock(); spin_unlock(&po->bind_lock); release_sock(sk); - return 0; + return ret; } /* @@ -2748,8 +3009,6 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; char name[15]; - struct net_device *dev; - int err = -ENODEV; /* * Check legality @@ -2759,19 +3018,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, return -EINVAL; strlcpy(name, uaddr->sa_data, sizeof(name)); - dev = dev_get_by_name(sock_net(sk), name); - if (dev) - err = packet_do_bind(sk, dev, pkt_sk(sk)->num); - return err; + return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; - struct net_device *dev = NULL; - int err; - /* * Check legality @@ -2782,16 +3035,8 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len if (sll->sll_family != AF_PACKET) return -EINVAL; - if (sll->sll_ifindex) { - err = -ENODEV; - dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex); - if (dev == NULL) - goto out; - } - err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); - -out: - return err; + return packet_do_bind(sk, NULL, sll->sll_ifindex, + sll->sll_protocol ? 
: pkt_sk(sk)->num); } static struct proto packet_proto = { @@ -2821,7 +3066,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, sock->state = SS_UNCONNECTED; err = -ENOBUFS; - sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); + sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; @@ -2851,6 +3096,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); + po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) @@ -2928,6 +3174,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb == NULL) goto out; + if (pkt_sk(sk)->pressure) + packet_rcv_has_room(pkt_sk(sk), NULL); + if (pkt_sk(sk)->has_vnet_hdr) { struct virtio_net_hdr vnet_hdr = { 0 }; @@ -2943,9 +3192,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* This is a hint as to how much should be linear. 
*/ vnet_hdr.hdr_len = - __cpu_to_virtio16(false, skb_headlen(skb)); + __cpu_to_virtio16(vio_le(), skb_headlen(skb)); vnet_hdr.gso_size = - __cpu_to_virtio16(false, sinfo->gso_size); + __cpu_to_virtio16(vio_le(), sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) @@ -2963,9 +3212,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb->ip_summed == CHECKSUM_PARTIAL) { vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr.csum_start = __cpu_to_virtio16(false, + vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(), skb_checksum_start_offset(skb)); - vnet_hdr.csum_offset = __cpu_to_virtio16(false, + vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(), skb->csum_offset); } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; @@ -3432,6 +3681,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return fanout_add(sk, val & 0xffff, val >> 16); } + case PACKET_FANOUT_DATA: + { + if (!po->fanout) + return -EINVAL; + + return fanout_set_data(po, optval, optlen); + } case PACKET_TX_HAS_OFF: { unsigned int val; @@ -3471,6 +3727,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; + struct tpacket_rollover_stats rstats; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3546,6 +3803,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ((u32)po->fanout->flags << 24)) : 0); break; + case PACKET_ROLLOVER_STATS: + if (!po->rollover) + return -EINVAL; + rstats.tp_all = atomic_long_read(&po->rollover->num); + rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); + rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); + data = &rstats; + lv = sizeof(rstats); + break; case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; break; @@ -3683,6 
+3949,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= POLLIN | POLLRDNORM; } + if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + po->pressure = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { @@ -3842,7 +4110,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; - if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) + if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; if (po->tp_version >= TPACKET_V3 && (int)(req->tp_block_size - @@ -3854,8 +4122,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; - rb->frames_per_block = req->tp_block_size/req->tp_frame_size; - if (unlikely(rb->frames_per_block <= 0)) + rb->frames_per_block = req->tp_block_size / req->tp_frame_size; + if (unlikely(rb->frames_per_block == 0)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) @@ -3872,7 +4140,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, * it above but just being paranoid */ if (!tx_ring) - init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); + init_prb_bdqc(po, rb, pg_vec, req_u); break; default: break; @@ -3932,7 +4200,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (closing && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) - prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue); + prb_shutdown_retire_blk_timer(po, rb_queue); } release_sock(sk); diff --git a/kernel/net/packet/internal.h b/kernel/net/packet/internal.h index fe6e20cae..9ee46314b 100644 --- a/kernel/net/packet/internal.h +++ b/kernel/net/packet/internal.h @@ -79,15 +79,27 @@ struct packet_fanout { u16 id; u8 type; u8 flags; - atomic_t 
rr_cur; + union { + atomic_t rr_cur; + struct bpf_prog __rcu *bpf_prog; + }; struct list_head list; struct sock *arr[PACKET_FANOUT_MAX]; - int next[PACKET_FANOUT_MAX]; spinlock_t lock; atomic_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; }; +struct packet_rollover { + int sock; + struct rcu_head rcu; + atomic_long_t num; + atomic_long_t num_huge; + atomic_long_t num_failed; +#define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32)) + u32 history[ROLLOVER_HLEN] ____cacheline_aligned; +} ____cacheline_aligned_in_smp; + struct packet_sock { /* struct sock has to be the first member of packet_sock */ struct sock sk; @@ -102,8 +114,10 @@ struct packet_sock { auxdata:1, origdev:1, has_vnet_hdr:1; + int pressure; int ifindex; /* bound device */ __be16 num; + struct packet_rollover *rollover; struct packet_mclist *mclist; atomic_t mapped; enum tpacket_versions tp_version; diff --git a/kernel/net/phonet/af_phonet.c b/kernel/net/phonet/af_phonet.c index 32ab87d34..f92575366 100644 --- a/kernel/net/phonet/af_phonet.c +++ b/kernel/net/phonet/af_phonet.c @@ -97,7 +97,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, goto out; } - sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot); + sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern); if (sk == NULL) { err = -ENOMEM; goto out; @@ -377,6 +377,10 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, struct sockaddr_pn sa; u16 len; + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_RX_DROP; + /* check we have at least a full Phonet header */ if (!pskb_pull(skb, sizeof(struct phonethdr))) goto out; diff --git a/kernel/net/phonet/pep.c b/kernel/net/phonet/pep.c index 6de2aeb98..850a86cde 100644 --- a/kernel/net/phonet/pep.c +++ b/kernel/net/phonet/pep.c @@ -845,7 +845,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) } /* Create a new to-be-accepted sock */ - newsk = sk_alloc(sock_net(sk), PF_PHONET, 
GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; diff --git a/kernel/net/rds/af_rds.c b/kernel/net/rds/af_rds.c index 10443377f..b5476aebd 100644 --- a/kernel/net/rds/af_rds.c +++ b/kernel/net/rds/af_rds.c @@ -40,15 +40,6 @@ #include "rds.h" -char *rds_str_array(char **array, size_t elements, size_t index) -{ - if ((index < elements) && array[index]) - return array[index]; - else - return "unknown"; -} -EXPORT_SYMBOL(rds_str_array); - /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); static unsigned long rds_sock_count; @@ -81,13 +72,7 @@ static int rds_release(struct socket *sock) rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); - /* - * the binding lookup hash uses rcu, we need to - * make sure we synchronize_rcu before we free our - * entry - */ rds_remove_bound(rs); - synchronize_rcu(); rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); @@ -270,6 +255,28 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, return ret; } +static int rds_set_transport(struct rds_sock *rs, char __user *optval, + int optlen) +{ + int t_type; + + if (rs->rs_transport) + return -EOPNOTSUPP; /* previously attached to transport */ + + if (optlen != sizeof(int)) + return -EINVAL; + + if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type))) + return -EFAULT; + + if (t_type < 0 || t_type >= RDS_TRANS_COUNT) + return -EINVAL; + + rs->rs_transport = rds_trans_get(t_type); + + return rs->rs_transport ? 
0 : -ENOPROTOOPT; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -300,6 +307,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, case RDS_CONG_MONITOR: ret = rds_cong_monitor(rs, optval, optlen); break; + case SO_RDS_TRANSPORT: + lock_sock(sock->sk); + ret = rds_set_transport(rs, optval, optlen); + release_sock(sock->sk); + break; default: ret = -ENOPROTOOPT; } @@ -312,6 +324,7 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret = -ENOPROTOOPT, len; + int trans; if (level != SOL_RDS) goto out; @@ -337,6 +350,19 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, else ret = 0; break; + case SO_RDS_TRANSPORT: + if (len < sizeof(int)) { + ret = -EINVAL; + break; + } + trans = (rs->rs_transport ? rs->rs_transport->t_type : + RDS_TRANS_NONE); /* unbound */ + if (put_user(trans, (int __user *)optval) || + put_user(sizeof(int), optlen)) + ret = -EFAULT; + else + ret = 0; + break; default: break; } @@ -406,6 +432,14 @@ static const struct proto_ops rds_proto_ops = { .sendpage = sock_no_sendpage, }; +static void rds_sock_destruct(struct sock *sk) +{ + struct rds_sock *rs = rds_sk_to_rs(sk); + + WARN_ON((&rs->rs_item != rs->rs_item.next || + &rs->rs_item != rs->rs_item.prev)); +} + static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { struct rds_sock *rs; @@ -413,6 +447,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) sock_init_data(sock, sk); sock->ops = &rds_proto_ops; sk->sk_protocol = protocol; + sk->sk_destruct = rds_sock_destruct; rs = rds_sk_to_rs(sk); spin_lock_init(&rs->rs_lock); @@ -440,7 +475,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + sk = 
sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern); if (!sk) return -ENOMEM; @@ -538,6 +573,7 @@ static void rds_exit(void) rds_threads_exit(); rds_stats_exit(); rds_page_exit(); + rds_bind_lock_destroy(); rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); } @@ -547,9 +583,14 @@ static int rds_init(void) { int ret; - ret = rds_conn_init(); + ret = rds_bind_lock_init(); if (ret) goto out; + + ret = rds_conn_init(); + if (ret) + goto out_bind; + ret = rds_threads_init(); if (ret) goto out_conn; @@ -583,6 +624,8 @@ out_conn: rds_conn_exit(); rds_cong_exit(); rds_page_exit(); +out_bind: + rds_bind_lock_destroy(); out: return ret; } diff --git a/kernel/net/rds/bind.c b/kernel/net/rds/bind.c index a2e6562da..b22ea9565 100644 --- a/kernel/net/rds/bind.c +++ b/kernel/net/rds/bind.c @@ -38,51 +38,16 @@ #include #include "rds.h" -#define BIND_HASH_SIZE 1024 -static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; -static DEFINE_SPINLOCK(rds_bind_lock); +static struct rhashtable bind_hash_table; -static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) -{ - return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & - (BIND_HASH_SIZE - 1)); -} - -static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, - struct rds_sock *insert) -{ - struct rds_sock *rs; - struct hlist_head *head = hash_to_bucket(addr, port); - u64 cmp; - u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - - rcu_read_lock(); - hlist_for_each_entry_rcu(rs, head, rs_bound_node) { - cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | - be16_to_cpu(rs->rs_bound_port); - - if (cmp == needle) { - rcu_read_unlock(); - return rs; - } - } - rcu_read_unlock(); - - if (insert) { - /* - * make sure our addr and port are set before - * we are added to the list, other people - * in rcu will find us as soon as the - * hlist_add_head_rcu is done - */ - insert->rs_bound_addr = addr; - insert->rs_bound_port = 
port; - rds_sock_addref(insert); - - hlist_add_head_rcu(&insert->rs_bound_node, head); - } - return NULL; -} +static struct rhashtable_params ht_parms = { + .nelem_hint = 768, + .key_len = sizeof(u64), + .key_offset = offsetof(struct rds_sock, rs_bound_key), + .head_offset = offsetof(struct rds_sock, rs_bound_node), + .max_size = 16384, + .min_size = 1024, +}; /* * Return the rds_sock bound at the given local address. @@ -92,10 +57,10 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, */ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { + u64 key = ((u64)addr << 32) | port; struct rds_sock *rs; - rs = rds_bind_lookup(addr, port, NULL); - + rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms); if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else @@ -103,15 +68,16 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); + return rs; } /* returns -ve errno or +ve port */ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) { - unsigned long flags; int ret = -EADDRINUSE; u16 rover, last; + u64 key; if (*port != 0) { rover = be16_to_cpu(*port); @@ -121,42 +87,49 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - spin_lock_irqsave(&rds_bind_lock, flags); - do { if (rover == 0) rover++; - if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + + key = ((u64)addr << 32) | cpu_to_be16(rover); + if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms)) + continue; + + rs->rs_bound_key = key; + rs->rs_bound_addr = addr; + rs->rs_bound_port = cpu_to_be16(rover); + rs->rs_bound_node.next = NULL; + rds_sock_addref(rs); + if (!rhashtable_insert_fast(&bind_hash_table, + &rs->rs_bound_node, ht_parms)) { *port = rs->rs_bound_port; ret = 0; rdsdebug("rs %p binding to %pI4:%d\n", rs, &addr, (int)ntohs(*port)); break; + } else { + rds_sock_put(rs); + ret = -ENOMEM; + break; } } while 
(rover++ != last); - spin_unlock_irqrestore(&rds_bind_lock, flags); - return ret; } void rds_remove_bound(struct rds_sock *rs) { - unsigned long flags; - - spin_lock_irqsave(&rds_bind_lock, flags); - if (rs->rs_bound_addr) { - rdsdebug("rs %p unbinding from %pI4:%d\n", - rs, &rs->rs_bound_addr, - ntohs(rs->rs_bound_port)); + if (!rs->rs_bound_addr) + return; - hlist_del_init_rcu(&rs->rs_bound_node); - rds_sock_put(rs); - rs->rs_bound_addr = 0; - } + rdsdebug("rs %p unbinding from %pI4:%d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port)); - spin_unlock_irqrestore(&rds_bind_lock, flags); + rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); + rds_sock_put(rs); + rs->rs_bound_addr = 0; } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -181,7 +154,19 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (ret) goto out; - trans = rds_trans_get_preferred(sin->sin_addr.s_addr); + if (rs->rs_transport) { /* previously bound */ + trans = rs->rs_transport; + if (trans->laddr_check(sock_net(sock->sk), + sin->sin_addr.s_addr) != 0) { + ret = -ENOPROTOOPT; + rds_remove_bound(rs); + } else { + ret = 0; + } + goto out; + } + trans = rds_trans_get_preferred(sock_net(sock->sk), + sin->sin_addr.s_addr); if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); @@ -195,9 +180,15 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out: release_sock(sk); - - /* we might have called rds_remove_bound on error */ - if (ret) - synchronize_rcu(); return ret; } + +void rds_bind_lock_destroy(void) +{ + rhashtable_destroy(&bind_hash_table); +} + +int rds_bind_lock_init(void) +{ + return rhashtable_init(&bind_hash_table, &ht_parms); +} diff --git a/kernel/net/rds/connection.c b/kernel/net/rds/connection.c index da6da57e5..e3b118cae 100644 --- a/kernel/net/rds/connection.c +++ b/kernel/net/rds/connection.c @@ -70,7 +70,8 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) } 
while (0) /* rcu read lock must be held or the connection spinlock */ -static struct rds_connection *rds_conn_lookup(struct hlist_head *head, +static struct rds_connection *rds_conn_lookup(struct net *net, + struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) { @@ -78,7 +79,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && - conn->c_trans == trans) { + conn->c_trans == trans && net == rds_conn_net(conn)) { ret = conn; break; } @@ -117,7 +118,8 @@ static void rds_conn_reset(struct rds_connection *conn) * For now they are not garbage collected once they're created. They * are torn down as the module is removed, if ever. */ -static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, +static struct rds_connection *__rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp, int is_outgoing) { @@ -126,12 +128,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *loop_trans; unsigned long flags; int ret; - struct rds_transport *otrans = trans; - if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) - goto new_conn; rcu_read_lock(); - conn = rds_conn_lookup(head, laddr, faddr, trans); + conn = rds_conn_lookup(net, head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && laddr == faddr && !is_outgoing) { /* This is a looped back IB connection, and we're @@ -145,7 +144,6 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, if (conn) goto out; -new_conn: conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); @@ -157,6 +155,7 @@ new_conn: conn->c_faddr = faddr; spin_lock_init(&conn->c_lock); conn->c_next_tx_seq = 1; + rds_conn_net_set(conn, net); init_waitqueue_head(&conn->c_waitq); 
INIT_LIST_HEAD(&conn->c_send_queue); @@ -174,7 +173,7 @@ new_conn: * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - loop_trans = rds_trans_get_preferred(faddr); + loop_trans = rds_trans_get_preferred(net, faddr); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; @@ -198,6 +197,7 @@ new_conn: atomic_set(&conn->c_state, RDS_CONN_DOWN); conn->c_send_gen = 0; + conn->c_outgoing = (is_outgoing ? 1 : 0); conn->c_reconnect_jiffies = 0; INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); @@ -234,22 +234,13 @@ new_conn: /* Creating normal conn */ struct rds_connection *found; - if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) - found = NULL; - else - found = rds_conn_lookup(head, laddr, faddr, trans); + found = rds_conn_lookup(net, head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) || - (otrans->t_type != RDS_TRANS_TCP)) { - /* Only the active side should be added to - * reconnect list for TCP. 
- */ - hlist_add_head_rcu(&conn->c_hash_node, head); - } + hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } @@ -260,17 +251,19 @@ out: return conn; } -struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, 0); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 0); } EXPORT_SYMBOL_GPL(rds_conn_create); -struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create_outgoing(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, 1); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 1); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); @@ -297,6 +290,8 @@ void rds_conn_shutdown(struct rds_connection *conn) wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags)); + wait_event(conn->c_waitq, + !test_bit(RDS_RECV_REFILL, &conn->c_flags)); conn->c_trans->conn_shutdown(conn); rds_conn_reset(conn); @@ -324,7 +319,9 @@ void rds_conn_shutdown(struct rds_connection *conn) rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); - rds_queue_reconnect(conn); + if (conn->c_trans->t_type != RDS_TRANS_TCP || + conn->c_outgoing == 1) + rds_queue_reconnect(conn); } else { rcu_read_unlock(); } diff --git a/kernel/net/rds/ib.c b/kernel/net/rds/ib.c index ba2dffeff..f222885ac 100644 --- a/kernel/net/rds/ib.c +++ b/kernel/net/rds/ib.c @@ -43,14 +43,14 @@ #include "rds.h" #include "ib.h" -static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; -unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ +unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE; +unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE; unsigned int 
rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; -module_param(fmr_pool_size, int, 0444); -MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); -module_param(fmr_message_size, int, 0444); -MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); +module_param(rds_ib_fmr_1m_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA"); +module_param(rds_ib_fmr_8k_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); @@ -97,10 +97,10 @@ static void rds_ib_dev_free(struct work_struct *work) struct rds_ib_device *rds_ibdev = container_of(work, struct rds_ib_device, free_work); - if (rds_ibdev->mr_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); - if (rds_ibdev->mr) - ib_dereg_mr(rds_ibdev->mr); + if (rds_ibdev->mr_8k_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); + if (rds_ibdev->mr_1m_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); @@ -150,9 +150,13 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_fmrs = dev_attr->max_fmr ? - min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : - fmr_pool_size; + rds_ibdev->max_1m_fmrs = dev_attr->max_mr ? + min_t(unsigned int, (dev_attr->max_mr / 2), + rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; + + rds_ibdev->max_8k_fmrs = dev_attr->max_mr ? 
+ min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE), + rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; @@ -164,18 +168,25 @@ static void rds_ib_add_one(struct ib_device *device) goto put_dev; } - rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_ibdev->mr)) { - rds_ibdev->mr = NULL; + rds_ibdev->mr_1m_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); + if (IS_ERR(rds_ibdev->mr_1m_pool)) { + rds_ibdev->mr_1m_pool = NULL; goto put_dev; } - rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); - if (IS_ERR(rds_ibdev->mr_pool)) { - rds_ibdev->mr_pool = NULL; + rds_ibdev->mr_8k_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); + if (IS_ERR(rds_ibdev->mr_8k_pool)) { + rds_ibdev->mr_8k_pool = NULL; goto put_dev; } + rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", + dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, + rds_ibdev->max_8k_fmrs); + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); @@ -230,11 +241,10 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) * * This can be called at any time and can be racing with any other RDS path. */ -static void rds_ib_remove_one(struct ib_device *device) +static void rds_ib_remove_one(struct ib_device *device, void *client_data) { - struct rds_ib_device *rds_ibdev; + struct rds_ib_device *rds_ibdev = client_data; - rds_ibdev = ib_get_client_data(device, &rds_ib_client); if (!rds_ibdev) return; @@ -317,7 +327,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". 
*/ -static int rds_ib_laddr_check(__be32 addr) +static int rds_ib_laddr_check(struct net *net, __be32 addr) { int ret; struct rdma_cm_id *cm_id; @@ -326,7 +336,7 @@ static int rds_ib_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); @@ -366,6 +376,7 @@ void rds_ib_exit(void) rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); + rds_ib_fmr_exit(); } struct rds_transport rds_ib_transport = { @@ -401,10 +412,14 @@ int rds_ib_init(void) INIT_LIST_HEAD(&rds_ib_devices); - ret = ib_register_client(&rds_ib_client); + ret = rds_ib_fmr_init(); if (ret) goto out; + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out_fmr_exit; + ret = rds_ib_sysctl_init(); if (ret) goto out_ibreg; @@ -427,6 +442,8 @@ out_sysctl: rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); +out_fmr_exit: + rds_ib_fmr_exit(); out: return ret; } diff --git a/kernel/net/rds/ib.h b/kernel/net/rds/ib.h index c36d71322..b3fdebb57 100644 --- a/kernel/net/rds/ib.h +++ b/kernel/net/rds/ib.h @@ -9,8 +9,11 @@ #include "rds.h" #include "rdma_transport.h" -#define RDS_FMR_SIZE 256 -#define RDS_FMR_POOL_SIZE 8192 +#define RDS_FMR_1M_POOL_SIZE (8192 / 2) +#define RDS_FMR_1M_MSG_SIZE 256 +#define RDS_FMR_8K_MSG_SIZE 2 +#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1)) +#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 @@ -24,6 +27,9 @@ #define RDS_IB_RECYCLE_BATCH_COUNT 32 +#define RDS_IB_WC_MAX 32 +#define RDS_IB_SEND_OP BIT_ULL(63) + extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; @@ -69,7 +75,11 @@ struct rds_ib_connect_private { struct rds_ib_send_work { void *s_op; - struct ib_send_wr s_wr; + union { + struct 
ib_send_wr s_wr; + struct ib_rdma_wr s_rdma_wr; + struct ib_atomic_wr s_atomic_wr; + }; struct ib_sge s_sge[RDS_IB_MAX_SGE]; unsigned long s_queued; }; @@ -89,6 +99,20 @@ struct rds_ib_work_ring { atomic_t w_free_ctr; }; +/* Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_ib_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + + struct rds_ib_device; struct rds_ib_connection { @@ -100,9 +124,14 @@ struct rds_ib_connection { /* alphabet soup, IBTA style */ struct rdma_cm_id *i_cm_id; struct ib_pd *i_pd; - struct ib_mr *i_mr; struct ib_cq *i_send_cq; struct ib_cq *i_recv_cq; + struct ib_wc i_send_wc[RDS_IB_WC_MAX]; + struct ib_wc i_recv_wc[RDS_IB_WC_MAX]; + + /* interrupt handling */ + struct tasklet_struct i_send_tasklet; + struct tasklet_struct i_recv_tasklet; /* tx */ struct rds_ib_work_ring i_send_ring; @@ -113,7 +142,6 @@ struct rds_ib_connection { atomic_t i_signaled_sends; /* rx */ - struct tasklet_struct i_recv_tasklet; struct mutex i_recv_mutex; struct rds_ib_work_ring i_recv_ring; struct rds_ib_incoming *i_ibinc; @@ -165,6 +193,12 @@ struct rds_ib_connection { struct rds_ib_ipaddr { struct list_head list; __be32 ipaddr; + struct rcu_head rcu; +}; + +enum { + RDS_IB_MR_8K_POOL, + RDS_IB_MR_1M_POOL, }; struct rds_ib_device { @@ -173,10 +207,12 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - struct ib_mr *mr; - struct rds_ib_mr_pool *mr_pool; - unsigned int fmr_max_remaps; unsigned int max_fmrs; + struct rds_ib_mr_pool *mr_1m_pool; + struct rds_ib_mr_pool *mr_8k_pool; + unsigned int fmr_max_remaps; + unsigned int max_8k_fmrs; + unsigned int max_1m_fmrs; int max_sge; 
unsigned int max_wrs; unsigned int max_initiator_depth; @@ -199,14 +235,14 @@ struct rds_ib_device { struct rds_ib_statistics { uint64_t s_ib_connect_raced; uint64_t s_ib_listen_closed_stale; - uint64_t s_ib_tx_cq_call; + uint64_t s_ib_evt_handler_call; + uint64_t s_ib_tasklet_call; uint64_t s_ib_tx_cq_event; uint64_t s_ib_tx_ring_full; uint64_t s_ib_tx_throttle; uint64_t s_ib_tx_sg_mapping_failure; uint64_t s_ib_tx_stalled; uint64_t s_ib_tx_credit_updates; - uint64_t s_ib_rx_cq_call; uint64_t s_ib_rx_cq_event; uint64_t s_ib_rx_ring_empty; uint64_t s_ib_rx_refill_from_cq; @@ -218,12 +254,18 @@ struct rds_ib_statistics { uint64_t s_ib_ack_send_delayed; uint64_t s_ib_ack_send_piggybacked; uint64_t s_ib_ack_received; - uint64_t s_ib_rdma_mr_alloc; - uint64_t s_ib_rdma_mr_free; - uint64_t s_ib_rdma_mr_used; - uint64_t s_ib_rdma_mr_pool_flush; - uint64_t s_ib_rdma_mr_pool_wait; - uint64_t s_ib_rdma_mr_pool_depleted; + uint64_t s_ib_rdma_mr_8k_alloc; + uint64_t s_ib_rdma_mr_8k_free; + uint64_t s_ib_rdma_mr_8k_used; + uint64_t s_ib_rdma_mr_8k_pool_flush; + uint64_t s_ib_rdma_mr_8k_pool_wait; + uint64_t s_ib_rdma_mr_8k_pool_depleted; + uint64_t s_ib_rdma_mr_1m_alloc; + uint64_t s_ib_rdma_mr_1m_free; + uint64_t s_ib_rdma_mr_1m_used; + uint64_t s_ib_rdma_mr_1m_pool_flush; + uint64_t s_ib_rdma_mr_1m_pool_wait; + uint64_t s_ib_rdma_mr_1m_pool_depleted; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; }; @@ -235,28 +277,34 @@ extern struct workqueue_struct *rds_ib_wq; * doesn't define it. 
*/ static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) + struct scatterlist *sglist, + unsigned int sg_dma_len, + int direction) { + struct scatterlist *sg; unsigned int i; - for (i = 0; i < sg_dma_len; ++i) { + for_each_sg(sglist, sg, sg_dma_len, i) { ib_dma_sync_single_for_cpu(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), + ib_sg_dma_address(dev, sg), + ib_sg_dma_len(dev, sg), direction); } } #define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) + struct scatterlist *sglist, + unsigned int sg_dma_len, + int direction) { + struct scatterlist *sg; unsigned int i; - for (i = 0; i < sg_dma_len; ++i) { + for_each_sg(sglist, sg, sg_dma_len, i) { ib_dma_sync_single_for_device(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), + ib_sg_dma_address(dev, sg), + ib_sg_dma_len(dev, sg), direction); } } @@ -269,7 +317,8 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int fmr_message_size; +extern unsigned int rds_ib_fmr_1m_pool_size; +extern unsigned int rds_ib_fmr_8k_pool_size; extern unsigned int rds_ib_retry_count; extern spinlock_t ib_nodev_conns_lock; @@ -299,7 +348,8 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, + int npages); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct 
rds_info_rdma_connection *iinfo); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, @@ -307,6 +357,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); +int rds_ib_fmr_init(void); +void rds_ib_fmr_exit(void); /* ib_recv.c */ int rds_ib_recv_init(void); @@ -314,10 +366,11 @@ void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); void rds_ib_recv_free_caches(struct rds_ib_connection *ic); -void rds_ib_recv_refill(struct rds_connection *conn, int prefill); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc, + struct rds_ib_ack_state *state); void rds_ib_recv_tasklet_fn(unsigned long data); void rds_ib_recv_init_ring(struct rds_ib_connection *ic); void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); @@ -325,6 +378,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic); void rds_ib_attempt_ack(struct rds_ib_connection *ic); void rds_ib_ack_send_complete(struct rds_ib_connection *ic); u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required); /* ib_ring.c */ void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); @@ -339,11 +393,10 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); extern wait_queue_head_t rds_ib_ring_empty_wait; /* ib_send.c */ -char *rds_ib_wc_status_str(enum ib_wc_status status); void rds_ib_xmit_complete(struct rds_connection *conn); int 
rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); -void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); void rds_ib_send_init_ring(struct rds_ib_connection *ic); void rds_ib_send_clear_ring(struct rds_ib_connection *ic); int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); diff --git a/kernel/net/rds/ib_cm.c b/kernel/net/rds/ib_cm.c index 8a09ee7db..da5a7fb98 100644 --- a/kernel/net/rds/ib_cm.c +++ b/kernel/net/rds/ib_cm.c @@ -39,36 +39,6 @@ #include "rds.h" #include "ib.h" -static char *rds_ib_event_type_strings[] = { -#define RDS_IB_EVENT_STRING(foo) \ - [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) - RDS_IB_EVENT_STRING(CQ_ERR), - RDS_IB_EVENT_STRING(QP_FATAL), - RDS_IB_EVENT_STRING(QP_REQ_ERR), - RDS_IB_EVENT_STRING(QP_ACCESS_ERR), - RDS_IB_EVENT_STRING(COMM_EST), - RDS_IB_EVENT_STRING(SQ_DRAINED), - RDS_IB_EVENT_STRING(PATH_MIG), - RDS_IB_EVENT_STRING(PATH_MIG_ERR), - RDS_IB_EVENT_STRING(DEVICE_FATAL), - RDS_IB_EVENT_STRING(PORT_ACTIVE), - RDS_IB_EVENT_STRING(PORT_ERR), - RDS_IB_EVENT_STRING(LID_CHANGE), - RDS_IB_EVENT_STRING(PKEY_CHANGE), - RDS_IB_EVENT_STRING(SM_CHANGE), - RDS_IB_EVENT_STRING(SRQ_ERR), - RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), - RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), - RDS_IB_EVENT_STRING(CLIENT_REREGISTER), -#undef RDS_IB_EVENT_STRING -}; - -static char *rds_ib_event_str(enum ib_event_type type) -{ - return rds_str_array(rds_ib_event_type_strings, - ARRAY_SIZE(rds_ib_event_type_strings), type); -}; - /* * Set the selected protocol version */ @@ -165,7 +135,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_ib_recv_init_ring(ic); /* Post receive buffers - as a side effect, this will update * the posted credit count. 
*/ - rds_ib_recv_refill(conn, 1); + rds_ib_recv_refill(conn, 1, GFP_KERNEL); /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); @@ -243,7 +213,97 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, static void rds_ib_cq_event_handler(struct ib_event *event, void *data) { rdsdebug("event %u (%s) data %p\n", - event->event, rds_ib_event_str(event->event), data); + event->event, ib_event_msg(event->event), data); +} + +/* Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. + */ +static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_recv_tasklet); +} + +static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs, + struct rds_ib_ack_state *ack_state) +{ + int nr; + int i; + struct ib_wc *wc; + + while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { + for (i = 0; i < nr; i++) { + wc = wcs + i; + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + wc->byte_len, be32_to_cpu(wc->ex.imm_data)); + + if (wc->wr_id & RDS_IB_SEND_OP) + rds_ib_send_cqe_handler(ic, wc); + else + rds_ib_recv_cqe_handler(ic, wc, ack_state); + } + } +} + +static void rds_ib_tasklet_fn_send(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *)data; + struct rds_connection *conn = ic->conn; + struct rds_ib_ack_state state; + + rds_ib_stats_inc(s_ib_tasklet_call); + + memset(&state, 
0, sizeof(state)); + poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + + if (rds_conn_up(conn) && + (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued))) + rds_send_xmit(ic->conn); +} + +static void rds_ib_tasklet_fn_recv(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *)data; + struct rds_connection *conn = ic->conn; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + struct rds_ib_ack_state state; + + if (!rds_ibdev) + rds_conn_drop(conn); + + rds_ib_stats_inc(s_ib_tasklet_call); + + memset(&state, 0, sizeof(state)); + poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + + if (state.ack_next_valid) + rds_ib_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); } static void rds_ib_qp_event_handler(struct ib_event *event, void *data) @@ -252,7 +312,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) struct rds_ib_connection *ic = conn->c_transport_data; rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, - rds_ib_event_str(event->event)); + ib_event_msg(event->event)); switch (event->event) { case IB_EVENT_COMM_EST: @@ -261,13 +321,25 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) default: rdsdebug("Fatal QP Event %u (%s) " "- connection %pI4->%pI4, reconnecting\n", - event->event, rds_ib_event_str(event->event), + event->event, ib_event_msg(event->event), &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); break; } } +static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn 
= context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_send_tasklet); +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -277,6 +349,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct rds_ib_connection *ic = conn->c_transport_data; struct ib_device *dev = ic->i_cm_id->device; struct ib_qp_init_attr attr; + struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; int ret; @@ -298,11 +371,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; - ic->i_mr = rds_ibdev->mr; - ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, + cq_attr.cqe = ic->i_send_ring.w_nr + 1; + + ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, rds_ib_cq_event_handler, conn, - ic->i_send_ring.w_nr + 1, 0); + &cq_attr); if (IS_ERR(ic->i_send_cq)) { ret = PTR_ERR(ic->i_send_cq); ic->i_send_cq = NULL; @@ -310,9 +384,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto out; } - ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, + cq_attr.cqe = ic->i_recv_ring.w_nr; + ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, rds_ib_cq_event_handler, conn, - ic->i_recv_ring.w_nr, 0); + &cq_attr); if (IS_ERR(ic->i_recv_cq)) { ret = PTR_ERR(ic->i_recv_cq); ic->i_recv_cq = NULL; @@ -402,7 +477,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rds_ib_recv_init_ack(ic); - rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, ic->i_send_cq, ic->i_recv_cq); out: @@ -475,8 +550,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, (unsigned long long)be64_to_cpu(lguid), (unsigned long long)be64_to_cpu(fguid)); - conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, - 
GFP_KERNEL); + /* RDS/IB is not currently netns aware, thus init_net */ + conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, + &rds_ib_transport, GFP_KERNEL); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; @@ -592,7 +668,7 @@ int rds_ib_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ - ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); @@ -664,8 +740,18 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0)); + tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); + /* first destroy the ib state that generates callbacks */ + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + + /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * @@ -689,12 +775,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) if (ic->i_recvs) rds_ib_recv_clear_ring(ic); - if (ic->i_cm_id->qp) - rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) - ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) - ib_destroy_cq(ic->i_recv_cq); rdma_destroy_id(ic->i_cm_id); /* @@ -705,7 +785,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ic->i_cm_id = NULL; ic->i_pd = NULL; - ic->i_mr = NULL; ic->i_send_cq = NULL; ic->i_recv_cq = NULL; ic->i_send_hdrs = NULL; @@ -768,8 +847,10 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) } INIT_LIST_HEAD(&ic->ib_node); - tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, - (unsigned long) ic); + 
tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send, + (unsigned long)ic); + tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv, + (unsigned long)ic); mutex_init(&ic->i_recv_mutex); #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&ic->i_ack_lock); diff --git a/kernel/net/rds/ib_rdma.c b/kernel/net/rds/ib_rdma.c index c8faaf364..19123a97b 100644 --- a/kernel/net/rds/ib_rdma.c +++ b/kernel/net/rds/ib_rdma.c @@ -66,6 +66,7 @@ struct rds_ib_mr { * Our own little FMR pool */ struct rds_ib_mr_pool { + unsigned int pool_type; struct mutex flush_lock; /* serialize fmr invalidate */ struct delayed_work flush_worker; /* flush worker */ @@ -84,6 +85,25 @@ struct rds_ib_mr_pool { struct ib_fmr_attr fmr_attr; }; +static struct workqueue_struct *rds_ib_fmr_wq; + +int rds_ib_fmr_init(void) +{ + rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); + if (!rds_ib_fmr_wq) + return -ENOMEM; + return 0; +} + +/* By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. 
+ */ +void rds_ib_fmr_exit(void) +{ + destroy_workqueue(rds_ib_fmr_wq); +} + static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); static void rds_ib_mr_pool_flush_worker(struct work_struct *work); @@ -141,10 +161,8 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) } spin_unlock_irq(&rds_ibdev->spinlock); - if (to_free) { - synchronize_rcu(); - kfree(to_free); - } + if (to_free) + kfree_rcu(to_free, rcu); } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) @@ -152,12 +170,17 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); - if (rds_ibdev_old) { + if (!rds_ibdev_old) + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + + if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); rds_ib_dev_put(rds_ibdev_old); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } + rds_ib_dev_put(rds_ibdev_old); - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) @@ -213,7 +236,8 @@ void rds_ib_destroy_nodev_conns(void) rds_conn_destroy(ic->conn); } -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, + int pool_type) { struct rds_ib_mr_pool *pool; @@ -221,6 +245,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) if (!pool) return ERR_PTR(-ENOMEM); + pool->pool_type = pool_type; init_llist_head(&pool->free_list); init_llist_head(&pool->drop_list); init_llist_head(&pool->clean_list); @@ -228,28 +253,30 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); - 
pool->fmr_attr.max_pages = fmr_message_size; + if (pool_type == RDS_IB_MR_1M_POOL) { + /* +1 allows for unaligned MRs */ + pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1; + pool->max_items = RDS_FMR_1M_POOL_SIZE; + } else { + /* pool_type == RDS_IB_MR_8K_POOL */ + pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1; + pool->max_items = RDS_FMR_8K_POOL_SIZE; + } + + pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; pool->fmr_attr.page_shift = PAGE_SHIFT; - pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; - - /* We never allow more than max_items MRs to be allocated. - * When we exceed more than max_items_soft, we start freeing - * items more aggressively. - * Make sure that max_items > max_items_soft > max_items / 2 - */ pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; - pool->max_items = rds_ibdev->max_fmrs; return pool; } void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; - iinfo->rdma_mr_max = pool->max_items; - iinfo->rdma_mr_size = pool->fmr_attr.max_pages; + iinfo->rdma_mr_max = pool_1m->max_items; + iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) @@ -291,14 +318,28 @@ static inline void wait_clean_list_grace(void) } } -static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) +static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, + int npages) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool; struct rds_ib_mr *ibmr = NULL; int err = 0, iter = 0; + if (npages <= RDS_FMR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) - schedule_delayed_work(&pool->flush_worker, 10); + 
queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); + + /* Switch pools if one of the pool is reaching upper limit */ + if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + pool = rds_ibdev->mr_1m_pool; + else + pool = rds_ibdev->mr_8k_pool; + } while (1) { ibmr = rds_ib_reuse_fmr(pool); @@ -320,12 +361,18 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) atomic_dec(&pool->item_count); if (++iter > 2) { - rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); return ERR_PTR(-EAGAIN); } /* We do have some empty MRs. Flush them out. */ - rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); rds_ib_flush_mr_pool(pool, 0, &ibmr); if (ibmr) return ibmr; @@ -337,8 +384,6 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) goto out_no_cigar; } - memset(ibmr, 0, sizeof(*ibmr)); - ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | @@ -352,7 +397,12 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) goto out_no_cigar; } - rds_ib_stats_inc(s_ib_rdma_mr_alloc); + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + return ibmr; out_no_cigar: @@ -408,7 +458,7 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm } page_cnt += len >> PAGE_SHIFT; - if (page_cnt > fmr_message_size) + if (page_cnt > ibmr->pool->fmr_attr.max_pages) return -EINVAL; dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, @@ -440,7 +490,10 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct 
rds_ib_mr *ibm ibmr->sg_dma_len = sg_dma_len; ibmr->remap_count++; - rds_ib_stats_inc(s_ib_rdma_mr_used); + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); ret = 0; out: @@ -486,7 +539,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) /* FIXME we need a way to tell a r/w MR * from a r/o MR */ - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } @@ -503,8 +556,7 @@ static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) __rds_ib_teardown_mr(ibmr); if (pinned) { - struct rds_ib_device *rds_ibdev = ibmr->device; - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool = ibmr->pool; atomic_sub(pinned, &pool->free_pinned); } @@ -524,11 +576,13 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr /* * given an llist of mrs, put them all into the list_head for more processing */ -static void llist_append_to_list(struct llist_head *llist, struct list_head *list) +static unsigned int llist_append_to_list(struct llist_head *llist, + struct list_head *list) { struct rds_ib_mr *ibmr; struct llist_node *node; struct llist_node *next; + unsigned int count = 0; node = llist_del_all(llist); while (node) { @@ -536,7 +590,9 @@ static void llist_append_to_list(struct llist_head *llist, struct list_head *lis ibmr = llist_entry(node, struct rds_ib_mr, llnode); list_add_tail(&ibmr->unmap_list, list); node = next; + count++; } + return count; } /* @@ -569,7 +625,7 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, * to free as many MRs as needed to get back to this limit. 
*/ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, - int free_all, struct rds_ib_mr **ibmr_ret) + int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr, *next; struct llist_node *clean_nodes; @@ -577,14 +633,17 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, LIST_HEAD(unmap_list); LIST_HEAD(fmr_list); unsigned long unpinned = 0; - unsigned int nfreed = 0, ncleaned = 0, free_goal; + unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; int ret = 0; - rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); if (ibmr_ret) { DEFINE_WAIT(wait); - while(!mutex_trylock(&pool->flush_lock)) { + while (!mutex_trylock(&pool->flush_lock)) { ibmr = rds_ib_reuse_fmr(pool); if (ibmr) { *ibmr_ret = ibmr; @@ -619,8 +678,8 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, /* Get the list of all MRs to be dropped. Ordering matters - * we want to put drop_list ahead of free_list. 
*/ - llist_append_to_list(&pool->drop_list, &unmap_list); - llist_append_to_list(&pool->free_list, &unmap_list); + dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); + dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) llist_append_to_list(&pool->clean_list, &unmap_list); @@ -641,14 +700,17 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { unpinned += ibmr->sg_len; __rds_ib_teardown_mr(ibmr); - if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { - rds_ib_stats_inc(s_ib_rdma_mr_free); + if (nfreed < free_goal || + ibmr->remap_count >= pool->fmr_attr.max_maps) { + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); list_del(&ibmr->unmap_list); ib_dealloc_fmr(ibmr->fmr); kfree(ibmr); nfreed++; } - ncleaned++; } if (!list_empty(&unmap_list)) { @@ -674,7 +736,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, } atomic_sub(unpinned, &pool->free_pinned); - atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(dirty_to_clean, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); out: @@ -695,8 +757,8 @@ static void rds_ib_mr_pool_flush_worker(struct work_struct *work) void rds_ib_free_mr(void *trans_private, int invalidate) { struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_mr_pool *pool = ibmr->pool; struct rds_ib_device *rds_ibdev = ibmr->device; - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); @@ -711,16 +773,18 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || - atomic_read(&pool->dirty_count) >= pool->max_items / 10) - schedule_delayed_work(&pool->flush_worker, 10); + atomic_read(&pool->dirty_count) >= pool->max_items / 5) 
+ queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked - * as use_once and invalidate at the same time. */ - schedule_delayed_work(&pool->flush_worker, 10); + * as use_once and invalidate at the same time. + */ + queue_delayed_work(rds_ib_fmr_wq, + &pool->flush_worker, 10); } } @@ -733,10 +797,11 @@ void rds_ib_flush_mrs(void) down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + if (rds_ibdev->mr_8k_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); - if (pool) - rds_ib_flush_mr_pool(pool, 0, NULL); + if (rds_ibdev->mr_1m_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); } up_read(&rds_ib_devices_lock); } @@ -754,12 +819,12 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } - if (!rds_ibdev->mr_pool) { + if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = -ENODEV; goto out; } - ibmr = rds_ib_alloc_fmr(rds_ibdev); + ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); if (IS_ERR(ibmr)) { rds_ib_dev_put(rds_ibdev); return ibmr; diff --git a/kernel/net/rds/ib_recv.c b/kernel/net/rds/ib_recv.c index 1b981a4e4..977fb8606 100644 --- a/kernel/net/rds/ib_recv.c +++ b/kernel/net/rds/ib_recv.c @@ -62,12 +62,12 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) sge = &recv->r_sge[0]; sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; sge = &recv->r_sge[1]; sge->addr = 0; sge->length = RDS_FRAG_SIZE; - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; } } @@ -297,7 +297,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic } static int rds_ib_recv_refill_one(struct rds_connection *conn, - struct 
rds_ib_recv_work *recv, int prefill) + struct rds_ib_recv_work *recv, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct ib_sge *sge; @@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, gfp_t slab_mask = GFP_NOWAIT; gfp_t page_mask = GFP_NOWAIT; - if (prefill) { + if (gfp & __GFP_DIRECT_RECLAIM) { slab_mask = GFP_KERNEL; page_mask = GFP_HIGHUSER; } @@ -347,6 +347,24 @@ out: return ret; } +static int acquire_refill(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0; +} + +static void release_refill(struct rds_connection *conn) +{ + clear_bit(RDS_RECV_REFILL, &conn->c_flags); + + /* We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + /* * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into @@ -354,15 +372,23 @@ out: * * -1 is returned if posting fails due to temporary resource exhaustion. */ -void rds_ib_recv_refill(struct rds_connection *conn, int prefill) +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_recv_work *recv; struct ib_recv_wr *failed_wr; unsigned int posted = 0; int ret = 0; + bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM); u32 pos; + /* the goal here is to just make sure that someone, somewhere + * is posting buffers. 
If we can't get the refill lock, + * let them do their thing + */ + if (!acquire_refill(conn)) + return; + while ((prefill || rds_conn_up(conn)) && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { if (pos >= ic->i_recv_ring.w_nr) { @@ -372,7 +398,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) } recv = &ic->i_recvs[pos]; - ret = rds_ib_recv_refill_one(conn, recv, prefill); + ret = rds_ib_recv_refill_one(conn, recv, gfp); if (ret) { break; } @@ -402,6 +428,24 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) if (ret) rds_ib_ring_unalloc(&ic->i_recv_ring, 1); + + release_refill(conn); + + /* if we're called from the softirq handler, we'll be GFP_NOWAIT. + * in this case the ring being low is going to lead to more interrupts + * and we can safely let the softirq code take care of it unless the + * ring is completely empty. + * + * if we're called from krdsd, we'll be GFP_KERNEL. In this case + * we might have raced with the softirq code while we had the refill + * lock held. Use rds_ib_ring_low() instead of ring_empty to decide + * if we should requeue. + */ + if (rds_conn_up(conn) && + ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) || + rds_ib_ring_empty(&ic->i_recv_ring))) { + queue_delayed_work(rds_wq, &conn->c_recv_w, 1); + } } /* @@ -520,7 +564,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic) sge->addr = ic->i_ack_dma; sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; wr->sg_list = sge; wr->num_sge = 1; @@ -552,8 +596,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic) * wr_id and avoids working with the ring in that case. 
*/ #ifndef KERNEL_HAS_ATOMIC64 -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, - int ack_required) +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) { unsigned long flags; @@ -578,8 +621,7 @@ static u64 rds_ib_get_ack(struct rds_ib_connection *ic) return seq; } #else -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, - int ack_required) +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { @@ -786,20 +828,6 @@ static void rds_ib_cong_recv(struct rds_connection *conn, rds_cong_map_updated(map, uncongested); } -/* - * Rings are posted with all the allocations they'll need to queue the - * incoming message to the receiving socket so this can't fail. - * All fragments start with a header, so we can make sure we're not receiving - * garbage, and we can tell a small 8 byte fragment from an ACK frame. - */ -struct rds_ib_ack_state { - u64 ack_next; - u64 ack_recv; - unsigned int ack_required:1; - unsigned int ack_next_valid:1; - unsigned int ack_recv_valid:1; -}; - static void rds_ib_process_recv(struct rds_connection *conn, struct rds_ib_recv_work *recv, u32 data_len, struct rds_ib_ack_state *state) @@ -925,89 +953,50 @@ static void rds_ib_process_recv(struct rds_connection *conn, } } -/* - * Plucking the oldest entry from the ring can be done concurrently with - * the thread refilling the ring. Each ring operation is protected by - * spinlocks and the transient state of refilling doesn't change the - * recording of which entry is oldest. - * - * This relies on IB only calling one cq comp_handler for each cq so that - * there will only be one caller of rds_recv_incoming() per RDS connection. 
- */ -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rds_connection *conn = context; - struct rds_ib_connection *ic = conn->c_transport_data; - - rdsdebug("conn %p cq %p\n", conn, cq); - - rds_ib_stats_inc(s_ib_rx_cq_call); - - tasklet_schedule(&ic->i_recv_tasklet); -} - -static inline void rds_poll_cq(struct rds_ib_connection *ic, - struct rds_ib_ack_state *state) +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, + struct ib_wc *wc, + struct rds_ib_ack_state *state) { struct rds_connection *conn = ic->conn; - struct ib_wc wc; struct rds_ib_recv_work *recv; - while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, - rds_ib_wc_status_str(wc.status), wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_ib_stats_inc(s_ib_rx_cq_event); - - recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + ib_wc_status_msg(wc->status), wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); - ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); - - /* - * Also process recvs in connecting state because it is possible - * to get a recv completion _before_ the rdmacm ESTABLISHED - * event is processed. 
- */ - if (wc.status == IB_WC_SUCCESS) { - rds_ib_process_recv(conn, recv, wc.byte_len, state); - } else { - /* We expect errors as the qp is drained during shutdown */ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on %pI4 had " - "status %u (%s), disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status, - rds_ib_wc_status_str(wc.status)); - } + rds_ib_stats_inc(s_ib_rx_cq_event); + recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, + DMA_FROM_DEVICE); - /* - * It's very important that we only free this ring entry if we've truly - * freed the resources allocated to the entry. The refilling path can - * leak if we don't. - */ - rds_ib_ring_free(&ic->i_recv_ring, 1); + /* Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (wc->status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc->byte_len, state); + } else { + /* We expect errors as the qp is drained during shutdown */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) + rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, + wc->status, + ib_wc_status_msg(wc->status)); } -} - -void rds_ib_recv_tasklet_fn(unsigned long data) -{ - struct rds_ib_connection *ic = (struct rds_ib_connection *) data; - struct rds_connection *conn = ic->conn; - struct rds_ib_ack_state state = { 0, }; - - rds_poll_cq(ic, &state); - ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - rds_poll_cq(ic, &state); - if (state.ack_next_valid) - rds_ib_set_ack(ic, state.ack_next, state.ack_required); - if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { - rds_send_drop_acked(conn, state.ack_recv, NULL); - ic->i_ack_recv = state.ack_recv; + /* rds_ib_process_recv() doesn't always consume the frag, and + * we might not have 
called it at all if the wc didn't indicate + * success. We already unmapped the frag's pages, though, and + * the following rds_ib_ring_free() call tells the refill path + * that it will not find an allocated frag here. Make sure we + * keep that promise by freeing a frag that's still on the ring. + */ + if (recv->r_frag) { + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; } - if (rds_conn_up(conn)) - rds_ib_attempt_ack(ic); + rds_ib_ring_free(&ic->i_recv_ring, 1); /* If we ever end up with a really empty receive ring, we're * in deep trouble, as the sender will definitely see RNR @@ -1016,7 +1005,7 @@ void rds_ib_recv_tasklet_fn(unsigned long data) rds_ib_stats_inc(s_ib_rx_ring_empty); if (rds_ib_ring_low(&ic->i_recv_ring)) - rds_ib_recv_refill(conn, 0); + rds_ib_recv_refill(conn, 0, GFP_NOWAIT); } int rds_ib_recv(struct rds_connection *conn) @@ -1025,8 +1014,10 @@ int rds_ib_recv(struct rds_connection *conn) int ret = 0; rdsdebug("conn %p\n", conn); - if (rds_conn_up(conn)) + if (rds_conn_up(conn)) { rds_ib_attempt_ack(ic); + rds_ib_recv_refill(conn, 0, GFP_KERNEL); + } return ret; } @@ -1049,9 +1040,10 @@ int rds_ib_recv_init(void) rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", sizeof(struct rds_page_frag), 0, SLAB_HWCACHE_ALIGN, NULL); - if (!rds_ib_frag_slab) + if (!rds_ib_frag_slab) { kmem_cache_destroy(rds_ib_incoming_slab); - else + rds_ib_incoming_slab = NULL; + } else ret = 0; out: return ret; diff --git a/kernel/net/rds/ib_send.c b/kernel/net/rds/ib_send.c index bd3825d38..eac30bf48 100644 --- a/kernel/net/rds/ib_send.c +++ b/kernel/net/rds/ib_send.c @@ -39,40 +39,6 @@ #include "rds.h" #include "ib.h" -static char *rds_ib_wc_status_strings[] = { -#define RDS_IB_WC_STATUS_STR(foo) \ - [IB_WC_##foo] = __stringify(IB_WC_##foo) - RDS_IB_WC_STATUS_STR(SUCCESS), - RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), - RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), - RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), - RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), - 
RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), - RDS_IB_WC_STATUS_STR(MW_BIND_ERR), - RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), - RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), - RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), - RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), - RDS_IB_WC_STATUS_STR(REM_OP_ERR), - RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), - RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), - RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), - RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), - RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), - RDS_IB_WC_STATUS_STR(INV_EECN_ERR), - RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), - RDS_IB_WC_STATUS_STR(FATAL_ERR), - RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), - RDS_IB_WC_STATUS_STR(GENERAL_ERR), -#undef RDS_IB_WC_STATUS_STR -}; - -char *rds_ib_wc_status_str(enum ib_wc_status status) -{ - return rds_str_array(rds_ib_wc_status_strings, - ARRAY_SIZE(rds_ib_wc_status_strings), status); -} - /* * Convert IB-specific error message to RDS error message and call core * completion handler. @@ -229,16 +195,16 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_op = NULL; - send->s_wr.wr_id = i; + send->s_wr.wr_id = i | RDS_IB_SEND_OP; send->s_wr.sg_list = send->s_sge; send->s_wr.ex.imm_data = 0; sge = &send->s_sge[0]; sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; - send->s_sge[1].lkey = ic->i_mr->lkey; + send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; } } @@ -271,81 +237,73 @@ static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) * unallocs the next free entry in the ring it doesn't alter which is * the next to be freed, which is what this is concerned with. 
*/ -void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) { - struct rds_connection *conn = context; - struct rds_ib_connection *ic = conn->c_transport_data; struct rds_message *rm = NULL; - struct ib_wc wc; + struct rds_connection *conn = ic->conn; struct rds_ib_send_work *send; u32 completed; u32 oldest; u32 i = 0; - int ret; int nr_sig = 0; - rdsdebug("cq %p conn %p\n", cq, conn); - rds_ib_stats_inc(s_ib_tx_cq_call); - ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (ret) - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - - while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, - rds_ib_wc_status_str(wc.status), wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_ib_stats_inc(s_ib_tx_cq_event); - - if (wc.wr_id == RDS_IB_ACK_WR_ID) { - if (time_after(jiffies, ic->i_ack_queued + HZ/2)) - rds_ib_stats_inc(s_ib_tx_stalled); - rds_ib_ack_send_complete(ic); - continue; - } - oldest = rds_ib_ring_oldest(&ic->i_send_ring); + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + ib_wc_status_msg(wc->status), wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); + rds_ib_stats_inc(s_ib_tx_cq_event); - completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + if (wc->wr_id == RDS_IB_ACK_WR_ID) { + if (time_after(jiffies, ic->i_ack_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + return; + } - for (i = 0; i < completed; i++) { - send = &ic->i_sends[oldest]; - if (send->s_wr.send_flags & IB_SEND_SIGNALED) - nr_sig++; + oldest = rds_ib_ring_oldest(&ic->i_send_ring); - rm = rds_ib_send_unmap_op(ic, send, wc.status); + completed = rds_ib_ring_completed(&ic->i_send_ring, + (wc->wr_id & ~RDS_IB_SEND_OP), + oldest); - if (time_after(jiffies, send->s_queued + HZ/2)) - 
rds_ib_stats_inc(s_ib_tx_stalled); + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; - if (send->s_op) { - if (send->s_op == rm->m_final_op) { - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); - } - rds_message_put(rm); - send->s_op = NULL; - } + rm = rds_ib_send_unmap_op(ic, send, wc->status); - oldest = (oldest + 1) % ic->i_send_ring.w_nr; - } + if (time_after(jiffies, send->s_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); - rds_ib_ring_free(&ic->i_send_ring, completed); - rds_ib_sub_signaled(ic, nr_sig); - nr_sig = 0; - - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - /* We expect errors as the qp is drained during shutdown */ - if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on %pI4 had status " - "%u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, wc.status, - rds_ib_wc_status_str(wc.status)); + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get + * flushed out, wake them up now + */ + rds_message_unmapped(rm); + } + rds_message_put(rm); + send->s_op = NULL; } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc->status, + ib_wc_status_msg(wc->status)); } } @@ -605,6 +563,8 @@ int rds_ib_xmit(struct rds_connection 
*conn, struct rds_message *rm, } rds_message_addref(rm); + rm->data.op_dmasg = 0; + rm->data.op_dmaoff = 0; ic->i_data_op = &rm->data; /* Finalize the header */ @@ -658,7 +618,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &ic->i_data_op->op_sg[sg]; + scat = &ic->i_data_op->op_sg[rm->data.op_dmasg]; i = 0; do { unsigned int len = 0; @@ -680,17 +640,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* Set up the data, if present */ if (i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]) { - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + len = min(RDS_FRAG_SIZE, + ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); send->s_wr.num_sge = 2; - send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send->s_sge[1].addr = ib_sg_dma_address(dev, scat); + send->s_sge[1].addr += rm->data.op_dmaoff; send->s_sge[1].length = len; bytes_sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { + rm->data.op_dmaoff += len; + if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { scat++; - off = 0; + rm->data.op_dmasg++; + rm->data.op_dmaoff = 0; } } @@ -738,6 +701,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_op = ic->i_data_op; prev->s_wr.send_flags |= IB_SEND_SOLICITED; + if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + prev->s_wr.send_flags |= IB_SEND_SIGNALED; + nr_sig++; + } ic->i_data_op = NULL; } @@ -809,23 +777,23 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send->s_queued = jiffies; if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { - send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; - send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; - send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; - send->s_wr.wr.atomic.compare_add_mask = 
op->op_m_cswp.compare_mask; - send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; + send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + send->s_atomic_wr.compare_add = op->op_m_cswp.compare; + send->s_atomic_wr.swap = op->op_m_cswp.swap; + send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask; + send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask; } else { /* FADD */ - send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; - send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; - send->s_wr.wr.atomic.swap = 0; - send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; - send->s_wr.wr.atomic.swap_mask = 0; + send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + send->s_atomic_wr.compare_add = op->op_m_fadd.add; + send->s_atomic_wr.swap = 0; + send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask; + send->s_atomic_wr.swap_mask = 0; } nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); - send->s_wr.num_sge = 1; - send->s_wr.next = NULL; - send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; - send->s_wr.wr.atomic.rkey = op->op_rkey; + send->s_atomic_wr.wr.num_sge = 1; + send->s_atomic_wr.wr.next = NULL; + send->s_atomic_wr.remote_addr = op->op_remote_addr; + send->s_atomic_wr.rkey = op->op_rkey; send->s_op = op; rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); @@ -842,7 +810,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) /* Convert our struct scatterlist to struct ib_sge */ send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); - send->s_sge[0].lkey = ic->i_mr->lkey; + send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, send->s_sge[0].addr, send->s_sge[0].length); @@ -850,11 +818,11 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) if (nr_sig) atomic_add(nr_sig, 
&ic->i_signaled_sends); - failed_wr = &send->s_wr; - ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); + failed_wr = &send->s_atomic_wr.wr; + ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr); rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, - send, &send->s_wr, ret, failed_wr); - BUG_ON(failed_wr != &send->s_wr); + send, &send->s_atomic_wr, ret, failed_wr); + BUG_ON(failed_wr != &send->s_atomic_wr.wr); if (ret) { printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); @@ -863,9 +831,9 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) goto out; } - if (unlikely(failed_wr != &send->s_wr)) { + if (unlikely(failed_wr != &send->s_atomic_wr.wr)) { printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); - BUG_ON(failed_wr != &send->s_wr); + BUG_ON(failed_wr != &send->s_atomic_wr.wr); } out: @@ -936,27 +904,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_wr.opcode = op->op_write ? 
IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; - send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->op_rkey; + send->s_rdma_wr.remote_addr = remote_addr; + send->s_rdma_wr.rkey = op->op_rkey; if (num_sge > max_sge) { - send->s_wr.num_sge = max_sge; + send->s_rdma_wr.wr.num_sge = max_sge; num_sge -= max_sge; } else { - send->s_wr.num_sge = num_sge; + send->s_rdma_wr.wr.num_sge = num_sge; } - send->s_wr.next = NULL; + send->s_rdma_wr.wr.next = NULL; if (prev) - prev->s_wr.next = &send->s_wr; + prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { + for (j = 0; j < send->s_rdma_wr.wr.num_sge && + scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); send->s_sge[j].length = len; - send->s_sge[j].lkey = ic->i_mr->lkey; + send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; sent += len; rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); @@ -966,7 +935,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) } rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + &send->s_rdma_wr.wr, + send->s_rdma_wr.wr.num_sge, + send->s_rdma_wr.wr.next); prev = send; if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) @@ -987,11 +958,11 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) if (nr_sig) atomic_add(nr_sig, &ic->i_signaled_sends); - failed_wr = &first->s_wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + failed_wr = &first->s_rdma_wr.wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr); rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_wr); + first, &first->s_rdma_wr.wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_rdma_wr.wr); if (ret) { 
printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); @@ -1000,9 +971,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) goto out; } - if (unlikely(failed_wr != &first->s_wr)) { + if (unlikely(failed_wr != &first->s_rdma_wr.wr)) { printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret); - BUG_ON(failed_wr != &first->s_wr); + BUG_ON(failed_wr != &first->s_rdma_wr.wr); } diff --git a/kernel/net/rds/ib_stats.c b/kernel/net/rds/ib_stats.c index 2d5965d6e..d77e04473 100644 --- a/kernel/net/rds/ib_stats.c +++ b/kernel/net/rds/ib_stats.c @@ -42,14 +42,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", - "ib_tx_cq_call", + "s_ib_evt_handler_call", + "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", "ib_tx_throttle", "ib_tx_sg_mapping_failure", "ib_tx_stalled", "ib_tx_credit_updates", - "ib_rx_cq_call", "ib_rx_cq_event", "ib_rx_ring_empty", "ib_rx_refill_from_cq", @@ -61,12 +61,18 @@ static const char *const rds_ib_stat_names[] = { "ib_ack_send_delayed", "ib_ack_send_piggybacked", "ib_ack_received", - "ib_rdma_mr_alloc", - "ib_rdma_mr_free", - "ib_rdma_mr_used", - "ib_rdma_mr_pool_flush", - "ib_rdma_mr_pool_wait", - "ib_rdma_mr_pool_depleted", + "ib_rdma_mr_8k_alloc", + "ib_rdma_mr_8k_free", + "ib_rdma_mr_8k_used", + "ib_rdma_mr_8k_pool_flush", + "ib_rdma_mr_8k_pool_wait", + "ib_rdma_mr_8k_pool_depleted", + "ib_rdma_mr_1m_alloc", + "ib_rdma_mr_1m_free", + "ib_rdma_mr_1m_used", + "ib_rdma_mr_1m_pool_flush", + "ib_rdma_mr_1m_pool_wait", + "ib_rdma_mr_1m_pool_depleted", "ib_atomic_cswp", "ib_atomic_fadd", }; diff --git a/kernel/net/rds/iw.c b/kernel/net/rds/iw.c index 589935661..576f1825f 100644 --- a/kernel/net/rds/iw.c +++ b/kernel/net/rds/iw.c @@ -125,12 +125,11 @@ free_attr: kfree(dev_attr); } -static void rds_iw_remove_one(struct ib_device *device) 
+static void rds_iw_remove_one(struct ib_device *device, void *client_data) { - struct rds_iw_device *rds_iwdev; + struct rds_iw_device *rds_iwdev = client_data; struct rds_iw_cm_id *i_cm_id, *next; - rds_iwdev = ib_get_client_data(device, &rds_iw_client); if (!rds_iwdev) return; @@ -149,10 +148,7 @@ static void rds_iw_remove_one(struct ib_device *device) if (rds_iwdev->mr) ib_dereg_mr(rds_iwdev->mr); - while (ib_dealloc_pd(rds_iwdev->pd)) { - rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd); - msleep(1); - } + ib_dealloc_pd(rds_iwdev->pd); list_del(&rds_iwdev->list); kfree(rds_iwdev); @@ -218,7 +214,7 @@ static void rds_iw_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". */ -static int rds_iw_laddr_check(__be32 addr) +static int rds_iw_laddr_check(struct net *net, __be32 addr) { int ret; struct rdma_cm_id *cm_id; @@ -227,7 +223,7 @@ static int rds_iw_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. 
*/ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); diff --git a/kernel/net/rds/iw.h b/kernel/net/rds/iw.h index cbe6674e3..5af01d175 100644 --- a/kernel/net/rds/iw.h +++ b/kernel/net/rds/iw.h @@ -74,10 +74,13 @@ struct rds_iw_send_work { struct rm_rdma_op *s_op; struct rds_iw_mapping *s_mapping; struct ib_mr *s_mr; - struct ib_fast_reg_page_list *s_page_list; unsigned char s_remap_count; - struct ib_send_wr s_wr; + union { + struct ib_send_wr s_send_wr; + struct ib_rdma_wr s_rdma_wr; + struct ib_reg_wr s_reg_wr; + }; struct ib_sge s_sge[RDS_IW_MAX_SGE]; unsigned long s_queued; }; @@ -195,7 +198,7 @@ struct rds_iw_device { /* Magic WR_ID for ACKs */ #define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) -#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL) +#define RDS_IW_REG_WR_ID ((u64)0xefefefefefefefefULL) #define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) struct rds_iw_statistics { diff --git a/kernel/net/rds/iw_cm.c b/kernel/net/rds/iw_cm.c index a6c2bea9f..aea4c911b 100644 --- a/kernel/net/rds/iw_cm.c +++ b/kernel/net/rds/iw_cm.c @@ -179,6 +179,7 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, void *context) { struct ib_device *dev = rds_iwdev->dev; + struct ib_cq_init_attr cq_attr = {}; unsigned int send_size, recv_size; int ret; @@ -198,9 +199,10 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, attr->sq_sig_type = IB_SIGNAL_REQ_WR; attr->qp_type = IB_QPT_RC; + cq_attr.cqe = send_size; attr->send_cq = ib_create_cq(dev, send_cq_handler, rds_iw_cq_event_handler, - context, send_size, 0); + context, &cq_attr); if (IS_ERR(attr->send_cq)) { ret = PTR_ERR(attr->send_cq); attr->send_cq = NULL; @@ -208,9 +210,10 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, goto out; } + cq_attr.cqe = recv_size; attr->recv_cq = ib_create_cq(dev, recv_cq_handler, rds_iw_cq_event_handler, - 
context, recv_size, 0); + context, &cq_attr); if (IS_ERR(attr->recv_cq)) { ret = PTR_ERR(attr->recv_cq); attr->recv_cq = NULL; @@ -395,8 +398,9 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, &dp->dp_saddr, &dp->dp_daddr, RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); - conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, - GFP_KERNEL); + /* RDS/IW is not currently netns aware, thus init_net */ + conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, + &rds_iw_transport, GFP_KERNEL); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; @@ -520,7 +524,7 @@ int rds_iw_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ - ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); diff --git a/kernel/net/rds/iw_rdma.c b/kernel/net/rds/iw_rdma.c index dba8d0864..b09a40c1a 100644 --- a/kernel/net/rds/iw_rdma.c +++ b/kernel/net/rds/iw_rdma.c @@ -47,7 +47,6 @@ struct rds_iw_mr { struct rdma_cm_id *cm_id; struct ib_mr *mr; - struct ib_fast_reg_page_list *page_list; struct rds_iw_mapping mapping; unsigned char remap_count; @@ -75,10 +74,10 @@ struct rds_iw_mr_pool { int max_pages; }; -static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all); +static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all); static void rds_iw_mr_pool_flush_worker(struct work_struct *work); -static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); -static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, +static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); +static int rds_iw_map_reg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr, struct scatterlist *sg, unsigned int nents); 
static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); @@ -258,19 +257,18 @@ static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, sg->bytes = 0; } -static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, - struct rds_iw_scatterlist *sg) +static int rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, + struct rds_iw_scatterlist *sg) { struct ib_device *dev = rds_iwdev->dev; - u64 *dma_pages = NULL; - int i, j, ret; + int i, ret; WARN_ON(sg->dma_len); sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL); if (unlikely(!sg->dma_len)) { printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n"); - return ERR_PTR(-EBUSY); + return -EBUSY; } sg->bytes = 0; @@ -303,31 +301,14 @@ static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, if (sg->dma_npages > fastreg_message_size) goto out_unmap; - dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC); - if (!dma_pages) { - ret = -ENOMEM; - goto out_unmap; - } - - for (i = j = 0; i < sg->dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); - u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); - u64 end_addr; - end_addr = dma_addr + dma_len; - dma_addr &= ~PAGE_MASK; - for (; dma_addr < end_addr; dma_addr += PAGE_SIZE) - dma_pages[j++] = dma_addr; - BUG_ON(j > sg->dma_npages); - } - return dma_pages; + return 0; out_unmap: ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL); sg->dma_len = 0; - kfree(dma_pages); - return ERR_PTR(ret); + return ret; } @@ -440,7 +421,7 @@ static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev) INIT_LIST_HEAD(&ibmr->mapping.m_list); ibmr->mapping.m_mr = ibmr; - err = rds_iw_init_fastreg(pool, ibmr); + err = rds_iw_init_reg(pool, ibmr); if (err) goto out_no_cigar; @@ -479,14 +460,13 @@ void rds_iw_sync_mr(void *trans_private, int direction) * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to 
this limit. */ -static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) +static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) { struct rds_iw_mr *ibmr, *next; LIST_HEAD(unmap_list); LIST_HEAD(kill_list); unsigned long flags; unsigned int nfreed = 0, ncleaned = 0, unpinned = 0; - int ret = 0; rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); @@ -538,7 +518,6 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) atomic_sub(nfreed, &pool->item_count); mutex_unlock(&pool->flush_lock); - return ret; } static void rds_iw_mr_pool_flush_worker(struct work_struct *work) @@ -622,7 +601,7 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, ibmr->cm_id = cm_id; ibmr->device = rds_iwdev; - ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents); + ret = rds_iw_map_reg(rds_iwdev->mr_pool, ibmr, sg, nents); if (ret == 0) *key_ret = ibmr->mr->rkey; else @@ -638,7 +617,7 @@ out: } /* - * iWARP fastreg handling + * iWARP reg handling * * The life cycle of a fastreg registration is a bit different from * FMRs. @@ -650,7 +629,7 @@ out: * This creates a bit of a problem for us, as we do not have the destination * IP in GET_MR, so the connection must be setup prior to the GET_MR call for * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit - * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request + * will try to queue a LOCAL_INV (if needed) and a REG_MR work request * before queuing the SEND. When completions for these arrive, they are * dispatched to the MR has a bit set showing that RDMa can be performed. * @@ -659,71 +638,60 @@ out: * The expectation there is that this invalidation step includes ALL * PREVIOUSLY FREED MRs. 
*/ -static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) +static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) { struct rds_iw_device *rds_iwdev = pool->device; - struct ib_fast_reg_page_list *page_list = NULL; struct ib_mr *mr; int err; - mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size); + mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG, + pool->max_message_size); if (IS_ERR(mr)) { err = PTR_ERR(mr); - printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err); - return err; - } - - /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages - * is not filled in. - */ - page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size); - if (IS_ERR(page_list)) { - err = PTR_ERR(page_list); - - printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err); - ib_dereg_mr(mr); + printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err); return err; } - ibmr->page_list = page_list; ibmr->mr = mr; return 0; } -static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping) +static int rds_iw_rdma_reg_mr(struct rds_iw_mapping *mapping) { struct rds_iw_mr *ibmr = mapping->m_mr; - struct ib_send_wr f_wr, *failed_wr; - int ret; + struct rds_iw_scatterlist *m_sg = &mapping->m_sg; + struct ib_reg_wr reg_wr; + struct ib_send_wr *failed_wr; + int ret, n; + + n = ib_map_mr_sg_zbva(ibmr->mr, m_sg->list, m_sg->len, PAGE_SIZE); + if (unlikely(n != m_sg->len)) + return n < 0 ? n : -EINVAL; + + reg_wr.wr.next = NULL; + reg_wr.wr.opcode = IB_WR_REG_MR; + reg_wr.wr.wr_id = RDS_IW_REG_WR_ID; + reg_wr.wr.num_sge = 0; + reg_wr.mr = ibmr->mr; + reg_wr.key = mapping->m_rkey; + reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; /* - * Perform a WR for the fast_reg_mr. Each individual page + * Perform a WR for the reg_mr. 
Each individual page * in the sg list is added to the fast reg page list and placed - * inside the fast_reg_mr WR. The key used is a rolling 8bit + * inside the reg_mr WR. The key used is a rolling 8bit * counter, which should guarantee uniqueness. */ ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++); mapping->m_rkey = ibmr->mr->rkey; - memset(&f_wr, 0, sizeof(f_wr)); - f_wr.wr_id = RDS_IW_FAST_REG_WR_ID; - f_wr.opcode = IB_WR_FAST_REG_MR; - f_wr.wr.fast_reg.length = mapping->m_sg.bytes; - f_wr.wr.fast_reg.rkey = mapping->m_rkey; - f_wr.wr.fast_reg.page_list = ibmr->page_list; - f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len; - f_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE; - f_wr.wr.fast_reg.iova_start = 0; - f_wr.send_flags = IB_SEND_SIGNALED; - - failed_wr = &f_wr; - ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr); - BUG_ON(failed_wr != &f_wr); + failed_wr = &reg_wr.wr; + ret = ib_post_send(ibmr->cm_id->qp, &reg_wr.wr, &failed_wr); + BUG_ON(failed_wr != &reg_wr.wr); if (ret) printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", __func__, __LINE__, ret); @@ -755,21 +723,20 @@ out: return ret; } -static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr, - struct scatterlist *sg, - unsigned int sg_len) +static int rds_iw_map_reg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr, + struct scatterlist *sg, + unsigned int sg_len) { struct rds_iw_device *rds_iwdev = pool->device; struct rds_iw_mapping *mapping = &ibmr->mapping; u64 *dma_pages; - int i, ret = 0; + int ret = 0; rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len); - dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg); - if (IS_ERR(dma_pages)) { - ret = PTR_ERR(dma_pages); + ret = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg); + if (ret) { dma_pages = NULL; goto out; } @@ -779,10 +746,7 @@ static int rds_iw_map_fastreg(struct 
rds_iw_mr_pool *pool, goto out; } - for (i = 0; i < mapping->m_sg.dma_npages; ++i) - ibmr->page_list->page_list[i] = dma_pages[i]; - - ret = rds_iw_rdma_build_fastreg(mapping); + ret = rds_iw_rdma_reg_mr(mapping); if (ret) goto out; @@ -868,8 +832,6 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr) { - if (ibmr->page_list) - ib_free_fast_reg_page_list(ibmr->page_list); if (ibmr->mr) ib_dereg_mr(ibmr->mr); } diff --git a/kernel/net/rds/iw_send.c b/kernel/net/rds/iw_send.c index 13834780a..e20bd503f 100644 --- a/kernel/net/rds/iw_send.c +++ b/kernel/net/rds/iw_send.c @@ -137,13 +137,13 @@ void rds_iw_send_init_ring(struct rds_iw_connection *ic) send->s_op = NULL; send->s_mapping = NULL; - send->s_wr.next = NULL; - send->s_wr.wr_id = i; - send->s_wr.sg_list = send->s_sge; - send->s_wr.num_sge = 1; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.send_flags = 0; - send->s_wr.ex.imm_data = 0; + send->s_send_wr.next = NULL; + send->s_send_wr.wr_id = i; + send->s_send_wr.sg_list = send->s_sge; + send->s_send_wr.num_sge = 1; + send->s_send_wr.opcode = IB_WR_SEND; + send->s_send_wr.send_flags = 0; + send->s_send_wr.ex.imm_data = 0; sge = rds_iw_data_sge(ic, send->s_sge); sge->lkey = 0; @@ -153,16 +153,10 @@ void rds_iw_send_init_ring(struct rds_iw_connection *ic) sge->length = sizeof(struct rds_header); sge->lkey = 0; - send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size); + send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG, + fastreg_message_size); if (IS_ERR(send->s_mr)) { - printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n"); - break; - } - - send->s_page_list = ib_alloc_fast_reg_page_list( - ic->i_cm_id->device, fastreg_message_size); - if (IS_ERR(send->s_page_list)) { - printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n"); + printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n"); break; } } @@ -176,9 +170,7 @@ void 
rds_iw_send_clear_ring(struct rds_iw_connection *ic) for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { BUG_ON(!send->s_mr); ib_dereg_mr(send->s_mr); - BUG_ON(!send->s_page_list); - ib_free_fast_reg_page_list(send->s_page_list); - if (send->s_wr.opcode == 0xdead) + if (send->s_send_wr.opcode == 0xdead) continue; if (send->s_rm) rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); @@ -226,7 +218,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) continue; } - if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) { + if (wc.opcode == IB_WC_REG_MR && wc.wr_id == RDS_IW_REG_WR_ID) { ic->i_fastreg_posted = 1; continue; } @@ -246,12 +238,12 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) send = &ic->i_sends[oldest]; /* In the error case, wc.opcode sometimes contains garbage */ - switch (send->s_wr.opcode) { + switch (send->s_send_wr.opcode) { case IB_WR_SEND: if (send->s_rm) rds_iw_send_unmap_rm(ic, send, wc.status); break; - case IB_WR_FAST_REG_MR: + case IB_WR_REG_MR: case IB_WR_RDMA_WRITE: case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: @@ -261,12 +253,12 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) default: printk_ratelimited(KERN_NOTICE "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", - __func__, send->s_wr.opcode); + __func__, send->s_send_wr.opcode); break; } - send->s_wr.opcode = 0xdead; - send->s_wr.num_sge = 1; + send->s_send_wr.opcode = 0xdead; + send->s_send_wr.num_sge = 1; if (time_after(jiffies, send->s_queued + HZ/2)) rds_iw_stats_inc(s_iw_tx_stalled); @@ -454,10 +446,10 @@ rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, WARN_ON(pos != send - ic->i_sends); - send->s_wr.send_flags = send_flags; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.num_sge = 2; - send->s_wr.next = NULL; + send->s_send_wr.send_flags = send_flags; + send->s_send_wr.opcode = IB_WR_SEND; + send->s_send_wr.num_sge = 2; + send->s_send_wr.next = NULL; send->s_queued = jiffies; 
send->s_op = NULL; @@ -471,7 +463,7 @@ rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, } else { /* We're sending a packet with no payload. There is only * one SGE */ - send->s_wr.num_sge = 1; + send->s_send_wr.num_sge = 1; sge = &send->s_sge[0]; } @@ -581,6 +573,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; rds_message_addref(rm); + rm->data.op_dmasg = 0; + rm->data.op_dmaoff = 0; ic->i_rm = rm; /* Finalize the header */ @@ -622,7 +616,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->data.op_sg[sg]; + scat = &rm->data.op_sg[rm->data.op_dmasg]; sent = 0; i = 0; @@ -656,10 +650,11 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + len = min(RDS_FRAG_SIZE, + ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); rds_iw_xmit_populate_wr(ic, send, pos, - ib_sg_dma_address(dev, scat) + off, len, - send_flags); + ib_sg_dma_address(dev, scat) + rm->data.op_dmaoff, len, + send_flags); /* * We want to delay signaling completions just enough to get @@ -668,29 +663,30 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, */ if (ic->i_unsignaled_wrs-- == 0) { ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; } ic->i_unsignaled_bytes -= len; if (ic->i_unsignaled_bytes <= 0) { ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; } /* * Always signal the last one if we're stopping due to flow control. 
*/ if (flow_controlled && i == (work_alloc-1)) - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + &send->s_send_wr, send->s_send_wr.num_sge, send->s_send_wr.next); sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { + rm->data.op_dmaoff += len; + if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { scat++; - off = 0; + rm->data.op_dmaoff = 0; + rm->data.op_dmasg++; } add_header: @@ -717,7 +713,7 @@ add_header: } if (prev) - prev->s_wr.next = &send->s_wr; + prev->s_send_wr.next = &send->s_send_wr; prev = send; pos = (pos + 1) % ic->i_send_ring.w_nr; @@ -731,7 +727,7 @@ add_header: /* if we finished the message then send completion owns it */ if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_rm = ic->i_rm; - prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + prev->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; ic->i_rm = NULL; } @@ -743,11 +739,11 @@ add_header: rds_iw_send_add_credits(conn, credit_alloc - i); /* XXX need to worry about failed_wr and partial sends. 
*/ - failed_wr = &first->s_wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + failed_wr = &first->s_send_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_send_wr, &failed_wr); rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_wr); + first, &first->s_send_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_send_wr); if (ret) { printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); @@ -765,24 +761,26 @@ out: return ret; } -static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr) +static int rds_iw_build_send_reg(struct rds_iw_send_work *send, + struct scatterlist *sg, + int sg_nents) { - BUG_ON(nent > send->s_page_list->max_page_list_len); - /* - * Perform a WR for the fast_reg_mr. Each individual page - * in the sg list is added to the fast reg page list and placed - * inside the fast_reg_mr WR. - */ - send->s_wr.opcode = IB_WR_FAST_REG_MR; - send->s_wr.wr.fast_reg.length = len; - send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey; - send->s_wr.wr.fast_reg.page_list = send->s_page_list; - send->s_wr.wr.fast_reg.page_list_len = nent; - send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE; - send->s_wr.wr.fast_reg.iova_start = sg_addr; + int n; + + n = ib_map_mr_sg(send->s_mr, sg, sg_nents, PAGE_SIZE); + if (unlikely(n != sg_nents)) + return n < 0 ? 
n : -EINVAL; + + send->s_reg_wr.wr.opcode = IB_WR_REG_MR; + send->s_reg_wr.wr.wr_id = 0; + send->s_reg_wr.wr.num_sge = 0; + send->s_reg_wr.mr = send->s_mr; + send->s_reg_wr.key = send->s_mr->rkey; + send->s_reg_wr.access = IB_ACCESS_REMOTE_WRITE; ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); + + return 0; } int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) @@ -803,6 +801,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) int sent; int ret; int num_sge; + int sg_nents; rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); @@ -856,9 +855,10 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) scat = &op->op_sg[0]; sent = 0; num_sge = op->op_count; + sg_nents = 0; for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { - send->s_wr.send_flags = 0; + send->s_rdma_wr.wr.send_flags = 0; send->s_queued = jiffies; /* @@ -867,7 +867,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) */ if (ic->i_unsignaled_wrs-- == 0) { ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - send->s_wr.send_flags = IB_SEND_SIGNALED; + send->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED; } /* To avoid the need to have the plumbing to invalidate the fastreg_mr used @@ -875,30 +875,31 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. 
*/ if (op->op_write) - send->s_wr.opcode = IB_WR_RDMA_WRITE; + send->s_rdma_wr.wr.opcode = IB_WR_RDMA_WRITE; else - send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + send->s_rdma_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; - send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->op_rkey; + send->s_rdma_wr.remote_addr = remote_addr; + send->s_rdma_wr.rkey = op->op_rkey; send->s_op = op; if (num_sge > rds_iwdev->max_sge) { - send->s_wr.num_sge = rds_iwdev->max_sge; + send->s_rdma_wr.wr.num_sge = rds_iwdev->max_sge; num_sge -= rds_iwdev->max_sge; } else - send->s_wr.num_sge = num_sge; + send->s_rdma_wr.wr.num_sge = num_sge; - send->s_wr.next = NULL; + send->s_rdma_wr.wr.next = NULL; if (prev) - prev->s_wr.next = &send->s_wr; + prev->s_send_wr.next = &send->s_rdma_wr.wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { + for (j = 0; j < send->s_rdma_wr.wr.num_sge && + scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); - if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) - send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat); + if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) + sg_nents++; else { send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); send->s_sge[j].length = len; @@ -912,15 +913,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) scat++; } - if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) { - send->s_wr.num_sge = 1; + if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) { + send->s_rdma_wr.wr.num_sge = 1; send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr; send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes; send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey; } rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + &send->s_rdma_wr, + send->s_rdma_wr.wr.num_sge, + send->s_rdma_wr.wr.next); prev = send; if (++send 
== &ic->i_sends[ic->i_send_ring.w_nr]) @@ -929,7 +932,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) /* if we finished the message then send completion owns it */ if (scat == &op->op_sg[op->op_count]) - first->s_wr.send_flags = IB_SEND_SIGNALED; + first->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED; if (i < work_alloc) { rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); @@ -943,16 +946,20 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) * fastreg_mr (or possibly a dma_mr) */ if (!op->op_write) { - rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], - op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); + ret = rds_iw_build_send_reg(&ic->i_sends[fr_pos], + &op->op_sg[0], sg_nents); + if (ret) { + printk(KERN_WARNING "RDS/IW: failed to reg send mem\n"); + goto out; + } work_alloc++; } - failed_wr = &first->s_wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + failed_wr = &first->s_rdma_wr.wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr); rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_wr); + first, &first->s_rdma_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_rdma_wr.wr); if (ret) { printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); diff --git a/kernel/net/rds/rdma.c b/kernel/net/rds/rdma.c index 40084d843..4c93badea 100644 --- a/kernel/net/rds/rdma.c +++ b/kernel/net/rds/rdma.c @@ -435,9 +435,10 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) /* If the MR was marked as invalidate, this will * trigger an async flush. 
*/ - if (zot_me) + if (zot_me) { rds_destroy_mr(mr); - rds_mr_put(mr); + rds_mr_put(mr); + } } void rds_rdma_free_op(struct rm_rdma_op *ro) @@ -451,7 +452,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) * is the case for a RDMA_READ which copies from remote * to local memory */ if (!ro->op_write) { - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); } put_page(page); @@ -658,6 +659,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if (ret < 0) goto out; + else + ret = 0; rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); diff --git a/kernel/net/rds/rdma_transport.c b/kernel/net/rds/rdma_transport.c index 6cd9d1dea..9c1fed81b 100644 --- a/kernel/net/rds/rdma_transport.c +++ b/kernel/net/rds/rdma_transport.c @@ -34,37 +34,10 @@ #include #include "rdma_transport.h" +#include "ib.h" static struct rdma_cm_id *rds_rdma_listen_id; -static char *rds_cm_event_strings[] = { -#define RDS_CM_EVENT_STRING(foo) \ - [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) - RDS_CM_EVENT_STRING(ADDR_RESOLVED), - RDS_CM_EVENT_STRING(ADDR_ERROR), - RDS_CM_EVENT_STRING(ROUTE_RESOLVED), - RDS_CM_EVENT_STRING(ROUTE_ERROR), - RDS_CM_EVENT_STRING(CONNECT_REQUEST), - RDS_CM_EVENT_STRING(CONNECT_RESPONSE), - RDS_CM_EVENT_STRING(CONNECT_ERROR), - RDS_CM_EVENT_STRING(UNREACHABLE), - RDS_CM_EVENT_STRING(REJECTED), - RDS_CM_EVENT_STRING(ESTABLISHED), - RDS_CM_EVENT_STRING(DISCONNECTED), - RDS_CM_EVENT_STRING(DEVICE_REMOVAL), - RDS_CM_EVENT_STRING(MULTICAST_JOIN), - RDS_CM_EVENT_STRING(MULTICAST_ERROR), - RDS_CM_EVENT_STRING(ADDR_CHANGE), - RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), -#undef RDS_CM_EVENT_STRING -}; - -static char *rds_cm_event_str(enum rdma_cm_event_type type) -{ - return rds_str_array(rds_cm_event_strings, - ARRAY_SIZE(rds_cm_event_strings), type); -}; - int rds_rdma_cm_event_handler(struct rdma_cm_id 
*cm_id, struct rdma_cm_event *event) { @@ -74,7 +47,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, int ret = 0; rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, - event->event, rds_cm_event_str(event->event)); + event->event, rdma_event_msg(event->event)); if (cm_id->device->node_type == RDMA_NODE_RNIC) trans = &rds_iw_transport; @@ -110,8 +83,18 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ROUTE_RESOLVED: - /* XXX worry about racing with listen acceptance */ - ret = trans->cm_initiate_connect(cm_id); + /* Connection could have been dropped so make sure the + * cm_id is valid before proceeding + */ + if (conn) { + struct rds_ib_connection *ibic; + + ibic = conn->c_transport_data; + if (ibic && ibic->i_cm_id == cm_id) + ret = trans->cm_initiate_connect(cm_id); + else + rds_conn_drop(conn); + } break; case RDMA_CM_EVENT_ESTABLISHED: @@ -139,7 +122,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, default: /* things like device disconnect? 
*/ printk(KERN_ERR "RDS: unknown event %u (%s)!\n", - event->event, rds_cm_event_str(event->event)); + event->event, rdma_event_msg(event->event)); break; } @@ -148,7 +131,7 @@ out: mutex_unlock(&conn->c_cm_lock); rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, - rds_cm_event_str(event->event), ret); + rdma_event_msg(event->event), ret); return ret; } @@ -159,8 +142,8 @@ static int rds_rdma_listen_init(void) struct rdma_cm_id *cm_id; int ret; - cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, - IB_QPT_RC); + cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); printk(KERN_ERR "RDS/RDMA: failed to setup listener, " diff --git a/kernel/net/rds/rds.h b/kernel/net/rds/rds.h index 0d41155a2..0e2797bdc 100644 --- a/kernel/net/rds/rds.h +++ b/kernel/net/rds/rds.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "info.h" @@ -80,12 +81,15 @@ enum { #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 +#define RDS_RECV_REFILL 3 struct rds_connection { struct hlist_node c_hash_node; __be32 c_laddr; __be32 c_faddr; - unsigned int c_loopback:1; + unsigned int c_loopback:1, + c_outgoing:1, + c_pad_to_32:30; struct rds_connection *c_passive; struct rds_cong_map *c_lcong; @@ -128,8 +132,21 @@ struct rds_connection { /* Protocol version */ unsigned int c_version; + possible_net_t c_net; }; +static inline +struct net *rds_conn_net(struct rds_connection *conn) +{ + return read_pnet(&conn->c_net); +} + +static inline +void rds_conn_net_set(struct rds_connection *conn, struct net *net) +{ + write_pnet(&conn->c_net, net); +} + #define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_ACK_REQUIRED 0x02 #define RDS_FLAG_RETRANSMITTED 0x04 @@ -363,6 +380,8 @@ struct rds_message { unsigned int op_active:1; unsigned int op_nents; unsigned int op_count; + unsigned int op_dmasg; + unsigned int op_dmaoff; struct scatterlist *op_sg; } 
data; }; @@ -408,11 +427,6 @@ struct rds_notifier { * should try hard not to block. */ -#define RDS_TRANS_IB 0 -#define RDS_TRANS_IWARP 1 -#define RDS_TRANS_TCP 2 -#define RDS_TRANS_COUNT 3 - struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; @@ -420,7 +434,7 @@ struct rds_transport { unsigned int t_prefer_loopback:1; unsigned int t_type; - int (*laddr_check)(__be32 addr); + int (*laddr_check)(struct net *net, __be32 addr); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_connect)(struct rds_connection *conn); @@ -461,7 +475,8 @@ struct rds_sock { * bound_addr used for both incoming and outgoing, no INADDR_ANY * support. */ - struct hlist_node rs_bound_node; + struct rhash_head rs_bound_node; + u64 rs_bound_key; __be32 rs_bound_addr; __be32 rs_conn_addr; __be16 rs_bound_port; @@ -575,7 +590,6 @@ struct rds_statistics { }; /* af_rds.c */ -char *rds_str_array(char **array, size_t elements, size_t index); void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); @@ -593,6 +607,8 @@ extern wait_queue_head_t rds_poll_waitq; int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(__be32 addr, __be16 port); +int rds_bind_lock_init(void); +void rds_bind_lock_destroy(void); /* cong.c */ int rds_cong_get_maps(struct rds_connection *conn); @@ -612,9 +628,11 @@ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* conn.c */ int rds_conn_init(void); void rds_conn_exit(void); -struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); -struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create_outgoing(struct net *net, + __be32 laddr, 
__be32 faddr, struct rds_transport *trans, gfp_t gfp); void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); @@ -799,10 +817,11 @@ void rds_connect_complete(struct rds_connection *conn); /* transport.c */ int rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); -struct rds_transport *rds_trans_get_preferred(__be32 addr); +struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); +struct rds_transport *rds_trans_get(int t_type); int rds_trans_init(void); void rds_trans_exit(void); diff --git a/kernel/net/rds/send.c b/kernel/net/rds/send.c index e9430f537..c9cdb358e 100644 --- a/kernel/net/rds/send.c +++ b/kernel/net/rds/send.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "rds.h" @@ -51,7 +52,7 @@ * it to 0 will restore the old behavior (where we looped until we had * drained the queue). */ -static int send_batch_count = 64; +static int send_batch_count = SZ_1K; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); @@ -223,7 +224,7 @@ restart: * through a lot of messages, lets back off and see * if anyone else jumps in */ - if (batch_count >= 1024) + if (batch_count >= send_batch_count) goto over_batch; spin_lock_irqsave(&conn->c_lock, flags); @@ -282,26 +283,34 @@ restart: /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; + /* The transport owns the mapped memory for now. 
+ * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_rdma_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_atomic_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } /* @@ -411,15 +420,19 @@ over_batch: */ if (ret == 0) { smp_mb(); - if (!list_empty(&conn->c_send_queue) && + if ((test_bit(0, &conn->c_map_queued) || + !list_empty(&conn->c_send_queue)) && send_gen == conn->c_send_gen) { rds_stats_inc(s_send_lock_queue_raced); - goto restart; + if (batch_count < send_batch_count) + goto restart; + queue_delayed_work(rds_wq, &conn->c_send_w, 1); } } out: return ret; } +EXPORT_SYMBOL_GPL(rds_send_xmit); static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) { @@ -769,8 +782,22 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); list_del_init(&rm->m_sock_item); - rds_message_wait(rm); + + /* just in case the code above skipped this message + * because RDS_MSG_ON_CONN wasn't set, run it again here + * taking m_rs_lock is the only thing that keeps us + * from racing with ack processing. 
+ */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + rds_message_put(rm); } } @@ -986,11 +1013,18 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) release_sock(sk); } - /* racing with another thread binding seems ok here */ + lock_sock(sk); if (daddr == 0 || rs->rs_bound_addr == 0) { + release_sock(sk); ret = -ENOTCONN; /* XXX not a great errno */ goto out; } + release_sock(sk); + + if (payload_len > rds_sk_sndbuf(rs)) { + ret = -EMSGSIZE; + goto out; + } /* size of rm including all sgs */ ret = rds_rm_size(msg, payload_len); @@ -1023,7 +1057,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) conn = rs->rs_conn; else { - conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr, + conn = rds_conn_create_outgoing(sock_net(sock->sk), + rs->rs_bound_addr, daddr, rs->rs_transport, sock->sk->sk_allocation); if (IS_ERR(conn)) { @@ -1063,11 +1098,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); - /* XXX make sure this is reasonable */ - if (payload_len > rds_sk_sndbuf(rs)) { - ret = -EMSGSIZE; - goto out; - } + if (nonblock) { ret = -EAGAIN; goto out; @@ -1095,8 +1126,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) */ rds_stats_inc(s_send_queued); - if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_xmit(conn); + ret = rds_send_xmit(conn); + if (ret == -ENOMEM || ret == -EAGAIN) + queue_delayed_work(rds_wq, &conn->c_send_w, 1); rds_message_put(rm); return payload_len; @@ -1152,8 +1184,8 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - 
if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + /* schedule the send work on rds_wq */ + queue_delayed_work(rds_wq, &conn->c_send_w, 1); rds_message_put(rm); return 0; diff --git a/kernel/net/rds/tcp.c b/kernel/net/rds/tcp.c index edac9ef2b..9d6ddbacd 100644 --- a/kernel/net/rds/tcp.c +++ b/kernel/net/rds/tcp.c @@ -35,6 +35,9 @@ #include #include #include +#include +#include +#include #include "rds.h" #include "tcp.h" @@ -64,21 +67,13 @@ void rds_tcp_nonagle(struct socket *sock) set_fs(oldfs); } +/* All module specific customizations to the RDS-TCP socket should be done in + * rds_tcp_tune() and applied after socket creation. In general these + * customizations should be tunable via module_param() + */ void rds_tcp_tune(struct socket *sock) { - struct sock *sk = sock->sk; - rds_tcp_nonagle(sock); - - /* - * We're trying to saturate gigabit with the default, - * see svc_sock_setbufsize(). - */ - lock_sock(sk); - sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE; - sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE; - sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; - release_sock(sk); } u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) @@ -189,9 +184,9 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } -static int rds_tcp_laddr_check(__be32 addr) +static int rds_tcp_laddr_check(struct net *net, __be32 addr) { - if (inet_addr_type(&init_net, addr) == RTN_LOCAL) + if (inet_addr_type(net, addr) == RTN_LOCAL) return 0; return -EADDRNOTAVAIL; } @@ -250,16 +245,7 @@ static void rds_tcp_destroy_conns(void) } } -static void rds_tcp_exit(void) -{ - rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); - rds_tcp_listen_stop(); - rds_tcp_destroy_conns(); - rds_trans_unregister(&rds_tcp_transport); - rds_tcp_recv_exit(); - kmem_cache_destroy(rds_tcp_conn_slab); -} -module_exit(rds_tcp_exit); +static void rds_tcp_exit(void); struct rds_transport rds_tcp_transport = { .laddr_check = rds_tcp_laddr_check, @@ -281,6 
+267,136 @@ struct rds_transport rds_tcp_transport = { .t_prefer_loopback = 1, }; +static int rds_tcp_netid; + +/* per-network namespace private data for this module */ +struct rds_tcp_net { + struct socket *rds_tcp_listen_sock; + struct work_struct rds_tcp_accept_w; +}; + +static void rds_tcp_accept_worker(struct work_struct *work) +{ + struct rds_tcp_net *rtn = container_of(work, + struct rds_tcp_net, + rds_tcp_accept_w); + + while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0) + cond_resched(); +} + +void rds_tcp_accept_work(struct sock *sk) +{ + struct net *net = sock_net(sk); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + queue_work(rds_wq, &rtn->rds_tcp_accept_w); +} + +static __net_init int rds_tcp_init_net(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); + if (!rtn->rds_tcp_listen_sock) { + pr_warn("could not set up listen sock\n"); + return -EAFNOSUPPORT; + } + INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); + return 0; +} + +static void __net_exit rds_tcp_exit_net(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + /* If rds_tcp_exit_net() is called as a result of netns deletion, + * the rds_tcp_kill_sock() device notifier would already have cleaned + * up the listen socket, thus there is no work to do in this function. + * + * If rds_tcp_exit_net() is called as a result of module unload, + * i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then + * we do need to clean up the listen socket here. 
+ */ + if (rtn->rds_tcp_listen_sock) { + rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); + rtn->rds_tcp_listen_sock = NULL; + flush_work(&rtn->rds_tcp_accept_w); + } +} + +static struct pernet_operations rds_tcp_net_ops = { + .init = rds_tcp_init_net, + .exit = rds_tcp_exit_net, + .id = &rds_tcp_netid, + .size = sizeof(struct rds_tcp_net), +}; + +static void rds_tcp_kill_sock(struct net *net) +{ + struct rds_tcp_connection *tc, *_tc; + struct sock *sk; + LIST_HEAD(tmp_list); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); + rtn->rds_tcp_listen_sock = NULL; + flush_work(&rtn->rds_tcp_accept_w); + spin_lock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { + struct net *c_net = read_pnet(&tc->conn->c_net); + + if (net != c_net || !tc->t_sock) + continue; + list_move_tail(&tc->t_tcp_node, &tmp_list); + } + spin_unlock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { + sk = tc->t_sock->sk; + sk->sk_prot->disconnect(sk, 0); + tcp_done(sk); + if (tc->conn->c_passive) + rds_conn_destroy(tc->conn->c_passive); + rds_conn_destroy(tc->conn); + } +} + +static int rds_tcp_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + /* rds-tcp registers as a pernet subsys, so the ->exit will only + * get invoked after network activity has quiesced. We need to + * clean up all sockets to quiesce network activity, and use + * the unregistration of the per-net loopback device as a trigger + * to start that cleanup. 
+ */ + if (event == NETDEV_UNREGISTER_FINAL && + dev->ifindex == LOOPBACK_IFINDEX) + rds_tcp_kill_sock(dev_net(dev)); + + return NOTIFY_DONE; +} + +static struct notifier_block rds_tcp_dev_notifier = { + .notifier_call = rds_tcp_dev_event, + .priority = -10, /* must be called after other network notifiers */ +}; + +static void rds_tcp_exit(void) +{ + rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + unregister_pernet_subsys(&rds_tcp_net_ops); + if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) + pr_warn("could not unregister rds_tcp_dev_notifier\n"); + rds_tcp_destroy_conns(); + rds_trans_unregister(&rds_tcp_transport); + rds_tcp_recv_exit(); + kmem_cache_destroy(rds_tcp_conn_slab); +} +module_exit(rds_tcp_exit); + static int rds_tcp_init(void) { int ret; @@ -293,6 +409,16 @@ static int rds_tcp_init(void) goto out; } + ret = register_netdevice_notifier(&rds_tcp_dev_notifier); + if (ret) { + pr_warn("could not register rds_tcp_dev_notifier\n"); + goto out; + } + + ret = register_pernet_subsys(&rds_tcp_net_ops); + if (ret) + goto out_slab; + ret = rds_tcp_recv_init(); if (ret) goto out_slab; @@ -301,19 +427,14 @@ static int rds_tcp_init(void) if (ret) goto out_recv; - ret = rds_tcp_listen_init(); - if (ret) - goto out_register; - rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); goto out; -out_register: - rds_trans_unregister(&rds_tcp_transport); out_recv: rds_tcp_recv_exit(); out_slab: + unregister_pernet_subsys(&rds_tcp_net_ops); kmem_cache_destroy(rds_tcp_conn_slab); out: return ret; diff --git a/kernel/net/rds/tcp.h b/kernel/net/rds/tcp.h index 0dbdd3716..64f873c0c 100644 --- a/kernel/net/rds/tcp.h +++ b/kernel/net/rds/tcp.h @@ -52,6 +52,7 @@ u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); extern struct rds_transport rds_tcp_transport; +void rds_tcp_accept_work(struct sock *sk); /* tcp_connect.c */ int 
rds_tcp_conn_connect(struct rds_connection *conn); @@ -59,9 +60,11 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ -int rds_tcp_listen_init(void); -void rds_tcp_listen_stop(void); +struct socket *rds_tcp_listen_init(struct net *); +void rds_tcp_listen_stop(struct socket *); void rds_tcp_listen_data_ready(struct sock *sk); +int rds_tcp_accept_one(struct socket *sock); +int rds_tcp_keepalive(struct socket *sock); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/kernel/net/rds/tcp_connect.c b/kernel/net/rds/tcp_connect.c index 973109c7b..5cb16875c 100644 --- a/kernel/net/rds/tcp_connect.c +++ b/kernel/net/rds/tcp_connect.c @@ -79,7 +79,8 @@ int rds_tcp_conn_connect(struct rds_connection *conn) struct sockaddr_in src, dest; int ret; - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(rds_conn_net(conn), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) goto out; @@ -111,10 +112,12 @@ int rds_tcp_conn_connect(struct rds_connection *conn) rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; - if (ret == 0) + if (ret == 0) { + rds_tcp_keepalive(sock); sock = NULL; - else + } else { rds_tcp_restore_callbacks(sock, conn->c_transport_data); + } out: if (sock) diff --git a/kernel/net/rds/tcp_listen.c b/kernel/net/rds/tcp_listen.c index 0da49e344..0936a4a32 100644 --- a/kernel/net/rds/tcp_listen.c +++ b/kernel/net/rds/tcp_listen.c @@ -38,14 +38,7 @@ #include "rds.h" #include "tcp.h" -/* - * cheesy, but simple.. 
- */ -static void rds_tcp_accept_worker(struct work_struct *work); -static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); -static struct socket *rds_tcp_listen_sock; - -static int rds_tcp_keepalive(struct socket *sock) +int rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ @@ -77,7 +70,7 @@ bail: return ret; } -static int rds_tcp_accept_one(struct socket *sock) +int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; struct rds_connection *conn; @@ -85,8 +78,9 @@ static int rds_tcp_accept_one(struct socket *sock) struct inet_sock *inet; struct rds_tcp_connection *rs_tcp; - ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, - sock->sk->sk_protocol, &new_sock); + ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family, + sock->sk->sk_type, sock->sk->sk_protocol, + &new_sock); if (ret) goto out; @@ -108,35 +102,35 @@ static int rds_tcp_accept_one(struct socket *sock) &inet->inet_saddr, ntohs(inet->inet_sport), &inet->inet_daddr, ntohs(inet->inet_dport)); - conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr, + conn = rds_conn_create(sock_net(sock->sk), + inet->inet_saddr, inet->inet_daddr, &rds_tcp_transport, GFP_KERNEL); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; } /* An incoming SYN request came in, and TCP just accepted it. - * We always create a new conn for listen side of TCP, and do not - * add it to the c_hash_list. * * If the client reboots, this conn will need to be cleaned up. 
* rds_tcp_state_change() will do that cleanup */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; - WARN_ON(!rs_tcp || rs_tcp->t_sock); - - /* - * see the comment above rds_queue_delayed_reconnect() - */ - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { - if (rds_conn_state(conn) == RDS_CONN_UP) - rds_tcp_stats_inc(s_tcp_listen_closed_stale); - else - rds_tcp_stats_inc(s_tcp_connect_raced); - rds_conn_drop(conn); + if (rs_tcp->t_sock && + ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { + struct sock *nsk = new_sock->sk; + + nsk->sk_user_data = NULL; + nsk->sk_prot->disconnect(nsk, 0); + tcp_done(nsk); + new_sock = NULL; ret = 0; goto out; + } else if (rs_tcp->t_sock) { + rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); + conn->c_outgoing = 0; } + rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); rds_tcp_set_callbacks(new_sock, conn); rds_connect_complete(conn); new_sock = NULL; @@ -148,12 +142,6 @@ out: return ret; } -static void rds_tcp_accept_worker(struct work_struct *work) -{ - while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0) - cond_resched(); -} - void rds_tcp_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); @@ -174,20 +162,20 @@ void rds_tcp_listen_data_ready(struct sock *sk) * socket */ if (sk->sk_state == TCP_LISTEN) - queue_work(rds_wq, &rds_tcp_listen_work); + rds_tcp_accept_work(sk); out: read_unlock(&sk->sk_callback_lock); ready(sk); } -int rds_tcp_listen_init(void) +struct socket *rds_tcp_listen_init(struct net *net) { struct sockaddr_in sin; struct socket *sock = NULL; int ret; - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) goto out; @@ -211,17 +199,15 @@ int rds_tcp_listen_init(void) if (ret < 0) goto out; - rds_tcp_listen_sock = sock; - sock = NULL; + return sock; out: if (sock) sock_release(sock); - return ret; + return NULL; } -void rds_tcp_listen_stop(void) +void 
rds_tcp_listen_stop(struct socket *sock) { - struct socket *sock = rds_tcp_listen_sock; struct sock *sk; if (!sock) @@ -242,5 +228,4 @@ void rds_tcp_listen_stop(void) /* wait for accepts to stop and close the socket */ flush_workqueue(rds_wq); sock_release(sock); - rds_tcp_listen_sock = NULL; } diff --git a/kernel/net/rds/tcp_recv.c b/kernel/net/rds/tcp_recv.c index fbc5ef88b..27a992154 100644 --- a/kernel/net/rds/tcp_recv.c +++ b/kernel/net/rds/tcp_recv.c @@ -214,8 +214,15 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, } to_copy = min(tc->t_tinc_data_rem, left); - pskb_pull(clone, offset); - pskb_trim(clone, to_copy); + if (!pskb_pull(clone, offset) || + pskb_trim(clone, to_copy)) { + pr_warn("rds_tcp_data_recv: pull/trim failed " + "left %zu data_rem %zu skb_len %d\n", + left, tc->t_tinc_data_rem, skb->len); + kfree_skb(clone); + desc->error = -ENOMEM; + goto out; + } skb_queue_tail(&tinc->ti_skb_list, clone); rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " diff --git a/kernel/net/rds/tcp_send.c b/kernel/net/rds/tcp_send.c index 53b17ca0d..2894e6095 100644 --- a/kernel/net/rds/tcp_send.c +++ b/kernel/net/rds/tcp_send.c @@ -83,6 +83,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, struct rds_tcp_connection *tc = conn->c_transport_data; int done = 0; int ret = 0; + int more; if (hdr_off == 0) { /* @@ -116,12 +117,15 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } + more = rm->data.op_nents > 1 ? 
(MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0; while (sg < rm->data.op_nents) { + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + ret = tc->t_sock->ops->sendpage(tc->t_sock, sg_page(&rm->data.op_sg[sg]), rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, - MSG_DONTWAIT|MSG_NOSIGNAL); + flags); rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, ret); @@ -134,6 +138,8 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, off = 0; sg++; } + if (sg == rm->data.op_nents - 1) + more = 0; } out: diff --git a/kernel/net/rds/threads.c b/kernel/net/rds/threads.c index dc2402e87..454aa6d23 100644 --- a/kernel/net/rds/threads.c +++ b/kernel/net/rds/threads.c @@ -162,7 +162,9 @@ void rds_send_worker(struct work_struct *work) int ret; if (rds_conn_state(conn) == RDS_CONN_UP) { + clear_bit(RDS_LL_SEND_FULL, &conn->c_flags); ret = rds_send_xmit(conn); + cond_resched(); rdsdebug("conn %p ret %d\n", conn, ret); switch (ret) { case -EAGAIN: diff --git a/kernel/net/rds/transport.c b/kernel/net/rds/transport.c index 7f2ac4fec..f3afd1d60 100644 --- a/kernel/net/rds/transport.c +++ b/kernel/net/rds/transport.c @@ -73,11 +73,11 @@ EXPORT_SYMBOL_GPL(rds_trans_unregister); void rds_trans_put(struct rds_transport *trans) { - if (trans && trans->t_owner) + if (trans) module_put(trans->t_owner); } -struct rds_transport *rds_trans_get_preferred(__be32 addr) +struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) { struct rds_transport *ret = NULL; struct rds_transport *trans; @@ -90,7 +90,28 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr) for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; - if (trans && (trans->laddr_check(addr) == 0) && + if (trans && (trans->laddr_check(net, addr) == 0) && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; + break; + } + } + up_read(&rds_trans_sem); + + return ret; +} + 
+struct rds_transport *rds_trans_get(int t_type) +{ + struct rds_transport *ret = NULL; + struct rds_transport *trans; + unsigned int i; + + down_read(&rds_trans_sem); + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && trans->t_type == t_type && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; diff --git a/kernel/net/rfkill/Kconfig b/kernel/net/rfkill/Kconfig index 4c10e7e6c..598d374f6 100644 --- a/kernel/net/rfkill/Kconfig +++ b/kernel/net/rfkill/Kconfig @@ -36,7 +36,8 @@ config RFKILL_REGULATOR config RFKILL_GPIO tristate "GPIO RFKILL driver" - depends on RFKILL && GPIOLIB + depends on RFKILL + depends on GPIOLIB || COMPILE_TEST default n help If you say yes here you get support of a generic gpio RFKILL diff --git a/kernel/net/rfkill/core.c b/kernel/net/rfkill/core.c index fa7cd7927..cf5b69ab1 100644 --- a/kernel/net/rfkill/core.c +++ b/kernel/net/rfkill/core.c @@ -49,7 +49,6 @@ struct rfkill { spinlock_t lock; - const char *name; enum rfkill_type type; unsigned long state; @@ -73,6 +72,7 @@ struct rfkill { struct delayed_work poll_work; struct work_struct uevent_work; struct work_struct sync_work; + char name[]; }; #define to_rfkill(d) container_of(d, struct rfkill, dev) @@ -341,7 +341,15 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) { struct rfkill *rfkill; - rfkill_global_states[type].cur = blocked; + if (type == RFKILL_TYPE_ALL) { + int i; + + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = blocked; + } else { + rfkill_global_states[type].cur = blocked; + } + list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->type != type && type != RFKILL_TYPE_ALL) continue; @@ -794,7 +802,8 @@ void rfkill_resume_polling(struct rfkill *rfkill) } EXPORT_SYMBOL(rfkill_resume_polling); -static int rfkill_suspend(struct device *dev, pm_message_t state) +#ifdef CONFIG_PM_SLEEP +static int rfkill_suspend(struct device *dev) { struct rfkill *rfkill = 
to_rfkill(dev); @@ -818,13 +827,18 @@ static int rfkill_resume(struct device *dev) return 0; } +static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume); +#define RFKILL_PM_OPS (&rfkill_pm_ops) +#else +#define RFKILL_PM_OPS NULL +#endif + static struct class rfkill_class = { .name = "rfkill", .dev_release = rfkill_release, .dev_groups = rfkill_dev_groups, .dev_uevent = rfkill_dev_uevent, - .suspend = rfkill_suspend, - .resume = rfkill_resume, + .pm = RFKILL_PM_OPS, }; bool rfkill_blocked(struct rfkill *rfkill) @@ -862,14 +876,14 @@ struct rfkill * __must_check rfkill_alloc(const char *name, if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES)) return NULL; - rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL); + rfkill = kzalloc(sizeof(*rfkill) + strlen(name) + 1, GFP_KERNEL); if (!rfkill) return NULL; spin_lock_init(&rfkill->lock); INIT_LIST_HEAD(&rfkill->node); rfkill->type = type; - rfkill->name = name; + strcpy(rfkill->name, name); rfkill->ops = ops; rfkill->data = ops_data; @@ -1081,17 +1095,6 @@ static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait) return res; } -static bool rfkill_readable(struct rfkill_data *data) -{ - bool r; - - mutex_lock(&data->mtx); - r = !list_empty(&data->events); - mutex_unlock(&data->mtx); - - return r; -} - static ssize_t rfkill_fop_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { @@ -1108,8 +1111,11 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf, goto out; } mutex_unlock(&data->mtx); + /* since we re-check and it just compares pointers, + * using !list_empty() without locking isn't a problem + */ ret = wait_event_interruptible(data->read_wait, - rfkill_readable(data)); + !list_empty(&data->events)); mutex_lock(&data->mtx); if (ret) diff --git a/kernel/net/rfkill/rfkill-gpio.c b/kernel/net/rfkill/rfkill-gpio.c index d978f2f46..93127220c 100644 --- a/kernel/net/rfkill/rfkill-gpio.c +++ b/kernel/net/rfkill/rfkill-gpio.c @@ -112,21 +112,17 @@ static 
int rfkill_gpio_probe(struct platform_device *pdev) rfkill->clk = devm_clk_get(&pdev->dev, NULL); - gpio = devm_gpiod_get(&pdev->dev, "reset"); - if (!IS_ERR(gpio)) { - ret = gpiod_direction_output(gpio, 0); - if (ret) - return ret; - rfkill->reset_gpio = gpio; - } + gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_OUT_LOW); + if (IS_ERR(gpio)) + return PTR_ERR(gpio); - gpio = devm_gpiod_get(&pdev->dev, "shutdown"); - if (!IS_ERR(gpio)) { - ret = gpiod_direction_output(gpio, 0); - if (ret) - return ret; - rfkill->shutdown_gpio = gpio; - } + rfkill->reset_gpio = gpio; + + gpio = devm_gpiod_get_optional(&pdev->dev, "shutdown", GPIOD_OUT_LOW); + if (IS_ERR(gpio)) + return PTR_ERR(gpio); + + rfkill->shutdown_gpio = gpio; /* Make sure at-least one of the GPIO is defined and that * a name is specified for this instance @@ -168,7 +164,6 @@ static int rfkill_gpio_remove(struct platform_device *pdev) #ifdef CONFIG_ACPI static const struct acpi_device_id rfkill_acpi_match[] = { { "BCM2E1A", RFKILL_TYPE_BLUETOOTH }, - { "BCM2E39", RFKILL_TYPE_BLUETOOTH }, { "BCM2E3D", RFKILL_TYPE_BLUETOOTH }, { "BCM2E40", RFKILL_TYPE_BLUETOOTH }, { "BCM2E64", RFKILL_TYPE_BLUETOOTH }, diff --git a/kernel/net/rose/af_rose.c b/kernel/net/rose/af_rose.c index 8ae603069..129d357d2 100644 --- a/kernel/net/rose/af_rose.c +++ b/kernel/net/rose/af_rose.c @@ -192,7 +192,8 @@ static void rose_kill_by_device(struct net_device *dev) if (rose->device == dev) { rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); - rose->neighbour->use--; + if (rose->neighbour) + rose->neighbour->use--; rose->device = NULL; } } @@ -520,7 +521,7 @@ static int rose_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto); + sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern); if (sk == NULL) return -ENOMEM; @@ -559,7 +560,7 @@ static struct sock *rose_make_new(struct sock *osk) 
if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto); + sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0); if (sk == NULL) return NULL; diff --git a/kernel/net/rose/rose_link.c b/kernel/net/rose/rose_link.c index e873d7d9f..c76638cc2 100644 --- a/kernel/net/rose/rose_link.c +++ b/kernel/net/rose/rose_link.c @@ -25,7 +25,6 @@ #include #include #include -#include #include static void rose_ftimer_expiry(unsigned long); diff --git a/kernel/net/rose/rose_route.c b/kernel/net/rose/rose_route.c index 40148932c..0fc76d845 100644 --- a/kernel/net/rose/rose_route.c +++ b/kernel/net/rose/rose_route.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/net/rxrpc/af_rxrpc.c b/kernel/net/rxrpc/af_rxrpc.c index 0095b9a0b..1f8a144a5 100644 --- a/kernel/net/rxrpc/af_rxrpc.c +++ b/kernel/net/rxrpc/af_rxrpc.c @@ -305,7 +305,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, if (!key) key = rx->key; - if (key && !key->payload.data) + if (key && !key->payload.data[0]) key = NULL; /* a no-security key */ bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp); @@ -632,7 +632,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, sock->ops = &rxrpc_rpc_ops; sock->state = SS_UNCONNECTED; - sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto); + sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern); if (!sk) return -ENOMEM; diff --git a/kernel/net/rxrpc/ar-ack.c b/kernel/net/rxrpc/ar-ack.c index e0547f521..adc555e03 100644 --- a/kernel/net/rxrpc/ar-ack.c +++ b/kernel/net/rxrpc/ar-ack.c @@ -723,8 +723,10 @@ process_further: if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && - hard > tx) + hard > tx) { + call->acks_hard = tx; goto all_acked; + } smp_rmb(); rxrpc_rotate_tx_window(call, hard - 1); diff --git a/kernel/net/rxrpc/ar-connection.c 
b/kernel/net/rxrpc/ar-connection.c index 6631f4f1e..6c71ed1ca 100644 --- a/kernel/net/rxrpc/ar-connection.c +++ b/kernel/net/rxrpc/ar-connection.c @@ -500,7 +500,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, if (bundle->num_conns >= 20) { _debug("too many conns"); - if (!(gfp & __GFP_WAIT)) { + if (!gfpflags_allow_blocking(gfp)) { _leave(" = -EAGAIN"); return -EAGAIN; } @@ -808,7 +808,7 @@ void rxrpc_put_connection(struct rxrpc_connection *conn) ASSERTCMP(atomic_read(&conn->usage), >, 0); - conn->put_time = get_seconds(); + conn->put_time = ktime_get_seconds(); if (atomic_dec_and_test(&conn->usage)) { _debug("zombie"); rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); @@ -852,7 +852,7 @@ static void rxrpc_connection_reaper(struct work_struct *work) _enter(""); - now = get_seconds(); + now = ktime_get_seconds(); earliest = ULONG_MAX; write_lock_bh(&rxrpc_connection_lock); diff --git a/kernel/net/rxrpc/ar-internal.h b/kernel/net/rxrpc/ar-internal.h index aef1bd294..2934a73a5 100644 --- a/kernel/net/rxrpc/ar-internal.h +++ b/kernel/net/rxrpc/ar-internal.h @@ -208,7 +208,7 @@ struct rxrpc_transport { struct rb_root server_conns; /* server connections on this transport */ struct list_head link; /* link in master session list */ struct sk_buff_head error_queue; /* error packets awaiting processing */ - time_t put_time; /* time at which to reap */ + unsigned long put_time; /* time at which to reap */ spinlock_t client_lock; /* client connection allocation lock */ rwlock_t conn_lock; /* lock for active/dead connections */ atomic_t usage; @@ -256,7 +256,7 @@ struct rxrpc_connection { struct rxrpc_crypt csum_iv; /* packet checksum base */ unsigned long events; #define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */ - time_t put_time; /* time at which to reap */ + unsigned long put_time; /* time at which to reap */ rwlock_t lock; /* access lock */ spinlock_t state_lock; /* state-change lock */ atomic_t usage; diff --git a/kernel/net/rxrpc/ar-key.c 
b/kernel/net/rxrpc/ar-key.c index db0f39f5e..da3cc09f6 100644 --- a/kernel/net/rxrpc/ar-key.c +++ b/kernel/net/rxrpc/ar-key.c @@ -148,10 +148,10 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, token->kad->ticket[6], token->kad->ticket[7]); /* count the number of tokens attached */ - prep->type_data[0] = (void *)((unsigned long)prep->type_data[0] + 1); + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ - for (pptoken = (struct rxrpc_key_token **)&prep->payload[0]; + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; *pptoken; pptoken = &(*pptoken)->next) continue; @@ -522,7 +522,7 @@ static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep, goto inval; /* attach the payload */ - for (pptoken = (struct rxrpc_key_token **)&prep->payload[0]; + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; *pptoken; pptoken = &(*pptoken)->next) continue; @@ -764,10 +764,10 @@ static int rxrpc_preparse(struct key_preparsed_payload *prep) memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length); /* count the number of tokens attached */ - prep->type_data[0] = (void *)((unsigned long)prep->type_data[0] + 1); + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ - pp = (struct rxrpc_key_token **)&prep->payload[0]; + pp = (struct rxrpc_key_token **)&prep->payload.data[0]; while (*pp) pp = &(*pp)->next; *pp = token; @@ -814,7 +814,7 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) */ static void rxrpc_free_preparse(struct key_preparsed_payload *prep) { - rxrpc_free_token_list(prep->payload[0]); + rxrpc_free_token_list(prep->payload.data[0]); } /* @@ -831,7 +831,7 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) if (prep->datalen != 8) return -EINVAL; - memcpy(&prep->type_data, prep->data, 8); + memcpy(&prep->payload.data[2], prep->data, 8); ci = crypto_alloc_blkcipher("pcbc(des)", 0, 
CRYPTO_ALG_ASYNC); if (IS_ERR(ci)) { @@ -842,7 +842,7 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) if (crypto_blkcipher_setkey(ci, prep->data, 8) < 0) BUG(); - prep->payload[0] = ci; + prep->payload.data[0] = ci; _leave(" = 0"); return 0; } @@ -852,8 +852,8 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) */ static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) { - if (prep->payload[0]) - crypto_free_blkcipher(prep->payload[0]); + if (prep->payload.data[0]) + crypto_free_blkcipher(prep->payload.data[0]); } /* @@ -861,7 +861,7 @@ static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) */ static void rxrpc_destroy(struct key *key) { - rxrpc_free_token_list(key->payload.data); + rxrpc_free_token_list(key->payload.data[0]); } /* @@ -869,9 +869,9 @@ static void rxrpc_destroy(struct key *key) */ static void rxrpc_destroy_s(struct key *key) { - if (key->payload.data) { - crypto_free_blkcipher(key->payload.data); - key->payload.data = NULL; + if (key->payload.data[0]) { + crypto_free_blkcipher(key->payload.data[0]); + key->payload.data[0] = NULL; } } @@ -1070,7 +1070,7 @@ static long rxrpc_read(const struct key *key, size += 1 * 4; /* token count */ ntoks = 0; - for (token = key->payload.data; token; token = token->next) { + for (token = key->payload.data[0]; token; token = token->next) { toksize = 4; /* sec index */ switch (token->security_index) { @@ -1163,7 +1163,7 @@ static long rxrpc_read(const struct key *key, ENCODE(ntoks); tok = 0; - for (token = key->payload.data; token; token = token->next) { + for (token = key->payload.data[0]; token; token = token->next) { toksize = toksizes[tok++]; ENCODE(toksize); oldxdr = xdr; diff --git a/kernel/net/rxrpc/ar-local.c b/kernel/net/rxrpc/ar-local.c index ca904ed54..78483b460 100644 --- a/kernel/net/rxrpc/ar-local.c +++ b/kernel/net/rxrpc/ar-local.c @@ -73,8 +73,8 @@ static int rxrpc_create_local(struct rxrpc_local *local) _enter("%p{%d}", local, 
local->srx.transport_type); /* create a socket to represent the local endpoint */ - ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP, - &local->socket); + ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type, + IPPROTO_UDP, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; diff --git a/kernel/net/rxrpc/ar-output.c b/kernel/net/rxrpc/ar-output.c index c0042807b..14c4e12c4 100644 --- a/kernel/net/rxrpc/ar-output.c +++ b/kernel/net/rxrpc/ar-output.c @@ -158,7 +158,7 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans, service_id = htons(srx->srx_service); } key = rx->key; - if (key && !rx->key->payload.data) + if (key && !rx->key->payload.data[0]) key = NULL; bundle = rxrpc_get_bundle(rx, trans, key, service_id, GFP_KERNEL); @@ -531,7 +531,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); /* this should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) return -EPIPE; diff --git a/kernel/net/rxrpc/ar-security.c b/kernel/net/rxrpc/ar-security.c index 49b3cc31e..8334474eb 100644 --- a/kernel/net/rxrpc/ar-security.c +++ b/kernel/net/rxrpc/ar-security.c @@ -137,9 +137,9 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) if (ret < 0) return ret; - if (!key->payload.data) + token = key->payload.data[0]; + if (!token) return -EKEYREJECTED; - token = key->payload.data; sec = rxrpc_security_lookup(token->security_index); if (!sec) diff --git a/kernel/net/rxrpc/ar-transport.c b/kernel/net/rxrpc/ar-transport.c index 1976dec84..9946467f1 100644 --- a/kernel/net/rxrpc/ar-transport.c +++ b/kernel/net/rxrpc/ar-transport.c @@ -189,7 +189,7 @@ void rxrpc_put_transport(struct rxrpc_transport *trans) ASSERTCMP(atomic_read(&trans->usage), >, 0); - trans->put_time = get_seconds(); + trans->put_time = 
ktime_get_seconds(); if (unlikely(atomic_dec_and_test(&trans->usage))) { _debug("zombie"); /* let the reaper determine the timeout to avoid a race with @@ -226,7 +226,7 @@ static void rxrpc_transport_reaper(struct work_struct *work) _enter(""); - now = get_seconds(); + now = ktime_get_seconds(); earliest = ULONG_MAX; /* extract all the transports that have been dead too long */ diff --git a/kernel/net/rxrpc/rxkad.c b/kernel/net/rxrpc/rxkad.c index f226709eb..d7a9ab5a9 100644 --- a/kernel/net/rxrpc/rxkad.c +++ b/kernel/net/rxrpc/rxkad.c @@ -67,7 +67,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); - token = conn->key->payload.data; + token = conn->key->payload.data[0]; conn->security_ix = token->security_index; ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); @@ -125,7 +125,7 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn) if (!conn->key) return; - token = conn->key->payload.data; + token = conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); desc.tfm = conn->cipher; @@ -221,7 +221,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr.checksum = 0; /* encrypt from the session key */ - token = call->conn->key->payload.data; + token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); desc.tfm = call->conn->cipher; desc.info = iv.x; @@ -433,7 +433,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, skb_to_sgvec(skb, sg, 0, skb->len); /* decrypt from the session key */ - token = call->conn->key->payload.data; + token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); desc.tfm = call->conn->cipher; desc.info = iv.x; @@ -780,7 +780,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, if (conn->security_level < min_level) goto protocol_error; - token = conn->key->payload.data; + 
token = conn->key->payload.data[0]; /* build the response packet */ memset(&resp, 0, sizeof(resp)); @@ -848,12 +848,12 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, } } - ASSERT(conn->server_key->payload.data != NULL); + ASSERT(conn->server_key->payload.data[0] != NULL); ASSERTCMP((unsigned long) ticket & 7UL, ==, 0); - memcpy(&iv, &conn->server_key->type_data, sizeof(iv)); + memcpy(&iv, &conn->server_key->payload.data[2], sizeof(iv)); - desc.tfm = conn->server_key->payload.data; + desc.tfm = conn->server_key->payload.data[0]; desc.info = iv.x; desc.flags = 0; diff --git a/kernel/net/sched/Kconfig b/kernel/net/sched/Kconfig index 2274e723a..daa33432b 100644 --- a/kernel/net/sched/Kconfig +++ b/kernel/net/sched/Kconfig @@ -312,6 +312,7 @@ config NET_SCH_PIE config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT + select NET_INGRESS ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. @@ -477,6 +478,16 @@ config NET_CLS_BPF To compile this code as a module, choose M here: the module will be called cls_bpf. +config NET_CLS_FLOWER + tristate "Flower classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + a configurable combination of packet keys and masks. + + To compile this code as a module, choose M here: the module will + be called cls_flower. 
+ config NET_EMATCH bool "Extended Matches" select NET_CLS diff --git a/kernel/net/sched/Makefile b/kernel/net/sched/Makefile index 7ca7f4c1b..690c1689e 100644 --- a/kernel/net/sched/Makefile +++ b/kernel/net/sched/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o +obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o diff --git a/kernel/net/sched/act_api.c b/kernel/net/sched/act_api.c index f8d9c2a2c..06e7c4a37 100644 --- a/kernel/net/sched/act_api.c +++ b/kernel/net/sched/act_api.c @@ -27,7 +27,16 @@ #include #include -void tcf_hash_destroy(struct tc_action *a) +static void free_tcf(struct rcu_head *head) +{ + struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu); + + free_percpu(p->cpu_bstats); + free_percpu(p->cpu_qstats); + kfree(p); +} + +static void tcf_hash_destroy(struct tc_action *a) { struct tcf_common *p = a->priv; struct tcf_hashinfo *hinfo = a->ops->hinfo; @@ -41,9 +50,8 @@ void tcf_hash_destroy(struct tc_action *a) * gen_estimator est_timer() might access p->tcfc_lock * or bstats, wait a RCU grace period before freeing p */ - kfree_rcu(p, tcfc_rcu); + call_rcu(&p->tcfc_rcu, free_tcf); } -EXPORT_SYMBOL(tcf_hash_destroy); int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) { @@ -231,15 +239,16 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) if (est) gen_kill_estimator(&pc->tcfc_bstats, &pc->tcfc_rate_est); - kfree_rcu(pc, tcfc_rcu); + call_rcu(&pc->tcfc_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_cleanup); int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind) + int size, int bind, bool cpustats) { struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = kzalloc(size, GFP_KERNEL); + int err = -ENOMEM; 
if (unlikely(!p)) return -ENOMEM; @@ -247,18 +256,32 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, if (bind) p->tcfc_bindcnt = 1; + if (cpustats) { + p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!p->cpu_bstats) { +err1: + kfree(p); + return err; + } + p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!p->cpu_qstats) { +err2: + free_percpu(p->cpu_bstats); + goto err1; + } + } spin_lock_init(&p->tcfc_lock); INIT_HLIST_NODE(&p->tcfc_head); p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { - int err = gen_new_estimator(&p->tcfc_bstats, NULL, - &p->tcfc_rate_est, - &p->tcfc_lock, est); + err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, + &p->tcfc_rate_est, + &p->tcfc_lock, est); if (err) { - kfree(p); - return err; + free_percpu(p->cpu_qstats); + goto err2; } } @@ -393,11 +416,6 @@ int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, list_for_each_entry(a, actions, list) { repeat: ret = a->ops->act(skb, a, res); - if (TC_MUNGED & skb->tc_verd) { - /* copied already, allow trampling */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - } if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ if (ret != TC_ACT_PIPE) @@ -621,10 +639,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || + if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || - gnet_stats_copy_queue(&d, NULL, + gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfc_qstats, p->tcfc_qstats.qlen) < 0) goto errout; diff --git a/kernel/net/sched/act_bpf.c b/kernel/net/sched/act_bpf.c index 521ffca91..0bc6f912f 100644 --- a/kernel/net/sched/act_bpf.c +++ b/kernel/net/sched/act_bpf.c @@ -37,19 
+37,25 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { struct tcf_bpf *prog = act->priv; + struct bpf_prog *filter; int action, filter_res; + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; if (unlikely(!skb_mac_header_was_set(skb))) return TC_ACT_UNSPEC; - spin_lock(&prog->tcf_lock); - - prog->tcf_tm.lastuse = jiffies; - bstats_update(&prog->tcf_bstats, skb); + tcf_lastuse_update(&prog->tcf_tm); + bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); - /* Needed here for accessing maps. */ rcu_read_lock(); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter = rcu_dereference(prog->filter); + if (at_ingress) { + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(filter, skb); + } rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. @@ -66,11 +72,12 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, case TC_ACT_PIPE: case TC_ACT_RECLASSIFY: case TC_ACT_OK: + case TC_ACT_REDIRECT: action = filter_res; break; case TC_ACT_SHOT: action = filter_res; - prog->tcf_qstats.drops++; + qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats)); break; case TC_ACT_UNSPEC: action = prog->tcf_action; @@ -80,7 +87,6 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, break; } - spin_unlock(&prog->tcf_lock); return action; } @@ -256,7 +262,10 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, struct tcf_bpf_cfg *cfg) { cfg->is_ebpf = tcf_bpf_is_ebpf(prog); - cfg->filter = prog->filter; + /* updates to prog->filter are prevented, since it's called either + * with rtnl lock or during final cleanup in rcu callback + */ + cfg->filter = rcu_dereference_protected(prog->filter, 1); cfg->bpf_ops = prog->bpf_ops; cfg->bpf_name = prog->bpf_name; @@ -271,7 +280,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct tc_act_bpf *parm; struct 
tcf_bpf *prog; bool is_bpf, is_ebpf; - int ret; + int ret, res = 0; if (!nla) return -EINVAL; @@ -280,45 +289,47 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (ret < 0) return ret; - is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; - is_ebpf = tb[TCA_ACT_BPF_FD]; - - if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || - !tb[TCA_ACT_BPF_PARMS]) + if (!tb[TCA_ACT_BPF_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - memset(&cfg, 0, sizeof(cfg)); - - ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : - tcf_bpf_init_from_efd(tb, &cfg); - if (ret < 0) - return ret; - if (!tcf_hash_check(parm->index, act, bind)) { ret = tcf_hash_create(parm->index, est, act, - sizeof(*prog), bind); + sizeof(*prog), bind, true); if (ret < 0) - goto destroy_fp; + return ret; - ret = ACT_P_CREATED; + res = ACT_P_CREATED; } else { /* Don't override defaults. */ if (bind) - goto destroy_fp; + return 0; tcf_hash_release(act, bind); - if (!replace) { - ret = -EEXIST; - goto destroy_fp; - } + if (!replace) + return -EEXIST; } + is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; + is_ebpf = tb[TCA_ACT_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) { + ret = -EINVAL; + goto out; + } + + memset(&cfg, 0, sizeof(cfg)); + + ret = is_bpf ? 
tcf_bpf_init_from_ops(tb, &cfg) : + tcf_bpf_init_from_efd(tb, &cfg); + if (ret < 0) + goto out; + prog = to_bpf(act); - spin_lock_bh(&prog->tcf_lock); + ASSERT_RTNL(); - if (ret != ACT_P_CREATED) + if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); prog->bpf_ops = cfg.bpf_ops; @@ -330,19 +341,21 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog->bpf_fd = cfg.bpf_fd; prog->tcf_action = parm->action; - prog->filter = cfg.filter; - - spin_unlock_bh(&prog->tcf_lock); + rcu_assign_pointer(prog->filter, cfg.filter); - if (ret == ACT_P_CREATED) + if (res == ACT_P_CREATED) { tcf_hash_insert(act); - else + } else { + /* make sure the program being replaced is no longer executing */ + synchronize_rcu(); tcf_bpf_cfg_cleanup(&old); + } - return ret; + return res; +out: + if (res == ACT_P_CREATED) + tcf_hash_cleanup(act, est); -destroy_fp: - tcf_bpf_cfg_cleanup(&cfg); return ret; } diff --git a/kernel/net/sched/act_connmark.c b/kernel/net/sched/act_connmark.c index 295d14bd6..bb41699c6 100644 --- a/kernel/net/sched/act_connmark.c +++ b/kernel/net/sched/act_connmark.c @@ -37,6 +37,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct tcf_connmark_info *ca = a->priv; + struct nf_conntrack_zone zone; struct nf_conn *c; int proto; @@ -67,10 +68,13 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, } if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - proto, &tuple)) + proto, ca->net, &tuple)) goto out; - thash = nf_conntrack_find_get(dev_net(skb->dev), ca->zone, &tuple); + zone.id = ca->zone; + zone.dir = NF_CT_DEFAULT_ZONE_DIR; + + thash = nf_conntrack_find_get(ca->net, &zone, &tuple); if (!thash) goto out; @@ -108,12 +112,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, 
sizeof(*ci), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), + bind, false); if (ret) return ret; ci = to_connmark(a); ci->tcf_action = parm->action; + ci->net = net; ci->zone = parm->zone; tcf_hash_insert(a); diff --git a/kernel/net/sched/act_csum.c b/kernel/net/sched/act_csum.c index 4cd5cf1ae..b07c535ba 100644 --- a/kernel/net/sched/act_csum.c +++ b/kernel/net/sched/act_csum.c @@ -62,7 +62,8 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_CSUM_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/kernel/net/sched/act_gact.c b/kernel/net/sched/act_gact.c index 7fffc2272..5c1b05170 100644 --- a/kernel/net/sched/act_gact.c +++ b/kernel/net/sched/act_gact.c @@ -28,14 +28,18 @@ #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { - if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval) + smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ + if (prandom_u32() % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } static int gact_determ(struct tcf_gact *gact) { - if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval) + u32 pack = atomic_inc_return(&gact->packets); + + smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ + if (pack % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } @@ -85,7 +89,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, #endif if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), + bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -99,16 +104,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact = to_gact(a); - 
spin_lock_bh(&gact->tcf_lock); + ASSERT_RTNL(); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB if (p_parm) { gact->tcfg_paction = p_parm->paction; - gact->tcfg_pval = p_parm->pval; + gact->tcfg_pval = max_t(u16, 1, p_parm->pval); + /* Make sure tcfg_pval is written before tcfg_ptype + * coupled with smp_rmb() in gact_net_rand() & gact_determ() + */ + smp_wmb(); gact->tcfg_ptype = p_parm->ptype; } #endif - spin_unlock_bh(&gact->tcf_lock); if (ret == ACT_P_CREATED) tcf_hash_insert(a); return ret; @@ -118,23 +126,21 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gact *gact = a->priv; - int action = TC_ACT_SHOT; + int action = READ_ONCE(gact->tcf_action); - spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (gact->tcfg_ptype) - action = gact_rand[gact->tcfg_ptype](gact); - else - action = gact->tcf_action; -#else - action = gact->tcf_action; + { + u32 ptype = READ_ONCE(gact->tcfg_ptype); + + if (ptype) + action = gact_rand[ptype](gact); + } #endif - gact->tcf_bstats.bytes += qdisc_pkt_len(skb); - gact->tcf_bstats.packets++; + bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb); if (action == TC_ACT_SHOT) - gact->tcf_qstats.drops++; - gact->tcf_tm.lastuse = jiffies; - spin_unlock(&gact->tcf_lock); + qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats)); + + tcf_lastuse_update(&gact->tcf_tm); return action; } diff --git a/kernel/net/sched/act_ipt.c b/kernel/net/sched/act_ipt.c index cbc8dd7dd..d05869646 100644 --- a/kernel/net/sched/act_ipt.c +++ b/kernel/net/sched/act_ipt.c @@ -114,7 +114,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, index = nla_get_u32(tb[TCA_IPT_INDEX]); if (!tcf_hash_check(index, a, bind) ) { - ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); + ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -189,6 +189,7 @@ static int tcf_ipt(struct sk_buff *skb, const 
struct tc_action *a, * worry later - danger - this API seems to have changed * from earlier kernels */ + par.net = dev_net(skb->dev); par.in = skb->dev; par.out = NULL; par.hooknum = ipt->tcfi_hook; diff --git a/kernel/net/sched/act_mirred.c b/kernel/net/sched/act_mirred.c index 3f63ceac8..32fcdecdb 100644 --- a/kernel/net/sched/act_mirred.c +++ b/kernel/net/sched/act_mirred.c @@ -31,13 +31,19 @@ #define MIRRED_TAB_MASK 7 static LIST_HEAD(mirred_list); +static DEFINE_SPINLOCK(mirred_list_lock); static void tcf_mirred_release(struct tc_action *a, int bind) { struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1); + + /* We could be called either in a RCU callback or with RTNL lock held. */ + spin_lock_bh(&mirred_list_lock); list_del(&m->tcfm_list); - if (m->tcfm_dev) - dev_put(m->tcfm_dev); + spin_unlock_bh(&mirred_list_lock); + if (dev) + dev_put(dev); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -93,32 +99,37 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, a, bind)) { if (dev == NULL) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*m), + bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_hash_release(a, bind); + if (bind) + return 0; + + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } m = to_mirred(a); - spin_lock_bh(&m->tcf_lock); + ASSERT_RTNL(); m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; if (dev != NULL) { m->tcfm_ifindex = parm->ifindex; if (ret != ACT_P_CREATED) - dev_put(m->tcfm_dev); + dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); dev_hold(dev); - m->tcfm_dev = dev; + rcu_assign_pointer(m->tcfm_dev, dev); m->tcfm_ok_push = ok_push; } - spin_unlock_bh(&m->tcf_lock); + if (ret == ACT_P_CREATED) { + spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); + 
spin_unlock_bh(&mirred_list_lock); tcf_hash_insert(a); } @@ -131,28 +142,30 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_mirred *m = a->priv; struct net_device *dev; struct sk_buff *skb2; + int retval, err; u32 at; - int retval, err = 1; - spin_lock(&m->tcf_lock); - m->tcf_tm.lastuse = jiffies; - bstats_update(&m->tcf_bstats, skb); + tcf_lastuse_update(&m->tcf_tm); + + bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); - dev = m->tcfm_dev; - if (!dev) { - printk_once(KERN_NOTICE "tc mirred: target device is gone\n"); + rcu_read_lock(); + retval = READ_ONCE(m->tcf_action); + dev = rcu_dereference(m->tcfm_dev); + if (unlikely(!dev)) { + pr_notice_once("tc mirred: target device is gone\n"); goto out; } - if (!(dev->flags & IFF_UP)) { + if (unlikely(!(dev->flags & IFF_UP))) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); goto out; } at = G_TC_AT(skb->tc_verd); - skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action); - if (skb2 == NULL) + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) goto out; if (!(at & AT_EGRESS)) { @@ -166,18 +179,16 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; + skb_sender_cpu_clear(skb2); err = dev_queue_xmit(skb2); -out: if (err) { - m->tcf_qstats.overlimits++; +out: + qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); if (m->tcfm_eaction != TCA_EGRESS_MIRROR) retval = TC_ACT_SHOT; - else - retval = m->tcf_action; - } else - retval = m->tcf_action; - spin_unlock(&m->tcf_lock); + } + rcu_read_unlock(); return retval; } @@ -216,15 +227,20 @@ static int mirred_device_event(struct notifier_block *unused, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; - if (event == NETDEV_UNREGISTER) + ASSERT_RTNL(); + if (event == NETDEV_UNREGISTER) { + spin_lock_bh(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { - spin_lock_bh(&m->tcf_lock); - 
if (m->tcfm_dev == dev) { + if (rcu_access_pointer(m->tcfm_dev) == dev) { dev_put(dev); - m->tcfm_dev = NULL; + /* Note : no rcu grace period necessary, as + * net_device are already rcu protected. + */ + RCU_INIT_POINTER(m->tcfm_dev, NULL); } - spin_unlock_bh(&m->tcf_lock); } + spin_unlock_bh(&mirred_list_lock); + } return NOTIFY_DONE; } diff --git a/kernel/net/sched/act_nat.c b/kernel/net/sched/act_nat.c index 270a030d5..b7c4ead8b 100644 --- a/kernel/net/sched/act_nat.c +++ b/kernel/net/sched/act_nat.c @@ -55,7 +55,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_NAT_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -161,7 +162,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, goto drop; tcph = (void *)(skb_network_header(skb) + ihl); - inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, + true); break; } case IPPROTO_UDP: @@ -177,7 +179,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, udph = (void *)(skb_network_header(skb) + ihl); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&udph->check, skb, addr, - new_addr, 1); + new_addr, true); if (!udph->check) udph->check = CSUM_MANGLED_0; } @@ -230,7 +232,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, iph->saddr = new_addr; inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr, - 0); + false); break; } default: diff --git a/kernel/net/sched/act_pedit.c b/kernel/net/sched/act_pedit.c index 59649d588..e38a7701f 100644 --- a/kernel/net/sched/act_pedit.c +++ b/kernel/net/sched/act_pedit.c @@ -57,7 +57,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if 
(!tcf_hash_check(parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; p = to_pedit(a); @@ -68,13 +69,12 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; } else { - p = to_pedit(a); - tcf_hash_release(a, bind); if (bind) return 0; + tcf_hash_release(a, bind); if (!ovr) return -EEXIST; - + p = to_pedit(a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) @@ -108,7 +108,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = a->priv; - int i, munged = 0; + int i; unsigned int off; if (skb_unclone(skb, GFP_ATOMIC)) @@ -156,11 +156,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, *ptr = ((*ptr & tkey->mask) ^ tkey->val); if (ptr == &_data) skb_store_bits(skb, off + offset, ptr, 4); - munged++; } - if (munged) - skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); goto done; } else WARN(1, "pedit BUG: index %d\n", p->tcf_index); diff --git a/kernel/net/sched/act_simple.c b/kernel/net/sched/act_simple.c index 6a8d94886..d6b708d6a 100644 --- a/kernel/net/sched/act_simple.c +++ b/kernel/net/sched/act_simple.c @@ -103,7 +103,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, defdata = nla_data(tb[TCA_DEF_DATA]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), + bind, false); if (ret) return ret; diff --git a/kernel/net/sched/act_skbedit.c b/kernel/net/sched/act_skbedit.c index fcfeeaf83..6751b5f8c 100644 --- a/kernel/net/sched/act_skbedit.c +++ b/kernel/net/sched/act_skbedit.c @@ -99,7 +99,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); if 
(!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), + bind, false); if (ret) return ret; diff --git a/kernel/net/sched/act_vlan.c b/kernel/net/sched/act_vlan.c index d735ecf0b..796785e0b 100644 --- a/kernel/net/sched/act_vlan.c +++ b/kernel/net/sched/act_vlan.c @@ -116,7 +116,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, action = parm->v_action; if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*v), + bind, false); if (ret) return ret; diff --git a/kernel/net/sched/cls_bpf.c b/kernel/net/sched/cls_bpf.c index c0b86f2bf..5faaa5425 100644 --- a/kernel/net/sched/cls_bpf.c +++ b/kernel/net/sched/cls_bpf.c @@ -38,6 +38,7 @@ struct cls_bpf_prog { struct bpf_prog *filter; struct list_head link; struct tcf_result res; + bool exts_integrated; struct tcf_exts exts; u32 handle; union { @@ -52,6 +53,7 @@ struct cls_bpf_prog { static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_FLAGS] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, @@ -59,11 +61,30 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; +static int cls_bpf_exec_opcode(int code) +{ + switch (code) { + case TC_ACT_OK: + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + case TC_ACT_REDIRECT: + case TC_ACT_UNSPEC: + return code; + default: + return TC_ACT_UNSPEC; + } +} + static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); struct cls_bpf_prog *prog; +#ifdef CONFIG_NET_CLS_ACT + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; 
+#else + bool at_ingress = false; +#endif int ret = -1; if (unlikely(!skb_mac_header_was_set(skb))) @@ -72,7 +93,28 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* Needed here for accessing maps. */ rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { - int filter_res = BPF_PROG_RUN(prog->filter, skb); + int filter_res; + + qdisc_skb_cb(skb)->tc_classid = prog->res.classid; + + if (at_ingress) { + /* It is safe to push/pull even if skb_shared() */ + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } + + if (prog->exts_integrated) { + res->class = prog->res.class; + res->classid = qdisc_skb_cb(skb)->tc_classid; + + ret = cls_bpf_exec_opcode(filter_res); + if (ret == TC_ACT_UNSPEC) + continue; + break; + } if (filter_res == 0) continue; @@ -181,8 +223,7 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) return ret; } -static int cls_bpf_prog_from_ops(struct nlattr **tb, - struct cls_bpf_prog *prog, u32 classid) +static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) { struct sock_filter *bpf_ops; struct sock_fprog_kern fprog_tmp; @@ -216,15 +257,13 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, prog->bpf_ops = bpf_ops; prog->bpf_num_ops = bpf_num_ops; prog->bpf_name = NULL; - prog->filter = fp; - prog->res.classid = classid; return 0; } -static int cls_bpf_prog_from_efd(struct nlattr **tb, - struct cls_bpf_prog *prog, u32 classid) +static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, + const struct tcf_proto *tp) { struct bpf_prog *fp; char *name = NULL; @@ -254,9 +293,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, prog->bpf_ops = NULL; prog->bpf_fd = bpf_fd; prog->bpf_name = name; - prog->filter = fp; - prog->res.classid = classid; + + if (fp->dst_needed) + netif_keep_dst(qdisc_dev(tp->q)); return 0; } 
@@ -266,16 +306,13 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr) { + bool is_bpf, is_ebpf, have_exts = false; struct tcf_exts exts; - bool is_bpf, is_ebpf; - u32 classid; int ret; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; is_ebpf = tb[TCA_BPF_FD]; - - if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || - !tb[TCA_BPF_CLASSID]) + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); @@ -283,18 +320,32 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, if (ret < 0) return ret; - classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + if (tb[TCA_BPF_FLAGS]) { + u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); + + if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { + tcf_exts_destroy(&exts); + return -EINVAL; + } + + have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; + } + + prog->exts_integrated = have_exts; - ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) : - cls_bpf_prog_from_efd(tb, prog, classid); + ret = is_bpf ? 
cls_bpf_prog_from_ops(tb, prog) : + cls_bpf_prog_from_efd(tb, prog, tp); if (ret < 0) { tcf_exts_destroy(&exts); return ret; } - tcf_bind_filter(tp, &prog->res, base); - tcf_exts_change(tp, &prog->exts, &exts); + if (tb[TCA_BPF_CLASSID]) { + prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + tcf_bind_filter(tp, &prog->res, base); + } + tcf_exts_change(tp, &prog->exts, &exts); return 0; } @@ -415,6 +466,7 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; struct nlattr *nest; + u32 bpf_flags = 0; int ret; if (prog == NULL) @@ -426,7 +478,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) + if (prog->res.classid && + nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; if (cls_bpf_is_ebpf(prog)) @@ -439,6 +492,11 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; + if (prog->exts_integrated) + bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; + if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) + goto nla_put_failure; + nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &prog->exts) < 0) diff --git a/kernel/net/sched/cls_cgroup.c b/kernel/net/sched/cls_cgroup.c index ea611b216..4c85bd3a7 100644 --- a/kernel/net/sched/cls_cgroup.c +++ b/kernel/net/sched/cls_cgroup.c @@ -30,35 +30,16 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); - u32 classid; - - classid = task_cls_state(current)->classid; - - /* - * Due to the nature of the classifier it is required to ignore all - * packets originating from softirq context as accessing `current' - * would lead to false results. 
- * - * This test assumes that all callers of dev_queue_xmit() explicitely - * disable bh. Knowing this, it is possible to detect softirq based - * calls by looking at the number of nested bh disable calls because - * softirqs always disables bh. - */ - if (in_serving_softirq()) { - /* If there is an sk_classid we'll use that. */ - if (!skb->sk) - return -1; - classid = skb->sk->sk_classid; - } + u32 classid = task_get_classid(skb); if (!classid) return -1; - if (!tcf_em_tree_match(skb, &head->ematches, NULL)) return -1; res->classid = classid; res->class = 0; + return tcf_exts_exec(skb, &head->exts, res); } diff --git a/kernel/net/sched/cls_flow.c b/kernel/net/sched/cls_flow.c index 75df923f5..fbfec6a18 100644 --- a/kernel/net/sched/cls_flow.c +++ b/kernel/net/sched/cls_flow.c @@ -22,11 +22,12 @@ #include #include #include +#include #include #include #include -#include +#include #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include @@ -68,35 +69,41 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->src) - return ntohl(flow->src); + __be32 src = flow_get_u32_src(flow); + + if (src) + return ntohl(src); + return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->dst) - return ntohl(flow->dst); + __be32 dst = flow_get_u32_dst(flow); + + if (dst) + return ntohl(dst); + return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { - return flow->ip_proto; + return flow->basic.ip_proto; } static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[0]); + if (flow->ports.ports) + return ntohs(flow->ports.src); return addr_fold(skb->sk); } static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - 
if (flow->ports) - return ntohs(flow->port16[1]); + if (flow->ports.ports) + return ntohs(flow->ports.dst); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } @@ -191,8 +198,11 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb) static u32 flow_get_skuid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { - kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid; + struct sock *sk = skb_to_full_sk(skb); + + if (sk && sk->sk_socket && sk->sk_socket->file) { + kuid_t skuid = sk->sk_socket->file->f_cred->fsuid; + return from_kuid(&init_user_ns, skuid); } return 0; @@ -200,8 +210,11 @@ static u32 flow_get_skuid(const struct sk_buff *skb) static u32 flow_get_skgid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { - kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid; + struct sock *sk = skb_to_full_sk(skb); + + if (sk && sk->sk_socket && sk->sk_socket->file) { + kgid_t skgid = sk->sk_socket->file->f_cred->fsgid; + return from_kgid(&init_user_ns, skgid); } return 0; @@ -295,7 +308,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) - skb_flow_dissect(skb, &flow_keys); + skb_flow_dissect_flow_keys(skb, &flow_keys, 0); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; diff --git a/kernel/net/sched/cls_flower.c b/kernel/net/sched/cls_flower.c new file mode 100644 index 000000000..95b021243 --- /dev/null +++ b/kernel/net/sched/cls_flower.c @@ -0,0 +1,697 @@ +/* + * net/sched/cls_flower.c Flower classifier + * + * Copyright (c) 2015 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +struct fl_flow_key { + int indev_ifindex; + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; + struct flow_dissector_key_eth_addrs eth; + struct flow_dissector_key_addrs ipaddrs; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_ports tp; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct fl_flow_mask_range { + unsigned short int start; + unsigned short int end; +}; + +struct fl_flow_mask { + struct fl_flow_key key; + struct fl_flow_mask_range range; + struct rcu_head rcu; +}; + +struct cls_fl_head { + struct rhashtable ht; + struct fl_flow_mask mask; + struct flow_dissector dissector; + u32 hgen; + bool mask_assigned; + struct list_head filters; + struct rhashtable_params ht_params; + struct rcu_head rcu; +}; + +struct cls_fl_filter { + struct rhash_head ht_node; + struct fl_flow_key mkey; + struct tcf_exts exts; + struct tcf_result res; + struct fl_flow_key key; + struct list_head list; + u32 handle; + struct rcu_head rcu; +}; + +static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) +{ + return mask->range.end - mask->range.start; +} + +static void fl_mask_update_range(struct fl_flow_mask *mask) +{ + const u8 *bytes = (const u8 *) &mask->key; + size_t size = sizeof(mask->key); + size_t i, first = 0, last = size - 1; + + for (i = 0; i < sizeof(mask->key); i++) { + if (bytes[i]) { + if (!first && i) + first = i; + last = i; + } + } + mask->range.start = rounddown(first, sizeof(long)); + mask->range.end = roundup(last + 1, sizeof(long)); +} + +static void *fl_key_get_start(struct fl_flow_key *key, + const struct fl_flow_mask *mask) +{ + return (u8 *) key + mask->range.start; +} + +static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key, + struct 
fl_flow_mask *mask) +{ + const long *lkey = fl_key_get_start(key, mask); + const long *lmask = fl_key_get_start(&mask->key, mask); + long *lmkey = fl_key_get_start(mkey, mask); + int i; + + for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) + *lmkey++ = *lkey++ & *lmask++; +} + +static void fl_clear_masked_range(struct fl_flow_key *key, + struct fl_flow_mask *mask) +{ + memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); +} + +static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_fl_head *head = rcu_dereference_bh(tp->root); + struct cls_fl_filter *f; + struct fl_flow_key skb_key; + struct fl_flow_key skb_mkey; + + fl_clear_masked_range(&skb_key, &head->mask); + skb_key.indev_ifindex = skb->skb_iif; + /* skb_flow_dissect() does not set n_proto in case an unknown protocol, + * so do it rather here. + */ + skb_key.basic.n_proto = skb->protocol; + skb_flow_dissect(skb, &head->dissector, &skb_key, 0); + + fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); + + f = rhashtable_lookup_fast(&head->ht, + fl_key_get_start(&skb_mkey, &head->mask), + head->ht_params); + if (f) { + *res = f->res; + return tcf_exts_exec(skb, &f->exts, res); + } + return -1; +} + +static int fl_init(struct tcf_proto *tp) +{ + struct cls_fl_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + + INIT_LIST_HEAD_RCU(&head->filters); + rcu_assign_pointer(tp->root, head); + + return 0; +} + +static void fl_destroy_filter(struct rcu_head *head) +{ + struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); + + tcf_exts_destroy(&f->exts); + kfree(f); +} + +static bool fl_destroy(struct tcf_proto *tp, bool force) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f, *next; + + if (!force && !list_empty(&head->filters)) + return false; + + list_for_each_entry_safe(f, next, &head->filters, list) { + list_del_rcu(&f->list); + call_rcu(&f->rcu, 
fl_destroy_filter); + } + RCU_INIT_POINTER(tp->root, NULL); + if (head->mask_assigned) + rhashtable_destroy(&head->ht); + kfree_rcu(head, rcu); + return true; +} + +static unsigned long fl_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry(f, &head->filters, list) + if (f->handle == handle) + return (unsigned long) f; + return 0; +} + +static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { + [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC }, + [TCA_FLOWER_CLASSID] = { .type = NLA_U32 }, + [TCA_FLOWER_INDEV] = { .type = NLA_STRING, + .len = IFNAMSIZ }, + [TCA_FLOWER_KEY_ETH_DST] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_DST_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_IP_PROTO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_SRC_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_TCP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 }, +}; + +static void fl_set_key_val(struct nlattr **tb, + void *val, int val_type, + void *mask, int mask_type, int len) +{ + if (!tb[val_type]) + return; + memcpy(val, nla_data(tb[val_type]), len); + if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type]) + memset(mask, 0xff, len); + else + memcpy(mask, nla_data(tb[mask_type]), len); +} + 
+static int fl_set_key(struct net *net, struct nlattr **tb, + struct fl_flow_key *key, struct fl_flow_key *mask) +{ +#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FLOWER_INDEV]) { + int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); + if (err < 0) + return err; + key->indev_ifindex = err; + mask->indev_ifindex = 0xffffffff; + } +#endif + + fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)); + fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)); + + fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + + if (key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) { + fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto)); + } + + if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)); + fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)); + } else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) { + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)); + fl_set_key_val(tb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)); + } + + if (key->basic.ip_proto == IPPROTO_TCP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, 
TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } else if (key->basic.ip_proto == IPPROTO_UDP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } + + return 0; +} + +static bool fl_mask_eq(struct fl_flow_mask *mask1, + struct fl_flow_mask *mask2) +{ + const long *lmask1 = fl_key_get_start(&mask1->key, mask1); + const long *lmask2 = fl_key_get_start(&mask2->key, mask2); + + return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) && + !memcmp(lmask1, lmask2, fl_mask_range(mask1)); +} + +static const struct rhashtable_params fl_ht_params = { + .key_offset = offsetof(struct cls_fl_filter, mkey), /* base offset */ + .head_offset = offsetof(struct cls_fl_filter, ht_node), + .automatic_shrinking = true, +}; + +static int fl_init_hashtable(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + head->ht_params = fl_ht_params; + head->ht_params.key_len = fl_mask_range(mask); + head->ht_params.key_offset += mask->range.start; + + return rhashtable_init(&head->ht, &head->ht_params); +} + +#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) +#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) +#define FL_KEY_MEMBER_END_OFFSET(member) \ + (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member)) + +#define FL_KEY_IN_RANGE(mask, member) \ + (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \ + FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start) + +#define FL_KEY_SET(keys, cnt, id, member) \ + do { \ + keys[cnt].key_id = id; \ + keys[cnt].offset = FL_KEY_MEMBER_OFFSET(member); \ + cnt++; \ + } while(0); + +#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \ + do { \ + if (FL_KEY_IN_RANGE(mask, member)) \ + FL_KEY_SET(keys, cnt, id, member); \ + } while(0); + +static 
void fl_init_dissector(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; + size_t cnt = 0; + + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); + + skb_flow_dissector_init(&head->dissector, keys, cnt); +} + +static int fl_check_assign_mask(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + int err; + + if (head->mask_assigned) { + if (!fl_mask_eq(&head->mask, mask)) + return -EINVAL; + else + return 0; + } + + /* Mask is not assigned yet. So assign it and init hashtable + * according to that. + */ + err = fl_init_hashtable(head, mask); + if (err) + return err; + memcpy(&head->mask, mask, sizeof(head->mask)); + head->mask_assigned = true; + + fl_init_dissector(head, mask); + + return 0; +} + +static int fl_set_parms(struct net *net, struct tcf_proto *tp, + struct cls_fl_filter *f, struct fl_flow_mask *mask, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts e; + int err; + + tcf_exts_init(&e, TCA_FLOWER_ACT, 0); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + return err; + + if (tb[TCA_FLOWER_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + + err = fl_set_key(net, tb, &f->key, &mask->key); + if (err) + goto errout; + + fl_mask_update_range(mask); + fl_set_masked_key(&f->mkey, &f->key, mask); + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(&e); + return err; +} + +static u32 fl_grab_new_handle(struct tcf_proto *tp, + struct cls_fl_head *head) +{ + 
unsigned int i = 0x80000000; + u32 handle; + + do { + if (++head->hgen == 0x7FFFFFFF) + head->hgen = 1; + } while (--i > 0 && fl_get(tp, head->hgen)); + + if (unlikely(i == 0)) { + pr_err("Insufficient number of handles\n"); + handle = 0; + } else { + handle = head->hgen; + } + + return handle; +} + +static int fl_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg; + struct cls_fl_filter *fnew; + struct nlattr *tb[TCA_FLOWER_MAX + 1]; + struct fl_flow_mask mask = {}; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy); + if (err < 0) + return err; + + if (fold && handle && fold->handle != handle) + return -EINVAL; + + fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + if (!fnew) + return -ENOBUFS; + + tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + + if (!handle) { + handle = fl_grab_new_handle(tp, head); + if (!handle) { + err = -EINVAL; + goto errout; + } + } + fnew->handle = handle; + + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); + if (err) + goto errout; + + err = fl_check_assign_mask(head, &mask); + if (err) + goto errout; + + err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, + head->ht_params); + if (err) + goto errout; + if (fold) + rhashtable_remove_fast(&head->ht, &fold->ht_node, + head->ht_params); + + *arg = (unsigned long) fnew; + + if (fold) { + list_replace_rcu(&fold->list, &fnew->list); + tcf_unbind_filter(tp, &fold->res); + call_rcu(&fold->rcu, fl_destroy_filter); + } else { + list_add_tail_rcu(&fnew->list, &head->filters); + } + + return 0; + +errout: + kfree(fnew); + return err; +} + +static int fl_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct 
cls_fl_filter *f = (struct cls_fl_filter *) arg; + + rhashtable_remove_fast(&head->ht, &f->ht_node, + head->ht_params); + list_del_rcu(&f->list); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, fl_destroy_filter); + return 0; +} + +static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry_rcu(f, &head->filters, list) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static int fl_dump_key_val(struct sk_buff *skb, + void *val, int val_type, + void *mask, int mask_type, int len) +{ + int err; + + if (!memchr_inv(mask, 0, len)) + return 0; + err = nla_put(skb, val_type, len, val); + if (err) + return err; + if (mask_type != TCA_FLOWER_UNSPEC) { + err = nla_put(skb, mask_type, len, mask); + if (err) + return err; + } + return 0; +} + +static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f = (struct cls_fl_filter *) fh; + struct nlattr *nest; + struct fl_flow_key *key, *mask; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) + goto nla_put_failure; + + key = &f->key; + mask = &head->mask.key; + + if (mask->indev_ifindex) { + struct net_device *dev; + + dev = __dev_get_by_index(net, key->indev_ifindex); + if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name)) + goto nla_put_failure; + } + + if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)) || + fl_dump_key_val(skb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, 
TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)) || + fl_dump_key_val(skb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto))) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) && + fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto))) + goto nla_put_failure; + + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && + (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)) || + fl_dump_key_val(skb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)))) + goto nla_put_failure; + else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && + (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)) || + fl_dump_key_val(skb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)))) + goto nla_put_failure; + + if (key->basic.ip_proto == IPPROTO_TCP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + else if (key->basic.ip_proto == IPPROTO_UDP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, 
&f->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct tcf_proto_ops cls_fl_ops __read_mostly = { + .kind = "flower", + .classify = fl_classify, + .init = fl_init, + .destroy = fl_destroy, + .get = fl_get, + .change = fl_change, + .delete = fl_delete, + .walk = fl_walk, + .dump = fl_dump, + .owner = THIS_MODULE, +}; + +static int __init cls_fl_init(void) +{ + return register_tcf_proto_ops(&cls_fl_ops); +} + +static void __exit cls_fl_exit(void) +{ + unregister_tcf_proto_ops(&cls_fl_ops); +} + +module_init(cls_fl_init); +module_exit(cls_fl_exit); + +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Flower classifier"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/net/sched/cls_rsvp.h b/kernel/net/sched/cls_rsvp.h index 02fa82792..f9c9fc075 100644 --- a/kernel/net/sched/cls_rsvp.h +++ b/kernel/net/sched/cls_rsvp.h @@ -283,12 +283,22 @@ static int rsvp_init(struct tcf_proto *tp) return -ENOBUFS; } -static void -rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) +static void rsvp_delete_filter_rcu(struct rcu_head *head) { - tcf_unbind_filter(tp, &f->res); + struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu); + tcf_exts_destroy(&f->exts); - kfree_rcu(f, rcu); + kfree(f); +} + +static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + /* all classifiers are required to call tcf_exts_destroy() after rcu + * grace period, since converted-to-rcu actions are relying on that + * in cleanup() callback + */ + call_rcu(&f->rcu, rsvp_delete_filter_rcu); } static bool rsvp_destroy(struct tcf_proto *tp, bool force) diff --git a/kernel/net/sched/cls_tcindex.c b/kernel/net/sched/cls_tcindex.c index a557dbaf5..944c8ff45 100644 --- a/kernel/net/sched/cls_tcindex.c +++ b/kernel/net/sched/cls_tcindex.c @@ -27,6 +27,7 @@ struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; + struct rcu_head 
rcu; }; struct tcindex_filter { @@ -133,8 +134,23 @@ static int tcindex_init(struct tcf_proto *tp) return 0; } -static int -tcindex_delete(struct tcf_proto *tp, unsigned long arg) +static void tcindex_destroy_rexts(struct rcu_head *head) +{ + struct tcindex_filter_result *r; + + r = container_of(head, struct tcindex_filter_result, rcu); + tcf_exts_destroy(&r->exts); +} + +static void tcindex_destroy_fexts(struct rcu_head *head) +{ + struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu); + + tcf_exts_destroy(&f->result.exts); + kfree(f); +} + +static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; @@ -162,9 +178,14 @@ found: rcu_assign_pointer(*walk, rtnl_dereference(f->next)); } tcf_unbind_filter(tp, &r->res); - tcf_exts_destroy(&r->exts); + /* all classifiers are required to call tcf_exts_destroy() after rcu + * grace period, since converted-to-rcu actions are relying on that + * in cleanup() callback + */ if (f) - kfree_rcu(f, rcu); + call_rcu(&f->rcu, tcindex_destroy_fexts); + else + call_rcu(&r->rcu, tcindex_destroy_rexts); return 0; } diff --git a/kernel/net/sched/em_ipset.c b/kernel/net/sched/em_ipset.c index a3d79c8bf..c66ca9400 100644 --- a/kernel/net/sched/em_ipset.c +++ b/kernel/net/sched/em_ipset.c @@ -92,9 +92,10 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, rcu_read_lock(); - if (dev && skb->skb_iif) - indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + if (skb->skb_iif) + indev = dev_get_by_index_rcu(em->net, skb->skb_iif); + acpar.net = em->net; acpar.in = indev ? 
indev : dev; acpar.out = dev; diff --git a/kernel/net/sched/em_meta.c b/kernel/net/sched/em_meta.c index b5294ce20..f2aabc008 100644 --- a/kernel/net/sched/em_meta.c +++ b/kernel/net/sched/em_meta.c @@ -343,119 +343,145 @@ META_COLLECTOR(int_sk_refcnt) META_COLLECTOR(int_sk_rcvbuf) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_rcvbuf; + dst->value = sk->sk_rcvbuf; } META_COLLECTOR(int_sk_shutdown) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_shutdown; + dst->value = sk->sk_shutdown; } META_COLLECTOR(int_sk_proto) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_protocol; + dst->value = sk->sk_protocol; } META_COLLECTOR(int_sk_type) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_type; + dst->value = sk->sk_type; } META_COLLECTOR(int_sk_rmem_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = sk_rmem_alloc_get(skb->sk); + dst->value = sk_rmem_alloc_get(sk); } META_COLLECTOR(int_sk_wmem_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = sk_wmem_alloc_get(skb->sk); + dst->value = sk_wmem_alloc_get(sk); } META_COLLECTOR(int_sk_omem_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = atomic_read(&skb->sk->sk_omem_alloc); + dst->value = atomic_read(&sk->sk_omem_alloc); } META_COLLECTOR(int_sk_rcv_qlen) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_receive_queue.qlen; + 
dst->value = sk->sk_receive_queue.qlen; } META_COLLECTOR(int_sk_snd_qlen) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_write_queue.qlen; + dst->value = sk->sk_write_queue.qlen; } META_COLLECTOR(int_sk_wmem_queued) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_wmem_queued; + dst->value = sk->sk_wmem_queued; } META_COLLECTOR(int_sk_fwd_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_forward_alloc; + dst->value = sk->sk_forward_alloc; } META_COLLECTOR(int_sk_sndbuf) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_sndbuf; + dst->value = sk->sk_sndbuf; } META_COLLECTOR(int_sk_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = (__force int) skb->sk->sk_allocation; + dst->value = (__force int) sk->sk_allocation; } META_COLLECTOR(int_sk_hash) @@ -469,92 +495,112 @@ META_COLLECTOR(int_sk_hash) META_COLLECTOR(int_sk_lingertime) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_lingertime / HZ; + dst->value = sk->sk_lingertime / HZ; } META_COLLECTOR(int_sk_err_qlen) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_error_queue.qlen; + dst->value = sk->sk_error_queue.qlen; } META_COLLECTOR(int_sk_ack_bl) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_ack_backlog; + dst->value = sk->sk_ack_backlog; } META_COLLECTOR(int_sk_max_ack_bl) { - if 
(skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_max_ack_backlog; + dst->value = sk->sk_max_ack_backlog; } META_COLLECTOR(int_sk_prio) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_priority; + dst->value = sk->sk_priority; } META_COLLECTOR(int_sk_rcvlowat) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_rcvlowat; + dst->value = sk->sk_rcvlowat; } META_COLLECTOR(int_sk_rcvtimeo) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_rcvtimeo / HZ; + dst->value = sk->sk_rcvtimeo / HZ; } META_COLLECTOR(int_sk_sndtimeo) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_sndtimeo / HZ; + dst->value = sk->sk_sndtimeo / HZ; } META_COLLECTOR(int_sk_sendmsg_off) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_frag.offset; + dst->value = sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_write_pending; + dst->value = sk->sk_write_pending; } /************************************************************************** diff --git a/kernel/net/sched/sch_api.c b/kernel/net/sched/sch_api.c index 1e1c89e51..af1acf009 100644 --- a/kernel/net/sched/sch_api.c +++ b/kernel/net/sched/sch_api.c @@ -253,7 +253,8 @@ int qdisc_set_default(const char *name) } /* We know handle. Find qdisc among all qdisc's attached to device - (root qdisc, all its children, children of children etc.) 
+ * (root qdisc, all its children, children of children etc.) + * Note: caller either uses rtnl or rcu_read_lock() */ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) @@ -264,7 +265,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) root->handle == handle) return root; - list_for_each_entry(q, &root->list, list) { + list_for_each_entry_rcu(q, &root->list, list) { if (q->handle == handle) return q; } @@ -277,15 +278,18 @@ void qdisc_list_add(struct Qdisc *q) struct Qdisc *root = qdisc_dev(q)->qdisc; WARN_ON_ONCE(root == &noop_qdisc); - list_add_tail(&q->list, &root->list); + ASSERT_RTNL(); + list_add_tail_rcu(&q->list, &root->list); } } EXPORT_SYMBOL(qdisc_list_add); void qdisc_list_del(struct Qdisc *q) { - if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) - list_del(&q->list); + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + ASSERT_RTNL(); + list_del_rcu(&q->list); + } } EXPORT_SYMBOL(qdisc_list_del); @@ -750,14 +754,18 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) if (n == 0) return; drops = max_t(int, n, 0); + rcu_read_lock(); while ((parentid = sch->parent)) { if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) - return; + break; + if (sch->flags & TCQ_F_NOPARENT) + break; + /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { - WARN_ON(parentid != TC_H_ROOT); - return; + WARN_ON_ONCE(parentid != TC_H_ROOT); + break; } cops = sch->ops->cl_ops; if (cops->qlen_notify) { @@ -768,6 +776,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) sch->q.qlen -= n; __qdisc_qstats_drop(sch, drops); } + rcu_read_unlock(); } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); @@ -1806,57 +1815,46 @@ done: * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. 
*/ -int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) { __be16 protocol = tc_skb_protocol(skb); - int err; +#ifdef CONFIG_NET_CLS_ACT + const struct tcf_proto *old_tp = tp; + int limit = 0; +reclassify: +#endif for (; tp; tp = rcu_dereference_bh(tp->next)) { + int err; + if (tp->protocol != protocol && tp->protocol != htons(ETH_P_ALL)) continue; - err = tp->classify(skb, tp, res); - if (err >= 0) { + err = tp->classify(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT - if (err != TC_ACT_RECLASSIFY && skb->tc_verd) - skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); + if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) + goto reset; #endif + if (err >= 0) return err; - } } - return -1; -} -EXPORT_SYMBOL(tc_classify_compat); - -int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - int err = 0; -#ifdef CONFIG_NET_CLS_ACT - const struct tcf_proto *otp = tp; -reclassify: -#endif - err = tc_classify_compat(skb, tp, res); + return -1; #ifdef CONFIG_NET_CLS_ACT - if (err == TC_ACT_RECLASSIFY) { - u32 verd = G_TC_VERD(skb->tc_verd); - tp = otp; - - if (verd++ >= MAX_REC_LOOP) { - net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", - tp->q->ops->id, - tp->prio & 0xffff, - ntohs(tp->protocol)); - return TC_ACT_SHOT; - } - skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); - goto reclassify; +reset: + if (unlikely(limit++ >= MAX_REC_LOOP)) { + net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", + tp->q->ops->id, tp->prio & 0xffff, + ntohs(tp->protocol)); + return TC_ACT_SHOT; } + + tp = old_tp; + protocol = tc_skb_protocol(skb); + goto reclassify; #endif - return err; } EXPORT_SYMBOL(tc_classify); @@ -1885,13 +1883,10 @@ EXPORT_SYMBOL(tcf_destroy_chain); #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { - 
struct timespec ts; - - hrtimer_get_res(CLOCK_MONOTONIC, &ts); seq_printf(seq, "%08x %08x %08x %08x\n", (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), 1000000, - (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts))); + (u32)NSEC_PER_SEC / hrtimer_resolution); return 0; } @@ -1956,6 +1951,7 @@ static int __init pktsched_init(void) register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); register_qdisc(&mq_qdisc_ops); + register_qdisc(&noqueue_qdisc_ops); rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); diff --git a/kernel/net/sched/sch_atm.c b/kernel/net/sched/sch_atm.c index e3e2cc5fd..1911af3ca 100644 --- a/kernel/net/sched/sch_atm.c +++ b/kernel/net/sched/sch_atm.c @@ -375,7 +375,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) list_for_each_entry(flow, &p->flows, list) { fl = rcu_dereference_bh(flow->filter_list); if (fl) { - result = tc_classify_compat(skb, fl, &res); + result = tc_classify(skb, fl, &res, true); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; diff --git a/kernel/net/sched/sch_blackhole.c b/kernel/net/sched/sch_blackhole.c index 094a874b4..3fee70d98 100644 --- a/kernel/net/sched/sch_blackhole.c +++ b/kernel/net/sched/sch_blackhole.c @@ -11,7 +11,7 @@ * Note: Quantum tunneling is not supported. 
*/ -#include +#include #include #include #include @@ -37,17 +37,8 @@ static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; -static int __init blackhole_module_init(void) +static int __init blackhole_init(void) { return register_qdisc(&blackhole_qdisc_ops); } - -static void __exit blackhole_module_exit(void) -{ - unregister_qdisc(&blackhole_qdisc_ops); -} - -module_init(blackhole_module_init) -module_exit(blackhole_module_exit) - -MODULE_LICENSE("GPL"); +device_initcall(blackhole_init) diff --git a/kernel/net/sched/sch_cbq.c b/kernel/net/sched/sch_cbq.c index beeb75f80..c538d9e4a 100644 --- a/kernel/net/sched/sch_cbq.c +++ b/kernel/net/sched/sch_cbq.c @@ -240,7 +240,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. */ - result = tc_classify_compat(skb, fl, &res); + result = tc_classify(skb, fl, &res, true); if (!fl || result < 0) goto fallback; diff --git a/kernel/net/sched/sch_choke.c b/kernel/net/sched/sch_choke.c index c009eb904..5ffb8b833 100644 --- a/kernel/net/sched/sch_choke.c +++ b/kernel/net/sched/sch_choke.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* CHOKe stateless AQM for fair bandwidth allocation @@ -133,16 +133,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) --sch->q.qlen; } -/* private part of skb->cb[] that a qdisc is allowed to use - * is limited to QDISC_CB_PRIV_LEN bytes. - * As a flow key might be too large, we store a part of it only. 
- */ -#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3) - struct choke_skb_cb { u16 classid; u8 keys_valid; - u8 keys[QDISC_CB_PRIV_LEN - 3]; + struct flow_keys_digest keys; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) @@ -176,19 +170,19 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect(skb1, &temp); - memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN); + skb_flow_dissect_flow_keys(skb1, &temp, 0); + make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect(skb2, &temp); - memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN); + skb_flow_dissect_flow_keys(skb2, &temp, 0); + make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } return !memcmp(&choke_skb_cb(skb1)->keys, &choke_skb_cb(skb2)->keys, - CHOKE_K_LEN); + sizeof(choke_skb_cb(skb1)->keys)); } /* @@ -207,7 +201,7 @@ static bool choke_classify(struct sk_buff *skb, int result; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -391,6 +385,19 @@ static void choke_reset(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); + while (q->head != q->tail) { + struct sk_buff *skb = q->tab[q->head]; + + q->head = (q->head + 1) & q->tab_mask; + if (!skb) + continue; + qdisc_qstats_backlog_dec(sch, skb); + --sch->q.qlen; + qdisc_drop(skb, sch); + } + + memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); + q->head = q->tail = 0; red_restart(&q->vars); } @@ -546,65 +553,6 @@ static void choke_destroy(struct Qdisc *sch) choke_free(q->tab); } -static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg) -{ - return NULL; -} - -static unsigned long choke_get(struct Qdisc *sch, u32 classid) -{ - 
return 0; -} - -static void choke_put(struct Qdisc *q, unsigned long cl) -{ -} - -static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, - u32 classid) -{ - return 0; -} - -static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch, - unsigned long cl) -{ - struct choke_sched_data *q = qdisc_priv(sch); - - if (cl) - return NULL; - return &q->filter_list; -} - -static int choke_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) -{ - tcm->tcm_handle |= TC_H_MIN(cl); - return 0; -} - -static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg) -{ - if (!arg->stop) { - if (arg->fn(sch, 1, arg) < 0) { - arg->stop = 1; - return; - } - arg->count++; - } -} - -static const struct Qdisc_class_ops choke_class_ops = { - .leaf = choke_leaf, - .get = choke_get, - .put = choke_put, - .tcf_chain = choke_find_tcf, - .bind_tcf = choke_bind, - .unbind_tcf = choke_put, - .dump = choke_dump_class, - .walk = choke_walk, -}; - static struct sk_buff *choke_peek_head(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); diff --git a/kernel/net/sched/sch_codel.c b/kernel/net/sched/sch_codel.c index 7a0bdb16a..535007d5f 100644 --- a/kernel/net/sched/sch_codel.c +++ b/kernel/net/sched/sch_codel.c @@ -6,7 +6,7 @@ * * Implemented on linux by : * Copyright (C) 2012 Michael D. 
Taht - * Copyright (C) 2012 Eric Dumazet + * Copyright (C) 2012,2015 Eric Dumazet * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -109,6 +109,7 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, [TCA_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 }, }; static int codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -133,6 +134,12 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_CODEL_CE_THRESHOLD]) { + u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]); + + q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_CODEL_INTERVAL]) { u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); @@ -201,7 +208,10 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_CODEL_ECN, q->params.ecn)) goto nla_put_failure; - + if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD, + codel_time_to_us(q->params.ce_threshold))) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: @@ -220,6 +230,7 @@ static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .ldelay = codel_time_to_us(q->vars.ldelay), .dropping = q->vars.dropping, .ecn_mark = q->stats.ecn_mark, + .ce_mark = q->stats.ce_mark, }; if (q->vars.dropping) { diff --git a/kernel/net/sched/sch_drr.c b/kernel/net/sched/sch_drr.c index 338706092..f26bdea87 100644 --- a/kernel/net/sched/sch_drr.c +++ b/kernel/net/sched/sch_drr.c @@ -331,7 +331,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, 
&res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/kernel/net/sched/sch_dsmark.c b/kernel/net/sched/sch_dsmark.c index 66700a611..f357f34d0 100644 --- a/kernel/net/sched/sch_dsmark.c +++ b/kernel/net/sched/sch_dsmark.c @@ -35,14 +35,20 @@ #define NO_DEFAULT_INDEX (1 << 16) +struct mask_value { + u8 mask; + u8 value; +}; + struct dsmark_qdisc_data { struct Qdisc *q; struct tcf_proto __rcu *filter_list; - u8 *mask; /* "owns" the array */ - u8 *value; + struct mask_value *mv; u16 indices; + u8 set_tc_index; u32 default_index; /* index range is 0...0xffff */ - int set_tc_index; +#define DSMARK_EMBEDDED_SZ 16 + struct mask_value embedded[DSMARK_EMBEDDED_SZ]; }; static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) @@ -116,7 +122,6 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_DSMARK_MAX + 1]; int err = -EINVAL; - u8 mask = 0; pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", __func__, sch, p, classid, parent, *arg); @@ -133,14 +138,11 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, if (err < 0) goto errout; - if (tb[TCA_DSMARK_MASK]) - mask = nla_get_u8(tb[TCA_DSMARK_MASK]); - if (tb[TCA_DSMARK_VALUE]) - p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); + p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]); if (tb[TCA_DSMARK_MASK]) - p->mask[*arg - 1] = mask; + p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]); err = 0; @@ -155,8 +157,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg) if (!dsmark_valid_index(p, arg)) return -EINVAL; - p->mask[arg - 1] = 0xff; - p->value[arg - 1] = 0; + p->mv[arg - 1].mask = 0xff; + p->mv[arg - 1].value = 0; return 0; } @@ -173,7 +175,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) return; for (i = 0; i < p->indices; i++) { - if (p->mask[i] == 0xff && !p->value[i]) + if (p->mv[i].mask == 
0xff && !p->mv[i].value) goto ignore; if (walker->count >= walker->skip) { if (walker->fn(sch, i + 1, walker) < 0) { @@ -230,7 +232,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) else { struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tc_classify(skb, fl, &res); + int result = tc_classify(skb, fl, &res, false); pr_debug("result %d class 0x%04x\n", result, res.classid); @@ -291,12 +293,12 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): - ipv4_change_dsfield(ip_hdr(skb), p->mask[index], - p->value[index]); + ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, + p->mv[index].value); break; case htons(ETH_P_IPV6): - ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], - p->value[index]); + ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask, + p->mv[index].value); break; default: /* @@ -304,7 +306,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) * This way, we can send non-IP traffic through dsmark * and don't need yet another qdisc as a bypass. 
*/ - if (p->mask[index] != 0xff || p->value[index]) + if (p->mv[index].mask != 0xff || p->mv[index].value) pr_warn("%s: unsupported protocol %d\n", __func__, ntohs(tc_skb_protocol(skb))); break; @@ -346,7 +348,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) int err = -EINVAL; u32 default_index = NO_DEFAULT_INDEX; u16 indices; - u8 *mask; + int i; pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); @@ -366,18 +368,18 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_DSMARK_DEFAULT_INDEX]) default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); - mask = kmalloc(indices * 2, GFP_KERNEL); - if (mask == NULL) { + if (indices <= DSMARK_EMBEDDED_SZ) + p->mv = p->embedded; + else + p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL); + if (!p->mv) { err = -ENOMEM; goto errout; } - - p->mask = mask; - memset(p->mask, 0xff, indices); - - p->value = p->mask + indices; - memset(p->value, 0, indices); - + for (i = 0; i < indices; i++) { + p->mv[i].mask = 0xff; + p->mv[i].value = 0; + } p->indices = indices; p->default_index = default_index; p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); @@ -410,7 +412,8 @@ static void dsmark_destroy(struct Qdisc *sch) tcf_destroy_chain(&p->filter_list); qdisc_destroy(p->q); - kfree(p->mask); + if (p->mv != p->embedded) + kfree(p->mv); } static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, @@ -430,8 +433,8 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) || - nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1])) + if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) || + nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value)) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/kernel/net/sched/sch_fifo.c b/kernel/net/sched/sch_fifo.c index 2e2398cfc..2177eac0a 100644 --- 
a/kernel/net/sched/sch_fifo.c +++ b/kernel/net/sched/sch_fifo.c @@ -54,7 +54,7 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt) bool is_bfifo = sch->ops == &bfifo_qdisc_ops; if (opt == NULL) { - u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1; + u32 limit = qdisc_dev(sch)->tx_queue_len; if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); diff --git a/kernel/net/sched/sch_fq.c b/kernel/net/sched/sch_fq.c index f377702d4..109b23227 100644 --- a/kernel/net/sched/sch_fq.c +++ b/kernel/net/sched/sch_fq.c @@ -224,13 +224,16 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) return &q->internal; - /* SYNACK messages are attached to a listener socket. - * 1) They are not part of a 'flow' yet - * 2) We do not want to rate limit them (eg SYNFLOOD attack), + /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket + * or a listener (SYNCOOKIE mode) + * 1) request sockets are not full blown, + * they do not contain sk_pacing_rate + * 2) They are not part of a 'flow' yet + * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE - * 3) We pretend they are orphaned + * 4) We pretend they are orphaned */ - if (!sk || sk->sk_state == TCP_LISTEN) { + if (!sk || sk_listener(sk)) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not diff --git a/kernel/net/sched/sch_fq_codel.c b/kernel/net/sched/sch_fq_codel.c index 9291598b5..4c834e93d 100644 --- a/kernel/net/sched/sch_fq_codel.c +++ b/kernel/net/sched/sch_fq_codel.c @@ -6,7 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Copyright (C) 2012 Eric Dumazet + * Copyright (C) 2012,2015 Eric Dumazet */ #include @@ -23,7 +23,6 @@ #include #include #include -#include #include /* Fair Queue CoDel. 
@@ -68,15 +67,9 @@ struct fq_codel_sched_data { }; static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, - const struct sk_buff *skb) + struct sk_buff *skb) { - struct flow_keys keys; - unsigned int hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); + u32 hash = skb_get_hash_perturb(skb, q->perturbation); return reciprocal_scale(hash, q->flows_cnt); } @@ -99,7 +92,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, filter, &res); + result = tc_classify(skb, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -170,6 +163,15 @@ static unsigned int fq_codel_drop(struct Qdisc *sch) return idx; } +static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) +{ + unsigned int prev_backlog; + + prev_backlog = sch->qstats.backlog; + fq_codel_drop(sch); + return prev_backlog - sch->qstats.backlog; +} + static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -286,10 +288,26 @@ begin: static void fq_codel_reset(struct Qdisc *sch) { - struct sk_buff *skb; + struct fq_codel_sched_data *q = qdisc_priv(sch); + int i; - while ((skb = fq_codel_dequeue(sch)) != NULL) - kfree_skb(skb); + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + for (i = 0; i < q->flows_cnt; i++) { + struct fq_codel_flow *flow = q->flows + i; + + while (flow->head) { + struct sk_buff *skb = dequeue_head(flow); + + qdisc_qstats_backlog_dec(sch, skb); + kfree_skb(skb); + } + + INIT_LIST_HEAD(&flow->flowchain); + codel_vars_init(&flow->cvars); + } + memset(q->backlogs, 0, q->flows_cnt * sizeof(u32)); + sch->q.qlen = 0; } static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { @@ -299,6 +317,7 @@ static const 
struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -329,6 +348,12 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { + u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); + + q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); @@ -448,6 +473,11 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->flows_cnt)) goto nla_put_failure; + if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, + codel_time_to_us(q->cparams.ce_threshold))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: @@ -466,6 +496,7 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.drop_overlimit = q->drop_overlimit; st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; st.qdisc_stats.new_flow_count = q->new_flow_count; + st.qdisc_stats.ce_mark = q->cstats.ce_mark; list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; @@ -598,7 +629,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .enqueue = fq_codel_enqueue, .dequeue = fq_codel_dequeue, .peek = qdisc_peek_dequeued, - .drop = fq_codel_drop, + .drop = fq_codel_qdisc_drop, .init = fq_codel_init, .reset = fq_codel_reset, .destroy = fq_codel_destroy, diff --git a/kernel/net/sched/sch_generic.c b/kernel/net/sched/sch_generic.c index 1e346523f..47ef1b11b 100644 --- a/kernel/net/sched/sch_generic.c +++ b/kernel/net/sched/sch_generic.c @@ -416,33 +416,25 @@ struct Qdisc noop_qdisc = { }; EXPORT_SYMBOL(noop_qdisc); -static 
struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { +static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt) +{ + /* register_qdisc() assigns a default of noop_enqueue if unset, + * but __dev_queue_xmit() treats noqueue only as such + * if this is NULL - so clear it here. */ + qdisc->enqueue = NULL; + return 0; +} + +struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, + .init = noqueue_init, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; -static struct Qdisc noqueue_qdisc; -static struct netdev_queue noqueue_netdev_queue = { - .qdisc = &noqueue_qdisc, - .qdisc_sleeping = &noqueue_qdisc, -}; - -static struct Qdisc noqueue_qdisc = { - .enqueue = NULL, - .dequeue = noop_dequeue, - .flags = TCQ_F_BUILTIN, - .ops = &noqueue_qdisc_ops, - .list = LIST_HEAD_INIT(noqueue_qdisc.list), - .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), - .dev_queue = &noqueue_netdev_queue, - .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock), -}; - - static const u8 prio2band[TC_PRIO_MAX + 1] = { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; @@ -666,8 +658,10 @@ static void qdisc_rcu_free(struct rcu_head *head) { struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); - if (qdisc_is_percpu_stats(qdisc)) + if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); + free_percpu(qdisc->cpu_qstats); + } kfree((char *) qdisc - qdisc->padded); } @@ -733,18 +727,19 @@ static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { - struct Qdisc *qdisc = &noqueue_qdisc; + struct Qdisc *qdisc; + const struct Qdisc_ops *ops = default_qdisc_ops; - if (dev->tx_queue_len) { - qdisc = qdisc_create_dflt(dev_queue, - default_qdisc_ops, TC_H_ROOT); - if (!qdisc) { - netdev_info(dev, "activation failed\n"); - return; - } - if (!netif_is_multiqueue(dev)) - qdisc->flags |= TCQ_F_ONETXQUEUE; + if (dev->priv_flags & IFF_NO_QUEUE) + ops = 
&noqueue_qdisc_ops; + + qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT); + if (!qdisc) { + netdev_info(dev, "activation failed\n"); + return; } + if (!netif_is_multiqueue(dev)) + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue->qdisc_sleeping = qdisc; } @@ -755,7 +750,8 @@ static void attach_default_qdiscs(struct net_device *dev) txq = netdev_get_tx_queue(dev, 0); - if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) { + if (!netif_is_multiqueue(dev) || + dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); dev->qdisc = txq->qdisc_sleeping; atomic_inc(&dev->qdisc->refcnt); @@ -779,7 +775,7 @@ static void transition_one_qdisc(struct net_device *dev, clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); - if (need_watchdog_p && new_qdisc != &noqueue_qdisc) { + if (need_watchdog_p) { dev_queue->trans_start = 0; *need_watchdog_p = 1; } diff --git a/kernel/net/sched/sch_gred.c b/kernel/net/sched/sch_gred.c index 634529e0c..80105109f 100644 --- a/kernel/net/sched/sch_gred.c +++ b/kernel/net/sched/sch_gred.c @@ -165,7 +165,8 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) * if no default DP has been configured. This * allows for DP flows to be left untouched. 
*/ - if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len) + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= + sch->limit)) return qdisc_enqueue_tail(skb, sch); else goto drop; @@ -397,7 +398,10 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, q->DP = dp; q->prio = prio; - q->limit = ctl->limit; + if (ctl->limit > sch->limit) + q->limit = sch->limit; + else + q->limit = ctl->limit; if (q->backlog == 0) red_end_of_idle_period(&q->vars); @@ -414,6 +418,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, + [TCA_GRED_LIMIT] = { .type = NLA_U32 }, }; static int gred_change(struct Qdisc *sch, struct nlattr *opt) @@ -433,11 +438,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) if (err < 0) return err; - if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) + if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { + if (tb[TCA_GRED_LIMIT] != NULL) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); return gred_change_table_def(sch, opt); + } if (tb[TCA_GRED_PARMS] == NULL || - tb[TCA_GRED_STAB] == NULL) + tb[TCA_GRED_STAB] == NULL || + tb[TCA_GRED_LIMIT] != NULL) return -EINVAL; max_P = tb[TCA_GRED_MAX_P] ? 
nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; @@ -501,6 +510,12 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) return -EINVAL; + if (tb[TCA_GRED_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); + else + sch->limit = qdisc_dev(sch)->tx_queue_len + * psched_mtu(qdisc_dev(sch)); + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } @@ -531,6 +546,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit)) + goto nla_put_failure; + parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; diff --git a/kernel/net/sched/sch_hfsc.c b/kernel/net/sched/sch_hfsc.c index e6c7416d0..b7ebe2c87 100644 --- a/kernel/net/sched/sch_hfsc.c +++ b/kernel/net/sched/sch_hfsc.c @@ -1165,7 +1165,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); - while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { + while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/kernel/net/sched/sch_hhf.c b/kernel/net/sched/sch_hhf.c index 15d3aabfe..86b04e31e 100644 --- a/kernel/net/sched/sch_hhf.c +++ b/kernel/net/sched/sch_hhf.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -176,22 +175,6 @@ static u32 hhf_time_stamp(void) return jiffies; } -static unsigned int skb_hash(const struct hhf_sched_data *q, - const struct sk_buff *skb) -{ - struct flow_keys keys; - unsigned int hash; - - if (skb->sk && skb->sk->sk_hash) - return skb->sk->sk_hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); - return hash; -} - /* Looks up a 
heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, @@ -280,7 +263,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) } /* Get hashed flow-id of the skb. */ - hash = skb_hash(q, skb); + hash = skb_get_hash_perturb(skb, q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; @@ -385,6 +368,15 @@ static unsigned int hhf_drop(struct Qdisc *sch) return bucket - q->buckets; } +static unsigned int hhf_qdisc_drop(struct Qdisc *sch) +{ + unsigned int prev_backlog; + + prev_backlog = sch->qstats.backlog; + hhf_drop(sch); + return prev_backlog - sch->qstats.backlog; +} + static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); @@ -713,7 +705,7 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, - .drop = hhf_drop, + .drop = hhf_qdisc_drop, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, diff --git a/kernel/net/sched/sch_htb.c b/kernel/net/sched/sch_htb.c index f1acb0f60..15ccd7f8f 100644 --- a/kernel/net/sched/sch_htb.c +++ b/kernel/net/sched/sch_htb.c @@ -229,7 +229,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { + while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: @@ -1048,11 +1048,9 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); - else { + else q->direct_qlen = qdisc_dev(sch)->tx_queue_len; - if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ - q->direct_qlen = 2; - } + if ((q->rate2quantum = gopt->rate2quantum) < 
1) q->rate2quantum = 1; q->defcls = gopt->defcls; diff --git a/kernel/net/sched/sch_ingress.c b/kernel/net/sched/sch_ingress.c index 4cdbfb856..e7c648fa9 100644 --- a/kernel/net/sched/sch_ingress.c +++ b/kernel/net/sched/sch_ingress.c @@ -12,16 +12,10 @@ #include #include #include + #include #include - -struct ingress_qdisc_data { - struct tcf_proto __rcu *filter_list; -}; - -/* ------------------------- Class/flow operations ------------------------- */ - static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; @@ -49,57 +43,24 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct ingress_qdisc_data *p = qdisc_priv(sch); - - return &p->filter_list; -} - -/* --------------------------- Qdisc operations ---------------------------- */ + struct net_device *dev = qdisc_dev(sch); -static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct ingress_qdisc_data *p = qdisc_priv(sch); - struct tcf_result res; - struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result; - - result = tc_classify(skb, fl, &res); - - qdisc_bstats_update(sch, skb); - switch (result) { - case TC_ACT_SHOT: - result = TC_ACT_SHOT; - qdisc_qstats_drop(sch); - break; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - result = TC_ACT_STOLEN; - break; - case TC_ACT_RECLASSIFY: - case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - default: - result = TC_ACT_OK; - break; - } - - return result; + return &dev->ingress_cl_list; } -/* ------------------------------------------------------------- */ - static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { net_inc_ingress_queue(); + sch->flags |= TCQ_F_CPUSTATS; return 0; } static void ingress_destroy(struct Qdisc *sch) { - struct ingress_qdisc_data *p = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - tcf_destroy_chain(&p->filter_list); + 
tcf_destroy_chain(&dev->ingress_cl_list); net_dec_ingress_queue(); } @@ -110,6 +71,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; + return nla_nest_end(skb, nest); nla_put_failure: @@ -130,8 +92,6 @@ static const struct Qdisc_class_ops ingress_class_ops = { static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", - .priv_size = sizeof(struct ingress_qdisc_data), - .enqueue = ingress_enqueue, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -148,6 +108,7 @@ static void __exit ingress_module_exit(void) unregister_qdisc(&ingress_qdisc_ops); } -module_init(ingress_module_init) -module_exit(ingress_module_exit) +module_init(ingress_module_init); +module_exit(ingress_module_exit); + MODULE_LICENSE("GPL"); diff --git a/kernel/net/sched/sch_mq.c b/kernel/net/sched/sch_mq.c index f3cbaecd2..3e82f047c 100644 --- a/kernel/net/sched/sch_mq.c +++ b/kernel/net/sched/sch_mq.c @@ -63,7 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) if (qdisc == NULL) goto err; priv->qdiscs[ntx] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->flags |= TCQ_F_MQROOT; @@ -156,7 +156,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; diff --git a/kernel/net/sched/sch_mqprio.c b/kernel/net/sched/sch_mqprio.c index 3811a7454..ad70ecf57 100644 --- a/kernel/net/sched/sch_mqprio.c +++ b/kernel/net/sched/sch_mqprio.c @@ -132,7 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) goto err; } priv->qdiscs[i] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } /* If the mqprio 
options indicate that hardware should own @@ -209,7 +209,7 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); diff --git a/kernel/net/sched/sch_multiq.c b/kernel/net/sched/sch_multiq.c index 42dd21887..4e904ca0a 100644 --- a/kernel/net/sched/sch_multiq.c +++ b/kernel/net/sched/sch_multiq.c @@ -46,7 +46,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tc_classify(skb, fl, &res); + err = tc_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/kernel/net/sched/sch_netem.c b/kernel/net/sched/sch_netem.c index 956ead2ca..5abd1d9de 100644 --- a/kernel/net/sched/sch_netem.c +++ b/kernel/net/sched/sch_netem.c @@ -440,9 +440,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { struct Qdisc *rootq = qdisc_root(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ - q->duplicate = 0; - qdisc_enqueue_root(skb2, rootq); + q->duplicate = 0; + rootq->enqueue(skb2, rootq); q->duplicate = dupsave; } diff --git a/kernel/net/sched/sch_plug.c b/kernel/net/sched/sch_plug.c index 89f8fcf73..5abfe4467 100644 --- a/kernel/net/sched/sch_plug.c +++ b/kernel/net/sched/sch_plug.c @@ -130,12 +130,8 @@ static int plug_init(struct Qdisc *sch, struct nlattr *opt) q->unplug_indefinite = false; if (opt == NULL) { - /* We will set a default limit of 100 pkts (~150kB) - * in case tx_queue_len is not available. The - * default value is completely arbitrary. - */ - u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? 
: 100; - q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); + q->limit = qdisc_dev(sch)->tx_queue_len + * psched_mtu(qdisc_dev(sch)); } else { struct tc_plug_qopt *ctl = nla_data(opt); @@ -216,6 +212,7 @@ static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .peek = qdisc_peek_head, .init = plug_init, .change = plug_change, + .reset = qdisc_reset_queue, .owner = THIS_MODULE, }; diff --git a/kernel/net/sched/sch_prio.c b/kernel/net/sched/sch_prio.c index 8e5cd34aa..ba6487f27 100644 --- a/kernel/net/sched/sch_prio.c +++ b/kernel/net/sched/sch_prio.c @@ -42,7 +42,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tc_classify(skb, fl, &res); + err = tc_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/kernel/net/sched/sch_qfq.c b/kernel/net/sched/sch_qfq.c index 3ec7e88a4..3dc3a6e56 100644 --- a/kernel/net/sched/sch_qfq.c +++ b/kernel/net/sched/sch_qfq.c @@ -186,7 +186,6 @@ struct qfq_sched { u64 oldV, V; /* Precise virtual times. */ struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ - u32 num_active_agg; /* Num. 
of active aggregates */ u32 wsum; /* weight sum */ u32 iwsum; /* inverse weight sum */ @@ -339,8 +338,7 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - if (!hlist_unhashed(&agg->nonfull_next)) - hlist_del_init(&agg->nonfull_next); + hlist_del_init(&agg->nonfull_next); q->wsum -= agg->class_weight; if (q->wsum != 0) q->iwsum = ONE_FP / q->wsum; @@ -719,7 +717,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/kernel/net/sched/sch_sfb.c b/kernel/net/sched/sch_sfb.c index 5819dd826..5bbb6332e 100644 --- a/kernel/net/sched/sch_sfb.c +++ b/kernel/net/sched/sch_sfb.c @@ -26,7 +26,6 @@ #include #include #include -#include /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) @@ -259,7 +258,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, struct tcf_result res; int result; - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -285,9 +284,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) int i; u32 p_min = ~0; u32 minqlen = ~0; - u32 r, slot, salt, sfbhash; + u32 r, sfbhash; + u32 slot = q->slot; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { qdisc_qstats_overlimit(sch); @@ -309,22 +308,17 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) fl = rcu_dereference_bh(q->filter_list); if (fl) { + u32 salt; + /* If using external classifiers, get result and record it. 
*/ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; - keys.src = salt; - keys.dst = 0; - keys.ports = 0; + sfbhash = jhash_1word(salt, q->bins[slot].perturbation); } else { - skb_flow_dissect(skb, &keys); + sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation); } - slot = q->slot; - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -356,10 +350,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); + sfbhash = skb_get_hash_perturb(skb, + q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -510,7 +502,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) limit = ctl->limit; if (limit == 0) - limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); + limit = qdisc_dev(sch)->tx_queue_len; child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); if (IS_ERR(child)) diff --git a/kernel/net/sched/sch_sfq.c b/kernel/net/sched/sch_sfq.c index b877140be..3abab534e 100644 --- a/kernel/net/sched/sch_sfq.c +++ b/kernel/net/sched/sch_sfq.c @@ -23,7 +23,6 @@ #include #include #include -#include #include @@ -156,30 +155,10 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index return &q->dep[val - SFQ_MAX_FLOWS]; } -/* - * In order to be able to quickly rehash our queue when timer changes - * q->perturbation, we store flow_keys in skb->cb[] - */ -struct sfq_skb_cb { - struct flow_keys keys; -}; - -static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) -{ - qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); - return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; -} - static unsigned 
int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { - const struct flow_keys *keys = &sfq_skb_cb(skb)->keys; - unsigned int hash; - - hash = jhash_3words((__force u32)keys->dst, - (__force u32)keys->src ^ keys->ip_proto, - (__force u32)keys->ports, q->perturbation); - return hash & (q->divisor - 1); + return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -196,13 +175,11 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return TC_H_MIN(skb->priority); fl = rcu_dereference_bh(q->filter_list); - if (!fl) { - skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); + if (!fl) return sfq_hash(q, skb) + 1; - } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -329,10 +306,10 @@ drop: len = qdisc_pkt_len(skb); slot->backlog -= len; sfq_dec(q, x); - kfree_skb(skb); sch->q.qlen--; qdisc_qstats_drop(sch); qdisc_qstats_backlog_dec(sch, skb); + kfree_skb(skb); return len; } diff --git a/kernel/net/sctp/associola.c b/kernel/net/sctp/associola.c index 197c3f59e..559afd0ee 100644 --- a/kernel/net/sctp/associola.c +++ b/kernel/net/sctp/associola.c @@ -1208,20 +1208,22 @@ void sctp_assoc_update(struct sctp_association *asoc, * within this document. * * Our basic strategy is to round-robin transports in priorities - * according to sctp_state_prio_map[] e.g., if no such + * according to sctp_trans_score() e.g., if no such * transport with state SCTP_ACTIVE exists, round-robin through * SCTP_UNKNOWN, etc. You get the picture. 
*/ -static const u8 sctp_trans_state_to_prio_map[] = { - [SCTP_ACTIVE] = 3, /* best case */ - [SCTP_UNKNOWN] = 2, - [SCTP_PF] = 1, - [SCTP_INACTIVE] = 0, /* worst case */ -}; - static u8 sctp_trans_score(const struct sctp_transport *trans) { - return sctp_trans_state_to_prio_map[trans->state]; + switch (trans->state) { + case SCTP_ACTIVE: + return 3; /* best case */ + case SCTP_UNKNOWN: + return 2; + case SCTP_PF: + return 1; + default: /* case SCTP_INACTIVE */ + return 0; /* worst case */ + } } static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1, @@ -1588,7 +1590,7 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc, /* Set an association id for a given association */ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) { - bool preload = !!(gfp & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp); int ret; /* If the id is already assigned, keep it. */ diff --git a/kernel/net/sctp/auth.c b/kernel/net/sctp/auth.c index 4f15b7d73..1543e39f4 100644 --- a/kernel/net/sctp/auth.c +++ b/kernel/net/sctp/auth.c @@ -809,8 +809,8 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, if (!has_sha1) return -EINVAL; - memcpy(ep->auth_hmacs_list->hmac_ids, &hmacs->shmac_idents[0], - hmacs->shmac_num_idents * sizeof(__u16)); + for (i = 0; i < hmacs->shmac_num_idents; i++) + ep->auth_hmacs_list->hmac_ids[i] = htons(hmacs->shmac_idents[i]); ep->auth_hmacs_list->param_hdr.length = htons(sizeof(sctp_paramhdr_t) + hmacs->shmac_num_idents * sizeof(__u16)); return 0; diff --git a/kernel/net/sctp/ipv6.c b/kernel/net/sctp/ipv6.c index 0e4198ee2..ec529121f 100644 --- a/kernel/net/sctp/ipv6.c +++ b/kernel/net/sctp/ipv6.c @@ -209,6 +209,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 *fl6 = &transport->fl.u.ip6; + int res; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, 
&fl6->daddr); @@ -220,7 +221,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); - return ip6_xmit(sk, skb, fl6, np->opt, np->tclass); + rcu_read_lock(); + res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass); + rcu_read_unlock(); + return res; } /* Returns the dst cache entry for the given source and destination ip @@ -262,7 +266,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, pr_debug("src=%pI6 - ", &fl6->saddr); } - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); + dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!asoc || saddr) goto out; @@ -316,14 +323,13 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, } } } - rcu_read_unlock(); - if (baddr) { fl6->saddr = baddr->v6.sin6_addr; fl6->fl6_sport = baddr->v6.sin6_port; - final_p = fl6_update_dst(fl6, np->opt, &final); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); dst = ip6_dst_lookup_flow(sk, fl6, final_p); } + rcu_read_unlock(); out: if (!IS_ERR_OR_NULL(dst)) { @@ -331,8 +337,9 @@ out: rt = (struct rt6_info *)dst; t->dst = dst; - t->dst_cookie = rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; - pr_debug("rt6_dst:%pI6 rt6_src:%pI6\n", &rt->rt6i_dst.addr, + t->dst_cookie = rt6_get_cookie(rt); + pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", + &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &fl6->saddr); } else { t->dst = NULL; @@ -634,8 +641,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct sock *newsk; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; + struct ipv6_txoptions *opt; - newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) goto out; @@ -653,6 +661,13 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + rcu_read_lock(); + opt = rcu_dereference(np->opt); + if (opt) + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + rcu_read_unlock(); + /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() * and getpeername(). 
*/ diff --git a/kernel/net/sctp/outqueue.c b/kernel/net/sctp/outqueue.c index 7e8f0a117..c0380cfb1 100644 --- a/kernel/net/sctp/outqueue.c +++ b/kernel/net/sctp/outqueue.c @@ -324,6 +324,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); + sctp_chunk_hold(chunk); sctp_outq_tail_data(q, chunk); if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); @@ -1251,6 +1252,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) */ sack_a_rwnd = ntohl(sack->a_rwnd); + asoc->peer.zero_window_announced = !sack_a_rwnd; outstanding = q->outstanding_bytes; if (outstanding < sack_a_rwnd) diff --git a/kernel/net/sctp/protocol.c b/kernel/net/sctp/protocol.c index e13c3c3ea..8b4ff3156 100644 --- a/kernel/net/sctp/protocol.c +++ b/kernel/net/sctp/protocol.c @@ -60,6 +60,8 @@ #include #include +#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) + /* Global data structures. 
*/ struct sctp_globals sctp_globals __read_mostly; @@ -487,23 +489,43 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { + struct net_device *odev; + if (!laddr->valid) continue; - if ((laddr->state == SCTP_ADDR_SRC) && - (AF_INET == laddr->a.sa.sa_family)) { - fl4->fl4_sport = laddr->a.v4.sin_port; - flowi4_update_output(fl4, - asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS(asoc->base.sk), - daddr->v4.sin_addr.s_addr, - laddr->a.v4.sin_addr.s_addr); - - rt = ip_route_output_key(sock_net(sk), fl4); - if (!IS_ERR(rt)) { - dst = &rt->dst; - goto out_unlock; - } + if (laddr->state != SCTP_ADDR_SRC || + AF_INET != laddr->a.sa.sa_family) + continue; + + fl4->fl4_sport = laddr->a.v4.sin_port; + flowi4_update_output(fl4, + asoc->base.sk->sk_bound_dev_if, + RT_CONN_FLAGS(asoc->base.sk), + daddr->v4.sin_addr.s_addr, + laddr->a.v4.sin_addr.s_addr); + + rt = ip_route_output_key(sock_net(sk), fl4); + if (IS_ERR(rt)) + continue; + + if (!dst) + dst = &rt->dst; + + /* Ensure the src address belongs to the output + * interface. + */ + odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, + false); + if (!odev || odev->ifindex != fl4->flowi4_oif) { + if (&rt->dst != dst) + dst_release(&rt->dst); + continue; } + + if (dst != &rt->dst) + dst_release(dst); + dst = &rt->dst; + break; } out_unlock: @@ -550,7 +572,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, struct sctp_association *asoc) { struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, - sk->sk_prot); + sk->sk_prot, 0); struct inet_sock *newinet; if (!newsk) @@ -1332,6 +1354,8 @@ static __init int sctp_init(void) unsigned long limit; int max_share; int order; + int num_entries; + int max_entry_order; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); @@ -1384,14 +1408,24 @@ static __init int sctp_init(void) /* Size and allocate the association hash table. 
* The methodology is similar to that of the tcp hash tables. + * Though not identical. Start by getting a goal size */ if (totalram_pages >= (128 * 1024)) goal = totalram_pages >> (22 - PAGE_SHIFT); else goal = totalram_pages >> (24 - PAGE_SHIFT); - for (order = 0; (1UL << order) < goal; order++) - ; + /* Then compute the page order for said goal */ + order = get_order(goal); + + /* Now compute the required page order for the maximum sized table we + * want to create + */ + max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES * + sizeof(struct sctp_bind_hashbucket)); + + /* Limit the page order by that maximum hash table size */ + order = min(order, max_entry_order); do { sctp_assoc_hashsize = (1UL << order) * PAGE_SIZE / @@ -1425,20 +1459,35 @@ static __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain); } - /* Allocate and initialize the SCTP port hash table. */ + /* Allocate and initialize the SCTP port hash table. + * Note that order is initalized to start at the max sized + * table we want to support. 
If we can't get that many pages + * reduce the order and try again + */ do { - sctp_port_hashsize = (1UL << order) * PAGE_SIZE / - sizeof(struct sctp_bind_hashbucket); - if ((sctp_port_hashsize > (64 * 1024)) && order > 0) - continue; sctp_port_hashtable = (struct sctp_bind_hashbucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); + if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } + + /* Now compute the number of entries that will fit in the + * port hash space we allocated + */ + num_entries = (1UL << order) * PAGE_SIZE / + sizeof(struct sctp_bind_hashbucket); + + /* And finish by rounding it down to the nearest power of two + * this wastes some memory of course, but its needed because + * the hash function operates based on the assumption that + * that the number of entries is a power of two + */ + sctp_port_hashsize = rounddown_pow_of_two(num_entries); + for (i = 0; i < sctp_port_hashsize; i++) { spin_lock_init(&sctp_port_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); diff --git a/kernel/net/sctp/sm_make_chunk.c b/kernel/net/sctp/sm_make_chunk.c index 06320c8c1..5d6a03fad 100644 --- a/kernel/net/sctp/sm_make_chunk.c +++ b/kernel/net/sctp/sm_make_chunk.c @@ -1652,7 +1652,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, /* Set an expiration time for the cookie. */ cookie->c.expiration = ktime_add(asoc->cookie_life, - ktime_get()); + ktime_get_real()); /* Copy the peer's init packet. 
*/ memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr, @@ -1780,7 +1780,7 @@ no_hmac: if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) kt = skb_get_ktime(skb); else - kt = ktime_get(); + kt = ktime_get_real(); if (!asoc && ktime_before(bear_cookie->expiration, kt)) { /* @@ -2494,7 +2494,7 @@ static int sctp_process_param(struct sctp_association *asoc, __u16 sat; int retval = 1; sctp_scope_t scope; - time_t stale; + u32 stale; struct sctp_af *af; union sctp_addr_param *addr_param; struct sctp_transport *t; @@ -3090,8 +3090,19 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, sctp_assoc_set_primary(asoc, asconf->transport); sctp_assoc_del_nonprimary_peers(asoc, asconf->transport); - } else - sctp_assoc_del_peer(asoc, &addr); + return SCTP_ERROR_NO_ERROR; + } + + /* If the address is not part of the association, the + * ASCONF-ACK with Error Cause Indication Parameter + * which including cause of Unresolvable Address should + * be sent. + */ + peer = sctp_assoc_lookup_paddr(asoc, &addr); + if (!peer) + return SCTP_ERROR_DNS_FAILED; + + sctp_assoc_rm_peer(asoc, peer); break; case SCTP_PARAM_SET_PRIMARY: /* ADDIP Section 4.2.4 @@ -3132,11 +3143,18 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, case SCTP_PARAM_IPV4_ADDRESS: if (length != sizeof(sctp_ipv4addr_param_t)) return false; + /* ensure there is only one addr param and it's in the + * beginning of addip_hdr params, or we reject it. 
+ */ + if (param.v != addip->addip_hdr.params) + return false; addr_param_seen = true; break; case SCTP_PARAM_IPV6_ADDRESS: if (length != sizeof(sctp_ipv6addr_param_t)) return false; + if (param.v != addip->addip_hdr.params) + return false; addr_param_seen = true; break; case SCTP_PARAM_ADD_IP: diff --git a/kernel/net/sctp/sm_sideeffect.c b/kernel/net/sctp/sm_sideeffect.c index fef2acdf4..6098d4c42 100644 --- a/kernel/net/sctp/sm_sideeffect.c +++ b/kernel/net/sctp/sm_sideeffect.c @@ -244,12 +244,13 @@ void sctp_generate_t3_rtx_event(unsigned long peer) int error; struct sctp_transport *transport = (struct sctp_transport *) peer; struct sctp_association *asoc = transport->asoc; - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); /* Check whether a task is in the sock. */ - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. 
*/ @@ -272,10 +273,10 @@ void sctp_generate_t3_rtx_event(unsigned long peer) transport, GFP_ATOMIC); if (error) - asoc->base.sk->sk_err = -error; + sk->sk_err = -error; out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_transport_put(transport); } @@ -285,11 +286,12 @@ out_unlock: static void sctp_generate_timeout_event(struct sctp_association *asoc, sctp_event_timeout_t timeout_type) { - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); int error = 0; - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy: timer %d\n", __func__, timeout_type); @@ -312,10 +314,10 @@ static void sctp_generate_timeout_event(struct sctp_association *asoc, (void *)timeout_type, GFP_ATOMIC); if (error) - asoc->base.sk->sk_err = -error; + sk->sk_err = -error; out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_association_put(asoc); } @@ -365,10 +367,11 @@ void sctp_generate_heartbeat_event(unsigned long data) int error = 0; struct sctp_transport *transport = (struct sctp_transport *) data; struct sctp_association *asoc = transport->asoc; - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. 
*/ @@ -388,11 +391,11 @@ void sctp_generate_heartbeat_event(unsigned long data) asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); - if (error) - asoc->base.sk->sk_err = -error; + if (error) + sk->sk_err = -error; out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_transport_put(transport); } @@ -403,10 +406,11 @@ void sctp_generate_proto_unreach_event(unsigned long data) { struct sctp_transport *transport = (struct sctp_transport *) data; struct sctp_association *asoc = transport->asoc; - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ @@ -427,7 +431,7 @@ void sctp_generate_proto_unreach_event(unsigned long data) asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_association_put(asoc); } @@ -702,7 +706,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, * outstanding data and rely on the retransmission limit be reached * to shutdown the association. */ - if (t->asoc->state != SCTP_STATE_SHUTDOWN_PENDING) + if (t->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) t->asoc->overall_error_count = 0; /* Clear the hb_sent flag to signal that we had a good @@ -954,7 +958,7 @@ static void sctp_cmd_del_non_primary(struct sctp_association *asoc) t = list_entry(pos, struct sctp_transport, transports); if (!sctp_cmp_addr_exact(&t->ipaddr, &asoc->peer.primary_addr)) { - sctp_assoc_del_peer(asoc, &t->ipaddr); + sctp_assoc_rm_peer(asoc, t); } } } diff --git a/kernel/net/sctp/sm_statefuns.c b/kernel/net/sctp/sm_statefuns.c index 3ee27b770..22c2bf367 100644 --- a/kernel/net/sctp/sm_statefuns.c +++ b/kernel/net/sctp/sm_statefuns.c @@ -853,7 +853,7 @@ nomem: /* * Respond to a normal COOKIE ACK chunk. 
- * We are the side that is being asked for an association. + * We are the side that is asking for an association. * * RFC 2960 5.1 Normal Establishment of an Association * @@ -2306,7 +2306,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - time_t stale; + u32 stale; sctp_cookie_preserve_param_t bht; sctp_errhdr_t *err; struct sctp_chunk *reply; @@ -4829,7 +4829,8 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort( retval = SCTP_DISPOSITION_CONSUME; - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + if (abort) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); /* Even if we can't send the ABORT due to low memory delete the * TCB. This is a departure from our typical NOMEM handling. @@ -4966,7 +4967,8 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort( SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); retval = SCTP_DISPOSITION_CONSUME; - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + if (abort) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_CLOSED)); @@ -5412,7 +5414,8 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_T3_RTX_EXPIREDS); if (asoc->overall_error_count >= asoc->max_retrans) { - if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) { + if (asoc->peer.zero_window_announced && + asoc->state == SCTP_STATE_SHUTDOWN_PENDING) { /* * We are here likely because the receiver had its rwnd * closed for a while and we have not been able to diff --git a/kernel/net/sctp/socket.c b/kernel/net/sctp/socket.c index 5f6c4e613..be1489fc3 100644 --- a/kernel/net/sctp/socket.c +++ b/kernel/net/sctp/socket.c @@ -972,7 +972,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, return -EFAULT; /* Alloc space for the address array in kernel memory. 
*/ - kaddrs = kmalloc(addrs_size, GFP_KERNEL); + kaddrs = kmalloc(addrs_size, GFP_USER | __GFP_NOWARN); if (unlikely(!kaddrs)) return -ENOMEM; @@ -1301,8 +1301,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk, int addrs_size, sctp_assoc_t *assoc_id) { - int err = 0; struct sockaddr *kaddrs; + gfp_t gfp = GFP_KERNEL; + int err = 0; pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n", __func__, sk, addrs, addrs_size); @@ -1315,7 +1316,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk, return -EFAULT; /* Alloc space for the address array in kernel memory. */ - kaddrs = kmalloc(addrs_size, GFP_KERNEL); + if (sk->sk_socket->file) + gfp = GFP_USER | __GFP_NOWARN; + kaddrs = kmalloc(addrs_size, gfp); if (unlikely(!kaddrs)) return -ENOMEM; @@ -1513,8 +1516,7 @@ static void sctp_close(struct sock *sk, long timeout) struct sctp_chunk *chunk; chunk = sctp_make_abort_user(asoc, NULL, 0); - if (chunk) - sctp_primitive_ABORT(net, asoc, chunk); + sctp_primitive_ABORT(net, asoc, chunk); } else sctp_primitive_SHUTDOWN(net, asoc, NULL); } @@ -1952,8 +1954,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Now send the (possibly) fragmented message. */ list_for_each_entry(chunk, &datamsg->chunks, frag_list) { - sctp_chunk_hold(chunk); - /* Do accounting for the write space. */ sctp_set_owner_w(chunk); @@ -1966,15 +1966,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) * breaks. */ err = sctp_primitive_SEND(net, asoc, datamsg); + sctp_datamsg_put(datamsg); /* Did the lower layer accept the chunk? 
*/ - if (err) { - sctp_datamsg_free(datamsg); + if (err) goto out_free; - } pr_debug("%s: we sent primitively\n", __func__); - sctp_datamsg_put(datamsg); err = msg_len; if (unlikely(wait_connect)) { @@ -2121,12 +2119,6 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (sp->subscribe.sctp_data_io_event) sctp_ulpevent_read_sndrcvinfo(event, msg); -#if 0 - /* FIXME: we should be calling IP/IPv6 layers. */ - if (sk->sk_protinfo.af_inet.cmsg_flags) - ip_cmsg_recv(msg, skb); -#endif - err = copied; /* If skb's length exceeds the user's buffer, update the skb and @@ -2206,12 +2198,6 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) return -EFAULT; - if (sctp_sk(sk)->subscribe.sctp_data_io_event) - pr_warn_ratelimited(DEPRECATED "%s (pid %d) " - "Requested SCTP_SNDRCVINFO event.\n" - "Use SCTP_RCVINFO through SCTP_RECVRCVINFO option instead.\n", - current->comm, task_pid_nr(current)); - /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, * if there is no data to be sent or retransmit, the stack will * immediately send up this notification. 
@@ -4487,7 +4473,7 @@ static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval } newfile = sock_alloc_file(newsock, 0, NULL); - if (unlikely(IS_ERR(newfile))) { + if (IS_ERR(newfile)) { put_unused_fd(retval); sock_release(newsock); return PTR_ERR(newfile); @@ -4940,7 +4926,7 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, to = optval + offsetof(struct sctp_getaddrs, addrs); space_left = len - offsetof(struct sctp_getaddrs, addrs); - addrs = kmalloc(space_left, GFP_KERNEL); + addrs = kmalloc(space_left, GFP_USER | __GFP_NOWARN); if (!addrs) return -ENOMEM; @@ -5556,6 +5542,7 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, struct sctp_hmac_algo_param *hmacs; __u16 data_len = 0; u32 num_idents; + int i; if (!ep->auth_enable) return -EACCES; @@ -5573,8 +5560,12 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, return -EFAULT; if (put_user(num_idents, &p->shmac_num_idents)) return -EFAULT; - if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len)) - return -EFAULT; + for (i = 0; i < num_idents; i++) { + __u16 hmacid = ntohs(hmacs->hmac_ids[i]); + + if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16))) + return -EFAULT; + } return 0; } @@ -5789,7 +5780,7 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num; - ids = kmalloc(len, GFP_KERNEL); + ids = kmalloc(len, GFP_USER | __GFP_NOWARN); if (unlikely(!ids)) return -ENOMEM; @@ -6470,7 +6461,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sctp_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* * Since the socket is not locked, the buffer * might be made available after the writeable check and @@ -6654,6 +6645,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if 
(cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_SACK_IMMEDIATELY | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -6677,6 +6669,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_SACK_IMMEDIATELY | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -6813,26 +6806,30 @@ no_packet: static void __sctp_write_space(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; - struct socket *sock = sk->sk_socket; - if ((sctp_wspace(asoc) > 0) && sock) { - if (waitqueue_active(&asoc->wait)) - wake_up_interruptible(&asoc->wait); + if (sctp_wspace(asoc) <= 0) + return; - if (sctp_writeable(sk)) { - wait_queue_head_t *wq = sk_sleep(sk); + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); - if (wq && waitqueue_active(wq)) - wake_up_interruptible(wq); + if (sctp_writeable(sk)) { + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq) { + if (waitqueue_active(&wq->wait)) + wake_up_interruptible(&wq->wait); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. * We have not tested with it yet. 
*/ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, - SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); } + rcu_read_unlock(); } } @@ -7175,6 +7172,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; newsk->sk_flags = sk->sk_flags; + newsk->sk_tsflags = sk->sk_tsflags; newsk->sk_no_check_tx = sk->sk_no_check_tx; newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; @@ -7207,6 +7205,11 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->mc_ttl = 1; newinet->mc_index = 0; newinet->mc_list = NULL; + + if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); + + security_sk_clone(sk, newsk); } static inline void sctp_copy_descendant(struct sock *sk_to, @@ -7387,6 +7390,13 @@ struct proto sctp_prot = { #if IS_ENABLED(CONFIG_IPV6) +#include +static void sctp_v6_destroy_sock(struct sock *sk) +{ + sctp_destroy_sock(sk); + inet6_destroy_sock(sk); +} + struct proto sctpv6_prot = { .name = "SCTPv6", .owner = THIS_MODULE, @@ -7396,7 +7406,7 @@ struct proto sctpv6_prot = { .accept = sctp_accept, .ioctl = sctp_ioctl, .init = sctp_init_sock, - .destroy = sctp_destroy_sock, + .destroy = sctp_v6_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, diff --git a/kernel/net/sctp/sysctl.c b/kernel/net/sctp/sysctl.c index 26d50c565..3e0fc5127 100644 --- a/kernel/net/sctp/sysctl.c +++ b/kernel/net/sctp/sysctl.c @@ -320,7 +320,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, struct ctl_table tbl; bool changed = false; char *none = "none"; - char tmp[8]; + char tmp[8] = {0}; int ret; memset(&tbl, 0, sizeof(struct ctl_table)); diff --git a/kernel/net/sctp/transport.c b/kernel/net/sctp/transport.c index a0a431824..aab9e3f29 100644 --- a/kernel/net/sctp/transport.c +++ b/kernel/net/sctp/transport.c @@ -331,7 +331,7 @@ void 
sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) * 1/8, rto_alpha would be expressed as 3. */ tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta) - + (((__u32)abs64((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta); + + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta); tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha) + (rtt >> net->sctp.rto_alpha); } else { diff --git a/kernel/net/socket.c b/kernel/net/socket.c index 884e32997..d730ef9df 100644 --- a/kernel/net/socket.c +++ b/kernel/net/socket.c @@ -257,6 +257,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb) } init_waitqueue_head(&wq->wait); wq->fasync_list = NULL; + wq->flags = 0; RCU_INIT_POINTER(ei->socket.wq, wq); ei->socket.state = SS_UNCONNECTED; @@ -373,7 +374,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops); - if (unlikely(IS_ERR(file))) { + if (IS_ERR(file)) { /* drop dentry, keep inode */ ihold(d_inode(path.dentry)); path_put(&path); @@ -576,9 +577,6 @@ void sock_release(struct socket *sock) if (rcu_dereference_protected(sock->wq, 1)->fasync_list) pr_err("%s: fasync list not empty!\n", __func__); - if (test_bit(SOCK_EXTERNALLY_ALLOCATED, &sock->flags)) - return; - this_cpu_sub(sockets_in_use, 1); if (!sock->file) { iput(SOCK_INODE(sock)); @@ -1059,27 +1057,20 @@ static int sock_fasync(int fd, struct file *filp, int on) return 0; } -/* This function may be called only under socket lock or callback_lock or rcu_lock */ +/* This function may be called only under rcu_lock */ -int sock_wake_async(struct socket *sock, int how, int band) +int sock_wake_async(struct socket_wq *wq, int how, int band) { - struct socket_wq *wq; - - if (!sock) + if (!wq || !wq->fasync_list) return -1; - rcu_read_lock(); - wq = rcu_dereference(sock->wq); - if (!wq || !wq->fasync_list) { - rcu_read_unlock(); - return -1; - } + switch (how) { case SOCK_WAKE_WAITD: 
- if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) + if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: - if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; /* fall through */ case SOCK_WAKE_IO: @@ -1089,7 +1080,7 @@ call_kill: case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } - rcu_read_unlock(); + return 0; } EXPORT_SYMBOL(sock_wake_async); @@ -1213,9 +1204,9 @@ int sock_create(int family, int type, int protocol, struct socket **res) } EXPORT_SYMBOL(sock_create); -int sock_create_kern(int family, int type, int protocol, struct socket **res) +int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { - return __sock_create(&init_net, family, type, protocol, res, 1); + return __sock_create(net, family, type, protocol, res, 1); } EXPORT_SYMBOL(sock_create_kern); @@ -1306,7 +1297,7 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, } newfile1 = sock_alloc_file(sock1, flags, NULL); - if (unlikely(IS_ERR(newfile1))) { + if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); goto out_put_unused_both; } @@ -1470,7 +1461,7 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, goto out_put; } newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); - if (unlikely(IS_ERR(newfile))) { + if (IS_ERR(newfile)) { err = PTR_ERR(newfile); put_unused_fd(newfd); sock_release(newsock); @@ -1705,6 +1696,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, msg.msg_name = addr ? 
(struct sockaddr *)&address : NULL; /* We assume all kernel code knows the size of sockaddr_storage */ msg.msg_namelen = 0; + msg.msg_iocb = NULL; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags); diff --git a/kernel/net/sunrpc/Kconfig b/kernel/net/sunrpc/Kconfig index 9068e72aa..04ce2c0b6 100644 --- a/kernel/net/sunrpc/Kconfig +++ b/kernel/net/sunrpc/Kconfig @@ -48,28 +48,16 @@ config SUNRPC_DEBUG If unsure, say Y. -config SUNRPC_XPRT_RDMA_CLIENT - tristate "RPC over RDMA Client Support" +config SUNRPC_XPRT_RDMA + tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS default SUNRPC && INFINIBAND help - This option allows the NFS client to support an RDMA-enabled - transport. + This option allows the NFS client and server to use RDMA + transports (InfiniBand, iWARP, or RoCE). - To compile RPC client RDMA transport support as a module, - choose M here: the module will be called xprtrdma. + To compile this support as a module, choose M. The module + will be called rpcrdma.ko. - If unsure, say N. - -config SUNRPC_XPRT_RDMA_SERVER - tristate "RPC over RDMA Server Support" - depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS - default SUNRPC && INFINIBAND - help - This option allows the NFS server to support an RDMA-enabled - transport. - - To compile RPC server RDMA transport support as a module, - choose M here: the module will be called svcrdma. - - If unsure, say N. + If unsure, or you know there is no RDMA capability on your + hardware platform, say N. 
diff --git a/kernel/net/sunrpc/Makefile b/kernel/net/sunrpc/Makefile index 15e6f6c23..b512fbd9d 100644 --- a/kernel/net/sunrpc/Makefile +++ b/kernel/net/sunrpc/Makefile @@ -5,8 +5,7 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ - -obj-y += xprtrdma/ +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o auth_generic.o \ @@ -15,6 +14,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ sunrpc_syms.o cache.o rpc_pipe.o \ svc_xprt.o sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o -sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o +sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/kernel/net/sunrpc/auth.c b/kernel/net/sunrpc/auth.c index 47f38be41..02f53674d 100644 --- a/kernel/net/sunrpc/auth.c +++ b/kernel/net/sunrpc/auth.c @@ -72,7 +72,7 @@ static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) #define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); -static struct kernel_param_ops param_ops_hashtbl_sz = { +static const struct kernel_param_ops param_ops_hashtbl_sz = { .set = param_set_hashtbl_sz, .get = param_get_hashtbl_sz, }; diff --git a/kernel/net/sunrpc/auth_gss/auth_gss.c b/kernel/net/sunrpc/auth_gss/auth_gss.c index dace13d76..799e65b94 100644 --- a/kernel/net/sunrpc/auth_gss/auth_gss.c +++ b/kernel/net/sunrpc/auth_gss/auth_gss.c @@ -1411,17 +1411,16 @@ gss_key_timeout(struct rpc_cred *rc) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; - unsigned long now = jiffies; - unsigned long expire; + unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ); + int ret = 0; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); - if (ctx) - expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ); + if (!ctx || time_after(timeout, ctx->gc_expiry)) + ret = 
-EACCES; rcu_read_unlock(); - if (!ctx || time_after(now, expire)) - return -EACCES; - return 0; + + return ret; } static int diff --git a/kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c b/kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c index b5408e8a3..fee3c15a4 100644 --- a/kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -881,9 +881,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, if (err) goto out_err; - sg_init_table(sg, 1); - sg_set_buf(sg, &zeroconstant, 4); - + sg_init_one(sg, &zeroconstant, 4); err = crypto_hash_digest(&desc, sg, 4, Kseq); if (err) goto out_err; @@ -951,9 +949,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, if (err) goto out_err; - sg_init_table(sg, 1); - sg_set_buf(sg, zeroconstant, 4); - + sg_init_one(sg, zeroconstant, 4); err = crypto_hash_digest(&desc, sg, 4, Kcrypt); if (err) goto out_err; diff --git a/kernel/net/sunrpc/auth_unix.c b/kernel/net/sunrpc/auth_unix.c index 4feda2d0a..548240dd1 100644 --- a/kernel/net/sunrpc/auth_unix.c +++ b/kernel/net/sunrpc/auth_unix.c @@ -23,7 +23,7 @@ struct unx_cred { }; #define uc_uid uc_base.cr_uid -#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) +#define UNX_WRITESLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME)) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH diff --git a/kernel/net/sunrpc/backchannel_rqst.c b/kernel/net/sunrpc/backchannel_rqst.c index 28504dfd3..229956bf8 100644 --- a/kernel/net/sunrpc/backchannel_rqst.c +++ b/kernel/net/sunrpc/backchannel_rqst.c @@ -37,16 +37,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ static inline int xprt_need_to_requeue(struct rpc_xprt *xprt) { - return xprt->bc_alloc_count > 0; + return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots); } static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n) { + atomic_add(n, &xprt->bc_free_slots); xprt->bc_alloc_count += n; } static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n) { + atomic_sub(n, &xprt->bc_free_slots); return xprt->bc_alloc_count -= n; } @@ -67,6 +69,55 @@ static void xprt_free_allocation(struct rpc_rqst *req) kfree(req); } +static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) +{ + struct page *page; + /* Preallocate one XDR receive buffer */ + page = alloc_page(gfp_flags); + if (page == NULL) + return -ENOMEM; + buf->head[0].iov_base = page_address(page); + buf->head[0].iov_len = PAGE_SIZE; + buf->tail[0].iov_base = NULL; + buf->tail[0].iov_len = 0; + buf->page_len = 0; + buf->len = 0; + buf->buflen = PAGE_SIZE; + return 0; +} + +static +struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags) +{ + struct rpc_rqst *req; + + /* Pre-allocate one backchannel rpc_rqst */ + req = kzalloc(sizeof(*req), gfp_flags); + if (req == NULL) + return NULL; + + req->rq_xprt = xprt; + INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_bc_list); + + /* Preallocate one XDR receive buffer */ + if (xprt_alloc_xdr_buf(&req->rq_rcv_buf, gfp_flags) < 0) { + printk(KERN_ERR "Failed to create bc receive xbuf\n"); + goto out_free; + } + req->rq_rcv_buf.len = PAGE_SIZE; + + /* Preallocate one XDR send buffer */ + if (xprt_alloc_xdr_buf(&req->rq_snd_buf, gfp_flags) < 0) { + printk(KERN_ERR "Failed to create bc snd xbuf\n"); + goto out_free; + } + return req; +out_free: + xprt_free_allocation(req); + return NULL; +} + /* * Preallocate up to min_reqs structures and related buffers for use * by the backchannel. 
This function can be called multiple times @@ -87,9 +138,15 @@ static void xprt_free_allocation(struct rpc_rqst *req) */ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) { - struct page *page_rcv = NULL, *page_snd = NULL; - struct xdr_buf *xbufp = NULL; - struct rpc_rqst *req, *tmp; + if (!xprt->ops->bc_setup) + return 0; + return xprt->ops->bc_setup(xprt, min_reqs); +} +EXPORT_SYMBOL_GPL(xprt_setup_backchannel); + +int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) +{ + struct rpc_rqst *req; struct list_head tmp_list; int i; @@ -106,7 +163,7 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) INIT_LIST_HEAD(&tmp_list); for (i = 0; i < min_reqs; i++) { /* Pre-allocate one backchannel rpc_rqst */ - req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL); + req = xprt_alloc_bc_req(xprt, GFP_KERNEL); if (req == NULL) { printk(KERN_ERR "Failed to create bc rpc_rqst\n"); goto out_free; @@ -115,41 +172,6 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) /* Add the allocated buffer to the tmp list */ dprintk("RPC: adding req= %p\n", req); list_add(&req->rq_bc_pa_list, &tmp_list); - - req->rq_xprt = xprt; - INIT_LIST_HEAD(&req->rq_list); - INIT_LIST_HEAD(&req->rq_bc_list); - - /* Preallocate one XDR receive buffer */ - page_rcv = alloc_page(GFP_KERNEL); - if (page_rcv == NULL) { - printk(KERN_ERR "Failed to create bc receive xbuf\n"); - goto out_free; - } - xbufp = &req->rq_rcv_buf; - xbufp->head[0].iov_base = page_address(page_rcv); - xbufp->head[0].iov_len = PAGE_SIZE; - xbufp->tail[0].iov_base = NULL; - xbufp->tail[0].iov_len = 0; - xbufp->page_len = 0; - xbufp->len = PAGE_SIZE; - xbufp->buflen = PAGE_SIZE; - - /* Preallocate one XDR send buffer */ - page_snd = alloc_page(GFP_KERNEL); - if (page_snd == NULL) { - printk(KERN_ERR "Failed to create bc snd xbuf\n"); - goto out_free; - } - - xbufp = &req->rq_snd_buf; - xbufp->head[0].iov_base = page_address(page_snd); - xbufp->head[0].iov_len = 
0; - xbufp->tail[0].iov_base = NULL; - xbufp->tail[0].iov_len = 0; - xbufp->page_len = 0; - xbufp->len = 0; - xbufp->buflen = PAGE_SIZE; } /* @@ -167,7 +189,10 @@ out_free: /* * Memory allocation failed, free the temporary list */ - list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) { + while (!list_empty(&tmp_list)) { + req = list_first_entry(&tmp_list, + struct rpc_rqst, + rq_bc_pa_list); list_del(&req->rq_bc_pa_list); xprt_free_allocation(req); } @@ -175,7 +200,6 @@ out_free: dprintk("RPC: setup backchannel transport failed\n"); return -ENOMEM; } -EXPORT_SYMBOL_GPL(xprt_setup_backchannel); /** * xprt_destroy_backchannel - Destroys the backchannel preallocated structures. @@ -187,6 +211,13 @@ EXPORT_SYMBOL_GPL(xprt_setup_backchannel); * of reqs specified by the caller. */ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) +{ + if (xprt->ops->bc_destroy) + xprt->ops->bc_destroy(xprt, max_reqs); +} +EXPORT_SYMBOL_GPL(xprt_destroy_backchannel); + +void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs) { struct rpc_rqst *req = NULL, *tmp = NULL; @@ -210,16 +241,21 @@ out: dprintk("RPC: backchannel list empty= %s\n", list_empty(&xprt->bc_pa_list) ? 
"true" : "false"); } -EXPORT_SYMBOL_GPL(xprt_destroy_backchannel); static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid) { struct rpc_rqst *req = NULL; dprintk("RPC: allocate a backchannel request\n"); - if (list_empty(&xprt->bc_pa_list)) + if (atomic_read(&xprt->bc_free_slots) <= 0) goto not_found; - + if (list_empty(&xprt->bc_pa_list)) { + req = xprt_alloc_bc_req(xprt, GFP_ATOMIC); + if (!req) + goto not_found; + list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); + xprt->bc_alloc_count++; + } req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst, rq_bc_pa_list); req->rq_reply_bytes_recvd = 0; @@ -241,15 +277,32 @@ void xprt_free_bc_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; + xprt->ops->bc_free_rqst(req); +} + +void xprt_free_bc_rqst(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + dprintk("RPC: free backchannel req=%p\n", req); req->rq_connect_cookie = xprt->connect_cookie - 1; smp_mb__before_atomic(); - WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); smp_mb__after_atomic(); - if (!xprt_need_to_requeue(xprt)) { + /* + * Return it to the list of preallocations so that it + * may be reused by a new callback request. + */ + spin_lock_bh(&xprt->bc_pa_lock); + if (xprt_need_to_requeue(xprt)) { + list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); + xprt->bc_alloc_count++; + req = NULL; + } + spin_unlock_bh(&xprt->bc_pa_lock); + if (req != NULL) { /* * The last remaining session was destroyed while this * entry was in use. Free the entry and don't attempt @@ -260,14 +313,6 @@ void xprt_free_bc_request(struct rpc_rqst *req) xprt_free_allocation(req); return; } - - /* - * Return it to the list of preallocations so that it - * may be reused by a new callback request. 
- */ - spin_lock_bh(&xprt->bc_pa_lock); - list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); } /* @@ -311,6 +356,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) spin_lock(&xprt->bc_pa_lock); list_del(&req->rq_bc_pa_list); + xprt_dec_alloc_count(xprt, 1); spin_unlock(&xprt->bc_pa_lock); req->rq_private_buf.len = copied; diff --git a/kernel/net/sunrpc/bc_svc.c b/kernel/net/sunrpc/bc_svc.c deleted file mode 100644 index 15c7a8a1c..000000000 --- a/kernel/net/sunrpc/bc_svc.c +++ /dev/null @@ -1,63 +0,0 @@ -/****************************************************************************** - -(c) 2007 Network Appliance, Inc. All Rights Reserved. -(c) 2009 NetApp. All Rights Reserved. - -NetApp provides this source code under the GPL v2 License. -The GPL v2 license is available at -http://opensource.org/licenses/gpl-license.php. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -******************************************************************************/ - -/* - * The NFSv4.1 callback service helper routines. - * They implement the transport level processing required to send the - * reply over an existing open connection previously established by the client. 
- */ - -#include <linux/module.h> - -#include <linux/sunrpc/xprt.h> -#include <linux/sunrpc/sched.h> -#include <linux/sunrpc/bc_xprt.h> - -#define RPCDBG_FACILITY RPCDBG_SVCDSP - -/* Empty callback ops */ -static const struct rpc_call_ops nfs41_callback_ops = { -}; - - -/* - * Send the callback reply - */ -int bc_send(struct rpc_rqst *req) -{ - struct rpc_task *task; - int ret; - - dprintk("RPC: bc_send req= %p\n", req); - task = rpc_run_bc_task(req, &nfs41_callback_ops); - if (IS_ERR(task)) - ret = PTR_ERR(task); - else { - WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); - ret = task->tk_status; - rpc_put_task(task); - } - dprintk("RPC: bc_send ret= %d\n", ret); - return ret; -} - diff --git a/kernel/net/sunrpc/cache.c b/kernel/net/sunrpc/cache.c index 2928afffb..21e203531 100644 --- a/kernel/net/sunrpc/cache.c +++ b/kernel/net/sunrpc/cache.c @@ -41,28 +41,30 @@ static bool cache_defer_req(struct cache_req *req, struct cache_head *item); static void cache_revisit_request(struct cache_head *item); -static void cache_init(struct cache_head *h) +static void cache_init(struct cache_head *h, struct cache_detail *detail) { time_t now = seconds_since_boot(); - h->next = NULL; + INIT_HLIST_NODE(&h->cache_list); h->flags = 0; kref_init(&h->ref); h->expiry_time = now + CACHE_NEW_EXPIRY; + if (now <= detail->flush_time) + /* ensure it isn't already expired */ + now = detail->flush_time + 1; h->last_refresh = now; } struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, struct cache_head *key, int hash) { - struct cache_head **head, **hp; - struct cache_head *new = NULL, *freeme = NULL; + struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL; + struct hlist_head *head; head = &detail->hash_table[hash]; read_lock(&detail->hash_lock); - for (hp=head; *hp != NULL ; hp = &(*hp)->next) { - struct cache_head *tmp = *hp; + hlist_for_each_entry(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) /* This entry is expired, we will discard it.
*/ @@ -82,18 +84,16 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, * we might get lose if we need to * cache_put it soon. */ - cache_init(new); + cache_init(new, detail); detail->init(new, key); write_lock(&detail->hash_lock); /* check if entry appeared while we slept */ - for (hp=head; *hp != NULL ; hp = &(*hp)->next) { - struct cache_head *tmp = *hp; + hlist_for_each_entry(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) { - *hp = tmp->next; - tmp->next = NULL; + hlist_del_init(&tmp->cache_list); detail->entries --; freeme = tmp; break; @@ -104,8 +104,8 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, return tmp; } } - new->next = *head; - *head = new; + + hlist_add_head(&new->cache_list, head); detail->entries++; cache_get(new); write_unlock(&detail->hash_lock); @@ -119,10 +119,15 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); -static void cache_fresh_locked(struct cache_head *head, time_t expiry) +static void cache_fresh_locked(struct cache_head *head, time_t expiry, + struct cache_detail *detail) { + time_t now = seconds_since_boot(); + if (now <= detail->flush_time) + /* ensure it isn't immediately treated as expired */ + now = detail->flush_time + 1; head->expiry_time = expiry; - head->last_refresh = seconds_since_boot(); + head->last_refresh = now; smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */ set_bit(CACHE_VALID, &head->flags); } @@ -143,7 +148,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, * If 'old' is not VALID, we update it directly, * otherwise we need to replace it */ - struct cache_head **head; struct cache_head *tmp; if (!test_bit(CACHE_VALID, &old->flags)) { @@ -153,7 +157,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, set_bit(CACHE_NEGATIVE, &old->flags); else detail->update(old, new); - cache_fresh_locked(old, 
new->expiry_time); + cache_fresh_locked(old, new->expiry_time, detail); write_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); return old; @@ -166,21 +170,19 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, cache_put(old, detail); return NULL; } - cache_init(tmp); + cache_init(tmp, detail); detail->init(tmp, old); - head = &detail->hash_table[hash]; write_lock(&detail->hash_lock); if (test_bit(CACHE_NEGATIVE, &new->flags)) set_bit(CACHE_NEGATIVE, &tmp->flags); else detail->update(tmp, new); - tmp->next = *head; - *head = tmp; + hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); detail->entries++; cache_get(tmp); - cache_fresh_locked(tmp, new->expiry_time); - cache_fresh_locked(old, 0); + cache_fresh_locked(tmp, new->expiry_time, detail); + cache_fresh_locked(old, 0, detail); write_unlock(&detail->hash_lock); cache_fresh_unlocked(tmp, detail); cache_fresh_unlocked(old, detail); @@ -225,7 +227,8 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h rv = cache_is_valid(h); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); - cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY); + cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY, + detail); rv = -ENOENT; } write_unlock(&detail->hash_lock); @@ -416,28 +419,29 @@ static int cache_clean(void) /* find a non-empty bucket in the table */ while (current_detail && current_index < current_detail->hash_size && - current_detail->hash_table[current_index] == NULL) + hlist_empty(&current_detail->hash_table[current_index])) current_index++; /* find a cleanable entry in the bucket and clean it, or set to next bucket */ if (current_detail && current_index < current_detail->hash_size) { - struct cache_head *ch, **cp; + struct cache_head *ch = NULL; struct cache_detail *d; + struct hlist_head *head; + struct hlist_node *tmp; write_lock(&current_detail->hash_lock); /* Ok, now to clean this strand */ - cp = & current_detail->hash_table[current_index];
- for (ch = *cp ; ch ; cp = & ch->next, ch = *cp) { + head = &current_detail->hash_table[current_index]; + hlist_for_each_entry_safe(ch, tmp, head, cache_list) { if (current_detail->nextcheck > ch->expiry_time) current_detail->nextcheck = ch->expiry_time+1; if (!cache_is_expired(current_detail, ch)) continue; - *cp = ch->next; - ch->next = NULL; + hlist_del_init(&ch->cache_list); current_detail->entries--; rv = 1; break; @@ -492,10 +496,13 @@ EXPORT_SYMBOL_GPL(cache_flush); void cache_purge(struct cache_detail *detail) { - detail->flush_time = LONG_MAX; + time_t now = seconds_since_boot(); + if (detail->flush_time >= now) + now = detail->flush_time + 1; + /* 'now' is the maximum value any 'last_refresh' can have */ + detail->flush_time = now; detail->nextcheck = seconds_since_boot(); cache_flush(); - detail->flush_time = 1; } EXPORT_SYMBOL_GPL(cache_purge); @@ -1218,7 +1225,7 @@ int qword_get(char **bpp, char *dest, int bufsize) if (bp[0] == '\\' && bp[1] == 'x') { /* HEX STRING */ bp += 2; - while (len < bufsize) { + while (len < bufsize - 1) { int h, l; h = hex_to_bin(bp[0]); @@ -1270,18 +1277,13 @@ EXPORT_SYMBOL_GPL(qword_get); * get a header, then pass each real item in the cache */ -struct handle { - struct cache_detail *cd; -}; - -static void *c_start(struct seq_file *m, loff_t *pos) +void *cache_seq_start(struct seq_file *m, loff_t *pos) __acquires(cd->hash_lock) { loff_t n = *pos; unsigned int hash, entry; struct cache_head *ch; - struct cache_detail *cd = ((struct handle*)m->private)->cd; - + struct cache_detail *cd = m->private; read_lock(&cd->hash_lock); if (!n--) @@ -1289,7 +1291,7 @@ static void *c_start(struct seq_file *m, loff_t *pos) hash = n >> 32; entry = n & ((1LL<<32) - 1); - for (ch=cd->hash_table[hash]; ch; ch=ch->next) + hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; n &= ~((1LL<<32) - 1); @@ -1297,51 +1299,57 @@ static void *c_start(struct seq_file *m, loff_t *pos) hash++; n += 1LL<<32; } while(hash <
cd->hash_size && - cd->hash_table[hash]==NULL); + hlist_empty(&cd->hash_table[hash])); if (hash >= cd->hash_size) return NULL; *pos = n+1; - return cd->hash_table[hash]; + return hlist_entry_safe(cd->hash_table[hash].first, + struct cache_head, cache_list); } +EXPORT_SYMBOL_GPL(cache_seq_start); -static void *c_next(struct seq_file *m, void *p, loff_t *pos) +void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) { struct cache_head *ch = p; int hash = (*pos >> 32); - struct cache_detail *cd = ((struct handle*)m->private)->cd; + struct cache_detail *cd = m->private; if (p == SEQ_START_TOKEN) hash = 0; - else if (ch->next == NULL) { + else if (ch->cache_list.next == NULL) { hash++; *pos += 1LL<<32; } else { ++*pos; - return ch->next; + return hlist_entry_safe(ch->cache_list.next, + struct cache_head, cache_list); } *pos &= ~((1LL<<32) - 1); while (hash < cd->hash_size && - cd->hash_table[hash] == NULL) { + hlist_empty(&cd->hash_table[hash])) { hash++; *pos += 1LL<<32; } if (hash >= cd->hash_size) return NULL; ++*pos; - return cd->hash_table[hash]; + return hlist_entry_safe(cd->hash_table[hash].first, + struct cache_head, cache_list); } +EXPORT_SYMBOL_GPL(cache_seq_next); -static void c_stop(struct seq_file *m, void *p) +void cache_seq_stop(struct seq_file *m, void *p) __releases(cd->hash_lock) { - struct cache_detail *cd = ((struct handle*)m->private)->cd; + struct cache_detail *cd = m->private; read_unlock(&cd->hash_lock); } +EXPORT_SYMBOL_GPL(cache_seq_stop); static int c_show(struct seq_file *m, void *p) { struct cache_head *cp = p; - struct cache_detail *cd = ((struct handle*)m->private)->cd; + struct cache_detail *cd = m->private; if (p == SEQ_START_TOKEN) return cd->cache_show(m, cd, NULL); @@ -1364,33 +1372,36 @@ static int c_show(struct seq_file *m, void *p) } static const struct seq_operations cache_content_op = { - .start = c_start, - .next = c_next, - .stop = c_stop, + .start = cache_seq_start, + .next = cache_seq_next, + .stop = cache_seq_stop, 
.show = c_show, }; static int content_open(struct inode *inode, struct file *file, struct cache_detail *cd) { - struct handle *han; + struct seq_file *seq; + int err; if (!cd || !try_module_get(cd->owner)) return -EACCES; - han = __seq_open_private(file, &cache_content_op, sizeof(*han)); - if (han == NULL) { + + err = seq_open(file, &cache_content_op); + if (err) { module_put(cd->owner); - return -ENOMEM; + return err; } - han->cd = cd; + seq = file->private_data; + seq->private = cd; return 0; } static int content_release(struct inode *inode, struct file *file, struct cache_detail *cd) { - int ret = seq_release_private(inode, file); + int ret = seq_release(inode, file); module_put(cd->owner); return ret; } @@ -1437,6 +1448,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf, { char tbuf[20]; char *bp, *ep; + time_t then, now; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; @@ -1448,8 +1460,22 @@ static ssize_t write_flush(struct file *file, const char __user *buf, return -EINVAL; bp = tbuf; - cd->flush_time = get_expiry(&bp); - cd->nextcheck = seconds_since_boot(); + then = get_expiry(&bp); + now = seconds_since_boot(); + cd->nextcheck = now; + /* Can only set flush_time to 1 second beyond "now", or + * possibly 1 second beyond flushtime. This is because + * flush_time never goes backwards so it mustn't get too far + * ahead of time. 
+ */ + if (then >= now) { + /* Want to flush everything, so behave like cache_purge() */ + if (cd->flush_time >= now) + now = cd->flush_time + 1; + then = now; + } + + cd->flush_time = then; cache_flush(); *ppos += count; @@ -1665,17 +1691,21 @@ EXPORT_SYMBOL_GPL(cache_unregister_net); struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net) { struct cache_detail *cd; + int i; cd = kmemdup(tmpl, sizeof(struct cache_detail), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); - cd->hash_table = kzalloc(cd->hash_size * sizeof(struct cache_head *), + cd->hash_table = kzalloc(cd->hash_size * sizeof(struct hlist_head), GFP_KERNEL); if (cd->hash_table == NULL) { kfree(cd); return ERR_PTR(-ENOMEM); } + + for (i = 0; i < cd->hash_size; i++) + INIT_HLIST_HEAD(&cd->hash_table[i]); cd->net = net; return cd; } diff --git a/kernel/net/sunrpc/clnt.c b/kernel/net/sunrpc/clnt.c index e6ce15173..23608eb0d 100644 --- a/kernel/net/sunrpc/clnt.c +++ b/kernel/net/sunrpc/clnt.c @@ -891,15 +891,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) task->tk_flags |= RPC_TASK_SOFT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; - if (sk_memalloc_socks()) { - struct rpc_xprt *xprt; - - rcu_read_lock(); - xprt = rcu_dereference(clnt->cl_xprt); - if (xprt->swapper) - task->tk_flags |= RPC_TASK_SWAPPER; - rcu_read_unlock(); - } + if (atomic_read(&clnt->cl_swapper)) + task->tk_flags |= RPC_TASK_SWAPPER; /* Add to the client's list of all tasks */ spin_lock(&clnt->cl_lock); list_add_tail(&task->tk_task, &clnt->cl_tasks); @@ -1031,15 +1024,14 @@ EXPORT_SYMBOL_GPL(rpc_call_async); * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run * rpc_execute against it * @req: RPC request - * @tk_ops: RPC call ops */ -struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, - const struct rpc_call_ops *tk_ops) +struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) { struct rpc_task *task; struct xdr_buf *xbufp = 
&req->rq_snd_buf; struct rpc_task_setup task_setup_data = { - .callback_ops = tk_ops, + .callback_ops = &rpc_default_ops, + .flags = RPC_TASK_SOFTCONN, }; dprintk("RPC: rpc_run_bc_task req= %p\n", req); @@ -1614,6 +1606,7 @@ call_allocate(struct rpc_task *task) req->rq_callsize + req->rq_rcvsize); if (req->rq_buffer != NULL) return; + xprt_inject_disconnect(xprt); dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); @@ -1909,6 +1902,7 @@ call_transmit_status(struct rpc_task *task) switch (task->tk_status) { case -EAGAIN: + case -ENOBUFS: break; default: dprint_status(task); @@ -1935,7 +1929,6 @@ call_transmit_status(struct rpc_task *task) case -ECONNABORTED: case -EADDRINUSE: case -ENOTCONN: - case -ENOBUFS: case -EPIPE: rpc_task_force_reencode(task); } @@ -1951,33 +1944,36 @@ call_bc_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - if (!xprt_prepare_transmit(task)) { - /* - * Could not reserve the transport. Try again after the - * transport is released. - */ - task->tk_status = 0; - task->tk_action = call_bc_transmit; - return; - } + if (!xprt_prepare_transmit(task)) + goto out_retry; - task->tk_action = rpc_exit_task; if (task->tk_status < 0) { printk(KERN_NOTICE "RPC: Could not send backchannel reply " "error: %d\n", task->tk_status); - return; + goto out_done; } + if (req->rq_connect_cookie != req->rq_xprt->connect_cookie) + req->rq_bytes_sent = 0; xprt_transmit(task); + + if (task->tk_status == -EAGAIN) + goto out_nospace; + xprt_end_transmit(task); dprint_status(task); switch (task->tk_status) { case 0: /* Success */ - break; case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -ECONNRESET: + case -ECONNREFUSED: + case -EADDRINUSE: + case -ENOTCONN: + case -EPIPE: + break; case -ETIMEDOUT: /* * Problem reaching the server. 
Disconnect and let the @@ -2002,6 +1998,13 @@ call_bc_transmit(struct rpc_task *task) break; } rpc_wake_up_queued_task(&req->rq_xprt->pending, task); +out_done: + task->tk_action = rpc_exit_task; + return; +out_nospace: + req->rq_connect_cookie = req->rq_xprt->connect_cookie; +out_retry: + task->tk_status = 0; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ @@ -2054,12 +2057,13 @@ call_status(struct rpc_task *task) case -ECONNABORTED: rpc_force_rebind(clnt); case -EADDRINUSE: - case -ENOBUFS: rpc_delay(task, 3*HZ); case -EPIPE: case -ENOTCONN: task->tk_action = call_bind; break; + case -ENOBUFS: + rpc_delay(task, HZ>>2); case -EAGAIN: task->tk_action = call_transmit; break; @@ -2476,3 +2480,59 @@ void rpc_show_tasks(struct net *net) spin_unlock(&sn->rpc_client_lock); } #endif + +#if IS_ENABLED(CONFIG_SUNRPC_SWAP) +int +rpc_clnt_swap_activate(struct rpc_clnt *clnt) +{ + int ret = 0; + struct rpc_xprt *xprt; + + if (atomic_inc_return(&clnt->cl_swapper) == 1) { +retry: + rcu_read_lock(); + xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + rcu_read_unlock(); + if (!xprt) { + /* + * If we didn't get a reference, then we likely are + * racing with a migration event. Wait for a grace + * period and try again. + */ + synchronize_rcu(); + goto retry; + } + + ret = xprt_enable_swap(xprt); + xprt_put(xprt); + } + return ret; +} +EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate); + +void +rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) +{ + struct rpc_xprt *xprt; + + if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) { +retry: + rcu_read_lock(); + xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + rcu_read_unlock(); + if (!xprt) { + /* + * If we didn't get a reference, then we likely are + * racing with a migration event. Wait for a grace + * period and try again. 
+ */ + synchronize_rcu(); + goto retry; + } + + xprt_disable_swap(xprt); + xprt_put(xprt); + } +} +EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate); +#endif /* CONFIG_SUNRPC_SWAP */ diff --git a/kernel/net/sunrpc/debugfs.c b/kernel/net/sunrpc/debugfs.c index 82962f7e6..e7b4d9356 100644 --- a/kernel/net/sunrpc/debugfs.c +++ b/kernel/net/sunrpc/debugfs.c @@ -10,9 +10,12 @@ #include "netns.h" static struct dentry *topdir; +static struct dentry *rpc_fault_dir; static struct dentry *rpc_clnt_dir; static struct dentry *rpc_xprt_dir; +unsigned int rpc_inject_disconnect; + struct rpc_clnt_iter { struct rpc_clnt *clnt; loff_t pos; @@ -257,6 +260,8 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt) debugfs_remove_recursive(xprt->debugfs); xprt->debugfs = NULL; } + + atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); } void @@ -266,11 +271,79 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt) xprt->debugfs = NULL; } +static int +fault_open(struct inode *inode, struct file *filp) +{ + filp->private_data = kmalloc(128, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + return 0; +} + +static int +fault_release(struct inode *inode, struct file *filp) +{ + kfree(filp->private_data); + return 0; +} + +static ssize_t +fault_disconnect_read(struct file *filp, char __user *user_buf, + size_t len, loff_t *offset) +{ + char *buffer = (char *)filp->private_data; + size_t size; + + size = sprintf(buffer, "%u\n", rpc_inject_disconnect); + return simple_read_from_buffer(user_buf, len, offset, buffer, size); +} + +static ssize_t +fault_disconnect_write(struct file *filp, const char __user *user_buf, + size_t len, loff_t *offset) +{ + char buffer[16]; + + if (len >= sizeof(buffer)) + len = sizeof(buffer) - 1; + if (copy_from_user(buffer, user_buf, len)) + return -EFAULT; + buffer[len] = '\0'; + if (kstrtouint(buffer, 10, &rpc_inject_disconnect)) + return -EINVAL; + return len; +} + +static const struct file_operations fault_disconnect_fops = { + .owner = THIS_MODULE, + 
.open = fault_open, + .read = fault_disconnect_read, + .write = fault_disconnect_write, + .release = fault_release, +}; + +static struct dentry * +inject_fault_dir(struct dentry *topdir) +{ + struct dentry *faultdir; + + faultdir = debugfs_create_dir("inject_fault", topdir); + if (!faultdir) + return NULL; + + if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir, + NULL, &fault_disconnect_fops)) + return NULL; + + return faultdir; +} + void __exit sunrpc_debugfs_exit(void) { debugfs_remove_recursive(topdir); topdir = NULL; + rpc_fault_dir = NULL; rpc_clnt_dir = NULL; rpc_xprt_dir = NULL; } @@ -282,6 +355,10 @@ sunrpc_debugfs_init(void) if (!topdir) return; + rpc_fault_dir = inject_fault_dir(topdir); + if (!rpc_fault_dir) + goto out_remove; + rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); if (!rpc_clnt_dir) goto out_remove; @@ -294,5 +371,6 @@ sunrpc_debugfs_init(void) out_remove: debugfs_remove_recursive(topdir); topdir = NULL; + rpc_fault_dir = NULL; rpc_clnt_dir = NULL; } diff --git a/kernel/net/sunrpc/sched.c b/kernel/net/sunrpc/sched.c index 337ca851a..73ad57a59 100644 --- a/kernel/net/sunrpc/sched.c +++ b/kernel/net/sunrpc/sched.c @@ -250,11 +250,11 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); -static int rpc_wait_bit_killable(struct wait_bit_key *key) +static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } @@ -1092,14 +1092,10 @@ void rpc_destroy_mempool(void) { rpciod_stop(); - if (rpc_buffer_mempool) - mempool_destroy(rpc_buffer_mempool); - if (rpc_task_mempool) - mempool_destroy(rpc_task_mempool); - if (rpc_task_slabp) - kmem_cache_destroy(rpc_task_slabp); - if (rpc_buffer_slabp) - kmem_cache_destroy(rpc_buffer_slabp); + mempool_destroy(rpc_buffer_mempool); + mempool_destroy(rpc_task_mempool); + 
kmem_cache_destroy(rpc_task_slabp); + kmem_cache_destroy(rpc_buffer_slabp); rpc_destroy_wait_queue(&delay_queue); } diff --git a/kernel/net/sunrpc/svc.c b/kernel/net/sunrpc/svc.c index 78974e4d9..cc9852897 100644 --- a/kernel/net/sunrpc/svc.c +++ b/kernel/net/sunrpc/svc.c @@ -34,36 +34,19 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net); -#define svc_serv_is_pooled(serv) ((serv)->sv_function) +#define svc_serv_is_pooled(serv) ((serv)->sv_ops->svo_function) -/* - * Mode for mapping cpus to pools. - */ -enum { - SVC_POOL_AUTO = -1, /* choose one of the others */ - SVC_POOL_GLOBAL, /* no mapping, just a single global pool - * (legacy & UP mode) */ - SVC_POOL_PERCPU, /* one pool per cpu */ - SVC_POOL_PERNODE /* one pool per numa node */ -}; #define SVC_POOL_DEFAULT SVC_POOL_GLOBAL /* * Structure for mapping cpus to pools and vice versa. * Setup once during sunrpc initialisation. */ -static struct svc_pool_map { - int count; /* How many svc_servs use us */ - int mode; /* Note: int not enum to avoid - * warnings about "enumeration value - * not handled in switch" */ - unsigned int npools; - unsigned int *pool_to; /* maps pool id to cpu or node */ - unsigned int *to_pool; /* maps cpu or node to pool id */ -} svc_pool_map = { - .count = 0, +struct svc_pool_map svc_pool_map = { .mode = SVC_POOL_DEFAULT }; +EXPORT_SYMBOL_GPL(svc_pool_map); + static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int @@ -236,7 +219,7 @@ svc_pool_map_init_pernode(struct svc_pool_map *m) * vice versa). Initialise the map if we're the first user. * Returns the number of pools. */ -static unsigned int +unsigned int svc_pool_map_get(void) { struct svc_pool_map *m = &svc_pool_map; @@ -271,7 +254,7 @@ svc_pool_map_get(void) mutex_unlock(&svc_pool_map_mutex); return m->npools; } - +EXPORT_SYMBOL_GPL(svc_pool_map_get); /* * Drop a reference to the global map of cpus to pools. 
@@ -280,7 +263,7 @@ svc_pool_map_get(void) * mode using the pool_mode module option without * rebooting or re-loading sunrpc.ko. */ -static void +void svc_pool_map_put(void) { struct svc_pool_map *m = &svc_pool_map; @@ -297,7 +280,7 @@ svc_pool_map_put(void) mutex_unlock(&svc_pool_map_mutex); } - +EXPORT_SYMBOL_GPL(svc_pool_map_put); static int svc_pool_map_get_node(unsigned int pidx) { @@ -423,7 +406,7 @@ EXPORT_SYMBOL_GPL(svc_bind); */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - void (*shutdown)(struct svc_serv *serv, struct net *net)) + struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int vers; @@ -440,7 +423,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE); - serv->sv_shutdown = shutdown; + serv->sv_ops = ops; xdrsize = 0; while (prog) { prog->pg_lovers = prog->pg_nvers-1; @@ -486,26 +469,22 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv, struct net *net)) + struct svc_serv_ops *ops) { - return __svc_create(prog, bufsize, /*npools*/1, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, ops); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv, struct net *net), - svc_thread_fn func, struct module *mod) + struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, shutdown); + serv = __svc_create(prog, bufsize, npools, ops); if (!serv) goto out_err; - - serv->sv_function = func; - serv->sv_module = mod; return serv; out_err: svc_pool_map_put(); @@ -517,8 +496,8 @@ void 
svc_shutdown_net(struct svc_serv *serv, struct net *net) { svc_close_net(serv, net); - if (serv->sv_shutdown) - serv->sv_shutdown(serv, net); + if (serv->sv_ops->svo_shutdown) + serv->sv_ops->svo_shutdown(serv, net); } EXPORT_SYMBOL_GPL(svc_shutdown_net); @@ -604,40 +583,52 @@ svc_release_buffer(struct svc_rqst *rqstp) } struct svc_rqst * -svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) +svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node); if (!rqstp) - goto out_enomem; + return rqstp; - serv->sv_nrthreads++; __set_bit(RQ_BUSY, &rqstp->rq_flags); spin_lock_init(&rqstp->rq_lock); rqstp->rq_server = serv; rqstp->rq_pool = pool; - spin_lock_bh(&pool->sp_lock); - pool->sp_nrthreads++; - list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); - spin_unlock_bh(&pool->sp_lock); rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) - goto out_thread; + goto out_enomem; rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_resp) - goto out_thread; + goto out_enomem; if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node)) - goto out_thread; + goto out_enomem; return rqstp; -out_thread: - svc_exit_thread(rqstp); out_enomem: - return ERR_PTR(-ENOMEM); + svc_rqst_free(rqstp); + return NULL; +} +EXPORT_SYMBOL_GPL(svc_rqst_alloc); + +struct svc_rqst * +svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) +{ + struct svc_rqst *rqstp; + + rqstp = svc_rqst_alloc(serv, pool, node); + if (!rqstp) + return ERR_PTR(-ENOMEM); + + serv->sv_nrthreads++; + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads++; + list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); + spin_unlock_bh(&pool->sp_lock); + return rqstp; } EXPORT_SYMBOL_GPL(svc_prepare_thread); @@ -739,12 +730,12 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) break; } - 
__module_get(serv->sv_module); - task = kthread_create_on_node(serv->sv_function, rqstp, + __module_get(serv->sv_ops->svo_module); + task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { error = PTR_ERR(task); - module_put(serv->sv_module); + module_put(serv->sv_ops->svo_module); svc_exit_thread(rqstp); break; } @@ -772,15 +763,21 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); * mutex" for the service. */ void -svc_exit_thread(struct svc_rqst *rqstp) +svc_rqst_free(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; - struct svc_pool *pool = rqstp->rq_pool; - svc_release_buffer(rqstp); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); + kfree_rcu(rqstp, rq_rcu_head); +} +EXPORT_SYMBOL_GPL(svc_rqst_free); + +void +svc_exit_thread(struct svc_rqst *rqstp) +{ + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads--; @@ -788,7 +785,7 @@ svc_exit_thread(struct svc_rqst *rqstp) list_del_rcu(&rqstp->rq_all); spin_unlock_bh(&pool->sp_lock); - kfree_rcu(rqstp, rq_rcu_head); + svc_rqst_free(rqstp); /* Release the server */ if (serv) @@ -1290,7 +1287,6 @@ err_bad: svc_putnl(resv, ntohl(rpc_stat)); goto sendit; } -EXPORT_SYMBOL_GPL(svc_process); /* * Process the RPC request. 
@@ -1338,6 +1334,7 @@ out_drop: svc_drop(rqstp); return 0; } +EXPORT_SYMBOL_GPL(svc_process); #if defined(CONFIG_SUNRPC_BACKCHANNEL) /* @@ -1350,6 +1347,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; + struct rpc_task *task; + int proc_error; + int error; + + dprintk("svc: %s(%p)\n", __func__, req); /* Build the svc_rqst used by the common processing routine */ rqstp->rq_xprt = serv->sv_bc_xprt; @@ -1362,31 +1364,54 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg)); memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res)); + /* Adjust the argument buffer length */ + rqstp->rq_arg.len = req->rq_private_buf.len; + if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len; + rqstp->rq_arg.page_len = 0; + } else if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len) + rqstp->rq_arg.page_len = rqstp->rq_arg.len - + rqstp->rq_arg.head[0].iov_len; + else + rqstp->rq_arg.len = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len; + /* reset result send buffer "put" position */ resv->iov_len = 0; - if (rqstp->rq_prot != IPPROTO_TCP) { - printk(KERN_ERR "No support for Non-TCP transports!\n"); - BUG(); - } - /* * Skip the next two words because they've already been - * processed in the trasport + * processed in the transport */ svc_getu32(argv); /* XID */ svc_getnl(argv); /* CALLDIR */ - /* Returns 1 for send, 0 for drop */ - if (svc_process_common(rqstp, argv, resv)) { - memcpy(&req->rq_snd_buf, &rqstp->rq_res, - sizeof(req->rq_snd_buf)); - return bc_send(req); - } else { - /* drop request */ + /* Parse and execute the bc call */ + proc_error = svc_process_common(rqstp, argv, resv); + + atomic_inc(&req->rq_xprt->bc_free_slots); + if (!proc_error) { + /* Processing error: drop the request */ 
xprt_free_bc_request(req); return 0; } + + /* Finally, send the reply synchronously */ + memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); + task = rpc_run_bc_task(req); + if (IS_ERR(task)) { + error = PTR_ERR(task); + goto out; + } + + WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); + error = task->tk_status; + rpc_put_task(task); + +out: + dprintk("svc: %s(), error=%d\n", __func__, error); + return error; } EXPORT_SYMBOL_GPL(bc_svc_process); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ diff --git a/kernel/net/sunrpc/svc_xprt.c b/kernel/net/sunrpc/svc_xprt.c index ba2313cd4..5b69bb580 100644 --- a/kernel/net/sunrpc/svc_xprt.c +++ b/kernel/net/sunrpc/svc_xprt.c @@ -24,7 +24,6 @@ static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); static void svc_age_temp_xprts(unsigned long closure); static void svc_delete_xprt(struct svc_xprt *xprt); -static void svc_xprt_do_enqueue(struct svc_xprt *xprt); /* apparently the "standard" is that clients close * idle connections after 5 minutes, servers after @@ -225,12 +224,12 @@ static void svc_xprt_received(struct svc_xprt *xprt) } /* As soon as we clear busy, the xprt could be closed and - * 'put', so we need a reference to call svc_xprt_do_enqueue with: + * 'put', so we need a reference to call svc_enqueue_xprt with: */ svc_xprt_get(xprt); smp_mb__before_atomic(); clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_do_enqueue(xprt); + xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt); svc_xprt_put(xprt); } @@ -320,7 +319,7 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) return false; } -static void svc_xprt_do_enqueue(struct svc_xprt *xprt) +void svc_xprt_do_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; struct svc_rqst *rqstp = NULL; @@ -402,6 +401,7 @@ redo_search: out: trace_svc_xprt_do_enqueue(xprt, rqstp); } +EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); /* * Queue up a transport with data pending. 
If there are idle nfsd @@ -412,7 +412,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) { if (test_bit(XPT_BUSY, &xprt->xpt_flags)) return; - svc_xprt_do_enqueue(xprt); + xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt); } EXPORT_SYMBOL_GPL(svc_xprt_enqueue); diff --git a/kernel/net/sunrpc/svcsock.c b/kernel/net/sunrpc/svcsock.c index 0c8120229..1413cdcc1 100644 --- a/kernel/net/sunrpc/svcsock.c +++ b/kernel/net/sunrpc/svcsock.c @@ -181,7 +181,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr, struct page **ppage = xdr->pages; size_t base = xdr->page_base; unsigned int pglen = xdr->page_len; - unsigned int flags = MSG_MORE; + unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; int slen; int len = 0; @@ -399,6 +399,31 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp) return svc_port_is_privileged(svc_addr(rqstp)); } +static bool sunrpc_waitqueue_active(wait_queue_head_t *wq) +{ + if (!wq) + return false; + /* + * There should normally be a memory barrier here--see + * wq_has_sleeper(). + * + * It appears that isn't currently necessary, though, basically + * because callers all appear to have sufficient memory barriers + * between the time the relevant change is made and the + * time they call these callbacks. + * + * The nfsd code itself doesn't actually explicitly wait on + * these waitqueues, but it may wait on them for example in + * sendpage() or sendmsg() calls. (And those may be the only + * places, since it uses nonblocking reads.) + * + * Maybe we should add the memory barriers anyway, but these are + * hot paths so we'd need to be convinced there's no significant + * penalty. + */ + return waitqueue_active(wq); +} + /* * INET callback when data has been received on the socket.
*/ @@ -414,7 +439,7 @@ static void svc_udp_data_ready(struct sock *sk) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible(wq); } @@ -432,7 +457,7 @@ static void svc_write_space(struct sock *sk) svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) { + if (sunrpc_waitqueue_active(wq)) { dprintk("RPC svc_write_space: someone sleeping on %p\n", svsk); wake_up_interruptible(wq); @@ -787,7 +812,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk) } wq = sk_sleep(sk); - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible_all(wq); } @@ -808,7 +833,7 @@ static void svc_tcp_state_change(struct sock *sk) set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible_all(wq); } @@ -823,7 +848,7 @@ static void svc_tcp_data_ready(struct sock *sk) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible(wq); } @@ -1367,7 +1392,6 @@ EXPORT_SYMBOL_GPL(svc_sock_update_bufs); /* * Initialize socket for RPC use and create svc_sock struct - * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 
*/ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct socket *sock, @@ -1594,7 +1618,7 @@ static void svc_sock_detach(struct svc_xprt *xprt) sk->sk_write_space = svsk->sk_owspace; wq = sk_sleep(sk); - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible(wq); } diff --git a/kernel/net/sunrpc/sysctl.c b/kernel/net/sunrpc/sysctl.c index 887f0183b..c88d9bc06 100644 --- a/kernel/net/sunrpc/sysctl.c +++ b/kernel/net/sunrpc/sysctl.c @@ -76,7 +76,7 @@ static int proc_dodebug(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - char tmpbuf[20], c, *s; + char tmpbuf[20], c, *s = NULL; char __user *p; unsigned int value; size_t left, len; @@ -103,23 +103,24 @@ proc_dodebug(struct ctl_table *table, int write, return -EFAULT; tmpbuf[left] = '\0'; - for (s = tmpbuf, value = 0; '0' <= *s && *s <= '9'; s++, left--) - value = 10 * value + (*s - '0'); - if (*s && !isspace(*s)) - return -EINVAL; - while (left && isspace(*s)) - left--, s++; + value = simple_strtol(tmpbuf, &s, 0); + if (s) { + left -= (s - tmpbuf); + if (left && !isspace(*s)) + return -EINVAL; + while (left && isspace(*s)) + left--, s++; + } else + left = 0; *(unsigned int *) table->data = value; /* Display the RPC tasks on writing to rpc_debug */ if (strcmp(table->procname, "rpc_debug") == 0) rpc_show_tasks(&init_net); } else { - if (!access_ok(VERIFY_WRITE, buffer, left)) - return -EFAULT; - len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data); + len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if (__copy_to_user(buffer, tmpbuf, len)) + if (copy_to_user(buffer, tmpbuf, len)) return -EFAULT; if ((left -= len) > 0) { if (put_user('\n', (char __user *)buffer + len)) diff --git a/kernel/net/sunrpc/xprt.c b/kernel/net/sunrpc/xprt.c index d109d308e..2e98f4a24 100644 --- a/kernel/net/sunrpc/xprt.c +++ b/kernel/net/sunrpc/xprt.c @@ -68,6 +68,7 @@ static void xprt_init(struct rpc_xprt 
*xprt, struct net *net); static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); static void xprt_connect_status(struct rpc_task *task); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); +static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *); static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); @@ -250,6 +251,8 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) } xprt_clear_locked(xprt); out_sleep: + if (req) + __xprt_put_cong(xprt, req); dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); task->tk_timeout = 0; task->tk_status = -EAGAIN; @@ -608,8 +611,8 @@ static void xprt_autoclose(struct work_struct *work) struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); - xprt->ops->close(xprt); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + xprt->ops->close(xprt); xprt_release_write(xprt, NULL); wake_up_bit(&xprt->state, XPRT_LOCKED); } @@ -969,6 +972,7 @@ void xprt_transmit(struct rpc_task *task) task->tk_status = status; return; } + xprt_inject_disconnect(xprt); dprintk("RPC: %5u xmit complete\n", task->tk_pid); task->tk_flags |= RPC_TASK_SENT; @@ -1287,6 +1291,7 @@ void xprt_release(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(req->rq_buffer); + xprt_inject_disconnect(xprt); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); task->tk_rqstp = NULL; diff --git a/kernel/net/sunrpc/xprtrdma/Makefile b/kernel/net/sunrpc/xprtrdma/Makefile index 579f72bbc..33f99d300 100644 --- a/kernel/net/sunrpc/xprtrdma/Makefile +++ b/kernel/net/sunrpc/xprtrdma/Makefile @@ -1,9 +1,8 @@ -obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o -xprtrdma-y := transport.o rpc_rdma.o verbs.o \ - fmr_ops.o frwr_ops.o physical_ops.o - -obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o - -svcrdma-y := svc_rdma.o svc_rdma_transport.o \ - svc_rdma_marshal.o 
svc_rdma_sendto.o svc_rdma_recvfrom.o +rpcrdma-y := transport.o rpc_rdma.o verbs.o \ + fmr_ops.o frwr_ops.o physical_ops.o \ + svc_rdma.o svc_rdma_transport.o \ + svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ + module.o +rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/kernel/net/sunrpc/xprtrdma/backchannel.c b/kernel/net/sunrpc/xprtrdma/backchannel.c new file mode 100644 index 000000000..2dcb44f69 --- /dev/null +++ b/kernel/net/sunrpc/xprtrdma/backchannel.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA. + */ + +#include +#include +#include +#include + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +#define RPCRDMA_BACKCHANNEL_DEBUG + +static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + spin_lock(&buf->rb_reqslock); + list_del(&req->rl_all); + spin_unlock(&buf->rb_reqslock); + + rpcrdma_destroy_req(&r_xprt->rx_ia, req); + + kfree(rqst); +} + +static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_regbuf *rb; + struct rpcrdma_req *req; + struct xdr_buf *buf; + size_t size; + + req = rpcrdma_create_req(r_xprt); + if (!req) + return -ENOMEM; + req->rl_backchannel = true; + + size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); + rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); + if (IS_ERR(rb)) + goto out_fail; + req->rl_rdmabuf = rb; + + size += RPCRDMA_INLINE_READ_THRESHOLD(rqst); + rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); + if (IS_ERR(rb)) + goto out_fail; + rb->rg_owner = req; + req->rl_sendbuf = rb; + /* so that rpcr_to_rdmar works when receiving a request */ + rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; + + buf = &rqst->rq_snd_buf; + 
buf->head[0].iov_base = rqst->rq_buffer; + buf->head[0].iov_len = 0; + buf->tail[0].iov_base = NULL; + buf->tail[0].iov_len = 0; + buf->page_len = 0; + buf->len = 0; + buf->buflen = size; + + return 0; + +out_fail: + rpcrdma_bc_free_rqst(r_xprt, rqst); + return -ENOMEM; +} + +/* Allocate and add receive buffers to the rpcrdma_buffer's + * existing list of rep's. These are released when the + * transport is destroyed. + */ +static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, + unsigned int count) +{ + struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; + struct rpcrdma_rep *rep; + unsigned long flags; + int rc = 0; + + while (count--) { + rep = rpcrdma_create_rep(r_xprt); + if (IS_ERR(rep)) { + pr_err("RPC: %s: reply buffer alloc failed\n", + __func__); + rc = PTR_ERR(rep); + break; + } + + spin_lock_irqsave(&buffers->rb_lock, flags); + list_add(&rep->rr_list, &buffers->rb_recv_bufs); + spin_unlock_irqrestore(&buffers->rb_lock, flags); + } + + return rc; +} + +/** + * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests + * @xprt: transport associated with these backchannel resources + * @reqs: number of concurrent incoming requests to expect + * + * Returns 0 on success; otherwise a negative errno + */ +int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; + struct rpc_rqst *rqst; + unsigned int i; + int rc; + + /* The backchannel reply path returns each rpc_rqst to the + * bc_pa_list _after_ the reply is sent. If the server is + * faster than the client, it can send another backward + * direction request before the rpc_rqst is returned to the + * list. The client rejects the request in this case. + * + * Twice as many rpc_rqsts are prepared to ensure there is + * always an rpc_rqst available as soon as a reply is sent. 
+ */ + if (reqs > RPCRDMA_BACKWARD_WRS >> 1) + goto out_err; + + for (i = 0; i < (reqs << 1); i++) { + rqst = kzalloc(sizeof(*rqst), GFP_KERNEL); + if (!rqst) { + pr_err("RPC: %s: Failed to create bc rpc_rqst\n", + __func__); + goto out_free; + } + + rqst->rq_xprt = &r_xprt->rx_xprt; + INIT_LIST_HEAD(&rqst->rq_list); + INIT_LIST_HEAD(&rqst->rq_bc_list); + + if (rpcrdma_bc_setup_rqst(r_xprt, rqst)) + goto out_free; + + spin_lock_bh(&xprt->bc_pa_lock); + list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); + } + + rc = rpcrdma_bc_setup_reps(r_xprt, reqs); + if (rc) + goto out_free; + + rc = rpcrdma_ep_post_extra_recv(r_xprt, reqs); + if (rc) + goto out_free; + + buffer->rb_bc_srv_max_requests = reqs; + request_module("svcrdma"); + + return 0; + +out_free: + xprt_rdma_bc_destroy(xprt, reqs); + +out_err: + pr_err("RPC: %s: setup backchannel transport failed\n", __func__); + return -ENOMEM; +} + +/** + * xprt_rdma_bc_up - Create transport endpoint for backchannel service + * @serv: server endpoint + * @net: network namespace + * + * The "xprt" is an implied argument: it supplies the name of the + * backchannel transport class. + * + * Returns zero on success, negative errno on failure + */ +int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) +{ + int ret; + + ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0); + if (ret < 0) + return ret; + return 0; +} + +/** + * rpcrdma_bc_marshal_reply - Send backwards direction reply + * @rqst: buffer containing RPC reply data + * + * Returns zero on success. 
+ */ +int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_msg *headerp; + size_t rpclen; + + headerp = rdmab_to_msg(req->rl_rdmabuf); + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = + cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); + headerp->rm_type = rdma_msg; + headerp->rm_body.rm_chunks[0] = xdr_zero; + headerp->rm_body.rm_chunks[1] = xdr_zero; + headerp->rm_body.rm_chunks[2] = xdr_zero; + + rpclen = rqst->rq_svec[0].iov_len; + + pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n", + __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf)); + pr_info("RPC: %s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, headerp); + pr_info("RPC: %s: RPC: %*ph\n", + __func__, (int)rpclen, rqst->rq_svec[0].iov_base); + + req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); + req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; + req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); + + req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); + req->rl_send_iov[1].length = rpclen; + req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); + + req->rl_niovs = 2; + return 0; +} + +/** + * xprt_rdma_bc_destroy - Release resources for handling backchannel requests + * @xprt: transport associated with these backchannel resources + * @reqs: number of incoming requests to destroy; ignored + */ +void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpc_rqst *rqst, *tmp; + + spin_lock_bh(&xprt->bc_pa_lock); + list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { + list_del(&rqst->rq_bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); + + rpcrdma_bc_free_rqst(r_xprt, rqst); + + spin_lock_bh(&xprt->bc_pa_lock); + } + spin_unlock_bh(&xprt->bc_pa_lock); +} + +/** + * 
xprt_rdma_bc_free_rqst - Release a backchannel rqst + * @rqst: request to release + */ +void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + + smp_mb__before_atomic(); + WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); + clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); + smp_mb__after_atomic(); + + spin_lock_bh(&xprt->bc_pa_lock); + list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); +} + +/** + * rpcrdma_bc_receive_call - Handle a backward direction call + * @r_xprt: transport receiving the call + * @rep: receive buffer containing the call + * + * Called in the RPC reply handler, which runs in a tasklet. + * Be quick about it. + * + * Operational assumptions: + * o Backchannel credits are ignored, just as the NFS server + * forechannel currently does + * o The ULP manages a replay cache (eg, NFSv4.1 sessions). + * No replay detection is done at the transport level + */ +void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_rep *rep) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_msg *headerp; + struct svc_serv *bc_serv; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + struct xdr_buf *buf; + size_t size; + __be32 *p; + + headerp = rdmab_to_msg(rep->rr_rdmabuf); +#ifdef RPCRDMA_BACKCHANNEL_DEBUG + pr_info("RPC: %s: callback XID %08x, length=%u\n", + __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); + pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp); +#endif + + /* Sanity check: + * Need at least enough bytes for RPC/RDMA header, as code + * here references the header fields by array offset. Also, + * backward calls are always inline, so ensure there + * are some bytes beyond the RPC/RDMA header.
+ */ + if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24) + goto out_short; + p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); + size = rep->rr_len - RPCRDMA_HDRLEN_MIN; + + /* Grab a free bc rqst */ + spin_lock(&xprt->bc_pa_lock); + if (list_empty(&xprt->bc_pa_list)) { + spin_unlock(&xprt->bc_pa_lock); + goto out_overflow; + } + rqst = list_first_entry(&xprt->bc_pa_list, + struct rpc_rqst, rq_bc_pa_list); + list_del(&rqst->rq_bc_pa_list); + spin_unlock(&xprt->bc_pa_lock); +#ifdef RPCRDMA_BACKCHANNEL_DEBUG + pr_info("RPC: %s: using rqst %p\n", __func__, rqst); +#endif + + /* Prepare rqst */ + rqst->rq_reply_bytes_recvd = 0; + rqst->rq_bytes_sent = 0; + rqst->rq_xid = headerp->rm_xid; + set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); + + buf = &rqst->rq_rcv_buf; + memset(buf, 0, sizeof(*buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = size; + buf->len = size; + + /* The receive buffer has to be hooked to the rpcrdma_req + * so that it can be reposted after the server is done + * parsing it but just before sending the backward + * direction reply. + */ + req = rpcr_to_rdmar(rqst); +#ifdef RPCRDMA_BACKCHANNEL_DEBUG + pr_info("RPC: %s: attaching rep %p to req %p\n", + __func__, rep, req); +#endif + req->rl_reply = rep; + + /* Defeat the retransmit detection logic in send_request */ + req->rl_connect_cookie = 0; + + /* Queue rqst for ULP's callback service */ + bc_serv = xprt->bc_serv; + spin_lock(&bc_serv->sv_cb_lock); + list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list); + spin_unlock(&bc_serv->sv_cb_lock); + + wake_up(&bc_serv->sv_cb_waitq); + + r_xprt->rx_stats.bcall_count++; + return; + +out_overflow: + pr_warn("RPC/RDMA backchannel overflow\n"); + xprt_disconnect_done(xprt); + /* This receive buffer gets reposted automatically + * when the connection is re-established. 
+ */ + return; + +out_short: + pr_warn("RPC/RDMA short backward direction call\n"); + + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + xprt_disconnect_done(xprt); + else + pr_warn("RPC: %s: reposting rep %p\n", + __func__, rep); +} diff --git a/kernel/net/sunrpc/xprtrdma/fmr_ops.c b/kernel/net/sunrpc/xprtrdma/fmr_ops.c index 302d4ebf6..f1e8dafbd 100644 --- a/kernel/net/sunrpc/xprtrdma/fmr_ops.c +++ b/kernel/net/sunrpc/xprtrdma/fmr_ops.c @@ -11,6 +11,21 @@ * can take tens of usecs to complete. */ +/* Normal operation + * + * A Memory Region is prepared for RDMA READ or WRITE using the + * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is + * finished, the Memory Region is unmapped using the ib_unmap_fmr + * verb (fmr_op_unmap). + */ + +/* Transport recovery + * + * After a transport reconnect, fmr_op_map re-uses the MR already + * allocated for the RPC, but generates a fresh rkey then maps the + * MR again. This process is synchronous. + */ + #include "xprt_rdma.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -50,19 +65,28 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt) struct rpcrdma_mw *r; int i, rc; + spin_lock_init(&buf->rb_mwlock); INIT_LIST_HEAD(&buf->rb_mws); INIT_LIST_HEAD(&buf->rb_all); - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initializing %d FMRs\n", __func__, i); + i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1); + i += 2; /* head + tail */ + i *= buf->rb_max_requests; /* one set for each RPC slot */ + dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); + rc = -ENOMEM; while (i--) { r = kzalloc(sizeof(*r), GFP_KERNEL); if (!r) - return -ENOMEM; + goto out; - r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->r.fmr)) + r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * + sizeof(u64), GFP_KERNEL); + if (!r->r.fmr.physaddrs) + goto out_free; + + r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr.fmr)) goto out_fmr_err; 
list_add(&r->mw_list, &buf->rb_mws); @@ -71,12 +95,24 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt) return 0; out_fmr_err: - rc = PTR_ERR(r->r.fmr); + rc = PTR_ERR(r->r.fmr.fmr); dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); + kfree(r->r.fmr.physaddrs); +out_free: kfree(r); +out: return rc; } +static int +__fmr_unmap(struct rpcrdma_mw *r) +{ + LIST_HEAD(l); + + list_add(&r->r.fmr.fmr->list, &l); + return ib_unmap_fmr(&l); +} + /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ @@ -85,12 +121,24 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct ib_device *device = ia->ri_id->device; + struct ib_device *device = ia->ri_device; enum dma_data_direction direction = rpcrdma_data_dir(writing); struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; int len, pageoff, i, rc; + struct rpcrdma_mw *mw; + + mw = seg1->rl_mw; + seg1->rl_mw = NULL; + if (!mw) { + mw = rpcrdma_get_mw(r_xprt); + if (!mw) + return -ENOMEM; + } else { + /* this is a retransmit; generate a fresh rkey */ + rc = __fmr_unmap(mw); + if (rc) + return rc; + } pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ @@ -100,7 +148,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, nsegs = RPCRDMA_MAX_FMR_SGES; for (i = 0; i < nsegs;) { rpcrdma_map_one(device, seg, direction); - physaddrs[i] = seg->mr_dma; + mw->r.fmr.physaddrs[i] = seg->mr_dma; len += seg->mr_len; ++seg; ++i; @@ -110,11 +158,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, break; } - rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma); + rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs, + i, seg1->mr_dma); if (rc) goto out_maperr; - seg1->mr_rkey = mw->r.fmr->rkey; + seg1->rl_mw = mw; + seg1->mr_rkey = 
mw->r.fmr.fmr->rkey; seg1->mr_base = seg1->mr_dma + pageoff; seg1->mr_nsegs = i; seg1->mr_len = len; @@ -137,48 +187,28 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mr_seg *seg1 = seg; - struct ib_device *device; + struct rpcrdma_mw *mw = seg1->rl_mw; int rc, nsegs = seg->mr_nsegs; - LIST_HEAD(l); - list_add(&seg1->rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - read_lock(&ia->ri_qplock); - device = ia->ri_id->device; + dprintk("RPC: %s: FMR %p\n", __func__, mw); + + seg1->rl_mw = NULL; while (seg1->mr_nsegs--) - rpcrdma_unmap_one(device, seg++); - read_unlock(&ia->ri_qplock); + rpcrdma_unmap_one(ia->ri_device, seg++); + rc = __fmr_unmap(mw); if (rc) goto out_err; + rpcrdma_put_mw(r_xprt, mw); return nsegs; out_err: + /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy + * will attempt to release it when the transport is destroyed. + */ dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); return nsegs; } -/* After a disconnect, unmap all FMRs. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_fmr_external(). 
- */ -static void -fmr_op_reset(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_mw *r; - LIST_HEAD(list); - int rc; - - list_for_each_entry(r, &buf->rb_all, mw_all) - list_add(&r->r.fmr->list, &list); - - rc = ib_unmap_fmr(&list); - if (rc) - dprintk("RPC: %s: ib_unmap_fmr failed %i\n", - __func__, rc); -} - static void fmr_op_destroy(struct rpcrdma_buffer *buf) { @@ -188,10 +218,13 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) while (!list_empty(&buf->rb_all)) { r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); list_del(&r->mw_all); - rc = ib_dealloc_fmr(r->r.fmr); + kfree(r->r.fmr.physaddrs); + + rc = ib_dealloc_fmr(r->r.fmr.fmr); if (rc) dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", __func__, rc); + kfree(r); } } @@ -202,7 +235,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init = fmr_op_init, - .ro_reset = fmr_op_reset, .ro_destroy = fmr_op_destroy, .ro_displayname = "fmr", }; diff --git a/kernel/net/sunrpc/xprtrdma/frwr_ops.c b/kernel/net/sunrpc/xprtrdma/frwr_ops.c index dff0481db..88cf9e726 100644 --- a/kernel/net/sunrpc/xprtrdma/frwr_ops.c +++ b/kernel/net/sunrpc/xprtrdma/frwr_ops.c @@ -11,12 +11,136 @@ * but most complex memory registration mode. */ +/* Normal operation + * + * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG + * Work Request (frwr_op_map). When the RDMA operation is finished, this + * Memory Region is invalidated using a LOCAL_INV Work Request + * (frwr_op_unmap). + * + * Typically these Work Requests are not signaled, and neither are RDMA + * SEND Work Requests (with the exception of signaling occasionally to + * prevent provider work queue overflows). This greatly reduces HCA + * interrupt workload. + * + * As an optimization, frwr_op_unmap marks MRs INVALID before the + * LOCAL_INV WR is posted.
If posting succeeds, the MR is placed on + * rb_mws immediately so that no work (like managing a linked list + * under a spinlock) is needed in the completion upcall. + * + * But this means that frwr_op_map() can occasionally encounter an MR + * that is INVALID but the LOCAL_INV WR has not completed. Work Queue + * ordering prevents a subsequent FAST_REG WR from executing against + * that MR while it is still being invalidated. + */ + +/* Transport recovery + * + * ->op_map and the transport connect worker cannot run at the same + * time, but ->op_unmap can fire while the transport connect worker + * is running. Thus MR recovery is handled in ->op_map, to guarantee + * that recovered MRs are owned by a sending RPC, and not one where + * ->op_unmap could fire at the same time transport reconnect is + * being done. + * + * When the underlying transport disconnects, MRs are left in one of + * three states: + * + * INVALID: The MR was not in use before the QP entered ERROR state. + * (Or, the LOCAL_INV WR has not completed or flushed yet). + * + * STALE: The MR was being registered or unregistered when the QP + * entered ERROR state, and the pending WR was flushed. + * + * VALID: The MR was registered before the QP entered ERROR state. + * + * When frwr_op_map encounters STALE and VALID MRs, they are recovered + * with ib_dereg_mr and then are re-initialized. Because MR recovery + * allocates fresh resources, it is deferred to a workqueue, and the + * recovered MRs are placed back on the rb_mws list when recovery is + * complete. frwr_op_map allocates another MR for the current RPC while + * the broken MR is reset. + * + * To ensure that frwr_op_map doesn't encounter an MR that is marked + * INVALID but that is about to be flushed due to a previous transport + * disconnect, the transport connect worker attempts to drain all + * pending send queue WRs before the transport is reconnected.
+ */ + #include "xprt_rdma.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static struct workqueue_struct *frwr_recovery_wq; + +#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) + +int +frwr_alloc_recovery_wq(void) +{ + frwr_recovery_wq = alloc_workqueue("frwr_recovery", + FRWR_RECOVERY_WQ_FLAGS, 0); + return !frwr_recovery_wq ? -ENOMEM : 0; +} + +void +frwr_destroy_recovery_wq(void) +{ + struct workqueue_struct *wq; + + if (!frwr_recovery_wq) + return; + + wq = frwr_recovery_wq; + frwr_recovery_wq = NULL; + destroy_workqueue(wq); +} + +/* Deferred reset of a single FRMR. Generate a fresh rkey by + * replacing the MR. + * + * There's no recovery if this fails. The FRMR is abandoned, but + * remains in rb_all. It will be cleaned up when the transport is + * destroyed. + */ +static void +__frwr_recovery_worker(struct work_struct *work) +{ + struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, + r.frmr.fr_work); + struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + + if (ib_dereg_mr(r->r.frmr.fr_mr)) + goto out_fail; + + r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); + if (IS_ERR(r->r.frmr.fr_mr)) + goto out_fail; + + dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); + r->r.frmr.fr_state = FRMR_IS_INVALID; + rpcrdma_put_mw(r_xprt, r); + return; + +out_fail: + pr_warn("RPC: %s: FRMR %p unrecovered\n", + __func__, r); +} + +/* A broken MR was discovered in a context that can't sleep. + * Defer recovery to the recovery worker. 
+ */ +static void +__frwr_queue_recovery(struct rpcrdma_mw *r) +{ + INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker); + queue_work(frwr_recovery_wq, &r->r.frmr.fr_work); +} + static int __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, unsigned int depth) @@ -24,24 +148,28 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, struct rpcrdma_frmr *f = &r->r.frmr; int rc; - f->fr_mr = ib_alloc_fast_reg_mr(pd, depth); + f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); if (IS_ERR(f->fr_mr)) goto out_mr_err; - f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); - if (IS_ERR(f->fr_pgl)) + + f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL); + if (!f->sg) goto out_list_err; + + sg_init_table(f->sg, depth); + return 0; out_mr_err: rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n", + dprintk("RPC: %s: ib_alloc_mr status %i\n", __func__, rc); return rc; out_list_err: - rc = PTR_ERR(f->fr_pgl); - dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", - __func__, rc); + rc = -ENOMEM; + dprintk("RPC: %s: sg allocation failure\n", + __func__); ib_dereg_mr(f->fr_mr); return rc; } @@ -55,7 +183,7 @@ __frwr_release(struct rpcrdma_mw *r) if (rc) dprintk("RPC: %s: ib_dereg_mr status %i\n", __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + kfree(r->r.frmr.sg); } static int @@ -128,8 +256,11 @@ frwr_sendcompletion(struct ib_wc *wc) /* WARNING: Only wr_id and status are reliable at this point */ r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - dprintk("RPC: %s: frmr %p (stale), status %d\n", - __func__, r, wc->status); + if (wc->status == IB_WC_WR_FLUSH_ERR) + dprintk("RPC: %s: frmr %p flushed\n", __func__, r); + else + pr_warn("RPC: %s: frmr %p error, status %s (%d)\n", + __func__, r, ib_wc_status_msg(wc->status), wc->status); r->r.frmr.fr_state = FRMR_IS_STALE; } @@ -137,16 +268,19 @@ static int frwr_op_init(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer 
*buf = &r_xprt->rx_buf; - struct ib_device *device = r_xprt->rx_ia.ri_id->device; + struct ib_device *device = r_xprt->rx_ia.ri_device; unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; struct ib_pd *pd = r_xprt->rx_ia.ri_pd; int i; + spin_lock_init(&buf->rb_mwlock); INIT_LIST_HEAD(&buf->rb_mws); INIT_LIST_HEAD(&buf->rb_all); - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i); + i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1); + i += 2; /* head + tail */ + i *= buf->rb_max_requests; /* one set for each RPC slot */ + dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); while (i--) { struct rpcrdma_mw *r; @@ -165,6 +299,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt) list_add(&r->mw_list, &buf->rb_mws); list_add(&r->mw_all, &buf->rb_all); r->mw_sendcompletion = frwr_sendcompletion; + r->r.frmr.fr_xprt = r_xprt; } return 0; @@ -178,78 +313,103 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct ib_device *device = ia->ri_id->device; + struct ib_device *device = ia->ri_device; enum dma_data_direction direction = rpcrdma_data_dir(writing); struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_mr *mr = frmr->fr_mr; - struct ib_send_wr fastreg_wr, *bad_wr; + struct rpcrdma_mw *mw; + struct rpcrdma_frmr *frmr; + struct ib_mr *mr; + struct ib_reg_wr reg_wr; + struct ib_send_wr *bad_wr; + int rc, i, n, dma_nents; u8 key; - int len, pageoff; - int i, rc; - int seg_len; - u64 pa; - int page_no; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; + + mw = seg1->rl_mw; + seg1->rl_mw = NULL; + do { + if (mw) + __frwr_queue_recovery(mw); + mw = rpcrdma_get_mw(r_xprt); + if (!mw) + return -ENOMEM; + } while (mw->r.frmr.fr_state != FRMR_IS_INVALID); + frmr = 
&mw->r.frmr; + frmr->fr_state = FRMR_IS_VALID; + mr = frmr->fr_mr; + if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; - for (page_no = i = 0; i < nsegs;) { - rpcrdma_map_one(device, seg, direction); - pa = seg->mr_dma; - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - frmr->fr_pgl->page_list[page_no++] = pa; - pa += PAGE_SIZE; - } - len += seg->mr_len; + + for (i = 0; i < nsegs;) { + if (seg->mr_page) + sg_set_page(&frmr->sg[i], + seg->mr_page, + seg->mr_len, + offset_in_page(seg->mr_offset)); + else + sg_set_buf(&frmr->sg[i], seg->mr_offset, + seg->mr_len); + ++seg; ++i; + /* Check for holes */ if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", - __func__, mw, i, len); + frmr->sg_nents = i; - frmr->fr_state = FRMR_IS_VALID; + dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction); + if (!dma_nents) { + pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", + __func__, frmr->sg, frmr->sg_nents); + return -ENOMEM; + } + + n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE); + if (unlikely(n != frmr->sg_nents)) { + pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", + __func__, frmr->fr_mr, n, frmr->sg_nents); + rc = n < 0 ? n : -EINVAL; + goto out_senderr; + } + + dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", + __func__, mw, frmr->sg_nents, mr->length); - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = (unsigned long)(void *)mw; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.page_list_len = page_no; - fastreg_wr.wr.fast_reg.length = len; - fastreg_wr.wr.fast_reg.access_flags = writing ? 
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ; key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); - fastreg_wr.wr.fast_reg.rkey = mr->rkey; + + reg_wr.wr.next = NULL; + reg_wr.wr.opcode = IB_WR_REG_MR; + reg_wr.wr.wr_id = (uintptr_t)mw; + reg_wr.wr.num_sge = 0; + reg_wr.wr.send_flags = 0; + reg_wr.mr = mr; + reg_wr.key = mr->rkey; + reg_wr.access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, &reg_wr.wr, &bad_wr); if (rc) goto out_senderr; + seg1->mr_dir = direction; + seg1->rl_mw = mw; seg1->mr_rkey = mr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - return i; + seg1->mr_base = mr->iova; + seg1->mr_nsegs = frmr->sg_nents; + seg1->mr_len = mr->length; + + return frmr->sg_nents; out_senderr: dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); - ib_update_fast_reg_key(mr, --key); - frmr->fr_state = FRMR_IS_INVALID; - while (i--) - rpcrdma_unmap_one(device, --seg); + ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction); + __frwr_queue_recovery(mw); return rc; } @@ -261,78 +421,46 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) { struct rpcrdma_mr_seg *seg1 = seg; struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mw *mw = seg1->rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; struct ib_send_wr invalidate_wr, *bad_wr; int rc, nsegs = seg->mr_nsegs; - struct ib_device *device; - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; + dprintk("RPC: %s: FRMR %p\n", __func__, mw); + + seg1->rl_mw = NULL; + frmr->fr_state = FRMR_IS_INVALID; memset(&invalidate_wr, 0, sizeof(invalidate_wr)); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; + invalidate_wr.wr_id = (unsigned long)(void *)mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey =
seg1->rl_mw->r.frmr.fr_mr->rkey; + invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); + ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); read_lock(&ia->ri_qplock); - device = ia->ri_id->device; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(device, seg++); rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); if (rc) goto out_err; + + rpcrdma_put_mw(r_xprt, mw); return nsegs; out_err: - /* Force rpcrdma_buffer_get() to retry */ - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + __frwr_queue_recovery(mw); return nsegs; } -/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in - * an unusable state. Find FRMRs in this state and dereg / reg - * each. FRMRs that are VALID and attached to an rpcrdma_req are - * also torn down. - * - * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_frmr_external(). - */ -static void -frwr_op_reset(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct ib_device *device = r_xprt->rx_ia.ri_id->device; - unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - struct rpcrdma_mw *r; - int rc; - - list_for_each_entry(r, &buf->rb_all, mw_all) { - if (r->r.frmr.fr_state == FRMR_IS_INVALID) - continue; - - __frwr_release(r); - rc = __frwr_init(r, pd, device, depth); - if (rc) { - dprintk("RPC: %s: mw %p left %s\n", - __func__, r, - (r->r.frmr.fr_state == FRMR_IS_STALE ? 
- "stale" : "valid")); - continue; - } - - r->r.frmr.fr_state = FRMR_IS_INVALID; - } -} - static void frwr_op_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_mw *r; + /* Ensure stale MWs for "buf" are no longer in flight */ + flush_workqueue(frwr_recovery_wq); + while (!list_empty(&buf->rb_all)) { r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); list_del(&r->mw_all); @@ -347,7 +475,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init = frwr_op_init, - .ro_reset = frwr_op_reset, .ro_destroy = frwr_op_destroy, .ro_displayname = "frwr", }; diff --git a/kernel/net/sunrpc/xprtrdma/module.c b/kernel/net/sunrpc/xprtrdma/module.c new file mode 100644 index 000000000..560712bd9 --- /dev/null +++ b/kernel/net/sunrpc/xprtrdma/module.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + */ + +/* rpcrdma.ko module initialization + */ + +#include +#include +#include +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); +MODULE_DESCRIPTION("RPC/RDMA Transport"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("svcrdma"); +MODULE_ALIAS("xprtrdma"); + +static void __exit rpc_rdma_cleanup(void) +{ + xprt_rdma_cleanup(); + svc_rdma_cleanup(); +} + +static int __init rpc_rdma_init(void) +{ + int rc; + + rc = svc_rdma_init(); + if (rc) + goto out; + + rc = xprt_rdma_init(); + if (rc) + svc_rdma_cleanup(); + +out: + return rc; +} + +module_init(rpc_rdma_init); +module_exit(rpc_rdma_cleanup); diff --git a/kernel/net/sunrpc/xprtrdma/physical_ops.c b/kernel/net/sunrpc/xprtrdma/physical_ops.c index ba518af16..617b76f22 100644 --- a/kernel/net/sunrpc/xprtrdma/physical_ops.c +++ b/kernel/net/sunrpc/xprtrdma/physical_ops.c @@ -23,6 +23,21 @@ static int physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal 
*cdata) { + struct ib_mr *mr; + + /* Obtain an rkey to use for RPC data payloads. + */ + mr = ib_get_dma_mr(ia->ri_pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (IS_ERR(mr)) { + pr_err("%s: ib_get_dma_mr for failed with %lX\n", + __func__, PTR_ERR(mr)); + return -ENOMEM; + } + + ia->ri_dma_mr = mr; return 0; } @@ -50,9 +65,8 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - rpcrdma_map_one(ia->ri_id->device, seg, - rpcrdma_data_dir(writing)); - seg->mr_rkey = ia->ri_bind_mem->rkey; + rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); + seg->mr_rkey = ia->ri_dma_mr->rkey; seg->mr_base = seg->mr_dma; seg->mr_nsegs = 1; return 1; @@ -65,18 +79,10 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - read_lock(&ia->ri_qplock); - rpcrdma_unmap_one(ia->ri_id->device, seg); - read_unlock(&ia->ri_qplock); - + rpcrdma_unmap_one(ia->ri_device, seg); return 1; } -static void -physical_op_reset(struct rpcrdma_xprt *r_xprt) -{ -} - static void physical_op_destroy(struct rpcrdma_buffer *buf) { @@ -88,7 +94,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, .ro_init = physical_op_init, - .ro_reset = physical_op_reset, .ro_destroy = physical_op_destroy, .ro_displayname = "physical", }; diff --git a/kernel/net/sunrpc/xprtrdma/rpc_rdma.c b/kernel/net/sunrpc/xprtrdma/rpc_rdma.c index 2c53ea9e1..c10d96994 100644 --- a/kernel/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/kernel/net/sunrpc/xprtrdma/rpc_rdma.c @@ -71,6 +71,67 @@ static const char transfertypes[][12] = { }; #endif +/* The client can send a request inline as long as the RPCRDMA header + * plus the RPC call fit under the transport's inline limit. 
If the + * combined call message size exceeds that limit, the client must use + * the read chunk list for this operation. + */ +static bool rpcrdma_args_inline(struct rpc_rqst *rqst) +{ + unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len; + + return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); +} + +/* The client can't know how large the actual reply will be. Thus it + * plans for the largest possible reply for that particular ULP + * operation. If the maximum combined reply message size exceeds that + * limit, the client must provide a write list or a reply chunk for + * this request. + */ +static bool rpcrdma_results_inline(struct rpc_rqst *rqst) +{ + unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen; + + return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst); +} + +static int +rpcrdma_tail_pullup(struct xdr_buf *buf) +{ + size_t tlen = buf->tail[0].iov_len; + size_t skip = tlen & 3; + + /* Do not include the tail if it is only an XDR pad */ + if (tlen < 4) + return 0; + + /* xdr_write_pages() adds a pad at the beginning of the tail + * if the content in "buf->pages" is unaligned. Force the + * tail's actual content to land at the next XDR position + * after the head instead. + */ + if (skip) { + unsigned char *src, *dst; + unsigned int count; + + src = buf->tail[0].iov_base; + dst = buf->head[0].iov_base; + dst += buf->head[0].iov_len; + + src += skip; + tlen -= skip; + + dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n", + __func__, skip, dst, src, tlen); + + for (count = tlen; count; count--) + *dst++ = *src++; + } + + return tlen; +} + /* * Chunk assembly from upper layer xdr_buf. 
* @@ -122,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, if (len && n == nsegs) return -EIO; + /* When encoding the read list, the tail is always sent inline */ + if (type == rpcrdma_readch) + return n; + if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing * xdr pad bytes, saving the server an RDMA operation. */ @@ -284,9 +349,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) - return n; - for (pos = 0; nchunks--;) pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, &req->rl_segments[pos]); @@ -300,8 +362,7 @@ out: * pre-registered memory buffer for this request. For small amounts * of data, this is efficient. The cutoff value is tunable. */ -static int -rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) +static void rpcrdma_inline_pullup(struct rpc_rqst *rqst) { int i, npages, curlen; int copy_len; @@ -313,16 +374,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) destp = rqst->rq_svec[0].iov_base; curlen = rqst->rq_svec[0].iov_len; destp += curlen; - /* - * Do optional padding where it makes sense. Alignment of write - * payload can help the server, if our setting is accurate. 
- */ - pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/); - if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH) - pad = 0; /* don't pad this request */ - dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n", - __func__, pad, destp, rqst->rq_slen, curlen); + dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n", + __func__, destp, rqst->rq_slen, curlen); copy_len = rqst->rq_snd_buf.page_len; @@ -358,7 +412,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) page_base = 0; } /* header now contains entire send message */ - return pad; } /* @@ -383,11 +436,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); char *base; - size_t rpclen, padlen; + size_t rpclen; ssize_t hdrlen; enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) + return rpcrdma_bc_marshal_reply(rqst); +#endif + /* * rpclen gets amount of data in first buffer, which is the * pre-registered buffer. @@ -405,28 +463,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* * Chunks needed for results? * + * o Read ops return data as write chunk(s), header as inline. * o If the expected result is under the inline threshold, all ops - * return as inline (but see later). + * return as inline. * o Large non-read ops return as a single reply chunk. - * o Large read ops return data as write chunk(s), header as inline. - * - * Note: the NFS code sending down multiple result segments implies - * the op is one of read, readdir[plus], readlink or NFSv4 getacl. - */ - - /* - * This code can handle read chunks, write chunks OR reply - * chunks -- only one type. If the request is too big to fit - * inline, then we will choose read chunks. If the request is - * a READ, then use write chunks to separate the file data - * into pages; otherwise use reply chunks. 
*/ - if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - wtype = rpcrdma_noch; - else if (rqst->rq_rcv_buf.page_len == 0) - wtype = rpcrdma_replych; - else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + if (rqst->rq_rcv_buf.flags & XDRBUF_READ) wtype = rpcrdma_writech; + else if (rpcrdma_results_inline(rqst)) + wtype = rpcrdma_noch; else wtype = rpcrdma_replych; @@ -435,21 +480,25 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * * o If the total request is under the inline threshold, all ops * are sent as inline. - * o Large non-write ops are sent with the entire message as a - * single read chunk (protocol 0-position special case). * o Large write ops transmit data as read chunk(s), header as * inline. + * o Large non-write ops are sent with the entire message as a + * single read chunk (protocol 0-position special case). * - * Note: the NFS code sending down multiple argument segments - * implies the op is a write. - * TBD check NFSv4 setacl + * This assumes that the upper layer does not present a request + * that both has a data payload, and whose non-data arguments + * by themselves are larger than the inline threshold. */ - if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + if (rpcrdma_args_inline(rqst)) { rtype = rpcrdma_noch; - else if (rqst->rq_snd_buf.page_len == 0) - rtype = rpcrdma_areadch; - else + } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; + } else { + r_xprt->rx_stats.nomsg_call_count++; + headerp->rm_type = htonl(RDMA_NOMSG); + rtype = rpcrdma_areadch; + rpclen = 0; + } /* The following simplification is not true forever */ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) @@ -461,7 +510,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) } hdrlen = RPCRDMA_HDRLEN_MIN; - padlen = 0; /* * Pull up any extra send data into the preregistered buffer. 
@@ -470,45 +518,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rtype == rpcrdma_noch) { - padlen = rpcrdma_inline_pullup(rqst, - RPCRDMA_INLINE_PAD_VALUE(rqst)); - - if (padlen) { - headerp->rm_type = rdma_msgp; - headerp->rm_body.rm_padded.rm_align = - cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst)); - headerp->rm_body.rm_padded.rm_thresh = - cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH); - headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; - headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; - headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; - hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (wtype != rpcrdma_noch) { - dprintk("RPC: %s: invalid chunk list\n", - __func__); - return -EIO; - } - } else { - headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; - /* new length after pullup */ - rpclen = rqst->rq_svec[0].iov_len; - /* - * Currently we try to not actually use read inline. - * Reply chunks have the desirable property that - * they land, packed, directly in the target buffers - * without headers, so they require no fixup. The - * additional RDMA Write op sends the same amount - * of data, streams on-the-wire and adds no overhead - * on receive. Therefore, we request a reply chunk - * for non-writes wherever feasible and efficient. 
- */ - if (wtype == rpcrdma_noch) - wtype = rpcrdma_replych; - } - } + rpcrdma_inline_pullup(rqst); + headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; + /* new length after pullup */ + rpclen = rqst->rq_svec[0].iov_len; + } else if (rtype == rpcrdma_readch) + rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); if (rtype != rpcrdma_noch) { hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, headerp, rtype); @@ -521,9 +539,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen < 0) return hdrlen; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* @@ -537,26 +555,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) req->rl_send_iov[0].length = hdrlen; req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); + req->rl_niovs = 1; + if (rtype == rpcrdma_areadch) + return 0; + req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); req->rl_send_iov[1].length = rpclen; req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); req->rl_niovs = 2; - - if (padlen) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - - req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf); - req->rl_send_iov[2].length = padlen; - req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf); - - req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; - req->rl_send_iov[3].length = rqst->rq_slen - rpclen; - req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf); - - req->rl_niovs = 4; - } - return 0; } @@ -709,6 +716,37 @@ rpcrdma_connect_worker(struct work_struct *work) spin_unlock_bh(&xprt->transport_lock); } +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the 
chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +rpcrdma_is_bcall(struct rpcrdma_msg *headerp) +{ + __be32 *p = (__be32 *)headerp; + + if (headerp->rm_type != rdma_msg) + return false; + if (headerp->rm_body.rm_chunks[0] != xdr_zero) + return false; + if (headerp->rm_body.rm_chunks[1] != xdr_zero) + return false; + if (headerp->rm_body.rm_chunks[2] != xdr_zero) + return false; + + /* sanity */ + if (p[7] != headerp->rm_xid) + return false; + /* call direction */ + if (p[8] != cpu_to_be32(RPC_CALL)) + return false; + + return true; +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + /* * This function is called when an async event is posted to * the connection which changes the connection state. All it @@ -721,8 +759,8 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) schedule_delayed_work(&ep->rep_connect_worker, 0); } -/* - * Called as a tasklet to do req/reply match and complete a request +/* Process received RPC/RDMA messages. + * * Errors must result in the RPC task either being awakened, or * allowed to timeout, to discover the errors at that time. */ @@ -732,60 +770,39 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) struct rpcrdma_msg *headerp; struct rpcrdma_req *req; struct rpc_rqst *rqst; - struct rpc_xprt *xprt = rep->rr_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; __be32 *iptr; int rdmalen, status; unsigned long cwnd; u32 credits; - /* Check status. 
If bad, signal disconnect and return rep to pool */ - if (rep->rr_len == ~0U) { - rpcrdma_recv_buffer_put(rep); - if (r_xprt->rx_ep.rep_connected == 1) { - r_xprt->rx_ep.rep_connected = -EIO; - rpcrdma_conn_func(&r_xprt->rx_ep); - } - return; - } - if (rep->rr_len < RPCRDMA_HDRLEN_MIN) { - dprintk("RPC: %s: short/invalid reply\n", __func__); - goto repost; - } + dprintk("RPC: %s: incoming rep %p\n", __func__, rep); + + if (rep->rr_len == RPCRDMA_BAD_LEN) + goto out_badstatus; + if (rep->rr_len < RPCRDMA_HDRLEN_MIN) + goto out_shortreply; + headerp = rdmab_to_msg(rep->rr_rdmabuf); - if (headerp->rm_vers != rpcrdma_version) { - dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(headerp->rm_vers)); - goto repost; - } + if (headerp->rm_vers != rpcrdma_version) + goto out_badversion; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (rpcrdma_is_bcall(headerp)) + goto out_bcall; +#endif - /* Get XID and try for a match. */ - spin_lock(&xprt->transport_lock); + /* Match incoming rpcrdma_rep to an rpcrdma_req to + * get context for handling any incoming chunks. 
+ */ + spin_lock_bh(&xprt->transport_lock); rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); - if (rqst == NULL) { - spin_unlock(&xprt->transport_lock); - dprintk("RPC: %s: reply 0x%p failed " - "to match any request xid 0x%08x len %d\n", - __func__, rep, be32_to_cpu(headerp->rm_xid), - rep->rr_len); -repost: - r_xprt->rx_stats.bad_reply_count++; - rep->rr_func = rpcrdma_reply_handler; - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) - rpcrdma_recv_buffer_put(rep); - - return; - } + if (!rqst) + goto out_nomatch; - /* get request object */ req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - spin_unlock(&xprt->transport_lock); - dprintk("RPC: %s: duplicate reply 0x%p to RPC " - "request 0x%p: xid 0x%08x\n", __func__, rep, req, - be32_to_cpu(headerp->rm_xid)); - goto repost; - } + if (req->rl_reply) + goto out_duplicate; dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" " RPC request 0x%p xid 0x%08x\n", @@ -882,8 +899,50 @@ badheader: if (xprt->cwnd > cwnd) xprt_release_rqst_cong(rqst->rq_task); + xprt_complete_rqst(rqst->rq_task, status); + spin_unlock_bh(&xprt->transport_lock); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); - xprt_complete_rqst(rqst->rq_task, status); - spin_unlock(&xprt->transport_lock); + return; + +out_badstatus: + rpcrdma_recv_buffer_put(rep); + if (r_xprt->rx_ep.rep_connected == 1) { + r_xprt->rx_ep.rep_connected = -EIO; + rpcrdma_conn_func(&r_xprt->rx_ep); + } + return; + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +out_bcall: + rpcrdma_bc_receive_call(r_xprt, rep); + return; +#endif + +out_shortreply: + dprintk("RPC: %s: short/invalid reply\n", __func__); + goto repost; + +out_badversion: + dprintk("RPC: %s: invalid version %d\n", + __func__, be32_to_cpu(headerp->rm_vers)); + goto repost; + +out_nomatch: + spin_unlock_bh(&xprt->transport_lock); + dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n", + __func__, be32_to_cpu(headerp->rm_xid), + rep->rr_len); + goto repost; + 
+out_duplicate: + spin_unlock_bh(&xprt->transport_lock); + dprintk("RPC: %s: " + "duplicate reply %p to RPC request %p: xid 0x%08x\n", + __func__, rep, req, be32_to_cpu(headerp->rm_xid)); + +repost: + r_xprt->rx_stats.bad_reply_count++; + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + rpcrdma_recv_buffer_put(rep); } diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma.c b/kernel/net/sunrpc/xprtrdma/svc_rdma.c index c1b627026..1b7051bdb 100644 --- a/kernel/net/sunrpc/xprtrdma/svc_rdma.c +++ b/kernel/net/sunrpc/xprtrdma/svc_rdma.c @@ -38,8 +38,7 @@ * * Author: Tom Tucker */ -#include -#include + #include #include #include @@ -240,6 +239,9 @@ void svc_rdma_cleanup(void) unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; } +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + svc_unreg_xprt_class(&svc_rdma_bc_class); +#endif svc_unreg_xprt_class(&svc_rdma_class); kmem_cache_destroy(svc_rdma_map_cachep); kmem_cache_destroy(svc_rdma_ctxt_cachep); @@ -287,6 +289,9 @@ int svc_rdma_init(void) /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + svc_reg_xprt_class(&svc_rdma_bc_class); +#endif return 0; err1: kmem_cache_destroy(svc_rdma_map_cachep); @@ -295,8 +300,3 @@ int svc_rdma_init(void) destroy_workqueue(svc_rdma_wq); return -ENOMEM; } -MODULE_AUTHOR("Tom Tucker "); -MODULE_DESCRIPTION("SVC RDMA Transport"); -MODULE_LICENSE("Dual BSD/GPL"); -module_init(svc_rdma_init); -module_exit(svc_rdma_cleanup); diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c index b681855cf..e2fca7617 100644 --- a/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -50,12 +50,12 @@ /* * Decodes a read chunk list. 
The expected format is as follows: * descrim : xdr_one - * position : u32 offset into XDR stream - * handle : u32 RKEY + * position : __be32 offset into XDR stream + * handle : __be32 RKEY * . . . * end-of-list: xdr_zero */ -static u32 *decode_read_list(u32 *va, u32 *vaend) +static __be32 *decode_read_list(__be32 *va, __be32 *vaend) { struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; @@ -67,20 +67,20 @@ static u32 *decode_read_list(u32 *va, u32 *vaend) } ch++; } - return (u32 *)&ch->rc_position; + return &ch->rc_position; } /* * Decodes a write chunk list. The expected format is as follows: * descrim : xdr_one * nchunks : - * handle : u32 RKEY ---+ - * length : u32 | + * handle : __be32 RKEY ---+ + * length : __be32 | * offset : remove va + * . . . | * ---+ */ -static u32 *decode_write_list(u32 *va, u32 *vaend) +static __be32 *decode_write_list(__be32 *va, __be32 *vaend) { unsigned long start, end; int nchunks; @@ -90,14 +90,14 @@ static u32 *decode_write_list(u32 *va, u32 *vaend) /* Check for not write-array */ if (ary->wc_discrim == xdr_zero) - return (u32 *)&ary->wc_nchunks; + return &ary->wc_nchunks; if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > (unsigned long)vaend) { dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); return NULL; } - nchunks = ntohl(ary->wc_nchunks); + nchunks = be32_to_cpu(ary->wc_nchunks); start = (unsigned long)&ary->wc_array[0]; end = (unsigned long)vaend; @@ -112,10 +112,10 @@ static u32 *decode_write_list(u32 *va, u32 *vaend) * rs_length is the 2nd 4B field in wc_target and taking its * address skips the list terminator */ - return (u32 *)&ary->wc_array[nchunks].wc_target.rs_length; + return &ary->wc_array[nchunks].wc_target.rs_length; } -static u32 *decode_reply_array(u32 *va, u32 *vaend) +static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) { unsigned long start, end; int nchunks; @@ -124,14 +124,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend) /* Check for no reply-array */ if 
(ary->wc_discrim == xdr_zero) - return (u32 *)&ary->wc_nchunks; + return &ary->wc_nchunks; if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > (unsigned long)vaend) { dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); return NULL; } - nchunks = ntohl(ary->wc_nchunks); + nchunks = be32_to_cpu(ary->wc_nchunks); start = (unsigned long)&ary->wc_array[0]; end = (unsigned long)vaend; @@ -142,15 +142,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend) ary, nchunks, vaend); return NULL; } - return (u32 *)&ary->wc_array[nchunks]; + return (__be32 *)&ary->wc_array[nchunks]; } int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, struct svc_rqst *rqstp) { struct rpcrdma_msg *rmsgp = NULL; - u32 *va; - u32 *vaend; + __be32 *va, *vaend; u32 hdr_len; rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; @@ -162,22 +161,17 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, return -EINVAL; } - /* Decode the header */ - rmsgp->rm_xid = ntohl(rmsgp->rm_xid); - rmsgp->rm_vers = ntohl(rmsgp->rm_vers); - rmsgp->rm_credit = ntohl(rmsgp->rm_credit); - rmsgp->rm_type = ntohl(rmsgp->rm_type); - - if (rmsgp->rm_vers != RPCRDMA_VERSION) + if (rmsgp->rm_vers != rpcrdma_version) return -ENOSYS; /* Pull in the extra for the padded case and bump our pointer */ - if (rmsgp->rm_type == RDMA_MSGP) { + if (rmsgp->rm_type == rdma_msgp) { int hdrlen; + rmsgp->rm_body.rm_padded.rm_align = - ntohl(rmsgp->rm_body.rm_padded.rm_align); + be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align); rmsgp->rm_body.rm_padded.rm_thresh = - ntohl(rmsgp->rm_body.rm_padded.rm_thresh); + be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh); va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; rqstp->rq_arg.head[0].iov_base = va; @@ -192,7 +186,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, * chunk list and a reply chunk list. 
*/ va = &rmsgp->rm_body.rm_chunks[0]; - vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); + vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); va = decode_read_list(va, vaend); if (!va) return -EINVAL; @@ -211,76 +205,20 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, return hdr_len; } -int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp) -{ - struct rpcrdma_msg *rmsgp = NULL; - struct rpcrdma_read_chunk *ch; - struct rpcrdma_write_array *ary; - u32 *va; - u32 hdrlen; - - dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n", - rqstp); - rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; - - /* Pull in the extra for the padded case and bump our pointer */ - if (rmsgp->rm_type == RDMA_MSGP) { - va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; - rqstp->rq_arg.head[0].iov_base = va; - hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); - rqstp->rq_arg.head[0].iov_len -= hdrlen; - return hdrlen; - } - - /* - * Skip all chunks to find RPC msg. 
These were previously processed - */ - va = &rmsgp->rm_body.rm_chunks[0]; - - /* Skip read-list */ - for (ch = (struct rpcrdma_read_chunk *)va; - ch->rc_discrim != xdr_zero; ch++); - va = (u32 *)&ch->rc_position; - - /* Skip write-list */ - ary = (struct rpcrdma_write_array *)va; - if (ary->wc_discrim == xdr_zero) - va = (u32 *)&ary->wc_nchunks; - else - /* - * rs_length is the 2nd 4B field in wc_target and taking its - * address skips the list terminator - */ - va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length; - - /* Skip reply-array */ - ary = (struct rpcrdma_write_array *)va; - if (ary->wc_discrim == xdr_zero) - va = (u32 *)&ary->wc_nchunks; - else - va = (u32 *)&ary->wc_array[ary->wc_nchunks]; - - rqstp->rq_arg.head[0].iov_base = va; - hdrlen = (unsigned long)va - (unsigned long)rmsgp; - rqstp->rq_arg.head[0].iov_len -= hdrlen; - - return hdrlen; -} - int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err, u32 *va) + enum rpcrdma_errcode err, __be32 *va) { - u32 *startp = va; + __be32 *startp = va; - *va++ = htonl(rmsgp->rm_xid); - *va++ = htonl(rmsgp->rm_vers); - *va++ = htonl(xprt->sc_max_requests); - *va++ = htonl(RDMA_ERROR); - *va++ = htonl(err); + *va++ = rmsgp->rm_xid; + *va++ = rmsgp->rm_vers; + *va++ = cpu_to_be32(xprt->sc_max_requests); + *va++ = rdma_error; + *va++ = cpu_to_be32(err); if (err == ERR_VERS) { - *va++ = htonl(RPCRDMA_VERSION); - *va++ = htonl(RPCRDMA_VERSION); + *va++ = rpcrdma_version; + *va++ = rpcrdma_version; } return (int)((unsigned long)va - (unsigned long)startp); @@ -297,7 +235,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) &rmsgp->rm_body.rm_chunks[1]; if (wr_ary->wc_discrim) wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]. + &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]. 
wc_target.rs_length; else wr_ary = (struct rpcrdma_write_array *) @@ -306,7 +244,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) /* skip reply array */ if (wr_ary->wc_discrim) wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]; + &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]; else wr_ary = (struct rpcrdma_write_array *) &wr_ary->wc_nchunks; @@ -325,7 +263,7 @@ void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) ary = (struct rpcrdma_write_array *) &rmsgp->rm_body.rm_chunks[1]; ary->wc_discrim = xdr_one; - ary->wc_nchunks = htonl(chunks); + ary->wc_nchunks = cpu_to_be32(chunks); /* write-list terminator */ ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; @@ -338,7 +276,7 @@ void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, int chunks) { ary->wc_discrim = xdr_one; - ary->wc_nchunks = htonl(chunks); + ary->wc_nchunks = cpu_to_be32(chunks); } void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, @@ -350,7 +288,7 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; seg->rs_handle = rs_handle; seg->rs_offset = rs_offset; - seg->rs_length = htonl(write_len); + seg->rs_length = cpu_to_be32(write_len); } void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, @@ -358,10 +296,10 @@ void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rdma_resp, enum rpcrdma_proc rdma_type) { - rdma_resp->rm_xid = htonl(rdma_argp->rm_xid); - rdma_resp->rm_vers = htonl(rdma_argp->rm_vers); - rdma_resp->rm_credit = htonl(xprt->sc_max_requests); - rdma_resp->rm_type = htonl(rdma_type); + rdma_resp->rm_xid = rdma_argp->rm_xid; + rdma_resp->rm_vers = rdma_argp->rm_vers; + rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests); + rdma_resp->rm_type = cpu_to_be32(rdma_type); /* Encode chunks lists */ rdma_resp->rm_body.rm_chunks[0] = xdr_zero; diff 
--git a/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index f9f13a32d..ff4f01e52 100644 --- a/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -85,7 +85,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; - if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG) + if (rmsgp->rm_type == rdma_nomsg) rqstp->rq_arg.pages = &rqstp->rq_pages[0]; else rqstp->rq_arg.pages = &rqstp->rq_pages[1]; @@ -115,15 +115,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) -{ - if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == - RDMA_TRANSPORT_IWARP) - return 1; - else - return min_t(int, sge_count, xprt->sc_max_sge); -} - /* Issue an RDMA_READ using the local lkey to map the data sink */ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, @@ -135,7 +126,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, u64 rs_offset, bool last) { - struct ib_send_wr read_wr; + struct ib_rdma_wr read_wr; int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); int ret, read, pno; @@ -144,9 +135,9 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, ctxt->direction = DMA_FROM_DEVICE; ctxt->read_hdr = head; - pages_needed = - min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed)); - read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd); + read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset, + rs_length); for (pno = 0; pno < pages_needed; pno++) { int len = min_t(int, rs_length, PAGE_SIZE - pg_off); @@ -189,16 +180,16 @@ int rdma_read_chunk_lcl(struct 
svcxprt_rdma *xprt, clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); memset(&read_wr, 0, sizeof(read_wr)); - read_wr.wr_id = (unsigned long)ctxt; - read_wr.opcode = IB_WR_RDMA_READ; - ctxt->wr_op = read_wr.opcode; - read_wr.send_flags = IB_SEND_SIGNALED; - read_wr.wr.rdma.rkey = rs_handle; - read_wr.wr.rdma.remote_addr = rs_offset; - read_wr.sg_list = ctxt->sge; - read_wr.num_sge = pages_needed; - - ret = svc_rdma_send(xprt, &read_wr); + read_wr.wr.wr_id = (unsigned long)ctxt; + read_wr.wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.wr.opcode; + read_wr.wr.send_flags = IB_SEND_SIGNALED; + read_wr.rkey = rs_handle; + read_wr.remote_addr = rs_offset; + read_wr.wr.sg_list = ctxt->sge; + read_wr.wr.num_sge = pages_needed; + + ret = svc_rdma_send(xprt, &read_wr.wr); if (ret) { pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); @@ -228,14 +219,14 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, u64 rs_offset, bool last) { - struct ib_send_wr read_wr; + struct ib_rdma_wr read_wr; struct ib_send_wr inv_wr; - struct ib_send_wr fastreg_wr; + struct ib_reg_wr reg_wr; u8 key; - int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + int nents = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt); - int ret, read, pno; + int ret, read, pno, dma_nents, n; u32 pg_off = *page_offset; u32 pg_no = *page_no; @@ -244,16 +235,14 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, ctxt->direction = DMA_FROM_DEVICE; ctxt->frmr = frmr; - pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len); - read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + nents = min_t(unsigned int, nents, xprt->sc_frmr_pg_list_len); + read = min_t(int, (nents << PAGE_SHIFT) - *page_offset, rs_length); - frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]); frmr->direction = DMA_FROM_DEVICE; 
frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); - frmr->map_len = pages_needed << PAGE_SHIFT; - frmr->page_list_len = pages_needed; + frmr->sg_nents = nents; - for (pno = 0; pno < pages_needed; pno++) { + for (pno = 0; pno < nents; pno++) { int len = min_t(int, rs_length, PAGE_SIZE - pg_off); head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; @@ -261,17 +250,12 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, head->arg.len += len; if (!pg_off) head->count++; + + sg_set_page(&frmr->sg[pno], rqstp->rq_arg.pages[pg_no], + len, pg_off); + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; rqstp->rq_next_page = rqstp->rq_respages + 1; - frmr->page_list->page_list[pno] = - ib_dma_map_page(xprt->sc_cm_id->device, - head->arg.pages[pg_no], 0, - PAGE_SIZE, DMA_FROM_DEVICE); - ret = ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[pno]); - if (ret) - goto err; - atomic_inc(&xprt->sc_dma_used); /* adjust offset and wrap to next page if needed */ pg_off += len; @@ -287,43 +271,57 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, else clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + dma_nents = ib_dma_map_sg(xprt->sc_cm_id->device, + frmr->sg, frmr->sg_nents, + frmr->direction); + if (!dma_nents) { + pr_err("svcrdma: failed to dma map sg %p\n", + frmr->sg); + return -ENOMEM; + } + atomic_inc(&xprt->sc_dma_used); + + n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE); + if (unlikely(n != frmr->sg_nents)) { + pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n", + frmr->mr, n, frmr->sg_nents); + return n < 0 ? 
n : -EINVAL; + } + /* Bump the key */ key = (u8)(frmr->mr->lkey & 0x000000FF); ib_update_fast_reg_key(frmr->mr, ++key); - ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset; + ctxt->sge[0].addr = frmr->mr->iova; ctxt->sge[0].lkey = frmr->mr->lkey; - ctxt->sge[0].length = read; + ctxt->sge[0].length = frmr->mr->length; ctxt->count = 1; ctxt->read_hdr = head; - /* Prepare FASTREG WR */ - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.send_flags = IB_SEND_SIGNALED; - fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; - fastreg_wr.wr.fast_reg.page_list = frmr->page_list; - fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = frmr->map_len; - fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; - fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; - fastreg_wr.next = &read_wr; + /* Prepare REG WR */ + reg_wr.wr.opcode = IB_WR_REG_MR; + reg_wr.wr.wr_id = 0; + reg_wr.wr.send_flags = IB_SEND_SIGNALED; + reg_wr.wr.num_sge = 0; + reg_wr.mr = frmr->mr; + reg_wr.key = frmr->mr->lkey; + reg_wr.access = frmr->access_flags; + reg_wr.wr.next = &read_wr.wr; /* Prepare RDMA_READ */ memset(&read_wr, 0, sizeof(read_wr)); - read_wr.send_flags = IB_SEND_SIGNALED; - read_wr.wr.rdma.rkey = rs_handle; - read_wr.wr.rdma.remote_addr = rs_offset; - read_wr.sg_list = ctxt->sge; - read_wr.num_sge = 1; + read_wr.wr.send_flags = IB_SEND_SIGNALED; + read_wr.rkey = rs_handle; + read_wr.remote_addr = rs_offset; + read_wr.wr.sg_list = ctxt->sge; + read_wr.wr.num_sge = 1; if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { - read_wr.opcode = IB_WR_RDMA_READ_WITH_INV; - read_wr.wr_id = (unsigned long)ctxt; - read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; + read_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; + read_wr.wr.wr_id = (unsigned long)ctxt; + read_wr.wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; } else { - read_wr.opcode = 
IB_WR_RDMA_READ; - read_wr.next = &inv_wr; + read_wr.wr.opcode = IB_WR_RDMA_READ; + read_wr.wr.next = &inv_wr; /* Prepare invalidate */ memset(&inv_wr, 0, sizeof(inv_wr)); inv_wr.wr_id = (unsigned long)ctxt; @@ -331,10 +329,10 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE; inv_wr.ex.invalidate_rkey = frmr->mr->lkey; } - ctxt->wr_op = read_wr.opcode; + ctxt->wr_op = read_wr.wr.opcode; /* Post the chain */ - ret = svc_rdma_send(xprt, &fastreg_wr); + ret = svc_rdma_send(xprt, &reg_wr.wr); if (ret) { pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); @@ -348,7 +346,8 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, atomic_inc(&rdma_stat_read); return ret; err: - svc_rdma_unmap_dma(ctxt); + ib_dma_unmap_sg(xprt->sc_cm_id->device, + frmr->sg, frmr->sg_nents, frmr->direction); svc_rdma_put_context(ctxt, 0); svc_rdma_put_frmr(xprt, frmr); return ret; @@ -541,7 +540,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.page_base = head->arg.page_base; /* rq_respages starts after the last arg page */ - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + rqstp->rq_respages = &rqstp->rq_pages[page_no]; rqstp->rq_next_page = rqstp->rq_respages + 1; /* Rebuild rq_arg head and tail.
*/ diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 7de33d1af..969a1ab75 100644 --- a/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -136,6 +136,79 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, return dma_addr; } +/* Returns the address of the first read chunk or <NULL> if no read chunk + * is present + */ +struct rpcrdma_read_chunk * +svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *ch = + (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + + if (ch->rc_discrim == xdr_zero) + return NULL; + return ch; +} + +/* Returns the address of the first read write array element or <NULL> + * if no write array list is present + */ +static struct rpcrdma_write_array * +svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) +{ + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || + rmsgp->rm_body.rm_chunks[1] == xdr_zero) + return NULL; + return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; +} + +/* Returns the address of the first reply array element or <NULL> if no + * reply array is present + */ +static struct rpcrdma_write_array * +svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *rch; + struct rpcrdma_write_array *wr_ary; + struct rpcrdma_write_array *rp_ary; + + /* XXX: Need to fix when reply chunk may occur with read list + * and/or write list. + */ + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || + rmsgp->rm_body.rm_chunks[1] != xdr_zero) + return NULL; + + rch = svc_rdma_get_read_chunk(rmsgp); + if (rch) { + while (rch->rc_discrim != xdr_zero) + rch++; + + /* The reply chunk follows an empty write array located + * at 'rc_position' here. The reply array is at rc_target.
+ */ + rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; + goto found_it; + } + + wr_ary = svc_rdma_get_write_array(rmsgp); + if (wr_ary) { + int chunk = be32_to_cpu(wr_ary->wc_nchunks); + + rp_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[chunk].wc_target.rs_length; + goto found_it; + } + + /* No read list, no write list */ + rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2]; + + found_it: + if (rp_ary->wc_discrim == xdr_zero) + return NULL; + return rp_ary; +} + /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ @@ -144,7 +217,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, u32 xdr_off, int write_len, struct svc_rdma_req_map *vec) { - struct ib_send_wr write_wr; + struct ib_rdma_wr write_wr; struct ib_sge *sge; int xdr_sge_no; int sge_no; @@ -209,17 +282,17 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, /* Prepare WRITE WR */ memset(&write_wr, 0, sizeof write_wr); ctxt->wr_op = IB_WR_RDMA_WRITE; - write_wr.wr_id = (unsigned long)ctxt; - write_wr.sg_list = &sge[0]; - write_wr.num_sge = sge_no; - write_wr.opcode = IB_WR_RDMA_WRITE; - write_wr.send_flags = IB_SEND_SIGNALED; - write_wr.wr.rdma.rkey = rmr; - write_wr.wr.rdma.remote_addr = to; + write_wr.wr.wr_id = (unsigned long)ctxt; + write_wr.wr.sg_list = &sge[0]; + write_wr.wr.num_sge = sge_no; + write_wr.wr.opcode = IB_WR_RDMA_WRITE; + write_wr.wr.send_flags = IB_SEND_SIGNALED; + write_wr.rkey = rmr; + write_wr.remote_addr = to; /* Post It */ atomic_inc(&rdma_stat_write); - if (svc_rdma_send(xprt, &write_wr)) + if (svc_rdma_send(xprt, &write_wr.wr)) goto err; return write_len - bc; err: @@ -240,6 +313,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, u32 xdr_off; int chunk_off; int chunk_no; + int nchunks; struct rpcrdma_write_array *arg_ary; struct rpcrdma_write_array *res_ary; int ret; @@ -251,14 +325,15 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, 
&rdma_resp->rm_body.rm_chunks[1]; /* Write chunks start at the pagelist */ + nchunks = be32_to_cpu(arg_ary->wc_nchunks); for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; - xfer_len && chunk_no < arg_ary->wc_nchunks; + xfer_len && chunk_no < nchunks; chunk_no++) { struct rpcrdma_segment *arg_ch; u64 rs_offset; arg_ch = &arg_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, ntohl(arg_ch->rs_length)); + write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length)); /* Prepare the response chunk given the length actually * written */ @@ -270,7 +345,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, chunk_off = 0; while (write_len) { ret = send_write(xprt, rqstp, - ntohl(arg_ch->rs_handle), + be32_to_cpu(arg_ch->rs_handle), rs_offset + chunk_off, xdr_off, write_len, @@ -318,13 +393,13 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, &rdma_resp->rm_body.rm_chunks[2]; /* xdr offset starts at RPC message */ - nchunks = ntohl(arg_ary->wc_nchunks); + nchunks = be32_to_cpu(arg_ary->wc_nchunks); for (xdr_off = 0, chunk_no = 0; xfer_len && chunk_no < nchunks; chunk_no++) { u64 rs_offset; ch = &arg_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, htonl(ch->rs_length)); + write_len = min(xfer_len, be32_to_cpu(ch->rs_length)); /* Prepare the reply chunk given the length actually * written */ @@ -335,7 +410,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, chunk_off = 0; while (write_len) { ret = send_write(xprt, rqstp, - ntohl(ch->rs_handle), + be32_to_cpu(ch->rs_handle), rs_offset + chunk_off, xdr_off, write_len, @@ -382,6 +457,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int byte_count) { struct ib_send_wr send_wr; + u32 xdr_off; int sge_no; int sge_bytes; int page_no; @@ -416,8 +492,8 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->direction = DMA_TO_DEVICE; /* Map the payload indicated by 'byte_count' */ + xdr_off = 0; for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { - int 
xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; ctxt->sge[sge_no].addr = @@ -455,6 +531,13 @@ static int send_reply(struct svcxprt_rdma *rdma, } rqstp->rq_next_page = rqstp->rq_respages + 1; + /* The loop above bumps sc_dma_used for each sge. The + * xdr_buf.tail gets a separate sge, but resides in the + * same page as xdr_buf.head. Don't count it twice. + */ + if (sge_no > ctxt->count) + atomic_dec(&rdma->sc_dma_used); + if (sge_no > rdma->sc_max_sge) { pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err; @@ -515,7 +598,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ - res_page = svc_rdma_get_page(); + res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); rdma_resp = page_address(res_page); reply_ary = svc_rdma_get_reply_array(rdma_argp); if (reply_ary) diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c index f609c1c2d..b348b4ade 100644 --- a/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -56,6 +56,7 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int); static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -95,16 +96,69 @@ struct svc_xprt_class svc_rdma_class = { .xcl_ident = XPRT_TRANSPORT_RDMA, }; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *, + struct sockaddr *, int, int); +static void svc_rdma_bc_detach(struct svc_xprt *); +static void svc_rdma_bc_free(struct svc_xprt *); + +static struct svc_xprt_ops svc_rdma_bc_ops = { + .xpo_create = svc_rdma_bc_create, + .xpo_detach = svc_rdma_bc_detach, + .xpo_free = svc_rdma_bc_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_secure_port = svc_rdma_secure_port, +}; + +struct 
svc_xprt_class svc_rdma_bc_class = { + .xcl_name = "rdma-bc", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_bc_ops, + .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN) +}; + +static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + struct svcxprt_rdma *cma_xprt; + struct svc_xprt *xprt; + + cma_xprt = rdma_create_xprt(serv, 0); + if (!cma_xprt) + return ERR_PTR(-ENOMEM); + xprt = &cma_xprt->sc_xprt; + + svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv); + serv->sv_bc_xprt = xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, xprt); + return xprt; +} + +static void svc_rdma_bc_detach(struct svc_xprt *xprt) +{ + dprintk("svcrdma: %s(%p)\n", __func__, xprt); +} + +static void svc_rdma_bc_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + dprintk("svcrdma: %s(%p)\n", __func__, xprt); + if (xprt) + kfree(rdma); +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt; - while (1) { - ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL); - if (ctxt) - break; - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - } + ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, + GFP_KERNEL | __GFP_NOFAIL); ctxt->xprt = xprt; INIT_LIST_HEAD(&ctxt->dto_q); ctxt->count = 0; @@ -156,12 +210,8 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) struct svc_rdma_req_map *svc_rdma_get_req_map(void) { struct svc_rdma_req_map *map; - while (1) { - map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL); - if (map) - break; - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - } + map = kmem_cache_alloc(svc_rdma_map_cachep, + GFP_KERNEL | __GFP_NOFAIL); map->count = 0; return map; } @@ -175,8 +225,8 @@ void svc_rdma_put_req_map(struct svc_rdma_req_map *map) static void cq_event_handler(struct ib_event *event, void 
*context) { struct svc_xprt *xprt = context; - dprintk("svcrdma: received CQ event id=%d, context=%p\n", - event->event, context); + dprintk("svcrdma: received CQ event %s (%d), context=%p\n", + ib_event_msg(event->event), event->event, context); set_bit(XPT_CLOSE, &xprt->xpt_flags); } @@ -191,8 +241,9 @@ static void qp_event_handler(struct ib_event *event, void *context) case IB_EVENT_COMM_EST: case IB_EVENT_SQ_DRAINED: case IB_EVENT_QP_LAST_WQE_REACHED: - dprintk("svcrdma: QP event %d received for QP=%p\n", - event->event, event->element.qp); + dprintk("svcrdma: QP event %s (%d) received for QP=%p\n", + ib_event_msg(event->event), event->event, + event->element.qp); break; /* These are considered fatal events */ case IB_EVENT_PATH_MIG_ERR: @@ -201,9 +252,10 @@ static void qp_event_handler(struct ib_event *event, void *context) case IB_EVENT_QP_ACCESS_ERR: case IB_EVENT_DEVICE_FATAL: default: - dprintk("svcrdma: QP ERROR event %d received for QP=%p, " + dprintk("svcrdma: QP ERROR event %s (%d) received for QP=%p, " "closing transport\n", - event->event, event->element.qp); + ib_event_msg(event->event), event->event, + event->element.qp); set_bit(XPT_CLOSE, &xprt->xpt_flags); break; } @@ -402,7 +454,8 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) for (i = 0; i < ret; i++) { wc = &wc_a[i]; if (wc->status != IB_WC_SUCCESS) { - dprintk("svcrdma: sq wc err status %d\n", + dprintk("svcrdma: sq wc err status %s (%d)\n", + ib_wc_status_msg(wc->status), wc->status); /* Close the transport */ @@ -490,18 +543,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return cma_xprt; } -struct page *svc_rdma_get_page(void) -{ - struct page *page; - - while ((page = alloc_page(GFP_KERNEL)) == NULL) { - /* If we can't get memory, wait a bit and try again */ - printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n"); - schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); - } - return page; -} - int svc_rdma_post_recv(struct svcxprt_rdma *xprt) { 
struct ib_recv_wr recv_wr, *bad_recv_wr; @@ -520,7 +561,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err_put_ctxt; } - page = svc_rdma_get_page(); + page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); ctxt->pages[sge_no] = page; pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, @@ -616,7 +657,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " - "event=%d\n", cma_id, cma_id->context, event->event); + "event = %s (%d)\n", cma_id, cma_id->context, + rdma_event_msg(event->event), event->event); handle_connect_req(cma_id, event->param.conn.initiator_depth); break; @@ -636,7 +678,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, default: dprintk("svcrdma: Unexpected event on listening endpoint %p, " - "event=%d\n", cma_id, event->event); + "event = %s (%d)\n", cma_id, + rdma_event_msg(event->event), event->event); break; } @@ -669,15 +712,18 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, break; case RDMA_CM_EVENT_DEVICE_REMOVAL: dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " - "event=%d\n", cma_id, xprt, event->event); + "event = %s (%d)\n", cma_id, xprt, + rdma_event_msg(event->event), event->event); if (xprt) { set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } break; default: dprintk("svcrdma: Unexpected event on DTO endpoint %p, " - "event=%d\n", cma_id, event->event); + "event = %s (%d)\n", cma_id, + rdma_event_msg(event->event), event->event); break; } return 0; @@ -704,8 +750,8 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, if (!cma_xprt) return ERR_PTR(-ENOMEM); - listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP, - IB_QPT_RC); + listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(listen_id)) { ret = 
PTR_ERR(listen_id); dprintk("svcrdma: rdma_create_id failed = %d\n", ret); @@ -744,24 +790,27 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) { struct ib_mr *mr; - struct ib_fast_reg_page_list *pl; + struct scatterlist *sg; struct svc_rdma_fastreg_mr *frmr; + u32 num_sg; frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); if (!frmr) goto err; - mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); + num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len); + mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg); if (IS_ERR(mr)) goto err_free_frmr; - pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, - RPCSVC_MAXPAGES); - if (IS_ERR(pl)) + sg = kcalloc(RPCSVC_MAXPAGES, sizeof(*sg), GFP_KERNEL); + if (!sg) goto err_free_mr; + sg_init_table(sg, RPCSVC_MAXPAGES); + frmr->mr = mr; - frmr->page_list = pl; + frmr->sg = sg; INIT_LIST_HEAD(&frmr->frmr_list); return frmr; @@ -781,8 +830,8 @@ static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) frmr = list_entry(xprt->sc_frmr_q.next, struct svc_rdma_fastreg_mr, frmr_list); list_del_init(&frmr->frmr_list); + kfree(frmr->sg); ib_dereg_mr(frmr->mr); - ib_free_fast_reg_page_list(frmr->page_list); kfree(frmr); } } @@ -796,8 +845,7 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) frmr = list_entry(rdma->sc_frmr_q.next, struct svc_rdma_fastreg_mr, frmr_list); list_del_init(&frmr->frmr_list); - frmr->map_len = 0; - frmr->page_list_len = 0; + frmr->sg_nents = 0; } spin_unlock_bh(&rdma->sc_frmr_q_lock); if (frmr) @@ -806,25 +854,13 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) return rdma_alloc_frmr(rdma); } -static void frmr_unmap_dma(struct svcxprt_rdma *xprt, - struct svc_rdma_fastreg_mr *frmr) -{ - int page_no; - for (page_no = 0; page_no < frmr->page_list_len; page_no++) { - dma_addr_t addr = frmr->page_list->page_list[page_no]; - if 
(ib_dma_mapping_error(frmr->mr->device, addr)) - continue; - atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE, - frmr->direction); - } -} - void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, struct svc_rdma_fastreg_mr *frmr) { if (frmr) { - frmr_unmap_dma(rdma, frmr); + ib_dma_unmap_sg(rdma->sc_cm_id->device, + frmr->sg, frmr->sg_nents, frmr->direction); + atomic_dec(&rdma->sc_dma_used); spin_lock_bh(&rdma->sc_frmr_q_lock); WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); list_add(&frmr->frmr_list, &rdma->sc_frmr_q); @@ -848,10 +884,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; + struct ib_cq_init_attr cq_attr = {}; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; int uninitialized_var(dma_mr_acc); - int need_dma_mr; + int need_dma_mr = 0; int ret; int i; @@ -884,6 +921,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * capabilities of this particular device */ newxprt->sc_max_sge = min((size_t)devattr.max_sge, (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd, + RPCSVC_MAXPAGES); newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, (size_t)svcrdma_max_requests); newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; @@ -900,22 +939,22 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating PD for connect request\n"); goto errout; } + cq_attr.cqe = newxprt->sc_sq_depth; newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, sq_comp_handler, cq_event_handler, newxprt, - newxprt->sc_sq_depth, - 0); + &cq_attr); if (IS_ERR(newxprt->sc_sq_cq)) { dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } + cq_attr.cqe = newxprt->sc_max_requests; newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, rq_comp_handler, cq_event_handler, newxprt, - 
newxprt->sc_max_requests, - 0); + &cq_attr); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); goto errout; @@ -985,35 +1024,26 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* * Determine if a DMA MR is required and if so, what privs are required */ - switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) { - case RDMA_TRANSPORT_IWARP: - newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { - need_dma_mr = 1; - dma_mr_acc = - (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE); - } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - } else - need_dma_mr = 0; - break; - case RDMA_TRANSPORT_IB: - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - } else if (!(devattr.device_cap_flags & - IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - } else - need_dma_mr = 0; - break; - default: + if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device, + newxprt->sc_cm_id->port_num) && + !rdma_ib_or_roce(newxprt->sc_cm_id->device, + newxprt->sc_cm_id->port_num)) goto errout; + + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) || + !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, + newxprt->sc_cm_id->port_num) && + !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) + dma_mr_acc |= IB_ACCESS_REMOTE_WRITE; } + if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, + newxprt->sc_cm_id->port_num)) + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; + /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ if (need_dma_mr) { /* Register all of physical memory */ @@ -1067,6 +1097,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) " 
remote_ip : %pI4\n" " remote_port : %d\n" " max_sge : %d\n" + " max_sge_rd : %d\n" " sq_depth : %d\n" " max_requests : %d\n" " ord : %d\n", @@ -1080,6 +1111,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> route.addr.dst_addr)->sin_port), newxprt->sc_max_sge, + newxprt->sc_max_sge_rd, newxprt->sc_sq_depth, newxprt->sc_max_requests, newxprt->sc_ord); @@ -1222,40 +1254,6 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp) return 1; } -/* - * Attempt to register the kvec representing the RPC memory with the - * device. - * - * Returns: - * NULL : The device does not support fastreg or there were no more - * fastreg mr. - * frmr : The kvec register request was successfully posted. - * <0 : An error was encountered attempting to register the kvec. - */ -int svc_rdma_fastreg(struct svcxprt_rdma *xprt, - struct svc_rdma_fastreg_mr *frmr) -{ - struct ib_send_wr fastreg_wr; - u8 key; - - /* Bump the key */ - key = (u8)(frmr->mr->lkey & 0x000000FF); - ib_update_fast_reg_key(frmr->mr, ++key); - - /* Prepare FASTREG WR */ - memset(&fastreg_wr, 0, sizeof fastreg_wr); - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.send_flags = IB_SEND_SIGNALED; - fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; - fastreg_wr.wr.fast_reg.page_list = frmr->page_list; - fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = frmr->map_len; - fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; - fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; - return svc_rdma_send(xprt, &fastreg_wr); -} - int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { struct ib_send_wr *bad_wr, *n_wr; @@ -1319,11 +1317,11 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, struct ib_send_wr err_wr; struct page *p; struct svc_rdma_op_ctxt *ctxt; - u32 *va; + __be32 *va; int length; int ret; - p = 
svc_rdma_get_page(); + p = alloc_page(GFP_KERNEL | __GFP_NOFAIL); va = page_address(p); /* XDR encode error */ diff --git a/kernel/net/sunrpc/xprtrdma/transport.c b/kernel/net/sunrpc/xprtrdma/transport.c index 54f23b1be..8c545f7d7 100644 --- a/kernel/net/sunrpc/xprtrdma/transport.c +++ b/kernel/net/sunrpc/xprtrdma/transport.c @@ -48,7 +48,6 @@ */ #include -#include #include #include #include @@ -59,11 +58,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -MODULE_LICENSE("Dual BSD/GPL"); - -MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS"); -MODULE_AUTHOR("Network Appliance, Inc."); - /* * tunables */ @@ -181,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) } static void -xprt_rdma_format_addresses(struct rpc_xprt *xprt) +xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { - struct sockaddr *sap = (struct sockaddr *) - &rpcx_to_rdmad(xprt).addr; char buf[128]; switch (sap->sa_family) { @@ -246,6 +238,16 @@ xprt_rdma_connect_worker(struct work_struct *work) xprt_clear_connecting(xprt); } +static void +xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, + rx_xprt); + + pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt); + rdma_disconnect(r_xprt->rx_ia.ri_id); +} + /* * xprt_rdma_destroy * @@ -268,8 +270,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) xprt_clear_connected(xprt); - rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); @@ -298,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args) struct rpc_xprt *xprt; struct rpcrdma_xprt *new_xprt; struct rpcrdma_ep *new_ep; - struct sockaddr_in *sin; + struct sockaddr *sap; int rc; if (args->addrlen > sizeof(xprt->addr)) { @@ -329,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args) * Set up RDMA-specific 
connect data. */ - /* Put server RDMA address in local cdata */ - memcpy(&cdata.addr, args->dstaddr, args->addrlen); + sap = (struct sockaddr *)&cdata.addr; + memcpy(sap, args->dstaddr, args->addrlen); /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ xprt->prot = IPPROTO_TCP; xprt->addrlen = args->addrlen; - memcpy(&xprt->addr, &cdata.addr, xprt->addrlen); + memcpy(&xprt->addr, sap, xprt->addrlen); - sin = (struct sockaddr_in *)&cdata.addr; - if (ntohs(sin->sin_port) != 0) + if (rpc_get_port(sap)) xprt_set_bound(xprt); - dprintk("RPC: %s: %pI4:%u\n", - __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port)); - - /* Set max requests */ cdata.max_requests = xprt->max_reqs; - /* Set some length limits */ cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ @@ -371,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr, - xprt_rdma_memreg_strategy); + rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy); if (rc) goto out1; @@ -405,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args) INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - xprt_rdma_format_addresses(xprt); + xprt_rdma_format_addresses(xprt, sap); xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); if (xprt->max_payload == 0) goto out4; @@ -416,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args) if (!try_module_get(THIS_MODULE)) goto out4; + dprintk("RPC: %s: %s:%s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT]); return xprt; out4: @@ -618,12 +616,6 @@ xprt_rdma_send_request(struct rpc_task *task) if (req->rl_reply == NULL) /* e.g. 
reconnection */ rpcrdma_recv_buffer_get(req); - if (req->rl_reply) { - req->rl_reply->rr_func = rpcrdma_reply_handler; - /* this need only be done once, but... */ - req->rl_reply->rr_xprt = xprt; - } - /* Must suppress retransmit to maintain credits */ if (req->rl_connect_cookie == xprt->connect_cookie) goto drop_connection; @@ -655,31 +647,41 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) if (xprt_connected(xprt)) idle_time = (long)(jiffies - xprt->last_used) / HZ; - seq_printf(seq, - "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu " - "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n", - - 0, /* need a local port? */ - xprt->stat.bind_count, - xprt->stat.connect_count, - xprt->stat.connect_time, - idle_time, - xprt->stat.sends, - xprt->stat.recvs, - xprt->stat.bad_xids, - xprt->stat.req_u, - xprt->stat.bklog_u, - - r_xprt->rx_stats.read_chunk_count, - r_xprt->rx_stats.write_chunk_count, - r_xprt->rx_stats.reply_chunk_count, - r_xprt->rx_stats.total_rdma_request, - r_xprt->rx_stats.total_rdma_reply, - r_xprt->rx_stats.pullup_copy_count, - r_xprt->rx_stats.fixup_copy_count, - r_xprt->rx_stats.hardway_register_count, - r_xprt->rx_stats.failed_marshal_count, - r_xprt->rx_stats.bad_reply_count); + seq_puts(seq, "\txprt:\trdma "); + seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ", + 0, /* need a local port? 
*/ + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", + r_xprt->rx_stats.read_chunk_count, + r_xprt->rx_stats.write_chunk_count, + r_xprt->rx_stats.reply_chunk_count, + r_xprt->rx_stats.total_rdma_request, + r_xprt->rx_stats.total_rdma_reply, + r_xprt->rx_stats.pullup_copy_count, + r_xprt->rx_stats.fixup_copy_count, + r_xprt->rx_stats.hardway_register_count, + r_xprt->rx_stats.failed_marshal_count, + r_xprt->rx_stats.bad_reply_count, + r_xprt->rx_stats.nomsg_call_count); +} + +static int +xprt_rdma_enable_swap(struct rpc_xprt *xprt) +{ + return 0; +} + +static void +xprt_rdma_disable_swap(struct rpc_xprt *xprt) +{ } /* @@ -700,7 +702,16 @@ static struct rpc_xprt_ops xprt_rdma_procs = { .send_request = xprt_rdma_send_request, .close = xprt_rdma_close, .destroy = xprt_rdma_destroy, - .print_stats = xprt_rdma_print_stats + .print_stats = xprt_rdma_print_stats, + .enable_swap = xprt_rdma_enable_swap, + .disable_swap = xprt_rdma_disable_swap, + .inject_disconnect = xprt_rdma_inject_disconnect, +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + .bc_setup = xprt_rdma_bc_setup, + .bc_up = xprt_rdma_bc_up, + .bc_free_rqst = xprt_rdma_bc_free_rqst, + .bc_destroy = xprt_rdma_bc_destroy, +#endif }; static struct xprt_class xprt_rdma = { @@ -711,7 +722,7 @@ static struct xprt_class xprt_rdma = { .setup = xprt_setup_rdma, }; -static void __exit xprt_rdma_cleanup(void) +void xprt_rdma_cleanup(void) { int rc; @@ -726,17 +737,32 @@ static void __exit xprt_rdma_cleanup(void) if (rc) dprintk("RPC: %s: xprt_unregister returned %i\n", __func__, rc); + + rpcrdma_destroy_wq(); + frwr_destroy_recovery_wq(); } -static int __init xprt_rdma_init(void) +int xprt_rdma_init(void) { int rc; - rc = xprt_register_transport(&xprt_rdma); - + rc = frwr_alloc_recovery_wq(); if (rc) return rc; + 
rc = rpcrdma_alloc_wq(); + if (rc) { + frwr_destroy_recovery_wq(); + return rc; + } + + rc = xprt_register_transport(&xprt_rdma); + if (rc) { + rpcrdma_destroy_wq(); + frwr_destroy_recovery_wq(); + return rc; + } + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); dprintk("Defaults:\n"); @@ -753,6 +779,3 @@ static int __init xprt_rdma_init(void) #endif return 0; } - -module_init(xprt_rdma_init); -module_exit(xprt_rdma_cleanup); diff --git a/kernel/net/sunrpc/xprtrdma/verbs.c b/kernel/net/sunrpc/xprtrdma/verbs.c index 4870d272e..eadd16551 100644 --- a/kernel/net/sunrpc/xprtrdma/verbs.c +++ b/kernel/net/sunrpc/xprtrdma/verbs.c @@ -52,6 +52,7 @@ #include #include #include +#include /* try_module_get()/module_put() */ #include "xprt_rdma.h" @@ -67,79 +68,33 @@ * internal functions */ -/* - * handle replies in tasklet context, using a single, global list - * rdma tasklet function -- just turn around and call the func - * for all replies on the list - */ - -static DEFINE_SPINLOCK(rpcrdma_tk_lock_g); -static LIST_HEAD(rpcrdma_tasklets_g); +static struct workqueue_struct *rpcrdma_receive_wq; -static void -rpcrdma_run_tasklet(unsigned long data) +int +rpcrdma_alloc_wq(void) { - struct rpcrdma_rep *rep; - void (*func)(struct rpcrdma_rep *); - unsigned long flags; + struct workqueue_struct *recv_wq; - data = data; - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - while (!list_empty(&rpcrdma_tasklets_g)) { - rep = list_entry(rpcrdma_tasklets_g.next, - struct rpcrdma_rep, rr_list); - list_del(&rep->rr_list); - func = rep->rr_func; - rep->rr_func = NULL; - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); - - if (func) - func(rep); - else - rpcrdma_recv_buffer_put(rep); - - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - } - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); -} + recv_wq = alloc_workqueue("xprtrdma_receive", + WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI, + 0); + if (!recv_wq) + return -ENOMEM; -static DECLARE_TASKLET(rpcrdma_tasklet_g, 
rpcrdma_run_tasklet, 0UL); - -static const char * const async_event[] = { - "CQ error", - "QP fatal error", - "QP request error", - "QP access error", - "communication established", - "send queue drained", - "path migration successful", - "path mig error", - "device fatal error", - "port active", - "port error", - "LID change", - "P_key change", - "SM change", - "SRQ error", - "SRQ limit reached", - "last WQE reached", - "client reregister", - "GID change", -}; - -#define ASYNC_MSG(status) \ - ((status) < ARRAY_SIZE(async_event) ? \ - async_event[(status)] : "unknown async error") + rpcrdma_receive_wq = recv_wq; + return 0; +} -static void -rpcrdma_schedule_tasklet(struct list_head *sched_list) +void +rpcrdma_destroy_wq(void) { - unsigned long flags; + struct workqueue_struct *wq; - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - list_splice_tail(sched_list, &rpcrdma_tasklets_g); - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); - tasklet_schedule(&rpcrdma_tasklet_g); + if (rpcrdma_receive_wq) { + wq = rpcrdma_receive_wq; + rpcrdma_receive_wq = NULL; + destroy_workqueue(wq); + } } static void @@ -148,7 +103,7 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) struct rpcrdma_ep *ep = context; pr_err("RPC: %s: %s on device %s ep %p\n", - __func__, ASYNC_MSG(event->event), + __func__, ib_event_msg(event->event), event->device->name, context); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; @@ -163,7 +118,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) struct rpcrdma_ep *ep = context; pr_err("RPC: %s: %s on device %s ep %p\n", - __func__, ASYNC_MSG(event->event), + __func__, ib_event_msg(event->event), event->device->name, context); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; @@ -172,35 +127,6 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) } } -static const char * const wc_status[] = { - "success", - "local length error", - "local QP operation error", - "local EE context 
operation error", - "local protection error", - "WR flushed", - "memory management operation error", - "bad response error", - "local access error", - "remote invalid request error", - "remote access error", - "remote operation error", - "transport retry counter exceeded", - "RNR retry counter exceeded", - "local RDD violation error", - "remove invalid RD request", - "operation aborted", - "invalid EE context number", - "invalid EE context state", - "fatal error", - "response timeout error", - "general error", -}; - -#define COMPLETION_MSG(status) \ - ((status) < ARRAY_SIZE(wc_status) ? \ - wc_status[(status)] : "unexpected completion error") - static void rpcrdma_sendcq_process_wc(struct ib_wc *wc) { @@ -209,7 +135,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: SEND: %s\n", - __func__, COMPLETION_MSG(wc->status)); + __func__, ib_wc_status_msg(wc->status)); } else { struct rpcrdma_mw *r; @@ -218,63 +144,54 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) } } -static int -rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) +/* The common case is a single send completion is waiting. By + * passing two WC entries to ib_poll_cq, a return code of 1 + * means there is exactly one WC waiting and no more. We don't + * have to invoke ib_poll_cq again to know that the CQ has been + * properly drained. 
+ */ +static void +rpcrdma_sendcq_poll(struct ib_cq *cq) { - struct ib_wc *wcs; - int budget, count, rc; + struct ib_wc *pos, wcs[2]; + int count, rc; - budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { - wcs = ep->rep_send_wcs; + pos = wcs; - rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); - if (rc <= 0) - return rc; + rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos); + if (rc < 0) + break; count = rc; while (count-- > 0) - rpcrdma_sendcq_process_wc(wcs++); - } while (rc == RPCRDMA_POLLSIZE && --budget); - return 0; + rpcrdma_sendcq_process_wc(pos++); + } while (rc == ARRAY_SIZE(wcs)); + return; } -/* - * Handle send, fast_reg_mr, and local_inv completions. - * - * Send events are typically suppressed and thus do not result - * in an upcall. Occasionally one is signaled, however. This - * prevents the provider's completion queue from wrapping and - * losing a completion. +/* Handle provider send completion upcalls. */ static void rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) { - struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; - int rc; - - rc = rpcrdma_sendcq_poll(cq, ep); - if (rc) { - dprintk("RPC: %s: ib_poll_cq failed: %i\n", - __func__, rc); - return; - } + do { + rpcrdma_sendcq_poll(cq); + } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0); +} - rc = ib_req_notify_cq(cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc == 0) - return; - if (rc < 0) { - dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", - __func__, rc); - return; - } +static void +rpcrdma_receive_worker(struct work_struct *work) +{ + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); - rpcrdma_sendcq_poll(cq, ep); + rpcrdma_reply_handler(rep); } static void -rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) +rpcrdma_recvcq_process_wc(struct ib_wc *wc) { struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long)wc->wr_id; @@ -291,126 +208,70 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, 
struct list_head *sched_list) __func__, rep, wc->byte_len); rep->rr_len = wc->byte_len; - ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, + ib_dma_sync_single_for_cpu(rep->rr_device, rdmab_addr(rep->rr_rdmabuf), rep->rr_len, DMA_FROM_DEVICE); prefetch(rdmab_to_msg(rep->rr_rdmabuf)); out_schedule: - list_add_tail(&rep->rr_list, sched_list); + queue_work(rpcrdma_receive_wq, &rep->rr_work); return; + out_fail: if (wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: rep %p: %s\n", - __func__, rep, COMPLETION_MSG(wc->status)); - rep->rr_len = ~0U; + __func__, rep, ib_wc_status_msg(wc->status)); + rep->rr_len = RPCRDMA_BAD_LEN; goto out_schedule; } -static int -rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) +/* The wc array is on stack: automatic memory is always CPU-local. + * + * struct ib_wc is 64 bytes, making the poll array potentially + * large. But this is at the bottom of the call chain. Further + * substantial work is done in another thread. + */ +static void +rpcrdma_recvcq_poll(struct ib_cq *cq) { - struct list_head sched_list; - struct ib_wc *wcs; - int budget, count, rc; + struct ib_wc *pos, wcs[4]; + int count, rc; - INIT_LIST_HEAD(&sched_list); - budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { - wcs = ep->rep_recv_wcs; + pos = wcs; - rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); - if (rc <= 0) - goto out_schedule; + rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos); + if (rc < 0) + break; count = rc; while (count-- > 0) - rpcrdma_recvcq_process_wc(wcs++, &sched_list); - } while (rc == RPCRDMA_POLLSIZE && --budget); - rc = 0; - -out_schedule: - rpcrdma_schedule_tasklet(&sched_list); - return rc; + rpcrdma_recvcq_process_wc(pos++); + } while (rc == ARRAY_SIZE(wcs)); } -/* - * Handle receive completions. - * - * It is reentrant but processes single events in order to maintain - * ordering of receives to keep server credits. - * - * It is the responsibility of the scheduled tasklet to return - * recv buffers to the pool. 
NOTE: this affects synchronization of - * connection shutdown. That is, the structures required for - * the completion of the reply handler must remain intact until - * all memory has been reclaimed. +/* Handle provider receive completion upcalls. */ static void rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) { - struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; - int rc; - - rc = rpcrdma_recvcq_poll(cq, ep); - if (rc) { - dprintk("RPC: %s: ib_poll_cq failed: %i\n", - __func__, rc); - return; - } - - rc = ib_req_notify_cq(cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc == 0) - return; - if (rc < 0) { - dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", - __func__, rc); - return; - } - - rpcrdma_recvcq_poll(cq, ep); + do { + rpcrdma_recvcq_poll(cq); + } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0); } static void rpcrdma_flush_cqs(struct rpcrdma_ep *ep) { struct ib_wc wc; - LIST_HEAD(sched_list); while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0) - rpcrdma_recvcq_process_wc(&wc, &sched_list); - if (!list_empty(&sched_list)) - rpcrdma_schedule_tasklet(&sched_list); + rpcrdma_recvcq_process_wc(&wc); while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0) rpcrdma_sendcq_process_wc(&wc); } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -static const char * const conn[] = { - "address resolved", - "address error", - "route resolved", - "route error", - "connect request", - "connect response", - "connect error", - "unreachable", - "rejected", - "established", - "disconnected", - "device removal", - "multicast join", - "multicast error", - "address change", - "timewait exit", -}; - -#define CONNECTION_MSG(status) \ - ((status) < ARRAY_SIZE(conn) ? 
\ - conn[(status)] : "unrecognized connection error") -#endif - static int rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -476,7 +337,7 @@ connected: default: dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n", __func__, sap, rpc_get_port(sap), ep, - CONNECTION_MSG(event->event)); + rdma_event_msg(event->event)); break; } @@ -487,7 +348,7 @@ connected: pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", sap, rpc_get_port(sap), - ia->ri_id->device->name, + ia->ri_device->name, ia->ri_ops->ro_displayname, xprt->rx_buf.rb_max_requests, ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); @@ -500,6 +361,14 @@ connected: return 0; } +static void rpcrdma_destroy_id(struct rdma_cm_id *id) +{ + if (id) { + module_put(id->device->owner); + rdma_destroy_id(id); + } +} + static struct rdma_cm_id * rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia, struct sockaddr *addr) @@ -509,7 +378,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, init_completion(&ia->ri_done); - id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC); + id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, + IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); dprintk("RPC: %s: rdma_create_id() failed %i\n", @@ -526,6 +396,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } wait_for_completion_interruptible_timeout(&ia->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + + /* FIXME: + * Until xprtrdma supports DEVICE_REMOVAL, the provider must + * be pinned while there are active NFS/RDMA mounts to prevent + * hangs and crashes at umount time. 
+ */ + if (!ia->ri_async_rc && !try_module_get(id->device->owner)) { + dprintk("RPC: %s: Failed to get device module\n", + __func__); + ia->ri_async_rc = -ENODEV; + } rc = ia->ri_async_rc; if (rc) goto out; @@ -535,16 +416,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); - goto out; + goto put; } wait_for_completion_interruptible_timeout(&ia->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) - goto out; + goto put; return id; - +put: + module_put(id->device->owner); out: rdma_destroy_id(id); return ERR_PTR(rc); @@ -579,17 +461,20 @@ rpcrdma_clean_cq(struct ib_cq *cq) int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { - int rc, mem_priv; struct rpcrdma_ia *ia = &xprt->rx_ia; struct ib_device_attr *devattr = &ia->ri_devattr; + int rc; + + ia->ri_dma_mr = NULL; ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); goto out1; } + ia->ri_device = ia->ri_id->device; - ia->ri_pd = ib_alloc_pd(ia->ri_id->device); + ia->ri_pd = ib_alloc_pd(ia->ri_device); if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); dprintk("RPC: %s: ib_alloc_pd() failed %i\n", @@ -597,69 +482,39 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out2; } - rc = ib_query_device(ia->ri_id->device, devattr); + rc = ib_query_device(ia->ri_device, devattr); if (rc) { dprintk("RPC: %s: ib_query_device failed %d\n", __func__, rc); goto out3; } - if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { - ia->ri_have_dma_lkey = 1; - ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; - } - if (memreg == RPCRDMA_FRMR) { - /* Requires both frmr reg and local dma lkey */ - if (((devattr->device_cap_flags & - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) || - (devattr->max_fast_reg_page_list_len == 0)) { + if 
(!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || + (devattr->max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; } } if (memreg == RPCRDMA_MTHCAFMR) { - if (!ia->ri_id->device->alloc_fmr) { + if (!ia->ri_device->alloc_fmr) { dprintk("RPC: %s: MTHCAFMR registration " "not supported by HCA\n", __func__); - memreg = RPCRDMA_ALLPHYSICAL; + rc = -EINVAL; + goto out3; } } - /* - * Optionally obtain an underlying physical identity mapping in - * order to do a memory window-based bind. This base registration - * is protected from remote access - that is enabled only by binding - * for the specific bytes targeted during each RPC operation, and - * revoked after the corresponding completion similar to a storage - * adapter. - */ switch (memreg) { case RPCRDMA_FRMR: ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: ia->ri_ops = &rpcrdma_physical_memreg_ops; - mem_priv = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ; - goto register_setup; + break; case RPCRDMA_MTHCAFMR: ia->ri_ops = &rpcrdma_fmr_memreg_ops; - if (ia->ri_have_dma_lkey) - break; - mem_priv = IB_ACCESS_LOCAL_WRITE; - register_setup: - ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); - if (IS_ERR(ia->ri_bind_mem)) { - printk(KERN_ALERT "%s: ib_get_dma_mr for " - "phys register failed with %lX\n", - __func__, PTR_ERR(ia->ri_bind_mem)); - rc = -ENOMEM; - goto out3; - } break; default: printk(KERN_ERR "RPC: Unsupported memory " @@ -670,9 +525,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) dprintk("RPC: %s: memory registration strategy is '%s'\n", __func__, ia->ri_ops->ro_displayname); - /* Else will do memory reg/dereg for each chunk */ - ia->ri_memreg_strategy = memreg; - rwlock_init(&ia->ri_qplock); return 0; @@ -680,7 +532,7 @@ out3: ib_dealloc_pd(ia->ri_pd); ia->ri_pd = NULL; out2: - rdma_destroy_id(ia->ri_id); + 
rpcrdma_destroy_id(ia->ri_id); ia->ri_id = NULL; out1: return rc; @@ -694,25 +546,17 @@ out1: void rpcrdma_ia_close(struct rpcrdma_ia *ia) { - int rc; - dprintk("RPC: %s: entering\n", __func__); - if (ia->ri_bind_mem != NULL) { - rc = ib_dereg_mr(ia->ri_bind_mem); - dprintk("RPC: %s: ib_dereg_mr returned %i\n", - __func__, rc); - } if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); + rpcrdma_destroy_id(ia->ri_id); ia->ri_id = NULL; } - if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { - rc = ib_dealloc_pd(ia->ri_pd); - dprintk("RPC: %s: ib_dealloc_pd returned %i\n", - __func__, rc); - } + + /* If the pd is still busy, xprtrdma missed freeing a resource */ + if (ia->ri_pd && !IS_ERR(ia->ri_pd)) + ib_dealloc_pd(ia->ri_pd); } /* @@ -724,35 +568,44 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, { struct ib_device_attr *devattr = &ia->ri_devattr; struct ib_cq *sendcq, *recvcq; + struct ib_cq_init_attr cq_attr = {}; + unsigned int max_qp_wr; int rc, err; + if (devattr->max_sge < RPCRDMA_MAX_IOVS) { + dprintk("RPC: %s: insufficient sge's available\n", + __func__); + return -ENOMEM; + } + + if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) { + dprintk("RPC: %s: insufficient wqe's available\n", + __func__); + return -ENOMEM; + } + max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS; + /* check provider's send/recv wr limits */ - if (cdata->max_requests > devattr->max_qp_wr) - cdata->max_requests = devattr->max_qp_wr; + if (cdata->max_requests > max_qp_wr) + cdata->max_requests = max_qp_wr; ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; + ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; rc = ia->ri_ops->ro_open(ia, ep, cdata); if (rc) return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; - ep->rep_attr.cap.max_send_sge = (cdata->padding ? 
4 : 2); + ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; ep->rep_attr.qp_type = IB_QPT_RC; ep->rep_attr.port_num = ~0; - if (cdata->padding) { - ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding, - GFP_KERNEL); - if (IS_ERR(ep->rep_padbuf)) - return PTR_ERR(ep->rep_padbuf); - } else - ep->rep_padbuf = NULL; - dprintk("RPC: %s: requested max: dtos: send %d recv %d; " "iovs: send %d recv %d\n", __func__, @@ -771,9 +624,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); - sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, - rpcrdma_cq_async_error_upcall, ep, - ep->rep_attr.cap.max_send_wr + 1, 0); + cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1; + sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall, + rpcrdma_cq_async_error_upcall, NULL, &cq_attr); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); dprintk("RPC: %s: failed to create send CQ: %i\n", @@ -788,9 +641,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, goto out2; } - recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, - rpcrdma_cq_async_error_upcall, ep, - ep->rep_attr.cap.max_recv_wr + 1, 0); + cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1; + recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall, + rpcrdma_cq_async_error_upcall, NULL, &cq_attr); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); dprintk("RPC: %s: failed to create recv CQ: %i\n", @@ -835,7 +688,8 @@ out2: dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, err); out1: - rpcrdma_free_regbuf(ia, ep->rep_padbuf); + if (ia->ri_dma_mr) + ib_dereg_mr(ia->ri_dma_mr); return rc; } @@ -856,25 +710,32 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 
cancel_delayed_work_sync(&ep->rep_connect_worker); - if (ia->ri_id->qp) { + if (ia->ri_id->qp) rpcrdma_ep_disconnect(ep, ia); + + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rpcrdma_clean_cq(ep->rep_attr.send_cq); + + if (ia->ri_id->qp) { rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } - rpcrdma_free_regbuf(ia, ep->rep_padbuf); - - rpcrdma_clean_cq(ep->rep_attr.recv_cq); rc = ib_destroy_cq(ep->rep_attr.recv_cq); if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); - rpcrdma_clean_cq(ep->rep_attr.send_cq); rc = ib_destroy_cq(ep->rep_attr.send_cq); if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); + + if (ia->ri_dma_mr) { + rc = ib_dereg_mr(ia->ri_dma_mr); + dprintk("RPC: %s: ib_dereg_mr returned %i\n", + __func__, rc); + } } /* @@ -896,8 +757,6 @@ retry: rpcrdma_flush_cqs(ep); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - ia->ri_ops->ro_reset(xprt); - id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { @@ -911,10 +770,10 @@ retry: * More stuff I haven't thought of! * Rrrgh! 
*/ - if (ia->ri_id->device != id->device) { + if (ia->ri_device != id->device) { printk("RPC: %s: can't reconnect on " "different device!\n", __func__); - rdma_destroy_id(id); + rpcrdma_destroy_id(id); rc = -ENETUNREACH; goto out; } @@ -923,7 +782,7 @@ retry: if (rc) { dprintk("RPC: %s: rdma_create_qp failed %i\n", __func__, rc); - rdma_destroy_id(id); + rpcrdma_destroy_id(id); rc = -ENETUNREACH; goto out; } @@ -934,7 +793,7 @@ retry: write_unlock(&ia->ri_qplock); rdma_destroy_qp(old); - rdma_destroy_id(old); + rpcrdma_destroy_id(old); } else { dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); @@ -983,7 +842,21 @@ retry: } rc = ep->rep_connected; } else { + struct rpcrdma_xprt *r_xprt; + unsigned int extras; + dprintk("RPC: %s: connected\n", __func__); + + r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + extras = r_xprt->rx_buf.rb_bc_srv_max_requests; + + if (extras) { + rc = rpcrdma_ep_post_extra_recv(r_xprt, extras); + if (rc) + pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n", + __func__, rc); + rc = 0; + } } out: @@ -1020,20 +893,25 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) } } -static struct rpcrdma_req * +struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_req *req; req = kzalloc(sizeof(*req), GFP_KERNEL); if (req == NULL) return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&req->rl_free); + spin_lock(&buffer->rb_reqslock); + list_add(&req->rl_all, &buffer->rb_allreqs); + spin_unlock(&buffer->rb_reqslock); req->rl_buffer = &r_xprt->rx_buf; return req; } -static struct rpcrdma_rep * +struct rpcrdma_rep * rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; @@ -1053,7 +931,9 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) goto out_free; } - rep->rr_buffer = &r_xprt->rx_buf; + rep->rr_device = ia->ri_device; + rep->rr_rxprt = r_xprt; + 
INIT_WORK(&rep->rr_work, rpcrdma_receive_worker); return rep; out_free: @@ -1067,44 +947,21 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - char *p; - size_t len; int i, rc; - buf->rb_max_requests = cdata->max_requests; + buf->rb_max_requests = r_xprt->rx_data.max_requests; + buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_lock); - /* Need to allocate: - * 1. arrays for send and recv pointers - * 2. arrays of struct rpcrdma_req to fill in pointers - * 3. array of struct rpcrdma_rep for replies - * Send/recv buffers in req/rep need to be registered - */ - len = buf->rb_max_requests * - (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); - - p = kzalloc(len, GFP_KERNEL); - if (p == NULL) { - dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", - __func__, len); - rc = -ENOMEM; - goto out; - } - buf->rb_pool = p; /* for freeing it later */ - - buf->rb_send_bufs = (struct rpcrdma_req **) p; - p = (char *) &buf->rb_send_bufs[buf->rb_max_requests]; - buf->rb_recv_bufs = (struct rpcrdma_rep **) p; - p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; - rc = ia->ri_ops->ro_init(r_xprt); if (rc) goto out; + INIT_LIST_HEAD(&buf->rb_send_bufs); + INIT_LIST_HEAD(&buf->rb_allreqs); + spin_lock_init(&buf->rb_reqslock); for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; - struct rpcrdma_rep *rep; req = rpcrdma_create_req(r_xprt); if (IS_ERR(req)) { @@ -1113,7 +970,13 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(req); goto out; } - buf->rb_send_bufs[i] = req; + req->rl_backchannel = false; + list_add(&req->rl_free, &buf->rb_send_bufs); + } + + INIT_LIST_HEAD(&buf->rb_recv_bufs); + for (i = 0; i < buf->rb_max_requests + 2; i++) { + struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); if (IS_ERR(rep)) { @@ -1122,7 +985,7 @@ 
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(rep); goto out; } - buf->rb_recv_bufs[i] = rep; + list_add(&rep->rr_list, &buf->rb_recv_bufs); } return 0; @@ -1131,22 +994,38 @@ out: return rc; } +static struct rpcrdma_req * +rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_req *req; + + req = list_first_entry(&buf->rb_send_bufs, + struct rpcrdma_req, rl_free); + list_del(&req->rl_free); + return req; +} + +static struct rpcrdma_rep * +rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_rep *rep; + + rep = list_first_entry(&buf->rb_recv_bufs, + struct rpcrdma_rep, rr_list); + list_del(&rep->rr_list); + return rep; +} + static void rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep) { - if (!rep) - return; - rpcrdma_free_regbuf(ia, rep->rr_rdmabuf); kfree(rep); } -static void +void rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) { - if (!req) - return; - rpcrdma_free_regbuf(ia, req->rl_sendbuf); rpcrdma_free_regbuf(ia, req->rl_rdmabuf); kfree(req); @@ -1156,220 +1035,88 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_ia *ia = rdmab_to_ia(buf); - int i; - - /* clean up in reverse order from create - * 1. recv mr memory (mr free, then kfree) - * 2. send mr memory (mr free, then kfree) - * 3. MWs - */ - dprintk("RPC: %s: entering\n", __func__); - - for (i = 0; i < buf->rb_max_requests; i++) { - if (buf->rb_recv_bufs) - rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]); - if (buf->rb_send_bufs) - rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); - } - - ia->ri_ops->ro_destroy(buf); - kfree(buf->rb_pool); -} + while (!list_empty(&buf->rb_recv_bufs)) { + struct rpcrdma_rep *rep; -/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving - * some req segments uninitialized. 
- */ -static void -rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf) -{ - if (*mw) { - list_add_tail(&(*mw)->mw_list, &buf->rb_mws); - *mw = NULL; + rep = rpcrdma_buffer_get_rep_locked(buf); + rpcrdma_destroy_rep(ia, rep); } -} -/* Cycle mw's back in reverse order, and "spin" them. - * This delays and scrambles reuse as much as possible. - */ -static void -rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mr_seg *seg = req->rl_segments; - struct rpcrdma_mr_seg *seg1 = seg; - int i; + spin_lock(&buf->rb_reqslock); + while (!list_empty(&buf->rb_allreqs)) { + struct rpcrdma_req *req; - for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) - rpcrdma_buffer_put_mr(&seg->rl_mw, buf); - rpcrdma_buffer_put_mr(&seg1->rl_mw, buf); -} + req = list_first_entry(&buf->rb_allreqs, + struct rpcrdma_req, rl_all); + list_del(&req->rl_all); -static void -rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) -{ - buf->rb_send_bufs[--buf->rb_send_index] = req; - req->rl_niovs = 0; - if (req->rl_reply) { - buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply; - req->rl_reply->rr_func = NULL; - req->rl_reply = NULL; + spin_unlock(&buf->rb_reqslock); + rpcrdma_destroy_req(ia, req); + spin_lock(&buf->rb_reqslock); } -} - -/* rpcrdma_unmap_one() was already done during deregistration. - * Redo only the ib_post_send(). 
- */ -static void -rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct ib_send_wr invalidate_wr, *bad_wr; - int rc; - - dprintk("RPC: %s: FRMR %p is stale\n", __func__, r); - - /* When this FRMR is re-inserted into rb_mws, it is no longer stale */ - r->r.frmr.fr_state = FRMR_IS_INVALID; - - memset(&invalidate_wr, 0, sizeof(invalidate_wr)); - invalidate_wr.wr_id = (unsigned long)(void *)r; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - dprintk("RPC: %s: frmr %p invalidating rkey %08x\n", - __func__, r, r->r.frmr.fr_mr->rkey); + spin_unlock(&buf->rb_reqslock); - read_lock(&ia->ri_qplock); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) { - /* Force rpcrdma_buffer_get() to retry */ - r->r.frmr.fr_state = FRMR_IS_STALE; - dprintk("RPC: %s: ib_post_send failed, %i\n", - __func__, rc); - } + ia->ri_ops->ro_destroy(buf); } -static void -rpcrdma_retry_flushed_linv(struct list_head *stale, - struct rpcrdma_buffer *buf) +struct rpcrdma_mw * +rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ia *ia = rdmab_to_ia(buf); - struct list_head *pos; - struct rpcrdma_mw *r; - unsigned long flags; - - list_for_each(pos, stale) { - r = list_entry(pos, struct rpcrdma_mw, mw_list); - rpcrdma_retry_local_inv(r, ia); - } - - spin_lock_irqsave(&buf->rb_lock, flags); - list_splice_tail(stale, &buf->rb_mws); - spin_unlock_irqrestore(&buf->rb_lock, flags); -} + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_mw *mw = NULL; -static struct rpcrdma_req * -rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf, - struct list_head *stale) -{ - struct rpcrdma_mw *r; - int i; - - i = RPCRDMA_MAX_SEGS - 1; - while (!list_empty(&buf->rb_mws)) { - r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, 
mw_list); - list_del(&r->mw_list); - if (r->r.frmr.fr_state == FRMR_IS_STALE) { - list_add(&r->mw_list, stale); - continue; - } - req->rl_segments[i].rl_mw = r; - if (unlikely(i-- == 0)) - return req; /* Success */ + spin_lock(&buf->rb_mwlock); + if (!list_empty(&buf->rb_mws)) { + mw = list_first_entry(&buf->rb_mws, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); } + spin_unlock(&buf->rb_mwlock); - /* Not enough entries on rb_mws for this req */ - rpcrdma_buffer_put_sendbuf(req, buf); - rpcrdma_buffer_put_mrs(req, buf); - return NULL; + if (!mw) + pr_err("RPC: %s: no MWs available\n", __func__); + return mw; } -static struct rpcrdma_req * -rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +void +rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) { - struct rpcrdma_mw *r; - int i; - - i = RPCRDMA_MAX_SEGS - 1; - while (!list_empty(&buf->rb_mws)) { - r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - req->rl_segments[i].rl_mw = r; - if (unlikely(i-- == 0)) - return req; /* Success */ - } + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - /* Not enough entries on rb_mws for this req */ - rpcrdma_buffer_put_sendbuf(req, buf); - rpcrdma_buffer_put_mrs(req, buf); - return NULL; + spin_lock(&buf->rb_mwlock); + list_add_tail(&mw->mw_list, &buf->rb_mws); + spin_unlock(&buf->rb_mwlock); } /* * Get a set of request/reply buffers. * - * Reply buffer (if needed) is attached to send buffer upon return. - * Rule: - * rb_send_index and rb_recv_index MUST always be pointing to the - * *next* available buffer (non-NULL). They are incremented after - * removing buffers, and decremented *before* returning them. + * Reply buffer (if available) is attached to send buffer upon return. 
*/ struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { - struct rpcrdma_ia *ia = rdmab_to_ia(buffers); - struct list_head stale; struct rpcrdma_req *req; - unsigned long flags; - - spin_lock_irqsave(&buffers->rb_lock, flags); - if (buffers->rb_send_index == buffers->rb_max_requests) { - spin_unlock_irqrestore(&buffers->rb_lock, flags); - dprintk("RPC: %s: out of request buffers\n", __func__); - return ((struct rpcrdma_req *)NULL); - } - req = buffers->rb_send_bufs[buffers->rb_send_index]; - if (buffers->rb_send_index < buffers->rb_recv_index) { - dprintk("RPC: %s: %d extra receives outstanding (ok)\n", - __func__, - buffers->rb_recv_index - buffers->rb_send_index); - req->rl_reply = NULL; - } else { - req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; - buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; - } - buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; + spin_lock(&buffers->rb_lock); + if (list_empty(&buffers->rb_send_bufs)) + goto out_reqbuf; + req = rpcrdma_buffer_get_req_locked(buffers); + if (list_empty(&buffers->rb_recv_bufs)) + goto out_repbuf; + req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + spin_unlock(&buffers->rb_lock); + return req; - INIT_LIST_HEAD(&stale); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - req = rpcrdma_buffer_get_frmrs(req, buffers, &stale); - break; - case RPCRDMA_MTHCAFMR: - req = rpcrdma_buffer_get_fmrs(req, buffers); - break; - default: - break; - } - spin_unlock_irqrestore(&buffers->rb_lock, flags); - if (!list_empty(&stale)) - rpcrdma_retry_flushed_linv(&stale, buffers); +out_reqbuf: + spin_unlock(&buffers->rb_lock); + pr_warn("RPC: %s: out of request buffers\n", __func__); + return NULL; +out_repbuf: + spin_unlock(&buffers->rb_lock); + pr_warn("RPC: %s: out of reply buffers\n", __func__); + req->rl_reply = NULL; return req; } @@ -1381,39 +1128,31 @@ void rpcrdma_buffer_put(struct rpcrdma_req *req) { struct rpcrdma_buffer *buffers = req->rl_buffer; - struct 
rpcrdma_ia *ia = rdmab_to_ia(buffers); - unsigned long flags; + struct rpcrdma_rep *rep = req->rl_reply; - spin_lock_irqsave(&buffers->rb_lock, flags); - rpcrdma_buffer_put_sendbuf(req, buffers); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - case RPCRDMA_MTHCAFMR: - rpcrdma_buffer_put_mrs(req, buffers); - break; - default: - break; - } - spin_unlock_irqrestore(&buffers->rb_lock, flags); + req->rl_niovs = 0; + req->rl_reply = NULL; + + spin_lock(&buffers->rb_lock); + list_add_tail(&req->rl_free, &buffers->rb_send_bufs); + if (rep) + list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + spin_unlock(&buffers->rb_lock); } /* * Recover reply buffers from pool. - * This happens when recovering from error conditions. - * Post-increment counter/array index. + * This happens when recovering from disconnect. */ void rpcrdma_recv_buffer_get(struct rpcrdma_req *req) { struct rpcrdma_buffer *buffers = req->rl_buffer; - unsigned long flags; - spin_lock_irqsave(&buffers->rb_lock, flags); - if (buffers->rb_recv_index < buffers->rb_max_requests) { - req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; - buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; - } - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); + if (!list_empty(&buffers->rb_recv_bufs)) + req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + spin_unlock(&buffers->rb_lock); } /* @@ -1423,13 +1162,11 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) { - struct rpcrdma_buffer *buffers = rep->rr_buffer; - unsigned long flags; + struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; - rep->rr_func = NULL; - spin_lock_irqsave(&buffers->rb_lock, flags); - buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep; - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); + list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + spin_unlock(&buffers->rb_lock); } /* @@ -1444,75 
+1181,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) (unsigned long long)seg->mr_dma, seg->mr_dmalen); } -static int -rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, - struct ib_mr **mrp, struct ib_sge *iov) -{ - struct ib_phys_buf ipb; - struct ib_mr *mr; - int rc; - - /* - * All memory passed here was kmalloc'ed, therefore phys-contiguous. - */ - iov->addr = ib_dma_map_single(ia->ri_id->device, - va, len, DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(ia->ri_id->device, iov->addr)) - return -ENOMEM; - - iov->length = len; - - if (ia->ri_have_dma_lkey) { - *mrp = NULL; - iov->lkey = ia->ri_dma_lkey; - return 0; - } else if (ia->ri_bind_mem != NULL) { - *mrp = NULL; - iov->lkey = ia->ri_bind_mem->lkey; - return 0; - } - - ipb.addr = iov->addr; - ipb.size = iov->length; - mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1, - IB_ACCESS_LOCAL_WRITE, &iov->addr); - - dprintk("RPC: %s: phys convert: 0x%llx " - "registered 0x%llx length %d\n", - __func__, (unsigned long long)ipb.addr, - (unsigned long long)iov->addr, len); - - if (IS_ERR(mr)) { - *mrp = NULL; - rc = PTR_ERR(mr); - dprintk("RPC: %s: failed with %i\n", __func__, rc); - } else { - *mrp = mr; - iov->lkey = mr->lkey; - rc = 0; - } - - return rc; -} - -static int -rpcrdma_deregister_internal(struct rpcrdma_ia *ia, - struct ib_mr *mr, struct ib_sge *iov) -{ - int rc; - - ib_dma_unmap_single(ia->ri_id->device, - iov->addr, iov->length, DMA_BIDIRECTIONAL); - - if (NULL == mr) - return 0; - - rc = ib_dereg_mr(mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); - return rc; -} - /** * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers * @ia: controlling rpcrdma_ia @@ -1532,26 +1200,29 @@ struct rpcrdma_regbuf * rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) { struct rpcrdma_regbuf *rb; - int rc; + struct ib_sge *iov; - rc = -ENOMEM; rb = kmalloc(sizeof(*rb) + size, flags); if (rb == NULL) goto out; - rb->rg_size = size; - 
rb->rg_owner = NULL; - rc = rpcrdma_register_internal(ia, rb->rg_base, size, - &rb->rg_mr, &rb->rg_iov); - if (rc) + iov = &rb->rg_iov; + iov->addr = ib_dma_map_single(ia->ri_device, + (void *)rb->rg_base, size, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(ia->ri_device, iov->addr)) goto out_free; + iov->length = size; + iov->lkey = ia->ri_pd->local_dma_lkey; + rb->rg_size = size; + rb->rg_owner = NULL; return rb; out_free: kfree(rb); out: - return ERR_PTR(rc); + return ERR_PTR(-ENOMEM); } /** @@ -1562,10 +1233,15 @@ out: void rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) { - if (rb) { - rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov); - kfree(rb); - } + struct ib_sge *iov; + + if (!rb) + return; + + iov = &rb->rg_iov; + ib_dma_unmap_single(ia->ri_device, + iov->addr, iov->length, DMA_BIDIRECTIONAL); + kfree(rb); } /* @@ -1578,9 +1254,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { + struct ib_device *device = ia->ri_device; struct ib_send_wr send_wr, *send_wr_fail; struct rpcrdma_rep *rep = req->rl_reply; - int rc; + struct ib_sge *iov = req->rl_send_iov; + int i, rc; if (rep) { rc = rpcrdma_ep_post_recv(ia, ep, rep); @@ -1591,19 +1269,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, send_wr.next = NULL; send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; - send_wr.sg_list = req->rl_send_iov; + send_wr.sg_list = iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ - ib_dma_sync_single_for_device(ia->ri_id->device, - req->rl_send_iov[3].addr, req->rl_send_iov[3].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_id->device, - req->rl_send_iov[1].addr, req->rl_send_iov[1].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_id->device, - req->rl_send_iov[0].addr, req->rl_send_iov[0].length, - DMA_TO_DEVICE); + + for (i = 0; i < send_wr.num_sge; i++) + ib_dma_sync_single_for_device(device, 
iov[i].addr, + iov[i].length, DMA_TO_DEVICE); + dprintk("RPC: %s: posting %d s/g entries\n", + __func__, send_wr.num_sge); if (DECR_CQCOUNT(ep) > 0) send_wr.send_flags = 0; @@ -1636,7 +1310,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; recv_wr.num_sge = 1; - ib_dma_sync_single_for_cpu(ia->ri_id->device, + ib_dma_sync_single_for_cpu(ia->ri_device, rdmab_addr(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf), DMA_BIDIRECTIONAL); @@ -1649,6 +1323,47 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, return rc; } +/** + * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests + * @r_xprt: transport associated with these backchannel resources + * @min_reqs: minimum number of incoming requests expected + * + * Returns zero if all requested buffers were posted, or a negative errno. + */ +int +rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) +{ + struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_rep *rep; + unsigned long flags; + int rc; + + while (count--) { + spin_lock_irqsave(&buffers->rb_lock, flags); + if (list_empty(&buffers->rb_recv_bufs)) + goto out_reqbuf; + rep = rpcrdma_buffer_get_rep_locked(buffers); + spin_unlock_irqrestore(&buffers->rb_lock, flags); + + rc = rpcrdma_ep_post_recv(ia, ep, rep); + if (rc) + goto out_rc; + } + + return 0; + +out_reqbuf: + spin_unlock_irqrestore(&buffers->rb_lock, flags); + pr_warn("%s: no extra receive buffers\n", __func__); + return -ENOMEM; + +out_rc: + rpcrdma_recv_buffer_put(rep); + return rc; +} + /* How many chunk list items fit within our inline buffers? 
*/ unsigned int diff --git a/kernel/net/sunrpc/xprtrdma/xprt_rdma.h b/kernel/net/sunrpc/xprtrdma/xprt_rdma.h index 78e0b8bea..ac7f8d4f6 100644 --- a/kernel/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/kernel/net/sunrpc/xprtrdma/xprt_rdma.h @@ -51,7 +51,6 @@ #include /* rpc_xprt */ #include /* RPC/RDMA protocol */ #include /* xprt parameters */ -#include /* RPCSVC_MAXPAYLOAD */ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ @@ -62,14 +61,12 @@ struct rpcrdma_ia { const struct rpcrdma_memreg_ops *ri_ops; rwlock_t ri_qplock; + struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; - struct ib_mr *ri_bind_mem; - u32 ri_dma_lkey; - int ri_have_dma_lkey; + struct ib_mr *ri_dma_mr; struct completion ri_done; int ri_async_rc; - enum rpcrdma_memreg ri_memreg_strategy; unsigned int ri_max_frmr_depth; struct ib_device_attr ri_devattr; struct ib_qp_attr ri_qp_attr; @@ -80,21 +77,15 @@ struct rpcrdma_ia { * RDMA Endpoint -- one per transport instance */ -#define RPCRDMA_WC_BUDGET (128) -#define RPCRDMA_POLLSIZE (16) - struct rpcrdma_ep { atomic_t rep_cqcount; int rep_cqinit; int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; - struct rpcrdma_regbuf *rep_padbuf; struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; - struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE]; - struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE]; }; /* @@ -110,6 +101,16 @@ struct rpcrdma_ep { */ #define RPCRDMA_IGNORE_COMPLETION (0ULL) +/* Pre-allocate extra Work Requests for handling backward receives + * and sends. This is a fixed value because the Work Queues are + * allocated when the forward channel is set up. 
+ */ +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +#define RPCRDMA_BACKWARD_WRS (8) +#else +#define RPCRDMA_BACKWARD_WRS (0) +#endif + /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV * * The below structure appears at the front of a large region of kmalloc'd @@ -119,7 +120,6 @@ struct rpcrdma_ep { struct rpcrdma_regbuf { size_t rg_size; struct rpcrdma_req *rg_owner; - struct ib_mr *rg_mr; struct ib_sge rg_iov; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -165,21 +165,22 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) * struct rpcrdma_buffer. N is the max number of outstanding requests. */ -/* temporary static scatter/gather max */ -#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */ +#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ struct rpcrdma_buffer; struct rpcrdma_rep { unsigned int rr_len; - struct rpcrdma_buffer *rr_buffer; - struct rpc_xprt *rr_xprt; - void (*rr_func)(struct rpcrdma_rep *); + struct ib_device *rr_device; + struct rpcrdma_xprt *rr_rxprt; + struct work_struct rr_work; struct list_head rr_list; struct rpcrdma_regbuf *rr_rdmabuf; }; +#define RPCRDMA_BAD_LEN (~0U) + /* * struct rpcrdma_mw - external memory region metadata * @@ -200,14 +201,22 @@ enum rpcrdma_frmr_state { }; struct rpcrdma_frmr { - struct ib_fast_reg_page_list *fr_pgl; + struct scatterlist *sg; + int sg_nents; struct ib_mr *fr_mr; enum rpcrdma_frmr_state fr_state; + struct work_struct fr_work; + struct rpcrdma_xprt *fr_xprt; +}; + +struct rpcrdma_fmr { + struct ib_fmr *fmr; + u64 *physaddrs; }; struct rpcrdma_mw { union { - struct ib_fmr *fmr; + struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; } r; void (*mw_sendcompletion)(struct ib_wc *); @@ -252,16 +261,22 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ char *mr_offset; /* kva if no page, else offset */ }; +#define RPCRDMA_MAX_IOVS (2) + struct rpcrdma_req { - unsigned int rl_niovs; /* 0, 2 or 4 */ - 
unsigned int rl_nchunks; /* non-zero if chunks */ - unsigned int rl_connect_cookie; /* retry detection */ - struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ + struct list_head rl_free; + unsigned int rl_niovs; + unsigned int rl_nchunks; + unsigned int rl_connect_cookie; + struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ - struct ib_sge rl_send_iov[4]; /* for active requests */ - struct rpcrdma_regbuf *rl_rdmabuf; - struct rpcrdma_regbuf *rl_sendbuf; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; + struct rpcrdma_regbuf *rl_rdmabuf; + struct rpcrdma_regbuf *rl_sendbuf; + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + + struct list_head rl_all; + bool rl_backchannel; }; static inline struct rpcrdma_req * @@ -281,15 +296,19 @@ rpcr_to_rdmar(struct rpc_rqst *rqst) * One of these is associated with a transport instance */ struct rpcrdma_buffer { - spinlock_t rb_lock; /* protects indexes */ - u32 rb_max_requests;/* client max requests */ - struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ - struct list_head rb_all; - int rb_send_index; - struct rpcrdma_req **rb_send_bufs; - int rb_recv_index; - struct rpcrdma_rep **rb_recv_bufs; - char *rb_pool; + spinlock_t rb_mwlock; /* protect rb_mws list */ + struct list_head rb_mws; + struct list_head rb_all; + char *rb_pool; + + spinlock_t rb_lock; /* protect buf lists */ + struct list_head rb_send_bufs; + struct list_head rb_recv_bufs; + u32 rb_max_requests; + + u32 rb_bc_srv_max_requests; + spinlock_t rb_reqslock; /* protect rb_allreqs */ + struct list_head rb_allreqs; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -334,6 +353,8 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; + unsigned long nomsg_call_count; + unsigned long bcall_count; }; /* @@ -350,7 +371,6 @@ struct 
rpcrdma_memreg_ops { struct rpcrdma_create_data_internal *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); int (*ro_init)(struct rpcrdma_xprt *); - void (*ro_reset)(struct rpcrdma_xprt *); void (*ro_destroy)(struct rpcrdma_buffer *); const char *ro_displayname; }; @@ -410,9 +430,14 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, /* * Buffer calls - xprtrdma/verbs.c */ +struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); +struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); +void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); +struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); +void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); @@ -424,6 +449,13 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); +int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); + +int frwr_alloc_recovery_wq(void); +void frwr_destroy_recovery_wq(void); + +int rpcrdma_alloc_wq(void); +void rpcrdma_destroy_wq(void); /* * Wrappers for chunk registration, shared by read/write chunk code. 
@@ -480,6 +512,23 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); */ int rpcrdma_marshal_req(struct rpc_rqst *); +/* RPC/RDMA module init - xprtrdma/transport.c + */ +int xprt_rdma_init(void); +void xprt_rdma_cleanup(void); + +/* Backchannel calls - xprtrdma/backchannel.c + */ +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); +int xprt_rdma_bc_up(struct svc_serv *, struct net *); +int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); +void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); +int rpcrdma_bc_marshal_reply(struct rpc_rqst *); +void xprt_rdma_bc_free_rqst(struct rpc_rqst *); +void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + /* Temporary NFS request map cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; /* WR context cache. Created in svc_rdma.c */ @@ -487,10 +536,4 @@ extern struct kmem_cache *svc_rdma_ctxt_cachep; /* Workqueue created in svc_rdma.c */ extern struct workqueue_struct *svc_rdma_wq; -#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT) -#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD -#else -#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT) -#endif - #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/kernel/net/sunrpc/xprtsock.c b/kernel/net/sunrpc/xprtsock.c index 5e3ad598d..027c9ef8a 100644 --- a/kernel/net/sunrpc/xprtsock.c +++ b/kernel/net/sunrpc/xprtsock.c @@ -360,8 +360,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i int flags = XS_SENDMSG_FLAGS; remainder -= len; - if (remainder != 0 || more) + if (more) flags |= MSG_MORE; + if (remainder != 0) + flags |= MSG_SENDPAGE_NOTLAST | MSG_MORE; err = do_sendpage(sock, *ppage, base, len, flags); if (remainder == 0 || err != len) break; @@ -396,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (unlikely(!sock)) return 
-ENOTSOCK; - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); if (base != 0) { addr = NULL; addrlen = 0; @@ -440,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task) struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); transport->inet->sk_write_pending--; - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); } /** @@ -465,20 +465,11 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { - /* - * Notify TCP that we're limited by the application - * window size - */ - set_bit(SOCK_NOSPACE, &transport->sock->flags); - sk->sk_write_pending++; - /* ...and wait for more buffer space */ - xprt_wait_for_buffer_space(task, xs_nospace_callback); - } - } else { - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + /* wait for more buffer space */ + sk->sk_write_pending++; + xprt_wait_for_buffer_space(task, xs_nospace_callback); + } else ret = -ENOTCONN; - } spin_unlock_bh(&xprt->transport_lock); @@ -527,6 +518,10 @@ static int xs_local_send_request(struct rpc_task *task) true, &sent); dprintk("RPC: %s(%u) = %d\n", __func__, xdr->len - req->rq_bytes_sent, status); + + if (status == -EAGAIN && sock_writeable(transport->inet)) + status = -ENOBUFS; + if (likely(sent > 0) || status == 0) { req->rq_bytes_sent += sent; req->rq_xmit_bytes_sent += sent; @@ -539,6 +534,7 @@ static int xs_local_send_request(struct rpc_task *task) switch (status) { case -ENOBUFS: + break; case -EAGAIN: status = xs_nospace(task); break; @@ -589,6 +585,9 @@ static int xs_udp_send_request(struct rpc_task *task) if (status == -EPERM) goto process_status; + if (status == -EAGAIN && sock_writeable(transport->inet)) + status = -ENOBUFS; + if (sent > 0 || status == 0) { req->rq_xmit_bytes_sent += sent; if (sent >= req->rq_slen) @@ -606,9 +605,6 @@ process_status: case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: 
sendmsg returned unrecognized error %d\n", - -status); case -ENETUNREACH: case -ENOBUFS: case -EPIPE: @@ -616,30 +612,15 @@ process_status: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; } -/** - * xs_tcp_shutdown - gracefully shut down a TCP socket - * @xprt: transport - * - * Initiates a graceful shutdown of the TCP socket by calling the - * equivalent of shutdown(SHUT_RDWR); - */ -static void xs_tcp_shutdown(struct rpc_xprt *xprt) -{ - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct socket *sock = transport->sock; - - if (sock != NULL) { - kernel_sock_shutdown(sock, SHUT_RDWR); - trace_rpc_socket_shutdown(xprt, sock); - } -} - /** * xs_tcp_send_request - write an RPC request to a TCP socket * @task: address of RPC task that manages the state of an RPC request @@ -687,9 +668,6 @@ static int xs_tcp_send_request(struct rpc_task *task) dprintk("RPC: xs_tcp_send_request(%u) = %d\n", xdr->len - req->rq_bytes_sent, status); - if (unlikely(sent == 0 && status < 0)) - break; - /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ req->rq_bytes_sent += sent; @@ -699,30 +677,34 @@ static int xs_tcp_send_request(struct rpc_task *task) return 0; } - if (sent != 0) - continue; - status = -EAGAIN; - break; + if (status < 0) + break; + if (sent == 0) { + status = -EAGAIN; + break; + } } + if (status == -EAGAIN && sk_stream_is_writeable(transport->inet)) + status = -ENOBUFS; switch (status) { case -ENOTSOCK: status = -ENOTCONN; /* Should we call xs_close() here? 
*/ break; - case -ENOBUFS: case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ECONNRESET: case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: + case -ENOBUFS: case -EPIPE: - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -827,6 +809,12 @@ static void xs_reset_transport(struct sock_xprt *transport) if (sk == NULL) return; + if (atomic_read(&transport->xprt.swapper)) + sk_clear_memalloc(sk); + + kernel_sock_shutdown(sock, SHUT_RDWR); + + mutex_lock(&transport->recv_mutex); write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; transport->sock = NULL; @@ -837,6 +825,7 @@ static void xs_reset_transport(struct sock_xprt *transport) xprt_clear_connected(xprt); write_unlock_bh(&sk->sk_callback_lock); xs_sock_reset_connection_flags(xprt); + mutex_unlock(&transport->recv_mutex); trace_rpc_socket_close(xprt, sock); sock_release(sock); @@ -864,6 +853,13 @@ static void xs_close(struct rpc_xprt *xprt) xprt_disconnect_done(xprt); } +static void xs_inject_disconnect(struct rpc_xprt *xprt) +{ + dprintk("RPC: injecting transport disconnect on xprt=%p\n", + xprt); + xprt_disconnect_done(xprt); +} + static void xs_xprt_free(struct rpc_xprt *xprt) { xs_free_peer_addresses(xprt); @@ -877,9 +873,13 @@ static void xs_xprt_free(struct rpc_xprt *xprt) */ static void xs_destroy(struct rpc_xprt *xprt) { + struct sock_xprt *transport = container_of(xprt, + struct sock_xprt, xprt); dprintk("RPC: xs_destroy xprt %p\n", xprt); + cancel_delayed_work_sync(&transport->connect_worker); xs_close(xprt); + cancel_work_sync(&transport->recv_worker); xs_xprt_free(xprt); module_put(THIS_MODULE); } @@ -900,45 +900,36 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) } /** - * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets - * @sk: socket with data to 
read - * @len: how much data to read + * xs_local_data_read_skb + * @xprt: transport + * @sk: socket + * @skb: skbuff * * Currently this assumes we can read the whole reply in a single gulp. */ -static void xs_local_data_ready(struct sock *sk) +static void xs_local_data_read_skb(struct rpc_xprt *xprt, + struct sock *sk, + struct sk_buff *skb) { struct rpc_task *task; - struct rpc_xprt *xprt; struct rpc_rqst *rovr; - struct sk_buff *skb; - int err, repsize, copied; + int repsize, copied; u32 _xid; __be32 *xp; - read_lock_bh(&sk->sk_callback_lock); - dprintk("RPC: %s...\n", __func__); - xprt = xprt_from_sock(sk); - if (xprt == NULL) - goto out; - - skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb == NULL) - goto out; - repsize = skb->len - sizeof(rpc_fraghdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d\n", repsize); - goto dropit; + return; } /* Copy the XID from the skb... */ xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid); if (xp == NULL) - goto dropit; + return; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->transport_lock); + spin_lock_bh(&xprt->transport_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; @@ -956,51 +947,68 @@ static void xs_local_data_ready(struct sock *sk) xprt_complete_rqst(task, copied); out_unlock: - spin_unlock(&xprt->transport_lock); - dropit: - skb_free_datagram(sk, skb); - out: - read_unlock_bh(&sk->sk_callback_lock); + spin_unlock_bh(&xprt->transport_lock); +} + +static void xs_local_data_receive(struct sock_xprt *transport) +{ + struct sk_buff *skb; + struct sock *sk; + int err; + + mutex_lock(&transport->recv_mutex); + sk = transport->inet; + if (sk == NULL) + goto out; + for (;;) { + skb = skb_recv_datagram(sk, 0, 1, &err); + if (skb == NULL) + break; + xs_local_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram(sk, skb); + } +out: + mutex_unlock(&transport->recv_mutex); +} + +static void 
xs_local_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_local_data_receive(transport); } /** - * xs_udp_data_ready - "data ready" callback for UDP sockets - * @sk: socket with data to read - * @len: how much data to read + * xs_udp_data_read_skb - receive callback for UDP sockets + * @xprt: transport + * @sk: socket + * @skb: skbuff * */ -static void xs_udp_data_ready(struct sock *sk) +static void xs_udp_data_read_skb(struct rpc_xprt *xprt, + struct sock *sk, + struct sk_buff *skb) { struct rpc_task *task; - struct rpc_xprt *xprt; struct rpc_rqst *rovr; - struct sk_buff *skb; - int err, repsize, copied; + int repsize, copied; u32 _xid; __be32 *xp; - read_lock_bh(&sk->sk_callback_lock); - dprintk("RPC: xs_udp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) - goto out; - - if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) - goto out; - repsize = skb->len - sizeof(struct udphdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d!\n", repsize); - goto dropit; + return; } /* Copy the XID from the skb... 
*/ xp = skb_header_pointer(skb, sizeof(struct udphdr), sizeof(_xid), &_xid); if (xp == NULL) - goto dropit; + return; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->transport_lock); + spin_lock_bh(&xprt->transport_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; @@ -1021,10 +1029,54 @@ static void xs_udp_data_ready(struct sock *sk) xprt_complete_rqst(task, copied); out_unlock: - spin_unlock(&xprt->transport_lock); - dropit: - skb_free_datagram(sk, skb); - out: + spin_unlock_bh(&xprt->transport_lock); +} + +static void xs_udp_data_receive(struct sock_xprt *transport) +{ + struct sk_buff *skb; + struct sock *sk; + int err; + + mutex_lock(&transport->recv_mutex); + sk = transport->inet; + if (sk == NULL) + goto out; + for (;;) { + skb = skb_recv_datagram(sk, 0, 1, &err); + if (skb == NULL) + break; + xs_udp_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram(sk, skb); + } +out: + mutex_unlock(&transport->recv_mutex); +} + +static void xs_udp_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_udp_data_receive(transport); +} + +/** + * xs_data_ready - "data ready" callback for UDP sockets + * @sk: socket with data to read + * + */ +static void xs_data_ready(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock_bh(&sk->sk_callback_lock); + dprintk("RPC: xs_data_ready...\n"); + xprt = xprt_from_sock(sk); + if (xprt != NULL) { + struct sock_xprt *transport = container_of(xprt, + struct sock_xprt, xprt); + queue_work(rpciod_workqueue, &transport->recv_worker); + } read_unlock_bh(&sk->sk_callback_lock); } @@ -1239,12 +1291,12 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->transport_lock); + spin_lock_bh(&xprt->transport_lock); req = xprt_lookup_rqst(xprt, 
transport->tcp_xid); if (!req) { dprintk("RPC: XID %08x request not found!\n", ntohl(transport->tcp_xid)); - spin_unlock(&xprt->transport_lock); + spin_unlock_bh(&xprt->transport_lock); return -1; } @@ -1253,7 +1305,7 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_rqst(req->rq_task, transport->tcp_copied); - spin_unlock(&xprt->transport_lock); + spin_unlock_bh(&xprt->transport_lock); return 0; } @@ -1273,10 +1325,10 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, struct rpc_rqst *req; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->transport_lock); + spin_lock_bh(&xprt->transport_lock); req = xprt_lookup_bc_request(xprt, transport->tcp_xid); if (req == NULL) { - spin_unlock(&xprt->transport_lock); + spin_unlock_bh(&xprt->transport_lock); printk(KERN_WARNING "Callback slot table overflowed\n"); xprt_force_disconnect(xprt); return -1; @@ -1287,7 +1339,7 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_bc_request(req, transport->tcp_copied); - spin_unlock(&xprt->transport_lock); + spin_unlock_bh(&xprt->transport_lock); return 0; } @@ -1302,6 +1354,17 @@ static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, xs_tcp_read_reply(xprt, desc) : xs_tcp_read_callback(xprt, desc); } + +static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) +{ + int ret; + + ret = svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); + if (ret < 0) + return ret; + return 0; +} #else static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) @@ -1387,42 +1450,69 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns return len - desc.count; } +static void xs_tcp_data_receive(struct sock_xprt *transport) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct sock *sk; + read_descriptor_t rd_desc = { + .count = 
2*1024*1024, + .arg.data = xprt, + }; + unsigned long total = 0; + int read = 0; + + mutex_lock(&transport->recv_mutex); + sk = transport->inet; + if (sk == NULL) + goto out; + + /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ + for (;;) { + lock_sock(sk); + read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); + release_sock(sk); + if (read <= 0) + break; + total += read; + rd_desc.count = 65536; + } +out: + mutex_unlock(&transport->recv_mutex); + trace_xs_tcp_data_ready(xprt, read, total); +} + +static void xs_tcp_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_tcp_data_receive(transport); +} + /** * xs_tcp_data_ready - "data ready" callback for TCP sockets * @sk: socket with data to read - * @bytes: how much data to read * */ static void xs_tcp_data_ready(struct sock *sk) { + struct sock_xprt *transport; struct rpc_xprt *xprt; - read_descriptor_t rd_desc; - int read; - unsigned long total = 0; dprintk("RPC: xs_tcp_data_ready...\n"); read_lock_bh(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk))) { - read = 0; + if (!(xprt = xprt_from_sock(sk))) goto out; - } + transport = container_of(xprt, struct sock_xprt, xprt); + /* Any data means we had a useful conversation, so * the we don't need to delay the next reconnect */ if (xprt->reestablish_timeout) xprt->reestablish_timeout = 0; + queue_work(rpciod_workqueue, &transport->recv_worker); - /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - rd_desc.arg.data = xprt; - do { - rd_desc.count = 65536; - read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - if (read > 0) - total += read; - } while (read > 0); out: - trace_xs_tcp_data_ready(xprt, read, total); read_unlock_bh(&sk->sk_callback_lock); } @@ -1508,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk) static void xs_write_space(struct sock *sk) { - struct socket *sock; + struct socket_wq *wq; struct rpc_xprt *xprt; - if 
(unlikely(!(sock = sk->sk_socket))) + if (!sk->sk_socket) return; - clear_bit(SOCK_NOSPACE, &sock->flags); + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (unlikely(!(xprt = xprt_from_sock(sk)))) return; - if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) - return; + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) + goto out; xprt_write_space(xprt); +out: + rcu_read_unlock(); } /** @@ -1870,10 +1964,10 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; - sk->sk_data_ready = xs_local_data_ready; + sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_error_report = xs_error_report; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; xprt_clear_connected(xprt); @@ -1892,9 +1986,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, /** * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint - * @xprt: RPC transport to connect * @transport: socket transport to connect - * @create_sock: function to create a socket of the correct type */ static int xs_local_setup_socket(struct sock_xprt *transport) { @@ -1966,43 +2058,84 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) msleep_interruptible(15000); } -#ifdef CONFIG_SUNRPC_SWAP +#if IS_ENABLED(CONFIG_SUNRPC_SWAP) +/* + * Note that this should be called with XPRT_LOCKED held (or when we otherwise + * know that we have exclusive access to the socket), to guard against + * races with xs_reset_transport. + */ static void xs_set_memalloc(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - if (xprt->swapper) + /* + * If there's no sock, then we have nothing to set. The + * reconnecting process will get it for us. 
+ */ + if (!transport->inet) + return; + if (atomic_read(&xprt->swapper)) sk_set_memalloc(transport->inet); } /** - * xs_swapper - Tag this transport as being used for swap. + * xs_enable_swap - Tag this transport as being used for swap. * @xprt: transport to tag - * @enable: enable/disable * + * Take a reference to this transport on behalf of the rpc_clnt, and + * optionally mark it for swapping if it wasn't already. */ -int xs_swapper(struct rpc_xprt *xprt, int enable) +static int +xs_enable_swap(struct rpc_xprt *xprt) { - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, - xprt); - int err = 0; + struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); - if (enable) { - xprt->swapper++; - xs_set_memalloc(xprt); - } else if (xprt->swapper) { - xprt->swapper--; - sk_clear_memalloc(transport->inet); - } + if (atomic_inc_return(&xprt->swapper) != 1) + return 0; + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) + return -ERESTARTSYS; + if (xs->inet) + sk_set_memalloc(xs->inet); + xprt_release_xprt(xprt, NULL); + return 0; +} - return err; +/** + * xs_disable_swap - Untag this transport as being used for swap. + * @xprt: transport to tag + * + * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the + * swapper refcount goes to 0, untag the socket as a memalloc socket. 
+ */ +static void +xs_disable_swap(struct rpc_xprt *xprt) +{ + struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); + + if (!atomic_dec_and_test(&xprt->swapper)) + return; + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) + return; + if (xs->inet) + sk_clear_memalloc(xs->inet); + xprt_release_xprt(xprt, NULL); } -EXPORT_SYMBOL_GPL(xs_swapper); #else static void xs_set_memalloc(struct rpc_xprt *xprt) { } + +static int +xs_enable_swap(struct rpc_xprt *xprt) +{ + return -EINVAL; +} + +static void +xs_disable_swap(struct rpc_xprt *xprt) +{ +} #endif static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) @@ -2017,9 +2150,9 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; - sk->sk_data_ready = xs_udp_data_ready; + sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; xprt_set_connected(xprt); @@ -2063,6 +2196,27 @@ out: xprt_wake_pending_tasks(xprt, status); } +/** + * xs_tcp_shutdown - gracefully shut down a TCP socket + * @xprt: transport + * + * Initiates a graceful shutdown of the TCP socket by calling the + * equivalent of shutdown(SHUT_RDWR); + */ +static void xs_tcp_shutdown(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct socket *sock = transport->sock; + + if (sock == NULL) + return; + if (xprt_connected(xprt)) { + kernel_sock_shutdown(sock, SHUT_RDWR); + trace_rpc_socket_shutdown(xprt, sock); + } else + xs_reset_transport(transport); +} + static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -2073,6 +2227,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) unsigned int keepidle = xprt->timeout->to_initval / HZ; unsigned int 
keepcnt = xprt->timeout->to_retries + 1; unsigned int opt_on = 1; + unsigned int timeo; /* TCP Keepalive options */ kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, @@ -2084,6 +2239,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); + /* TCP user timeout (see RFC5482) */ + timeo = jiffies_to_msecs(xprt->timeout->to_initval) * + (xprt->timeout->to_retries + 1); + kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, + (char *)&timeo, sizeof(timeo)); + write_lock_bh(&sk->sk_callback_lock); xs_save_old_callbacks(transport, sk); @@ -2093,7 +2254,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_error_report = xs_error_report; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); @@ -2132,9 +2293,6 @@ out: /** * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint - * @xprt: RPC transport to connect - * @transport: socket transport to connect - * @create_sock: function to create a socket of the correct type * * Invoked by a work queue tasklet. 
*/ @@ -2405,7 +2563,7 @@ static int bc_send_request(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct svc_xprt *xprt; - u32 len; + int len; dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid)); /* @@ -2470,6 +2628,8 @@ static struct rpc_xprt_ops xs_local_ops = { .close = xs_close, .destroy = xs_destroy, .print_stats = xs_local_print_stats, + .enable_swap = xs_enable_swap, + .disable_swap = xs_disable_swap, }; static struct rpc_xprt_ops xs_udp_ops = { @@ -2489,6 +2649,9 @@ static struct rpc_xprt_ops xs_udp_ops = { .close = xs_close, .destroy = xs_destroy, .print_stats = xs_udp_print_stats, + .enable_swap = xs_enable_swap, + .disable_swap = xs_disable_swap, + .inject_disconnect = xs_inject_disconnect, }; static struct rpc_xprt_ops xs_tcp_ops = { @@ -2505,6 +2668,15 @@ static struct rpc_xprt_ops xs_tcp_ops = { .close = xs_tcp_shutdown, .destroy = xs_destroy, .print_stats = xs_tcp_print_stats, + .enable_swap = xs_enable_swap, + .disable_swap = xs_disable_swap, + .inject_disconnect = xs_inject_disconnect, +#ifdef CONFIG_SUNRPC_BACKCHANNEL + .bc_setup = xprt_setup_bc, + .bc_up = xs_tcp_bc_up, + .bc_free_rqst = xprt_free_bc_rqst, + .bc_destroy = xprt_destroy_bc, +#endif }; /* @@ -2522,6 +2694,9 @@ static struct rpc_xprt_ops bc_tcp_ops = { .close = bc_close, .destroy = bc_destroy, .print_stats = xs_tcp_print_stats, + .enable_swap = xs_enable_swap, + .disable_swap = xs_disable_swap, + .inject_disconnect = xs_inject_disconnect, }; static int xs_init_anyaddr(const int family, struct sockaddr *sap) @@ -2572,6 +2747,7 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, } new = container_of(xprt, struct sock_xprt, xprt); + mutex_init(&new->recv_mutex); memcpy(&xprt->addr, args->dstaddr, args->addrlen); xprt->addrlen = args->addrlen; if (args->srcaddr) @@ -2625,6 +2801,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) xprt->ops = &xs_local_ops; xprt->timeout = &xs_local_default_timeout; + 
INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn); INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket); @@ -2696,21 +2873,20 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) xprt->timeout = &xs_udp_default_timeout; + INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn); + INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket); + switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_udp_setup_socket); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_udp_setup_socket); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6); break; default: @@ -2775,21 +2951,20 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->ops = &xs_tcp_ops; xprt->timeout = &xs_tcp_default_timeout; + INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); + INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); + switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_tcp_setup_socket); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_tcp_setup_socket); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); break; default: @@ -2989,7 +3164,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp) RPC_MAX_RESVPORT); } -static struct kernel_param_ops param_ops_portnr = { +static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; @@ -3008,7 +3183,7 @@ static 
int param_set_slot_table_size(const char *val, RPC_MAX_SLOT_TABLE); } -static struct kernel_param_ops param_ops_slot_table_size = { +static const struct kernel_param_ops param_ops_slot_table_size = { .set = param_set_slot_table_size, .get = param_get_uint, }; @@ -3024,7 +3199,7 @@ static int param_set_max_slot_table_size(const char *val, RPC_MAX_SLOT_TABLE_LIMIT); } -static struct kernel_param_ops param_ops_max_slot_table_size = { +static const struct kernel_param_ops param_ops_max_slot_table_size = { .set = param_set_max_slot_table_size, .get = param_get_uint, }; diff --git a/kernel/net/switchdev/switchdev.c b/kernel/net/switchdev/switchdev.c index 055453d48..d5d7132ac 100644 --- a/kernel/net/switchdev/switchdev.c +++ b/kernel/net/switchdev/switchdev.c @@ -1,6 +1,6 @@ /* * net/switchdev/switchdev.c - Switch device API - * Copyright (c) 2014 Jiri Pirko + * Copyright (c) 2014-2015 Jiri Pirko * Copyright (c) 2014-2015 Scott Feldman * * This program is free software; you can redistribute it and/or modify @@ -15,97 +15,598 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include /** - * netdev_switch_parent_id_get - Get ID of a switch + * switchdev_trans_item_enqueue - Enqueue data item to transaction queue + * + * @trans: transaction + * @data: pointer to data being queued + * @destructor: data destructor + * @tritem: transaction item being queued + * + * Enqeueue data item to transaction queue. tritem is typically placed in + * cointainter pointed at by data pointer. Destructor is called on + * transaction abort and after successful commit phase in case + * the caller did not dequeue the item before. 
+ */ +void switchdev_trans_item_enqueue(struct switchdev_trans *trans, + void *data, void (*destructor)(void const *), + struct switchdev_trans_item *tritem) +{ + tritem->data = data; + tritem->destructor = destructor; + list_add_tail(&tritem->list, &trans->item_list); +} +EXPORT_SYMBOL_GPL(switchdev_trans_item_enqueue); + +static struct switchdev_trans_item * +__switchdev_trans_item_dequeue(struct switchdev_trans *trans) +{ + struct switchdev_trans_item *tritem; + + if (list_empty(&trans->item_list)) + return NULL; + tritem = list_first_entry(&trans->item_list, + struct switchdev_trans_item, list); + list_del(&tritem->list); + return tritem; +} + +/** + * switchdev_trans_item_dequeue - Dequeue data item from transaction queue + * + * @trans: transaction + */ +void *switchdev_trans_item_dequeue(struct switchdev_trans *trans) +{ + struct switchdev_trans_item *tritem; + + tritem = __switchdev_trans_item_dequeue(trans); + BUG_ON(!tritem); + return tritem->data; +} +EXPORT_SYMBOL_GPL(switchdev_trans_item_dequeue); + +static void switchdev_trans_init(struct switchdev_trans *trans) +{ + INIT_LIST_HEAD(&trans->item_list); +} + +static void switchdev_trans_items_destroy(struct switchdev_trans *trans) +{ + struct switchdev_trans_item *tritem; + + while ((tritem = __switchdev_trans_item_dequeue(trans))) + tritem->destructor(tritem->data); +} + +static void switchdev_trans_items_warn_destroy(struct net_device *dev, + struct switchdev_trans *trans) +{ + WARN(!list_empty(&trans->item_list), "%s: transaction item queue is not empty.\n", + dev->name); + switchdev_trans_items_destroy(trans); +} + +static LIST_HEAD(deferred); +static DEFINE_SPINLOCK(deferred_lock); + +typedef void switchdev_deferred_func_t(struct net_device *dev, + const void *data); + +struct switchdev_deferred_item { + struct list_head list; + struct net_device *dev; + switchdev_deferred_func_t *func; + unsigned long data[0]; +}; + +static struct switchdev_deferred_item *switchdev_deferred_dequeue(void) +{ + 
struct switchdev_deferred_item *dfitem; + + spin_lock_bh(&deferred_lock); + if (list_empty(&deferred)) { + dfitem = NULL; + goto unlock; + } + dfitem = list_first_entry(&deferred, + struct switchdev_deferred_item, list); + list_del(&dfitem->list); +unlock: + spin_unlock_bh(&deferred_lock); + return dfitem; +} + +/** + * switchdev_deferred_process - Process ops in deferred queue + * + * Called to flush the ops currently queued in deferred ops queue. + * rtnl_lock must be held. + */ +void switchdev_deferred_process(void) +{ + struct switchdev_deferred_item *dfitem; + + ASSERT_RTNL(); + + while ((dfitem = switchdev_deferred_dequeue())) { + dfitem->func(dfitem->dev, dfitem->data); + dev_put(dfitem->dev); + kfree(dfitem); + } +} +EXPORT_SYMBOL_GPL(switchdev_deferred_process); + +static void switchdev_deferred_process_work(struct work_struct *work) +{ + rtnl_lock(); + switchdev_deferred_process(); + rtnl_unlock(); +} + +static DECLARE_WORK(deferred_process_work, switchdev_deferred_process_work); + +static int switchdev_deferred_enqueue(struct net_device *dev, + const void *data, size_t data_len, + switchdev_deferred_func_t *func) +{ + struct switchdev_deferred_item *dfitem; + + dfitem = kmalloc(sizeof(*dfitem) + data_len, GFP_ATOMIC); + if (!dfitem) + return -ENOMEM; + dfitem->dev = dev; + dfitem->func = func; + memcpy(dfitem->data, data, data_len); + dev_hold(dev); + spin_lock_bh(&deferred_lock); + list_add_tail(&dfitem->list, &deferred); + spin_unlock_bh(&deferred_lock); + schedule_work(&deferred_process_work); + return 0; +} + +/** + * switchdev_port_attr_get - Get port attribute + * + * @dev: port device + * @attr: attribute to get + */ +int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + struct switchdev_attr first = { + .id = SWITCHDEV_ATTR_ID_UNDEFINED + }; + int err = -EOPNOTSUPP; + + if (ops && 
ops->switchdev_port_attr_get) + return ops->switchdev_port_attr_get(dev, attr); + + if (attr->flags & SWITCHDEV_F_NO_RECURSE) + return err; + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to get attr on + * each port. Return -ENODATA if attr values don't + * compare across ports. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = switchdev_port_attr_get(lower_dev, attr); + if (err) + break; + if (first.id == SWITCHDEV_ATTR_ID_UNDEFINED) + first = *attr; + else if (memcmp(&first, attr, sizeof(*attr))) + return -ENODATA; + } + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_port_attr_get); + +static int __switchdev_port_attr_set(struct net_device *dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_attr_set) { + err = ops->switchdev_port_attr_set(dev, attr, trans); + goto done; + } + + if (attr->flags & SWITCHDEV_F_NO_RECURSE) + goto done; + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to set attr on + * each port. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_port_attr_set(lower_dev, attr, trans); + if (err) + break; + } + +done: + if (err == -EOPNOTSUPP && attr->flags & SWITCHDEV_F_SKIP_EOPNOTSUPP) + err = 0; + + return err; +} + +static int switchdev_port_attr_set_now(struct net_device *dev, + const struct switchdev_attr *attr) +{ + struct switchdev_trans trans; + int err; + + switchdev_trans_init(&trans); + + /* Phase I: prepare for attr set. Driver/device should fail + * here if there are going to be issues in the commit phase, + * such as lack of resources or support. The driver/device + * should reserve resources needed for the commit phase here, + * but should not commit the attr. 
+ */ + + trans.ph_prepare = true; + err = __switchdev_port_attr_set(dev, attr, &trans); + if (err) { + /* Prepare phase failed: abort the transaction. Any + * resources reserved in the prepare phase are + * released. + */ + + if (err != -EOPNOTSUPP) + switchdev_trans_items_destroy(&trans); + + return err; + } + + /* Phase II: commit attr set. This cannot fail as a fault + * of driver/device. If it does, it's a bug in the driver/device + * because the driver said everythings was OK in phase I. + */ + + trans.ph_prepare = false; + err = __switchdev_port_attr_set(dev, attr, &trans); + WARN(err, "%s: Commit of attribute (id=%d) failed.\n", + dev->name, attr->id); + switchdev_trans_items_warn_destroy(dev, &trans); + + return err; +} + +static void switchdev_port_attr_set_deferred(struct net_device *dev, + const void *data) +{ + const struct switchdev_attr *attr = data; + int err; + + err = switchdev_port_attr_set_now(dev, attr); + if (err && err != -EOPNOTSUPP) + netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", + err, attr->id); +} + +static int switchdev_port_attr_set_defer(struct net_device *dev, + const struct switchdev_attr *attr) +{ + return switchdev_deferred_enqueue(dev, attr, sizeof(*attr), + switchdev_port_attr_set_deferred); +} + +/** + * switchdev_port_attr_set - Set port attribute + * * @dev: port device - * @psid: switch ID + * @attr: attribute to set + * + * Use a 2-phase prepare-commit transaction model to ensure + * system is not left in a partially updated state due to + * failure from driver/device. * - * Get ID of a switch this port is part of. + * rtnl_lock must be held and must not be in atomic section, + * in case SWITCHDEV_F_DEFER flag is not set. 
*/ -int netdev_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +int switchdev_port_attr_set(struct net_device *dev, + const struct switchdev_attr *attr) +{ + if (attr->flags & SWITCHDEV_F_DEFER) + return switchdev_port_attr_set_defer(dev, attr); + ASSERT_RTNL(); + return switchdev_port_attr_set_now(dev, attr); +} +EXPORT_SYMBOL_GPL(switchdev_port_attr_set); + +static size_t switchdev_obj_size(const struct switchdev_obj *obj) +{ + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_VLAN: + return sizeof(struct switchdev_obj_port_vlan); + case SWITCHDEV_OBJ_ID_IPV4_FIB: + return sizeof(struct switchdev_obj_ipv4_fib); + case SWITCHDEV_OBJ_ID_PORT_FDB: + return sizeof(struct switchdev_obj_port_fdb); + default: + BUG(); + } + return 0; +} + +static int __switchdev_port_obj_add(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_obj_add) + return ops->switchdev_port_obj_add(dev, obj, trans); + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to add object on + * each port. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_port_obj_add(lower_dev, obj, trans); + if (err) + break; + } + + return err; +} + +static int switchdev_port_obj_add_now(struct net_device *dev, + const struct switchdev_obj *obj) { - const struct swdev_ops *ops = dev->swdev_ops; + struct switchdev_trans trans; + int err; + + ASSERT_RTNL(); + + switchdev_trans_init(&trans); - if (!ops || !ops->swdev_parent_id_get) - return -EOPNOTSUPP; - return ops->swdev_parent_id_get(dev, psid); + /* Phase I: prepare for obj add. Driver/device should fail + * here if there are going to be issues in the commit phase, + * such as lack of resources or support. 
The driver/device + * should reserve resources needed for the commit phase here, + * but should not commit the obj. + */ + + trans.ph_prepare = true; + err = __switchdev_port_obj_add(dev, obj, &trans); + if (err) { + /* Prepare phase failed: abort the transaction. Any + * resources reserved in the prepare phase are + * released. + */ + + if (err != -EOPNOTSUPP) + switchdev_trans_items_destroy(&trans); + + return err; + } + + /* Phase II: commit obj add. This cannot fail as a fault + * of driver/device. If it does, it's a bug in the driver/device + * because the driver said everythings was OK in phase I. + */ + + trans.ph_prepare = false; + err = __switchdev_port_obj_add(dev, obj, &trans); + WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); + switchdev_trans_items_warn_destroy(dev, &trans); + + return err; +} + +static void switchdev_port_obj_add_deferred(struct net_device *dev, + const void *data) +{ + const struct switchdev_obj *obj = data; + int err; + + err = switchdev_port_obj_add_now(dev, obj); + if (err && err != -EOPNOTSUPP) + netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", + err, obj->id); +} + +static int switchdev_port_obj_add_defer(struct net_device *dev, + const struct switchdev_obj *obj) +{ + return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj), + switchdev_port_obj_add_deferred); } -EXPORT_SYMBOL_GPL(netdev_switch_parent_id_get); /** - * netdev_switch_port_stp_update - Notify switch device port of STP - * state change + * switchdev_port_obj_add - Add port object + * * @dev: port device - * @state: port STP state + * @id: object ID + * @obj: object to add + * + * Use a 2-phase prepare-commit transaction model to ensure + * system is not left in a partially updated state due to + * failure from driver/device. * - * Notify switch device port of bridge port STP state change. + * rtnl_lock must be held and must not be in atomic section, + * in case SWITCHDEV_F_DEFER flag is not set. 
*/ -int netdev_switch_port_stp_update(struct net_device *dev, u8 state) +int switchdev_port_obj_add(struct net_device *dev, + const struct switchdev_obj *obj) { - const struct swdev_ops *ops = dev->swdev_ops; + if (obj->flags & SWITCHDEV_F_DEFER) + return switchdev_port_obj_add_defer(dev, obj); + ASSERT_RTNL(); + return switchdev_port_obj_add_now(dev, obj); +} +EXPORT_SYMBOL_GPL(switchdev_port_obj_add); + +static int switchdev_port_obj_del_now(struct net_device *dev, + const struct switchdev_obj *obj) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; - if (ops && ops->swdev_port_stp_update) - return ops->swdev_port_stp_update(dev, state); + if (ops && ops->switchdev_port_obj_del) + return ops->switchdev_port_obj_del(dev, obj); + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to delete object on + * each port. + */ netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_stp_update(lower_dev, state); - if (err && err != -EOPNOTSUPP) - return err; + err = switchdev_port_obj_del_now(lower_dev, obj); + if (err) + break; + } + + return err; +} + +static void switchdev_port_obj_del_deferred(struct net_device *dev, + const void *data) +{ + const struct switchdev_obj *obj = data; + int err; + + err = switchdev_port_obj_del_now(dev, obj); + if (err && err != -EOPNOTSUPP) + netdev_err(dev, "failed (err=%d) to del object (id=%d)\n", + err, obj->id); +} + +static int switchdev_port_obj_del_defer(struct net_device *dev, + const struct switchdev_obj *obj) +{ + return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj), + switchdev_port_obj_del_deferred); +} + +/** + * switchdev_port_obj_del - Delete port object + * + * @dev: port device + * @id: object ID + * @obj: object to delete + * + * rtnl_lock must be held and must not be in atomic section, + * in case SWITCHDEV_F_DEFER flag is not set. 
+ */ +int switchdev_port_obj_del(struct net_device *dev, + const struct switchdev_obj *obj) +{ + if (obj->flags & SWITCHDEV_F_DEFER) + return switchdev_port_obj_del_defer(dev, obj); + ASSERT_RTNL(); + return switchdev_port_obj_del_now(dev, obj); +} +EXPORT_SYMBOL_GPL(switchdev_port_obj_del); + +/** + * switchdev_port_obj_dump - Dump port objects + * + * @dev: port device + * @id: object ID + * @obj: object to dump + * @cb: function to call with a filled object + * + * rtnl_lock must be held. + */ +int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj, + switchdev_obj_dump_cb_t *cb) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + ASSERT_RTNL(); + + if (ops && ops->switchdev_port_obj_dump) + return ops->switchdev_port_obj_dump(dev, obj, cb); + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to dump objects on + * first port at bottom of stack. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = switchdev_port_obj_dump(lower_dev, obj, cb); + break; } return err; } -EXPORT_SYMBOL_GPL(netdev_switch_port_stp_update); +EXPORT_SYMBOL_GPL(switchdev_port_obj_dump); -static DEFINE_MUTEX(netdev_switch_mutex); -static RAW_NOTIFIER_HEAD(netdev_switch_notif_chain); +static RAW_NOTIFIER_HEAD(switchdev_notif_chain); /** - * register_netdev_switch_notifier - Register notifier + * register_switchdev_notifier - Register notifier * @nb: notifier_block * * Register switch device notifier. This should be used by code * which needs to monitor events happening in particular device. * Return values are same as for atomic_notifier_chain_register(). 
*/ -int register_netdev_switch_notifier(struct notifier_block *nb) +int register_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_register(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + rtnl_lock(); + err = raw_notifier_chain_register(&switchdev_notif_chain, nb); + rtnl_unlock(); return err; } -EXPORT_SYMBOL_GPL(register_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(register_switchdev_notifier); /** - * unregister_netdev_switch_notifier - Unregister notifier + * unregister_switchdev_notifier - Unregister notifier * @nb: notifier_block * * Unregister switch device notifier. * Return values are same as for atomic_notifier_chain_unregister(). */ -int unregister_netdev_switch_notifier(struct notifier_block *nb) +int unregister_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_unregister(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + rtnl_lock(); + err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb); + rtnl_unlock(); return err; } -EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); /** - * call_netdev_switch_notifiers - Call notifiers + * call_switchdev_notifiers - Call notifiers * @val: value passed unmodified to notifier function * @dev: port device * @info: notifier information data @@ -113,147 +614,498 @@ EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier); * Call all network notifier blocks. This should be called by driver * when it needs to propagate hardware event. * Return values are same as for atomic_notifier_call_chain(). + * rtnl_lock must be held. 
*/ -int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev, - struct netdev_switch_notifier_info *info) +int call_switchdev_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info) { int err; + ASSERT_RTNL(); + info->dev = dev; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_call_chain(&netdev_switch_notif_chain, val, info); - mutex_unlock(&netdev_switch_mutex); + err = raw_notifier_call_chain(&switchdev_notif_chain, val, info); + return err; +} +EXPORT_SYMBOL_GPL(call_switchdev_notifiers); + +struct switchdev_vlan_dump { + struct switchdev_obj_port_vlan vlan; + struct sk_buff *skb; + u32 filter_mask; + u16 flags; + u16 begin; + u16 end; +}; + +static int switchdev_port_vlan_dump_put(struct switchdev_vlan_dump *dump) +{ + struct bridge_vlan_info vinfo; + + vinfo.flags = dump->flags; + + if (dump->begin == 0 && dump->end == 0) { + return 0; + } else if (dump->begin == dump->end) { + vinfo.vid = dump->begin; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + } else { + vinfo.vid = dump->begin; + vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + vinfo.vid = dump->end; + vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN; + vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_END; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + } + + return 0; +} + +static int switchdev_port_vlan_dump_cb(struct switchdev_obj *obj) +{ + struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); + struct switchdev_vlan_dump *dump = + container_of(vlan, struct switchdev_vlan_dump, vlan); + int err = 0; + + if (vlan->vid_begin > vlan->vid_end) + return -EINVAL; + + if (dump->filter_mask & RTEXT_FILTER_BRVLAN) { + dump->flags = vlan->flags; + for (dump->begin = dump->end = vlan->vid_begin; + dump->begin <= vlan->vid_end; + dump->begin++, dump->end++) { + 
err = switchdev_port_vlan_dump_put(dump); + if (err) + return err; + } + } else if (dump->filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) { + if (dump->begin > vlan->vid_begin && + dump->begin >= vlan->vid_end) { + if ((dump->begin - 1) == vlan->vid_end && + dump->flags == vlan->flags) { + /* prepend */ + dump->begin = vlan->vid_begin; + } else { + err = switchdev_port_vlan_dump_put(dump); + dump->flags = vlan->flags; + dump->begin = vlan->vid_begin; + dump->end = vlan->vid_end; + } + } else if (dump->end <= vlan->vid_begin && + dump->end < vlan->vid_end) { + if ((dump->end + 1) == vlan->vid_begin && + dump->flags == vlan->flags) { + /* append */ + dump->end = vlan->vid_end; + } else { + err = switchdev_port_vlan_dump_put(dump); + dump->flags = vlan->flags; + dump->begin = vlan->vid_begin; + dump->end = vlan->vid_end; + } + } else { + err = -EINVAL; + } + } + return err; } -EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers); + +static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev, + u32 filter_mask) +{ + struct switchdev_vlan_dump dump = { + .vlan.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .skb = skb, + .filter_mask = filter_mask, + }; + int err = 0; + + if ((filter_mask & RTEXT_FILTER_BRVLAN) || + (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { + err = switchdev_port_obj_dump(dev, &dump.vlan.obj, + switchdev_port_vlan_dump_cb); + if (err) + goto err_out; + if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) + /* last one */ + err = switchdev_port_vlan_dump_put(&dump); + } + +err_out: + return err == -EOPNOTSUPP ? 0 : err; +} /** - * netdev_switch_port_bridge_setlink - Notify switch device port of bridge - * port attributes + * switchdev_port_bridge_getlink - Get bridge port attributes * * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge setlink flags * - * Notify switch device port of bridge port attributes + * Called for SELF on rtnl_bridge_getlink to get bridge port + * attributes. 
*/ -int netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask, + int nlflags) { - const struct net_device_ops *ops = dev->netdev_ops; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, + }; + u16 mode = BRIDGE_MODE_UNDEF; + u32 mask = BR_LEARNING | BR_LEARNING_SYNC | BR_FLOOD; + int err; - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return 0; + err = switchdev_port_attr_get(dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; + + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, + attr.u.brport_flags, mask, nlflags, + filter_mask, switchdev_port_vlan_fill); +} +EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink); + +static int switchdev_port_br_setflag(struct net_device *dev, + struct nlattr *nlattr, + unsigned long brport_flag) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, + }; + u8 flag = nla_get_u8(nlattr); + int err; - if (!ops->ndo_bridge_setlink) - return -EOPNOTSUPP; + err = switchdev_port_attr_get(dev, &attr); + if (err) + return err; - return ops->ndo_bridge_setlink(dev, nlh, flags); + if (flag) + attr.u.brport_flags |= brport_flag; + else + attr.u.brport_flags &= ~brport_flag; + + return switchdev_port_attr_set(dev, &attr); +} + +static const struct nla_policy +switchdev_port_bridge_policy[IFLA_BRPORT_MAX + 1] = { + [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, + [IFLA_BRPORT_COST] = { .type = NLA_U32 }, + [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, + [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, + [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, + [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, + [IFLA_BRPORT_FAST_LEAVE] = { .type = NLA_U8 }, + [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, + [IFLA_BRPORT_LEARNING_SYNC] = { .type = NLA_U8 }, + [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, +}; + +static int 
switchdev_port_br_setlink_protinfo(struct net_device *dev, + struct nlattr *protinfo) +{ + struct nlattr *attr; + int rem; + int err; + + err = nla_validate_nested(protinfo, IFLA_BRPORT_MAX, + switchdev_port_bridge_policy); + if (err) + return err; + + nla_for_each_nested(attr, protinfo, rem) { + switch (nla_type(attr)) { + case IFLA_BRPORT_LEARNING: + err = switchdev_port_br_setflag(dev, attr, + BR_LEARNING); + break; + case IFLA_BRPORT_LEARNING_SYNC: + err = switchdev_port_br_setflag(dev, attr, + BR_LEARNING_SYNC); + break; + case IFLA_BRPORT_UNICAST_FLOOD: + err = switchdev_port_br_setflag(dev, attr, BR_FLOOD); + break; + default: + err = -EOPNOTSUPP; + break; + } + if (err) + return err; + } + + return 0; +} + +static int switchdev_port_br_afspec(struct net_device *dev, + struct nlattr *afspec, + int (*f)(struct net_device *dev, + const struct switchdev_obj *obj)) +{ + struct nlattr *attr; + struct bridge_vlan_info *vinfo; + struct switchdev_obj_port_vlan vlan = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + }; + int rem; + int err; + + nla_for_each_nested(attr, afspec, rem) { + if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO) + continue; + if (nla_len(attr) != sizeof(struct bridge_vlan_info)) + return -EINVAL; + vinfo = nla_data(attr); + if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK) + return -EINVAL; + vlan.flags = vinfo->flags; + if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { + if (vlan.vid_begin) + return -EINVAL; + vlan.vid_begin = vinfo->vid; + /* don't allow range of pvids */ + if (vlan.flags & BRIDGE_VLAN_INFO_PVID) + return -EINVAL; + } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) { + if (!vlan.vid_begin) + return -EINVAL; + vlan.vid_end = vinfo->vid; + if (vlan.vid_end <= vlan.vid_begin) + return -EINVAL; + err = f(dev, &vlan.obj); + if (err) + return err; + vlan.vid_begin = 0; + } else { + if (vlan.vid_begin) + return -EINVAL; + vlan.vid_begin = vinfo->vid; + vlan.vid_end = vinfo->vid; + err = f(dev, &vlan.obj); + if (err) + return err; + 
vlan.vid_begin = 0; + } + } + + return 0; } -EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink); /** - * netdev_switch_port_bridge_dellink - Notify switch device port of bridge - * port attribute delete + * switchdev_port_bridge_setlink - Set bridge port attributes * * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge setlink flags + * @nlh: netlink header + * @flags: netlink flags * - * Notify switch device port of bridge port attribute delete + * Called for SELF on rtnl_bridge_setlink to set bridge port + * attributes. */ -int netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { - const struct net_device_ops *ops = dev->netdev_ops; + struct nlattr *protinfo; + struct nlattr *afspec; + int err = 0; - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return 0; + protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_PROTINFO); + if (protinfo) { + err = switchdev_port_br_setlink_protinfo(dev, protinfo); + if (err) + return err; + } - if (!ops->ndo_bridge_dellink) - return -EOPNOTSUPP; + afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_AF_SPEC); + if (afspec) + err = switchdev_port_br_afspec(dev, afspec, + switchdev_port_obj_add); - return ops->ndo_bridge_dellink(dev, nlh, flags); + return err; } -EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink); +EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink); /** - * ndo_dflt_netdev_switch_port_bridge_setlink - default ndo bridge setlink - * op for master devices + * switchdev_port_bridge_dellink - Set bridge port attributes * * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge setlink flags + * @nlh: netlink header + * @flags: netlink flags * - * Notify master device slaves of bridge port attributes + * Called for SELF on rtnl_bridge_dellink to set bridge port + * attributes. 
*/ -int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { - struct net_device *lower_dev; - struct list_head *iter; - int ret = 0, err = 0; + struct nlattr *afspec; - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return ret; + afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_AF_SPEC); + if (afspec) + return switchdev_port_br_afspec(dev, afspec, + switchdev_port_obj_del); - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_setlink(lower_dev, nlh, flags); - if (err && err != -EOPNOTSUPP) - ret = err; - } + return 0; +} +EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); + +/** + * switchdev_port_fdb_add - Add FDB (MAC/VLAN) entry to port + * + * @ndmsg: netlink hdr + * @nlattr: netlink attributes + * @dev: port device + * @addr: MAC address to add + * @vid: VLAN to add + * + * Add FDB entry to switch device. + */ +int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr, + u16 vid, u16 nlm_flags) +{ + struct switchdev_obj_port_fdb fdb = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, + .vid = vid, + }; - return ret; + ether_addr_copy(fdb.addr, addr); + return switchdev_port_obj_add(dev, &fdb.obj); } -EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink); +EXPORT_SYMBOL_GPL(switchdev_port_fdb_add); /** - * ndo_dflt_netdev_switch_port_bridge_dellink - default ndo bridge dellink - * op for master devices + * switchdev_port_fdb_del - Delete FDB (MAC/VLAN) entry from port * + * @ndmsg: netlink hdr + * @nlattr: netlink attributes * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge dellink flags + * @addr: MAC address to delete + * @vid: VLAN to delete * - * Notify master device slaves of bridge port attribute deletes + * Delete FDB entry from switch device. 
*/ -int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr, + u16 vid) { - struct net_device *lower_dev; - struct list_head *iter; - int ret = 0, err = 0; + struct switchdev_obj_port_fdb fdb = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, + .vid = vid, + }; - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return ret; + ether_addr_copy(fdb.addr, addr); + return switchdev_port_obj_del(dev, &fdb.obj); +} +EXPORT_SYMBOL_GPL(switchdev_port_fdb_del); - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_dellink(lower_dev, nlh, flags); - if (err && err != -EOPNOTSUPP) - ret = err; - } +struct switchdev_fdb_dump { + struct switchdev_obj_port_fdb fdb; + struct net_device *dev; + struct sk_buff *skb; + struct netlink_callback *cb; + int idx; +}; - return ret; +static int switchdev_port_fdb_dump_cb(struct switchdev_obj *obj) +{ + struct switchdev_obj_port_fdb *fdb = SWITCHDEV_OBJ_PORT_FDB(obj); + struct switchdev_fdb_dump *dump = + container_of(fdb, struct switchdev_fdb_dump, fdb); + u32 portid = NETLINK_CB(dump->cb->skb).portid; + u32 seq = dump->cb->nlh->nlmsg_seq; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + if (dump->idx < dump->cb->args[0]) + goto skip; + + nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, + sizeof(*ndm), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = NTF_SELF; + ndm->ndm_type = 0; + ndm->ndm_ifindex = dump->dev->ifindex; + ndm->ndm_state = fdb->ndm_state; + + if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, fdb->addr)) + goto nla_put_failure; + + if (fdb->vid && nla_put_u16(dump->skb, NDA_VLAN, fdb->vid)) + goto nla_put_failure; + + nlmsg_end(dump->skb, nlh); + +skip: + dump->idx++; + return 0; + +nla_put_failure: + 
nlmsg_cancel(dump->skb, nlh); + return -EMSGSIZE; } -EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_dellink); -static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) +/** + * switchdev_port_fdb_dump - Dump port FDB (MAC/VLAN) entries + * + * @skb: netlink skb + * @cb: netlink callback + * @dev: port device + * @filter_dev: filter device + * @idx: + * + * Delete FDB entry from switch device. + */ +int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, + struct net_device *dev, + struct net_device *filter_dev, int idx) +{ + struct switchdev_fdb_dump dump = { + .fdb.obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, + .dev = dev, + .skb = skb, + .cb = cb, + .idx = idx, + }; + + switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb); + return dump.idx; +} +EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); + +static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) { - const struct swdev_ops *ops = dev->swdev_ops; + const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; struct net_device *port_dev; struct list_head *iter; /* Recusively search down until we find a sw port dev. - * (A sw port dev supports swdev_parent_id_get). + * (A sw port dev supports switchdev_port_attr_get). 
*/ - if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD && - ops && ops->swdev_parent_id_get) + if (ops && ops->switchdev_port_attr_get) return dev; netdev_for_each_lower_dev(dev, lower_dev, iter) { - port_dev = netdev_switch_get_lowest_dev(lower_dev); + port_dev = switchdev_get_lowest_dev(lower_dev); if (port_dev) return port_dev; } @@ -261,13 +1113,17 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) return NULL; } -static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) +static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) { - struct netdev_phys_item_id psid; - struct netdev_phys_item_id prev_psid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; + struct switchdev_attr prev_attr; struct net_device *dev = NULL; int nhsel; + ASSERT_RTNL(); + /* For this route, all nexthop devs must be on the same switch. */ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { @@ -276,28 +1132,25 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) if (!nh->nh_dev) return NULL; - dev = netdev_switch_get_lowest_dev(nh->nh_dev); + dev = switchdev_get_lowest_dev(nh->nh_dev); if (!dev) return NULL; - if (netdev_switch_parent_id_get(dev, &psid)) + if (switchdev_port_attr_get(dev, &attr)) return NULL; - if (nhsel > 0) { - if (prev_psid.id_len != psid.id_len) + if (nhsel > 0 && + !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid)) return NULL; - if (memcmp(prev_psid.id, psid.id, psid.id_len)) - return NULL; - } - prev_psid = psid; + prev_attr = attr; } return dev; } /** - * netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch + * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry * * @dst: route's IPv4 destination address * @dst_len: destination address length (prefix length) @@ -307,15 +1160,25 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) * @nlflags: netlink flags passed in (NLM_F_*) * @tb_id: route table ID * - * 
Add IPv4 route entry to switch device. + * Add/modify switch IPv4 route entry. */ -int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id) +int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 nlflags, u32 tb_id) { + struct switchdev_obj_ipv4_fib ipv4_fib = { + .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, + .dst = dst, + .dst_len = dst_len, + .tos = tos, + .type = type, + .nlflags = nlflags, + .tb_id = tb_id, + }; struct net_device *dev; - const struct swdev_ops *ops; int err = 0; + memcpy(&ipv4_fib.fi, fi, sizeof(ipv4_fib.fi)); + /* Don't offload route if using custom ip rules or if * IPv4 FIB offloading has been disabled completely. */ @@ -328,25 +1191,20 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, if (fi->fib_net->ipv4.fib_offload_disabled) return 0; - dev = netdev_switch_get_dev_by_nhs(fi); + dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; - ops = dev->swdev_ops; - - if (ops->swdev_fib_ipv4_add) { - err = ops->swdev_fib_ipv4_add(dev, htonl(dst), dst_len, - fi, tos, type, nlflags, - tb_id); - if (!err) - fi->fib_flags |= RTNH_F_OFFLOAD; - } - return err; + err = switchdev_port_obj_add(dev, &ipv4_fib.obj); + if (!err) + fi->fib_flags |= RTNH_F_OFFLOAD; + + return err == -EOPNOTSUPP ? 0 : err; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); /** - * netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch + * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch * * @dst: route's IPv4 destination address * @dst_len: destination address length (prefix length) @@ -357,38 +1215,44 @@ EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add); * * Delete IPv4 route entry from switch device. 
*/ -int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) +int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) { + struct switchdev_obj_ipv4_fib ipv4_fib = { + .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, + .dst = dst, + .dst_len = dst_len, + .tos = tos, + .type = type, + .nlflags = 0, + .tb_id = tb_id, + }; struct net_device *dev; - const struct swdev_ops *ops; int err = 0; + memcpy(&ipv4_fib.fi, fi, sizeof(ipv4_fib.fi)); + if (!(fi->fib_flags & RTNH_F_OFFLOAD)) return 0; - dev = netdev_switch_get_dev_by_nhs(fi); + dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; - ops = dev->swdev_ops; - if (ops->swdev_fib_ipv4_del) { - err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len, - fi, tos, type, tb_id); - if (!err) - fi->fib_flags &= ~RTNH_F_OFFLOAD; - } + err = switchdev_port_obj_del(dev, &ipv4_fib.obj); + if (!err) + fi->fib_flags &= ~RTNH_F_OFFLOAD; - return err; + return err == -EOPNOTSUPP ? 0 : err; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_del); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del); /** - * netdev_switch_fib_ipv4_abort - Abort an IPv4 FIB operation + * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation * * @fi: route FIB info structure */ -void netdev_switch_fib_ipv4_abort(struct fib_info *fi) +void switchdev_fib_ipv4_abort(struct fib_info *fi) { /* There was a problem installing this route to the offload * device. 
For now, until we come up with more refined @@ -401,4 +1265,108 @@ void netdev_switch_fib_ipv4_abort(struct fib_info *fi) fib_flush_external(fi->fib_net); fi->fib_net->ipv4.fib_offload_disabled = true; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_abort); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); + +static bool switchdev_port_same_parent_id(struct net_device *a, + struct net_device *b) +{ + struct switchdev_attr a_attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; + struct switchdev_attr b_attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; + + if (switchdev_port_attr_get(a, &a_attr) || + switchdev_port_attr_get(b, &b_attr)) + return false; + + return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); +} + +static u32 switchdev_port_fwd_mark_get(struct net_device *dev, + struct net_device *group_dev) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(group_dev, lower_dev, iter) { + if (lower_dev == dev) + continue; + if (switchdev_port_same_parent_id(dev, lower_dev)) + return lower_dev->offload_fwd_mark; + return switchdev_port_fwd_mark_get(dev, lower_dev); + } + + return dev->ifindex; +} + +static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, + u32 old_mark, u32 *reset_mark) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(group_dev, lower_dev, iter) { + if (lower_dev->offload_fwd_mark == old_mark) { + if (!*reset_mark) + *reset_mark = lower_dev->ifindex; + lower_dev->offload_fwd_mark = *reset_mark; + } + switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark); + } +} + +/** + * switchdev_port_fwd_mark_set - Set port offload forwarding mark + * + * @dev: port device + * @group_dev: containing device + * @joining: true if dev is joining group; false if leaving group + * + * An ungrouped port's offload mark is just its ifindex. 
A grouped + * port's (member of a bridge, for example) offload mark is the ifindex + * of one of the ports in the group with the same parent (switch) ID. + * Ports on the same device in the same group will have the same mark. + * + * Example: + * + * br0 ifindex=9 + * sw1p1 ifindex=2 mark=2 + * sw1p2 ifindex=3 mark=2 + * sw2p1 ifindex=4 mark=5 + * sw2p2 ifindex=5 mark=5 + * + * If sw2p2 leaves the bridge, we'll have: + * + * br0 ifindex=9 + * sw1p1 ifindex=2 mark=2 + * sw1p2 ifindex=3 mark=2 + * sw2p1 ifindex=4 mark=4 + * sw2p2 ifindex=5 mark=5 + */ +void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining) +{ + u32 mark = dev->ifindex; + u32 reset_mark = 0; + + if (group_dev) { + ASSERT_RTNL(); + if (joining) + mark = switchdev_port_fwd_mark_get(dev, group_dev); + else if (dev->offload_fwd_mark == mark) + /* Ohoh, this port was the mark reference port, + * but it's leaving the group, so reset the + * mark for the remaining ports in the group. 
+ */ + switchdev_port_fwd_mark_reset(group_dev, mark, + &reset_mark); + } + + dev->offload_fwd_mark = mark; +} +EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set); diff --git a/kernel/net/sysctl_net.c b/kernel/net/sysctl_net.c index e7000be32..ed98c1fc3 100644 --- a/kernel/net/sysctl_net.c +++ b/kernel/net/sysctl_net.c @@ -94,10 +94,14 @@ __init int net_sysctl_init(void) goto out; ret = register_pernet_subsys(&sysctl_pernet_ops); if (ret) - goto out; + goto out1; register_sysctl_root(&net_sysctl_root); out: return ret; +out1: + unregister_sysctl_table(net_header); + net_header = NULL; + goto out; } struct ctl_table_header *register_net_sysctl(struct net *net, diff --git a/kernel/net/tipc/addr.c b/kernel/net/tipc/addr.c index ba7daa864..48fd3b5a7 100644 --- a/kernel/net/tipc/addr.c +++ b/kernel/net/tipc/addr.c @@ -38,13 +38,6 @@ #include "addr.h" #include "core.h" -u32 tipc_own_addr(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - return tn->own_addr; -} - /** * in_own_cluster - test for cluster inclusion; <0.0.0> always matches */ diff --git a/kernel/net/tipc/addr.h b/kernel/net/tipc/addr.h index 7ba6d5c8a..93f7c983b 100644 --- a/kernel/net/tipc/addr.h +++ b/kernel/net/tipc/addr.h @@ -41,10 +41,18 @@ #include #include #include +#include "core.h" #define TIPC_ZONE_MASK 0xff000000u #define TIPC_CLUSTER_MASK 0xfffff000u +static inline u32 tipc_own_addr(struct net *net) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + + return tn->own_addr; +} + static inline u32 tipc_zone_mask(u32 addr) { return addr & TIPC_ZONE_MASK; diff --git a/kernel/net/tipc/bcast.c b/kernel/net/tipc/bcast.c index c5cbdcb1f..92e367a0a 100644 --- a/kernel/net/tipc/bcast.c +++ b/kernel/net/tipc/bcast.c @@ -35,690 +35,301 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ +#include #include "socket.h" #include "msg.h" #include "bcast.h" #include "name_distr.h" -#include "core.h" +#include "link.h" +#include "node.h" -#define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */ -#define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */ +#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ +#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ const char tipc_bclink_name[] = "broadcast-link"; -static void tipc_nmap_diff(struct tipc_node_map *nm_a, - struct tipc_node_map *nm_b, - struct tipc_node_map *nm_diff); -static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node); -static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node); - -static void tipc_bclink_lock(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - spin_lock_bh(&tn->bclink->lock); -} - -static void tipc_bclink_unlock(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - spin_unlock_bh(&tn->bclink->lock); -} - -void tipc_bclink_input(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - tipc_sk_mcast_rcv(net, &tn->bclink->arrvq, &tn->bclink->inputq); -} - -uint tipc_bclink_get_mtu(void) -{ - return MAX_PKT_DEFAULT_MCAST; -} - -static u32 bcbuf_acks(struct sk_buff *buf) -{ - return (u32)(unsigned long)TIPC_SKB_CB(buf)->handle; -} - -static void bcbuf_set_acks(struct sk_buff *buf, u32 acks) -{ - TIPC_SKB_CB(buf)->handle = (void *)(unsigned long)acks; -} - -static void bcbuf_decr_acks(struct sk_buff *buf) -{ - bcbuf_set_acks(buf, bcbuf_acks(buf) - 1); -} +/** + * struct tipc_bc_base - base structure for keeping broadcast send state + * @link: broadcast send link structure + * @inputq: data input queue; will only carry SOCK_WAKEUP messages + * @dest: array keeping number of reachable destinations per bearer + * @primary_bearer: a bearer having links to all broadcast destinations, if any + */ +struct tipc_bc_base { + struct 
tipc_link *link; + struct sk_buff_head inputq; + int dests[MAX_BEARERS]; + int primary_bearer; +}; -void tipc_bclink_add_node(struct net *net, u32 addr) +static struct tipc_bc_base *tipc_bc_base(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - - tipc_bclink_lock(net); - tipc_nmap_add(&tn->bclink->bcast_nodes, addr); - tipc_bclink_unlock(net); + return tipc_net(net)->bcbase; } -void tipc_bclink_remove_node(struct net *net, u32 addr) +int tipc_bcast_get_mtu(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - - tipc_bclink_lock(net); - tipc_nmap_remove(&tn->bclink->bcast_nodes, addr); - tipc_bclink_unlock(net); + return tipc_link_mtu(tipc_bc_sndlink(net)); } -static void bclink_set_last_sent(struct net *net) +/* tipc_bcbase_select_primary(): find a bearer with links to all destinations, + * if any, and make it primary bearer + */ +static void tipc_bcbase_select_primary(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *bcl = tn->bcl; - struct sk_buff *skb = skb_peek(&bcl->backlogq); + struct tipc_bc_base *bb = tipc_bc_base(net); + int all_dests = tipc_link_bc_peers(bb->link); + int i, mtu; - if (skb) - bcl->fsm_msg_cnt = mod(buf_seqno(skb) - 1); - else - bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1); -} - -u32 tipc_bclink_get_last_sent(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); + bb->primary_bearer = INVALID_BEARER_ID; - return tn->bcl->fsm_msg_cnt; -} + if (!all_dests) + return; -static void bclink_update_last_sent(struct tipc_node *node, u32 seqno) -{ - node->bclink.last_sent = less_eq(node->bclink.last_sent, seqno) ? 
- seqno : node->bclink.last_sent; -} + for (i = 0; i < MAX_BEARERS; i++) { + if (!bb->dests[i]) + continue; -/** - * tipc_bclink_retransmit_to - get most recent node to request retransmission - * - * Called with bclink_lock locked - */ -struct tipc_node *tipc_bclink_retransmit_to(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); + mtu = tipc_bearer_mtu(net, i); + if (mtu < tipc_link_mtu(bb->link)) + tipc_link_set_mtu(bb->link, mtu); - return tn->bclink->retransmit_to; -} + if (bb->dests[i] < all_dests) + continue; -/** - * bclink_retransmit_pkt - retransmit broadcast packets - * @after: sequence number of last packet to *not* retransmit - * @to: sequence number of last packet to retransmit - * - * Called with bclink_lock locked - */ -static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) -{ - struct sk_buff *skb; - struct tipc_link *bcl = tn->bcl; + bb->primary_bearer = i; - skb_queue_walk(&bcl->transmq, skb) { - if (more(buf_seqno(skb), after)) { - tipc_link_retransmit(bcl, skb, mod(to - after)); + /* Reduce risk that all nodes select same primary */ + if ((i ^ tipc_own_addr(net)) & 1) break; - } } } -/** - * tipc_bclink_wakeup_users - wake up pending users - * - * Called with no locks taken - */ -void tipc_bclink_wakeup_users(struct net *net) +void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_bc_base *bb = tipc_bc_base(net); - tipc_sk_rcv(net, &tn->bclink->link.wakeupq); + tipc_bcast_lock(net); + bb->dests[bearer_id]++; + tipc_bcbase_select_primary(net); + tipc_bcast_unlock(net); } -/** - * tipc_bclink_acknowledge - handle acknowledgement of broadcast packets - * @n_ptr: node that sent acknowledgement info - * @acked: broadcast sequence # that has been acknowledged - * - * Node is locked, bclink_lock unlocked. 
- */ -void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) +void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id) { - struct sk_buff *skb, *tmp; - unsigned int released = 0; - struct net *net = n_ptr->net; - struct tipc_net *tn = net_generic(net, tipc_net_id); - - if (unlikely(!n_ptr->bclink.recv_permitted)) - return; - - tipc_bclink_lock(net); - - /* Bail out if tx queue is empty (no clean up is required) */ - skb = skb_peek(&tn->bcl->transmq); - if (!skb) - goto exit; - - /* Determine which messages need to be acknowledged */ - if (acked == INVALID_LINK_SEQ) { - /* - * Contact with specified node has been lost, so need to - * acknowledge sent messages only (if other nodes still exist) - * or both sent and unsent messages (otherwise) - */ - if (tn->bclink->bcast_nodes.count) - acked = tn->bcl->fsm_msg_cnt; - else - acked = tn->bcl->next_out_no; - } else { - /* - * Bail out if specified sequence number does not correspond - * to a message that has been sent and not yet acknowledged - */ - if (less(acked, buf_seqno(skb)) || - less(tn->bcl->fsm_msg_cnt, acked) || - less_eq(acked, n_ptr->bclink.acked)) - goto exit; - } - - /* Skip over packets that node has previously acknowledged */ - skb_queue_walk(&tn->bcl->transmq, skb) { - if (more(buf_seqno(skb), n_ptr->bclink.acked)) - break; - } + struct tipc_bc_base *bb = tipc_bc_base(net); - /* Update packets that node is now acknowledging */ - skb_queue_walk_from_safe(&tn->bcl->transmq, skb, tmp) { - if (more(buf_seqno(skb), acked)) - break; - bcbuf_decr_acks(skb); - bclink_set_last_sent(net); - if (bcbuf_acks(skb) == 0) { - __skb_unlink(skb, &tn->bcl->transmq); - kfree_skb(skb); - released = 1; - } - } - n_ptr->bclink.acked = acked; - - /* Try resolving broadcast link congestion, if necessary */ - if (unlikely(skb_peek(&tn->bcl->backlogq))) { - tipc_link_push_packets(tn->bcl); - bclink_set_last_sent(net); - } - if (unlikely(released && !skb_queue_empty(&tn->bcl->wakeupq))) - n_ptr->action_flags |= 
TIPC_WAKEUP_BCAST_USERS; -exit: - tipc_bclink_unlock(net); + tipc_bcast_lock(net); + bb->dests[bearer_id]--; + tipc_bcbase_select_primary(net); + tipc_bcast_unlock(net); } -/** - * tipc_bclink_update_link_state - update broadcast link state +/* tipc_bcbase_xmit - broadcast a packet queue across one or more bearers * - * RCU and node lock set + * Note that number of reachable destinations, as indicated in the dests[] + * array, may transitionally differ from the number of destinations indicated + * in each sent buffer. We can sustain this. Excess destination nodes will + * drop and never acknowledge the unexpected packets, and missing destinations + * will either require retransmission (if they are just about to be added to + * the bearer), or be removed from the buffer's 'ackers' counter (if they + * just went down) */ -void tipc_bclink_update_link_state(struct tipc_node *n_ptr, - u32 last_sent) +static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq) { - struct sk_buff *buf; - struct net *net = n_ptr->net; - struct tipc_net *tn = net_generic(net, tipc_net_id); + int bearer_id; + struct tipc_bc_base *bb = tipc_bc_base(net); + struct sk_buff *skb, *_skb; + struct sk_buff_head _xmitq; - /* Ignore "stale" link state info */ - if (less_eq(last_sent, n_ptr->bclink.last_in)) + if (skb_queue_empty(xmitq)) return; - /* Update link synchronization state; quit if in sync */ - bclink_update_last_sent(n_ptr, last_sent); - - if (n_ptr->bclink.last_sent == n_ptr->bclink.last_in) + /* The typical case: at least one bearer has links to all nodes */ + bearer_id = bb->primary_bearer; + if (bearer_id >= 0) { + tipc_bearer_bc_xmit(net, bearer_id, xmitq); return; - - /* Update out-of-sync state; quit if loss is still unconfirmed */ - if ((++n_ptr->bclink.oos_state) == 1) { - if (n_ptr->bclink.deferred_size < (TIPC_MIN_LINK_WIN / 2)) - return; - n_ptr->bclink.oos_state++; } - /* Don't NACK if one has been recently sent (or seen) */ - if (n_ptr->bclink.oos_state & 0x1) 
- return; + /* We have to transmit across all bearers */ + skb_queue_head_init(&_xmitq); + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + if (!bb->dests[bearer_id]) + continue; - /* Send NACK */ - buf = tipc_buf_acquire(INT_H_SIZE); - if (buf) { - struct tipc_msg *msg = buf_msg(buf); - struct sk_buff *skb = skb_peek(&n_ptr->bclink.deferdq); - u32 to = skb ? buf_seqno(skb) - 1 : n_ptr->bclink.last_sent; - - tipc_msg_init(tn->own_addr, msg, BCAST_PROTOCOL, STATE_MSG, - INT_H_SIZE, n_ptr->addr); - msg_set_non_seq(msg, 1); - msg_set_mc_netid(msg, tn->net_id); - msg_set_bcast_ack(msg, n_ptr->bclink.last_in); - msg_set_bcgap_after(msg, n_ptr->bclink.last_in); - msg_set_bcgap_to(msg, to); - - tipc_bclink_lock(net); - tipc_bearer_send(net, MAX_BEARERS, buf, NULL); - tn->bcl->stats.sent_nacks++; - tipc_bclink_unlock(net); - kfree_skb(buf); - - n_ptr->bclink.oos_state++; + skb_queue_walk(xmitq, skb) { + _skb = pskb_copy_for_clone(skb, GFP_ATOMIC); + if (!_skb) + break; + __skb_queue_tail(&_xmitq, _skb); + } + tipc_bearer_bc_xmit(net, bearer_id, &_xmitq); } + __skb_queue_purge(xmitq); + __skb_queue_purge(&_xmitq); } -/** - * bclink_peek_nack - monitor retransmission requests sent by other nodes - * - * Delay any upcoming NACK by this node if another node has already - * requested the first message this node is going to ask for. 
- */ -static void bclink_peek_nack(struct net *net, struct tipc_msg *msg) -{ - struct tipc_node *n_ptr = tipc_node_find(net, msg_destnode(msg)); - - if (unlikely(!n_ptr)) - return; - - tipc_node_lock(n_ptr); - if (n_ptr->bclink.recv_permitted && - (n_ptr->bclink.last_in != n_ptr->bclink.last_sent) && - (n_ptr->bclink.last_in == msg_bcgap_after(msg))) - n_ptr->bclink.oos_state = 2; - tipc_node_unlock(n_ptr); - tipc_node_put(n_ptr); -} - -/* tipc_bclink_xmit - deliver buffer chain to all nodes in cluster +/* tipc_bcast_xmit - deliver buffer chain to all nodes in cluster * and to identified node local sockets * @net: the applicable net namespace * @list: chain of buffers containing message * Consumes the buffer chain, except when returning -ELINKCONG * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE */ -int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list) +int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *bcl = tn->bcl; - struct tipc_bclink *bclink = tn->bclink; + struct tipc_link *l = tipc_bc_sndlink(net); + struct sk_buff_head xmitq, inputq, rcvq; int rc = 0; - int bc = 0; - struct sk_buff *skb; - struct sk_buff_head arrvq; - struct sk_buff_head inputq; - /* Prepare clone of message for local node */ - skb = tipc_msg_reassemble(list); - if (unlikely(!skb)) { - __skb_queue_purge(list); + __skb_queue_head_init(&rcvq); + __skb_queue_head_init(&xmitq); + skb_queue_head_init(&inputq); + + /* Prepare message clone for local node */ + if (unlikely(!tipc_msg_reassemble(list, &rcvq))) return -EHOSTUNREACH; - } - /* Broadcast to all nodes */ - if (likely(bclink)) { - tipc_bclink_lock(net); - if (likely(bclink->bcast_nodes.count)) { - rc = __tipc_link_xmit(net, bcl, list); - if (likely(!rc)) { - u32 len = skb_queue_len(&bcl->transmq); - - bclink_set_last_sent(net); - bcl->stats.queue_sz_counts++; - bcl->stats.accu_queue_sz += len; - } - bc = 1; - } - 
tipc_bclink_unlock(net); - } - if (unlikely(!bc)) - __skb_queue_purge(list); + tipc_bcast_lock(net); + if (tipc_link_bc_peers(l)) + rc = tipc_link_xmit(l, list, &xmitq); + tipc_bcast_unlock(net); + /* Don't send to local node if adding to link failed */ if (unlikely(rc)) { - kfree_skb(skb); + __skb_queue_purge(&rcvq); return rc; } - /* Deliver message clone */ - __skb_queue_head_init(&arrvq); - skb_queue_head_init(&inputq); - __skb_queue_tail(&arrvq, skb); - tipc_sk_mcast_rcv(net, &arrvq, &inputq); - return rc; -} -/** - * bclink_accept_pkt - accept an incoming, in-sequence broadcast packet - * - * Called with both sending node's lock and bclink_lock taken. - */ -static void bclink_accept_pkt(struct tipc_node *node, u32 seqno) -{ - struct tipc_net *tn = net_generic(node->net, tipc_net_id); - - bclink_update_last_sent(node, seqno); - node->bclink.last_in = seqno; - node->bclink.oos_state = 0; - tn->bcl->stats.recv_info++; - - /* - * Unicast an ACK periodically, ensuring that - * all nodes in the cluster don't ACK at the same time - */ - if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) { - tipc_link_proto_xmit(node->active_links[node->addr & 1], - STATE_MSG, 0, 0, 0, 0); - tn->bcl->stats.sent_acks++; - } + /* Broadcast to all nodes, inluding local node */ + tipc_bcbase_xmit(net, &xmitq); + tipc_sk_mcast_rcv(net, &rcvq, &inputq); + __skb_queue_purge(list); + return 0; } -/** - * tipc_bclink_rcv - receive a broadcast packet, and deliver upwards +/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link * * RCU is locked, no other locks set */ -void tipc_bclink_rcv(struct net *net, struct sk_buff *buf) +int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *bcl = tn->bcl; - struct tipc_msg *msg = buf_msg(buf); - struct tipc_node *node; - u32 next_in; - u32 seqno; - int deferred = 0; - int pos = 0; - struct sk_buff *iskb; - struct sk_buff_head *arrvq, 
*inputq; - - /* Screen out unwanted broadcast messages */ - if (msg_mc_netid(msg) != tn->net_id) - goto exit; - - node = tipc_node_find(net, msg_prevnode(msg)); - if (unlikely(!node)) - goto exit; - - tipc_node_lock(node); - if (unlikely(!node->bclink.recv_permitted)) - goto unlock; - - /* Handle broadcast protocol message */ - if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) { - if (msg_type(msg) != STATE_MSG) - goto unlock; - if (msg_destnode(msg) == tn->own_addr) { - tipc_bclink_acknowledge(node, msg_bcast_ack(msg)); - tipc_bclink_lock(net); - bcl->stats.recv_nacks++; - tn->bclink->retransmit_to = node; - bclink_retransmit_pkt(tn, msg_bcgap_after(msg), - msg_bcgap_to(msg)); - tipc_bclink_unlock(net); - tipc_node_unlock(node); - } else { - tipc_node_unlock(node); - bclink_peek_nack(net, msg); - } - tipc_node_put(node); - goto exit; - } - - /* Handle in-sequence broadcast message */ - seqno = msg_seqno(msg); - next_in = mod(node->bclink.last_in + 1); - arrvq = &tn->bclink->arrvq; - inputq = &tn->bclink->inputq; - - if (likely(seqno == next_in)) { -receive: - /* Deliver message to destination */ - if (likely(msg_isdata(msg))) { - tipc_bclink_lock(net); - bclink_accept_pkt(node, seqno); - spin_lock_bh(&inputq->lock); - __skb_queue_tail(arrvq, buf); - spin_unlock_bh(&inputq->lock); - node->action_flags |= TIPC_BCAST_MSG_EVT; - tipc_bclink_unlock(net); - tipc_node_unlock(node); - } else if (msg_user(msg) == MSG_BUNDLER) { - tipc_bclink_lock(net); - bclink_accept_pkt(node, seqno); - bcl->stats.recv_bundles++; - bcl->stats.recv_bundled += msg_msgcnt(msg); - pos = 0; - while (tipc_msg_extract(buf, &iskb, &pos)) { - spin_lock_bh(&inputq->lock); - __skb_queue_tail(arrvq, iskb); - spin_unlock_bh(&inputq->lock); - } - node->action_flags |= TIPC_BCAST_MSG_EVT; - tipc_bclink_unlock(net); - tipc_node_unlock(node); - } else if (msg_user(msg) == MSG_FRAGMENTER) { - tipc_bclink_lock(net); - bclink_accept_pkt(node, seqno); - tipc_buf_append(&node->bclink.reasm_buf, &buf); - if 
(unlikely(!buf && !node->bclink.reasm_buf)) { - tipc_bclink_unlock(net); - goto unlock; - } - bcl->stats.recv_fragments++; - if (buf) { - bcl->stats.recv_fragmented++; - msg = buf_msg(buf); - tipc_bclink_unlock(net); - goto receive; - } - tipc_bclink_unlock(net); - tipc_node_unlock(node); - } else { - tipc_bclink_lock(net); - bclink_accept_pkt(node, seqno); - tipc_bclink_unlock(net); - tipc_node_unlock(node); - kfree_skb(buf); - } - buf = NULL; - - /* Determine new synchronization state */ - tipc_node_lock(node); - if (unlikely(!tipc_node_is_up(node))) - goto unlock; - - if (node->bclink.last_in == node->bclink.last_sent) - goto unlock; + struct tipc_msg *hdr = buf_msg(skb); + struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; + struct sk_buff_head xmitq; + int rc; - if (skb_queue_empty(&node->bclink.deferdq)) { - node->bclink.oos_state = 1; - goto unlock; - } - - msg = buf_msg(skb_peek(&node->bclink.deferdq)); - seqno = msg_seqno(msg); - next_in = mod(next_in + 1); - if (seqno != next_in) - goto unlock; - - /* Take in-sequence message from deferred queue & deliver it */ - buf = __skb_dequeue(&node->bclink.deferdq); - goto receive; - } + __skb_queue_head_init(&xmitq); - /* Handle out-of-sequence broadcast message */ - if (less(next_in, seqno)) { - deferred = tipc_link_defer_pkt(&node->bclink.deferdq, - buf); - bclink_update_last_sent(node, seqno); - buf = NULL; + if (msg_mc_netid(hdr) != tipc_netid(net) || !tipc_link_is_up(l)) { + kfree_skb(skb); + return 0; } - tipc_bclink_lock(net); - - if (deferred) - bcl->stats.deferred_recv++; + tipc_bcast_lock(net); + if (msg_user(hdr) == BCAST_PROTOCOL) + rc = tipc_link_bc_nack_rcv(l, skb, &xmitq); else - bcl->stats.duplicates++; + rc = tipc_link_rcv(l, skb, NULL); + tipc_bcast_unlock(net); - tipc_bclink_unlock(net); + tipc_bcbase_xmit(net, &xmitq); -unlock: - tipc_node_unlock(node); - tipc_node_put(node); -exit: - kfree_skb(buf); -} + /* Any socket wakeup messages ? 
*/ + if (!skb_queue_empty(inputq)) + tipc_sk_rcv(net, inputq); -u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr) -{ - return (n_ptr->bclink.recv_permitted && - (tipc_bclink_get_last_sent(n_ptr->net) != n_ptr->bclink.acked)); + return rc; } - -/** - * tipc_bcbearer_send - send a packet through the broadcast pseudo-bearer - * - * Send packet over as many bearers as necessary to reach all nodes - * that have joined the broadcast link. +/* tipc_bcast_ack_rcv - receive and handle a broadcast acknowledge * - * Returns 0 (packet sent successfully) under all circumstances, - * since the broadcast link's pseudo-bearer never blocks + * RCU is locked, no other locks set */ -static int tipc_bcbearer_send(struct net *net, struct sk_buff *buf, - struct tipc_bearer *unused1, - struct tipc_media_addr *unused2) +void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked) { - int bp_index; - struct tipc_msg *msg = buf_msg(buf); - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_bcbearer *bcbearer = tn->bcbearer; - struct tipc_bclink *bclink = tn->bclink; - - /* Prepare broadcast link message for reliable transmission, - * if first time trying to send it; - * preparation is skipped for broadcast link protocol messages - * since they are sent in an unreliable manner and don't need it - */ - if (likely(!msg_non_seq(buf_msg(buf)))) { - bcbuf_set_acks(buf, bclink->bcast_nodes.count); - msg_set_non_seq(msg, 1); - msg_set_mc_netid(msg, tn->net_id); - tn->bcl->stats.sent_info++; - if (WARN_ON(!bclink->bcast_nodes.count)) { - dump_stack(); - return 0; - } - } + struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; + struct sk_buff_head xmitq; - /* Send buffer over bearers until all targets reached */ - bcbearer->remains = bclink->bcast_nodes; - - for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) { - struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary; - struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary; - struct tipc_bearer 
*bp[2] = {p, s}; - struct tipc_bearer *b = bp[msg_link_selector(msg)]; - struct sk_buff *tbuf; - - if (!p) - break; /* No more bearers to try */ - if (!b) - b = p; - tipc_nmap_diff(&bcbearer->remains, &b->nodes, - &bcbearer->remains_new); - if (bcbearer->remains_new.count == bcbearer->remains.count) - continue; /* Nothing added by bearer pair */ - - if (bp_index == 0) { - /* Use original buffer for first bearer */ - tipc_bearer_send(net, b->identity, buf, &b->bcast_addr); - } else { - /* Avoid concurrent buffer access */ - tbuf = pskb_copy_for_clone(buf, GFP_ATOMIC); - if (!tbuf) - break; - tipc_bearer_send(net, b->identity, tbuf, - &b->bcast_addr); - kfree_skb(tbuf); /* Bearer keeps a clone */ - } - if (bcbearer->remains_new.count == 0) - break; /* All targets reached */ + __skb_queue_head_init(&xmitq); - bcbearer->remains = bcbearer->remains_new; - } + tipc_bcast_lock(net); + tipc_link_bc_ack_rcv(l, acked, &xmitq); + tipc_bcast_unlock(net); - return 0; + tipc_bcbase_xmit(net, &xmitq); + + /* Any socket wakeup messages ? 
*/ + if (!skb_queue_empty(inputq)) + tipc_sk_rcv(net, inputq); } -/** - * tipc_bcbearer_sort - create sets of bearer pairs used by broadcast bearer +/* tipc_bcast_synch_rcv - check and update rcv link with peer's send state + * + * RCU is locked, no other locks set */ -void tipc_bcbearer_sort(struct net *net, struct tipc_node_map *nm_ptr, - u32 node, bool action) +void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, + struct tipc_msg *hdr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_bcbearer *bcbearer = tn->bcbearer; - struct tipc_bcbearer_pair *bp_temp = bcbearer->bpairs_temp; - struct tipc_bcbearer_pair *bp_curr; - struct tipc_bearer *b; - int b_index; - int pri; - - tipc_bclink_lock(net); + struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; + struct sk_buff_head xmitq; - if (action) - tipc_nmap_add(nm_ptr, node); - else - tipc_nmap_remove(nm_ptr, node); + __skb_queue_head_init(&xmitq); - /* Group bearers by priority (can assume max of two per priority) */ - memset(bp_temp, 0, sizeof(bcbearer->bpairs_temp)); + tipc_bcast_lock(net); + if (msg_type(hdr) == STATE_MSG) { + tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq); + tipc_link_bc_sync_rcv(l, hdr, &xmitq); + } else { + tipc_link_bc_init_rcv(l, hdr); + } + tipc_bcast_unlock(net); - rcu_read_lock(); - for (b_index = 0; b_index < MAX_BEARERS; b_index++) { - b = rcu_dereference_rtnl(tn->bearer_list[b_index]); - if (!b || !b->nodes.count) - continue; + tipc_bcbase_xmit(net, &xmitq); - if (!bp_temp[b->priority].primary) - bp_temp[b->priority].primary = b; - else - bp_temp[b->priority].secondary = b; - } - rcu_read_unlock(); + /* Any socket wakeup messages ? 
*/ + if (!skb_queue_empty(inputq)) + tipc_sk_rcv(net, inputq); +} - /* Create array of bearer pairs for broadcasting */ - bp_curr = bcbearer->bpairs; - memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs)); +/* tipc_bcast_add_peer - add a peer node to broadcast link and bearer + * + * RCU is locked, node lock is set + */ +void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l, + struct sk_buff_head *xmitq) +{ + struct tipc_link *snd_l = tipc_bc_sndlink(net); - for (pri = TIPC_MAX_LINK_PRI; pri >= 0; pri--) { + tipc_bcast_lock(net); + tipc_link_add_bc_peer(snd_l, uc_l, xmitq); + tipc_bcbase_select_primary(net); + tipc_bcast_unlock(net); +} - if (!bp_temp[pri].primary) - continue; +/* tipc_bcast_remove_peer - remove a peer node from broadcast link and bearer + * + * RCU is locked, node lock is set + */ +void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) +{ + struct tipc_link *snd_l = tipc_bc_sndlink(net); + struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; + struct sk_buff_head xmitq; - bp_curr->primary = bp_temp[pri].primary; + __skb_queue_head_init(&xmitq); - if (bp_temp[pri].secondary) { - if (tipc_nmap_equal(&bp_temp[pri].primary->nodes, - &bp_temp[pri].secondary->nodes)) { - bp_curr->secondary = bp_temp[pri].secondary; - } else { - bp_curr++; - bp_curr->primary = bp_temp[pri].secondary; - } - } + tipc_bcast_lock(net); + tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq); + tipc_bcbase_select_primary(net); + tipc_bcast_unlock(net); - bp_curr++; - } + tipc_bcbase_xmit(net, &xmitq); - tipc_bclink_unlock(net); + /* Any socket wakeup messages ? 
*/ + if (!skb_queue_empty(inputq)) + tipc_sk_rcv(net, inputq); } static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb, @@ -784,12 +395,14 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) if (!bcl) return 0; - tipc_bclink_lock(net); + tipc_bcast_lock(net); hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_LINK_GET); - if (!hdr) + if (!hdr) { + tipc_bcast_unlock(net); return -EMSGSIZE; + } attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); if (!attrs) @@ -803,9 +416,9 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->next_in_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->next_out_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt)) goto attr_msg_full; prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); @@ -819,7 +432,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) if (err) goto attr_msg_full; - tipc_bclink_unlock(net); + tipc_bcast_unlock(net); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); @@ -830,7 +443,7 @@ prop_msg_full: attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: - tipc_bclink_unlock(net); + tipc_bcast_unlock(net); genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; @@ -844,143 +457,91 @@ int tipc_bclink_reset_stats(struct net *net) if (!bcl) return -ENOPROTOOPT; - tipc_bclink_lock(net); + tipc_bcast_lock(net); memset(&bcl->stats, 0, sizeof(bcl->stats)); - tipc_bclink_unlock(net); + tipc_bcast_unlock(net); return 0; } -int tipc_bclink_set_queue_limits(struct net *net, u32 limit) +static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *bcl = tn->bcl; + struct tipc_link *l = tipc_bc_sndlink(net); 
- if (!bcl) + if (!l) return -ENOPROTOOPT; - if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN)) + if (limit < BCLINK_WIN_MIN) + limit = BCLINK_WIN_MIN; + if (limit > TIPC_MAX_LINK_WIN) return -EINVAL; - - tipc_bclink_lock(net); - tipc_link_set_queue_limits(bcl, limit); - tipc_bclink_unlock(net); + tipc_bcast_lock(net); + tipc_link_set_queue_limits(l, limit); + tipc_bcast_unlock(net); return 0; } -int tipc_bclink_init(struct net *net) +int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_bcbearer *bcbearer; - struct tipc_bclink *bclink; - struct tipc_link *bcl; - - bcbearer = kzalloc(sizeof(*bcbearer), GFP_ATOMIC); - if (!bcbearer) - return -ENOMEM; - - bclink = kzalloc(sizeof(*bclink), GFP_ATOMIC); - if (!bclink) { - kfree(bcbearer); - return -ENOMEM; - } + int err; + u32 win; + struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; - bcl = &bclink->link; - bcbearer->bearer.media = &bcbearer->media; - bcbearer->media.send_msg = tipc_bcbearer_send; - sprintf(bcbearer->media.name, "tipc-broadcast"); - - spin_lock_init(&bclink->lock); - __skb_queue_head_init(&bcl->transmq); - __skb_queue_head_init(&bcl->backlogq); - __skb_queue_head_init(&bcl->deferdq); - skb_queue_head_init(&bcl->wakeupq); - bcl->next_out_no = 1; - spin_lock_init(&bclink->node.lock); - __skb_queue_head_init(&bclink->arrvq); - skb_queue_head_init(&bclink->inputq); - bcl->owner = &bclink->node; - bcl->owner->net = net; - bcl->mtu = MAX_PKT_DEFAULT_MCAST; - tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); - bcl->bearer_id = MAX_BEARERS; - rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer); - bcl->state = WORKING_WORKING; - bcl->pmsg = (struct tipc_msg *)&bcl->proto_msg; - msg_set_prevnode(bcl->pmsg, tn->own_addr); - strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME); - tn->bcbearer = bcbearer; - tn->bclink = bclink; - tn->bcl = bcl; - return 0; -} + if (!attrs[TIPC_NLA_LINK_PROP]) + return 
-EINVAL; -void tipc_bclink_stop(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); + if (err) + return err; - tipc_bclink_lock(net); - tipc_link_purge_queues(tn->bcl); - tipc_bclink_unlock(net); + if (!props[TIPC_NLA_PROP_WIN]) + return -EOPNOTSUPP; - RCU_INIT_POINTER(tn->bearer_list[BCBEARER], NULL); - synchronize_net(); - kfree(tn->bcbearer); - kfree(tn->bclink); + win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + + return tipc_bc_link_set_queue_limits(net, win); } -/** - * tipc_nmap_add - add a node to a node map - */ -static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node) +int tipc_bcast_init(struct net *net) { - int n = tipc_node(node); - int w = n / WSIZE; - u32 mask = (1 << (n % WSIZE)); + struct tipc_net *tn = tipc_net(net); + struct tipc_bc_base *bb = NULL; + struct tipc_link *l = NULL; - if ((nm_ptr->map[w] & mask) == 0) { - nm_ptr->count++; - nm_ptr->map[w] |= mask; - } + bb = kzalloc(sizeof(*bb), GFP_ATOMIC); + if (!bb) + goto enomem; + tn->bcbase = bb; + spin_lock_init(&tipc_net(net)->bclock); + + if (!tipc_link_bc_create(net, 0, 0, + U16_MAX, + BCLINK_WIN_DEFAULT, + 0, + &bb->inputq, + NULL, + NULL, + &l)) + goto enomem; + bb->link = l; + tn->bcl = l; + return 0; +enomem: + kfree(bb); + kfree(l); + return -ENOMEM; } -/** - * tipc_nmap_remove - remove a node from a node map - */ -static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node) +void tipc_bcast_reinit(struct net *net) { - int n = tipc_node(node); - int w = n / WSIZE; - u32 mask = (1 << (n % WSIZE)); + struct tipc_bc_base *b = tipc_bc_base(net); - if ((nm_ptr->map[w] & mask) != 0) { - nm_ptr->map[w] &= ~mask; - nm_ptr->count--; - } + msg_set_prevnode(b->link->pmsg, tipc_own_addr(net)); } -/** - * tipc_nmap_diff - find differences between node maps - * @nm_a: input node map A - * @nm_b: input node map B - * @nm_diff: output node map A-B (i.e. 
nodes of A that are not in B) - */ -static void tipc_nmap_diff(struct tipc_node_map *nm_a, - struct tipc_node_map *nm_b, - struct tipc_node_map *nm_diff) +void tipc_bcast_stop(struct net *net) { - int stop = ARRAY_SIZE(nm_a->map); - int w; - int b; - u32 map; - - memset(nm_diff, 0, sizeof(*nm_diff)); - for (w = 0; w < stop; w++) { - map = nm_a->map[w] ^ (nm_a->map[w] & nm_b->map[w]); - nm_diff->map[w] = map; - if (map != 0) { - for (b = 0 ; b < WSIZE; b++) { - if (map & (1 << b)) - nm_diff->count++; - } - } - } + struct tipc_net *tn = net_generic(net, tipc_net_id); + + synchronize_net(); + kfree(tn->bcbase); + kfree(tn->bcl); } diff --git a/kernel/net/tipc/bcast.h b/kernel/net/tipc/bcast.h index 4bdc12277..2855b9356 100644 --- a/kernel/net/tipc/bcast.h +++ b/kernel/net/tipc/bcast.h @@ -37,100 +37,44 @@ #ifndef _TIPC_BCAST_H #define _TIPC_BCAST_H -#include -#include "link.h" -#include "node.h" +#include "core.h" -/** - * struct tipc_bcbearer_pair - a pair of bearers used by broadcast link - * @primary: pointer to primary bearer - * @secondary: pointer to secondary bearer - * - * Bearers must have same priority and same set of reachable destinations - * to be paired. - */ - -struct tipc_bcbearer_pair { - struct tipc_bearer *primary; - struct tipc_bearer *secondary; -}; - -#define BCBEARER MAX_BEARERS - -/** - * struct tipc_bcbearer - bearer used by broadcast link - * @bearer: (non-standard) broadcast bearer structure - * @media: (non-standard) broadcast media structure - * @bpairs: array of bearer pairs - * @bpairs_temp: temporary array of bearer pairs used by tipc_bcbearer_sort() - * @remains: temporary node map used by tipc_bcbearer_send() - * @remains_new: temporary node map used tipc_bcbearer_send() - * - * Note: The fields labelled "temporary" are incorporated into the bearer - * to avoid consuming potentially limited stack space through the use of - * large local variables within multicast routines. 
Concurrent access is - * prevented through use of the spinlock "bclink_lock". - */ -struct tipc_bcbearer { - struct tipc_bearer bearer; - struct tipc_media media; - struct tipc_bcbearer_pair bpairs[MAX_BEARERS]; - struct tipc_bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1]; - struct tipc_node_map remains; - struct tipc_node_map remains_new; -}; +struct tipc_node; +struct tipc_msg; +struct tipc_nl_msg; +struct tipc_node_map; -/** - * struct tipc_bclink - link used for broadcast messages - * @lock: spinlock governing access to structure - * @link: (non-standard) broadcast link structure - * @node: (non-standard) node structure representing b'cast link's peer node - * @bcast_nodes: map of broadcast-capable nodes - * @retransmit_to: node that most recently requested a retransmit - * - * Handles sequence numbering, fragmentation, bundling, etc. - */ -struct tipc_bclink { - spinlock_t lock; - struct tipc_link link; - struct tipc_node node; - struct sk_buff_head arrvq; - struct sk_buff_head inputq; - struct tipc_node_map bcast_nodes; - struct tipc_node *retransmit_to; -}; +int tipc_bcast_init(struct net *net); +void tipc_bcast_reinit(struct net *net); +void tipc_bcast_stop(struct net *net); +void tipc_bcast_add_peer(struct net *net, struct tipc_link *l, + struct sk_buff_head *xmitq); +void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); +void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); +void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); +int tipc_bcast_get_mtu(struct net *net); +int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list); +int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); +void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked); +void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, + struct tipc_msg *hdr); +int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); +int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); 
+int tipc_bclink_reset_stats(struct net *net); -struct tipc_node; -extern const char tipc_bclink_name[]; +static inline void tipc_bcast_lock(struct net *net) +{ + spin_lock_bh(&tipc_net(net)->bclock); +} -/** - * tipc_nmap_equal - test for equality of node maps - */ -static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, - struct tipc_node_map *nm_b) +static inline void tipc_bcast_unlock(struct net *net) { - return !memcmp(nm_a, nm_b, sizeof(*nm_a)); + spin_unlock_bh(&tipc_net(net)->bclock); } -int tipc_bclink_init(struct net *net); -void tipc_bclink_stop(struct net *net); -void tipc_bclink_add_node(struct net *net, u32 addr); -void tipc_bclink_remove_node(struct net *net, u32 addr); -struct tipc_node *tipc_bclink_retransmit_to(struct net *tn); -void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked); -void tipc_bclink_rcv(struct net *net, struct sk_buff *buf); -u32 tipc_bclink_get_last_sent(struct net *net); -u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr); -void tipc_bclink_update_link_state(struct tipc_node *node, - u32 last_sent); -int tipc_bclink_reset_stats(struct net *net); -int tipc_bclink_set_queue_limits(struct net *net, u32 limit); -void tipc_bcbearer_sort(struct net *net, struct tipc_node_map *nm_ptr, - u32 node, bool action); -uint tipc_bclink_get_mtu(void); -int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list); -void tipc_bclink_wakeup_users(struct net *net); -int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); -void tipc_bclink_input(struct net *net); +static inline struct tipc_link *tipc_bc_sndlink(struct net *net) +{ + return tipc_net(net)->bcl; +} #endif diff --git a/kernel/net/tipc/bearer.c b/kernel/net/tipc/bearer.c index 70e3dacbf..648f2a67f 100644 --- a/kernel/net/tipc/bearer.c +++ b/kernel/net/tipc/bearer.c @@ -71,8 +71,7 @@ static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } }; -static void bearer_disable(struct net 
*net, struct tipc_bearer *b_ptr, - bool shutting_down); +static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr); /** * tipc_media_find - locates specified media object by name @@ -194,10 +193,8 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) rcu_read_lock(); b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (b_ptr) { - tipc_bcbearer_sort(net, &b_ptr->nodes, dest, true); + if (b_ptr) tipc_disc_add_dest(b_ptr->link_req); - } rcu_read_unlock(); } @@ -208,10 +205,8 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) rcu_read_lock(); b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (b_ptr) { - tipc_bcbearer_sort(net, &b_ptr->nodes, dest, false); + if (b_ptr) tipc_disc_remove_dest(b_ptr->link_req); - } rcu_read_unlock(); } @@ -324,7 +319,7 @@ restart: res = tipc_disc_create(net, b_ptr, &b_ptr->bcast_addr); if (res) { - bearer_disable(net, b_ptr, false); + bearer_disable(net, b_ptr); pr_warn("Bearer <%s> rejected, discovery object creation failed\n", name); return -EINVAL; @@ -344,7 +339,7 @@ restart: static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr) { pr_info("Resetting bearer <%s>\n", b_ptr->name); - tipc_link_reset_list(net, b_ptr->identity); + tipc_node_delete_links(net, b_ptr->identity); tipc_disc_reset(net, b_ptr); return 0; } @@ -354,8 +349,7 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr) * * Note: This routine assumes caller holds RTNL lock. 
*/ -static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, - bool shutting_down) +static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr) { struct tipc_net *tn = net_generic(net, tipc_net_id); u32 i; @@ -363,7 +357,8 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, pr_info("Disabling bearer <%s>\n", b_ptr->name); b_ptr->media->disable_media(b_ptr); - tipc_link_delete_list(net, b_ptr->identity, shutting_down); + tipc_node_delete_links(net, b_ptr->identity); + RCU_INIT_POINTER(b_ptr->media_ptr, NULL); if (b_ptr->link_req) tipc_disc_delete(b_ptr->link_req); @@ -401,16 +396,13 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, /* tipc_disable_l2_media - detach TIPC bearer from an L2 interface * - * Mark L2 bearer as inactive so that incoming buffers are thrown away, - * then get worker thread to complete bearer cleanup. (Can't do cleanup - * here because cleanup code needs to sleep and caller holds spinlocks.) + * Mark L2 bearer as inactive so that incoming buffers are thrown away */ void tipc_disable_l2_media(struct tipc_bearer *b) { struct net_device *dev; dev = (struct net_device *)rtnl_dereference(b->media_ptr); - RCU_INIT_POINTER(b->media_ptr, NULL); RCU_INIT_POINTER(dev->tipc_ptr, NULL); synchronize_net(); dev_put(dev); @@ -422,10 +414,9 @@ void tipc_disable_l2_media(struct tipc_bearer *b) * @b_ptr: the bearer through which the packet is to be sent * @dest: peer destination address */ -int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, +int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dest) { - struct sk_buff *clone; struct net_device *dev; int delta; @@ -433,42 +424,97 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, if (!dev) return 0; - clone = skb_clone(buf, GFP_ATOMIC); - if (!clone) - return 0; - - delta = dev->hard_header_len - skb_headroom(buf); + delta = dev->hard_header_len - skb_headroom(skb); if ((delta 
> 0) && - pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { - kfree_skb(clone); + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { + kfree_skb(skb); return 0; } - skb_reset_network_header(clone); - clone->dev = dev; - clone->protocol = htons(ETH_P_TIPC); - dev_hard_header(clone, dev, ETH_P_TIPC, dest->value, - dev->dev_addr, clone->len); - dev_queue_xmit(clone); + skb_reset_network_header(skb); + skb->dev = dev; + skb->protocol = htons(ETH_P_TIPC); + dev_hard_header(skb, dev, ETH_P_TIPC, dest->value, + dev->dev_addr, skb->len); + dev_queue_xmit(skb); return 0; } -/* tipc_bearer_send- sends buffer to destination over bearer - * - * IMPORTANT: - * The media send routine must not alter the buffer being passed in - * as it may be needed for later retransmission! +int tipc_bearer_mtu(struct net *net, u32 bearer_id) +{ + int mtu = 0; + struct tipc_bearer *b; + + rcu_read_lock(); + b = rcu_dereference_rtnl(tipc_net(net)->bearer_list[bearer_id]); + if (b) + mtu = b->mtu; + rcu_read_unlock(); + return mtu; +} + +/* tipc_bearer_xmit_skb - sends buffer to destination over bearer + */ +void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, + struct sk_buff *skb, + struct tipc_media_addr *dest) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_bearer *b; + + rcu_read_lock(); + b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + if (likely(b)) + b->media->send_msg(net, skb, b, dest); + rcu_read_unlock(); +} + +/* tipc_bearer_xmit() -send buffer to destination over bearer */ -void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf, - struct tipc_media_addr *dest) +void tipc_bearer_xmit(struct net *net, u32 bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr *dst) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_bearer *b_ptr; + struct tipc_bearer *b; + struct sk_buff *skb, *tmp; + + if (skb_queue_empty(xmitq)) + return; rcu_read_lock(); - b_ptr = 
rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (likely(b_ptr)) - b_ptr->media->send_msg(net, buf, b_ptr, dest); + b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + if (likely(b)) { + skb_queue_walk_safe(xmitq, skb, tmp) { + __skb_dequeue(xmitq); + b->media->send_msg(net, skb, b, dst); + } + } + rcu_read_unlock(); +} + +/* tipc_bearer_bc_xmit() - broadcast buffers to all destinations + */ +void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, + struct sk_buff_head *xmitq) +{ + struct tipc_net *tn = tipc_net(net); + int net_id = tn->net_id; + struct tipc_bearer *b; + struct sk_buff *skb, *tmp; + struct tipc_msg *hdr; + + rcu_read_lock(); + b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + if (likely(b)) { + skb_queue_walk_safe(xmitq, skb, tmp) { + hdr = buf_msg(skb); + msg_set_non_seq(hdr, 1); + msg_set_mc_netid(hdr, net_id); + __skb_dequeue(xmitq); + b->media->send_msg(net, skb, b, &b->bcast_addr); + } + } rcu_read_unlock(); } @@ -530,7 +576,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, case NETDEV_CHANGE: if (netif_carrier_ok(dev)) break; - case NETDEV_DOWN: + case NETDEV_GOING_DOWN: case NETDEV_CHANGEMTU: tipc_reset_bearer(net, b_ptr); break; @@ -541,7 +587,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - bearer_disable(dev_net(dev), b_ptr, false); + bearer_disable(dev_net(dev), b_ptr); break; } return NOTIFY_OK; @@ -583,7 +629,7 @@ void tipc_bearer_stop(struct net *net) for (i = 0; i < MAX_BEARERS; i++) { b_ptr = rtnl_dereference(tn->bearer_list[i]); if (b_ptr) { - bearer_disable(net, b_ptr, true); + bearer_disable(net, b_ptr); tn->bearer_list[i] = NULL; } } @@ -747,7 +793,7 @@ int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - bearer_disable(net, bearer, false); + bearer_disable(net, bearer); rtnl_unlock(); return 0; @@ -812,7 +858,7 @@ int tipc_nl_bearer_set(struct sk_buff 
*skb, struct genl_info *info) char *name; struct tipc_bearer *b; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; - struct net *net = genl_info_net(info); + struct net *net = sock_net(skb->sk); if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; diff --git a/kernel/net/tipc/bearer.h b/kernel/net/tipc/bearer.h index 5cad243ee..552185bc4 100644 --- a/kernel/net/tipc/bearer.h +++ b/kernel/net/tipc/bearer.h @@ -38,9 +38,9 @@ #define _TIPC_BEARER_H #include "netlink.h" +#include "core.h" #include -#define MAX_BEARERS 2 #define MAX_MEDIA 3 #define MAX_NODES 4096 #define WSIZE 32 @@ -163,6 +163,7 @@ struct tipc_bearer { u32 identity; struct tipc_link_req *link_req; char net_plane; + int node_cnt; struct tipc_node_map nodes; }; @@ -215,7 +216,14 @@ struct tipc_media *tipc_media_find(const char *name); int tipc_bearer_setup(void); void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); -void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf, - struct tipc_media_addr *dest); +int tipc_bearer_mtu(struct net *net, u32 bearer_id); +void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, + struct sk_buff *skb, + struct tipc_media_addr *dest); +void tipc_bearer_xmit(struct net *net, u32 bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr *dst); +void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, + struct sk_buff_head *xmitq); #endif /* _TIPC_BEARER_H */ diff --git a/kernel/net/tipc/core.c b/kernel/net/tipc/core.c index be1c9fa60..03a842870 100644 --- a/kernel/net/tipc/core.c +++ b/kernel/net/tipc/core.c @@ -42,6 +42,7 @@ #include "bearer.h" #include "net.h" #include "socket.h" +#include "bcast.h" #include @@ -68,11 +69,18 @@ static int __net_init tipc_init_net(struct net *net) if (err) goto out_nametbl; - err = tipc_subscr_start(net); + err = tipc_topsrv_start(net); if (err) goto out_subscr; + + err = tipc_bcast_init(net); + if (err) + goto out_bclink; + return 0; +out_bclink: + tipc_bcast_stop(net); out_subscr: 
tipc_nametbl_stop(net); out_nametbl: @@ -83,8 +91,9 @@ out_sk_rht: static void __net_exit tipc_exit_net(struct net *net) { - tipc_subscr_stop(net); + tipc_topsrv_stop(net); tipc_net_stop(net); + tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); } diff --git a/kernel/net/tipc/core.h b/kernel/net/tipc/core.h index 3dc68c7a9..18e95a802 100644 --- a/kernel/net/tipc/core.h +++ b/kernel/net/tipc/core.h @@ -60,16 +60,18 @@ #include #include -#include "node.h" -#include "bearer.h" -#include "bcast.h" -#include "netlink.h" -#include "link.h" -#include "node.h" -#include "msg.h" +struct tipc_node; +struct tipc_bearer; +struct tipc_bc_base; +struct tipc_link; +struct tipc_name_table; +struct tipc_server; #define TIPC_MOD_VER "2.0.0" +#define NODE_HTABLE_SIZE 512 +#define MAX_BEARERS 3 + extern int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; @@ -90,8 +92,8 @@ struct tipc_net { struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1]; /* Broadcast link */ - struct tipc_bcbearer *bcbearer; - struct tipc_bclink *bclink; + spinlock_t bclock; + struct tipc_bc_base *bcbase; struct tipc_link *bcl; /* Socket hash table */ @@ -106,6 +108,41 @@ struct tipc_net { atomic_t subscription_count; }; +static inline struct tipc_net *tipc_net(struct net *net) +{ + return net_generic(net, tipc_net_id); +} + +static inline int tipc_netid(struct net *net) +{ + return tipc_net(net)->net_id; +} + +static inline u16 mod(u16 x) +{ + return x & 0xffffu; +} + +static inline int less_eq(u16 left, u16 right) +{ + return mod(right - left) < 32768u; +} + +static inline int more(u16 left, u16 right) +{ + return !less_eq(left, right); +} + +static inline int less(u16 left, u16 right) +{ + return less_eq(left, right) && (mod(right) != mod(left)); +} + +static inline int in_range(u16 val, u16 min, u16 max) +{ + return !less(val, min) && !more(val, max); +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); 
void tipc_unregister_sysctl(void); diff --git a/kernel/net/tipc/discover.c b/kernel/net/tipc/discover.c index 967e292f5..afe8c47c4 100644 --- a/kernel/net/tipc/discover.c +++ b/kernel/net/tipc/discover.c @@ -35,7 +35,7 @@ */ #include "core.h" -#include "link.h" +#include "node.h" #include "discover.h" /* min delay during bearer start up */ @@ -89,7 +89,7 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type, MAX_H_SIZE, dest_domain); msg_set_non_seq(msg, 1); msg_set_node_sig(msg, tn->random); - msg_set_node_capabilities(msg, 0); + msg_set_node_capabilities(msg, TIPC_NODE_CAPABILITIES); msg_set_dest_domain(msg, dest_domain); msg_set_bc_netid(msg, tn->net_id); b_ptr->media->addr2msg(msg_media_addr(msg), &b_ptr->addr); @@ -120,30 +120,24 @@ static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr, * @buf: buffer containing message * @bearer: bearer that message arrived on */ -void tipc_disc_rcv(struct net *net, struct sk_buff *buf, +void tipc_disc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *bearer) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_node *node; - struct tipc_link *link; struct tipc_media_addr maddr; - struct sk_buff *rbuf; - struct tipc_msg *msg = buf_msg(buf); - u32 ddom = msg_dest_domain(msg); - u32 onode = msg_prevnode(msg); - u32 net_id = msg_bc_netid(msg); - u32 mtyp = msg_type(msg); - u32 signature = msg_node_sig(msg); - u16 caps = msg_node_capabilities(msg); - bool addr_match = false; - bool sign_match = false; - bool link_up = false; - bool accept_addr = false; - bool accept_sign = false; + struct sk_buff *rskb; + struct tipc_msg *hdr = buf_msg(skb); + u32 ddom = msg_dest_domain(hdr); + u32 onode = msg_prevnode(hdr); + u32 net_id = msg_bc_netid(hdr); + u32 mtyp = msg_type(hdr); + u32 signature = msg_node_sig(hdr); + u16 caps = msg_node_capabilities(hdr); bool respond = false; + bool dupl_addr = false; - bearer->media->msg2addr(bearer, &maddr, msg_media_addr(msg)); - 
kfree_skb(buf); + bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr)); + kfree_skb(skb); /* Ensure message from node is valid and communication is permitted */ if (net_id != tn->net_id) @@ -165,102 +159,19 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, if (!tipc_in_scope(bearer->domain, onode)) return; - node = tipc_node_create(net, onode); - if (!node) - return; - tipc_node_lock(node); - node->capabilities = caps; - link = node->links[bearer->identity]; - - /* Prepare to validate requesting node's signature and media address */ - sign_match = (signature == node->signature); - addr_match = link && !memcmp(&link->media_addr, &maddr, sizeof(maddr)); - link_up = link && tipc_link_is_up(link); - - - /* These three flags give us eight permutations: */ - - if (sign_match && addr_match && link_up) { - /* All is fine. Do nothing. */ - } else if (sign_match && addr_match && !link_up) { - /* Respond. The link will come up in due time */ - respond = true; - } else if (sign_match && !addr_match && link_up) { - /* Peer has changed i/f address without rebooting. - * If so, the link will reset soon, and the next - * discovery will be accepted. So we can ignore it. - * It may also be an cloned or malicious peer having - * chosen the same node address and signature as an - * existing one. - * Ignore requests until the link goes down, if ever. - */ - disc_dupl_alert(bearer, onode, &maddr); - } else if (sign_match && !addr_match && !link_up) { - /* Peer link has changed i/f address without rebooting. - * It may also be a cloned or malicious peer; we can't - * distinguish between the two. - * The signature is correct, so we must accept. - */ - accept_addr = true; - respond = true; - } else if (!sign_match && addr_match && link_up) { - /* Peer node rebooted. Two possibilities: - * - Delayed re-discovery; this link endpoint has already - * reset and re-established contact with the peer, before - * receiving a discovery message from that node. 
- * (The peer happened to receive one from this node first). - * - The peer came back so fast that our side has not - * discovered it yet. Probing from this side will soon - * reset the link, since there can be no working link - * endpoint at the peer end, and the link will re-establish. - * Accept the signature, since it comes from a known peer. - */ - accept_sign = true; - } else if (!sign_match && addr_match && !link_up) { - /* The peer node has rebooted. - * Accept signature, since it is a known peer. - */ - accept_sign = true; - respond = true; - } else if (!sign_match && !addr_match && link_up) { - /* Peer rebooted with new address, or a new/duplicate peer. - * Ignore until the link goes down, if ever. - */ + tipc_node_check_dest(net, onode, bearer, caps, signature, + &maddr, &respond, &dupl_addr); + if (dupl_addr) disc_dupl_alert(bearer, onode, &maddr); - } else if (!sign_match && !addr_match && !link_up) { - /* Peer rebooted with new address, or it is a new peer. - * Accept signature and address. 
- */ - accept_sign = true; - accept_addr = true; - respond = true; - } - - if (accept_sign) - node->signature = signature; - - if (accept_addr) { - if (!link) - link = tipc_link_create(node, bearer, &maddr); - if (link) { - memcpy(&link->media_addr, &maddr, sizeof(maddr)); - tipc_link_reset(link); - } else { - respond = false; - } - } /* Send response, if necessary */ if (respond && (mtyp == DSC_REQ_MSG)) { - rbuf = tipc_buf_acquire(MAX_H_SIZE); - if (rbuf) { - tipc_disc_init_msg(net, rbuf, DSC_RESP_MSG, bearer); - tipc_bearer_send(net, bearer->identity, rbuf, &maddr); - kfree_skb(rbuf); - } + rskb = tipc_buf_acquire(MAX_H_SIZE); + if (!rskb) + return; + tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer); + tipc_bearer_xmit_skb(net, bearer->identity, rskb, &maddr); } - tipc_node_unlock(node); - tipc_node_put(node); } /** @@ -313,6 +224,7 @@ void tipc_disc_remove_dest(struct tipc_link_req *req) static void disc_timeout(unsigned long data) { struct tipc_link_req *req = (struct tipc_link_req *)data; + struct sk_buff *skb; int max_delay; spin_lock_bh(&req->lock); @@ -330,9 +242,9 @@ static void disc_timeout(unsigned long data) * hold at fast polling rate if don't have any associated nodes, * otherwise hold at slow polling rate */ - tipc_bearer_send(req->net, req->bearer_id, req->buf, &req->dest); - - + skb = skb_clone(req->buf, GFP_ATOMIC); + if (skb) + tipc_bearer_xmit_skb(req->net, req->bearer_id, skb, &req->dest); req->timer_intv *= 2; if (req->num_nodes) max_delay = TIPC_LINK_REQ_SLOW; @@ -359,6 +271,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr, struct tipc_media_addr *dest) { struct tipc_link_req *req; + struct sk_buff *skb; req = kmalloc(sizeof(*req), GFP_ATOMIC); if (!req) @@ -380,7 +293,9 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr, setup_timer(&req->timer, disc_timeout, (unsigned long)req); mod_timer(&req->timer, jiffies + req->timer_intv); b_ptr->link_req = req; - tipc_bearer_send(net, req->bearer_id, req->buf, 
&req->dest); + skb = skb_clone(req->buf, GFP_ATOMIC); + if (skb) + tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest); return 0; } @@ -404,6 +319,7 @@ void tipc_disc_delete(struct tipc_link_req *req) void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr) { struct tipc_link_req *req = b_ptr->link_req; + struct sk_buff *skb; spin_lock_bh(&req->lock); tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b_ptr); @@ -413,6 +329,8 @@ void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr) req->num_nodes = 0; req->timer_intv = TIPC_LINK_REQ_INIT; mod_timer(&req->timer, jiffies + req->timer_intv); - tipc_bearer_send(net, req->bearer_id, req->buf, &req->dest); + skb = skb_clone(req->buf, GFP_ATOMIC); + if (skb) + tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest); spin_unlock_bh(&req->lock); } diff --git a/kernel/net/tipc/link.c b/kernel/net/tipc/link.c index 43a515dc9..91aea071a 100644 --- a/kernel/net/tipc/link.c +++ b/kernel/net/tipc/link.c @@ -48,9 +48,9 @@ /* * Error message prefixes */ -static const char *link_co_err = "Link changeover error, "; +static const char *link_co_err = "Link tunneling error, "; static const char *link_rst_msg = "Resetting link "; -static const char *link_unk_evt = "Unknown link event "; +static const char tipc_bclink_name[] = "broadcast-link"; static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, @@ -76,257 +76,530 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } }; +/* Send states for broadcast NACKs + */ +enum { + BC_NACK_SND_CONDITIONAL, + BC_NACK_SND_UNCONDITIONAL, + BC_NACK_SND_SUPPRESS, +}; + +/* + * Interval between NACKs when packets arrive out of order + */ +#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) /* * Out-of-range value for link session numbers */ -#define INVALID_SESSION 0x10000 +#define WILDCARD_SESSION 0x10000 -/* - * Link state events: +/* Link FSM 
states: */ -#define STARTING_EVT 856384768 /* link processing trigger */ -#define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */ -#define TIMEOUT_EVT 560817u /* link timer expired */ +enum { + LINK_ESTABLISHED = 0xe, + LINK_ESTABLISHING = 0xe << 4, + LINK_RESET = 0x1 << 8, + LINK_RESETTING = 0x2 << 12, + LINK_PEER_RESET = 0xd << 16, + LINK_FAILINGOVER = 0xf << 20, + LINK_SYNCHING = 0xc << 24 +}; -/* - * State value stored in 'failover_pkts' +/* Link FSM state checking routines */ -#define FIRST_FAILOVER 0xffffu - -static void link_handle_out_of_seq_msg(struct tipc_link *link, - struct sk_buff *skb); -static void tipc_link_proto_rcv(struct tipc_link *link, - struct sk_buff *skb); -static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol); -static void link_state_event(struct tipc_link *l_ptr, u32 event); +static int link_is_up(struct tipc_link *l) +{ + return l->state & (LINK_ESTABLISHED | LINK_SYNCHING); +} + +static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq); +static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, + u16 rcvgap, int tolerance, int priority, + struct sk_buff_head *xmitq); static void link_reset_statistics(struct tipc_link *l_ptr); static void link_print(struct tipc_link *l_ptr, const char *str); -static void tipc_link_sync_xmit(struct tipc_link *l); -static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); -static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); -static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); -static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); +static void tipc_link_build_nack_msg(struct tipc_link *l, + struct sk_buff_head *xmitq); +static void tipc_link_build_bc_init_msg(struct tipc_link *l, + struct sk_buff_head *xmitq); +static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); + /* - * Simple link routines + * Simple non-static link routines (i.e. 
referenced outside this file) */ -static unsigned int align(unsigned int i) +bool tipc_link_is_up(struct tipc_link *l) { - return (i + 3) & ~3u; + return link_is_up(l); } -static void tipc_link_release(struct kref *kref) +bool tipc_link_peer_is_down(struct tipc_link *l) { - kfree(container_of(kref, struct tipc_link, ref)); + return l->state == LINK_PEER_RESET; } -static void tipc_link_get(struct tipc_link *l_ptr) +bool tipc_link_is_reset(struct tipc_link *l) { - kref_get(&l_ptr->ref); + return l->state & (LINK_RESET | LINK_FAILINGOVER | LINK_ESTABLISHING); } -static void tipc_link_put(struct tipc_link *l_ptr) +bool tipc_link_is_establishing(struct tipc_link *l) { - kref_put(&l_ptr->ref, tipc_link_release); + return l->state == LINK_ESTABLISHING; } -static struct tipc_link *tipc_parallel_link(struct tipc_link *l) +bool tipc_link_is_synching(struct tipc_link *l) { - if (l->owner->active_links[0] != l) - return l->owner->active_links[0]; - return l->owner->active_links[1]; + return l->state == LINK_SYNCHING; } -/* - * Simple non-static link routines (i.e. 
referenced outside this file) - */ -int tipc_link_is_up(struct tipc_link *l_ptr) +bool tipc_link_is_failingover(struct tipc_link *l) { - if (!l_ptr) - return 0; - return link_working_working(l_ptr) || link_working_unknown(l_ptr); + return l->state == LINK_FAILINGOVER; } -int tipc_link_is_active(struct tipc_link *l_ptr) +bool tipc_link_is_blocked(struct tipc_link *l) { - return (l_ptr->owner->active_links[0] == l_ptr) || - (l_ptr->owner->active_links[1] == l_ptr); + return l->state & (LINK_RESETTING | LINK_PEER_RESET | LINK_FAILINGOVER); } -/** - * link_timeout - handle expiration of link timer - * @l_ptr: pointer to link - */ -static void link_timeout(unsigned long data) +static bool link_is_bc_sndlink(struct tipc_link *l) { - struct tipc_link *l_ptr = (struct tipc_link *)data; - struct sk_buff *skb; + return !l->bc_sndlink; +} + +static bool link_is_bc_rcvlink(struct tipc_link *l) +{ + return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l)); +} + +int tipc_link_is_active(struct tipc_link *l) +{ + return l->active; +} - tipc_node_lock(l_ptr->owner); +void tipc_link_set_active(struct tipc_link *l, bool active) +{ + l->active = active; +} - /* update counters used in statistical profiling of send traffic */ - l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->transmq); - l_ptr->stats.queue_sz_counts++; +void tipc_link_add_bc_peer(struct tipc_link *snd_l, + struct tipc_link *uc_l, + struct sk_buff_head *xmitq) +{ + struct tipc_link *rcv_l = uc_l->bc_rcvlink; - skb = skb_peek(&l_ptr->transmq); - if (skb) { - struct tipc_msg *msg = buf_msg(skb); - u32 length = msg_size(msg); + snd_l->ackers++; + rcv_l->acked = snd_l->snd_nxt - 1; + snd_l->state = LINK_ESTABLISHED; + tipc_link_build_bc_init_msg(uc_l, xmitq); +} - if ((msg_user(msg) == MSG_FRAGMENTER) && - (msg_type(msg) == FIRST_FRAGMENT)) { - length = msg_size(msg_get_wrapped(msg)); - } - if (length) { - l_ptr->stats.msg_lengths_total += length; - l_ptr->stats.msg_length_counts++; - if (length <= 64) - 
l_ptr->stats.msg_length_profile[0]++; - else if (length <= 256) - l_ptr->stats.msg_length_profile[1]++; - else if (length <= 1024) - l_ptr->stats.msg_length_profile[2]++; - else if (length <= 4096) - l_ptr->stats.msg_length_profile[3]++; - else if (length <= 16384) - l_ptr->stats.msg_length_profile[4]++; - else if (length <= 32768) - l_ptr->stats.msg_length_profile[5]++; - else - l_ptr->stats.msg_length_profile[6]++; - } +void tipc_link_remove_bc_peer(struct tipc_link *snd_l, + struct tipc_link *rcv_l, + struct sk_buff_head *xmitq) +{ + u16 ack = snd_l->snd_nxt - 1; + + snd_l->ackers--; + tipc_link_bc_ack_rcv(rcv_l, ack, xmitq); + tipc_link_reset(rcv_l); + rcv_l->state = LINK_RESET; + if (!snd_l->ackers) { + tipc_link_reset(snd_l); + snd_l->state = LINK_RESET; + __skb_queue_purge(xmitq); } +} - /* do all other link processing performed on a periodic basis */ - link_state_event(l_ptr, TIMEOUT_EVT); +int tipc_link_bc_peers(struct tipc_link *l) +{ + return l->ackers; +} - if (skb_queue_len(&l_ptr->backlogq)) - tipc_link_push_packets(l_ptr); +void tipc_link_set_mtu(struct tipc_link *l, int mtu) +{ + l->mtu = mtu; +} - tipc_node_unlock(l_ptr->owner); - tipc_link_put(l_ptr); +int tipc_link_mtu(struct tipc_link *l) +{ + return l->mtu; } -static void link_set_timer(struct tipc_link *link, unsigned long time) +static u32 link_own_addr(struct tipc_link *l) { - if (!mod_timer(&link->timer, jiffies + time)) - tipc_link_get(link); + return msg_prevnode(l->pmsg); } /** * tipc_link_create - create a new link - * @n_ptr: pointer to associated node - * @b_ptr: pointer to associated bearer - * @media_addr: media address to use when sending messages over link + * @n: pointer to associated node + * @if_name: associated interface name + * @bearer_id: id (index) of associated bearer + * @tolerance: link tolerance to be used by link + * @net_plane: network plane (A,B,c..) 
this link belongs to + * @mtu: mtu to be advertised by link + * @priority: priority to be used by link + * @window: send window to be used by link + * @session: session to be used by link + * @ownnode: identity of own node + * @peer: node id of peer node + * @peer_caps: bitmap describing peer node capabilities + * @bc_sndlink: the namespace global link used for broadcast sending + * @bc_rcvlink: the peer specific link used for broadcast reception + * @inputq: queue to put messages ready for delivery + * @namedq: queue to put binding table update messages ready for delivery + * @link: return value, pointer to put the created link * - * Returns pointer to link. + * Returns true if link was created, otherwise false */ -struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, - struct tipc_bearer *b_ptr, - const struct tipc_media_addr *media_addr) +bool tipc_link_create(struct net *net, char *if_name, int bearer_id, + int tolerance, char net_plane, u32 mtu, int priority, + int window, u32 session, u32 ownnode, u32 peer, + u16 peer_caps, + struct tipc_link *bc_sndlink, + struct tipc_link *bc_rcvlink, + struct sk_buff_head *inputq, + struct sk_buff_head *namedq, + struct tipc_link **link) { - struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id); - struct tipc_link *l_ptr; - struct tipc_msg *msg; - char *if_name; - char addr_string[16]; - u32 peer = n_ptr->addr; - - if (n_ptr->link_cnt >= MAX_BEARERS) { - tipc_addr_string_fill(addr_string, n_ptr->addr); - pr_err("Attempt to establish %uth link to %s. 
Max %u allowed.\n", - n_ptr->link_cnt, addr_string, MAX_BEARERS); - return NULL; - } + struct tipc_link *l; + struct tipc_msg *hdr; - if (n_ptr->links[b_ptr->identity]) { - tipc_addr_string_fill(addr_string, n_ptr->addr); - pr_err("Attempt to establish second link on <%s> to %s\n", - b_ptr->name, addr_string); - return NULL; - } - - l_ptr = kzalloc(sizeof(*l_ptr), GFP_ATOMIC); - if (!l_ptr) { - pr_warn("Link creation failed, no memory\n"); - return NULL; - } - kref_init(&l_ptr->ref); - l_ptr->addr = peer; - if_name = strchr(b_ptr->name, ':') + 1; - sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:unknown", - tipc_zone(tn->own_addr), tipc_cluster(tn->own_addr), - tipc_node(tn->own_addr), - if_name, - tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); - /* note: peer i/f name is updated by reset/activate message */ - memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr)); - l_ptr->owner = n_ptr; - l_ptr->checkpoint = 1; - l_ptr->peer_session = INVALID_SESSION; - l_ptr->bearer_id = b_ptr->identity; - link_set_supervision_props(l_ptr, b_ptr->tolerance); - l_ptr->state = RESET_UNKNOWN; - - l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg; - msg = l_ptr->pmsg; - tipc_msg_init(tn->own_addr, msg, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, - l_ptr->addr); - msg_set_size(msg, sizeof(l_ptr->proto_msg)); - msg_set_session(msg, (tn->random & 0xffff)); - msg_set_bearer_id(msg, b_ptr->identity); - strcpy((char *)msg_data(msg), if_name); - l_ptr->net_plane = b_ptr->net_plane; - l_ptr->advertised_mtu = b_ptr->mtu; - l_ptr->mtu = l_ptr->advertised_mtu; - l_ptr->priority = b_ptr->priority; - tipc_link_set_queue_limits(l_ptr, b_ptr->window); - l_ptr->next_out_no = 1; - __skb_queue_head_init(&l_ptr->transmq); - __skb_queue_head_init(&l_ptr->backlogq); - __skb_queue_head_init(&l_ptr->deferdq); - skb_queue_head_init(&l_ptr->wakeupq); - skb_queue_head_init(&l_ptr->inputq); - skb_queue_head_init(&l_ptr->namedq); - link_reset_statistics(l_ptr); - tipc_node_attach_link(n_ptr, l_ptr); - 
setup_timer(&l_ptr->timer, link_timeout, (unsigned long)l_ptr); - link_state_event(l_ptr, STARTING_EVT); - - return l_ptr; + l = kzalloc(sizeof(*l), GFP_ATOMIC); + if (!l) + return false; + *link = l; + l->pmsg = (struct tipc_msg *)&l->proto_msg; + hdr = l->pmsg; + tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer); + msg_set_size(hdr, sizeof(l->proto_msg)); + msg_set_session(hdr, session); + msg_set_bearer_id(hdr, l->bearer_id); + + /* Note: peer i/f name is completed by reset/activate message */ + sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown", + tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode), + if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); + strcpy((char *)msg_data(hdr), if_name); + + l->addr = peer; + l->peer_caps = peer_caps; + l->net = net; + l->peer_session = WILDCARD_SESSION; + l->bearer_id = bearer_id; + l->tolerance = tolerance; + l->net_plane = net_plane; + l->advertised_mtu = mtu; + l->mtu = mtu; + l->priority = priority; + tipc_link_set_queue_limits(l, window); + l->ackers = 1; + l->bc_sndlink = bc_sndlink; + l->bc_rcvlink = bc_rcvlink; + l->inputq = inputq; + l->namedq = namedq; + l->state = LINK_RESETTING; + __skb_queue_head_init(&l->transmq); + __skb_queue_head_init(&l->backlogq); + __skb_queue_head_init(&l->deferdq); + skb_queue_head_init(&l->wakeupq); + skb_queue_head_init(l->inputq); + return true; } /** - * tipc_link_delete - Delete a link - * @l: link to be deleted + * tipc_link_bc_create - create new link to be used for broadcast + * @n: pointer to associated node + * @mtu: mtu to be used + * @window: send window to be used + * @inputq: queue to put messages ready for delivery + * @namedq: queue to put binding table update messages ready for delivery + * @link: return value, pointer to put the created link + * + * Returns true if link was created, otherwise false */ -void tipc_link_delete(struct tipc_link *l) +bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, + int mtu, int 
window, u16 peer_caps, + struct sk_buff_head *inputq, + struct sk_buff_head *namedq, + struct tipc_link *bc_sndlink, + struct tipc_link **link) { + struct tipc_link *l; + + if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, + 0, ownnode, peer, peer_caps, bc_sndlink, + NULL, inputq, namedq, link)) + return false; + + l = *link; + strcpy(l->name, tipc_bclink_name); tipc_link_reset(l); - if (del_timer(&l->timer)) - tipc_link_put(l); - l->flags |= LINK_STOPPED; - /* Delete link now, or when timer is finished: */ - tipc_link_reset_fragments(l); - tipc_node_detach_link(l->owner, l); - tipc_link_put(l); + l->state = LINK_RESET; + l->ackers = 0; + l->bc_rcvlink = l; + + /* Broadcast send link is always up */ + if (link_is_bc_sndlink(l)) + l->state = LINK_ESTABLISHED; + + return true; } -void tipc_link_delete_list(struct net *net, unsigned int bearer_id, - bool shutting_down) +/** + * tipc_link_fsm_evt - link finite state machine + * @l: pointer to link + * @evt: state machine event to be processed + */ +int tipc_link_fsm_evt(struct tipc_link *l, int evt) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *link; - struct tipc_node *node; + int rc = 0; - rcu_read_lock(); - list_for_each_entry_rcu(node, &tn->node_list, list) { - tipc_node_lock(node); - link = node->links[bearer_id]; - if (link) - tipc_link_delete(link); - tipc_node_unlock(node); + switch (l->state) { + case LINK_RESETTING: + switch (evt) { + case LINK_PEER_RESET_EVT: + l->state = LINK_PEER_RESET; + break; + case LINK_RESET_EVT: + l->state = LINK_RESET; + break; + case LINK_FAILURE_EVT: + case LINK_FAILOVER_BEGIN_EVT: + case LINK_ESTABLISH_EVT: + case LINK_FAILOVER_END_EVT: + case LINK_SYNCH_BEGIN_EVT: + case LINK_SYNCH_END_EVT: + default: + goto illegal_evt; + } + break; + case LINK_RESET: + switch (evt) { + case LINK_PEER_RESET_EVT: + l->state = LINK_ESTABLISHING; + break; + case LINK_FAILOVER_BEGIN_EVT: + l->state = LINK_FAILINGOVER; + case LINK_FAILURE_EVT: + case 
LINK_RESET_EVT: + case LINK_ESTABLISH_EVT: + case LINK_FAILOVER_END_EVT: + break; + case LINK_SYNCH_BEGIN_EVT: + case LINK_SYNCH_END_EVT: + default: + goto illegal_evt; + } + break; + case LINK_PEER_RESET: + switch (evt) { + case LINK_RESET_EVT: + l->state = LINK_ESTABLISHING; + break; + case LINK_PEER_RESET_EVT: + case LINK_ESTABLISH_EVT: + case LINK_FAILURE_EVT: + break; + case LINK_SYNCH_BEGIN_EVT: + case LINK_SYNCH_END_EVT: + case LINK_FAILOVER_BEGIN_EVT: + case LINK_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + case LINK_FAILINGOVER: + switch (evt) { + case LINK_FAILOVER_END_EVT: + l->state = LINK_RESET; + break; + case LINK_PEER_RESET_EVT: + case LINK_RESET_EVT: + case LINK_ESTABLISH_EVT: + case LINK_FAILURE_EVT: + break; + case LINK_FAILOVER_BEGIN_EVT: + case LINK_SYNCH_BEGIN_EVT: + case LINK_SYNCH_END_EVT: + default: + goto illegal_evt; + } + break; + case LINK_ESTABLISHING: + switch (evt) { + case LINK_ESTABLISH_EVT: + l->state = LINK_ESTABLISHED; + break; + case LINK_FAILOVER_BEGIN_EVT: + l->state = LINK_FAILINGOVER; + break; + case LINK_RESET_EVT: + l->state = LINK_RESET; + break; + case LINK_FAILURE_EVT: + case LINK_PEER_RESET_EVT: + case LINK_SYNCH_BEGIN_EVT: + case LINK_FAILOVER_END_EVT: + break; + case LINK_SYNCH_END_EVT: + default: + goto illegal_evt; + } + break; + case LINK_ESTABLISHED: + switch (evt) { + case LINK_PEER_RESET_EVT: + l->state = LINK_PEER_RESET; + rc |= TIPC_LINK_DOWN_EVT; + break; + case LINK_FAILURE_EVT: + l->state = LINK_RESETTING; + rc |= TIPC_LINK_DOWN_EVT; + break; + case LINK_RESET_EVT: + l->state = LINK_RESET; + break; + case LINK_ESTABLISH_EVT: + case LINK_SYNCH_END_EVT: + break; + case LINK_SYNCH_BEGIN_EVT: + l->state = LINK_SYNCHING; + break; + case LINK_FAILOVER_BEGIN_EVT: + case LINK_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + case LINK_SYNCHING: + switch (evt) { + case LINK_PEER_RESET_EVT: + l->state = LINK_PEER_RESET; + rc |= TIPC_LINK_DOWN_EVT; + break; + case LINK_FAILURE_EVT: + 
l->state = LINK_RESETTING; + rc |= TIPC_LINK_DOWN_EVT; + break; + case LINK_RESET_EVT: + l->state = LINK_RESET; + break; + case LINK_ESTABLISH_EVT: + case LINK_SYNCH_BEGIN_EVT: + break; + case LINK_SYNCH_END_EVT: + l->state = LINK_ESTABLISHED; + break; + case LINK_FAILOVER_BEGIN_EVT: + case LINK_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + default: + pr_err("Unknown FSM state %x in %s\n", l->state, l->name); } - rcu_read_unlock(); + return rc; +illegal_evt: + pr_err("Illegal FSM event %x in state %x on link %s\n", + evt, l->state, l->name); + return rc; +} + +/* link_profile_stats - update statistical profiling of traffic + */ +static void link_profile_stats(struct tipc_link *l) +{ + struct sk_buff *skb; + struct tipc_msg *msg; + int length; + + /* Update counters used in statistical profiling of send traffic */ + l->stats.accu_queue_sz += skb_queue_len(&l->transmq); + l->stats.queue_sz_counts++; + + skb = skb_peek(&l->transmq); + if (!skb) + return; + msg = buf_msg(skb); + length = msg_size(msg); + + if (msg_user(msg) == MSG_FRAGMENTER) { + if (msg_type(msg) != FIRST_FRAGMENT) + return; + length = msg_size(msg_get_wrapped(msg)); + } + l->stats.msg_lengths_total += length; + l->stats.msg_length_counts++; + if (length <= 64) + l->stats.msg_length_profile[0]++; + else if (length <= 256) + l->stats.msg_length_profile[1]++; + else if (length <= 1024) + l->stats.msg_length_profile[2]++; + else if (length <= 4096) + l->stats.msg_length_profile[3]++; + else if (length <= 16384) + l->stats.msg_length_profile[4]++; + else if (length <= 32768) + l->stats.msg_length_profile[5]++; + else + l->stats.msg_length_profile[6]++; +} + +/* tipc_link_timeout - perform periodic task as instructed from node timeout + */ +/* tipc_link_timeout - perform periodic task as instructed from node timeout + */ +int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) +{ + int rc = 0; + int mtyp = STATE_MSG; + bool xmit = false; + bool prb = false; + u16 bc_snt = 
l->bc_sndlink->snd_nxt - 1; + u16 bc_acked = l->bc_rcvlink->acked; + bool bc_up = link_is_up(l->bc_rcvlink); + + link_profile_stats(l); + + switch (l->state) { + case LINK_ESTABLISHED: + case LINK_SYNCHING: + if (!l->silent_intv_cnt) { + if (bc_up && (bc_acked != bc_snt)) + xmit = true; + } else if (l->silent_intv_cnt <= l->abort_limit) { + xmit = true; + prb = true; + } else { + rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + } + l->silent_intv_cnt++; + break; + case LINK_RESET: + xmit = true; + mtyp = RESET_MSG; + break; + case LINK_ESTABLISHING: + xmit = true; + mtyp = ACTIVATE_MSG; + break; + case LINK_PEER_RESET: + case LINK_RESETTING: + case LINK_FAILINGOVER: + break; + default: + break; + } + + if (xmit) + tipc_link_build_proto_msg(l, mtyp, prb, 0, 0, 0, xmitq); + + return rc; } /** @@ -334,7 +607,7 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id, * @link: congested link * @list: message that was attempted sent * Create pseudo msg to send back to user when congestion abates - * Only consumes message if there is an error + * Does not consume buffer list */ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) { @@ -347,8 +620,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) /* This really cannot happen... 
*/ if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); - tipc_link_reset(link); - goto err; + return -ENOBUFS; } /* Non-blocking sender: */ if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending) @@ -358,15 +630,12 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, addr, addr, oport, 0, 0); if (!skb) - goto err; + return -ENOBUFS; TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); TIPC_SKB_CB(skb)->chain_imp = imp; skb_queue_tail(&link->wakeupq, skb); link->stats.link_congs++; return -ELINKCONG; -err: - __skb_queue_purge(list); - return -ENOBUFS; } /** @@ -388,782 +657,202 @@ void link_prepare_wakeup(struct tipc_link *l) if ((pnd[imp] + l->backlog[imp].len) >= lim) break; skb_unlink(skb, &l->wakeupq); - skb_queue_tail(&l->inputq, skb); - l->owner->inputq = &l->inputq; - l->owner->action_flags |= TIPC_MSG_EVT; + skb_queue_tail(l->inputq, skb); } } -/** - * tipc_link_reset_fragments - purge link's inbound message fragments queue - * @l_ptr: pointer to link - */ -void tipc_link_reset_fragments(struct tipc_link *l_ptr) +void tipc_link_reset(struct tipc_link *l) { - kfree_skb(l_ptr->reasm_buf); - l_ptr->reasm_buf = NULL; -} + /* Link is down, accept any session */ + l->peer_session = WILDCARD_SESSION; -static void tipc_link_purge_backlog(struct tipc_link *l) -{ + /* If peer is up, it only accepts an incremented session number */ + msg_set_session(l->pmsg, msg_session(l->pmsg) + 1); + + /* Prepare for renewed mtu size negotiation */ + l->mtu = l->advertised_mtu; + + /* Clean up all queues and counters: */ + __skb_queue_purge(&l->transmq); + __skb_queue_purge(&l->deferdq); + skb_queue_splice_init(&l->wakeupq, l->inputq); __skb_queue_purge(&l->backlogq); l->backlog[TIPC_LOW_IMPORTANCE].len = 0; l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; l->backlog[TIPC_HIGH_IMPORTANCE].len = 0; l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0; 
l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0; + kfree_skb(l->reasm_buf); + kfree_skb(l->failover_reasm_skb); + l->reasm_buf = NULL; + l->failover_reasm_skb = NULL; + l->rcv_unacked = 0; + l->snd_nxt = 1; + l->rcv_nxt = 1; + l->acked = 0; + l->silent_intv_cnt = 0; + l->stats.recv_info = 0; + l->stale_count = 0; + l->bc_peer_is_up = false; + link_reset_statistics(l); } /** - * tipc_link_purge_queues - purge all pkt queues associated with link - * @l_ptr: pointer to link - */ -void tipc_link_purge_queues(struct tipc_link *l_ptr) -{ - __skb_queue_purge(&l_ptr->deferdq); - __skb_queue_purge(&l_ptr->transmq); - tipc_link_purge_backlog(l_ptr); - tipc_link_reset_fragments(l_ptr); -} - -void tipc_link_reset(struct tipc_link *l_ptr) -{ - u32 prev_state = l_ptr->state; - int was_active_link = tipc_link_is_active(l_ptr); - struct tipc_node *owner = l_ptr->owner; - struct tipc_link *pl = tipc_parallel_link(l_ptr); - - msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff)); - - /* Link is down, accept any session */ - l_ptr->peer_session = INVALID_SESSION; - - /* Prepare for renewed mtu size negotiation */ - l_ptr->mtu = l_ptr->advertised_mtu; - - l_ptr->state = RESET_UNKNOWN; - - if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET)) - return; - - tipc_node_link_down(l_ptr->owner, l_ptr); - tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr); - - if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { - l_ptr->flags |= LINK_FAILINGOVER; - l_ptr->failover_checkpt = l_ptr->next_in_no; - pl->failover_pkts = FIRST_FAILOVER; - pl->failover_checkpt = l_ptr->next_in_no; - pl->failover_skb = l_ptr->reasm_buf; - } else { - kfree_skb(l_ptr->reasm_buf); - } - /* Clean up all queues, except inputq: */ - __skb_queue_purge(&l_ptr->transmq); - __skb_queue_purge(&l_ptr->deferdq); - if (!owner->inputq) - owner->inputq = &l_ptr->inputq; - skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); - if (!skb_queue_empty(owner->inputq)) - 
owner->action_flags |= TIPC_MSG_EVT; - tipc_link_purge_backlog(l_ptr); - l_ptr->reasm_buf = NULL; - l_ptr->rcv_unacked = 0; - l_ptr->checkpoint = 1; - l_ptr->next_out_no = 1; - l_ptr->fsm_msg_cnt = 0; - l_ptr->stale_count = 0; - link_reset_statistics(l_ptr); -} - -void tipc_link_reset_list(struct net *net, unsigned int bearer_id) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *l_ptr; - struct tipc_node *n_ptr; - - rcu_read_lock(); - list_for_each_entry_rcu(n_ptr, &tn->node_list, list) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->links[bearer_id]; - if (l_ptr) - tipc_link_reset(l_ptr); - tipc_node_unlock(n_ptr); - } - rcu_read_unlock(); -} - -static void link_activate(struct tipc_link *link) -{ - struct tipc_node *node = link->owner; - - link->next_in_no = 1; - link->stats.recv_info = 1; - tipc_node_link_up(node, link); - tipc_bearer_add_dest(node->net, link->bearer_id, link->addr); -} - -/** - * link_state_event - link finite state machine - * @l_ptr: pointer to link - * @event: state machine event to process - */ -static void link_state_event(struct tipc_link *l_ptr, unsigned int event) -{ - struct tipc_link *other; - unsigned long cont_intv = l_ptr->cont_intv; - - if (l_ptr->flags & LINK_STOPPED) - return; - - if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT)) - return; /* Not yet. 
*/ - - if (l_ptr->flags & LINK_FAILINGOVER) { - if (event == TIMEOUT_EVT) - link_set_timer(l_ptr, cont_intv); - return; - } - - switch (l_ptr->state) { - case WORKING_WORKING: - switch (event) { - case TRAFFIC_MSG_EVT: - case ACTIVATE_MSG: - break; - case TIMEOUT_EVT: - if (l_ptr->next_in_no != l_ptr->checkpoint) { - l_ptr->checkpoint = l_ptr->next_in_no; - if (tipc_bclink_acks_missing(l_ptr->owner)) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - } - link_set_timer(l_ptr, cont_intv); - break; - } - l_ptr->state = WORKING_UNKNOWN; - l_ptr->fsm_msg_cnt = 0; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv / 4); - break; - case RESET_MSG: - pr_debug("%s<%s>, requested by peer\n", - link_rst_msg, l_ptr->name); - tipc_link_reset(l_ptr); - l_ptr->state = RESET_RESET; - l_ptr->fsm_msg_cnt = 0; - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); - break; - default: - pr_debug("%s%u in WW state\n", link_unk_evt, event); - } - break; - case WORKING_UNKNOWN: - switch (event) { - case TRAFFIC_MSG_EVT: - case ACTIVATE_MSG: - l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; - link_set_timer(l_ptr, cont_intv); - break; - case RESET_MSG: - pr_debug("%s<%s>, requested by peer while probing\n", - link_rst_msg, l_ptr->name); - tipc_link_reset(l_ptr); - l_ptr->state = RESET_RESET; - l_ptr->fsm_msg_cnt = 0; - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); - break; - case TIMEOUT_EVT: - if (l_ptr->next_in_no != l_ptr->checkpoint) { - l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; - l_ptr->checkpoint = l_ptr->next_in_no; - if (tipc_bclink_acks_missing(l_ptr->owner)) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - } - link_set_timer(l_ptr, cont_intv); - } else if (l_ptr->fsm_msg_cnt < 
l_ptr->abort_limit) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv / 4); - } else { /* Link has failed */ - pr_debug("%s<%s>, peer not responding\n", - link_rst_msg, l_ptr->name); - tipc_link_reset(l_ptr); - l_ptr->state = RESET_UNKNOWN; - l_ptr->fsm_msg_cnt = 0; - tipc_link_proto_xmit(l_ptr, RESET_MSG, - 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); - } - break; - default: - pr_err("%s%u in WU state\n", link_unk_evt, event); - } - break; - case RESET_UNKNOWN: - switch (event) { - case TRAFFIC_MSG_EVT: - break; - case ACTIVATE_MSG: - other = l_ptr->owner->active_links[0]; - if (other && link_working_unknown(other)) - break; - l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; - link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - if (l_ptr->owner->working_links == 1) - tipc_link_sync_xmit(l_ptr); - link_set_timer(l_ptr, cont_intv); - break; - case RESET_MSG: - l_ptr->state = RESET_RESET; - l_ptr->fsm_msg_cnt = 0; - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); - break; - case STARTING_EVT: - l_ptr->flags |= LINK_STARTED; - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); - break; - case TIMEOUT_EVT: - tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); - break; - default: - pr_err("%s%u in RU state\n", link_unk_evt, event); - } - break; - case RESET_RESET: - switch (event) { - case TRAFFIC_MSG_EVT: - case ACTIVATE_MSG: - other = l_ptr->owner->active_links[0]; - if (other && link_working_unknown(other)) - break; - l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; - link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - if (l_ptr->owner->working_links == 1) - tipc_link_sync_xmit(l_ptr); - link_set_timer(l_ptr, cont_intv); - 
break; - case RESET_MSG: - break; - case TIMEOUT_EVT: - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); - break; - default: - pr_err("%s%u in RR state\n", link_unk_evt, event); - } - break; - default: - pr_err("Unknown link state %u/%u\n", l_ptr->state, event); - } -} - -/** - * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked + * tipc_link_xmit(): enqueue buffer list according to queue situation * @link: link to use * @list: chain of buffers containing message + * @xmitq: returned list of packets to be sent by caller * * Consumes the buffer chain, except when returning -ELINKCONG, * since the caller then may want to make more send attempts. * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted */ -int __tipc_link_xmit(struct net *net, struct tipc_link *link, - struct sk_buff_head *list) +int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, + struct sk_buff_head *xmitq) { - struct tipc_msg *msg = buf_msg(skb_peek(list)); - unsigned int maxwin = link->window; - unsigned int imp = msg_importance(msg); - uint mtu = link->mtu; - uint ack = mod(link->next_in_no - 1); - uint seqno = link->next_out_no; - uint bc_last_in = link->owner->bclink.last_in; - struct tipc_media_addr *addr = &link->media_addr; - struct sk_buff_head *transmq = &link->transmq; - struct sk_buff_head *backlogq = &link->backlogq; - struct sk_buff *skb, *tmp; - - /* Match backlog limit against msg importance: */ - if (unlikely(link->backlog[imp].len >= link->backlog[imp].limit)) - return link_schedule_user(link, list); - - if (unlikely(msg_size(msg) > mtu)) { - __skb_queue_purge(list); - return -EMSGSIZE; + struct tipc_msg *hdr = buf_msg(skb_peek(list)); + unsigned int maxwin = l->window; + unsigned int i, imp = msg_importance(hdr); + unsigned int mtu = l->mtu; + u16 ack = l->rcv_nxt - 1; + u16 seqno = l->snd_nxt; + u16 bc_ack 
= l->bc_rcvlink->rcv_nxt - 1; + struct sk_buff_head *transmq = &l->transmq; + struct sk_buff_head *backlogq = &l->backlogq; + struct sk_buff *skb, *_skb, *bskb; + + /* Match msg importance against this and all higher backlog limits: */ + for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { + if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) + return link_schedule_user(l, list); } + if (unlikely(msg_size(hdr) > mtu)) + return -EMSGSIZE; + /* Prepare each packet for sending, and add to relevant queue: */ - skb_queue_walk_safe(list, skb, tmp) { - __skb_unlink(skb, list); - msg = buf_msg(skb); - msg_set_seqno(msg, seqno); - msg_set_ack(msg, ack); - msg_set_bcast_ack(msg, bc_last_in); + while (skb_queue_len(list)) { + skb = skb_peek(list); + hdr = buf_msg(skb); + msg_set_seqno(hdr, seqno); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_ack); if (likely(skb_queue_len(transmq) < maxwin)) { + _skb = skb_clone(skb, GFP_ATOMIC); + if (!_skb) + return -ENOBUFS; + __skb_dequeue(list); __skb_queue_tail(transmq, skb); - tipc_bearer_send(net, link->bearer_id, skb, addr); - link->rcv_unacked = 0; + __skb_queue_tail(xmitq, _skb); + TIPC_SKB_CB(skb)->ackers = l->ackers; + l->rcv_unacked = 0; seqno++; continue; } - if (tipc_msg_bundle(skb_peek_tail(backlogq), skb, mtu)) { - link->stats.sent_bundled++; + if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) { + kfree_skb(__skb_dequeue(list)); + l->stats.sent_bundled++; continue; } - if (tipc_msg_make_bundle(&skb, mtu, link->addr)) { - link->stats.sent_bundled++; - link->stats.sent_bundles++; - imp = msg_importance(buf_msg(skb)); + if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) { + kfree_skb(__skb_dequeue(list)); + __skb_queue_tail(backlogq, bskb); + l->backlog[msg_importance(buf_msg(bskb))].len++; + l->stats.sent_bundled++; + l->stats.sent_bundles++; + continue; } - __skb_queue_tail(backlogq, skb); - link->backlog[imp].len++; - seqno++; + l->backlog[imp].len += skb_queue_len(list); + skb_queue_splice_tail_init(list, 
backlogq); } - link->next_out_no = seqno; - return 0; -} - -static void skb2list(struct sk_buff *skb, struct sk_buff_head *list) -{ - skb_queue_head_init(list); - __skb_queue_tail(list, skb); -} - -static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb) -{ - struct sk_buff_head head; - - skb2list(skb, &head); - return __tipc_link_xmit(link->owner->net, link, &head); -} - -/* tipc_link_xmit_skb(): send single buffer to destination - * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE - * messages, which will not be rejected - * The only exception is datagram messages rerouted after secondary - * lookup, which are rare and safe to dispose of anyway. - * TODO: Return real return value, and let callers use - * tipc_wait_for_sendpkt() where applicable - */ -int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, - u32 selector) -{ - struct sk_buff_head head; - int rc; - - skb2list(skb, &head); - rc = tipc_link_xmit(net, &head, dnode, selector); - if (rc == -ELINKCONG) - kfree_skb(skb); + l->snd_nxt = seqno; return 0; } -/** - * tipc_link_xmit() is the general link level function for message sending - * @net: the applicable net namespace - * @list: chain of buffers containing message - * @dsz: amount of user data to be sent - * @dnode: address of destination node - * @selector: a number used for deterministic link selection - * Consumes the buffer chain, except when returning -ELINKCONG - * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE - */ -int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, - u32 selector) -{ - struct tipc_link *link = NULL; - struct tipc_node *node; - int rc = -EHOSTUNREACH; - - node = tipc_node_find(net, dnode); - if (node) { - tipc_node_lock(node); - link = node->active_links[selector & 1]; - if (link) - rc = __tipc_link_xmit(net, link, list); - tipc_node_unlock(node); - tipc_node_put(node); - } - if (link) - return rc; - - if 
(likely(in_own_node(net, dnode))) { - tipc_sk_rcv(net, list); - return 0; - } - - __skb_queue_purge(list); - return rc; -} - -/* - * tipc_link_sync_xmit - synchronize broadcast link endpoints. - * - * Give a newly added peer node the sequence number where it should - * start receiving and acking broadcast packets. - * - * Called with node locked - */ -static void tipc_link_sync_xmit(struct tipc_link *link) -{ - struct sk_buff *skb; - struct tipc_msg *msg; - - skb = tipc_buf_acquire(INT_H_SIZE); - if (!skb) - return; - - msg = buf_msg(skb); - tipc_msg_init(link_own_addr(link), msg, BCAST_PROTOCOL, STATE_MSG, - INT_H_SIZE, link->addr); - msg_set_last_bcast(msg, link->owner->bclink.acked); - __tipc_link_xmit_skb(link, skb); -} - -/* - * tipc_link_sync_rcv - synchronize broadcast link endpoints. - * Receive the sequence number where we should start receiving and - * acking broadcast packets from a newly added peer node, and open - * up for reception of such packets. - * - * Called with node locked - */ -static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf) -{ - struct tipc_msg *msg = buf_msg(buf); - - n->bclink.last_sent = n->bclink.last_in = msg_last_bcast(msg); - n->bclink.recv_permitted = true; - kfree_skb(buf); -} - -/* - * tipc_link_push_packets - push unsent packets to bearer - * - * Push out the unsent messages of a link where congestion - * has abated. Node is locked. 
- * - * Called with node locked - */ -void tipc_link_push_packets(struct tipc_link *link) +void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) { - struct sk_buff *skb; - struct tipc_msg *msg; - unsigned int ack = mod(link->next_in_no - 1); - - while (skb_queue_len(&link->transmq) < link->window) { - skb = __skb_dequeue(&link->backlogq); + struct sk_buff *skb, *_skb; + struct tipc_msg *hdr; + u16 seqno = l->snd_nxt; + u16 ack = l->rcv_nxt - 1; + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + + while (skb_queue_len(&l->transmq) < l->window) { + skb = skb_peek(&l->backlogq); if (!skb) - break; - msg = buf_msg(skb); - link->backlog[msg_importance(msg)].len--; - msg_set_ack(msg, ack); - msg_set_bcast_ack(msg, link->owner->bclink.last_in); - link->rcv_unacked = 0; - __skb_queue_tail(&link->transmq, skb); - tipc_bearer_send(link->owner->net, link->bearer_id, - skb, &link->media_addr); - } -} - -void tipc_link_reset_all(struct tipc_node *node) -{ - char addr_string[16]; - u32 i; - - tipc_node_lock(node); - - pr_warn("Resetting all links to %s\n", - tipc_addr_string_fill(addr_string, node->addr)); - - for (i = 0; i < MAX_BEARERS; i++) { - if (node->links[i]) { - link_print(node->links[i], "Resetting link\n"); - tipc_link_reset(node->links[i]); - } - } - - tipc_node_unlock(node); -} - -static void link_retransmit_failure(struct tipc_link *l_ptr, - struct sk_buff *buf) -{ - struct tipc_msg *msg = buf_msg(buf); - struct net *net = l_ptr->owner->net; - - pr_warn("Retransmission failure on link <%s>\n", l_ptr->name); - - if (l_ptr->addr) { - /* Handle failure on standard link */ - link_print(l_ptr, "Resetting link\n"); - tipc_link_reset(l_ptr); - - } else { - /* Handle failure on broadcast link */ - struct tipc_node *n_ptr; - char addr_string[16]; - - pr_info("Msg seq number: %u, ", msg_seqno(msg)); - pr_cont("Outstanding acks: %lu\n", - (unsigned long) TIPC_SKB_CB(buf)->handle); - - n_ptr = tipc_bclink_retransmit_to(net); - - 
tipc_addr_string_fill(addr_string, n_ptr->addr); - pr_info("Broadcast link info for %s\n", addr_string); - pr_info("Reception permitted: %d, Acked: %u\n", - n_ptr->bclink.recv_permitted, - n_ptr->bclink.acked); - pr_info("Last in: %u, Oos state: %u, Last sent: %u\n", - n_ptr->bclink.last_in, - n_ptr->bclink.oos_state, - n_ptr->bclink.last_sent); - - n_ptr->action_flags |= TIPC_BCAST_RESET; - l_ptr->stale_count = 0; - } -} - -void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, - u32 retransmits) -{ - struct tipc_msg *msg; - - if (!skb) - return; - - msg = buf_msg(skb); - - /* Detect repeated retransmit failures */ - if (l_ptr->last_retransmitted == msg_seqno(msg)) { - if (++l_ptr->stale_count > 100) { - link_retransmit_failure(l_ptr, skb); - return; - } - } else { - l_ptr->last_retransmitted = msg_seqno(msg); - l_ptr->stale_count = 1; - } - - skb_queue_walk_from(&l_ptr->transmq, skb) { - if (!retransmits) - break; - msg = buf_msg(skb); - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, skb, - &l_ptr->media_addr); - retransmits--; - l_ptr->stats.retransmitted++; - } -} - -/* link_synch(): check if all packets arrived before the synch - * point have been consumed - * Returns true if the parallel links are synched, otherwise false - */ -static bool link_synch(struct tipc_link *l) -{ - unsigned int post_synch; - struct tipc_link *pl; - - pl = tipc_parallel_link(l); - if (pl == l) - goto synched; - - /* Was last pre-synch packet added to input queue ? */ - if (less_eq(pl->next_in_no, l->synch_point)) - return false; - - /* Is it still in the input queue ? 
*/ - post_synch = mod(pl->next_in_no - l->synch_point) - 1; - if (skb_queue_len(&pl->inputq) > post_synch) - return false; -synched: - l->flags &= ~LINK_SYNCHING; - return true; -} - -static void link_retrieve_defq(struct tipc_link *link, - struct sk_buff_head *list) -{ - u32 seq_no; - - if (skb_queue_empty(&link->deferdq)) - return; - - seq_no = buf_seqno(skb_peek(&link->deferdq)); - if (seq_no == mod(link->next_in_no)) - skb_queue_splice_tail_init(&link->deferdq, list); -} - -/** - * tipc_rcv - process TIPC packets/messages arriving from off-node - * @net: the applicable net namespace - * @skb: TIPC packet - * @b_ptr: pointer to bearer message arrived on - * - * Invoked with no locks held. Bearer pointer must point to a valid bearer - * structure (i.e. cannot be NULL), but bearer can be inactive. - */ -void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sk_buff_head head; - struct tipc_node *n_ptr; - struct tipc_link *l_ptr; - struct sk_buff *skb1, *tmp; - struct tipc_msg *msg; - u32 seq_no; - u32 ackd; - u32 released; - - skb2list(skb, &head); - - while ((skb = __skb_dequeue(&head))) { - /* Ensure message is well-formed */ - if (unlikely(!tipc_msg_validate(skb))) - goto discard; - - /* Handle arrival of a non-unicast link message */ - msg = buf_msg(skb); - if (unlikely(msg_non_seq(msg))) { - if (msg_user(msg) == LINK_CONFIG) - tipc_disc_rcv(net, skb, b_ptr); - else - tipc_bclink_rcv(net, skb); - continue; - } - - /* Discard unicast link messages destined for another node */ - if (unlikely(!msg_short(msg) && - (msg_destnode(msg) != tn->own_addr))) - goto discard; - - /* Locate neighboring node that sent message */ - n_ptr = tipc_node_find(net, msg_prevnode(msg)); - if (unlikely(!n_ptr)) - goto discard; - - tipc_node_lock(n_ptr); - /* Locate unicast link endpoint that should handle message */ - l_ptr = n_ptr->links[b_ptr->identity]; - if (unlikely(!l_ptr)) - goto unlock; 
- - /* Verify that communication with node is currently allowed */ - if ((n_ptr->action_flags & TIPC_WAIT_PEER_LINKS_DOWN) && - msg_user(msg) == LINK_PROTOCOL && - (msg_type(msg) == RESET_MSG || - msg_type(msg) == ACTIVATE_MSG) && - !msg_redundant_link(msg)) - n_ptr->action_flags &= ~TIPC_WAIT_PEER_LINKS_DOWN; - - if (tipc_node_blocked(n_ptr)) - goto unlock; - - /* Validate message sequence number info */ - seq_no = msg_seqno(msg); - ackd = msg_ack(msg); - - /* Release acked messages */ - if (unlikely(n_ptr->bclink.acked != msg_bcast_ack(msg))) - tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg)); - - released = 0; - skb_queue_walk_safe(&l_ptr->transmq, skb1, tmp) { - if (more(buf_seqno(skb1), ackd)) - break; - __skb_unlink(skb1, &l_ptr->transmq); - kfree_skb(skb1); - released = 1; - } + break; + _skb = skb_clone(skb, GFP_ATOMIC); + if (!_skb) + break; + __skb_dequeue(&l->backlogq); + hdr = buf_msg(skb); + l->backlog[msg_importance(hdr)].len--; + __skb_queue_tail(&l->transmq, skb); + __skb_queue_tail(xmitq, _skb); + TIPC_SKB_CB(skb)->ackers = l->ackers; + msg_set_seqno(hdr, seqno); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_ack); + l->rcv_unacked = 0; + seqno++; + } + l->snd_nxt = seqno; +} - /* Try sending any messages link endpoint has pending */ - if (unlikely(skb_queue_len(&l_ptr->backlogq))) - tipc_link_push_packets(l_ptr); +static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb) +{ + struct tipc_msg *hdr = buf_msg(skb); + + pr_warn("Retransmission failure on link <%s>\n", l->name); + link_print(l, "Resetting link "); + pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", + msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr)); + pr_info("sqno %u, prev: %x, src: %x\n", + msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); +} - if (released && !skb_queue_empty(&l_ptr->wakeupq)) - link_prepare_wakeup(l_ptr); +int tipc_link_retrans(struct tipc_link *l, u16 from, u16 to, + struct sk_buff_head *xmitq) +{ + struct 
sk_buff *_skb, *skb = skb_peek(&l->transmq); + struct tipc_msg *hdr; + u16 ack = l->rcv_nxt - 1; + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - /* Process the incoming packet */ - if (unlikely(!link_working_working(l_ptr))) { - if (msg_user(msg) == LINK_PROTOCOL) { - tipc_link_proto_rcv(l_ptr, skb); - link_retrieve_defq(l_ptr, &head); - skb = NULL; - goto unlock; - } + if (!skb) + return 0; - /* Traffic message. Conditionally activate link */ - link_state_event(l_ptr, TRAFFIC_MSG_EVT); + /* Detect repeated retransmit failures on same packet */ + if (likely(l->last_retransm != buf_seqno(skb))) { + l->last_retransm = buf_seqno(skb); + l->stale_count = 1; + } else if (++l->stale_count > 100) { + link_retransmit_failure(l, skb); + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + } - if (link_working_working(l_ptr)) { - /* Re-insert buffer in front of queue */ - __skb_queue_head(&head, skb); - skb = NULL; - goto unlock; - } - goto unlock; - } + /* Move forward to where retransmission should start */ + skb_queue_walk(&l->transmq, skb) { + if (!less(buf_seqno(skb), from)) + break; + } - /* Link is now in state WORKING_WORKING */ - if (unlikely(seq_no != mod(l_ptr->next_in_no))) { - link_handle_out_of_seq_msg(l_ptr, skb); - link_retrieve_defq(l_ptr, &head); - skb = NULL; - goto unlock; - } - /* Synchronize with parallel link if applicable */ - if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) { - if (!link_synch(l_ptr)) - goto unlock; - } - l_ptr->next_in_no++; - if (unlikely(!skb_queue_empty(&l_ptr->deferdq))) - link_retrieve_defq(l_ptr, &head); - if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) { - l_ptr->stats.sent_acks++; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - } - tipc_link_input(l_ptr, skb); - skb = NULL; -unlock: - tipc_node_unlock(n_ptr); - tipc_node_put(n_ptr); -discard: - if (unlikely(skb)) - kfree_skb(skb); + skb_queue_walk_from(&l->transmq, skb) { + if (more(buf_seqno(skb), to)) + break; + hdr = buf_msg(skb); + _skb = 
__pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); + if (!_skb) + return 0; + hdr = buf_msg(_skb); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_ack); + _skb->priority = TC_PRIO_CONTROL; + __skb_queue_tail(xmitq, _skb); + l->stats.retransmitted++; } + return 0; } /* tipc_data_input - deliver data and name distr msgs to upper layer @@ -1171,29 +860,20 @@ discard: * Consumes buffer if message is of right type * Node lock must be held */ -static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) +static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *inputq) { - struct tipc_node *node = link->owner; - struct tipc_msg *msg = buf_msg(skb); - u32 dport = msg_destport(msg); - - switch (msg_user(msg)) { + switch (msg_user(buf_msg(skb))) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: case CONN_MANAGER: - if (tipc_skb_queue_tail(&link->inputq, skb, dport)) { - node->inputq = &link->inputq; - node->action_flags |= TIPC_MSG_EVT; - } + skb_queue_tail(inputq, skb); return true; case NAME_DISTRIBUTOR: - node->bclink.recv_permitted = true; - node->namedq = &link->namedq; - skb_queue_tail(&link->namedq, skb); - if (skb_queue_len(&link->namedq) == 1) - node->action_flags |= TIPC_NAMED_MSG_EVT; + l->bc_rcvlink->state = LINK_ESTABLISHED; + skb_queue_tail(l->namedq, skb); return true; case MSG_BUNDLER: case TUNNEL_PROTOCOL: @@ -1210,540 +890,629 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) /* tipc_link_input - process packet that has passed link protocol check * * Consumes buffer - * Node lock must be held */ -static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) +static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *inputq) { - struct tipc_node *node = link->owner; - struct tipc_msg *msg = buf_msg(skb); + struct tipc_msg *hdr = buf_msg(skb); + struct sk_buff **reasm_skb = 
&l->reasm_buf; struct sk_buff *iskb; + struct sk_buff_head tmpq; + int usr = msg_user(hdr); + int rc = 0; int pos = 0; + int ipos = 0; - if (likely(tipc_data_input(link, skb))) - return; - - switch (msg_user(msg)) { - case TUNNEL_PROTOCOL: - if (msg_dup(msg)) { - link->flags |= LINK_SYNCHING; - link->synch_point = msg_seqno(msg_get_wrapped(msg)); - kfree_skb(skb); - break; + if (unlikely(usr == TUNNEL_PROTOCOL)) { + if (msg_type(hdr) == SYNCH_MSG) { + __skb_queue_purge(&l->deferdq); + goto drop; } - if (!tipc_link_failover_rcv(link, &skb)) - break; - if (msg_user(buf_msg(skb)) != MSG_BUNDLER) { - tipc_data_input(link, skb); - break; - } - case MSG_BUNDLER: - link->stats.recv_bundles++; - link->stats.recv_bundled += msg_msgcnt(msg); + if (!tipc_msg_extract(skb, &iskb, &ipos)) + return rc; + kfree_skb(skb); + skb = iskb; + hdr = buf_msg(skb); + if (less(msg_seqno(hdr), l->drop_point)) + goto drop; + if (tipc_data_input(l, skb, inputq)) + return rc; + usr = msg_user(hdr); + reasm_skb = &l->failover_reasm_skb; + } + if (usr == MSG_BUNDLER) { + skb_queue_head_init(&tmpq); + l->stats.recv_bundles++; + l->stats.recv_bundled += msg_msgcnt(hdr); while (tipc_msg_extract(skb, &iskb, &pos)) - tipc_data_input(link, iskb); - break; - case MSG_FRAGMENTER: - link->stats.recv_fragments++; - if (tipc_buf_append(&link->reasm_buf, &skb)) { - link->stats.recv_fragmented++; - tipc_data_input(link, skb); - } else if (!link->reasm_buf) { - tipc_link_reset(link); + tipc_data_input(l, iskb, &tmpq); + tipc_skb_queue_splice_tail(&tmpq, inputq); + return 0; + } else if (usr == MSG_FRAGMENTER) { + l->stats.recv_fragments++; + if (tipc_buf_append(reasm_skb, &skb)) { + l->stats.recv_fragmented++; + tipc_data_input(l, skb, inputq); + } else if (!*reasm_skb && !link_is_bc_rcvlink(l)) { + pr_warn_ratelimited("Unable to build fragment list\n"); + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } - break; - case BCAST_PROTOCOL: - tipc_link_sync_rcv(node, skb); - break; - default: - break; - }; + return 
0; + } else if (usr == BCAST_PROTOCOL) { + tipc_bcast_lock(l->net); + tipc_link_bc_init_rcv(l->bc_rcvlink, hdr); + tipc_bcast_unlock(l->net); + } +drop: + kfree_skb(skb); + return 0; } -/** - * tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue - * - * Returns increase in queue length (i.e. 0 or 1) - */ -u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb) +static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) { - struct sk_buff *skb1; - u32 seq_no = buf_seqno(skb); + bool released = false; + struct sk_buff *skb, *tmp; - /* Empty queue ? */ - if (skb_queue_empty(list)) { - __skb_queue_tail(list, skb); - return 1; + skb_queue_walk_safe(&l->transmq, skb, tmp) { + if (more(buf_seqno(skb), acked)) + break; + __skb_unlink(skb, &l->transmq); + kfree_skb(skb); + released = true; } + return released; +} + +/* tipc_link_build_ack_msg: prepare link acknowledge message for transmission + * + * Note that sending of broadcast ack is coordinated among nodes, to reduce + * risk of ack storms towards the sender + */ +int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) +{ + if (!l) + return 0; - /* Last ? 
*/ - if (less(buf_seqno(skb_peek_tail(list)), seq_no)) { - __skb_queue_tail(list, skb); - return 1; + /* Broadcast ACK must be sent via a unicast link => defer to caller */ + if (link_is_bc_rcvlink(l)) { + if (((l->rcv_nxt ^ link_own_addr(l)) & 0xf) != 0xf) + return 0; + l->rcv_unacked = 0; + return TIPC_LINK_SND_BC_ACK; } - /* Locate insertion point in queue, then insert; discard if duplicate */ - skb_queue_walk(list, skb1) { - u32 curr_seqno = buf_seqno(skb1); + /* Unicast ACK */ + l->rcv_unacked = 0; + l->stats.sent_acks++; + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + return 0; +} - if (seq_no == curr_seqno) { - kfree_skb(skb); - return 0; - } +/* tipc_link_build_reset_msg: prepare link RESET or ACTIVATE message + */ +void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq) +{ + int mtyp = RESET_MSG; - if (less(seq_no, curr_seqno)) - break; - } + if (l->state == LINK_ESTABLISHING) + mtyp = ACTIVATE_MSG; - __skb_queue_before(list, skb1, skb); - return 1; + tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, xmitq); } -/* - * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet +/* tipc_link_build_nack_msg: prepare link nack message for transmission */ -static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, - struct sk_buff *buf) +static void tipc_link_build_nack_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) { - u32 seq_no = buf_seqno(buf); + u32 def_cnt = ++l->stats.deferred_recv; - if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) { - tipc_link_proto_rcv(l_ptr, buf); + if (link_is_bc_rcvlink(l)) return; - } - /* Record OOS packet arrival (force mismatch on next timeout) */ - l_ptr->checkpoint--; + if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV)) + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); +} - /* - * Discard packet if a duplicate; otherwise add it to deferred queue - * and notify peer of gap as per protocol specification - */ - if (less(seq_no, 
mod(l_ptr->next_in_no))) { - l_ptr->stats.duplicates++; - kfree_skb(buf); - return; - } +/* tipc_link_rcv - process TIPC packets/messages arriving from off-node + * @l: the link that should handle the message + * @skb: TIPC packet + * @xmitq: queue to place packets to be sent after this call + */ +int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq) +{ + struct sk_buff_head *defq = &l->deferdq; + struct tipc_msg *hdr; + u16 seqno, rcv_nxt, win_lim; + int rc = 0; + + do { + hdr = buf_msg(skb); + seqno = msg_seqno(hdr); + rcv_nxt = l->rcv_nxt; + win_lim = rcv_nxt + TIPC_MAX_LINK_WIN; + + /* Verify and update link state */ + if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) + return tipc_link_proto_rcv(l, skb, xmitq); + + if (unlikely(!link_is_up(l))) { + if (l->state == LINK_ESTABLISHING) + rc = TIPC_LINK_UP_EVT; + goto drop; + } - if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) { - l_ptr->stats.deferred_recv++; - if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1) - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - } else { - l_ptr->stats.duplicates++; - } + /* Don't send probe at next timeout expiration */ + l->silent_intv_cnt = 0; + + /* Drop if outside receive window */ + if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) { + l->stats.duplicates++; + goto drop; + } + + /* Forward queues and wake up waiting users */ + if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + } + + /* Defer delivery if sequence gap */ + if (unlikely(seqno != rcv_nxt)) { + __tipc_skb_queue_sorted(defq, seqno, skb); + tipc_link_build_nack_msg(l, xmitq); + break; + } + + /* Deliver packet */ + l->rcv_nxt++; + l->stats.recv_info++; + if (!tipc_data_input(l, skb, l->inputq)) + rc |= tipc_link_input(l, skb, l->inputq); + if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) + rc |= tipc_link_build_ack_msg(l, xmitq); + if 
(unlikely(rc & ~TIPC_LINK_SND_BC_ACK)) + break; + } while ((skb = __skb_dequeue(defq))); + + return rc; +drop: + kfree_skb(skb); + return rc; } /* * Send protocol message to the other endpoint. */ -void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, +void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg, u32 gap, u32 tolerance, u32 priority) { - struct sk_buff *buf = NULL; - struct tipc_msg *msg = l_ptr->pmsg; - u32 msg_size = sizeof(l_ptr->proto_msg); - int r_flag; - - /* Don't send protocol message during link failover */ - if (l_ptr->flags & LINK_FAILINGOVER) - return; + struct sk_buff *skb = NULL; + struct sk_buff_head xmitq; - /* Abort non-RESET send if communication with node is prohibited */ - if ((tipc_node_blocked(l_ptr->owner)) && (msg_typ != RESET_MSG)) + __skb_queue_head_init(&xmitq); + tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap, + tolerance, priority, &xmitq); + skb = __skb_dequeue(&xmitq); + if (!skb) return; + tipc_bearer_xmit_skb(l->net, l->bearer_id, skb, l->media_addr); + l->rcv_unacked = 0; +} - /* Create protocol message with "out-of-sequence" sequence number */ - msg_set_type(msg, msg_typ); - msg_set_net_plane(msg, l_ptr->net_plane); - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net)); +static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, + u16 rcvgap, int tolerance, int priority, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = NULL; + struct tipc_msg *hdr = l->pmsg; + bool node_up = link_is_up(l->bc_rcvlink); - if (msg_typ == STATE_MSG) { - u32 next_sent = mod(l_ptr->next_out_no); + /* Don't send protocol message during reset or link failover */ + if (tipc_link_is_blocked(l)) + return; - if (!tipc_link_is_up(l_ptr)) + msg_set_type(hdr, mtyp); + msg_set_net_plane(hdr, l->net_plane); + msg_set_next_sent(hdr, l->snd_nxt); + msg_set_ack(hdr, l->rcv_nxt - 1); + 
msg_set_bcast_ack(hdr, l->bc_rcvlink->rcv_nxt - 1); + msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1); + msg_set_link_tolerance(hdr, tolerance); + msg_set_linkprio(hdr, priority); + msg_set_redundant_link(hdr, node_up); + msg_set_seq_gap(hdr, 0); + + /* Compatibility: created msg must not be in sequence with pkt flow */ + msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2); + + if (mtyp == STATE_MSG) { + if (!tipc_link_is_up(l)) return; - if (skb_queue_len(&l_ptr->backlogq)) - next_sent = buf_seqno(skb_peek(&l_ptr->backlogq)); - msg_set_next_sent(msg, next_sent); - if (!skb_queue_empty(&l_ptr->deferdq)) { - u32 rec = buf_seqno(skb_peek(&l_ptr->deferdq)); - gap = mod(rec - mod(l_ptr->next_in_no)); + + /* Override rcvgap if there are packets in deferred queue */ + if (!skb_queue_empty(&l->deferdq)) + rcvgap = buf_seqno(skb_peek(&l->deferdq)) - l->rcv_nxt; + if (rcvgap) { + msg_set_seq_gap(hdr, rcvgap); + l->stats.sent_nacks++; } - msg_set_seq_gap(msg, gap); - if (gap) - l_ptr->stats.sent_nacks++; - msg_set_link_tolerance(msg, tolerance); - msg_set_linkprio(msg, priority); - msg_set_max_pkt(msg, l_ptr->mtu); - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); - msg_set_probe(msg, probe_msg != 0); - if (probe_msg) - l_ptr->stats.sent_probes++; - l_ptr->stats.sent_states++; - } else { /* RESET_MSG or ACTIVATE_MSG */ - msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1)); - msg_set_seq_gap(msg, 0); - msg_set_next_sent(msg, 1); - msg_set_probe(msg, 0); - msg_set_link_tolerance(msg, l_ptr->tolerance); - msg_set_linkprio(msg, l_ptr->priority); - msg_set_max_pkt(msg, l_ptr->advertised_mtu); + msg_set_probe(hdr, probe); + if (probe) + l->stats.sent_probes++; + l->stats.sent_states++; + l->rcv_unacked = 0; + } else { + /* RESET_MSG or ACTIVATE_MSG */ + msg_set_max_pkt(hdr, l->advertised_mtu); + msg_set_ack(hdr, l->rcv_nxt - 1); + msg_set_next_sent(hdr, 1); } + skb = tipc_buf_acquire(msg_size(hdr)); + if (!skb) + return; + skb_copy_to_linear_data(skb, hdr, msg_size(hdr)); + 
skb->priority = TC_PRIO_CONTROL; + __skb_queue_tail(xmitq, skb); +} + +/* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets + * with contents of the link's transmit and backlog queues. + */ +void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, + int mtyp, struct sk_buff_head *xmitq) +{ + struct sk_buff *skb, *tnlskb; + struct tipc_msg *hdr, tnlhdr; + struct sk_buff_head *queue = &l->transmq; + struct sk_buff_head tmpxq, tnlq; + u16 pktlen, pktcnt, seqno = l->snd_nxt; - r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr)); - msg_set_redundant_link(msg, r_flag); - msg_set_linkprio(msg, l_ptr->priority); - msg_set_size(msg, msg_size); + if (!tnl) + return; - msg_set_seqno(msg, mod(l_ptr->next_out_no + (0xffff/2))); + skb_queue_head_init(&tnlq); + skb_queue_head_init(&tmpxq); - buf = tipc_buf_acquire(msg_size); - if (!buf) + /* At least one packet required for safe algorithm => add dummy */ + skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, + BASIC_H_SIZE, 0, l->addr, link_own_addr(l), + 0, 0, TIPC_ERR_NO_PORT); + if (!skb) { + pr_warn("%sunable to create tunnel packet\n", link_co_err); return; + } + skb_queue_tail(&tnlq, skb); + tipc_link_xmit(l, &tnlq, &tmpxq); + __skb_queue_purge(&tmpxq); + + /* Initialize reusable tunnel packet header */ + tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL, + mtyp, INT_H_SIZE, l->addr); + pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq); + msg_set_msgcnt(&tnlhdr, pktcnt); + msg_set_bearer_id(&tnlhdr, l->peer_bearer_id); +tnl: + /* Wrap each packet into a tunnel packet */ + skb_queue_walk(queue, skb) { + hdr = buf_msg(skb); + if (queue == &l->backlogq) + msg_set_seqno(hdr, seqno++); + pktlen = msg_size(hdr); + msg_set_size(&tnlhdr, pktlen + INT_H_SIZE); + tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE); + if (!tnlskb) { + pr_warn("%sunable to send packet\n", link_co_err); + return; + } + skb_copy_to_linear_data(tnlskb, &tnlhdr, INT_H_SIZE); + 
skb_copy_to_linear_data_offset(tnlskb, INT_H_SIZE, hdr, pktlen); + __skb_queue_tail(&tnlq, tnlskb); + } + if (queue != &l->backlogq) { + queue = &l->backlogq; + goto tnl; + } + + tipc_link_xmit(tnl, &tnlq, xmitq); - skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); - buf->priority = TC_PRIO_CONTROL; - tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, buf, - &l_ptr->media_addr); - l_ptr->rcv_unacked = 0; - kfree_skb(buf); + if (mtyp == FAILOVER_MSG) { + tnl->drop_point = l->rcv_nxt; + tnl->failover_reasm_skb = l->reasm_buf; + l->reasm_buf = NULL; + } } -/* - * Receive protocol message : +/* tipc_link_proto_rcv(): receive link level protocol message : * Note that network plane id propagates through the network, and may - * change at any time. The node with lowest address rules + * change at any time. The node with lowest numerical id determines + * network plane */ -static void tipc_link_proto_rcv(struct tipc_link *l_ptr, - struct sk_buff *buf) +static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq) { - u32 rec_gap = 0; - u32 msg_tol; - struct tipc_msg *msg = buf_msg(buf); + struct tipc_msg *hdr = buf_msg(skb); + u16 rcvgap = 0; + u16 ack = msg_ack(hdr); + u16 gap = msg_seq_gap(hdr); + u16 peers_snd_nxt = msg_next_sent(hdr); + u16 peers_tol = msg_link_tolerance(hdr); + u16 peers_prio = msg_linkprio(hdr); + u16 rcv_nxt = l->rcv_nxt; + int mtyp = msg_type(hdr); + char *if_name; + int rc = 0; - if (l_ptr->flags & LINK_FAILINGOVER) + if (tipc_link_is_blocked(l) || !xmitq) goto exit; - if (l_ptr->net_plane != msg_net_plane(msg)) - if (link_own_addr(l_ptr) > msg_prevnode(msg)) - l_ptr->net_plane = msg_net_plane(msg); - - switch (msg_type(msg)) { + if (link_own_addr(l) > msg_prevnode(hdr)) + l->net_plane = msg_net_plane(hdr); + switch (mtyp) { case RESET_MSG: - if (!link_working_unknown(l_ptr) && - (l_ptr->peer_session != INVALID_SESSION)) { - if (less_eq(msg_session(msg), l_ptr->peer_session)) - break; /* 
duplicate or old reset: ignore */ - } - - if (!msg_redundant_link(msg) && (link_working_working(l_ptr) || - link_working_unknown(l_ptr))) { - /* - * peer has lost contact -- don't allow peer's links - * to reactivate before we recognize loss & clean up - */ - l_ptr->owner->action_flags |= TIPC_WAIT_OWN_LINKS_DOWN; - } - - link_state_event(l_ptr, RESET_MSG); + /* Ignore duplicate RESET with old session number */ + if ((less_eq(msg_session(hdr), l->peer_session)) && + (l->peer_session != WILDCARD_SESSION)) + break; /* fall thru' */ + case ACTIVATE_MSG: - /* Update link settings according other endpoint's values */ - strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg)); - msg_tol = msg_link_tolerance(msg); - if (msg_tol > l_ptr->tolerance) - link_set_supervision_props(l_ptr, msg_tol); + /* Complete own link name with peer's interface name */ + if_name = strrchr(l->name, ':') + 1; + if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME) + break; + if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) + break; + strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME); - if (msg_linkprio(msg) > l_ptr->priority) - l_ptr->priority = msg_linkprio(msg); + /* Update own tolerance if peer indicates a non-zero value */ + if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) + l->tolerance = peers_tol; - if (l_ptr->mtu > msg_max_pkt(msg)) - l_ptr->mtu = msg_max_pkt(msg); + /* Update own priority if peer's priority is higher */ + if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) + l->priority = peers_prio; - /* Synchronize broadcast link info, if not done previously */ - if (!tipc_node_is_up(l_ptr->owner)) { - l_ptr->owner->bclink.last_sent = - l_ptr->owner->bclink.last_in = - msg_last_bcast(msg); - l_ptr->owner->bclink.oos_state = 0; - } + /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ + if ((mtyp == RESET_MSG) || !link_is_up(l)) + rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); - l_ptr->peer_session = msg_session(msg); - 
l_ptr->peer_bearer_id = msg_bearer_id(msg); + /* ACTIVATE_MSG takes up link if it was already locally reset */ + if ((mtyp == ACTIVATE_MSG) && (l->state == LINK_ESTABLISHING)) + rc = TIPC_LINK_UP_EVT; - if (msg_type(msg) == ACTIVATE_MSG) - link_state_event(l_ptr, ACTIVATE_MSG); + l->peer_session = msg_session(hdr); + l->peer_bearer_id = msg_bearer_id(hdr); + if (l->mtu > msg_max_pkt(hdr)) + l->mtu = msg_max_pkt(hdr); break; + case STATE_MSG: - msg_tol = msg_link_tolerance(msg); - if (msg_tol) - link_set_supervision_props(l_ptr, msg_tol); - - if (msg_linkprio(msg) && - (msg_linkprio(msg) != l_ptr->priority)) { - pr_debug("%s<%s>, priority change %u->%u\n", - link_rst_msg, l_ptr->name, - l_ptr->priority, msg_linkprio(msg)); - l_ptr->priority = msg_linkprio(msg); - tipc_link_reset(l_ptr); /* Enforce change to take effect */ - break; - } + /* Update own tolerance if peer indicates a non-zero value */ + if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) + l->tolerance = peers_tol; - /* Record reception; force mismatch at next timeout: */ - l_ptr->checkpoint--; + l->silent_intv_cnt = 0; + l->stats.recv_states++; + if (msg_probe(hdr)) + l->stats.recv_probes++; - link_state_event(l_ptr, TRAFFIC_MSG_EVT); - l_ptr->stats.recv_states++; - if (link_reset_unknown(l_ptr)) + if (!link_is_up(l)) { + if (l->state == LINK_ESTABLISHING) + rc = TIPC_LINK_UP_EVT; break; - - if (less_eq(mod(l_ptr->next_in_no), msg_next_sent(msg))) { - rec_gap = mod(msg_next_sent(msg) - - mod(l_ptr->next_in_no)); } - if (msg_probe(msg)) - l_ptr->stats.recv_probes++; - - /* Protocol message before retransmits, reduce loss risk */ - if (l_ptr->owner->bclink.recv_permitted) - tipc_bclink_update_link_state(l_ptr->owner, - msg_last_bcast(msg)); - - if (rec_gap || (msg_probe(msg))) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, - rec_gap, 0, 0); + /* Send NACK if peer has sent pkts we haven't received yet */ + if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) + rcvgap = peers_snd_nxt - 
l->rcv_nxt; + if (rcvgap || (msg_probe(hdr))) + tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap, + 0, 0, xmitq); + tipc_link_release_pkts(l, ack); + + /* If NACK, retransmit will now start at right position */ + if (gap) { + rc = tipc_link_retrans(l, ack + 1, ack + gap, xmitq); + l->stats.recv_nacks++; } - if (msg_seq_gap(msg)) { - l_ptr->stats.recv_nacks++; - tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq), - msg_seq_gap(msg)); - } - break; + + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); } exit: - kfree_skb(buf); + kfree_skb(skb); + return rc; } - -/* tipc_link_tunnel_xmit(): Tunnel one packet via a link belonging to - * a different bearer. Owner node is locked. +/* tipc_link_build_bc_proto_msg() - create broadcast protocol message */ -static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr, - struct tipc_msg *tunnel_hdr, - struct tipc_msg *msg, - u32 selector) +static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast, + u16 peers_snd_nxt, + struct sk_buff_head *xmitq) { - struct tipc_link *tunnel; struct sk_buff *skb; - u32 length = msg_size(msg); + struct tipc_msg *hdr; + struct sk_buff *dfrd_skb = skb_peek(&l->deferdq); + u16 ack = l->rcv_nxt - 1; + u16 gap_to = peers_snd_nxt - 1; - tunnel = l_ptr->owner->active_links[selector & 1]; - if (!tipc_link_is_up(tunnel)) { - pr_warn("%stunnel link no longer available\n", link_co_err); - return; - } - msg_set_size(tunnel_hdr, length + INT_H_SIZE); - skb = tipc_buf_acquire(length + INT_H_SIZE); - if (!skb) { - pr_warn("%sunable to send tunnel msg\n", link_co_err); - return; - } - skb_copy_to_linear_data(skb, tunnel_hdr, INT_H_SIZE); - skb_copy_to_linear_data_offset(skb, INT_H_SIZE, msg, length); - __tipc_link_xmit_skb(tunnel, skb); + skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, + 0, l->addr, link_own_addr(l), 0, 0, 0); + if (!skb) + return false; + hdr = buf_msg(skb); + msg_set_last_bcast(hdr, 
l->bc_sndlink->snd_nxt - 1); + msg_set_bcast_ack(hdr, ack); + msg_set_bcgap_after(hdr, ack); + if (dfrd_skb) + gap_to = buf_seqno(dfrd_skb) - 1; + msg_set_bcgap_to(hdr, gap_to); + msg_set_non_seq(hdr, bcast); + __skb_queue_tail(xmitq, skb); + return true; } +/* tipc_link_build_bc_init_msg() - synchronize broadcast link endpoints. + * + * Give a newly added peer node the sequence number where it should + * start receiving and acking broadcast packets. + */ +static void tipc_link_build_bc_init_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) +{ + struct sk_buff_head list; + + __skb_queue_head_init(&list); + if (!tipc_link_build_bc_proto_msg(l->bc_rcvlink, false, 0, &list)) + return; + tipc_link_xmit(l, &list, xmitq); +} -/* tipc_link_failover_send_queue(): A link has gone down, but a second - * link is still active. We can do failover. Tunnel the failing link's - * whole send queue via the remaining link. This way, we don't lose - * any packets, and sequence order is preserved for subsequent traffic - * sent over the remaining link. Owner node is locked. 
+/* tipc_link_bc_init_rcv - receive initial broadcast synch data from peer */ -void tipc_link_failover_send_queue(struct tipc_link *l_ptr) +void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr) { - int msgcount; - struct tipc_link *tunnel = l_ptr->owner->active_links[0]; - struct tipc_msg tunnel_hdr; - struct sk_buff *skb; - int split_bundles; + int mtyp = msg_type(hdr); + u16 peers_snd_nxt = msg_bc_snd_nxt(hdr); - if (!tunnel) + if (link_is_up(l)) return; - tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, TUNNEL_PROTOCOL, - FAILOVER_MSG, INT_H_SIZE, l_ptr->addr); - skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq); - tipc_link_purge_backlog(l_ptr); - msgcount = skb_queue_len(&l_ptr->transmq); - msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); - msg_set_msgcnt(&tunnel_hdr, msgcount); - - if (skb_queue_empty(&l_ptr->transmq)) { - skb = tipc_buf_acquire(INT_H_SIZE); - if (skb) { - skb_copy_to_linear_data(skb, &tunnel_hdr, INT_H_SIZE); - msg_set_size(&tunnel_hdr, INT_H_SIZE); - __tipc_link_xmit_skb(tunnel, skb); - } else { - pr_warn("%sunable to send changeover msg\n", - link_co_err); - } + if (msg_user(hdr) == BCAST_PROTOCOL) { + l->rcv_nxt = peers_snd_nxt; + l->state = LINK_ESTABLISHED; return; } - split_bundles = (l_ptr->owner->active_links[0] != - l_ptr->owner->active_links[1]); - - skb_queue_walk(&l_ptr->transmq, skb) { - struct tipc_msg *msg = buf_msg(skb); + if (l->peer_caps & TIPC_BCAST_SYNCH) + return; - if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) { - struct tipc_msg *m = msg_get_wrapped(msg); - unchar *pos = (unchar *)m; + if (msg_peer_node_is_up(hdr)) + return; - msgcount = msg_msgcnt(msg); - while (msgcount--) { - msg_set_seqno(m, msg_seqno(msg)); - tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, m, - msg_link_selector(m)); - pos += align(msg_size(m)); - m = (struct tipc_msg *)pos; - } - } else { - tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, msg, - msg_link_selector(msg)); - } - } + /* Compatibility: accept older, less 
safe initial synch data */ + if ((mtyp == RESET_MSG) || (mtyp == ACTIVATE_MSG)) + l->rcv_nxt = peers_snd_nxt; } -/* tipc_link_dup_queue_xmit(): A second link has become active. Tunnel a - * duplicate of the first link's send queue via the new link. This way, we - * are guaranteed that currently queued packets from a socket are delivered - * before future traffic from the same socket, even if this is using the - * new link. The last arriving copy of each duplicate packet is dropped at - * the receiving end by the regular protocol check, so packet cardinality - * and sequence order is preserved per sender/receiver socket pair. - * Owner node is locked. +/* tipc_link_bc_sync_rcv - update rcv link according to peer's send state */ -void tipc_link_dup_queue_xmit(struct tipc_link *link, - struct tipc_link *tnl) +void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, + struct sk_buff_head *xmitq) { - struct sk_buff *skb; - struct tipc_msg tnl_hdr; - struct sk_buff_head *queue = &link->transmq; - int mcnt; + u16 peers_snd_nxt = msg_bc_snd_nxt(hdr); - tipc_msg_init(link_own_addr(link), &tnl_hdr, TUNNEL_PROTOCOL, - SYNCH_MSG, INT_H_SIZE, link->addr); - mcnt = skb_queue_len(&link->transmq) + skb_queue_len(&link->backlogq); - msg_set_msgcnt(&tnl_hdr, mcnt); - msg_set_bearer_id(&tnl_hdr, link->peer_bearer_id); + if (!link_is_up(l)) + return; -tunnel_queue: - skb_queue_walk(queue, skb) { - struct sk_buff *outskb; - struct tipc_msg *msg = buf_msg(skb); - u32 len = msg_size(msg); - - msg_set_ack(msg, mod(link->next_in_no - 1)); - msg_set_bcast_ack(msg, link->owner->bclink.last_in); - msg_set_size(&tnl_hdr, len + INT_H_SIZE); - outskb = tipc_buf_acquire(len + INT_H_SIZE); - if (outskb == NULL) { - pr_warn("%sunable to send duplicate msg\n", - link_co_err); - return; - } - skb_copy_to_linear_data(outskb, &tnl_hdr, INT_H_SIZE); - skb_copy_to_linear_data_offset(outskb, INT_H_SIZE, - skb->data, len); - __tipc_link_xmit_skb(tnl, outskb); - if (!tipc_link_is_up(link)) - 
return; - } - if (queue == &link->backlogq) + if (!msg_peer_node_is_up(hdr)) return; - queue = &link->backlogq; - goto tunnel_queue; -} -/* tipc_link_failover_rcv(): Receive a tunnelled FAILOVER_MSG packet - * Owner node is locked. - */ -static bool tipc_link_failover_rcv(struct tipc_link *link, - struct sk_buff **skb) -{ - struct tipc_msg *msg = buf_msg(*skb); - struct sk_buff *iskb = NULL; - struct tipc_link *pl = NULL; - int bearer_id = msg_bearer_id(msg); - int pos = 0; + l->bc_peer_is_up = true; - if (msg_type(msg) != FAILOVER_MSG) { - pr_warn("%sunknown tunnel pkt received\n", link_co_err); - goto exit; + /* Ignore if peers_snd_nxt goes beyond receive window */ + if (more(peers_snd_nxt, l->rcv_nxt + l->window)) + return; + + if (!more(peers_snd_nxt, l->rcv_nxt)) { + l->nack_state = BC_NACK_SND_CONDITIONAL; + return; } - if (bearer_id >= MAX_BEARERS) - goto exit; - if (bearer_id == link->bearer_id) - goto exit; + /* Don't NACK if one was recently sent or peeked */ + if (l->nack_state == BC_NACK_SND_SUPPRESS) { + l->nack_state = BC_NACK_SND_UNCONDITIONAL; + return; + } - pl = link->owner->links[bearer_id]; - if (pl && tipc_link_is_up(pl)) - tipc_link_reset(pl); + /* Conditionally delay NACK sending until next synch rcv */ + if (l->nack_state == BC_NACK_SND_CONDITIONAL) { + l->nack_state = BC_NACK_SND_UNCONDITIONAL; + if ((peers_snd_nxt - l->rcv_nxt) < TIPC_MIN_LINK_WIN) + return; + } - if (link->failover_pkts == FIRST_FAILOVER) - link->failover_pkts = msg_msgcnt(msg); + /* Send NACK now but suppress next one */ + tipc_link_build_bc_proto_msg(l, true, peers_snd_nxt, xmitq); + l->nack_state = BC_NACK_SND_SUPPRESS; +} - /* Should we expect an inner packet? 
*/ - if (!link->failover_pkts) - goto exit; +void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb, *tmp; + struct tipc_link *snd_l = l->bc_sndlink; - if (!tipc_msg_extract(*skb, &iskb, &pos)) { - pr_warn("%sno inner failover pkt\n", link_co_err); - *skb = NULL; - goto exit; - } - link->failover_pkts--; - *skb = NULL; + if (!link_is_up(l) || !l->bc_peer_is_up) + return; - /* Was this packet already delivered? */ - if (less(buf_seqno(iskb), link->failover_checkpt)) { - kfree_skb(iskb); - iskb = NULL; - goto exit; + if (!more(acked, l->acked)) + return; + + /* Skip over packets peer has already acked */ + skb_queue_walk(&snd_l->transmq, skb) { + if (more(buf_seqno(skb), l->acked)) + break; } - if (msg_user(buf_msg(iskb)) == MSG_FRAGMENTER) { - link->stats.recv_fragments++; - tipc_buf_append(&link->failover_skb, &iskb); + + /* Update/release the packets peer is acking now */ + skb_queue_walk_from_safe(&snd_l->transmq, skb, tmp) { + if (more(buf_seqno(skb), acked)) + break; + if (!--TIPC_SKB_CB(skb)->ackers) { + __skb_unlink(skb, &snd_l->transmq); + kfree_skb(skb); + } } -exit: - if (!link->failover_pkts && pl) - pl->flags &= ~LINK_FAILINGOVER; - kfree_skb(*skb); - *skb = iskb; - return *skb; + l->acked = acked; + tipc_link_advance_backlog(snd_l, xmitq); + if (unlikely(!skb_queue_empty(&snd_l->wakeupq))) + link_prepare_wakeup(snd_l); } -static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) +/* tipc_link_bc_nack_rcv(): receive broadcast nack message + */ +int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq) { - unsigned long intv = ((tol / 4) > 500) ? 
500 : tol / 4; + struct tipc_msg *hdr = buf_msg(skb); + u32 dnode = msg_destnode(hdr); + int mtyp = msg_type(hdr); + u16 acked = msg_bcast_ack(hdr); + u16 from = acked + 1; + u16 to = msg_bcgap_to(hdr); + u16 peers_snd_nxt = to + 1; + int rc = 0; + + kfree_skb(skb); + + if (!tipc_link_is_up(l) || !l->bc_peer_is_up) + return 0; - if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL)) - return; + if (mtyp != STATE_MSG) + return 0; + + if (dnode == link_own_addr(l)) { + tipc_link_bc_ack_rcv(l, acked, xmitq); + rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq); + l->stats.recv_nacks++; + return rc; + } + + /* Msg for other node => suppress own NACK at next sync if applicable */ + if (more(peers_snd_nxt, l->rcv_nxt) && !less(l->rcv_nxt, from)) + l->nack_state = BC_NACK_SND_SUPPRESS; - l_ptr->tolerance = tol; - l_ptr->cont_intv = msecs_to_jiffies(intv); - l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->cont_intv) / 4); + return 0; } void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) @@ -1780,7 +1549,7 @@ static struct tipc_node *tipc_link_find_owner(struct net *net, list_for_each_entry_rcu(n_ptr, &tn->node_list, list) { tipc_node_lock(n_ptr); for (i = 0; i < MAX_BEARERS; i++) { - l_ptr = n_ptr->links[i]; + l_ptr = n_ptr->links[i].link; if (l_ptr && !strcmp(l_ptr->name, link_name)) { *bearer_id = i; found_node = n_ptr; @@ -1803,31 +1572,20 @@ static struct tipc_node *tipc_link_find_owner(struct net *net, static void link_reset_statistics(struct tipc_link *l_ptr) { memset(&l_ptr->stats, 0, sizeof(l_ptr->stats)); - l_ptr->stats.sent_info = l_ptr->next_out_no; - l_ptr->stats.recv_info = l_ptr->next_in_no; + l_ptr->stats.sent_info = l_ptr->snd_nxt; + l_ptr->stats.recv_info = l_ptr->rcv_nxt; } -static void link_print(struct tipc_link *l_ptr, const char *str) +static void link_print(struct tipc_link *l, const char *str) { - struct tipc_net *tn = net_generic(l_ptr->owner->net, tipc_net_id); - struct tipc_bearer *b_ptr; - - rcu_read_lock(); - b_ptr = 
rcu_dereference_rtnl(tn->bearer_list[l_ptr->bearer_id]); - if (b_ptr) - pr_info("%s Link %x<%s>:", str, l_ptr->addr, b_ptr->name); - rcu_read_unlock(); - - if (link_working_unknown(l_ptr)) - pr_cont(":WU\n"); - else if (link_reset_reset(l_ptr)) - pr_cont(":RR\n"); - else if (link_reset_unknown(l_ptr)) - pr_cont(":RU\n"); - else if (link_working_working(l_ptr)) - pr_cont(":WW\n"); - else - pr_cont("\n"); + struct sk_buff *hskb = skb_peek(&l->transmq); + u16 head = hskb ? msg_seqno(buf_msg(hskb)) : l->snd_nxt - 1; + u16 tail = l->snd_nxt - 1; + + pr_info("%s Link <%s> state %x\n", str, l->name, l->state); + pr_info("XMTQ: %u [%u-%u], BKLGQ: %u, SNDNX: %u, RCVNX: %u\n", + skb_queue_len(&l->transmq), head, tail, + skb_queue_len(&l->backlogq), l->snd_nxt, l->rcv_nxt); } /* Parse and validate nested (link) properties valid for media, bearer and link @@ -1893,13 +1651,16 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) name = nla_data(attrs[TIPC_NLA_LINK_NAME]); + if (strcmp(name, tipc_bclink_name) == 0) + return tipc_nl_bc_link_set(net, attrs); + node = tipc_link_find_owner(net, name, &bearer_id); if (!node) return -EINVAL; tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; @@ -1919,7 +1680,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); - link_set_supervision_props(link, tol); + link->tolerance = tol; tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0); } if (props[TIPC_NLA_PROP_PRIO]) { @@ -2034,15 +1795,15 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->next_in_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->rcv_nxt)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->next_out_no)) + if 
(nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->snd_nxt)) goto attr_msg_full; if (tipc_link_is_up(link)) if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP)) goto attr_msg_full; - if (tipc_link_is_active(link)) + if (link->active) if (nla_put_flag(msg->skb, TIPC_NLA_LINK_ACTIVE)) goto attr_msg_full; @@ -2089,10 +1850,11 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; - if (!node->links[i]) + if (!node->links[i].link) continue; - err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI); + err = __tipc_nl_add_link(net, msg, + node->links[i].link, NLM_F_MULTI); if (err) return err; } @@ -2175,50 +1937,53 @@ out: int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct sk_buff *ans_skb; struct tipc_nl_msg msg; - struct tipc_link *link; - struct tipc_node *node; char *name; - int bearer_id; int err; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + if (!info->attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; - name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]); - node = tipc_link_find_owner(net, name, &bearer_id); - if (!node) - return -EINVAL; - ans_skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); - if (!ans_skb) + msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg.skb) return -ENOMEM; - msg.skb = ans_skb; - msg.portid = info->snd_portid; - msg.seq = info->snd_seq; - - tipc_node_lock(node); - link = node->links[bearer_id]; - if (!link) { - err = -EINVAL; - goto err_out; - } - - err = __tipc_nl_add_link(net, &msg, link, 0); - if (err) - goto err_out; + if (strcmp(name, tipc_bclink_name) == 0) { + err = tipc_nl_add_bc_link(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + } else { + int bearer_id; + struct tipc_node *node; + struct tipc_link *link; - tipc_node_unlock(node); + node = tipc_link_find_owner(net, name, &bearer_id); + if (!node) + return -EINVAL; - return genlmsg_reply(ans_skb, info); 
+ tipc_node_lock(node); + link = node->links[bearer_id].link; + if (!link) { + tipc_node_unlock(node); + nlmsg_free(msg.skb); + return -EINVAL; + } -err_out: - tipc_node_unlock(node); - nlmsg_free(ans_skb); + err = __tipc_nl_add_link(net, &msg, link, 0); + tipc_node_unlock(node); + if (err) { + nlmsg_free(msg.skb); + return err; + } + } - return err; + return genlmsg_reply(msg.skb, info); } int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info) @@ -2258,7 +2023,7 @@ int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info) tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (!link) { tipc_node_unlock(node); return -EINVAL; diff --git a/kernel/net/tipc/link.h b/kernel/net/tipc/link.h index b5b4e3554..66d859b66 100644 --- a/kernel/net/tipc/link.h +++ b/kernel/net/tipc/link.h @@ -49,19 +49,26 @@ */ #define INVALID_LINK_SEQ 0x10000 -/* Link working states +/* Link FSM events: */ -#define WORKING_WORKING 560810u -#define WORKING_UNKNOWN 560811u -#define RESET_UNKNOWN 560812u -#define RESET_RESET 560813u +enum { + LINK_ESTABLISH_EVT = 0xec1ab1e, + LINK_PEER_RESET_EVT = 0x9eed0e, + LINK_FAILURE_EVT = 0xfa110e, + LINK_RESET_EVT = 0x10ca1d0e, + LINK_FAILOVER_BEGIN_EVT = 0xfa110bee, + LINK_FAILOVER_END_EVT = 0xfa110ede, + LINK_SYNCH_BEGIN_EVT = 0xc1ccbee, + LINK_SYNCH_END_EVT = 0xc1ccede +}; -/* Link endpoint execution states +/* Events returned from link at packet reception or at timeout */ -#define LINK_STARTED 0x0001 -#define LINK_STOPPED 0x0002 -#define LINK_SYNCHING 0x0004 -#define LINK_FAILINGOVER 0x0008 +enum { + TIPC_LINK_UP_EVT = 1, + TIPC_LINK_DOWN_EVT = (1 << 1), + TIPC_LINK_SND_BC_ACK = (1 << 2) +}; /* Starting value for maximum packet size negotiation on unicast links * (unless bearer MTU is less) @@ -104,33 +111,34 @@ struct tipc_stats { * @name: link name character string * @media_addr: media address to use when sending messages over link * @timer: link timer - * @owner: pointer to 
peer node + * @net: pointer to namespace struct * @refcnt: reference counter for permanent references (owner node & timer) - * @flags: execution state flags for link endpoint instance - * @checkpoint: reference point for triggering link continuity checking * @peer_session: link session # being used by peer end of link * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link * @tolerance: minimum link continuity loss needed to reset link [in ms] - * @cont_intv: link continuity testing interval + * @keepalive_intv: link keepalive timer interval * @abort_limit: # of unacknowledged continuity probes needed to reset link * @state: current state of link FSM - * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state + * @peer_caps: bitmap describing capabilities of peer node + * @silent_intv_cnt: # of timer intervals without any reception from peer * @proto_msg: template for control messages generated by link * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover - * @reset_checkpoint: seq # of last acknowledged message at time of link reset + * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset * @mtu: current maximum packet size for this link * @advertised_mtu: advertised own mtu when link is being established * @transmitq: queue for sent, non-acked messages * @backlogq: queue for messages waiting to be sent - * @next_out_no: next sequence number to use for outbound messages + * @snt_nxt: next sequence number to use for outbound messages * @last_retransmitted: sequence number of most recently retransmitted message * @stale_count: # of identical retransmit requests made by peer - * @next_in_no: next sequence number to expect for inbound messages + * 
@ackers: # of peers that needs to ack each packet before it can be released + * @acked: # last packet acked by a certain peer. Used for broadcast. + * @rcv_nxt: next sequence number to expect for inbound messages * @deferred_queue: deferred queue saved OOS b'cast message received from node * @unacked_window: # of inbound messages rx'd without ack'ing back to peer * @inputq: buffer queue for messages to be delivered upwards @@ -139,27 +147,26 @@ struct tipc_stats { * @wakeupq: linked list of wakeup msgs waiting for link congestion to abate * @long_msg_seq_no: next identifier to use for outbound fragmented messages * @reasm_buf: head of partially reassembled inbound message fragments + * @bc_rcvr: marks that this is a broadcast receiver link * @stats: collects statistics regarding link activity */ struct tipc_link { u32 addr; char name[TIPC_MAX_LINK_NAME]; - struct tipc_media_addr media_addr; - struct timer_list timer; - struct tipc_node *owner; - struct kref ref; + struct tipc_media_addr *media_addr; + struct net *net; /* Management and link supervision data */ - unsigned int flags; - u32 checkpoint; u32 peer_session; u32 peer_bearer_id; u32 bearer_id; u32 tolerance; - unsigned long cont_intv; + unsigned long keepalive_intv; u32 abort_limit; - int state; - u32 fsm_msg_cnt; + u32 state; + u16 peer_caps; + bool active; + u32 silent_intv_cnt; struct { unchar hdr[INT_H_SIZE]; unchar body[TIPC_MAX_IF_NAME]; @@ -167,12 +174,10 @@ struct tipc_link { struct tipc_msg *pmsg; u32 priority; char net_plane; - u16 synch_point; - /* Failover */ - u16 failover_pkts; - u16 failover_checkpt; - struct sk_buff *failover_skb; + /* Failover/synch */ + u16 drop_point; + struct sk_buff *failover_reasm_skb; /* Max packet negotiation */ u16 mtu; @@ -185,17 +190,17 @@ struct tipc_link { u16 len; u16 limit; } backlog[5]; - u32 next_out_no; - u32 window; - u32 last_retransmitted; + u16 snd_nxt; + u16 last_retransm; + u16 window; u32 stale_count; /* Reception */ - u32 next_in_no; + u16 rcv_nxt; 
u32 rcv_unacked; struct sk_buff_head deferdq; - struct sk_buff_head inputq; - struct sk_buff_head namedq; + struct sk_buff_head *inputq; + struct sk_buff_head *namedq; /* Congestion handling */ struct sk_buff_head wakeupq; @@ -203,109 +208,76 @@ struct tipc_link { /* Fragmentation/reassembly */ struct sk_buff *reasm_buf; + /* Broadcast */ + u16 ackers; + u16 acked; + struct tipc_link *bc_rcvlink; + struct tipc_link *bc_sndlink; + int nack_state; + bool bc_peer_is_up; + /* Statistics */ struct tipc_stats stats; }; -struct tipc_port; - -struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, - struct tipc_bearer *b_ptr, - const struct tipc_media_addr *media_addr); -void tipc_link_delete(struct tipc_link *link); -void tipc_link_delete_list(struct net *net, unsigned int bearer_id, - bool shutting_down); -void tipc_link_failover_send_queue(struct tipc_link *l_ptr); -void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, struct tipc_link *dest); +bool tipc_link_create(struct net *net, char *if_name, int bearer_id, + int tolerance, char net_plane, u32 mtu, int priority, + int window, u32 session, u32 ownnode, u32 peer, + u16 peer_caps, + struct tipc_link *bc_sndlink, + struct tipc_link *bc_rcvlink, + struct sk_buff_head *inputq, + struct sk_buff_head *namedq, + struct tipc_link **link); +bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, + int mtu, int window, u16 peer_caps, + struct sk_buff_head *inputq, + struct sk_buff_head *namedq, + struct tipc_link *bc_sndlink, + struct tipc_link **link); +void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, + int mtyp, struct sk_buff_head *xmitq); +void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); +int tipc_link_fsm_evt(struct tipc_link *l, int evt); void tipc_link_reset_fragments(struct tipc_link *l_ptr); -int tipc_link_is_up(struct tipc_link *l_ptr); -int tipc_link_is_active(struct tipc_link *l_ptr); -void tipc_link_purge_queues(struct tipc_link *l_ptr); -void 
tipc_link_reset_all(struct tipc_node *node); +bool tipc_link_is_up(struct tipc_link *l); +bool tipc_link_peer_is_down(struct tipc_link *l); +bool tipc_link_is_reset(struct tipc_link *l); +bool tipc_link_is_establishing(struct tipc_link *l); +bool tipc_link_is_synching(struct tipc_link *l); +bool tipc_link_is_failingover(struct tipc_link *l); +bool tipc_link_is_blocked(struct tipc_link *l); +void tipc_link_set_active(struct tipc_link *l, bool active); void tipc_link_reset(struct tipc_link *l_ptr); -void tipc_link_reset_list(struct net *net, unsigned int bearer_id); -int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, - u32 selector); -int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest, - u32 selector); -int __tipc_link_xmit(struct net *net, struct tipc_link *link, - struct sk_buff_head *list); -void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob, - u32 gap, u32 tolerance, u32 priority); -void tipc_link_push_packets(struct tipc_link *l_ptr); -u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *buf); -void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window); -void tipc_link_retransmit(struct tipc_link *l_ptr, - struct sk_buff *start, u32 retransmits); -struct sk_buff *tipc_skb_queue_next(const struct sk_buff_head *list, - const struct sk_buff *skb); +int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list, + struct sk_buff_head *xmitq); +void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info); int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); -void link_prepare_wakeup(struct tipc_link *l); - -/* - * Link sequence number manipulation routines (uses modulo 2**16 
arithmetic) - */ -static inline u32 buf_seqno(struct sk_buff *buf) -{ - return msg_seqno(buf_msg(buf)); -} - -static inline u32 mod(u32 x) -{ - return x & 0xffffu; -} - -static inline int less_eq(u32 left, u32 right) -{ - return mod(right - left) < 32768u; -} - -static inline int more(u32 left, u32 right) -{ - return !less_eq(left, right); -} - -static inline int less(u32 left, u32 right) -{ - return less_eq(left, right) && (mod(right) != mod(left)); -} - -static inline u32 lesser(u32 left, u32 right) -{ - return less_eq(left, right) ? left : right; -} - -static inline u32 link_own_addr(struct tipc_link *l) -{ - return msg_prevnode(l->pmsg); -} - -/* - * Link status checking routines - */ -static inline int link_working_working(struct tipc_link *l_ptr) -{ - return l_ptr->state == WORKING_WORKING; -} - -static inline int link_working_unknown(struct tipc_link *l_ptr) -{ - return l_ptr->state == WORKING_UNKNOWN; -} - -static inline int link_reset_unknown(struct tipc_link *l_ptr) -{ - return l_ptr->state == RESET_UNKNOWN; -} - -static inline int link_reset_reset(struct tipc_link *l_ptr) -{ - return l_ptr->state == RESET_RESET; -} - +int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq); +int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq); +int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); +void tipc_link_add_bc_peer(struct tipc_link *snd_l, + struct tipc_link *uc_l, + struct sk_buff_head *xmitq); +void tipc_link_remove_bc_peer(struct tipc_link *snd_l, + struct tipc_link *rcv_l, + struct sk_buff_head *xmitq); +int tipc_link_bc_peers(struct tipc_link *l); +void tipc_link_set_mtu(struct tipc_link *l, int mtu); +int tipc_link_mtu(struct tipc_link *l); +void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, + struct sk_buff_head *xmitq); +void tipc_link_build_bc_sync_msg(struct tipc_link *l, + struct sk_buff_head *xmitq); +void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg 
*hdr); +void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, + struct sk_buff_head *xmitq); +int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq); #endif diff --git a/kernel/net/tipc/msg.c b/kernel/net/tipc/msg.c index c3e96e815..8740930f0 100644 --- a/kernel/net/tipc/msg.c +++ b/kernel/net/tipc/msg.c @@ -121,7 +121,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) { struct sk_buff *head = *headbuf; struct sk_buff *frag = *buf; - struct sk_buff *tail; + struct sk_buff *tail = NULL; struct tipc_msg *msg; u32 fragid; int delta; @@ -141,9 +141,15 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (unlikely(skb_unclone(frag, GFP_ATOMIC))) goto err; head = *headbuf = frag; - skb_frag_list_init(head); - TIPC_SKB_CB(head)->tail = NULL; *buf = NULL; + TIPC_SKB_CB(head)->tail = NULL; + if (skb_is_nonlinear(head)) { + skb_walk_frags(head, tail) { + TIPC_SKB_CB(head)->tail = tail; + } + } else { + skb_frag_list_init(head); + } return 0; } @@ -176,7 +182,6 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) *buf = NULL; return 0; err: - pr_warn_ratelimited("Unable to build fragment list\n"); kfree_skb(*buf); kfree_skb(*headbuf); *buf = *headbuf = NULL; @@ -331,16 +336,15 @@ error: /** * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one - * @bskb: the buffer to append to ("bundle") - * @skb: buffer to be appended + * @skb: the buffer to append to ("bundle") + * @msg: message to be appended * @mtu: max allowable size for the bundle buffer * Consumes buffer if successful * Returns true if bundling could be performed, otherwise false */ -bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) +bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) { struct tipc_msg *bmsg; - struct tipc_msg *msg = buf_msg(skb); unsigned int bsz; unsigned int msz = msg_size(msg); u32 start, pad; @@ -348,9 +352,9 @@ 
bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) if (likely(msg_user(msg) == MSG_FRAGMENTER)) return false; - if (!bskb) + if (!skb) return false; - bmsg = buf_msg(bskb); + bmsg = buf_msg(skb); bsz = msg_size(bmsg); start = align(bsz); pad = start - bsz; @@ -359,18 +363,20 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) return false; if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) return false; - if (likely(msg_user(bmsg) != MSG_BUNDLER)) + if (unlikely(msg_user(bmsg) != MSG_BUNDLER)) return false; - if (unlikely(skb_tailroom(bskb) < (pad + msz))) + if (unlikely(skb_tailroom(skb) < (pad + msz))) return false; if (unlikely(max < (start + msz))) return false; + if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) && + (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) + return false; - skb_put(bskb, pad + msz); - skb_copy_to_linear_data_offset(bskb, start, skb->data, msz); + skb_put(skb, pad + msz); + skb_copy_to_linear_data_offset(skb, start, msg, msz); msg_set_size(bmsg, start + msz); msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); - kfree_skb(skb); return true; } @@ -416,18 +422,18 @@ none: /** * tipc_msg_make_bundle(): Create bundle buf and append message to its tail - * @list: the buffer chain - * @skb: buffer to be appended and replaced + * @list: the buffer chain, where head is the buffer to replace/append + * @skb: buffer to be created, appended to and returned in case of success + * @msg: message to be appended * @mtu: max allowable size for the bundle buffer, inclusive header * @dnode: destination node for message. 
(Not always present in header) - * Replaces buffer if successful * Returns true if success, otherwise false */ -bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) +bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, + u32 mtu, u32 dnode) { - struct sk_buff *bskb; + struct sk_buff *_skb; struct tipc_msg *bmsg; - struct tipc_msg *msg = buf_msg(*skb); u32 msz = msg_size(msg); u32 max = mtu - INT_H_SIZE; @@ -440,78 +446,94 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) if (msz > (max / 2)) return false; - bskb = tipc_buf_acquire(max); - if (!bskb) + _skb = tipc_buf_acquire(max); + if (!_skb) return false; - skb_trim(bskb, INT_H_SIZE); - bmsg = buf_msg(bskb); + skb_trim(_skb, INT_H_SIZE); + bmsg = buf_msg(_skb); tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, INT_H_SIZE, dnode); + if (msg_isdata(msg)) + msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE); + else + msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE); msg_set_seqno(bmsg, msg_seqno(msg)); msg_set_ack(bmsg, msg_ack(msg)); msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); - tipc_msg_bundle(bskb, *skb, mtu); - *skb = bskb; + tipc_msg_bundle(_skb, msg, mtu); + *skb = _skb; return true; } /** * tipc_msg_reverse(): swap source and destination addresses and add error code - * @buf: buffer containing message to be reversed - * @dnode: return value: node where to send message after reversal - * @err: error code to be set in message - * Consumes buffer if failure + * @own_node: originating node id for reversed message + * @skb: buffer containing message to be reversed; may be replaced. 
+ * @err: error code to be set in message, if any + * Consumes buffer at failure * Returns true if success, otherwise false */ -bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, - int err) +bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) { - struct tipc_msg *msg = buf_msg(buf); + struct sk_buff *_skb = *skb; + struct tipc_msg *hdr = buf_msg(_skb); struct tipc_msg ohdr; - uint rdsz = min_t(uint, msg_data_sz(msg), MAX_FORWARD_SIZE); + int dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE); - if (skb_linearize(buf)) + if (skb_linearize(_skb)) goto exit; - msg = buf_msg(buf); - if (msg_dest_droppable(msg)) + hdr = buf_msg(_skb); + if (msg_dest_droppable(hdr)) goto exit; - if (msg_errcode(msg)) + if (msg_errcode(hdr)) goto exit; - memcpy(&ohdr, msg, msg_hdr_sz(msg)); - msg_set_errcode(msg, err); - msg_set_origport(msg, msg_destport(&ohdr)); - msg_set_destport(msg, msg_origport(&ohdr)); - msg_set_prevnode(msg, own_addr); - if (!msg_short(msg)) { - msg_set_orignode(msg, msg_destnode(&ohdr)); - msg_set_destnode(msg, msg_orignode(&ohdr)); + + /* Take a copy of original header before altering message */ + memcpy(&ohdr, hdr, msg_hdr_sz(hdr)); + + /* Never return SHORT header; expand by replacing buffer if necessary */ + if (msg_short(hdr)) { + *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen); + if (!*skb) + goto exit; + memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen); + kfree_skb(_skb); + _skb = *skb; + hdr = buf_msg(_skb); + memcpy(hdr, &ohdr, BASIC_H_SIZE); + msg_set_hdr_sz(hdr, BASIC_H_SIZE); } - msg_set_size(msg, msg_hdr_sz(msg) + rdsz); - skb_trim(buf, msg_size(msg)); - skb_orphan(buf); - *dnode = msg_orignode(&ohdr); + + /* Now reverse the concerned fields */ + msg_set_errcode(hdr, err); + msg_set_origport(hdr, msg_destport(&ohdr)); + msg_set_destport(hdr, msg_origport(&ohdr)); + msg_set_destnode(hdr, msg_prevnode(&ohdr)); + msg_set_prevnode(hdr, own_node); + msg_set_orignode(hdr, own_node); + msg_set_size(hdr, 
msg_hdr_sz(hdr) + dlen); + skb_trim(_skb, msg_size(hdr)); + skb_orphan(_skb); return true; exit: - kfree_skb(buf); - *dnode = 0; + kfree_skb(_skb); + *skb = NULL; return false; } /** * tipc_msg_lookup_dest(): try to find new destination for named message * @skb: the buffer containing the message. - * @dnode: return value: next-hop node, if destination found - * @err: return value: error code to use, if message to be rejected + * @err: error code to be used by caller if lookup fails * Does not consume buffer * Returns true if a destination is found, false otherwise */ -bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, - u32 *dnode, int *err) +bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) { struct tipc_msg *msg = buf_msg(skb); - u32 dport; - u32 own_addr = tipc_own_addr(net); + u32 dport, dnode; + u32 onode = tipc_own_addr(net); if (!msg_isdata(msg)) return false; @@ -522,17 +544,18 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, *err = -TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; + msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; - *dnode = addr_domain(net, msg_lookup_scope(msg)); + dnode = addr_domain(net, msg_lookup_scope(msg)); dport = tipc_nametbl_translate(net, msg_nametype(msg), - msg_nameinst(msg), dnode); + msg_nameinst(msg), &dnode); if (!dport) return false; msg_incr_reroute_cnt(msg); - if (*dnode != own_addr) - msg_set_prevnode(msg, own_addr); - msg_set_destnode(msg, *dnode); + if (dnode != onode) + msg_set_prevnode(msg, onode); + msg_set_destnode(msg, dnode); msg_set_destport(msg, dport); *err = TIPC_OK; return true; @@ -541,18 +564,22 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, /* tipc_msg_reassemble() - clone a buffer chain of fragments and * reassemble the clones into one message */ -struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list) +bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq) { - struct sk_buff 
*skb; + struct sk_buff *skb, *_skb; struct sk_buff *frag = NULL; struct sk_buff *head = NULL; - int hdr_sz; + int hdr_len; /* Copy header if single buffer */ if (skb_queue_len(list) == 1) { skb = skb_peek(list); - hdr_sz = skb_headroom(skb) + msg_hdr_sz(buf_msg(skb)); - return __pskb_copy(skb, hdr_sz, GFP_ATOMIC); + hdr_len = skb_headroom(skb) + msg_hdr_sz(buf_msg(skb)); + _skb = __pskb_copy(skb, hdr_len, GFP_ATOMIC); + if (!_skb) + return false; + __skb_queue_tail(rcvq, _skb); + return true; } /* Clone all fragments and reassemble */ @@ -566,9 +593,41 @@ struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list) if (!head) goto error; } - return frag; + __skb_queue_tail(rcvq, frag); + return true; error: pr_warn("Failed do clone local mcast rcv buffer\n"); kfree_skb(head); - return NULL; + return false; +} + +/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number + * @list: list to be appended to + * @seqno: sequence number of buffer to add + * @skb: buffer to add + */ +void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, + struct sk_buff *skb) +{ + struct sk_buff *_skb, *tmp; + + if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) { + __skb_queue_head(list, skb); + return; + } + + if (more(seqno, buf_seqno(skb_peek_tail(list)))) { + __skb_queue_tail(list, skb); + return; + } + + skb_queue_walk_safe(list, _skb, tmp) { + if (more(seqno, buf_seqno(_skb))) + continue; + if (seqno == buf_seqno(_skb)) + break; + __skb_queue_before(list, _skb, skb); + return; + } + kfree_skb(skb); } diff --git a/kernel/net/tipc/msg.h b/kernel/net/tipc/msg.h index e1d3595e2..55778a0ae 100644 --- a/kernel/net/tipc/msg.h +++ b/kernel/net/tipc/msg.h @@ -38,6 +38,7 @@ #define _TIPC_MSG_H #include +#include "core.h" /* * Constants and routines used to read and write TIPC payload message headers @@ -109,9 +110,9 @@ struct tipc_skb_cb { struct sk_buff *tail; bool validated; bool wakeup_pending; - bool bundling; u16 chain_sz; u16 chain_imp; 
+ u16 ackers; }; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) @@ -313,12 +314,12 @@ static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 19, 0x3, n); } -static inline u32 msg_bcast_ack(struct tipc_msg *m) +static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } -static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n) +static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } @@ -327,22 +328,22 @@ static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n) /* * Word 2 */ -static inline u32 msg_ack(struct tipc_msg *m) +static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } -static inline void msg_set_ack(struct tipc_msg *m, u32 n) +static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } -static inline u32 msg_seqno(struct tipc_msg *m) +static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } -static inline void msg_set_seqno(struct tipc_msg *m, u32 n) +static inline void msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } @@ -352,18 +353,22 @@ static inline void msg_set_seqno(struct tipc_msg *m, u32 n) */ static inline u32 msg_importance(struct tipc_msg *m) { - if (unlikely(msg_user(m) == MSG_FRAGMENTER)) - return msg_bits(m, 5, 13, 0x7); - if (likely(msg_isdata(m) && !msg_errcode(m))) - return msg_user(m); + int usr = msg_user(m); + + if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) + return usr; + if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) + return msg_bits(m, 9, 0, 0x7); return TIPC_SYSTEM_IMPORTANCE; } static inline void msg_set_importance(struct tipc_msg *m, u32 i) { - if (unlikely(msg_user(m) == MSG_FRAGMENTER)) - msg_set_bits(m, 5, 13, 0x7, i); - else if (likely(i < TIPC_SYSTEM_IMPORTANCE)) + int usr = msg_user(m); + + if (likely((usr == MSG_FRAGMENTER) || (usr == 
MSG_BUNDLER))) + msg_set_bits(m, 9, 0, 0x7, i); + else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else pr_warn("Trying to set illegal importance in message\n"); @@ -554,15 +559,6 @@ static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 15, 0x1fff, n); } -static inline bool msg_dup(struct tipc_msg *m) -{ - if (likely(msg_user(m) != TUNNEL_PROTOCOL)) - return false; - if (msg_type(m) != SYNCH_MSG) - return false; - return true; -} - /* * Word 2 */ @@ -605,6 +601,11 @@ static inline u32 msg_last_bcast(struct tipc_msg *m) return msg_bits(m, 4, 16, 0xffff); } +static inline u32 msg_bc_snd_nxt(struct tipc_msg *m) +{ + return msg_last_bcast(m) + 1; +} + static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); @@ -616,12 +617,12 @@ static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) } -static inline u32 msg_next_sent(struct tipc_msg *m) +static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } -static inline void msg_set_next_sent(struct tipc_msg *m, u32 n) +static inline void msg_set_next_sent(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } @@ -654,12 +655,12 @@ static inline void msg_set_link_selector(struct tipc_msg *m, u32 n) /* * Word 5 */ -static inline u32 msg_session(struct tipc_msg *m) +static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } -static inline void msg_set_session(struct tipc_msg *m, u32 n) +static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } @@ -722,12 +723,12 @@ static inline char *msg_media_addr(struct tipc_msg *m) /* * Word 9 */ -static inline u32 msg_msgcnt(struct tipc_msg *m) +static inline u16 msg_msgcnt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } -static inline void msg_set_msgcnt(struct tipc_msg *m, u32 n) +static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 
0xffff, n); } @@ -762,25 +763,46 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } +static inline bool msg_peer_link_is_up(struct tipc_msg *m) +{ + if (likely(msg_user(m) != LINK_PROTOCOL)) + return true; + if (msg_type(m) == STATE_MSG) + return true; + return false; +} + +static inline bool msg_peer_node_is_up(struct tipc_msg *m) +{ + if (msg_peer_link_is_up(m)) + return true; + return msg_redundant_link(m); +} + struct sk_buff *tipc_buf_acquire(u32 size); bool tipc_msg_validate(struct sk_buff *skb); -bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, - int err); +bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); -bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu); - -bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode); +bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); +bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, + u32 mtu, u32 dnode); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); -bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, u32 *dnode, - int *err); -struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list); +bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); +bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); +void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, + struct sk_buff *skb); + +static inline u16 buf_seqno(struct sk_buff *skb) +{ + 
return msg_seqno(buf_msg(skb)); +} /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in @@ -848,26 +870,33 @@ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, return skb; } -/* tipc_skb_queue_tail(): add buffer to tail of list; - * @list: list to be appended to - * @skb: buffer to append. Always appended - * @dport: the destination port of the buffer - * returns true if dport differs from previous destination +/* tipc_skb_queue_splice_tail - append an skb list to lock protected list + * @list: the new list to append. Not lock protected + * @head: target list. Lock protected. */ -static inline bool tipc_skb_queue_tail(struct sk_buff_head *list, - struct sk_buff *skb, u32 dport) +static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list, + struct sk_buff_head *head) { - struct sk_buff *_skb = NULL; - bool rv = false; + spin_lock_bh(&head->lock); + skb_queue_splice_tail(list, head); + spin_unlock_bh(&head->lock); +} + +/* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists + * @list: the new list to add. Lock protected. Will be reinitialized + * @head: target list. Lock protected. 
+ */ +static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, + struct sk_buff_head *head) +{ + struct sk_buff_head tmp; + + __skb_queue_head_init(&tmp); spin_lock_bh(&list->lock); - _skb = skb_peek_tail(list); - if (!_skb || (msg_destport(buf_msg(_skb)) != dport) || - (skb_queue_len(list) > 32)) - rv = true; - __skb_queue_tail(list, skb); + skb_queue_splice_tail_init(list, &tmp); spin_unlock_bh(&list->lock); - return rv; + tipc_skb_queue_splice_tail(&tmp, head); } #endif diff --git a/kernel/net/tipc/name_distr.c b/kernel/net/tipc/name_distr.c index 41e7b7e4d..c07612bab 100644 --- a/kernel/net/tipc/name_distr.c +++ b/kernel/net/tipc/name_distr.c @@ -96,13 +96,13 @@ void named_cluster_distribute(struct net *net, struct sk_buff *skb) dnode = node->addr; if (in_own_node(net, dnode)) continue; - if (!tipc_node_active_links(node)) + if (!tipc_node_is_up(node)) continue; oskb = pskb_copy(skb, GFP_ATOMIC); if (!oskb) break; msg_set_destnode(buf_msg(oskb), dnode); - tipc_link_xmit_skb(net, oskb, dnode, dnode); + tipc_node_xmit_skb(net, oskb, dnode, 0); } rcu_read_unlock(); @@ -223,7 +223,7 @@ void tipc_named_node_up(struct net *net, u32 dnode) &tn->nametbl->publ_list[TIPC_ZONE_SCOPE]); rcu_read_unlock(); - tipc_link_xmit(net, &head, dnode, dnode); + tipc_node_xmit(net, &head, dnode, 0); } static void tipc_publ_subscribe(struct net *net, struct publication *publ, diff --git a/kernel/net/tipc/name_table.c b/kernel/net/tipc/name_table.c index ab0ac62a1..0f47f08bf 100644 --- a/kernel/net/tipc/name_table.c +++ b/kernel/net/tipc/name_table.c @@ -330,13 +330,9 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, /* Any subscriptions waiting for notification? 
*/ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscr_report_overlap(s, - publ->lower, - publ->upper, - TIPC_PUBLISHED, - publ->ref, - publ->node, - created_subseq); + tipc_subscrp_report_overlap(s, publ->lower, publ->upper, + TIPC_PUBLISHED, publ->ref, + publ->node, created_subseq); } return publ; } @@ -404,13 +400,9 @@ found: /* Notify any waiting subscriptions */ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscr_report_overlap(s, - publ->lower, - publ->upper, - TIPC_WITHDRAWN, - publ->ref, - publ->node, - removed_subseq); + tipc_subscrp_report_overlap(s, publ->lower, publ->upper, + TIPC_WITHDRAWN, publ->ref, + publ->node, removed_subseq); } return publ; @@ -432,19 +424,17 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, return; while (sseq != &nseq->sseqs[nseq->first_free]) { - if (tipc_subscr_overlap(s, sseq->lower, sseq->upper)) { + if (tipc_subscrp_check_overlap(s, sseq->lower, sseq->upper)) { struct publication *crs; struct name_info *info = sseq->info; int must_report = 1; list_for_each_entry(crs, &info->zone_list, zone_list) { - tipc_subscr_report_overlap(s, - sseq->lower, - sseq->upper, - TIPC_PUBLISHED, - crs->ref, - crs->node, - must_report); + tipc_subscrp_report_overlap(s, sseq->lower, + sseq->upper, + TIPC_PUBLISHED, + crs->ref, crs->node, + must_report); must_report = 0; } } diff --git a/kernel/net/tipc/net.c b/kernel/net/tipc/net.c index a54f3cbe2..77bf9113c 100644 --- a/kernel/net/tipc/net.c +++ b/kernel/net/tipc/net.c @@ -40,6 +40,7 @@ #include "subscr.h" #include "socket.h" #include "node.h" +#include "bcast.h" static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, @@ -111,14 +112,11 @@ int tipc_net_start(struct net *net, u32 addr) { struct tipc_net *tn = net_generic(net, tipc_net_id); char addr_string[16]; - int res; tn->own_addr = addr; tipc_named_reinit(net); tipc_sk_reinit(net); - res = 
tipc_bclink_init(net); - if (res) - return res; + tipc_bcast_reinit(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr, TIPC_ZONE_SCOPE, 0, tn->own_addr); @@ -141,7 +139,6 @@ void tipc_net_stop(struct net *net) tn->own_addr); rtnl_lock(); tipc_bearer_stop(net); - tipc_bclink_stop(net); tipc_node_stop(net); rtnl_unlock(); diff --git a/kernel/net/tipc/netlink_compat.c b/kernel/net/tipc/netlink_compat.c index ce9121e8e..1eadc95e1 100644 --- a/kernel/net/tipc/netlink_compat.c +++ b/kernel/net/tipc/netlink_compat.c @@ -55,6 +55,7 @@ struct tipc_nl_compat_msg { int rep_type; int rep_size; int req_type; + struct net *net; struct sk_buff *rep; struct tlv_desc *req; struct sock *dst_sk; @@ -68,7 +69,8 @@ struct tipc_nl_compat_cmd_dump { struct tipc_nl_compat_cmd_doit { int (*doit)(struct sk_buff *skb, struct genl_info *info); - int (*transcode)(struct sk_buff *skb, struct tipc_nl_compat_msg *msg); + int (*transcode)(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg); }; static int tipc_skb_tailroom(struct sk_buff *skb) @@ -281,7 +283,7 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, if (!trans_buf) return -ENOMEM; - err = (*cmd->transcode)(trans_buf, msg); + err = (*cmd->transcode)(cmd, trans_buf, msg); if (err) goto trans_out; @@ -353,7 +355,8 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg, nla_len(bearer[TIPC_NLA_BEARER_NAME])); } -static int tipc_nl_compat_bearer_enable(struct sk_buff *skb, +static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { struct nlattr *prop; @@ -385,7 +388,8 @@ static int tipc_nl_compat_bearer_enable(struct sk_buff *skb, return 0; } -static int tipc_nl_compat_bearer_disable(struct sk_buff *skb, +static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { char *name; @@ -576,11 +580,81 
@@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, &link_info, sizeof(link_info)); } -static int tipc_nl_compat_link_set(struct sk_buff *skb, - struct tipc_nl_compat_msg *msg) +static int __tipc_add_link_prop(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg, + struct tipc_link_config *lc) +{ + switch (msg->cmd) { + case TIPC_CMD_SET_LINK_PRI: + return nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value)); + case TIPC_CMD_SET_LINK_TOL: + return nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value)); + case TIPC_CMD_SET_LINK_WINDOW: + return nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value)); + } + + return -EINVAL; +} + +static int tipc_nl_compat_media_set(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) { - struct nlattr *link; struct nlattr *prop; + struct nlattr *media; + struct tipc_link_config *lc; + + lc = (struct tipc_link_config *)TLV_DATA(msg->req); + + media = nla_nest_start(skb, TIPC_NLA_MEDIA); + if (!media) + return -EMSGSIZE; + + if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name)) + return -EMSGSIZE; + + prop = nla_nest_start(skb, TIPC_NLA_MEDIA_PROP); + if (!prop) + return -EMSGSIZE; + + __tipc_add_link_prop(skb, msg, lc); + nla_nest_end(skb, prop); + nla_nest_end(skb, media); + + return 0; +} + +static int tipc_nl_compat_bearer_set(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) +{ + struct nlattr *prop; + struct nlattr *bearer; + struct tipc_link_config *lc; + + lc = (struct tipc_link_config *)TLV_DATA(msg->req); + + bearer = nla_nest_start(skb, TIPC_NLA_BEARER); + if (!bearer) + return -EMSGSIZE; + + if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name)) + return -EMSGSIZE; + + prop = nla_nest_start(skb, TIPC_NLA_BEARER_PROP); + if (!prop) + return -EMSGSIZE; + + __tipc_add_link_prop(skb, msg, lc); + nla_nest_end(skb, prop); + nla_nest_end(skb, bearer); + + return 0; +} + +static int __tipc_nl_compat_link_set(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) +{ + struct nlattr *prop; + struct 
nlattr *link; struct tipc_link_config *lc; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -596,24 +670,40 @@ static int tipc_nl_compat_link_set(struct sk_buff *skb, if (!prop) return -EMSGSIZE; - if (msg->cmd == TIPC_CMD_SET_LINK_PRI) { - if (nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value))) - return -EMSGSIZE; - } else if (msg->cmd == TIPC_CMD_SET_LINK_TOL) { - if (nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value))) - return -EMSGSIZE; - } else if (msg->cmd == TIPC_CMD_SET_LINK_WINDOW) { - if (nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value))) - return -EMSGSIZE; - } - + __tipc_add_link_prop(skb, msg, lc); nla_nest_end(skb, prop); nla_nest_end(skb, link); return 0; } -static int tipc_nl_compat_link_reset_stats(struct sk_buff *skb, +static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) +{ + struct tipc_link_config *lc; + struct tipc_bearer *bearer; + struct tipc_media *media; + + lc = (struct tipc_link_config *)TLV_DATA(msg->req); + + media = tipc_media_find(lc->name); + if (media) { + cmd->doit = &tipc_nl_media_set; + return tipc_nl_compat_media_set(skb, msg); + } + + bearer = tipc_bearer_find(msg->net, lc->name); + if (bearer) { + cmd->doit = &tipc_nl_bearer_set; + return tipc_nl_compat_bearer_set(skb, msg); + } + + return __tipc_nl_compat_link_set(skb, msg); +} + +static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { char *name; @@ -851,7 +941,8 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg, sizeof(node_info)); } -static int tipc_nl_compat_net_set(struct sk_buff *skb, +static int tipc_nl_compat_net_set(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { u32 val; @@ -1007,7 +1098,6 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) struct nlmsghdr *req_nlh; struct nlmsghdr *rep_nlh; struct 
tipc_genlmsghdr *req_userhdr = info->userhdr; - struct net *net = genl_info_net(info); memset(&msg, 0, sizeof(msg)); @@ -1015,6 +1105,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN; msg.cmd = req_userhdr->cmd; msg.dst_sk = info->dst_sk; + msg.net = genl_info_net(info); if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN); @@ -1023,14 +1114,14 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) } len = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN); - if (TLV_GET_LEN(msg.req) && !TLV_OK(msg.req, len)) { + if (len && !TLV_OK(msg.req, len)) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); err = -EOPNOTSUPP; goto send; } err = tipc_nl_compat_handle(&msg); - if (err == -EOPNOTSUPP) + if ((err == -EOPNOTSUPP) || (err == -EPERM)) msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); else if (err == -EINVAL) msg.rep = tipc_get_err_tlv(TIPC_CFG_TLV_ERROR); @@ -1043,7 +1134,7 @@ send: rep_nlh = nlmsg_hdr(msg.rep); memcpy(rep_nlh, info->nlhdr, len); rep_nlh->nlmsg_len = msg.rep->len; - genlmsg_unicast(net, msg.rep, NETLINK_CB(skb).portid); + genlmsg_unicast(msg.net, msg.rep, NETLINK_CB(skb).portid); return err; } diff --git a/kernel/net/tipc/node.c b/kernel/net/tipc/node.c index 22c059ad2..3926b561f 100644 --- a/kernel/net/tipc/node.c +++ b/kernel/net/tipc/node.c @@ -1,7 +1,7 @@ /* * net/tipc/node.c: TIPC node management routines * - * Copyright (c) 2000-2006, 2012-2014, Ericsson AB + * Copyright (c) 2000-2006, 2012-2015, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. 
* @@ -39,10 +39,42 @@ #include "node.h" #include "name_distr.h" #include "socket.h" +#include "bcast.h" +#include "discover.h" -static void node_lost_contact(struct tipc_node *n_ptr); -static void node_established_contact(struct tipc_node *n_ptr); +/* Node FSM states and events: + */ +enum { + SELF_DOWN_PEER_DOWN = 0xdd, + SELF_UP_PEER_UP = 0xaa, + SELF_DOWN_PEER_LEAVING = 0xd1, + SELF_UP_PEER_COMING = 0xac, + SELF_COMING_PEER_UP = 0xca, + SELF_LEAVING_PEER_DOWN = 0x1d, + NODE_FAILINGOVER = 0xf0, + NODE_SYNCHING = 0xcc +}; + +enum { + SELF_ESTABL_CONTACT_EVT = 0xece, + SELF_LOST_CONTACT_EVT = 0x1ce, + PEER_ESTABL_CONTACT_EVT = 0x9ece, + PEER_LOST_CONTACT_EVT = 0x91ce, + NODE_FAILOVER_BEGIN_EVT = 0xfbe, + NODE_FAILOVER_END_EVT = 0xfee, + NODE_SYNCH_BEGIN_EVT = 0xcbe, + NODE_SYNCH_END_EVT = 0xcee +}; + +static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr **maddr); +static void tipc_node_link_down(struct tipc_node *n, int bearer_id, + bool delete); +static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void tipc_node_delete(struct tipc_node *node); +static void tipc_node_timeout(unsigned long data); +static void tipc_node_fsm_evt(struct tipc_node *n, int evt); struct tipc_sock_conn { u32 port; @@ -109,7 +141,7 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr) return NULL; } -struct tipc_node *tipc_node_create(struct net *net, u32 addr) +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n_ptr, *temp_node; @@ -125,31 +157,66 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) } n_ptr->addr = addr; n_ptr->net = net; + n_ptr->capabilities = capabilities; kref_init(&n_ptr->kref); spin_lock_init(&n_ptr->lock); INIT_HLIST_NODE(&n_ptr->hash); INIT_LIST_HEAD(&n_ptr->list); INIT_LIST_HEAD(&n_ptr->publ_list); INIT_LIST_HEAD(&n_ptr->conn_sks); - 
__skb_queue_head_init(&n_ptr->bclink.deferdq); + skb_queue_head_init(&n_ptr->bc_entry.namedq); + skb_queue_head_init(&n_ptr->bc_entry.inputq1); + __skb_queue_head_init(&n_ptr->bc_entry.arrvq); + skb_queue_head_init(&n_ptr->bc_entry.inputq2); + n_ptr->state = SELF_DOWN_PEER_LEAVING; + n_ptr->signature = INVALID_NODE_SIG; + n_ptr->active_links[0] = INVALID_BEARER_ID; + n_ptr->active_links[1] = INVALID_BEARER_ID; + if (!tipc_link_bc_create(net, tipc_own_addr(net), n_ptr->addr, + U16_MAX, tipc_bc_sndlink(net)->window, + n_ptr->capabilities, + &n_ptr->bc_entry.inputq1, + &n_ptr->bc_entry.namedq, + tipc_bc_sndlink(net), + &n_ptr->bc_entry.link)) { + pr_warn("Broadcast rcv link creation failed, no memory\n"); + kfree(n_ptr); + n_ptr = NULL; + goto exit; + } + tipc_node_get(n_ptr); + setup_timer(&n_ptr->timer, tipc_node_timeout, (unsigned long)n_ptr); + n_ptr->keepalive_intv = U32_MAX; hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n_ptr->addr < temp_node->addr) break; } list_add_tail_rcu(&n_ptr->list, &temp_node->list); - n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN; - n_ptr->signature = INVALID_NODE_SIG; - tipc_node_get(n_ptr); exit: spin_unlock_bh(&tn->node_list_lock); return n_ptr; } +static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) +{ + unsigned long tol = l->tolerance; + unsigned long intv = ((tol / 4) > 500) ? 
500 : tol / 4; + unsigned long keepalive_intv = msecs_to_jiffies(intv); + + /* Link with lowest tolerance determines timer interval */ + if (keepalive_intv < n->keepalive_intv) + n->keepalive_intv = keepalive_intv; + + /* Ensure link's abort limit corresponds to current interval */ + l->abort_limit = l->tolerance / jiffies_to_msecs(n->keepalive_intv); +} + static void tipc_node_delete(struct tipc_node *node) { list_del_rcu(&node->list); hlist_del_rcu(&node->hash); + kfree(node->bc_entry.link); kfree_rcu(node, rcu); } @@ -159,8 +226,11 @@ void tipc_node_stop(struct net *net) struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); - list_for_each_entry_safe(node, t_node, &tn->node_list, list) + list_for_each_entry_safe(node, t_node, &tn->node_list, list) { + if (del_timer(&node->timer)) + tipc_node_put(node); tipc_node_put(node); + } spin_unlock_bh(&tn->node_list_lock); } @@ -218,212 +288,604 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) tipc_node_put(node); } +/* tipc_node_timeout - handle expiration of node timer + */ +static void tipc_node_timeout(unsigned long data) +{ + struct tipc_node *n = (struct tipc_node *)data; + struct tipc_link_entry *le; + struct sk_buff_head xmitq; + int bearer_id; + int rc = 0; + + __skb_queue_head_init(&xmitq); + + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + tipc_node_lock(n); + le = &n->links[bearer_id]; + if (le->link) { + /* Link tolerance may change asynchronously: */ + tipc_node_calculate_timer(n, le->link); + rc = tipc_link_timeout(le->link, &xmitq); + } + tipc_node_unlock(n); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); + if (rc & TIPC_LINK_DOWN_EVT) + tipc_node_link_down(n, bearer_id, false); + } + if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) + tipc_node_get(n); + tipc_node_put(n); +} + /** - * tipc_node_link_up - handle addition of link - * + * __tipc_node_link_up - handle addition of link + * Node lock must be held by caller * Link becomes active 
(alone or shared) or standby, depending on its priority. */ -void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr) +static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, + struct sk_buff_head *xmitq) { - struct tipc_link **active = &n_ptr->active_links[0]; + int *slot0 = &n->active_links[0]; + int *slot1 = &n->active_links[1]; + struct tipc_link *ol = node_active_link(n, 0); + struct tipc_link *nl = n->links[bearer_id].link; - n_ptr->working_links++; - n_ptr->action_flags |= TIPC_NOTIFY_LINK_UP; - n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id; + if (!nl) + return; - pr_debug("Established link <%s> on network plane %c\n", - l_ptr->name, l_ptr->net_plane); + tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); + if (!tipc_link_is_up(nl)) + return; - if (!active[0]) { - active[0] = active[1] = l_ptr; - node_established_contact(n_ptr); - goto exit; - } - if (l_ptr->priority < active[0]->priority) { - pr_debug("New link <%s> becomes standby\n", l_ptr->name); - goto exit; + n->working_links++; + n->action_flags |= TIPC_NOTIFY_LINK_UP; + n->link_id = nl->peer_bearer_id << 16 | bearer_id; + + /* Leave room for tunnel header when returning 'mtu' to users: */ + n->links[bearer_id].mtu = nl->mtu - INT_H_SIZE; + + tipc_bearer_add_dest(n->net, bearer_id, n->addr); + tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); + + pr_debug("Established link <%s> on network plane %c\n", + nl->name, nl->net_plane); + + /* First link? 
=> give it both slots */ + if (!ol) { + *slot0 = bearer_id; + *slot1 = bearer_id; + tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); + n->action_flags |= TIPC_NOTIFY_NODE_UP; + tipc_bcast_add_peer(n->net, nl, xmitq); + return; } - tipc_link_dup_queue_xmit(active[0], l_ptr); - if (l_ptr->priority == active[0]->priority) { - active[0] = l_ptr; - goto exit; + + /* Second link => redistribute slots */ + if (nl->priority > ol->priority) { + pr_debug("Old link <%s> becomes standby\n", ol->name); + *slot0 = bearer_id; + *slot1 = bearer_id; + tipc_link_set_active(nl, true); + tipc_link_set_active(ol, false); + } else if (nl->priority == ol->priority) { + tipc_link_set_active(nl, true); + *slot1 = bearer_id; + } else { + pr_debug("New link <%s> is standby\n", nl->name); } - pr_debug("Old link <%s> becomes standby\n", active[0]->name); - if (active[1] != active[0]) - pr_debug("Old link <%s> becomes standby\n", active[1]->name); - active[0] = active[1] = l_ptr; -exit: - /* Leave room for changeover header when returning 'mtu' to users: */ - n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; + + /* Prepare synchronization with first link */ + tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq); } /** - * node_select_active_links - select active link + * tipc_node_link_up - handle addition of link + * + * Link becomes active (alone or shared) or standby, depending on its priority. 
*/ -static void node_select_active_links(struct tipc_node *n_ptr) +static void tipc_node_link_up(struct tipc_node *n, int bearer_id, + struct sk_buff_head *xmitq) { - struct tipc_link **active = &n_ptr->active_links[0]; - u32 i; - u32 highest_prio = 0; + tipc_node_lock(n); + __tipc_node_link_up(n, bearer_id, xmitq); + tipc_node_unlock(n); +} - active[0] = active[1] = NULL; +/** + * __tipc_node_link_down - handle loss of link + */ +static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr **maddr) +{ + struct tipc_link_entry *le = &n->links[*bearer_id]; + int *slot0 = &n->active_links[0]; + int *slot1 = &n->active_links[1]; + int i, highest = 0; + struct tipc_link *l, *_l, *tnl; + + l = n->links[*bearer_id].link; + if (!l || tipc_link_is_reset(l)) + return; - for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link *l_ptr = n_ptr->links[i]; + n->working_links--; + n->action_flags |= TIPC_NOTIFY_LINK_DOWN; + n->link_id = l->peer_bearer_id << 16 | *bearer_id; - if (!l_ptr || !tipc_link_is_up(l_ptr) || - (l_ptr->priority < highest_prio)) - continue; + tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); - if (l_ptr->priority > highest_prio) { - highest_prio = l_ptr->priority; - active[0] = active[1] = l_ptr; - } else { - active[1] = l_ptr; + pr_debug("Lost link <%s> on network plane %c\n", + l->name, l->net_plane); + + /* Select new active link if any available */ + *slot0 = INVALID_BEARER_ID; + *slot1 = INVALID_BEARER_ID; + for (i = 0; i < MAX_BEARERS; i++) { + _l = n->links[i].link; + if (!_l || !tipc_link_is_up(_l)) + continue; + if (_l == l) + continue; + if (_l->priority < highest) + continue; + if (_l->priority > highest) { + highest = _l->priority; + *slot0 = i; + *slot1 = i; + continue; } + *slot1 = i; } + + if (!tipc_node_is_up(n)) { + if (tipc_link_peer_is_down(l)) + tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); + tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); + tipc_link_fsm_evt(l, 
LINK_RESET_EVT); + tipc_link_reset(l); + tipc_link_build_reset_msg(l, xmitq); + *maddr = &n->links[*bearer_id].maddr; + node_lost_contact(n, &le->inputq); + tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); + return; + } + tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); + + /* There is still a working link => initiate failover */ + tnl = node_active_link(n, 0); + tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); + tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); + n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1); + tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); + tipc_link_reset(l); + tipc_link_fsm_evt(l, LINK_RESET_EVT); + tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); + tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); + *maddr = &n->links[tnl->bearer_id].maddr; + *bearer_id = tnl->bearer_id; } -/** - * tipc_node_link_down - handle loss of link - */ -void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr) +static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { - struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id); - struct tipc_link **active; - - n_ptr->working_links--; - n_ptr->action_flags |= TIPC_NOTIFY_LINK_DOWN; - n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id; + struct tipc_link_entry *le = &n->links[bearer_id]; + struct tipc_link *l = le->link; + struct tipc_media_addr *maddr; + struct sk_buff_head xmitq; - if (!tipc_link_is_active(l_ptr)) { - pr_debug("Lost standby link <%s> on network plane %c\n", - l_ptr->name, l_ptr->net_plane); + if (!l) return; - } - pr_debug("Lost link <%s> on network plane %c\n", - l_ptr->name, l_ptr->net_plane); - - active = &n_ptr->active_links[0]; - if (active[0] == l_ptr) - active[0] = active[1]; - if (active[1] == l_ptr) - active[1] = active[0]; - if (active[0] == l_ptr) - node_select_active_links(n_ptr); - if (tipc_node_is_up(n_ptr)) - tipc_link_failover_send_queue(l_ptr); - else - node_lost_contact(n_ptr); - /* Leave room for changeover header when returning 
'mtu' to users: */ - if (active[0]) { - n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; - return; - } - /* Loopback link went down? No fragmentation needed from now on. */ - if (n_ptr->addr == tn->own_addr) { - n_ptr->act_mtus[0] = MAX_MSG_SIZE; - n_ptr->act_mtus[1] = MAX_MSG_SIZE; + __skb_queue_head_init(&xmitq); + + tipc_node_lock(n); + if (!tipc_link_is_establishing(l)) { + __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); + if (delete) { + kfree(l); + le->link = NULL; + n->link_cnt--; + } + } else { + /* Defuse pending tipc_node_link_up() */ + tipc_link_fsm_evt(l, LINK_RESET_EVT); } + tipc_node_unlock(n); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + tipc_sk_rcv(n->net, &le->inputq); } -int tipc_node_active_links(struct tipc_node *n_ptr) +bool tipc_node_is_up(struct tipc_node *n) { - return n_ptr->active_links[0] != NULL; + return n->active_links[0] != INVALID_BEARER_ID; } -int tipc_node_is_up(struct tipc_node *n_ptr) +void tipc_node_check_dest(struct net *net, u32 onode, + struct tipc_bearer *b, + u16 capabilities, u32 signature, + struct tipc_media_addr *maddr, + bool *respond, bool *dupl_addr) { - return tipc_node_active_links(n_ptr); + struct tipc_node *n; + struct tipc_link *l; + struct tipc_link_entry *le; + bool addr_match = false; + bool sign_match = false; + bool link_up = false; + bool accept_addr = false; + bool reset = true; + char *if_name; + + *dupl_addr = false; + *respond = false; + + n = tipc_node_create(net, onode, capabilities); + if (!n) + return; + + tipc_node_lock(n); + + le = &n->links[b->identity]; + + /* Prepare to validate requesting node's signature and media address */ + l = le->link; + link_up = l && tipc_link_is_up(l); + addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); + sign_match = (signature == n->signature); + + /* These three flags give us eight permutations: */ + + if (sign_match && addr_match && link_up) { + /* All is fine. Do nothing. 
*/ + reset = false; + } else if (sign_match && addr_match && !link_up) { + /* Respond. The link will come up in due time */ + *respond = true; + } else if (sign_match && !addr_match && link_up) { + /* Peer has changed i/f address without rebooting. + * If so, the link will reset soon, and the next + * discovery will be accepted. So we can ignore it. + * It may also be an cloned or malicious peer having + * chosen the same node address and signature as an + * existing one. + * Ignore requests until the link goes down, if ever. + */ + *dupl_addr = true; + } else if (sign_match && !addr_match && !link_up) { + /* Peer link has changed i/f address without rebooting. + * It may also be a cloned or malicious peer; we can't + * distinguish between the two. + * The signature is correct, so we must accept. + */ + accept_addr = true; + *respond = true; + } else if (!sign_match && addr_match && link_up) { + /* Peer node rebooted. Two possibilities: + * - Delayed re-discovery; this link endpoint has already + * reset and re-established contact with the peer, before + * receiving a discovery message from that node. + * (The peer happened to receive one from this node first). + * - The peer came back so fast that our side has not + * discovered it yet. Probing from this side will soon + * reset the link, since there can be no working link + * endpoint at the peer end, and the link will re-establish. + * Accept the signature, since it comes from a known peer. + */ + n->signature = signature; + } else if (!sign_match && addr_match && !link_up) { + /* The peer node has rebooted. + * Accept signature, since it is a known peer. + */ + n->signature = signature; + *respond = true; + } else if (!sign_match && !addr_match && link_up) { + /* Peer rebooted with new address, or a new/duplicate peer. + * Ignore until the link goes down, if ever. + */ + *dupl_addr = true; + } else if (!sign_match && !addr_match && !link_up) { + /* Peer rebooted with new address, or it is a new peer. 
+ * Accept signature and address. + */ + n->signature = signature; + accept_addr = true; + *respond = true; + } + + if (!accept_addr) + goto exit; + + /* Now create new link if not already existing */ + if (!l) { + if (n->link_cnt == 2) { + pr_warn("Cannot establish 3rd link to %x\n", n->addr); + goto exit; + } + if_name = strchr(b->name, ':') + 1; + if (!tipc_link_create(net, if_name, b->identity, b->tolerance, + b->net_plane, b->mtu, b->priority, + b->window, mod(tipc_net(net)->random), + tipc_own_addr(net), onode, + n->capabilities, + tipc_bc_sndlink(n->net), n->bc_entry.link, + &le->inputq, + &n->bc_entry.namedq, &l)) { + *respond = false; + goto exit; + } + tipc_link_reset(l); + tipc_link_fsm_evt(l, LINK_RESET_EVT); + if (n->state == NODE_FAILINGOVER) + tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); + le->link = l; + n->link_cnt++; + tipc_node_calculate_timer(n, l); + if (n->link_cnt == 1) + if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) + tipc_node_get(n); + } + memcpy(&le->maddr, maddr, sizeof(*maddr)); +exit: + tipc_node_unlock(n); + if (reset && !tipc_link_is_reset(l)) + tipc_node_link_down(n, b->identity, false); + tipc_node_put(n); } -void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) +void tipc_node_delete_links(struct net *net, int bearer_id) { - n_ptr->links[l_ptr->bearer_id] = l_ptr; - n_ptr->link_cnt++; + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_node *n; + + rcu_read_lock(); + list_for_each_entry_rcu(n, &tn->node_list, list) { + tipc_node_link_down(n, bearer_id, true); + } + rcu_read_unlock(); } -void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) +static void tipc_node_reset_links(struct tipc_node *n) { + char addr_string[16]; int i; + pr_warn("Resetting all links to %s\n", + tipc_addr_string_fill(addr_string, n->addr)); + for (i = 0; i < MAX_BEARERS; i++) { - if (l_ptr != n_ptr->links[i]) - continue; - n_ptr->links[i] = NULL; - n_ptr->link_cnt--; + 
tipc_node_link_down(n, i, false); + } +} + +/* tipc_node_fsm_evt - node finite state machine + * Determines when contact is allowed with peer node + */ +static void tipc_node_fsm_evt(struct tipc_node *n, int evt) +{ + int state = n->state; + + switch (state) { + case SELF_DOWN_PEER_DOWN: + switch (evt) { + case SELF_ESTABL_CONTACT_EVT: + state = SELF_UP_PEER_COMING; + break; + case PEER_ESTABL_CONTACT_EVT: + state = SELF_COMING_PEER_UP; + break; + case SELF_LOST_CONTACT_EVT: + case PEER_LOST_CONTACT_EVT: + break; + case NODE_SYNCH_END_EVT: + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + case SELF_UP_PEER_UP: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case NODE_SYNCH_BEGIN_EVT: + state = NODE_SYNCHING; + break; + case NODE_FAILOVER_BEGIN_EVT: + state = NODE_FAILINGOVER; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + case NODE_SYNCH_END_EVT: + case NODE_FAILOVER_END_EVT: + break; + default: + goto illegal_evt; + } + break; + case SELF_DOWN_PEER_LEAVING: + switch (evt) { + case PEER_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_DOWN; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + case SELF_LOST_CONTACT_EVT: + break; + case NODE_SYNCH_END_EVT: + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + case SELF_UP_PEER_COMING: + switch (evt) { + case PEER_ESTABL_CONTACT_EVT: + state = SELF_UP_PEER_UP; + break; + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_LOST_CONTACT_EVT: + case NODE_SYNCH_END_EVT: + case NODE_FAILOVER_BEGIN_EVT: + break; + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + case 
SELF_COMING_PEER_UP: + switch (evt) { + case SELF_ESTABL_CONTACT_EVT: + state = SELF_UP_PEER_UP; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case SELF_LOST_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + break; + case NODE_SYNCH_END_EVT: + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + case SELF_LEAVING_PEER_DOWN: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_DOWN; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + case PEER_LOST_CONTACT_EVT: + break; + case NODE_SYNCH_END_EVT: + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + case NODE_FAILINGOVER: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case NODE_FAILOVER_END_EVT: + state = SELF_UP_PEER_UP; + break; + case NODE_FAILOVER_BEGIN_EVT: + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + break; + case NODE_SYNCH_BEGIN_EVT: + case NODE_SYNCH_END_EVT: + default: + goto illegal_evt; + } + break; + case NODE_SYNCHING: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case NODE_SYNCH_END_EVT: + state = SELF_UP_PEER_UP; + break; + case NODE_FAILOVER_BEGIN_EVT: + state = NODE_FAILINGOVER; + break; + case NODE_SYNCH_BEGIN_EVT: + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + break; + case NODE_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + default: + pr_err("Unknown node fsm state %x\n", state); + break; } + n->state = state; + return; + +illegal_evt: + pr_err("Illegal node fsm evt %x in state %x\n", evt, state); } -static void node_established_contact(struct tipc_node *n_ptr) +bool 
tipc_node_filter_pkt(struct tipc_node *n, struct tipc_msg *hdr) { - n_ptr->action_flags |= TIPC_NOTIFY_NODE_UP; - n_ptr->bclink.oos_state = 0; - n_ptr->bclink.acked = tipc_bclink_get_last_sent(n_ptr->net); - tipc_bclink_add_node(n_ptr->net, n_ptr->addr); + int state = n->state; + + if (likely(state == SELF_UP_PEER_UP)) + return true; + + if (state == SELF_LEAVING_PEER_DOWN) + return false; + + if (state == SELF_DOWN_PEER_LEAVING) { + if (msg_peer_node_is_up(hdr)) + return false; + } + + return true; } -static void node_lost_contact(struct tipc_node *n_ptr) +static void node_lost_contact(struct tipc_node *n, + struct sk_buff_head *inputq) { char addr_string[16]; struct tipc_sock_conn *conn, *safe; - struct list_head *conns = &n_ptr->conn_sks; + struct tipc_link *l; + struct list_head *conns = &n->conn_sks; struct sk_buff *skb; - struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id); uint i; pr_debug("Lost contact with %s\n", - tipc_addr_string_fill(addr_string, n_ptr->addr)); - - /* Flush broadcast link info associated with lost node */ - if (n_ptr->bclink.recv_permitted) { - __skb_queue_purge(&n_ptr->bclink.deferdq); - - if (n_ptr->bclink.reasm_buf) { - kfree_skb(n_ptr->bclink.reasm_buf); - n_ptr->bclink.reasm_buf = NULL; - } - - tipc_bclink_remove_node(n_ptr->net, n_ptr->addr); - tipc_bclink_acknowledge(n_ptr, INVALID_LINK_SEQ); + tipc_addr_string_fill(addr_string, n->addr)); - n_ptr->bclink.recv_permitted = false; - } + /* Clean up broadcast state */ + tipc_bcast_remove_peer(n->net, n->bc_entry.link); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link *l_ptr = n_ptr->links[i]; - if (!l_ptr) - continue; - l_ptr->flags &= ~LINK_FAILINGOVER; - l_ptr->failover_checkpt = 0; - l_ptr->failover_pkts = 0; - kfree_skb(l_ptr->failover_skb); - l_ptr->failover_skb = NULL; - tipc_link_reset_fragments(l_ptr); + l = n->links[i].link; + if (l) + tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT); } - n_ptr->action_flags &= 
~TIPC_WAIT_OWN_LINKS_DOWN; - - /* Prevent re-contact with node until cleanup is done */ - n_ptr->action_flags |= TIPC_WAIT_PEER_LINKS_DOWN; - /* Notify publications from this node */ - n_ptr->action_flags |= TIPC_NOTIFY_NODE_DOWN; + n->action_flags |= TIPC_NOTIFY_NODE_DOWN; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, - SHORT_H_SIZE, 0, tn->own_addr, + SHORT_H_SIZE, 0, tipc_own_addr(n->net), conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); - if (likely(skb)) { - skb_queue_tail(n_ptr->inputq, skb); - n_ptr->action_flags |= TIPC_MSG_EVT; - } + if (likely(skb)) + skb_queue_tail(inputq, skb); list_del(&conn->list); kfree(conn); } @@ -452,7 +914,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, goto exit; tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (link) { strncpy(linkname, link->name, len); err = 0; @@ -470,36 +932,24 @@ void tipc_node_unlock(struct tipc_node *node) u32 flags = node->action_flags; u32 link_id = 0; struct list_head *publ_list; - struct sk_buff_head *inputq = node->inputq; - struct sk_buff_head *namedq; - if (likely(!flags || (flags == TIPC_MSG_EVT))) { - node->action_flags = 0; + if (likely(!flags)) { spin_unlock_bh(&node->lock); - if (flags == TIPC_MSG_EVT) - tipc_sk_rcv(net, inputq); return; } addr = node->addr; link_id = node->link_id; - namedq = node->namedq; publ_list = &node->publ_list; - node->action_flags &= ~(TIPC_MSG_EVT | - TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | - TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP | - TIPC_WAKEUP_BCAST_USERS | TIPC_BCAST_MSG_EVT | - TIPC_NAMED_MSG_EVT | TIPC_BCAST_RESET); + node->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | + TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP); spin_unlock_bh(&node->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) tipc_publ_notify(net, publ_list, addr); - if (flags & 
TIPC_WAKEUP_BCAST_USERS) - tipc_bclink_wakeup_users(net); - if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, addr); @@ -511,17 +961,6 @@ void tipc_node_unlock(struct tipc_node *node) tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, link_id, addr); - if (flags & TIPC_MSG_EVT) - tipc_sk_rcv(net, inputq); - - if (flags & TIPC_NAMED_MSG_EVT) - tipc_named_rcv(net, namedq); - - if (flags & TIPC_BCAST_MSG_EVT) - tipc_bclink_input(net); - - if (flags & TIPC_BCAST_RESET) - tipc_link_reset_all(node); } /* Caller should hold node lock for the passed node */ @@ -558,6 +997,350 @@ msg_full: return -EMSGSIZE; } +static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel, + int *bearer_id, + struct tipc_media_addr **maddr) +{ + int id = n->active_links[sel & 1]; + + if (unlikely(id < 0)) + return NULL; + + *bearer_id = id; + *maddr = &n->links[id].maddr; + return n->links[id].link; +} + +/** + * tipc_node_xmit() is the general link level function for message sending + * @net: the applicable net namespace + * @list: chain of buffers containing message + * @dnode: address of destination node + * @selector: a number used for deterministic link selection + * Consumes the buffer chain, except when returning -ELINKCONG + * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE + */ +int tipc_node_xmit(struct net *net, struct sk_buff_head *list, + u32 dnode, int selector) +{ + struct tipc_link *l = NULL; + struct tipc_node *n; + struct sk_buff_head xmitq; + struct tipc_media_addr *maddr; + int bearer_id; + int rc = -EHOSTUNREACH; + + __skb_queue_head_init(&xmitq); + n = tipc_node_find(net, dnode); + if (likely(n)) { + tipc_node_lock(n); + l = tipc_node_select_link(n, selector, &bearer_id, &maddr); + if (likely(l)) + rc = tipc_link_xmit(l, list, &xmitq); + tipc_node_unlock(n); + if (unlikely(rc == -ENOBUFS)) + tipc_node_link_down(n, bearer_id, false); + tipc_node_put(n); + } + if (likely(!rc)) { + tipc_bearer_xmit(net, bearer_id, &xmitq, 
maddr); + return 0; + } + if (likely(in_own_node(net, dnode))) { + tipc_sk_rcv(net, list); + return 0; + } + return rc; +} + +/* tipc_node_xmit_skb(): send single buffer to destination + * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE + * messages, which will not be rejected + * The only exception is datagram messages rerouted after secondary + * lookup, which are rare and safe to dispose of anyway. + * TODO: Return real return value, and let callers use + * tipc_wait_for_sendpkt() where applicable + */ +int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, + u32 selector) +{ + struct sk_buff_head head; + int rc; + + skb_queue_head_init(&head); + __skb_queue_tail(&head, skb); + rc = tipc_node_xmit(net, &head, dnode, selector); + if (rc == -ELINKCONG) + kfree_skb(skb); + return 0; +} + +/** + * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node + * @net: the applicable net namespace + * @skb: TIPC packet + * @bearer_id: id of bearer message arrived on + * + * Invoked with no locks held. 
+ */ +static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id) +{ + int rc; + struct sk_buff_head xmitq; + struct tipc_bclink_entry *be; + struct tipc_link_entry *le; + struct tipc_msg *hdr = buf_msg(skb); + int usr = msg_user(hdr); + u32 dnode = msg_destnode(hdr); + struct tipc_node *n; + + __skb_queue_head_init(&xmitq); + + /* If NACK for other node, let rcv link for that node peek into it */ + if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net))) + n = tipc_node_find(net, dnode); + else + n = tipc_node_find(net, msg_prevnode(hdr)); + if (!n) { + kfree_skb(skb); + return; + } + be = &n->bc_entry; + le = &n->links[bearer_id]; + + rc = tipc_bcast_rcv(net, be->link, skb); + + /* Broadcast link reset may happen at reassembly failure */ + if (rc & TIPC_LINK_DOWN_EVT) + tipc_node_reset_links(n); + + /* Broadcast ACKs are sent on a unicast link */ + if (rc & TIPC_LINK_SND_BC_ACK) { + tipc_node_lock(n); + tipc_link_build_ack_msg(le->link, &xmitq); + tipc_node_unlock(n); + } + + if (!skb_queue_empty(&xmitq)) + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + + /* Deliver. 
'arrvq' is under inputq2's lock protection */ + if (!skb_queue_empty(&be->inputq1)) { + spin_lock_bh(&be->inputq2.lock); + spin_lock_bh(&be->inputq1.lock); + skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); + spin_unlock_bh(&be->inputq1.lock); + spin_unlock_bh(&be->inputq2.lock); + tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2); + } + tipc_node_put(n); +} + +/** + * tipc_node_check_state - check and if necessary update node state + * @skb: TIPC packet + * @bearer_id: identity of bearer delivering the packet + * Returns true if state is ok, otherwise consumes buffer and returns false + */ +static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, + int bearer_id, struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr = buf_msg(skb); + int usr = msg_user(hdr); + int mtyp = msg_type(hdr); + u16 oseqno = msg_seqno(hdr); + u16 iseqno = msg_seqno(msg_get_wrapped(hdr)); + u16 exp_pkts = msg_msgcnt(hdr); + u16 rcv_nxt, syncpt, dlv_nxt; + int state = n->state; + struct tipc_link *l, *tnl, *pl = NULL; + struct tipc_media_addr *maddr; + int i, pb_id; + + l = n->links[bearer_id].link; + if (!l) + return false; + rcv_nxt = l->rcv_nxt; + + + if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) + return true; + + /* Find parallel link, if any */ + for (i = 0; i < MAX_BEARERS; i++) { + if ((i != bearer_id) && n->links[i].link) { + pl = n->links[i].link; + break; + } + } + + /* Update node accesibility if applicable */ + if (state == SELF_UP_PEER_COMING) { + if (!tipc_link_is_up(l)) + return true; + if (!msg_peer_link_is_up(hdr)) + return true; + tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); + } + + if (state == SELF_DOWN_PEER_LEAVING) { + if (msg_peer_node_is_up(hdr)) + return false; + tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); + } + + /* Ignore duplicate packets */ + if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt)) + return true; + + /* Initiate or update failover mode if applicable */ + if ((usr == TUNNEL_PROTOCOL) && (mtyp == 
FAILOVER_MSG)) { + syncpt = oseqno + exp_pkts - 1; + if (pl && tipc_link_is_up(pl)) { + pb_id = pl->bearer_id; + __tipc_node_link_down(n, &pb_id, xmitq, &maddr); + tipc_skb_queue_splice_tail_init(pl->inputq, l->inputq); + } + /* If pkts arrive out of order, use lowest calculated syncpt */ + if (less(syncpt, n->sync_point)) + n->sync_point = syncpt; + } + + /* Open parallel link when tunnel link reaches synch point */ + if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) { + if (!more(rcv_nxt, n->sync_point)) + return true; + tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); + if (pl) + tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT); + return true; + } + + /* No synching needed if only one link */ + if (!pl || !tipc_link_is_up(pl)) + return true; + + /* Initiate synch mode if applicable */ + if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { + syncpt = iseqno + exp_pkts - 1; + if (!tipc_link_is_up(l)) { + tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); + __tipc_node_link_up(n, bearer_id, xmitq); + } + if (n->state == SELF_UP_PEER_UP) { + n->sync_point = syncpt; + tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); + tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); + } + if (less(syncpt, n->sync_point)) + n->sync_point = syncpt; + } + + /* Open tunnel link when parallel link reaches synch point */ + if ((n->state == NODE_SYNCHING) && tipc_link_is_synching(l)) { + if (tipc_link_is_synching(l)) { + tnl = l; + } else { + tnl = pl; + pl = l; + } + dlv_nxt = pl->rcv_nxt - mod(skb_queue_len(pl->inputq)); + if (more(dlv_nxt, n->sync_point)) { + tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); + tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); + return true; + } + if (l == pl) + return true; + if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) + return true; + if (usr == LINK_PROTOCOL) + return true; + return false; + } + return true; +} + +/** + * tipc_rcv - process TIPC packets/messages arriving from off-node + * @net: the applicable net namespace + * @skb: TIPC packet + * @bearer: 
pointer to bearer message arrived on + * + * Invoked with no locks held. Bearer pointer must point to a valid bearer + * structure (i.e. cannot be NULL), but bearer can be inactive. + */ +void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) +{ + struct sk_buff_head xmitq; + struct tipc_node *n; + struct tipc_msg *hdr = buf_msg(skb); + int usr = msg_user(hdr); + int bearer_id = b->identity; + struct tipc_link_entry *le; + u16 bc_ack = msg_bcast_ack(hdr); + int rc = 0; + + __skb_queue_head_init(&xmitq); + + /* Ensure message is well-formed */ + if (unlikely(!tipc_msg_validate(skb))) + goto discard; + + /* Handle arrival of discovery or broadcast packet */ + if (unlikely(msg_non_seq(hdr))) { + if (unlikely(usr == LINK_CONFIG)) + return tipc_disc_rcv(net, skb, b); + else + return tipc_node_bc_rcv(net, skb, bearer_id); + } + + /* Locate neighboring node that sent packet */ + n = tipc_node_find(net, msg_prevnode(hdr)); + if (unlikely(!n)) + goto discard; + le = &n->links[bearer_id]; + + /* Ensure broadcast reception is in synch with peer's send state */ + if (unlikely(usr == LINK_PROTOCOL)) + tipc_bcast_sync_rcv(net, n->bc_entry.link, hdr); + else if (unlikely(n->bc_entry.link->acked != bc_ack)) + tipc_bcast_ack_rcv(net, n->bc_entry.link, bc_ack); + + tipc_node_lock(n); + + /* Is reception permitted at the moment ? 
*/ + if (!tipc_node_filter_pkt(n, hdr)) + goto unlock; + + /* Check and if necessary update node state */ + if (likely(tipc_node_check_state(n, skb, bearer_id, &xmitq))) { + rc = tipc_link_rcv(le->link, skb, &xmitq); + skb = NULL; + } +unlock: + tipc_node_unlock(n); + + if (unlikely(rc & TIPC_LINK_UP_EVT)) + tipc_node_link_up(n, bearer_id, &xmitq); + + if (unlikely(rc & TIPC_LINK_DOWN_EVT)) + tipc_node_link_down(n, bearer_id, false); + + if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) + tipc_named_rcv(net, &n->bc_entry.namedq); + + if (!skb_queue_empty(&le->inputq)) + tipc_sk_rcv(net, &le->inputq); + + if (!skb_queue_empty(&xmitq)) + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + + tipc_node_put(n); +discard: + kfree_skb(skb); +} + int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; diff --git a/kernel/net/tipc/node.h b/kernel/net/tipc/node.h index 02d5c20dc..6734562d3 100644 --- a/kernel/net/tipc/node.h +++ b/kernel/net/tipc/node.h @@ -45,50 +45,41 @@ /* Out-of-range value for node signature */ #define INVALID_NODE_SIG 0x10000 -#define NODE_HTABLE_SIZE 512 +#define INVALID_BEARER_ID -1 /* Flags used to take different actions according to flag type - * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down - * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down * TIPC_NOTIFY_NODE_DOWN: notify node is down * TIPC_NOTIFY_NODE_UP: notify node is up * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type */ enum { - TIPC_MSG_EVT = 1, - TIPC_WAIT_PEER_LINKS_DOWN = (1 << 1), - TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2), TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), - TIPC_WAKEUP_BCAST_USERS = (1 << 5), TIPC_NOTIFY_LINK_UP = (1 << 6), - TIPC_NOTIFY_LINK_DOWN = (1 << 7), - TIPC_NAMED_MSG_EVT = (1 << 8), - TIPC_BCAST_MSG_EVT = (1 << 9), - TIPC_BCAST_RESET = (1 << 10) + TIPC_NOTIFY_LINK_DOWN = (1 << 7) }; -/** - * struct tipc_node_bclink - TIPC node bclink structure - * @acked: sequence 
# of last outbound b'cast message acknowledged by node - * @last_in: sequence # of last in-sequence b'cast message received from node - * @last_sent: sequence # of last b'cast message sent by node - * @oos_state: state tracker for handling OOS b'cast messages - * @deferred_queue: deferred queue saved OOS b'cast message received from node - * @reasm_buf: broadcast reassembly queue head from node - * @inputq_map: bitmap indicating which inqueues should be kicked - * @recv_permitted: true if node is allowed to receive b'cast messages +/* Optional capabilities supported by this code version */ -struct tipc_node_bclink { - u32 acked; - u32 last_in; - u32 last_sent; - u32 oos_state; - u32 deferred_size; - struct sk_buff_head deferdq; - struct sk_buff *reasm_buf; - int inputq_map; - bool recv_permitted; +enum { + TIPC_BCAST_SYNCH = (1 << 1) +}; + +#define TIPC_NODE_CAPABILITIES TIPC_BCAST_SYNCH + +struct tipc_link_entry { + struct tipc_link *link; + u32 mtu; + struct sk_buff_head inputq; + struct tipc_media_addr maddr; +}; + +struct tipc_bclink_entry { + struct tipc_link *link; + struct sk_buff_head inputq1; + struct sk_buff_head arrvq; + struct sk_buff_head inputq2; + struct sk_buff_head namedq; }; /** @@ -100,11 +91,11 @@ struct tipc_node_bclink { * @hash: links to adjacent nodes in unsorted hash chain * @inputq: pointer to input queue containing messages for msg event * @namedq: pointer to name table input queue with name table messages - * @curr_link: the link holding the node lock, if any - * @active_links: pointers to active links to node - * @links: pointers to all links to node + * @active_links: bearer ids of active links, used as index into links[] array + * @links: array containing references to all links to node * @action_flags: bit mask of different types of node actions - * @bclink: broadcast-related info + * @state: connectivity state vs peer node + * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted 
list of cluster's nodes * @working_links: number of working links to node (both active and standby) * @link_cnt: number of links to node @@ -120,14 +111,13 @@ struct tipc_node { spinlock_t lock; struct net *net; struct hlist_node hash; - struct sk_buff_head *inputq; - struct sk_buff_head *namedq; - struct tipc_link *active_links[2]; - u32 act_mtus[2]; - struct tipc_link *links[MAX_BEARERS]; + int active_links[2]; + struct tipc_link_entry links[MAX_BEARERS]; + struct tipc_bclink_entry bc_entry; int action_flags; - struct tipc_node_bclink bclink; struct list_head list; + int state; + u16 sync_point; int link_cnt; u16 working_links; u16 capabilities; @@ -135,25 +125,32 @@ struct tipc_node { u32 link_id; struct list_head publ_list; struct list_head conn_sks; + unsigned long keepalive_intv; + struct timer_list timer; struct rcu_head rcu; }; struct tipc_node *tipc_node_find(struct net *net, u32 addr); void tipc_node_put(struct tipc_node *node); -struct tipc_node *tipc_node_create(struct net *net, u32 addr); void tipc_node_stop(struct net *net); +void tipc_node_check_dest(struct net *net, u32 onode, + struct tipc_bearer *bearer, + u16 capabilities, u32 signature, + struct tipc_media_addr *maddr, + bool *respond, bool *dupl_addr); +void tipc_node_delete_links(struct net *net, int bearer_id); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -int tipc_node_active_links(struct tipc_node *n_ptr); -int tipc_node_is_up(struct tipc_node *n_ptr); +bool tipc_node_is_up(struct tipc_node *n); int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); void tipc_node_unlock(struct tipc_node *node); +int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, + int selector); +int 
tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, + u32 selector); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); - int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); static inline void tipc_node_lock(struct tipc_node *node) @@ -161,26 +158,30 @@ static inline void tipc_node_lock(struct tipc_node *node) spin_lock_bh(&node->lock); } -static inline bool tipc_node_blocked(struct tipc_node *node) +static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel) { - return (node->action_flags & (TIPC_WAIT_PEER_LINKS_DOWN | - TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN)); -} + int bearer_id = n->active_links[sel & 1]; -static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector) -{ - struct tipc_node *node; - u32 mtu; + if (unlikely(bearer_id == INVALID_BEARER_ID)) + return NULL; - node = tipc_node_find(net, addr); - - if (likely(node)) { - mtu = node->act_mtus[selector & 1]; - tipc_node_put(node); - } else { - mtu = MAX_MSG_SIZE; - } + return n->links[bearer_id].link; +} +static inline unsigned int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) +{ + struct tipc_node *n; + int bearer_id; + unsigned int mtu = MAX_MSG_SIZE; + + n = tipc_node_find(net, addr); + if (unlikely(!n)) + return mtu; + + bearer_id = n->active_links[sel & 1]; + if (likely(bearer_id != INVALID_BEARER_ID)) + mtu = n->links[bearer_id].mtu; + tipc_node_put(n); return mtu; } diff --git a/kernel/net/tipc/server.c b/kernel/net/tipc/server.c index 77ff03ed1..922e04a43 100644 --- a/kernel/net/tipc/server.c +++ b/kernel/net/tipc/server.c @@ -309,6 +309,10 @@ static int tipc_accept_from_sock(struct tipc_conn *con) /* Notify that new connection is incoming */ newcon->usr_data = s->tipc_conn_new(newcon->conid); + if (!newcon->usr_data) { + sock_release(newsock); + return -ENOMEM; + } /* Wake up receive process in case of 'SYN+' message 
*/ newsock->sk->sk_data_ready(newsock->sk); @@ -321,7 +325,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) struct socket *sock = NULL; int ret; - ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1); + ret = sock_create_kern(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock); if (ret < 0) return NULL; ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, diff --git a/kernel/net/tipc/socket.c b/kernel/net/tipc/socket.c index 20cc6df07..b53246fb0 100644 --- a/kernel/net/tipc/socket.c +++ b/kernel/net/tipc/socket.c @@ -41,6 +41,7 @@ #include "link.h" #include "name_distr.h" #include "socket.h" +#include "bcast.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ @@ -104,6 +105,7 @@ struct tipc_sock { static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); +static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); @@ -247,6 +249,22 @@ static void tsk_advance_rx_queue(struct sock *sk) kfree_skb(__skb_dequeue(&sk->sk_receive_queue)); } +/* tipc_sk_respond() : send response message back to sender + */ +static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err) +{ + u32 selector; + u32 dnode; + u32 onode = tipc_own_addr(sock_net(sk)); + + if (!tipc_msg_reverse(onode, &skb, err)) + return; + + dnode = msg_destnode(buf_msg(skb)); + selector = msg_origport(buf_msg(skb)); + tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); +} + /** * tsk_rej_rx_queue - reject all buffers in socket receive queue * @@ -255,13 +273,9 @@ static void tsk_advance_rx_queue(struct sock *sk) static void tsk_rej_rx_queue(struct sock *sk) { struct sk_buff *skb; - u32 dnode; - u32 own_node = 
tsk_own_node(tipc_sk(sk)); - while ((skb = __skb_dequeue(&sk->sk_receive_queue))) { - if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_ERR_NO_PORT)) - tipc_link_xmit_skb(sock_net(sk), skb, dnode, 0); - } + while ((skb = __skb_dequeue(&sk->sk_receive_queue))) + tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT); } /* tsk_peer_msg - verify if message was sent by connected port's peer @@ -342,7 +356,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, } /* Allocate socket's protocol area */ - sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto); + sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern); if (sk == NULL) return -ENOMEM; @@ -368,6 +382,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; + sk->sk_destruct = tipc_sock_destruct; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; tsk->sent_unacked = 0; atomic_set(&tsk->dupl_rcvcnt, 0); @@ -409,7 +424,7 @@ static int tipc_release(struct socket *sock) struct net *net; struct tipc_sock *tsk; struct sk_buff *skb; - u32 dnode, probing_state; + u32 dnode; /* * Exit if socket isn't fully initialized (occurs when a failed accept() @@ -440,17 +455,12 @@ static int tipc_release(struct socket *sock) tsk->connected = 0; tipc_node_remove_conn(net, dnode, tsk->portid); } - if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, - TIPC_ERR_NO_PORT)) - tipc_link_xmit_skb(net, skb, dnode, 0); + tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT); } } tipc_sk_withdraw(tsk, 0, NULL); - probing_state = tsk->probing_state; - if (del_timer_sync(&sk->sk_timer) && - probing_state != TIPC_CONN_PROBING) - sock_put(sk); + sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); if (tsk->connected) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, @@ -458,13 +468,10 @@ static int tipc_release(struct socket *sock) tsk_own_node(tsk), tsk_peer_port(tsk), tsk->portid, TIPC_ERR_NO_PORT); if (skb) - tipc_link_xmit_skb(net, 
skb, dnode, tsk->portid); + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); tipc_node_remove_conn(net, dnode, tsk->portid); } - /* Discard any remaining (connection-based) messages in receive queue */ - __skb_queue_purge(&sk->sk_receive_queue); - /* Reject any messages that accumulated in backlog queue */ sock->state = SS_DISCONNECTING; release_sock(sk); @@ -681,28 +688,29 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, msg_set_hdr_sz(mhdr, MCAST_H_SIZE); new_mtu: - mtu = tipc_bclink_get_mtu(); + mtu = tipc_bcast_get_mtu(net); rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain); if (unlikely(rc < 0)) return rc; do { - rc = tipc_bclink_xmit(net, pktchain); - if (likely(rc >= 0)) { - rc = dsz; - break; + rc = tipc_bcast_xmit(net, pktchain); + if (likely(!rc)) + return dsz; + + if (rc == -ELINKCONG) { + tsk->link_cong = 1; + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (!rc) + continue; } + __skb_queue_purge(pktchain); if (rc == -EMSGSIZE) { msg->msg_iter = save; goto new_mtu; } - if (rc != -ELINKCONG) - break; - tipc_sk(sk)->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (rc) - __skb_queue_purge(pktchain); - } while (!rc); + break; + } while (1); return rc; } @@ -765,35 +773,35 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, /** * tipc_sk_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket - * @skb: pointer to message buffer. Set to NULL if buffer is consumed. + * @skb: pointer to message buffer. 
*/ -static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff **skb) +static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) { - struct tipc_msg *msg = buf_msg(*skb); + struct sock *sk = &tsk->sk; + struct tipc_msg *hdr = buf_msg(skb); + int mtyp = msg_type(hdr); int conn_cong; - u32 dnode; - u32 own_node = tsk_own_node(tsk); + /* Ignore if connection cannot be validated: */ - if (!tsk_peer_msg(tsk, msg)) + if (!tsk_peer_msg(tsk, hdr)) goto exit; tsk->probing_state = TIPC_CONN_OK; - if (msg_type(msg) == CONN_ACK) { + if (mtyp == CONN_PROBE) { + msg_set_type(hdr, CONN_PROBE_REPLY); + tipc_sk_respond(sk, skb, TIPC_OK); + return; + } else if (mtyp == CONN_ACK) { conn_cong = tsk_conn_cong(tsk); - tsk->sent_unacked -= msg_msgcnt(msg); + tsk->sent_unacked -= msg_msgcnt(hdr); if (conn_cong) - tsk->sk.sk_write_space(&tsk->sk); - } else if (msg_type(msg) == CONN_PROBE) { - if (tipc_msg_reverse(own_node, *skb, &dnode, TIPC_OK)) { - msg_set_type(msg, CONN_PROBE_REPLY); - return; - } + sk->sk_write_space(sk); + } else if (mtyp != CONN_PROBE_REPLY) { + pr_warn("Received unknown CONN_PROTO msg\n"); } - /* Do nothing if msg_type() == CONN_PROBE_REPLY */ exit: - kfree_skb(*skb); - *skb = NULL; + kfree_skb(skb); } static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) @@ -926,24 +934,25 @@ new_mtu: do { skb = skb_peek(pktchain); TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; - rc = tipc_link_xmit(net, pktchain, dnode, tsk->portid); - if (likely(rc >= 0)) { + rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid); + if (likely(!rc)) { if (sock->state != SS_READY) sock->state = SS_CONNECTING; - rc = dsz; - break; + return dsz; } + if (rc == -ELINKCONG) { + tsk->link_cong = 1; + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (!rc) + continue; + } + __skb_queue_purge(pktchain); if (rc == -EMSGSIZE) { m->msg_iter = save; goto new_mtu; } - if (rc != -ELINKCONG) - break; - tsk->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (rc) 
- __skb_queue_purge(pktchain); - } while (!rc); + break; + } while (1); return rc; } @@ -1045,15 +1054,16 @@ next: return rc; do { if (likely(!tsk_conn_cong(tsk))) { - rc = tipc_link_xmit(net, pktchain, dnode, portid); + rc = tipc_node_xmit(net, pktchain, dnode, portid); if (likely(!rc)) { tsk->sent_unacked++; sent += send; if (sent == dsz) - break; + return dsz; goto next; } if (rc == -EMSGSIZE) { + __skb_queue_purge(pktchain); tsk->max_pkt = tipc_node_get_mtu(net, dnode, portid); m->msg_iter = save; @@ -1061,13 +1071,13 @@ next: } if (rc != -ELINKCONG) break; + tsk->link_cong = 1; } rc = tipc_wait_for_sndpkt(sock, &timeo); - if (rc) - __skb_queue_purge(pktchain); } while (!rc); + __skb_queue_purge(pktchain); return sent ? sent : rc; } @@ -1223,7 +1233,7 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) return; msg = buf_msg(skb); msg_set_msgcnt(msg, ack); - tipc_link_xmit_skb(net, skb, dnode, msg_link_selector(msg)); + tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg)); } static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) @@ -1504,87 +1514,91 @@ static void tipc_data_ready(struct sock *sk) rcu_read_unlock(); } +static void tipc_sock_destruct(struct sock *sk) +{ + __skb_queue_purge(&sk->sk_receive_queue); +} + /** * filter_connect - Handle all incoming messages for a connection-based socket * @tsk: TIPC socket * @skb: pointer to message buffer. 
Set to NULL if buffer is consumed * - * Returns 0 (TIPC_OK) if everything ok, -TIPC_ERR_NO_PORT otherwise + * Returns true if everything ok, false otherwise */ -static int filter_connect(struct tipc_sock *tsk, struct sk_buff **skb) +static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct socket *sock = sk->sk_socket; - struct tipc_msg *msg = buf_msg(*skb); - int retval = -TIPC_ERR_NO_PORT; + struct tipc_msg *hdr = buf_msg(skb); - if (msg_mcast(msg)) - return retval; + if (unlikely(msg_mcast(hdr))) + return false; switch ((int)sock->state) { case SS_CONNECTED: + /* Accept only connection-based messages sent by peer */ - if (tsk_peer_msg(tsk, msg)) { - if (unlikely(msg_errcode(msg))) { - sock->state = SS_DISCONNECTING; - tsk->connected = 0; - /* let timer expire on it's own */ - tipc_node_remove_conn(net, tsk_peer_node(tsk), - tsk->portid); - } - retval = TIPC_OK; + if (unlikely(!tsk_peer_msg(tsk, hdr))) + return false; + + if (unlikely(msg_errcode(hdr))) { + sock->state = SS_DISCONNECTING; + tsk->connected = 0; + /* Let timer expire on it's own */ + tipc_node_remove_conn(net, tsk_peer_node(tsk), + tsk->portid); } - break; + return true; + case SS_CONNECTING: - /* Accept only ACK or NACK message */ - if (unlikely(!msg_connected(msg))) - break; + /* Accept only ACK or NACK message */ + if (unlikely(!msg_connected(hdr))) + return false; - if (unlikely(msg_errcode(msg))) { + if (unlikely(msg_errcode(hdr))) { sock->state = SS_DISCONNECTING; sk->sk_err = ECONNREFUSED; - retval = TIPC_OK; - break; + return true; } - if (unlikely(msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE)) { + if (unlikely(!msg_isdata(hdr))) { sock->state = SS_DISCONNECTING; sk->sk_err = EINVAL; - retval = TIPC_OK; - break; + return true; } - tipc_sk_finish_conn(tsk, msg_origport(msg), msg_orignode(msg)); - msg_set_importance(&tsk->phdr, msg_importance(msg)); + tipc_sk_finish_conn(tsk, msg_origport(hdr), 
msg_orignode(hdr)); + msg_set_importance(&tsk->phdr, msg_importance(hdr)); sock->state = SS_CONNECTED; - /* If an incoming message is an 'ACK-', it should be - * discarded here because it doesn't contain useful - * data. In addition, we should try to wake up - * connect() routine if sleeping. - */ - if (msg_data_sz(msg) == 0) { - kfree_skb(*skb); - *skb = NULL; - if (waitqueue_active(sk_sleep(sk))) - wake_up_interruptible(sk_sleep(sk)); - } - retval = TIPC_OK; - break; + /* If 'ACK+' message, add to socket receive queue */ + if (msg_data_sz(hdr)) + return true; + + /* If empty 'ACK-' message, wake up sleeping connect() */ + if (waitqueue_active(sk_sleep(sk))) + wake_up_interruptible(sk_sleep(sk)); + + /* 'ACK-' message is neither accepted nor rejected: */ + msg_set_dest_droppable(hdr, 1); + return false; + case SS_LISTENING: case SS_UNCONNECTED: + /* Accept only SYN message */ - if (!msg_connected(msg) && !(msg_errcode(msg))) - retval = TIPC_OK; + if (!msg_connected(hdr) && !(msg_errcode(hdr))) + return true; break; case SS_DISCONNECTING: break; default: pr_err("Unknown socket state %u\n", sock->state); } - return retval; + return false; } /** @@ -1619,61 +1633,70 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) /** * filter_rcv - validate incoming message * @sk: socket - * @skb: pointer to message. Set to NULL if buffer is consumed. + * @skb: pointer to message. * * Enqueues message on receive queue if acceptable; optionally handles * disconnect indication for a connected socket. 
* * Called with socket lock already taken * - * Returns 0 (TIPC_OK) if message was ok, -TIPC error code if rejected + * Returns true if message was added to socket receive queue, otherwise false */ -static int filter_rcv(struct sock *sk, struct sk_buff **skb) +static bool filter_rcv(struct sock *sk, struct sk_buff *skb) { struct socket *sock = sk->sk_socket; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_msg *msg = buf_msg(*skb); - unsigned int limit = rcvbuf_limit(sk, *skb); - int rc = TIPC_OK; + struct tipc_msg *hdr = buf_msg(skb); + unsigned int limit = rcvbuf_limit(sk, skb); + int err = TIPC_OK; + int usr = msg_user(hdr); - if (unlikely(msg_user(msg) == CONN_MANAGER)) { + if (unlikely(msg_user(hdr) == CONN_MANAGER)) { tipc_sk_proto_rcv(tsk, skb); - return TIPC_OK; + return false; } - if (unlikely(msg_user(msg) == SOCK_WAKEUP)) { - kfree_skb(*skb); + if (unlikely(usr == SOCK_WAKEUP)) { + kfree_skb(skb); tsk->link_cong = 0; sk->sk_write_space(sk); - *skb = NULL; - return TIPC_OK; + return false; } - /* Reject message if it is wrong sort of message for socket */ - if (msg_type(msg) > TIPC_DIRECT_MSG) - return -TIPC_ERR_NO_PORT; + /* Drop if illegal message type */ + if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) { + kfree_skb(skb); + return false; + } - if (sock->state == SS_READY) { - if (msg_connected(msg)) - return -TIPC_ERR_NO_PORT; - } else { - rc = filter_connect(tsk, skb); - if (rc != TIPC_OK || !*skb) - return rc; + /* Reject if wrong message type for current socket state */ + if (unlikely(sock->state == SS_READY)) { + if (msg_connected(hdr)) { + err = TIPC_ERR_NO_PORT; + goto reject; + } + } else if (unlikely(!filter_connect(tsk, skb))) { + err = TIPC_ERR_NO_PORT; + goto reject; } /* Reject message if there isn't room to queue it */ - if (sk_rmem_alloc_get(sk) + (*skb)->truesize >= limit) - return -TIPC_ERR_OVERLOAD; + if (unlikely(sk_rmem_alloc_get(sk) + skb->truesize >= limit)) { + err = TIPC_ERR_OVERLOAD; + goto reject; + } /* Enqueue message */ - 
TIPC_SKB_CB(*skb)->handle = NULL; - __skb_queue_tail(&sk->sk_receive_queue, *skb); - skb_set_owner_r(*skb, sk); + TIPC_SKB_CB(skb)->handle = NULL; + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); sk->sk_data_ready(sk); - *skb = NULL; - return TIPC_OK; + return true; + +reject: + tipc_sk_respond(sk, skb, err); + return false; } /** @@ -1687,22 +1710,10 @@ static int filter_rcv(struct sock *sk, struct sk_buff **skb) */ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - int err; - atomic_t *dcnt; - u32 dnode; - struct tipc_sock *tsk = tipc_sk(sk); - struct net *net = sock_net(sk); - uint truesize = skb->truesize; + unsigned int truesize = skb->truesize; - err = filter_rcv(sk, &skb); - if (likely(!skb)) { - dcnt = &tsk->dupl_rcvcnt; - if (atomic_read(dcnt) < TIPC_CONN_OVERLOAD_LIMIT) - atomic_add(truesize, dcnt); - return 0; - } - if (!err || tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, -err)) - tipc_link_xmit_skb(net, skb, dnode, tsk->portid); + if (likely(filter_rcv(sk, skb))) + atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt); return 0; } @@ -1712,45 +1723,43 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) * @inputq: list of incoming buffers with potentially different destinations * @sk: socket where the buffers should be enqueued * @dport: port number for the socket - * @_skb: returned buffer to be forwarded or rejected, if applicable * * Caller must hold socket lock - * - * Returns TIPC_OK if all buffers enqueued, otherwise -TIPC_ERR_OVERLOAD - * or -TIPC_ERR_NO_PORT */ -static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, - u32 dport, struct sk_buff **_skb) +static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, + u32 dport) { unsigned int lim; atomic_t *dcnt; - int err; struct sk_buff *skb; unsigned long time_limit = jiffies + 2; while (skb_queue_len(inputq)) { if (unlikely(time_after_eq(jiffies, time_limit))) - return TIPC_OK; + return; + skb = 
tipc_skb_dequeue(inputq, dport); if (unlikely(!skb)) - return TIPC_OK; + return; + + /* Add message directly to receive queue if possible */ if (!sock_owned_by_user(sk)) { - err = filter_rcv(sk, &skb); - if (likely(!skb)) - continue; - *_skb = skb; - return err; + filter_rcv(sk, skb); + continue; } + + /* Try backlog, compensating for double-counted bytes */ dcnt = &tipc_sk(sk)->dupl_rcvcnt; if (sk->sk_backlog.len) atomic_set(dcnt, 0); lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); if (likely(!sk_add_backlog(sk, skb, lim))) continue; - *_skb = skb; - return -TIPC_ERR_OVERLOAD; + + /* Overload => reject message back to sender */ + tipc_sk_respond(sk, skb, TIPC_ERR_OVERLOAD); + break; } - return TIPC_OK; } /** @@ -1758,49 +1767,46 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, * @inputq: buffer list containing the buffers * Consumes all buffers in list until inputq is empty * Note: may be called in multiple threads referring to the same queue - * Returns 0 if last buffer was accepted, otherwise -EHOSTUNREACH - * Only node local calls check the return value, sending single-buffer queues */ -int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) +void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) { u32 dnode, dport = 0; int err; - struct sk_buff *skb; struct tipc_sock *tsk; - struct tipc_net *tn; struct sock *sk; + struct sk_buff *skb; while (skb_queue_len(inputq)) { - err = -TIPC_ERR_NO_PORT; - skb = NULL; dport = tipc_skb_peek_port(inputq, dport); tsk = tipc_sk_lookup(net, dport); + if (likely(tsk)) { sk = &tsk->sk; if (likely(spin_trylock_bh(&sk->sk_lock.slock))) { - err = tipc_sk_enqueue(inputq, sk, dport, &skb); + tipc_sk_enqueue(inputq, sk, dport); spin_unlock_bh(&sk->sk_lock.slock); - dport = 0; } sock_put(sk); - } else { - skb = tipc_skb_dequeue(inputq, dport); - } - if (likely(!skb)) continue; - if (tipc_msg_lookup_dest(net, skb, &dnode, &err)) - goto xmit; - if (!err) { - dnode = msg_destnode(buf_msg(skb)); - 
goto xmit; } - tn = net_generic(net, tipc_net_id); - if (!tipc_msg_reverse(tn->own_addr, skb, &dnode, -err)) + + /* No destination socket => dequeue skb if still there */ + skb = tipc_skb_dequeue(inputq, dport); + if (!skb) + return; + + /* Try secondary lookup if unresolved named message */ + err = TIPC_ERR_NO_PORT; + if (tipc_msg_lookup_dest(net, skb, &err)) + goto xmit; + + /* Prepare for message rejection */ + if (!tipc_msg_reverse(tipc_own_addr(net), &skb, err)) continue; xmit: - tipc_link_xmit_skb(net, skb, dnode, dport); + dnode = msg_destnode(buf_msg(skb)); + tipc_node_xmit_skb(net, skb, dnode, dport); } - return err ? -EHOSTUNREACH : 0; } static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) @@ -2069,7 +2075,10 @@ static int tipc_shutdown(struct socket *sock, int how) struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); struct sk_buff *skb; - u32 dnode; + u32 dnode = tsk_peer_node(tsk); + u32 dport = tsk_peer_port(tsk); + u32 onode = tipc_own_addr(net); + u32 oport = tsk->portid; int res; if (how != SHUT_RDWR) @@ -2082,6 +2091,8 @@ static int tipc_shutdown(struct socket *sock, int how) case SS_CONNECTED: restart: + dnode = tsk_peer_node(tsk); + /* Disconnect and send a 'FIN+' or 'FIN-' message to peer */ skb = __skb_dequeue(&sk->sk_receive_queue); if (skb) { @@ -2089,19 +2100,13 @@ restart: kfree_skb(skb); goto restart; } - if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, - TIPC_CONN_SHUTDOWN)) - tipc_link_xmit_skb(net, skb, dnode, - tsk->portid); + tipc_sk_respond(sk, skb, TIPC_CONN_SHUTDOWN); } else { - dnode = tsk_peer_node(tsk); - skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, - 0, dnode, tsk_own_node(tsk), - tsk_peer_port(tsk), - tsk->portid, TIPC_CONN_SHUTDOWN); - tipc_link_xmit_skb(net, skb, dnode, tsk->portid); + 0, dnode, onode, dport, oport, + TIPC_CONN_SHUTDOWN); + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); } tsk->connected = 0; sock->state = SS_DISCONNECTING; @@ -2163,7 
+2168,7 @@ static void tipc_sk_timeout(unsigned long data) } bh_unlock_sock(sk); if (skb) - tipc_link_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid); + tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid); exit: sock_put(sk); } diff --git a/kernel/net/tipc/socket.h b/kernel/net/tipc/socket.h index bf6551389..4241f2206 100644 --- a/kernel/net/tipc/socket.h +++ b/kernel/net/tipc/socket.h @@ -44,7 +44,7 @@ SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) int tipc_socket_init(void); void tipc_socket_stop(void); -int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); +void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq); void tipc_sk_reinit(struct net *net); diff --git a/kernel/net/tipc/subscr.c b/kernel/net/tipc/subscr.c index 1c147c869..69ee2eeef 100644 --- a/kernel/net/tipc/subscr.c +++ b/kernel/net/tipc/subscr.c @@ -40,16 +40,21 @@ /** * struct tipc_subscriber - TIPC network topology subscriber + * @kref: reference counter to tipc_subscription object * @conid: connection identifier to server connecting to subscriber * @lock: control access to subscriber - * @subscription_list: list of subscription objects for this subscriber + * @subscrp_list: list of subscription objects for this subscriber */ struct tipc_subscriber { + struct kref kref; int conid; spinlock_t lock; - struct list_head subscription_list; + struct list_head subscrp_list; }; +static void tipc_subscrp_delete(struct tipc_subscription *sub); +static void tipc_subscrb_put(struct tipc_subscriber *subscriber); + /** * htohl - convert value to endianness used by destination * @in: value to convert @@ -62,9 +67,9 @@ static u32 htohl(u32 in, int swap) return swap ? 
swab32(in) : in; } -static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node) +static void tipc_subscrp_send_event(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port_ref, u32 node) { struct tipc_net *tn = net_generic(sub->net, tipc_net_id); struct tipc_subscriber *subscriber = sub->subscriber; @@ -82,12 +87,13 @@ static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower, } /** - * tipc_subscr_overlap - test for subscription overlap with the given values + * tipc_subscrp_check_overlap - test for subscription overlap with the + * given values * * Returns 1 if there is overlap, otherwise 0. */ -int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper) +int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, + u32 found_upper) { if (found_lower < sub->seq.lower) found_lower = sub->seq.lower; @@ -98,138 +104,121 @@ int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower, return 1; } -/** - * tipc_subscr_report_overlap - issue event if there is subscription overlap - * - * Protected by nameseq.lock in name_table.c - */ -void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node, int must) +void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, + u32 found_upper, u32 event, u32 port_ref, + u32 node, int must) { - if (!tipc_subscr_overlap(sub, found_lower, found_upper)) + if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper)) return; if (!must && !(sub->filter & TIPC_SUB_PORTS)) return; - subscr_send_event(sub, found_lower, found_upper, event, port_ref, node); + tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, + node); } -static void subscr_timeout(unsigned long data) +static void tipc_subscrp_timeout(unsigned long data) { struct 
tipc_subscription *sub = (struct tipc_subscription *)data; struct tipc_subscriber *subscriber = sub->subscriber; - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); - /* The spin lock per subscriber is used to protect its members */ - spin_lock_bh(&subscriber->lock); + /* Notify subscriber of timeout */ + tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, + TIPC_SUBSCR_TIMEOUT, 0, 0); - /* Validate timeout (in case subscription is being cancelled) */ - if (sub->timeout == TIPC_WAIT_FOREVER) { - spin_unlock_bh(&subscriber->lock); - return; - } + spin_lock_bh(&subscriber->lock); + tipc_subscrp_delete(sub); + spin_unlock_bh(&subscriber->lock); - /* Unlink subscription from name table */ - tipc_nametbl_unsubscribe(sub); + tipc_subscrb_put(subscriber); +} - /* Unlink subscription from subscriber */ - list_del(&sub->subscription_list); +static void tipc_subscrb_kref_release(struct kref *kref) +{ + struct tipc_subscriber *subcriber = container_of(kref, + struct tipc_subscriber, kref); - spin_unlock_bh(&subscriber->lock); + kfree(subcriber); +} - /* Notify subscriber of timeout */ - subscr_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, - TIPC_SUBSCR_TIMEOUT, 0, 0); +static void tipc_subscrb_put(struct tipc_subscriber *subscriber) +{ + kref_put(&subscriber->kref, tipc_subscrb_kref_release); +} - /* Now destroy subscription */ - kfree(sub); - atomic_dec(&tn->subscription_count); +static void tipc_subscrb_get(struct tipc_subscriber *subscriber) +{ + kref_get(&subscriber->kref); } -/** - * subscr_del - delete a subscription within a subscription list - * - * Called with subscriber lock held. 
- */ -static void subscr_del(struct tipc_subscription *sub) +static struct tipc_subscriber *tipc_subscrb_create(int conid) { - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + struct tipc_subscriber *subscriber; - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscription_list); - kfree(sub); - atomic_dec(&tn->subscription_count); + subscriber = kzalloc(sizeof(*subscriber), GFP_ATOMIC); + if (!subscriber) { + pr_warn("Subscriber rejected, no memory\n"); + return NULL; + } + kref_init(&subscriber->kref); + INIT_LIST_HEAD(&subscriber->subscrp_list); + subscriber->conid = conid; + spin_lock_init(&subscriber->lock); + + return subscriber; } -static void subscr_release(struct tipc_subscriber *subscriber) +static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { - struct tipc_subscription *sub; - struct tipc_subscription *sub_temp; + struct tipc_subscription *sub, *temp; spin_lock_bh(&subscriber->lock); - /* Destroy any existing subscriptions for subscriber */ - list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, - subscription_list) { - if (sub->timeout != TIPC_WAIT_FOREVER) { - spin_unlock_bh(&subscriber->lock); - del_timer_sync(&sub->timer); - spin_lock_bh(&subscriber->lock); + list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, + subscrp_list) { + if (del_timer(&sub->timer)) { + tipc_subscrp_delete(sub); + tipc_subscrb_put(subscriber); } - subscr_del(sub); } spin_unlock_bh(&subscriber->lock); - /* Now destroy subscriber */ - kfree(subscriber); + tipc_subscrb_put(subscriber); } -/** - * subscr_cancel - handle subscription cancellation request - * - * Called with subscriber lock held. Routine must temporarily release lock - * to enable the subscription timeout routine to finish without deadlocking; - * the lock is then reclaimed to allow caller to release it upon return. - * - * Note that fields of 's' use subscriber's endianness! 
- */ -static void subscr_cancel(struct tipc_subscr *s, - struct tipc_subscriber *subscriber) +static void tipc_subscrp_delete(struct tipc_subscription *sub) { - struct tipc_subscription *sub; - struct tipc_subscription *sub_temp; - int found = 0; + struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + + tipc_nametbl_unsubscribe(sub); + list_del(&sub->subscrp_list); + kfree(sub); + atomic_dec(&tn->subscription_count); +} + +static void tipc_subscrp_cancel(struct tipc_subscr *s, + struct tipc_subscriber *subscriber) +{ + struct tipc_subscription *sub, *temp; + spin_lock_bh(&subscriber->lock); /* Find first matching subscription, exit if not found */ - list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, - subscription_list) { + list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, + subscrp_list) { if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { - found = 1; + if (del_timer(&sub->timer)) { + tipc_subscrp_delete(sub); + tipc_subscrb_put(subscriber); + } break; } } - if (!found) - return; - - /* Cancel subscription timer (if used), then delete subscription */ - if (sub->timeout != TIPC_WAIT_FOREVER) { - sub->timeout = TIPC_WAIT_FOREVER; - spin_unlock_bh(&subscriber->lock); - del_timer_sync(&sub->timer); - spin_lock_bh(&subscriber->lock); - } - subscr_del(sub); + spin_unlock_bh(&subscriber->lock); } -/** - * subscr_subscribe - create subscription for subscriber - * - * Called with subscriber lock held. 
- */ -static int subscr_subscribe(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, - struct tipc_subscription **sub_p) +static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, + struct tipc_subscriber *subscriber, + struct tipc_subscription **sub_p) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_subscription *sub; @@ -241,7 +230,7 @@ static int subscr_subscribe(struct net *net, struct tipc_subscr *s, /* Detect & process a subscription cancellation request */ if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); - subscr_cancel(s, subscriber); + tipc_subscrp_cancel(s, subscriber); return 0; } @@ -273,62 +262,50 @@ static int subscr_subscribe(struct net *net, struct tipc_subscr *s, kfree(sub); return -EINVAL; } - list_add(&sub->subscription_list, &subscriber->subscription_list); + spin_lock_bh(&subscriber->lock); + list_add(&sub->subscrp_list, &subscriber->subscrp_list); + spin_unlock_bh(&subscriber->lock); sub->subscriber = subscriber; sub->swap = swap; - memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr)); + memcpy(&sub->evt.s, s, sizeof(*s)); atomic_inc(&tn->subscription_count); - if (sub->timeout != TIPC_WAIT_FOREVER) { - setup_timer(&sub->timer, subscr_timeout, (unsigned long)sub); - mod_timer(&sub->timer, jiffies + sub->timeout); - } + setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); + if (sub->timeout != TIPC_WAIT_FOREVER) + sub->timeout += jiffies; + if (!mod_timer(&sub->timer, sub->timeout)) + tipc_subscrb_get(subscriber); *sub_p = sub; return 0; } /* Handle one termination request for the subscriber */ -static void subscr_conn_shutdown_event(int conid, void *usr_data) +static void tipc_subscrb_shutdown_cb(int conid, void *usr_data) { - subscr_release((struct tipc_subscriber *)usr_data); + tipc_subscrb_delete((struct tipc_subscriber *)usr_data); } /* Handle one request to create a new subscription for the subscriber */ -static void 
subscr_conn_msg_event(struct net *net, int conid, - struct sockaddr_tipc *addr, void *usr_data, - void *buf, size_t len) +static void tipc_subscrb_rcv_cb(struct net *net, int conid, + struct sockaddr_tipc *addr, void *usr_data, + void *buf, size_t len) { - struct tipc_subscriber *subscriber = usr_data; + struct tipc_subscriber *subscrb = usr_data; struct tipc_subscription *sub = NULL; struct tipc_net *tn = net_generic(net, tipc_net_id); - spin_lock_bh(&subscriber->lock); - subscr_subscribe(net, (struct tipc_subscr *)buf, subscriber, &sub); - if (sub) - tipc_nametbl_subscribe(sub); - else - tipc_conn_terminate(tn->topsrv, subscriber->conid); - spin_unlock_bh(&subscriber->lock); + if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub)) + return tipc_conn_terminate(tn->topsrv, subscrb->conid); + + tipc_nametbl_subscribe(sub); } /* Handle one request to establish a new subscriber */ -static void *subscr_named_msg_event(int conid) +static void *tipc_subscrb_connect_cb(int conid) { - struct tipc_subscriber *subscriber; - - /* Create subscriber object */ - subscriber = kzalloc(sizeof(struct tipc_subscriber), GFP_ATOMIC); - if (subscriber == NULL) { - pr_warn("Subscriber rejected, no memory\n"); - return NULL; - } - INIT_LIST_HEAD(&subscriber->subscription_list); - subscriber->conid = conid; - spin_lock_init(&subscriber->lock); - - return (void *)subscriber; + return (void *)tipc_subscrb_create(conid); } -int tipc_subscr_start(struct net *net) +int tipc_topsrv_start(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); const char name[] = "topology_server"; @@ -355,9 +332,9 @@ int tipc_subscr_start(struct net *net) topsrv->imp = TIPC_CRITICAL_IMPORTANCE; topsrv->type = SOCK_SEQPACKET; topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr); - topsrv->tipc_conn_recvmsg = subscr_conn_msg_event; - topsrv->tipc_conn_new = subscr_named_msg_event; - topsrv->tipc_conn_shutdown = subscr_conn_shutdown_event; + topsrv->tipc_conn_recvmsg = 
tipc_subscrb_rcv_cb; + topsrv->tipc_conn_new = tipc_subscrb_connect_cb; + topsrv->tipc_conn_shutdown = tipc_subscrb_shutdown_cb; strncpy(topsrv->name, name, strlen(name) + 1); tn->topsrv = topsrv; @@ -366,7 +343,7 @@ int tipc_subscr_start(struct net *net) return tipc_server_start(topsrv); } -void tipc_subscr_stop(struct net *net) +void tipc_topsrv_stop(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_server *topsrv = tn->topsrv; diff --git a/kernel/net/tipc/subscr.h b/kernel/net/tipc/subscr.h index 33488bd9f..92ee18cc5 100644 --- a/kernel/net/tipc/subscr.h +++ b/kernel/net/tipc/subscr.h @@ -54,7 +54,7 @@ struct tipc_subscriber; * @filter: event filtering to be done for subscription * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list - * @subscription_list: adjacent subscriptions in subscriber's subscription list + * @subscrp_list: adjacent subscriptions in subscriber's subscription list * @server_ref: object reference of server port associated with subscription * @swap: indicates if subscriber uses opposite endianness in its messages * @evt: template for events generated by subscription @@ -67,17 +67,17 @@ struct tipc_subscription { u32 filter; struct timer_list timer; struct list_head nameseq_list; - struct list_head subscription_list; + struct list_head subscrp_list; int swap; struct tipc_event evt; }; -int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper); -void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node, int must); -int tipc_subscr_start(struct net *net); -void tipc_subscr_stop(struct net *net); +int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, + u32 found_upper); +void tipc_subscrp_report_overlap(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, u32 event, + u32 port_ref, u32 
node, int must); +int tipc_topsrv_start(struct net *net); +void tipc_topsrv_stop(struct net *net); #endif diff --git a/kernel/net/tipc/udp_media.c b/kernel/net/tipc/udp_media.c index 66deebc66..70c03271b 100644 --- a/kernel/net/tipc/udp_media.c +++ b/kernel/net/tipc/udp_media.c @@ -48,10 +48,13 @@ #include #include "core.h" #include "bearer.h" +#include "msg.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 +#define UDP_MIN_HEADROOM 28 + static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, @@ -153,11 +156,15 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, struct udp_bearer *ub; struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value; struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; - struct sk_buff *clone; struct rtable *rt; - clone = skb_clone(skb, GFP_ATOMIC); - skb_set_inner_protocol(clone, htons(ETH_P_TIPC)); + if (skb_headroom(skb) < UDP_MIN_HEADROOM) { + err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); + if (err) + goto tx_error; + } + + skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); ub = rcu_dereference_rtnl(b->media_ptr); if (!ub) { err = -ENODEV; @@ -167,7 +174,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, struct flowi4 fl = { .daddr = dst->ipv4.s_addr, .saddr = src->ipv4.s_addr, - .flowi4_mark = clone->mark, + .flowi4_mark = skb->mark, .flowi4_proto = IPPROTO_UDP }; rt = ip_route_output_key(net, &fl); @@ -176,7 +183,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, goto tx_error; } ttl = ip4_dst_hoplimit(&rt->dst); - err = udp_tunnel_xmit_skb(rt, ub->ubsock->sk, clone, + err = udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->udp_port, dst->udp_port, @@ -194,11 +201,12 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, .saddr = src->ipv6, .flowi6_proto = 
IPPROTO_UDP }; - err = ipv6_stub->ipv6_dst_lookup(ub->ubsock->sk, &ndst, &fl6); + err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, &ndst, + &fl6); if (err) goto tx_error; ttl = ip6_dst_hoplimit(ndst); - err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, clone, + err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, ndst->dev, &src->ipv6, &dst->ipv6, 0, ttl, src->udp_port, dst->udp_port, false); @@ -207,7 +215,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, return err; tx_error: - kfree_skb(clone); + kfree_skb(skb); return err; } @@ -216,6 +224,10 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) { struct udp_bearer *ub; struct tipc_bearer *b; + int usr = msg_user(buf_msg(skb)); + + if ((usr == LINK_PROTOCOL) || (usr == NAME_DISTRIBUTOR)) + skb_linearize(skb); ub = rcu_dereference_sk_user_data(sk); if (!ub) { @@ -424,7 +436,6 @@ static void tipc_udp_disable(struct tipc_bearer *b) } if (ub->ubsock) sock_set_flag(ub->ubsock->sk, SOCK_DEAD); - RCU_INIT_POINTER(b->media_ptr, NULL); RCU_INIT_POINTER(ub->bearer, NULL); /* sock_release need to be done outside of rtnl lock */ diff --git a/kernel/net/unix/af_unix.c b/kernel/net/unix/af_unix.c index 06430598c..898a53a56 100644 --- a/kernel/net/unix/af_unix.c +++ b/kernel/net/unix/af_unix.c @@ -140,12 +140,17 @@ static struct hlist_head *unix_sockets_unbound(void *addr) #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - memcpy(UNIXSID(skb), &scm->secid, sizeof(u32)); + UNIXCB(skb).secid = scm->secid; } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - scm->secid = *UNIXSID(skb); + scm->secid = UNIXCB(skb).secid; +} + +static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) +{ + return (scm->secid == UNIXCB(skb).secid); } #else static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) @@ -153,6 +158,11 @@ static inline void unix_get_secdata(struct 
scm_cookie *scm, struct sk_buff *skb) static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } + +static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) +{ + return true; +} #endif /* CONFIG_SECURITY_NETWORK */ /* @@ -316,9 +326,122 @@ found: return s; } -static inline int unix_writable(struct sock *sk) +/* Support code for asymmetrically connected dgram sockets + * + * If a datagram socket is connected to a socket not itself connected + * to the first socket (eg, /dev/log), clients may only enqueue more + * messages if the present receive queue of the server socket is not + * "too large". This means there's a second writeability condition + * poll and sendmsg need to test. The dgram recv code will do a wake + * up on the peer_wait wait queue of a socket upon reception of a + * datagram which needs to be propagated to sleeping would-be writers + * since these might not have sent anything so far. This can't be + * accomplished via poll_wait because the lifetime of the server + * socket might be less than that of its clients if these break their + * association with it or if the server socket is closed while clients + * are still connected to it and there's no way to inform "a polling + * implementation" that it should let go of a certain wait queue + * + * In order to propagate a wake up, a wait_queue_t of the client + * socket is enqueued on the peer_wait queue of the server socket + * whose wake function does a wake_up on the ordinary client socket + * wait queue. This connection is established whenever a write (or + * poll for write) hit the flow control condition and broken when the + * association to the server socket is dissolved or after a wake up + * was relayed. 
+ */ + +static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags, + void *key) +{ + struct unix_sock *u; + wait_queue_head_t *u_sleep; + + u = container_of(q, struct unix_sock, peer_wake); + + __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, + q); + u->peer_wake.private = NULL; + + /* relaying can only happen while the wq still exists */ + u_sleep = sk_sleep(&u->sk); + if (u_sleep) + wake_up_interruptible_poll(u_sleep, key); + + return 0; +} + +static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) { - return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; + struct unix_sock *u, *u_other; + int rc; + + u = unix_sk(sk); + u_other = unix_sk(other); + rc = 0; + spin_lock(&u_other->peer_wait.lock); + + if (!u->peer_wake.private) { + u->peer_wake.private = other; + __add_wait_queue(&u_other->peer_wait, &u->peer_wake); + + rc = 1; + } + + spin_unlock(&u_other->peer_wait.lock); + return rc; +} + +static void unix_dgram_peer_wake_disconnect(struct sock *sk, + struct sock *other) +{ + struct unix_sock *u, *u_other; + + u = unix_sk(sk); + u_other = unix_sk(other); + spin_lock(&u_other->peer_wait.lock); + + if (u->peer_wake.private == other) { + __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); + u->peer_wake.private = NULL; + } + + spin_unlock(&u_other->peer_wait.lock); +} + +static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, + struct sock *other) +{ + unix_dgram_peer_wake_disconnect(sk, other); + wake_up_interruptible_poll(sk_sleep(sk), + POLLOUT | + POLLWRNORM | + POLLWRBAND); +} + +/* preconditions: + * - unix_peer(sk) == other + * - association is stable + */ +static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) +{ + int connected; + + connected = unix_dgram_peer_wake_connect(sk, other); + + if (unix_recvq_full(other)) + return 1; + + if (connected) + unix_dgram_peer_wake_disconnect(sk, other); + + return 0; +} + +static int unix_writable(const struct sock *sk) 
+{ + return sk->sk_state != TCP_LISTEN && + (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; } static void unix_write_space(struct sock *sk) @@ -420,6 +543,8 @@ static void unix_release_sock(struct sock *sk, int embrion) skpair->sk_state_change(skpair); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); } + + unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; } @@ -430,6 +555,7 @@ static void unix_release_sock(struct sock *sk, int embrion) if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); /* passed fds are erased in the kfree_skb hook */ + UNIXCB(skb).consumed = skb->len; kfree_skb(skb); } @@ -518,6 +644,11 @@ static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); +static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset, + size_t size, int flags); +static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, + struct pipe_inode_info *, size_t size, + unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_dgram_connect(struct socket *, struct sockaddr *, @@ -558,7 +689,8 @@ static const struct proto_ops unix_stream_ops = { .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, + .sendpage = unix_stream_sendpage, + .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, }; @@ -620,7 +752,7 @@ static struct proto unix_proto = { */ static struct lock_class_key af_unix_sk_receive_queue_lock_key; -static struct sock *unix_create1(struct net *net, struct socket *sock) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) { 
struct sock *sk = NULL; struct unix_sock *u; @@ -629,7 +761,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); if (!sk) goto out; @@ -648,6 +780,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) INIT_LIST_HEAD(&u->link); mutex_init(&u->readlock); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); + init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) @@ -688,7 +821,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock) ? 0 : -ENOMEM; + return unix_create1(net, sock, kern) ? 0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -820,32 +953,20 @@ fail: return NULL; } -static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +static int unix_mknod(struct dentry *dentry, struct path *path, umode_t mode, + struct path *res) { - struct dentry *dentry; - struct path path; - int err = 0; - /* - * Get the parent directory, calculate the hash for last - * component. - */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - return err; + int err; - /* - * All right, let's create it. 
- */ - err = security_path_mknod(&path, dentry, mode, 0); + err = security_path_mknod(path, dentry, mode, 0); if (!err) { - err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0); + err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0); if (!err) { - res->mnt = mntget(path.mnt); + res->mnt = mntget(path->mnt); res->dentry = dget(dentry); } } - done_path_create(&path, dentry); + return err; } @@ -856,10 +977,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - int err; + int err, name_err; unsigned int hash; struct unix_address *addr; struct hlist_head *list; + struct path path; + struct dentry *dentry; err = -EINVAL; if (sunaddr->sun_family != AF_UNIX) @@ -875,14 +998,34 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; + name_err = 0; + dentry = NULL; + if (sun_path[0]) { + /* Get the parent directory, calculate the hash for last + * component. + */ + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); + + if (IS_ERR(dentry)) { + /* delay report until after 'already bound' check */ + name_err = PTR_ERR(dentry); + dentry = NULL; + } + } + err = mutex_lock_interruptible(&u->readlock); if (err) - goto out; + goto out_path; err = -EINVAL; if (u->addr) goto out_up; + if (name_err) { + err = name_err == -EEXIST ? 
-EADDRINUSE : name_err; + goto out_up; + } + err = -ENOMEM; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) @@ -893,11 +1036,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr->hash = hash ^ sk->sk_type; atomic_set(&addr->refcnt, 1); - if (sun_path[0]) { - struct path path; + if (dentry) { + struct path u_path; umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(sun_path, mode, &path); + err = unix_mknod(dentry, &path, mode, &u_path); if (err) { if (err == -EEXIST) err = -EADDRINUSE; @@ -905,9 +1048,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out_up; } addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1); + hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); - u->path = path; + u->path = u_path; list = &unix_socket_table[hash]; } else { spin_lock(&unix_table_lock); @@ -930,6 +1073,10 @@ out_unlock: spin_unlock(&unix_table_lock); out_up: mutex_unlock(&u->readlock); +out_path: + if (dentry) + done_path_create(&path, dentry); + out: return err; } @@ -1015,6 +1162,8 @@ restart: if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; + unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); + unix_state_double_unlock(sk, other); if (other != old_peer) @@ -1088,7 +1237,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL); + newsk = unix_create1(sock_net(sk), NULL, 0); if (newsk == NULL) goto out; @@ -1347,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) UNIXCB(skb).fp = NULL; for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->fp[i]); + unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_destruct_scm(struct sk_buff *skb) @@ -1364,6 
+1513,21 @@ static void unix_destruct_scm(struct sk_buff *skb) sock_wfree(skb); } +/* + * The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + #define MAX_RECURSION_LEVEL 4 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) @@ -1372,6 +1536,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) unsigned char max_level = 0; int unix_sock_count = 0; + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + for (i = scm->fp->count - 1; i >= 0; i--) { struct sock *sk = unix_get_socket(scm->fp->fp[i]); @@ -1393,10 +1560,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - if (unix_sock_count) { - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); - } + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->user, scm->fp->fp[i]); return max_level; } @@ -1408,6 +1573,7 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; + unix_get_secdata(scm, skb); if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); @@ -1415,6 +1581,14 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } +static bool unix_passcred_enabled(const struct socket *sock, + const struct sock *other) +{ + return test_bit(SOCK_PASSCRED, &sock->flags) || + !other->sk_socket || + test_bit(SOCK_PASSCRED, &other->sk_socket->flags); +} + /* * Some apps rely on write() 
giving SCM_CREDENTIALS * We include credentials if source or destination socket @@ -1425,14 +1599,41 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, { if (UNIXCB(skb).pid) return; - if (test_bit(SOCK_PASSCRED, &sock->flags) || - !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) { + if (unix_passcred_enabled(sock, other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } +static int maybe_init_creds(struct scm_cookie *scm, + struct socket *socket, + const struct sock *other) +{ + int err; + struct msghdr msg = { .msg_controllen = 0 }; + + err = scm_send(socket, &msg, scm, false); + if (err) + return err; + + if (unix_passcred_enabled(socket, other)) { + scm->pid = get_pid(task_tgid(current)); + current_uid_gid(&scm->creds.uid, &scm->creds.gid); + } + return err; +} + +static bool unix_skb_scm_eq(struct sk_buff *skb, + struct scm_cookie *scm) +{ + const struct unix_skb_parms *u = &UNIXCB(skb); + + return u->pid == scm->pid && + uid_eq(u->uid, scm->creds.uid) && + gid_eq(u->gid, scm->creds.gid) && + unix_secdata_eq(scm, skb); +} + /* * Send AF_UNIX data. 
*/ @@ -1453,6 +1654,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct scm_cookie scm; int max_level; int data_len = 0; + int sk_locked; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); @@ -1503,7 +1705,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out_free; max_level = err + 1; - unix_get_secdata(&scm, skb); skb_put(skb, len - data_len); skb->data_len = data_len; @@ -1532,12 +1733,14 @@ restart: goto out_free; } + sk_locked = 0; unix_state_lock(other); +restart_locked: err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; - if (sock_flag(other, SOCK_DEAD)) { + if (unlikely(sock_flag(other, SOCK_DEAD))) { /* * Check with 1003.1g - what should * datagram error @@ -1545,10 +1748,14 @@ restart: unix_state_unlock(other); sock_put(other); + if (!sk_locked) + unix_state_lock(sk); + err = 0; - unix_state_lock(sk); if (unix_peer(sk) == other) { unix_peer(sk) = NULL; + unix_dgram_peer_wake_disconnect_wakeup(sk, other); + unix_state_unlock(sk); unix_dgram_disconnected(sk, other); @@ -1574,21 +1781,43 @@ restart: goto out_unlock; } - if (unix_peer(other) != sk && unix_recvq_full(other)) { - if (!timeo) { - err = -EAGAIN; - goto out_unlock; + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + if (timeo) { + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out_free; + + goto restart; } - timeo = unix_wait_for_peer(other, timeo); + if (!sk_locked) { + unix_state_unlock(other); + unix_state_double_lock(sk, other); + } - err = sock_intr_errno(timeo); - if (signal_pending(current)) - goto out_free; + if (unix_peer(sk) != other || + unix_dgram_peer_wake_me(sk, other)) { + err = -EAGAIN; + sk_locked = 1; + goto 
out_unlock; + } - goto restart; + if (!sk_locked) { + sk_locked = 1; + goto restart_locked; + } } + if (unlikely(sk_locked)) + unix_state_unlock(sk); + if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); @@ -1602,6 +1831,8 @@ restart: return len; out_unlock: + if (sk_locked) + unix_state_unlock(sk); unix_state_unlock(other); out_free: kfree_skb(skb); @@ -1720,6 +1951,122 @@ out_err: return sent ? : err; } +static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, + int offset, size_t size, int flags) +{ + int err; + bool send_sigpipe = false; + bool init_scm = true; + struct scm_cookie scm; + struct sock *other, *sk = socket->sk; + struct sk_buff *skb, *newskb = NULL, *tail = NULL; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + other = unix_peer(sk); + if (!other || sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + if (false) { +alloc_skb: + unix_state_unlock(other); + mutex_unlock(&unix_sk(other)->readlock); + newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, + &err, 0); + if (!newskb) + goto err; + } + + /* we must acquire readlock as we modify already present + * skbs in the sk_receive_queue and mess with skb->len + */ + err = mutex_lock_interruptible(&unix_sk(other)->readlock); + if (err) { + err = flags & MSG_DONTWAIT ? 
-EAGAIN : -ERESTARTSYS; + goto err; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + err = -EPIPE; + send_sigpipe = true; + goto err_unlock; + } + + unix_state_lock(other); + + if (sock_flag(other, SOCK_DEAD) || + other->sk_shutdown & RCV_SHUTDOWN) { + err = -EPIPE; + send_sigpipe = true; + goto err_state_unlock; + } + + if (init_scm) { + err = maybe_init_creds(&scm, socket, other); + if (err) + goto err_state_unlock; + init_scm = false; + } + + skb = skb_peek_tail(&other->sk_receive_queue); + if (tail && tail == skb) { + skb = newskb; + } else if (!skb || !unix_skb_scm_eq(skb, &scm)) { + if (newskb) { + skb = newskb; + } else { + tail = skb; + goto alloc_skb; + } + } else if (newskb) { + /* this is fast path, we don't necessarily need to + * call to kfree_skb even though with newskb == NULL + * this - does no harm + */ + consume_skb(newskb); + newskb = NULL; + } + + if (skb_append_pagefrags(skb, page, offset, size)) { + tail = skb; + goto alloc_skb; + } + + skb->len += size; + skb->data_len += size; + skb->truesize += size; + atomic_add(size, &sk->sk_wmem_alloc); + + if (newskb) { + err = unix_scm_to_skb(&scm, skb, false); + if (err) + goto err_state_unlock; + spin_lock(&other->sk_receive_queue.lock); + __skb_queue_tail(&other->sk_receive_queue, newskb); + spin_unlock(&other->sk_receive_queue.lock); + } + + unix_state_unlock(other); + mutex_unlock(&unix_sk(other)->readlock); + + other->sk_data_ready(other); + scm_destroy(&scm); + return size; + +err_state_unlock: + unix_state_unlock(other); +err_unlock: + mutex_unlock(&unix_sk(other)->readlock); +err: + kfree_skb(newskb); + if (send_sigpipe && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + if (!init_scm) + scm_destroy(&scm); + return err; +} + static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { @@ -1860,8 +2207,9 @@ out: * Sleep until more data has arrived. But check for races.. 
*/ static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last) + struct sk_buff *last, unsigned int last_len) { + struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); @@ -1869,14 +2217,16 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (skb_peek_tail(&sk->sk_receive_queue) != last || + tail = skb_peek_tail(&sk->sk_receive_queue); + if (tail != last || + (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); timeo = freezable_schedule_timeout(timeo); unix_state_lock(sk); @@ -1884,7 +2234,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); @@ -1897,49 +2247,62 @@ static unsigned int unix_skb_len(const struct sk_buff *skb) return skb->len - UNIXCB(skb).consumed; } -static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, - size_t size, int flags) +struct unix_stream_read_state { + int (*recv_actor)(struct sk_buff *, int, int, + struct unix_stream_read_state *); + struct socket *socket; + struct msghdr *msg; + struct pipe_inode_info *pipe; + size_t size; + int flags; + unsigned int splice_flags; +}; + +static int unix_stream_read_generic(struct unix_stream_read_state *state) { struct scm_cookie scm; + struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); int copied = 0; + int flags = state->flags; int noblock = flags & MSG_DONTWAIT; - int check_creds = 0; + bool check_creds = false; int target; int err = 0; long timeo; int skip; + size_t size = 
state->size; + unsigned int last_len; - err = -EINVAL; - if (sk->sk_state != TCP_ESTABLISHED) + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + err = -EINVAL; goto out; + } - err = -EOPNOTSUPP; - if (flags&MSG_OOB) + if (unlikely(flags & MSG_OOB)) { + err = -EOPNOTSUPP; goto out; + } - target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); + memset(&scm, 0, sizeof(scm)); + /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ + mutex_lock(&u->readlock); - memset(&scm, 0, sizeof(scm)); - - err = mutex_lock_interruptible(&u->readlock); - if (unlikely(err)) { - /* recvmsg() in non blocking mode is supposed to return -EAGAIN - * sk_rcvtimeo is not honored by mutex_lock_interruptible() - */ - err = noblock ? -EAGAIN : -ERESTARTSYS; - goto out; - } + if (flags & MSG_PEEK) + skip = sk_peek_offset(sk, flags); + else + skip = 0; do { int chunk; + bool drop_skb; struct sk_buff *skb, *last; unix_state_lock(sk); @@ -1948,6 +2311,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); + last_len = last ? 
last->len : 0; again: if (skb == NULL) { unix_sk(sk)->recursion_level = 0; @@ -1965,29 +2329,33 @@ again: goto unlock; unix_state_unlock(sk); - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; break; + } + mutex_unlock(&u->readlock); - timeo = unix_stream_data_wait(sk, timeo, last); + timeo = unix_stream_data_wait(sk, timeo, last, + last_len); - if (signal_pending(current) - || mutex_lock_interruptible(&u->readlock)) { + if (signal_pending(current)) { err = sock_intr_errno(timeo); + scm_destroy(&scm); goto out; } + mutex_lock(&u->readlock); continue; - unlock: +unlock: unix_state_unlock(sk); break; } - skip = sk_peek_offset(sk, flags); while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; + last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; @@ -1997,25 +2365,30 @@ again: if (check_creds) { /* Never glue messages from different writers */ - if ((UNIXCB(skb).pid != scm.pid) || - !uid_eq(UNIXCB(skb).uid, scm.creds.uid) || - !gid_eq(UNIXCB(skb).gid, scm.creds.gid)) + if (!unix_skb_scm_eq(skb, &scm)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - check_creds = 1; + unix_set_secdata(&scm, skb); + check_creds = true; } /* Copy address just once */ - if (sunaddr) { - unix_copy_addr(msg, skb->sk); + if (state->msg && state->msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, + state->msg->msg_name); + unix_copy_addr(state->msg, skb->sk); sunaddr = NULL; } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); - if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, - msg, chunk)) { + skb_get(skb); + chunk = state->recv_actor(skb, skip, chunk, state); + drop_skb = !unix_skb_len(skb); + /* skb is only safe to use if !drop_skb */ + consume_skb(skb); + if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; @@ -2023,6 +2396,18 @@ again: copied += chunk; size -= chunk; + 
if (drop_skb) { + /* the skb was touched by a concurrent reader; + * we should not expect anything from this skb + * anymore and assume it invalid - we can be + * sure it was dropped from the socket queue + * + * let's report a short read + */ + err = 0; + break; + } + /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { UNIXCB(skb).consumed += chunk; @@ -2048,16 +2433,101 @@ again: sk_peek_offset_fwd(sk, chunk); + if (UNIXCB(skb).fp) + break; + + skip = 0; + last = skb; + last_len = skb->len; + unix_state_lock(sk); + skb = skb_peek_next(skb, &sk->sk_receive_queue); + if (skb) + goto again; + unix_state_unlock(sk); break; } } while (size); mutex_unlock(&u->readlock); - scm_recv(sock, msg, &scm, flags); + if (state->msg) + scm_recv(sock, state->msg, &scm, flags); + else + scm_destroy(&scm); out: return copied ? : err; } +static int unix_stream_read_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + int ret; + + ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, + state->msg, chunk); + return ret ?: chunk; +} + +static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sock, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state); +} + +static ssize_t skb_unix_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + struct unix_sock *u = unix_sk(sk); + + mutex_unlock(&u->readlock); + ret = splice_to_pipe(pipe, spd); + mutex_lock(&u->readlock); + + return ret; +} + +static int unix_stream_splice_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + return skb_splice_bits(skb, state->socket->sk, + UNIXCB(skb).consumed + skip, + state->pipe, chunk, state->splice_flags, + skb_unix_socket_splice); +} + +static ssize_t 
unix_stream_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t size, unsigned int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_splice_actor, + .socket = sock, + .pipe = pipe, + .size = size, + .splice_flags = flags, + }; + + if (unlikely(*ppos)) + return -ESPIPE; + + if (sock->file->f_flags & O_NONBLOCK || + flags & SPLICE_F_NONBLOCK) + state.flags = MSG_DONTWAIT; + + return unix_stream_read_generic(&state); +} + static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; @@ -2231,20 +2701,22 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, return mask; writable = unix_writable(sk); - other = unix_peer_get(sk); - if (other) { - if (unix_peer(other) != sk) { - sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); - if (unix_recvq_full(other)) - writable = 0; - } - sock_put(other); + if (writable) { + unix_state_lock(sk); + + other = unix_peer(sk); + if (other && unix_peer(other) != sk && + unix_recvq_full(other) && + unix_dgram_peer_wake_me(sk, other)) + writable = 0; + + unix_state_unlock(sk); } if (writable) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/kernel/net/unix/diag.c b/kernel/net/unix/diag.c index c512f64d5..4d9679701 100644 --- a/kernel/net/unix/diag.c +++ b/kernel/net/unix/diag.c @@ -220,7 +220,7 @@ done: return skb->len; } -static struct sock *unix_lookup_by_ino(int ino) +static struct sock *unix_lookup_by_ino(unsigned int ino) { int i; struct sock *sk; diff --git a/kernel/net/unix/garbage.c b/kernel/net/unix/garbage.c index a73a226f2..6a0d48525 100644 --- a/kernel/net/unix/garbage.c +++ b/kernel/net/unix/garbage.c @@ -116,15 +116,15 @@ struct sock *unix_get_socket(struct file *filp) * descriptor if it is for an AF_UNIX socket. 
*/ -void unix_inflight(struct file *fp) +void unix_inflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); + spin_lock(&unix_gc_lock); + if (s) { struct unix_sock *u = unix_sk(s); - spin_lock(&unix_gc_lock); - if (atomic_long_inc_return(&u->inflight) == 1) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); @@ -132,25 +132,28 @@ void unix_inflight(struct file *fp) BUG_ON(list_empty(&u->link)); } unix_tot_inflight++; - spin_unlock(&unix_gc_lock); } + user->unix_inflight++; + spin_unlock(&unix_gc_lock); } -void unix_notinflight(struct file *fp) +void unix_notinflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); + spin_lock(&unix_gc_lock); + if (s) { struct unix_sock *u = unix_sk(s); - spin_lock(&unix_gc_lock); BUG_ON(list_empty(&u->link)); if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); unix_tot_inflight--; - spin_unlock(&unix_gc_lock); } + user->unix_inflight--; + spin_unlock(&unix_gc_lock); } static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), diff --git a/kernel/net/vmw_vsock/af_vsock.c b/kernel/net/vmw_vsock/af_vsock.c index 2ec86e652..7fd1220fb 100644 --- a/kernel/net/vmw_vsock/af_vsock.c +++ b/kernel/net/vmw_vsock/af_vsock.c @@ -36,19 +36,20 @@ * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this - * implementation because they are in the SS_LISTEN state. When a connection - * request is received (the second kind of socket mentioned above), we create a - * new socket and refer to it as a pending socket. These pending sockets are - * placed on the pending connection list of the listener socket. When future - * packets are received for the address the listener socket is bound to, we - * check if the source of the packet is from one that has an existing pending - * connection. 
If it does, we process the packet for the pending socket. When - * that socket reaches the connected state, it is removed from the listener - * socket's pending list and enqueued in the listener socket's accept queue. - * Callers of accept(2) will accept connected sockets from the listener socket's - * accept queue. If the socket cannot be accepted for some reason then it is - * marked rejected. Once the connection is accepted, it is owned by the user - * process and the responsibility for cleanup falls with that user process. + * implementation because they are in the VSOCK_SS_LISTEN state. When a + * connection request is received (the second kind of socket mentioned above), + * we create a new socket and refer to it as a pending socket. These pending + * sockets are placed on the pending connection list of the listener socket. + * When future packets are received for the address the listener socket is + * bound to, we check if the source of the packet is from one that has an + * existing pending connection. If it does, we process the packet for the + * pending socket. When that socket reaches the connected state, it is removed + * from the listener socket's pending list and enqueued in the listener + * socket's accept queue. Callers of accept(2) will accept connected sockets + * from the listener socket's accept queue. If the socket cannot be accepted + * for some reason then it is marked rejected. Once the connection is + * accepted, it is owned by the user process and the responsibility for cleanup + * falls with that user process. 
* * - It is possible that these pending sockets will never reach the connected * state; in fact, we may never receive another packet after the connection @@ -114,8 +115,6 @@ static struct proto vsock_proto = { */ #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) -#define SS_LISTEN 255 - static const struct vsock_transport *transport; static DEFINE_MUTEX(vsock_register_mutex); @@ -581,13 +580,14 @@ struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, - unsigned short type) + unsigned short type, + int kern) { struct sock *sk; struct vsock_sock *psk; struct vsock_sock *vsk; - sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto); + sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL; @@ -886,7 +886,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, /* Listening sockets that have connections in their accept * queue can be read. */ - if (sk->sk_state == SS_LISTEN + if (sk->sk_state == VSOCK_SS_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= POLLIN | POLLRDNORM; @@ -1143,7 +1143,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, err = -EALREADY; break; default: - if ((sk->sk_state == SS_LISTEN) || + if ((sk->sk_state == VSOCK_SS_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; @@ -1255,7 +1255,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) goto out; } - if (listener->sk_state != SS_LISTEN) { + if (listener->sk_state != VSOCK_SS_LISTEN) { err = -EINVAL; goto out; } @@ -1347,7 +1347,7 @@ static int vsock_listen(struct socket *sock, int backlog) } sk->sk_max_ack_backlog = backlog; - sk->sk_state = SS_LISTEN; + sk->sk_state = VSOCK_SS_LISTEN; err = 0; @@ -1866,7 +1866,7 @@ static int vsock_create(struct net *net, struct socket *sock, sock->state = SS_UNCONNECTED; - return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 
0 : -ENOMEM; + return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; } static const struct net_proto_family vsock_family_ops = { @@ -1947,13 +1947,13 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner) err = misc_register(&vsock_device); if (err) { pr_err("Failed to register misc device\n"); - return -ENOENT; + goto err_reset_transport; } err = proto_register(&vsock_proto, 1); /* we want our slab */ if (err) { pr_err("Cannot register vsock protocol\n"); - goto err_misc_deregister; + goto err_deregister_misc; } err = sock_register(&vsock_family_ops); @@ -1968,8 +1968,9 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner) err_unregister_proto: proto_unregister(&vsock_proto); -err_misc_deregister: +err_deregister_misc: misc_deregister(&vsock_device); +err_reset_transport: transport = NULL; err_busy: mutex_unlock(&vsock_register_mutex); diff --git a/kernel/net/vmw_vsock/vmci_transport.c b/kernel/net/vmw_vsock/vmci_transport.c index c294da095..0a369bb44 100644 --- a/kernel/net/vmw_vsock/vmci_transport.c +++ b/kernel/net/vmw_vsock/vmci_transport.c @@ -40,13 +40,11 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg); static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg); -static void vmci_transport_peer_attach_cb(u32 sub_id, - const struct vmci_event_data *ed, - void *client_data); static void vmci_transport_peer_detach_cb(u32 sub_id, const struct vmci_event_data *ed, void *client_data); static void vmci_transport_recv_pkt_work(struct work_struct *work); +static void vmci_transport_cleanup(struct work_struct *work); static int vmci_transport_recv_listen(struct sock *sk, struct vmci_transport_packet *pkt); static int vmci_transport_recv_connecting_server( @@ -75,6 +73,10 @@ struct vmci_transport_recv_pkt_info { struct vmci_transport_packet pkt; }; +static LIST_HEAD(vmci_transport_cleanup_list); +static 
DEFINE_SPINLOCK(vmci_transport_cleanup_lock); +static DECLARE_WORK(vmci_transport_cleanup_work, vmci_transport_cleanup); + static struct vmci_handle vmci_transport_stream_handle = { VMCI_INVALID_ID, VMCI_INVALID_ID }; static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; @@ -90,8 +92,6 @@ static int PROTOCOL_OVERRIDE = -1; */ #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) -#define SS_LISTEN 255 - /* Helper function to convert from a VMCI error code to a VSock error code. */ static s32 vmci_transport_error_to_vsock_error(s32 vmci_error) @@ -791,44 +791,6 @@ out: return err; } -static void vmci_transport_peer_attach_cb(u32 sub_id, - const struct vmci_event_data *e_data, - void *client_data) -{ - struct sock *sk = client_data; - const struct vmci_event_payload_qp *e_payload; - struct vsock_sock *vsk; - - e_payload = vmci_event_data_const_payload(e_data); - - vsk = vsock_sk(sk); - - /* We don't ask for delayed CBs when we subscribe to this event (we - * pass 0 as flags to vmci_event_subscribe()). VMCI makes no - * guarantees in that case about what context we might be running in, - * so it could be BH or process, blockable or non-blockable. So we - * need to account for all possible contexts here. - */ - local_bh_disable(); - bh_lock_sock(sk); - - /* XXX This is lame, we should provide a way to lookup sockets by - * qp_handle. - */ - if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, - e_payload->handle)) { - /* XXX This doesn't do anything, but in the future we may want - * to set a flag here to verify the attach really did occur and - * we weren't just sent a datagram claiming it was. 
- */ - goto out; - } - -out: - bh_unlock_sock(sk); - local_bh_enable(); -} - static void vmci_transport_handle_detach(struct sock *sk) { struct vsock_sock *vsk; @@ -871,28 +833,38 @@ static void vmci_transport_peer_detach_cb(u32 sub_id, const struct vmci_event_data *e_data, void *client_data) { - struct sock *sk = client_data; + struct vmci_transport *trans = client_data; const struct vmci_event_payload_qp *e_payload; - struct vsock_sock *vsk; e_payload = vmci_event_data_const_payload(e_data); - vsk = vsock_sk(sk); - if (vmci_handle_is_invalid(e_payload->handle)) - return; - - /* Same rules for locking as for peer_attach_cb(). */ - local_bh_disable(); - bh_lock_sock(sk); /* XXX This is lame, we should provide a way to lookup sockets by * qp_handle. */ - if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, - e_payload->handle)) - vmci_transport_handle_detach(sk); + if (vmci_handle_is_invalid(e_payload->handle) || + vmci_handle_is_equal(trans->qp_handle, e_payload->handle)) + return; - bh_unlock_sock(sk); - local_bh_enable(); + /* We don't ask for delayed CBs when we subscribe to this event (we + * pass 0 as flags to vmci_event_subscribe()). VMCI makes no + * guarantees in that case about what context we might be running in, + * so it could be BH or process, blockable or non-blockable. So we + * need to account for all possible contexts here. + */ + spin_lock_bh(&trans->lock); + if (!trans->sk) + goto out; + + /* Apart from here, trans->lock is only grabbed as part of sk destruct, + * where trans->sk isn't locked. 
+ */ + bh_lock_sock(trans->sk); + + vmci_transport_handle_detach(trans->sk); + + bh_unlock_sock(trans->sk); + out: + spin_unlock_bh(&trans->lock); } static void vmci_transport_qp_resumed_cb(u32 sub_id, @@ -919,7 +891,7 @@ static void vmci_transport_recv_pkt_work(struct work_struct *work) vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context; switch (sk->sk_state) { - case SS_LISTEN: + case VSOCK_SS_LISTEN: vmci_transport_recv_listen(sk, pkt); break; case SS_CONNECTING: @@ -1022,7 +994,7 @@ static int vmci_transport_recv_listen(struct sock *sk, } pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type); + sk->sk_type, 0); if (!pending) { vmci_transport_send_reset(sk, pkt); return -ENOMEM; @@ -1181,7 +1153,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, */ err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH, vmci_transport_peer_detach_cb, - pending, &detach_sub_id); + vmci_trans(vpending), &detach_sub_id); if (err < VMCI_SUCCESS) { vmci_transport_send_reset(pending, pkt); err = vmci_transport_error_to_vsock_error(err); @@ -1262,7 +1234,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, /* Callers of accept() will be be waiting on the listening socket, not * the pending socket. 
*/ - listener->sk_state_change(listener); + listener->sk_data_ready(listener); return 0; @@ -1321,7 +1293,6 @@ vmci_transport_recv_connecting_client(struct sock *sk, || vmci_trans(vsk)->qpair || vmci_trans(vsk)->produce_size != 0 || vmci_trans(vsk)->consume_size != 0 - || vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID || vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) { skerr = EPROTO; err = -EINVAL; @@ -1389,7 +1360,6 @@ static int vmci_transport_recv_connecting_client_negotiate( struct vsock_sock *vsk; struct vmci_handle handle; struct vmci_qp *qpair; - u32 attach_sub_id; u32 detach_sub_id; bool is_local; u32 flags; @@ -1399,7 +1369,6 @@ static int vmci_transport_recv_connecting_client_negotiate( vsk = vsock_sk(sk); handle = VMCI_INVALID_HANDLE; - attach_sub_id = VMCI_INVALID_ID; detach_sub_id = VMCI_INVALID_ID; /* If we have gotten here then we should be past the point where old @@ -1444,23 +1413,15 @@ static int vmci_transport_recv_connecting_client_negotiate( goto destroy; } - /* Subscribe to attach and detach events first. + /* Subscribe to detach events first. * * XXX We attach once for each queue pair created for now so it is easy * to find the socket (it's provided), but later we should only * subscribe once and add a way to lookup sockets by queue pair handle. 
*/ - err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_ATTACH, - vmci_transport_peer_attach_cb, - sk, &attach_sub_id); - if (err < VMCI_SUCCESS) { - err = vmci_transport_error_to_vsock_error(err); - goto destroy; - } - err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH, vmci_transport_peer_detach_cb, - sk, &detach_sub_id); + vmci_trans(vsk), &detach_sub_id); if (err < VMCI_SUCCESS) { err = vmci_transport_error_to_vsock_error(err); goto destroy; @@ -1496,7 +1457,6 @@ static int vmci_transport_recv_connecting_client_negotiate( vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = pkt->u.size; - vmci_trans(vsk)->attach_sub_id = attach_sub_id; vmci_trans(vsk)->detach_sub_id = detach_sub_id; vmci_trans(vsk)->notify_ops->process_negotiate(sk); @@ -1504,9 +1464,6 @@ static int vmci_transport_recv_connecting_client_negotiate( return 0; destroy: - if (attach_sub_id != VMCI_INVALID_ID) - vmci_event_unsubscribe(attach_sub_id); - if (detach_sub_id != VMCI_INVALID_ID) vmci_event_unsubscribe(detach_sub_id); @@ -1607,9 +1564,11 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk, vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE; vmci_trans(vsk)->qpair = NULL; vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = 0; - vmci_trans(vsk)->attach_sub_id = vmci_trans(vsk)->detach_sub_id = - VMCI_INVALID_ID; + vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID; vmci_trans(vsk)->notify_ops = NULL; + INIT_LIST_HEAD(&vmci_trans(vsk)->elem); + vmci_trans(vsk)->sk = &vsk->sk; + spin_lock_init(&vmci_trans(vsk)->lock); if (psk) { vmci_trans(vsk)->queue_pair_size = vmci_trans(psk)->queue_pair_size; @@ -1629,29 +1588,57 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk, return 0; } -static void vmci_transport_destruct(struct vsock_sock *vsk) +static void vmci_transport_free_resources(struct list_head *transport_list) { - if (vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID) { - vmci_event_unsubscribe(vmci_trans(vsk)->attach_sub_id); - 
vmci_trans(vsk)->attach_sub_id = VMCI_INVALID_ID; - } + while (!list_empty(transport_list)) { + struct vmci_transport *transport = + list_first_entry(transport_list, struct vmci_transport, + elem); + list_del(&transport->elem); - if (vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) { - vmci_event_unsubscribe(vmci_trans(vsk)->detach_sub_id); - vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID; - } + if (transport->detach_sub_id != VMCI_INVALID_ID) { + vmci_event_unsubscribe(transport->detach_sub_id); + transport->detach_sub_id = VMCI_INVALID_ID; + } - if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) { - vmci_qpair_detach(&vmci_trans(vsk)->qpair); - vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE; - vmci_trans(vsk)->produce_size = 0; - vmci_trans(vsk)->consume_size = 0; + if (!vmci_handle_is_invalid(transport->qp_handle)) { + vmci_qpair_detach(&transport->qpair); + transport->qp_handle = VMCI_INVALID_HANDLE; + transport->produce_size = 0; + transport->consume_size = 0; + } + + kfree(transport); } +} + +static void vmci_transport_cleanup(struct work_struct *work) +{ + LIST_HEAD(pending); + + spin_lock_bh(&vmci_transport_cleanup_lock); + list_replace_init(&vmci_transport_cleanup_list, &pending); + spin_unlock_bh(&vmci_transport_cleanup_lock); + vmci_transport_free_resources(&pending); +} + +static void vmci_transport_destruct(struct vsock_sock *vsk) +{ + /* Ensure that the detach callback doesn't use the sk/vsk + * we are about to destruct. 
+ */ + spin_lock_bh(&vmci_trans(vsk)->lock); + vmci_trans(vsk)->sk = NULL; + spin_unlock_bh(&vmci_trans(vsk)->lock); if (vmci_trans(vsk)->notify_ops) vmci_trans(vsk)->notify_ops->socket_destruct(vsk); - kfree(vsk->trans); + spin_lock_bh(&vmci_transport_cleanup_lock); + list_add(&vmci_trans(vsk)->elem, &vmci_transport_cleanup_list); + spin_unlock_bh(&vmci_transport_cleanup_lock); + schedule_work(&vmci_transport_cleanup_work); + vsk->trans = NULL; } @@ -2146,6 +2133,9 @@ module_init(vmci_transport_init); static void __exit vmci_transport_exit(void) { + cancel_work_sync(&vmci_transport_cleanup_work); + vmci_transport_free_resources(&vmci_transport_cleanup_list); + if (!vmci_handle_is_invalid(vmci_transport_stream_handle)) { if (vmci_datagram_destroy_handle( vmci_transport_stream_handle) != VMCI_SUCCESS) @@ -2164,6 +2154,7 @@ module_exit(vmci_transport_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMCI transport for Virtual Sockets"); +MODULE_VERSION("1.0.2.0-k"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("vmware_vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK); diff --git a/kernel/net/vmw_vsock/vmci_transport.h b/kernel/net/vmw_vsock/vmci_transport.h index ce6c9623d..2ad46f396 100644 --- a/kernel/net/vmw_vsock/vmci_transport.h +++ b/kernel/net/vmw_vsock/vmci_transport.h @@ -119,10 +119,12 @@ struct vmci_transport { u64 queue_pair_size; u64 queue_pair_min_size; u64 queue_pair_max_size; - u32 attach_sub_id; u32 detach_sub_id; union vmci_transport_notify notify; struct vmci_transport_notify_ops *notify_ops; + struct list_head elem; + struct sock *sk; + spinlock_t lock; /* protects sk. */ }; int vmci_transport_register(void); diff --git a/kernel/net/wimax/op-rfkill.c b/kernel/net/wimax/op-rfkill.c index 7d730543f..477364ad7 100644 --- a/kernel/net/wimax/op-rfkill.c +++ b/kernel/net/wimax/op-rfkill.c @@ -135,8 +135,7 @@ EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw); * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on, * %WIMAX_RF_OFF radio off. 
* - * Reports changes in the software RF switch state to the the WiMAX - * stack. + * Reports changes in the software RF switch state to the WiMAX stack. * * The main use is during initialization, so the driver can query the * device for its current software radio kill switch state and feed it diff --git a/kernel/net/wireless/Kconfig b/kernel/net/wireless/Kconfig index 4f5543dd2..da72ed32f 100644 --- a/kernel/net/wireless/Kconfig +++ b/kernel/net/wireless/Kconfig @@ -174,6 +174,16 @@ config CFG80211_INTERNAL_REGDB Most distributions have a CRDA package. So if unsure, say N. +config CFG80211_CRDA_SUPPORT + bool "support CRDA" if CFG80211_INTERNAL_REGDB + default y + depends on CFG80211 + help + You should enable this option unless you know for sure you have no + need for it, for example when using internal regdb (above.) + + If unsure, say Y. + config CFG80211_WEXT bool "cfg80211 wireless extensions compatibility" if !CFG80211_WEXT_EXPORT depends on CFG80211 diff --git a/kernel/net/wireless/chan.c b/kernel/net/wireless/chan.c index 7aaf7415d..59cabc9bc 100644 --- a/kernel/net/wireless/chan.c +++ b/kernel/net/wireless/chan.c @@ -698,19 +698,20 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, EXPORT_SYMBOL(cfg80211_chandef_usable); /* - * For GO only, check if the channel can be used under permissive conditions - * mandated by the some regulatory bodies, i.e., the channel is marked with - * IEEE80211_CHAN_GO_CONCURRENT and there is an additional station interface + * Check if the channel can be used under permissive conditions mandated by + * some regulatory bodies, i.e., the channel is marked with + * IEEE80211_CHAN_IR_CONCURRENT and there is an additional station interface * associated to an AP on the same channel or on the same UNII band * (assuming that the AP is an authorized master). 
- * In addition allow the GO to operate on a channel on which indoor operation is + * In addition allow operation on a channel on which indoor operation is * allowed, iff we are currently operating in an indoor environment. */ -static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, +static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, + enum nl80211_iftype iftype, struct ieee80211_channel *chan) { - struct wireless_dev *wdev_iter; - struct wiphy *wiphy = wiphy_idx_to_wiphy(rdev->wiphy_idx); + struct wireless_dev *wdev; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); ASSERT_RTNL(); @@ -718,32 +719,48 @@ static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)) return false; + /* only valid for GO and TDLS off-channel (station/p2p-CL) */ + if (iftype != NL80211_IFTYPE_P2P_GO && + iftype != NL80211_IFTYPE_STATION && + iftype != NL80211_IFTYPE_P2P_CLIENT) + return false; + if (regulatory_indoor_allowed() && (chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) return true; - if (!(chan->flags & IEEE80211_CHAN_GO_CONCURRENT)) + if (!(chan->flags & IEEE80211_CHAN_IR_CONCURRENT)) return false; /* * Generally, it is possible to rely on another device/driver to allow - * the GO concurrent relaxation, however, since the device can further + * the IR concurrent relaxation, however, since the device can further * enforce the relaxation (by doing a similar verifications as this), * and thus fail the GO instantiation, consider only the interfaces of * the current registered device. 
*/ - list_for_each_entry(wdev_iter, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wdev_list, list) { struct ieee80211_channel *other_chan = NULL; int r1, r2; - if (wdev_iter->iftype != NL80211_IFTYPE_STATION || - !netif_running(wdev_iter->netdev)) - continue; - - wdev_lock(wdev_iter); - if (wdev_iter->current_bss) - other_chan = wdev_iter->current_bss->pub.channel; - wdev_unlock(wdev_iter); + wdev_lock(wdev); + if (wdev->iftype == NL80211_IFTYPE_STATION && + wdev->current_bss) + other_chan = wdev->current_bss->pub.channel; + + /* + * If a GO already operates on the same GO_CONCURRENT channel, + * this one (maybe the same one) can beacon as well. We allow + * the operation even if the station we relied on with + * GO_CONCURRENT is disconnected now. But then we must make sure + * we're not outdoor on an indoor-only channel. + */ + if (iftype == NL80211_IFTYPE_P2P_GO && + wdev->iftype == NL80211_IFTYPE_P2P_GO && + wdev->beacon_interval && + !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) + other_chan = wdev->chandef.chan; + wdev_unlock(wdev); if (!other_chan) continue; @@ -780,25 +797,18 @@ static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, return false; } -bool cfg80211_reg_can_beacon(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef, - enum nl80211_iftype iftype) +static bool _cfg80211_reg_can_beacon(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype, + bool check_no_ir) { - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); bool res; u32 prohibited_flags = IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_RADAR; - trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype); + trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir); - /* - * Under certain conditions suggested by the some regulatory bodies - * a GO can operate on channels marked with IEEE80211_NO_IR - * so set this flag only if such relaxations are not enabled and - * the conditions are not met. 
- */ - if (iftype != NL80211_IFTYPE_P2P_GO || - !cfg80211_go_permissive_chan(rdev, chandef->chan)) + if (check_no_ir) prohibited_flags |= IEEE80211_CHAN_NO_IR; if (cfg80211_chandef_dfs_required(wiphy, chandef, iftype) > 0 && @@ -812,8 +822,36 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy, trace_cfg80211_return_bool(res); return res; } + +bool cfg80211_reg_can_beacon(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) +{ + return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, true); +} EXPORT_SYMBOL(cfg80211_reg_can_beacon); +bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) +{ + bool check_no_ir; + + ASSERT_RTNL(); + + /* + * Under certain conditions suggested by some regulatory bodies a + * GO/STA can IR on channels marked with IEEE80211_NO_IR. Set this flag + * only if such relaxations are not enabled and the conditions are not + * met. + */ + check_no_ir = !cfg80211_ir_permissive_chan(wiphy, iftype, + chandef->chan); + + return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir); +} +EXPORT_SYMBOL(cfg80211_reg_can_beacon_relax); + int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef) { diff --git a/kernel/net/wireless/core.c b/kernel/net/wireless/core.c index 2a0bbd228..8f0bac7e0 100644 --- a/kernel/net/wireless/core.c +++ b/kernel/net/wireless/core.c @@ -407,6 +407,9 @@ use_default_name: INIT_LIST_HEAD(&rdev->bss_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results); + INIT_LIST_HEAD(&rdev->mlme_unreg); + spin_lock_init(&rdev->mlme_unreg_lock); + INIT_WORK(&rdev->mlme_unreg_wk, cfg80211_mlme_unreg_wk); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT @@ -416,6 +419,7 @@ use_default_name: device_initialize(&rdev->wiphy.dev); 
rdev->wiphy.dev.class = &ieee80211_class; rdev->wiphy.dev.platform_data = rdev; + device_enable_async_suspend(&rdev->wiphy.dev); INIT_LIST_HEAD(&rdev->destroy_list); spin_lock_init(&rdev->destroy_list_lock); @@ -457,6 +461,9 @@ use_default_name: rdev->wiphy.max_num_csa_counters = 1; + rdev->wiphy.max_sched_scan_plans = 1; + rdev->wiphy.max_sched_scan_plan_interval = U32_MAX; + return &rdev->wiphy; } EXPORT_SYMBOL(wiphy_new_nm); @@ -632,7 +639,7 @@ int wiphy_register(struct wiphy *wiphy) if (WARN_ON(!sband->n_channels)) return -EINVAL; /* - * on 60gHz band, there are no legacy rates, so + * on 60GHz band, there are no legacy rates, so * n_bitrates is 0 */ if (WARN_ON(band != IEEE80211_BAND_60GHZ && @@ -802,6 +809,7 @@ void wiphy_unregister(struct wiphy *wiphy) cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->sched_scan_stop_wk); + flush_work(&rdev->mlme_unreg_wk); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) @@ -855,6 +863,7 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: + cfg80211_mlme_purge_registrations(wdev); cfg80211_stop_p2p_device(rdev, wdev); break; default: @@ -1138,6 +1147,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, return NOTIFY_DONE; } + wireless_nlevent_flush(); + return NOTIFY_OK; } diff --git a/kernel/net/wireless/core.h b/kernel/net/wireless/core.h index 801cd49c5..a618b4b86 100644 --- a/kernel/net/wireless/core.h +++ b/kernel/net/wireless/core.h @@ -59,6 +59,10 @@ struct cfg80211_registered_device { struct list_head beacon_registrations; spinlock_t beacon_registrations_lock; + struct list_head mlme_unreg; + spinlock_t mlme_unreg_lock; + struct work_struct mlme_unreg_wk; + /* protected by RTNL only */ int num_running_ifaces; int num_running_monitor_ifaces; @@ -133,6 +137,7 @@ struct cfg80211_internal_bss { struct list_head list; struct list_head hidden_list; struct 
rb_node rbn; + u64 ts_boottime; unsigned long ts; unsigned long refcount; atomic_t hold; @@ -222,6 +227,7 @@ struct cfg80211_event { const u8 *ie; size_t ie_len; u16 reason; + bool locally_generated; } dc; struct { u8 bssid[ETH_ALEN]; @@ -347,6 +353,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, int match_len); +void cfg80211_mlme_unreg_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/kernel/net/wireless/mlme.c b/kernel/net/wireless/mlme.c index 7aae329e2..fb44fa3bf 100644 --- a/kernel/net/wireless/mlme.c +++ b/kernel/net/wireless/mlme.c @@ -2,6 +2,7 @@ * cfg80211 MLME SAP interface * * Copyright (c) 2009, Jouni Malinen + * Copyright (c) 2015 Intel Deutschland GmbH */ #include @@ -389,6 +390,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct cfg80211_mgmt_registration { struct list_head list; + struct wireless_dev *wdev; u32 nlportid; @@ -399,6 +401,46 @@ struct cfg80211_mgmt_registration { u8 match[]; }; +static void +cfg80211_process_mlme_unregistrations(struct cfg80211_registered_device *rdev) +{ + struct cfg80211_mgmt_registration *reg; + + ASSERT_RTNL(); + + spin_lock_bh(&rdev->mlme_unreg_lock); + while ((reg = list_first_entry_or_null(&rdev->mlme_unreg, + struct cfg80211_mgmt_registration, + list))) { + list_del(®->list); + spin_unlock_bh(&rdev->mlme_unreg_lock); + + if (rdev->ops->mgmt_frame_register) { + u16 frame_type = le16_to_cpu(reg->frame_type); + + rdev_mgmt_frame_register(rdev, reg->wdev, + frame_type, false); + } + + kfree(reg); + + spin_lock_bh(&rdev->mlme_unreg_lock); + } + spin_unlock_bh(&rdev->mlme_unreg_lock); +} + +void cfg80211_mlme_unreg_wk(struct work_struct *wk) +{ + struct 
cfg80211_registered_device *rdev; + + rdev = container_of(wk, struct cfg80211_registered_device, + mlme_unreg_wk); + + rtnl_lock(); + cfg80211_process_mlme_unregistrations(rdev); + rtnl_unlock(); +} + int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, int match_len) @@ -449,11 +491,18 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, nreg->match_len = match_len; nreg->nlportid = snd_portid; nreg->frame_type = cpu_to_le16(frame_type); + nreg->wdev = wdev; list_add(&nreg->list, &wdev->mgmt_registrations); + spin_unlock_bh(&wdev->mgmt_registrations_lock); + + /* process all unregistrations to avoid driver confusion */ + cfg80211_process_mlme_unregistrations(rdev); if (rdev->ops->mgmt_frame_register) rdev_mgmt_frame_register(rdev, wdev, frame_type, true); + return 0; + out: spin_unlock_bh(&wdev->mgmt_registrations_lock); @@ -472,15 +521,12 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) if (reg->nlportid != nlportid) continue; - if (rdev->ops->mgmt_frame_register) { - u16 frame_type = le16_to_cpu(reg->frame_type); - - rdev_mgmt_frame_register(rdev, wdev, - frame_type, false); - } - list_del(®->list); - kfree(reg); + spin_lock(&rdev->mlme_unreg_lock); + list_add_tail(®->list, &rdev->mlme_unreg); + spin_unlock(&rdev->mlme_unreg_lock); + + schedule_work(&rdev->mlme_unreg_wk); } spin_unlock_bh(&wdev->mgmt_registrations_lock); @@ -496,16 +542,15 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { - struct cfg80211_mgmt_registration *reg, *tmp; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); spin_lock_bh(&wdev->mgmt_registrations_lock); - - list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { - list_del(®->list); - kfree(reg); - } - + spin_lock(&rdev->mlme_unreg_lock); + list_splice_tail_init(&wdev->mgmt_registrations, 
&rdev->mlme_unreg); + spin_unlock(&rdev->mlme_unreg_lock); spin_unlock_bh(&wdev->mgmt_registrations_lock); + + cfg80211_process_mlme_unregistrations(rdev); } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/kernel/net/wireless/nl80211.c b/kernel/net/wireless/nl80211.c index dd78445c7..75b0d23ee 100644 --- a/kernel/net/wireless/nl80211.c +++ b/kernel/net/wireless/nl80211.c @@ -3,6 +3,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2015 Intel Deutschland GmbH */ #include @@ -478,6 +479,12 @@ nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, }; +static const struct nla_policy +nl80211_plan_policy[NL80211_SCHED_SCAN_PLAN_MAX + 1] = { + [NL80211_SCHED_SCAN_PLAN_INTERVAL] = { .type = NLA_U32 }, + [NL80211_SCHED_SCAN_PLAN_ITERATIONS] = { .type = NLA_U32 }, +}; + static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg80211_registered_device **rdev, @@ -639,8 +646,8 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, if ((chan->flags & IEEE80211_CHAN_INDOOR_ONLY) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_INDOOR_ONLY)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_GO_CONCURRENT) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_GO_CONCURRENT)) + if ((chan->flags & IEEE80211_CHAN_IR_CONCURRENT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_IR_CONCURRENT)) goto nla_put_failure; if ((chan->flags & IEEE80211_CHAN_NO_20MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_20MHZ)) @@ -1303,7 +1310,13 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, nla_put_u16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN, rdev->wiphy.max_sched_scan_ie_len) || nla_put_u8(msg, NL80211_ATTR_MAX_MATCH_SETS, - rdev->wiphy.max_match_sets)) + rdev->wiphy.max_match_sets) || + nla_put_u32(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_PLANS, + 
rdev->wiphy.max_sched_scan_plans) || + nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_INTERVAL, + rdev->wiphy.max_sched_scan_plan_interval) || + nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS, + rdev->wiphy.max_sched_scan_plan_iterations)) goto nla_put_failure; if ((rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) && @@ -2003,7 +2016,8 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, switch (iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, iftype)) { + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef, + iftype)) { result = -EINVAL; break; } @@ -2320,6 +2334,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.frag_threshold = old_frag_threshold; rdev->wiphy.rts_threshold = old_rts_threshold; rdev->wiphy.coverage_class = old_coverage_class; + return result; } } return 0; @@ -2401,6 +2416,16 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag } } + if (rdev->ops->get_tx_power) { + int dbm, ret; + + ret = rdev_get_tx_power(rdev, wdev, &dbm); + if (ret == 0 && + nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, + DBM_TO_MBM(dbm))) + goto nla_put_failure; + } + if (wdev->ssid_len) { if (nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid)) goto nla_put_failure; @@ -3403,16 +3428,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) } else if (!nl80211_get_ap_channel(rdev, ¶ms)) return -EINVAL; - if (!cfg80211_reg_can_beacon(&rdev->wiphy, ¶ms.chandef, - wdev->iftype)) + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, ¶ms.chandef, + wdev->iftype)) return -EINVAL; - if (info->attrs[NL80211_ATTR_ACL_POLICY]) { - params.acl = parse_acl_data(&rdev->wiphy, info); - if (IS_ERR(params.acl)) - return PTR_ERR(params.acl); - } - if (info->attrs[NL80211_ATTR_SMPS_MODE]) { params.smps_mode = nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); @@ -3436,6 +3455,12 @@ static int 
nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) params.smps_mode = NL80211_SMPS_OFF; } + if (info->attrs[NL80211_ATTR_ACL_POLICY]) { + params.acl = parse_acl_data(&rdev->wiphy, info); + if (IS_ERR(params.acl)) + return PTR_ERR(params.acl); + } + wdev_lock(wdev); err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { @@ -3943,10 +3968,13 @@ int cfg80211_check_station_change(struct wiphy *wiphy, struct station_parameters *params, enum cfg80211_station_type statype) { - if (params->listen_interval != -1) + if (params->listen_interval != -1 && + statype != CFG80211_STA_AP_CLIENT_UNASSOC) return -EINVAL; + if (params->aid && - !(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) + !(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) && + statype != CFG80211_STA_AP_CLIENT_UNASSOC) return -EINVAL; /* When you run into this, adjust the code below for the new flag */ @@ -3996,7 +4024,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, params->sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER); } - if (statype != CFG80211_STA_TDLS_PEER_SETUP) { + if (statype != CFG80211_STA_TDLS_PEER_SETUP && + statype != CFG80211_STA_AP_CLIENT_UNASSOC) { /* reject other things that can't change */ if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) return -EINVAL; @@ -4008,7 +4037,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; } - if (statype != CFG80211_STA_AP_CLIENT) { + if (statype != CFG80211_STA_AP_CLIENT && + statype != CFG80211_STA_AP_CLIENT_UNASSOC) { if (params->vlan) return -EINVAL; } @@ -4020,6 +4050,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EOPNOTSUPP; break; case CFG80211_STA_AP_CLIENT: + case CFG80211_STA_AP_CLIENT_UNASSOC: /* accept only the listed bits */ if (params->sta_flags_mask & ~(BIT(NL80211_STA_FLAG_AUTHORIZED) | @@ -4061,7 +4092,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; break; case CFG80211_STA_MESH_PEER_USER: - if (params->plink_action != 
NL80211_PLINK_ACTION_NO_ACTION) + if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION && + params->plink_action != NL80211_PLINK_ACTION_BLOCK) return -EINVAL; break; } @@ -4216,13 +4248,22 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) memset(¶ms, 0, sizeof(params)); - params.listen_interval = -1; - if (!rdev->ops->change_station) return -EOPNOTSUPP; - if (info->attrs[NL80211_ATTR_STA_AID]) - return -EINVAL; + /* + * AID and listen_interval properties can be set only for unassociated + * station. Include these parameters here and will check them in + * cfg80211_check_station_change(). + */ + if (info->attrs[NL80211_ATTR_PEER_AID]) + params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]); + + if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) + params.listen_interval = + nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + else + params.listen_interval = -1; if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -4249,9 +4290,6 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_len(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]); } - if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) - return -EINVAL; - if (parse_station_flags(info, dev->ieee80211_ptr->iftype, ¶ms)) return -EINVAL; @@ -4915,56 +4953,6 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) return err; } -static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = { - [NL80211_ATTR_REG_RULE_FLAGS] = { .type = NLA_U32 }, - [NL80211_ATTR_FREQ_RANGE_START] = { .type = NLA_U32 }, - [NL80211_ATTR_FREQ_RANGE_END] = { .type = NLA_U32 }, - [NL80211_ATTR_FREQ_RANGE_MAX_BW] = { .type = NLA_U32 }, - [NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN] = { .type = NLA_U32 }, - [NL80211_ATTR_POWER_RULE_MAX_EIRP] = { .type = NLA_U32 }, - [NL80211_ATTR_DFS_CAC_TIME] = { .type = NLA_U32 }, -}; - -static int parse_reg_rule(struct nlattr *tb[], - struct ieee80211_reg_rule *reg_rule) -{ - struct 
ieee80211_freq_range *freq_range = ®_rule->freq_range; - struct ieee80211_power_rule *power_rule = ®_rule->power_rule; - - if (!tb[NL80211_ATTR_REG_RULE_FLAGS]) - return -EINVAL; - if (!tb[NL80211_ATTR_FREQ_RANGE_START]) - return -EINVAL; - if (!tb[NL80211_ATTR_FREQ_RANGE_END]) - return -EINVAL; - if (!tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]) - return -EINVAL; - if (!tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]) - return -EINVAL; - - reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]); - - freq_range->start_freq_khz = - nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]); - freq_range->end_freq_khz = - nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]); - freq_range->max_bandwidth_khz = - nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]); - - power_rule->max_eirp = - nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]); - - if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]) - power_rule->max_antenna_gain = - nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]); - - if (tb[NL80211_ATTR_DFS_CAC_TIME]) - reg_rule->dfs_cac_ms = - nla_get_u32(tb[NL80211_ATTR_DFS_CAC_TIME]); - - return 0; -} - static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) { char *data = NULL; @@ -5596,6 +5584,57 @@ out_err: return err; } +#ifdef CONFIG_CFG80211_CRDA_SUPPORT +static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = { + [NL80211_ATTR_REG_RULE_FLAGS] = { .type = NLA_U32 }, + [NL80211_ATTR_FREQ_RANGE_START] = { .type = NLA_U32 }, + [NL80211_ATTR_FREQ_RANGE_END] = { .type = NLA_U32 }, + [NL80211_ATTR_FREQ_RANGE_MAX_BW] = { .type = NLA_U32 }, + [NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN] = { .type = NLA_U32 }, + [NL80211_ATTR_POWER_RULE_MAX_EIRP] = { .type = NLA_U32 }, + [NL80211_ATTR_DFS_CAC_TIME] = { .type = NLA_U32 }, +}; + +static int parse_reg_rule(struct nlattr *tb[], + struct ieee80211_reg_rule *reg_rule) +{ + struct ieee80211_freq_range *freq_range = ®_rule->freq_range; + struct ieee80211_power_rule *power_rule = ®_rule->power_rule; + + if 
(!tb[NL80211_ATTR_REG_RULE_FLAGS]) + return -EINVAL; + if (!tb[NL80211_ATTR_FREQ_RANGE_START]) + return -EINVAL; + if (!tb[NL80211_ATTR_FREQ_RANGE_END]) + return -EINVAL; + if (!tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]) + return -EINVAL; + if (!tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]) + return -EINVAL; + + reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]); + + freq_range->start_freq_khz = + nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]); + freq_range->end_freq_khz = + nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]); + freq_range->max_bandwidth_khz = + nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]); + + power_rule->max_eirp = + nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]); + + if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]) + power_rule->max_antenna_gain = + nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]); + + if (tb[NL80211_ATTR_DFS_CAC_TIME]) + reg_rule->dfs_cac_ms = + nla_get_u32(tb[NL80211_ATTR_DFS_CAC_TIME]); + + return 0; +} + static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1]; @@ -5672,6 +5711,7 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) kfree(rd); return r; } +#endif /* CONFIG_CFG80211_CRDA_SUPPORT */ static int validate_scan_freqs(struct nlattr *freqs) { @@ -5957,14 +5997,100 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) return err; } +static int +nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, + struct cfg80211_sched_scan_request *request, + struct nlattr **attrs) +{ + int tmp, err, i = 0; + struct nlattr *attr; + + if (!attrs[NL80211_ATTR_SCHED_SCAN_PLANS]) { + u32 interval; + + /* + * If scan plans are not specified, + * %NL80211_ATTR_SCHED_SCAN_INTERVAL must be specified. In this + * case one scan plan will be set with the specified scan + * interval and infinite number of iterations. 
+ */ + if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]) + return -EINVAL; + + interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]); + if (!interval) + return -EINVAL; + + request->scan_plans[0].interval = + DIV_ROUND_UP(interval, MSEC_PER_SEC); + if (!request->scan_plans[0].interval) + return -EINVAL; + + if (request->scan_plans[0].interval > + wiphy->max_sched_scan_plan_interval) + request->scan_plans[0].interval = + wiphy->max_sched_scan_plan_interval; + + return 0; + } + + nla_for_each_nested(attr, attrs[NL80211_ATTR_SCHED_SCAN_PLANS], tmp) { + struct nlattr *plan[NL80211_SCHED_SCAN_PLAN_MAX + 1]; + + if (WARN_ON(i >= n_plans)) + return -EINVAL; + + err = nla_parse(plan, NL80211_SCHED_SCAN_PLAN_MAX, + nla_data(attr), nla_len(attr), + nl80211_plan_policy); + if (err) + return err; + + if (!plan[NL80211_SCHED_SCAN_PLAN_INTERVAL]) + return -EINVAL; + + request->scan_plans[i].interval = + nla_get_u32(plan[NL80211_SCHED_SCAN_PLAN_INTERVAL]); + if (!request->scan_plans[i].interval || + request->scan_plans[i].interval > + wiphy->max_sched_scan_plan_interval) + return -EINVAL; + + if (plan[NL80211_SCHED_SCAN_PLAN_ITERATIONS]) { + request->scan_plans[i].iterations = + nla_get_u32(plan[NL80211_SCHED_SCAN_PLAN_ITERATIONS]); + if (!request->scan_plans[i].iterations || + (request->scan_plans[i].iterations > + wiphy->max_sched_scan_plan_iterations)) + return -EINVAL; + } else if (i < n_plans - 1) { + /* + * All scan plans but the last one must specify + * a finite number of iterations + */ + return -EINVAL; + } + + i++; + } + + /* + * The last scan plan must not specify the number of + * iterations, it is supposed to run infinitely + */ + if (request->scan_plans[n_plans - 1].iterations) + return -EINVAL; + + return 0; +} + static struct cfg80211_sched_scan_request * nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, struct nlattr **attrs) { struct cfg80211_sched_scan_request *request; struct nlattr *attr; - int err, tmp, n_ssids = 0, n_match_sets 
= 0, n_channels, i; - u32 interval; + int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i, n_plans = 0; enum ieee80211_band band; size_t ie_len; struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1]; @@ -5973,13 +6099,6 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, if (!is_valid_ie_attr(attrs[NL80211_ATTR_IE])) return ERR_PTR(-EINVAL); - if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]) - return ERR_PTR(-EINVAL); - - interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]); - if (interval == 0) - return ERR_PTR(-EINVAL); - if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { n_channels = validate_scan_freqs( attrs[NL80211_ATTR_SCAN_FREQUENCIES]); @@ -6043,9 +6162,37 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, if (ie_len > wiphy->max_sched_scan_ie_len) return ERR_PTR(-EINVAL); + if (attrs[NL80211_ATTR_SCHED_SCAN_PLANS]) { + /* + * NL80211_ATTR_SCHED_SCAN_INTERVAL must not be specified since + * each scan plan already specifies its own interval + */ + if (attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]) + return ERR_PTR(-EINVAL); + + nla_for_each_nested(attr, + attrs[NL80211_ATTR_SCHED_SCAN_PLANS], tmp) + n_plans++; + } else { + /* + * The scan interval attribute is kept for backward + * compatibility. If no scan plans are specified and sched scan + * interval is specified, one scan plan will be set with this + * scan interval and infinite number of iterations. 
+ */ + if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]) + return ERR_PTR(-EINVAL); + + n_plans = 1; + } + + if (!n_plans || n_plans > wiphy->max_sched_scan_plans) + return ERR_PTR(-EINVAL); + request = kzalloc(sizeof(*request) + sizeof(*request->ssids) * n_ssids + sizeof(*request->match_sets) * n_match_sets + + sizeof(*request->scan_plans) * n_plans + sizeof(*request->channels) * n_channels + ie_len, GFP_KERNEL); if (!request) @@ -6073,6 +6220,18 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, } request->n_match_sets = n_match_sets; + if (n_match_sets) + request->scan_plans = (void *)(request->match_sets + + n_match_sets); + else if (request->ie) + request->scan_plans = (void *)(request->ie + ie_len); + else if (n_ssids) + request->scan_plans = (void *)(request->ssids + n_ssids); + else + request->scan_plans = (void *)(request->channels + n_channels); + + request->n_scan_plans = n_plans; + i = 0; if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { /* user specified, bail out if channel not found */ @@ -6235,7 +6394,10 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, request->delay = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]); - request->interval = interval; + err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs); + if (err) + goto out_free; + request->scan_start = jiffies; return request; @@ -6491,8 +6653,8 @@ skip_beacons: if (err) return err; - if (!cfg80211_reg_can_beacon(&rdev->wiphy, ¶ms.chandef, - wdev->iftype)) + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, ¶ms.chandef, + wdev->iftype)) return -EINVAL; err = cfg80211_chandef_dfs_required(wdev->wiphy, @@ -6588,6 +6750,11 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, jiffies_to_msecs(jiffies - intbss->ts))) goto nla_put_failure; + if (intbss->ts_boottime && + nla_put_u64(msg, NL80211_BSS_LAST_SEEN_BOOTTIME, + intbss->ts_boottime)) + goto nla_put_failure; + switch (rdev->wiphy.signal_type) { case 
CFG80211_SIGNAL_TYPE_MBM: if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM, res->signal)) @@ -7388,7 +7555,8 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info) int err; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB) return -EOPNOTSUPP; if (!rdev->ops->set_mcast_rate) @@ -7773,8 +7941,10 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { if (!(rdev->wiphy.features & NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || - !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) { + kzfree(connkeys); return -EINVAL; + } connect.flags |= ASSOC_REQ_USE_RRM; } @@ -8827,7 +8997,7 @@ static int nl80211_send_wowlan_tcp(struct sk_buff *msg, static int nl80211_send_wowlan_nd(struct sk_buff *msg, struct cfg80211_sched_scan_request *req) { - struct nlattr *nd, *freqs, *matches, *match; + struct nlattr *nd, *freqs, *matches, *match, *scan_plans, *scan_plan; int i; if (!req) @@ -8837,7 +9007,9 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (!nd) return -ENOBUFS; - if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_INTERVAL, req->interval)) + if (req->n_scan_plans == 1 && + nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_INTERVAL, + req->scan_plans[0].interval * 1000)) return -ENOBUFS; if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay)) @@ -8864,6 +9036,23 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, nla_nest_end(msg, matches); } + scan_plans = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_PLANS); + if (!scan_plans) + return -ENOBUFS; + + for (i = 0; i < req->n_scan_plans; i++) { + scan_plan = nla_nest_start(msg, i + 1); + if (!scan_plan || + nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL, + req->scan_plans[i].interval) || + 
(req->scan_plans[i].iterations && + nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_ITERATIONS, + req->scan_plans[i].iterations))) + return -ENOBUFS; + nla_nest_end(msg, scan_plan); + } + nla_nest_end(msg, scan_plans); + nla_nest_end(msg, nd); return 0; @@ -9316,6 +9505,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if (new_triggers.tcp && new_triggers.tcp->sock) sock_release(new_triggers.tcp->sock); kfree(new_triggers.tcp); + kfree(new_triggers.nd_config); return err; } #endif @@ -9934,6 +10124,9 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) if (!wdev->netdev && !wdev->p2p_started) return -ENETDOWN; } + + if (!vcmd->doit) + return -EOPNOTSUPP; } else { wdev = NULL; } @@ -9953,6 +10146,193 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } +static int nl80211_prepare_vendor_dump(struct sk_buff *skb, + struct netlink_callback *cb, + struct cfg80211_registered_device **rdev, + struct wireless_dev **wdev) +{ + u32 vid, subcmd; + unsigned int i; + int vcmd_idx = -1; + int err; + void *data = NULL; + unsigned int data_len = 0; + + rtnl_lock(); + + if (cb->args[0]) { + /* subtract the 1 again here */ + struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); + struct wireless_dev *tmp; + + if (!wiphy) { + err = -ENODEV; + goto out_unlock; + } + *rdev = wiphy_to_rdev(wiphy); + *wdev = NULL; + + if (cb->args[1]) { + list_for_each_entry(tmp, &(*rdev)->wdev_list, list) { + if (tmp->identifier == cb->args[1] - 1) { + *wdev = tmp; + break; + } + } + } + + /* keep rtnl locked in successful case */ + return 0; + } + + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, + nl80211_fam.attrbuf, nl80211_fam.maxattr, + nl80211_policy); + if (err) + goto out_unlock; + + if (!nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID] || + !nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) { + err = -EINVAL; + goto out_unlock; + } + + *wdev = 
__cfg80211_wdev_from_attrs(sock_net(skb->sk), + nl80211_fam.attrbuf); + if (IS_ERR(*wdev)) + *wdev = NULL; + + *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), + nl80211_fam.attrbuf); + if (IS_ERR(*rdev)) { + err = PTR_ERR(*rdev); + goto out_unlock; + } + + vid = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID]); + subcmd = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]); + + for (i = 0; i < (*rdev)->wiphy.n_vendor_commands; i++) { + const struct wiphy_vendor_command *vcmd; + + vcmd = &(*rdev)->wiphy.vendor_commands[i]; + + if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd) + continue; + + if (!vcmd->dumpit) { + err = -EOPNOTSUPP; + goto out_unlock; + } + + vcmd_idx = i; + break; + } + + if (vcmd_idx < 0) { + err = -EOPNOTSUPP; + goto out_unlock; + } + + if (nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]) { + data = nla_data(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]); + data_len = nla_len(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]); + } + + /* 0 is the first index - add 1 to parse only once */ + cb->args[0] = (*rdev)->wiphy_idx + 1; + /* add 1 to know if it was NULL */ + cb->args[1] = *wdev ? 
(*wdev)->identifier + 1 : 0; + cb->args[2] = vcmd_idx; + cb->args[3] = (unsigned long)data; + cb->args[4] = data_len; + + /* keep rtnl locked in successful case */ + return 0; + out_unlock: + rtnl_unlock(); + return err; +} + +static int nl80211_vendor_cmd_dump(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + unsigned int vcmd_idx; + const struct wiphy_vendor_command *vcmd; + void *data; + int data_len; + int err; + struct nlattr *vendor_data; + + err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev); + if (err) + return err; + + vcmd_idx = cb->args[2]; + data = (void *)cb->args[3]; + data_len = cb->args[4]; + vcmd = &rdev->wiphy.vendor_commands[vcmd_idx]; + + if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV | + WIPHY_VENDOR_CMD_NEED_NETDEV)) { + if (!wdev) + return -EINVAL; + if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV && + !wdev->netdev) + return -EINVAL; + + if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) { + if (wdev->netdev && + !netif_running(wdev->netdev)) + return -ENETDOWN; + if (!wdev->netdev && !wdev->p2p_started) + return -ENETDOWN; + } + } + + while (1) { + void *hdr = nl80211hdr_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + NL80211_CMD_VENDOR); + if (!hdr) + break; + + if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + (wdev && nla_put_u64(skb, NL80211_ATTR_WDEV, + wdev_id(wdev)))) { + genlmsg_cancel(skb, hdr); + break; + } + + vendor_data = nla_nest_start(skb, NL80211_ATTR_VENDOR_DATA); + if (!vendor_data) { + genlmsg_cancel(skb, hdr); + break; + } + + err = vcmd->dumpit(&rdev->wiphy, wdev, skb, data, data_len, + (unsigned long *)&cb->args[5]); + nla_nest_end(skb, vendor_data); + + if (err == -ENOBUFS || err == -ENOENT) { + genlmsg_cancel(skb, hdr); + break; + } else if (err) { + genlmsg_cancel(skb, hdr); + goto out; + } + + genlmsg_end(skb, hdr); + } + + err = skb->len; + out: + rtnl_unlock(); + return err; +} + struct sk_buff 
*__cfg80211_alloc_reply_skb(struct wiphy *wiphy, enum nl80211_commands cmd, enum nl80211_attrs attr, @@ -10169,7 +10549,8 @@ static int nl80211_tdls_channel_switch(struct sk_buff *skb, return -EINVAL; /* we will be active on the TDLS link */ - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, wdev->iftype)) + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef, + wdev->iftype)) return -EINVAL; /* don't allow switching to DFS channels */ @@ -10528,6 +10909,7 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_RTNL, /* can be retrieved by unprivileged users */ }, +#ifdef CONFIG_CFG80211_CRDA_SUPPORT { .cmd = NL80211_CMD_SET_REG, .doit = nl80211_set_reg, @@ -10535,6 +10917,7 @@ static const struct genl_ops nl80211_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_RTNL, }, +#endif { .cmd = NL80211_CMD_REQ_SET_REG, .doit = nl80211_req_set_reg, @@ -10989,6 +11372,7 @@ static const struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_VENDOR, .doit = nl80211_vendor_cmd, + .dumpit = nl80211_vendor_cmd_dump, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | diff --git a/kernel/net/wireless/rdev-ops.h b/kernel/net/wireless/rdev-ops.h index c6e83a746..c23516d0f 100644 --- a/kernel/net/wireless/rdev-ops.h +++ b/kernel/net/wireless/rdev-ops.h @@ -733,6 +733,8 @@ static inline void rdev_mgmt_frame_register(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u16 frame_type, bool reg) { + might_sleep(); + trace_rdev_mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); rdev->ops->mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); trace_rdev_return_void(&rdev->wiphy); diff --git a/kernel/net/wireless/reg.c b/kernel/net/wireless/reg.c index 0e347f888..06d050da0 100644 --- a/kernel/net/wireless/reg.c +++ b/kernel/net/wireless/reg.c @@ -135,10 +135,7 @@ static spinlock_t reg_indoor_lock; /* Used to track the userspace process controlling the 
indoor setting */ static u32 reg_is_indoor_portid; -/* Max number of consecutive attempts to communicate with CRDA */ -#define REG_MAX_CRDA_TIMEOUTS 10 - -static u32 reg_crda_timeouts; +static void restore_regulatory_settings(bool reset_user); static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { @@ -226,9 +223,6 @@ static DECLARE_DELAYED_WORK(reg_check_chans, reg_check_chans_work); static void reg_todo(struct work_struct *work); static DECLARE_WORK(reg_work, reg_todo); -static void reg_timeout_work(struct work_struct *work); -static DECLARE_DELAYED_WORK(reg_timeout, reg_timeout_work); - /* We keep a static world regulatory domain in case of the absence of CRDA */ static const struct ieee80211_regdomain world_regdom = { .n_reg_rules = 8, @@ -262,7 +256,7 @@ static const struct ieee80211_regdomain world_regdom = { REG_RULE(5745-10, 5825+10, 80, 6, 20, NL80211_RRF_NO_IR), - /* IEEE 802.11ad (60gHz), channels 1..3 */ + /* IEEE 802.11ad (60GHz), channels 1..3 */ REG_RULE(56160+2160*1-1080, 56160+2160*3+1080, 2160, 0, 0, 0), } }; @@ -279,6 +273,9 @@ MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); static void reg_free_request(struct regulatory_request *request) { + if (request == &core_request_world) + return; + if (request != get_last_request()) kfree(request); } @@ -453,68 +450,70 @@ reg_copy_regd(const struct ieee80211_regdomain *src_regd) } #ifdef CONFIG_CFG80211_INTERNAL_REGDB -struct reg_regdb_search_request { - char alpha2[2]; +struct reg_regdb_apply_request { struct list_head list; + const struct ieee80211_regdomain *regdom; }; -static LIST_HEAD(reg_regdb_search_list); -static DEFINE_MUTEX(reg_regdb_search_mutex); +static LIST_HEAD(reg_regdb_apply_list); +static DEFINE_MUTEX(reg_regdb_apply_mutex); -static void reg_regdb_search(struct work_struct *work) +static void reg_regdb_apply(struct work_struct *work) { - struct reg_regdb_search_request *request; - const struct ieee80211_regdomain *curdom, *regdom = NULL; - int i; + 
struct reg_regdb_apply_request *request; rtnl_lock(); - mutex_lock(®_regdb_search_mutex); - while (!list_empty(®_regdb_search_list)) { - request = list_first_entry(®_regdb_search_list, - struct reg_regdb_search_request, + mutex_lock(®_regdb_apply_mutex); + while (!list_empty(®_regdb_apply_list)) { + request = list_first_entry(®_regdb_apply_list, + struct reg_regdb_apply_request, list); list_del(&request->list); - for (i = 0; i < reg_regdb_size; i++) { - curdom = reg_regdb[i]; - - if (alpha2_equal(request->alpha2, curdom->alpha2)) { - regdom = reg_copy_regd(curdom); - break; - } - } - + set_regdom(request->regdom, REGD_SOURCE_INTERNAL_DB); kfree(request); } - mutex_unlock(®_regdb_search_mutex); - - if (!IS_ERR_OR_NULL(regdom)) - set_regdom(regdom, REGD_SOURCE_INTERNAL_DB); + mutex_unlock(®_regdb_apply_mutex); rtnl_unlock(); } -static DECLARE_WORK(reg_regdb_work, reg_regdb_search); +static DECLARE_WORK(reg_regdb_work, reg_regdb_apply); -static void reg_regdb_query(const char *alpha2) +static int reg_query_builtin(const char *alpha2) { - struct reg_regdb_search_request *request; + const struct ieee80211_regdomain *regdom = NULL; + struct reg_regdb_apply_request *request; + unsigned int i; - if (!alpha2) - return; + for (i = 0; i < reg_regdb_size; i++) { + if (alpha2_equal(alpha2, reg_regdb[i]->alpha2)) { + regdom = reg_regdb[i]; + break; + } + } - request = kzalloc(sizeof(struct reg_regdb_search_request), GFP_KERNEL); + if (!regdom) + return -ENODATA; + + request = kzalloc(sizeof(struct reg_regdb_apply_request), GFP_KERNEL); if (!request) - return; + return -ENOMEM; - memcpy(request->alpha2, alpha2, 2); + request->regdom = reg_copy_regd(regdom); + if (IS_ERR_OR_NULL(request->regdom)) { + kfree(request); + return -ENOMEM; + } - mutex_lock(®_regdb_search_mutex); - list_add_tail(&request->list, ®_regdb_search_list); - mutex_unlock(®_regdb_search_mutex); + mutex_lock(®_regdb_apply_mutex); + list_add_tail(&request->list, ®_regdb_apply_list); + 
mutex_unlock(®_regdb_apply_mutex); schedule_work(®_regdb_work); + + return 0; } /* Feel free to add any other sanity checks here */ @@ -525,9 +524,45 @@ static void reg_regdb_size_check(void) } #else static inline void reg_regdb_size_check(void) {} -static inline void reg_regdb_query(const char *alpha2) {} +static inline int reg_query_builtin(const char *alpha2) +{ + return -ENODATA; +} #endif /* CONFIG_CFG80211_INTERNAL_REGDB */ +#ifdef CONFIG_CFG80211_CRDA_SUPPORT +/* Max number of consecutive attempts to communicate with CRDA */ +#define REG_MAX_CRDA_TIMEOUTS 10 + +static u32 reg_crda_timeouts; + +static void crda_timeout_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(crda_timeout, crda_timeout_work); + +static void crda_timeout_work(struct work_struct *work) +{ + REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); + rtnl_lock(); + reg_crda_timeouts++; + restore_regulatory_settings(true); + rtnl_unlock(); +} + +static void cancel_crda_timeout(void) +{ + cancel_delayed_work(&crda_timeout); +} + +static void cancel_crda_timeout_sync(void) +{ + cancel_delayed_work_sync(&crda_timeout); +} + +static void reset_crda_timeouts(void) +{ + reg_crda_timeouts = 0; +} + /* * This lets us keep regulatory code which is updated on a regulatory * basis in userspace. @@ -536,36 +571,50 @@ static int call_crda(const char *alpha2) { char country[12]; char *env[] = { country, NULL }; + int ret; snprintf(country, sizeof(country), "COUNTRY=%c%c", alpha2[0], alpha2[1]); - /* query internal regulatory database (if it exists) */ - reg_regdb_query(alpha2); - if (reg_crda_timeouts > REG_MAX_CRDA_TIMEOUTS) { - pr_info("Exceeded CRDA call max attempts. Not calling CRDA\n"); + pr_debug("Exceeded CRDA call max attempts. 
Not calling CRDA\n"); return -EINVAL; } if (!is_world_regdom((char *) alpha2)) - pr_info("Calling CRDA for country: %c%c\n", + pr_debug("Calling CRDA for country: %c%c\n", alpha2[0], alpha2[1]); else - pr_info("Calling CRDA to update world regulatory domain\n"); + pr_debug("Calling CRDA to update world regulatory domain\n"); - return kobject_uevent_env(®_pdev->dev.kobj, KOBJ_CHANGE, env); + ret = kobject_uevent_env(®_pdev->dev.kobj, KOBJ_CHANGE, env); + if (ret) + return ret; + + queue_delayed_work(system_power_efficient_wq, + &crda_timeout, msecs_to_jiffies(3142)); + return 0; +} +#else +static inline void cancel_crda_timeout(void) {} +static inline void cancel_crda_timeout_sync(void) {} +static inline void reset_crda_timeouts(void) {} +static inline int call_crda(const char *alpha2) +{ + return -ENODATA; } +#endif /* CONFIG_CFG80211_CRDA_SUPPORT */ -static enum reg_request_treatment -reg_call_crda(struct regulatory_request *request) +static bool reg_query_database(struct regulatory_request *request) { - if (call_crda(request->alpha2)) - return REG_REQ_IGNORE; + /* query internal regulatory database (if it exists) */ + if (reg_query_builtin(request->alpha2) == 0) + return true; - queue_delayed_work(system_power_efficient_wq, - ®_timeout, msecs_to_jiffies(3142)); - return REG_REQ_OK; + if (call_crda(request->alpha2) == 0) + return true; + + return false; } bool reg_is_valid_request(const char *alpha2) @@ -989,8 +1038,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_OFDM; if (rd_flags & NL80211_RRF_NO_OUTDOOR) channel_flags |= IEEE80211_CHAN_INDOOR_ONLY; - if (rd_flags & NL80211_RRF_GO_CONCURRENT) - channel_flags |= IEEE80211_CHAN_GO_CONCURRENT; + if (rd_flags & NL80211_RRF_IR_CONCURRENT) + channel_flags |= IEEE80211_CHAN_IR_CONCURRENT; if (rd_flags & NL80211_RRF_NO_HT40MINUS) channel_flags |= IEEE80211_CHAN_NO_HT40MINUS; if (rd_flags & NL80211_RRF_NO_HT40PLUS) @@ -1004,7 +1053,7 @@ static u32 map_regdom_flags(u32 rd_flags) static 
const struct ieee80211_reg_rule * freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, - const struct ieee80211_regdomain *regd) + const struct ieee80211_regdomain *regd, u32 bw) { int i; bool band_rule_found = false; @@ -1028,7 +1077,7 @@ freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, center_freq); - bw_fits = reg_does_bw_fit(fr, center_freq, MHZ_TO_KHZ(20)); + bw_fits = reg_does_bw_fit(fr, center_freq, bw); if (band_rule_found && bw_fits) return rr; @@ -1040,14 +1089,26 @@ freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, return ERR_PTR(-EINVAL); } -const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, - u32 center_freq) +static const struct ieee80211_reg_rule * +__freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw) { - const struct ieee80211_regdomain *regd; + const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy); + const struct ieee80211_reg_rule *reg_rule = NULL; + u32 bw; - regd = reg_get_regdomain(wiphy); + for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { + reg_rule = freq_reg_info_regd(wiphy, center_freq, regd, bw); + if (!IS_ERR(reg_rule)) + return reg_rule; + } + + return reg_rule; +} - return freq_reg_info_regd(wiphy, center_freq, regd); +const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, + u32 center_freq) +{ + return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(20)); } EXPORT_SYMBOL(freq_reg_info); @@ -1069,11 +1130,11 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator) } EXPORT_SYMBOL(reg_initiator_name); -#ifdef CONFIG_CFG80211_REG_DEBUG static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, struct ieee80211_channel *chan, const struct ieee80211_reg_rule *reg_rule) { +#ifdef CONFIG_CFG80211_REG_DEBUG const struct ieee80211_power_rule *power_rule; const struct ieee80211_freq_range *freq_range; char max_antenna_gain[32], bw[32]; @@ -1084,7 +1145,7 @@ static void 
chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, if (!power_rule->max_antenna_gain) snprintf(max_antenna_gain, sizeof(max_antenna_gain), "N/A"); else - snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d", + snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d mBi", power_rule->max_antenna_gain); if (reg_rule->flags & NL80211_RRF_AUTO_BW) @@ -1098,19 +1159,12 @@ static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, REG_DBG_PRINT("Updating information on frequency %d MHz with regulatory rule:\n", chan->center_freq); - REG_DBG_PRINT("%d KHz - %d KHz @ %s), (%s mBi, %d mBm)\n", + REG_DBG_PRINT("(%d KHz - %d KHz @ %s), (%s, %d mBm)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, bw, max_antenna_gain, power_rule->max_eirp); -} -#else -static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, - struct ieee80211_channel *chan, - const struct ieee80211_reg_rule *reg_rule) -{ - return; -} #endif +} /* * Note that right now we assume the desired channel bandwidth @@ -1176,8 +1230,20 @@ static void handle_channel(struct wiphy *wiphy, if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + /* If we get a reg_rule we can assume that at least 5Mhz fit */ + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(10))) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(20))) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags = IEEE80211_CHAN_NO_HT40; + bw_flags |= IEEE80211_CHAN_NO_HT40; if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) @@ -1287,7 +1353,8 @@ static bool reg_dev_ignore_cell_hint(struct wiphy 
*wiphy) return !(wiphy->features & NL80211_FEATURE_CELL_BASE_REG_HINTS); } #else -static int reg_ignore_cell_hint(struct regulatory_request *pending_request) +static enum reg_request_treatment +reg_ignore_cell_hint(struct regulatory_request *pending_request) { return REG_REQ_IGNORE; } @@ -1589,7 +1656,7 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_ADHOC: - return cfg80211_reg_can_beacon(wiphy, &chandef, iftype); + return cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype); case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: return cfg80211_chandef_usable(wiphy, &chandef, @@ -1695,9 +1762,15 @@ static void handle_channel_custom(struct wiphy *wiphy, const struct ieee80211_power_rule *power_rule = NULL; const struct ieee80211_freq_range *freq_range = NULL; u32 max_bandwidth_khz; + u32 bw; - reg_rule = freq_reg_info_regd(wiphy, MHZ_TO_KHZ(chan->center_freq), - regd); + for (bw = MHZ_TO_KHZ(20); bw >= MHZ_TO_KHZ(5); bw = bw / 2) { + reg_rule = freq_reg_info_regd(wiphy, + MHZ_TO_KHZ(chan->center_freq), + regd, bw); + if (!IS_ERR(reg_rule)) + break; + } if (IS_ERR(reg_rule)) { REG_DBG_PRINT("Disabling freq %d MHz as custom regd has no rule that fits it\n", @@ -1721,8 +1794,20 @@ static void handle_channel_custom(struct wiphy *wiphy, if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + /* If we get a reg_rule we can assume that at least 5Mhz fit */ + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(10))) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(20))) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags 
= IEEE80211_CHAN_NO_HT40; + bw_flags |= IEEE80211_CHAN_NO_HT40; if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) @@ -1804,7 +1889,7 @@ static void reg_set_request_processed(void) need_more_processing = true; spin_unlock(®_requests_lock); - cancel_delayed_work(®_timeout); + cancel_crda_timeout(); if (need_more_processing) schedule_work(®_work); @@ -1816,19 +1901,18 @@ static void reg_set_request_processed(void) * * The wireless subsystem can use this function to process * a regulatory request issued by the regulatory core. - * - * Returns one of the different reg request treatment values. */ static enum reg_request_treatment reg_process_hint_core(struct regulatory_request *core_request) { + if (reg_query_database(core_request)) { + core_request->intersect = false; + core_request->processed = false; + reg_update_last_request(core_request); + return REG_REQ_OK; + } - core_request->intersect = false; - core_request->processed = false; - - reg_update_last_request(core_request); - - return reg_call_crda(core_request); + return REG_REQ_IGNORE; } static enum reg_request_treatment @@ -1873,8 +1957,6 @@ __reg_process_hint_user(struct regulatory_request *user_request) * * The wireless subsystem can use this function to process * a regulatory request initiated by userspace. - * - * Returns one of the different reg request treatment values. 
*/ static enum reg_request_treatment reg_process_hint_user(struct regulatory_request *user_request) @@ -1883,20 +1965,20 @@ reg_process_hint_user(struct regulatory_request *user_request) treatment = __reg_process_hint_user(user_request); if (treatment == REG_REQ_IGNORE || - treatment == REG_REQ_ALREADY_SET) { - reg_free_request(user_request); - return treatment; - } + treatment == REG_REQ_ALREADY_SET) + return REG_REQ_IGNORE; user_request->intersect = treatment == REG_REQ_INTERSECT; user_request->processed = false; - reg_update_last_request(user_request); - - user_alpha2[0] = user_request->alpha2[0]; - user_alpha2[1] = user_request->alpha2[1]; + if (reg_query_database(user_request)) { + reg_update_last_request(user_request); + user_alpha2[0] = user_request->alpha2[0]; + user_alpha2[1] = user_request->alpha2[1]; + return REG_REQ_OK; + } - return reg_call_crda(user_request); + return REG_REQ_IGNORE; } static enum reg_request_treatment @@ -1944,16 +2026,12 @@ reg_process_hint_driver(struct wiphy *wiphy, case REG_REQ_OK: break; case REG_REQ_IGNORE: - reg_free_request(driver_request); - return treatment; + return REG_REQ_IGNORE; case REG_REQ_INTERSECT: - /* fall through */ case REG_REQ_ALREADY_SET: regd = reg_copy_regd(get_cfg80211_regdom()); - if (IS_ERR(regd)) { - reg_free_request(driver_request); + if (IS_ERR(regd)) return REG_REQ_IGNORE; - } tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, regd); @@ -1964,8 +2042,6 @@ reg_process_hint_driver(struct wiphy *wiphy, driver_request->intersect = treatment == REG_REQ_INTERSECT; driver_request->processed = false; - reg_update_last_request(driver_request); - /* * Since CRDA will not be called in this case as we already * have applied the requested regulatory domain before we just @@ -1973,11 +2049,17 @@ reg_process_hint_driver(struct wiphy *wiphy, */ if (treatment == REG_REQ_ALREADY_SET) { nl80211_send_reg_change_event(driver_request); + reg_update_last_request(driver_request); reg_set_request_processed(); - 
return treatment; + return REG_REQ_ALREADY_SET; + } + + if (reg_query_database(driver_request)) { + reg_update_last_request(driver_request); + return REG_REQ_OK; } - return reg_call_crda(driver_request); + return REG_REQ_IGNORE; } static enum reg_request_treatment @@ -2043,12 +2125,11 @@ reg_process_hint_country_ie(struct wiphy *wiphy, case REG_REQ_OK: break; case REG_REQ_IGNORE: - /* fall through */ + return REG_REQ_IGNORE; case REG_REQ_ALREADY_SET: reg_free_request(country_ie_request); - return treatment; + return REG_REQ_ALREADY_SET; case REG_REQ_INTERSECT: - reg_free_request(country_ie_request); /* * This doesn't happen yet, not sure we * ever want to support it for this case. @@ -2060,9 +2141,12 @@ reg_process_hint_country_ie(struct wiphy *wiphy, country_ie_request->intersect = false; country_ie_request->processed = false; - reg_update_last_request(country_ie_request); + if (reg_query_database(country_ie_request)) { + reg_update_last_request(country_ie_request); + return REG_REQ_OK; + } - return reg_call_crda(country_ie_request); + return REG_REQ_IGNORE; } /* This processes *all* regulatory hints */ @@ -2076,14 +2160,11 @@ static void reg_process_hint(struct regulatory_request *reg_request) switch (reg_request->initiator) { case NL80211_REGDOM_SET_BY_CORE: - reg_process_hint_core(reg_request); - return; + treatment = reg_process_hint_core(reg_request); + break; case NL80211_REGDOM_SET_BY_USER: treatment = reg_process_hint_user(reg_request); - if (treatment == REG_REQ_IGNORE || - treatment == REG_REQ_ALREADY_SET) - return; - return; + break; case NL80211_REGDOM_SET_BY_DRIVER: if (!wiphy) goto out_free; @@ -2099,7 +2180,15 @@ static void reg_process_hint(struct regulatory_request *reg_request) goto out_free; } - /* This is required so that the orig_* parameters are saved */ + if (treatment == REG_REQ_IGNORE) + goto out_free; + + WARN(treatment != REG_REQ_OK && treatment != REG_REQ_ALREADY_SET, + "unexpected treatment value %d\n", treatment); + + /* This is 
required so that the orig_* parameters are saved. + * NOTE: treatment must be set for any case that reaches here! + */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { wiphy_update_regulatory(wiphy, reg_request->initiator); @@ -2304,7 +2393,7 @@ int regulatory_hint_user(const char *alpha2, request->user_reg_hint_type = user_reg_hint_type; /* Allow calling CRDA again */ - reg_crda_timeouts = 0; + reset_crda_timeouts(); queue_regulatory_request(request); @@ -2376,7 +2465,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) request->initiator = NL80211_REGDOM_SET_BY_DRIVER; /* Allow calling CRDA again */ - reg_crda_timeouts = 0; + reset_crda_timeouts(); queue_regulatory_request(request); @@ -2432,7 +2521,7 @@ void regulatory_hint_country_ie(struct wiphy *wiphy, enum ieee80211_band band, request->country_ie_env = env; /* Allow calling CRDA again */ - reg_crda_timeouts = 0; + reset_crda_timeouts(); queue_regulatory_request(request); request = NULL; @@ -2584,7 +2673,7 @@ static void restore_regulatory_settings(bool reset_user) * settings, user regulatory settings takes precedence. 
*/ if (is_an_alpha2(alpha2)) - regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER); + regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER); spin_lock(®_requests_lock); list_splice_tail_init(&tmp_reg_req_list, ®_requests_list); @@ -2833,11 +2922,8 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd, } request_wiphy = wiphy_idx_to_wiphy(driver_request->wiphy_idx); - if (!request_wiphy) { - queue_delayed_work(system_power_efficient_wq, - ®_timeout, 0); + if (!request_wiphy) return -ENODEV; - } if (!driver_request->intersect) { if (request_wiphy->regd) @@ -2894,11 +2980,8 @@ static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd, } request_wiphy = wiphy_idx_to_wiphy(country_ie_request->wiphy_idx); - if (!request_wiphy) { - queue_delayed_work(system_power_efficient_wq, - ®_timeout, 0); + if (!request_wiphy) return -ENODEV; - } if (country_ie_request->intersect) return -EINVAL; @@ -2925,7 +3008,7 @@ int set_regdom(const struct ieee80211_regdomain *rd, } if (regd_src == REGD_SOURCE_CRDA) - reg_crda_timeouts = 0; + reset_crda_timeouts(); lr = get_last_request(); @@ -2946,6 +3029,7 @@ int set_regdom(const struct ieee80211_regdomain *rd, break; default: WARN(1, "invalid initiator %d\n", lr->initiator); + kfree(rd); return -EINVAL; } @@ -3082,15 +3166,6 @@ void wiphy_regulatory_deregister(struct wiphy *wiphy) lr->country_ie_env = ENVIRON_ANY; } -static void reg_timeout_work(struct work_struct *work) -{ - REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); - rtnl_lock(); - reg_crda_timeouts++; - restore_regulatory_settings(true); - rtnl_unlock(); -} - /* * See http://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii, for * UNII band definitions @@ -3147,8 +3222,10 @@ int __init regulatory_init(void) /* We always try to get an update for the static regdomain */ err = regulatory_hint_core(cfg80211_world_regdom->alpha2); if (err) { - if (err == -ENOMEM) + if (err == -ENOMEM) { + 
platform_device_unregister(reg_pdev); return err; + } /* * N.B. kobject_uevent_env() can fail mainly for when we're out * memory which is handled and propagated appropriately above @@ -3176,7 +3253,7 @@ void regulatory_exit(void) struct reg_beacon *reg_beacon, *btmp; cancel_work_sync(®_work); - cancel_delayed_work_sync(®_timeout); + cancel_crda_timeout_sync(); cancel_delayed_work_sync(®_check_chans); /* Lock to suppress warnings */ diff --git a/kernel/net/wireless/scan.c b/kernel/net/wireless/scan.c index 3a50aa255..14d5369eb 100644 --- a/kernel/net/wireless/scan.c +++ b/kernel/net/wireless/scan.c @@ -266,8 +266,7 @@ void __cfg80211_sched_scan_results(struct work_struct *wk) spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, request->scan_start); spin_unlock_bh(&rdev->bss_lock); - request->scan_start = - jiffies + msecs_to_jiffies(request->interval); + request->scan_start = jiffies; } nl80211_send_sched_scan_results(rdev, request->dev); } @@ -839,6 +838,7 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, found->pub.signal = tmp->pub.signal; found->pub.capability = tmp->pub.capability; found->ts = tmp->ts; + found->ts_boottime = tmp->ts_boottime; } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; @@ -938,14 +938,13 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, } /* Returned bss is reference counted and must be cleaned up appropriately. 
*/ -struct cfg80211_bss* -cfg80211_inform_bss_width(struct wiphy *wiphy, - struct ieee80211_channel *rx_channel, - enum nl80211_bss_scan_width scan_width, - enum cfg80211_bss_frame_type ftype, - const u8 *bssid, u64 tsf, u16 capability, - u16 beacon_interval, const u8 *ie, size_t ielen, - s32 signal, gfp_t gfp) +struct cfg80211_bss * +cfg80211_inform_bss_data(struct wiphy *wiphy, + struct cfg80211_inform_bss *data, + enum cfg80211_bss_frame_type ftype, + const u8 *bssid, u64 tsf, u16 capability, + u16 beacon_interval, const u8 *ie, size_t ielen, + gfp_t gfp) { struct cfg80211_bss_ies *ies; struct ieee80211_channel *channel; @@ -957,19 +956,21 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, return NULL; if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && - (signal < 0 || signal > 100))) + (data->signal < 0 || data->signal > 100))) return NULL; - channel = cfg80211_get_bss_channel(wiphy, ie, ielen, rx_channel); + channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan); if (!channel) return NULL; memcpy(tmp.pub.bssid, bssid, ETH_ALEN); tmp.pub.channel = channel; - tmp.pub.scan_width = scan_width; - tmp.pub.signal = signal; + tmp.pub.scan_width = data->scan_width; + tmp.pub.signal = data->signal; tmp.pub.beacon_interval = beacon_interval; tmp.pub.capability = capability; + tmp.ts_boottime = data->boottime_ns; + /* * If we do not know here whether the IEs are from a Beacon or Probe * Response frame, we need to pick one of the options and only use it @@ -999,7 +1000,7 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, } rcu_assign_pointer(tmp.pub.ies, ies); - signal_valid = abs(rx_channel->center_freq - channel->center_freq) <= + signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); if (!res) @@ -1019,15 +1020,15 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, /* cfg80211_bss_update gives us a referenced result */ return 
&res->pub; } -EXPORT_SYMBOL(cfg80211_inform_bss_width); +EXPORT_SYMBOL(cfg80211_inform_bss_data); -/* Returned bss is reference counted and must be cleaned up appropriately. */ +/* cfg80211_inform_bss_width_frame helper */ struct cfg80211_bss * -cfg80211_inform_bss_width_frame(struct wiphy *wiphy, - struct ieee80211_channel *rx_channel, - enum nl80211_bss_scan_width scan_width, - struct ieee80211_mgmt *mgmt, size_t len, - s32 signal, gfp_t gfp) +cfg80211_inform_bss_frame_data(struct wiphy *wiphy, + struct cfg80211_inform_bss *data, + struct ieee80211_mgmt *mgmt, size_t len, + gfp_t gfp) + { struct cfg80211_internal_bss tmp = {}, *res; struct cfg80211_bss_ies *ies; @@ -1040,8 +1041,7 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != offsetof(struct ieee80211_mgmt, u.beacon.variable)); - trace_cfg80211_inform_bss_width_frame(wiphy, rx_channel, scan_width, mgmt, - len, signal); + trace_cfg80211_inform_bss_frame(wiphy, data, mgmt, len); if (WARN_ON(!mgmt)) return NULL; @@ -1050,14 +1050,14 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, return NULL; if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && - (signal < 0 || signal > 100))) + (data->signal < 0 || data->signal > 100))) return NULL; if (WARN_ON(len < offsetof(struct ieee80211_mgmt, u.probe_resp.variable))) return NULL; channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable, - ielen, rx_channel); + ielen, data->chan); if (!channel) return NULL; @@ -1077,12 +1077,13 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN); tmp.pub.channel = channel; - tmp.pub.scan_width = scan_width; - tmp.pub.signal = signal; + tmp.pub.scan_width = data->scan_width; + tmp.pub.signal = data->signal; tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); + tmp.ts_boottime = data->boottime_ns; - 
signal_valid = abs(rx_channel->center_freq - channel->center_freq) <= + signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid); if (!res) @@ -1102,7 +1103,7 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, /* cfg80211_bss_update gives us a referenced result */ return &res->pub; } -EXPORT_SYMBOL(cfg80211_inform_bss_width_frame); +EXPORT_SYMBOL(cfg80211_inform_bss_frame_data); void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { diff --git a/kernel/net/wireless/sme.c b/kernel/net/wireless/sme.c index d11454f87..8020b5b09 100644 --- a/kernel/net/wireless/sme.c +++ b/kernel/net/wireless/sme.c @@ -938,7 +938,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, } void cfg80211_disconnected(struct net_device *dev, u16 reason, - const u8 *ie, size_t ie_len, gfp_t gfp) + const u8 *ie, size_t ie_len, + bool locally_generated, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -954,6 +955,7 @@ void cfg80211_disconnected(struct net_device *dev, u16 reason, ev->dc.ie_len = ie_len; memcpy((void *)ev->dc.ie, ie, ie_len); ev->dc.reason = reason; + ev->dc.locally_generated = locally_generated; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); diff --git a/kernel/net/wireless/sysfs.c b/kernel/net/wireless/sysfs.c index 9ee6bc1a7..9cee02206 100644 --- a/kernel/net/wireless/sysfs.c +++ b/kernel/net/wireless/sysfs.c @@ -86,7 +86,7 @@ static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; @@ -95,7 +95,7 @@ static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) cfg80211_leave(rdev, wdev); } -static int wiphy_suspend(struct 
device *dev, pm_message_t state) +static int wiphy_suspend(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; @@ -136,6 +136,11 @@ static int wiphy_resume(struct device *dev) return ret; } + +static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume); +#define WIPHY_PM_OPS (&wiphy_pm_ops) +#else +#define WIPHY_PM_OPS NULL #endif static const void *wiphy_namespace(struct device *d) @@ -151,10 +156,7 @@ struct class ieee80211_class = { .dev_release = wiphy_dev_release, .dev_groups = ieee80211_groups, .dev_uevent = wiphy_uevent, -#ifdef CONFIG_PM - .suspend = wiphy_suspend, - .resume = wiphy_resume, -#endif + .pm = WIPHY_PM_OPS, .ns_type = &net_ns_type_operations, .namespace = wiphy_namespace, }; diff --git a/kernel/net/wireless/trace.h b/kernel/net/wireless/trace.h index af3617c98..0c392d367 100644 --- a/kernel/net/wireless/trace.h +++ b/kernel/net/wireless/trace.h @@ -2358,20 +2358,23 @@ TRACE_EVENT(cfg80211_cqm_rssi_notify, TRACE_EVENT(cfg80211_reg_can_beacon, TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, - enum nl80211_iftype iftype), - TP_ARGS(wiphy, chandef, iftype), + enum nl80211_iftype iftype, bool check_no_ir), + TP_ARGS(wiphy, chandef, iftype, check_no_ir), TP_STRUCT__entry( WIPHY_ENTRY CHAN_DEF_ENTRY __field(enum nl80211_iftype, iftype) + __field(bool, check_no_ir) ), TP_fast_assign( WIPHY_ASSIGN; CHAN_DEF_ASSIGN(chandef); __entry->iftype = iftype; + __entry->check_no_ir = check_no_ir; ), - TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d", - WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype) + TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d check_no_ir=%s", + WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype, + BOOL_TO_STR(__entry->check_no_ir)) ); TRACE_EVENT(cfg80211_chandef_dfs_required, @@ -2667,30 +2670,30 @@ TRACE_EVENT(cfg80211_get_bss, __entry->privacy) ); -TRACE_EVENT(cfg80211_inform_bss_width_frame, - TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel 
*channel, - enum nl80211_bss_scan_width scan_width, - struct ieee80211_mgmt *mgmt, size_t len, - s32 signal), - TP_ARGS(wiphy, channel, scan_width, mgmt, len, signal), +TRACE_EVENT(cfg80211_inform_bss_frame, + TP_PROTO(struct wiphy *wiphy, struct cfg80211_inform_bss *data, + struct ieee80211_mgmt *mgmt, size_t len), + TP_ARGS(wiphy, data, mgmt, len), TP_STRUCT__entry( WIPHY_ENTRY CHAN_ENTRY __field(enum nl80211_bss_scan_width, scan_width) __dynamic_array(u8, mgmt, len) __field(s32, signal) + __field(u64, ts_boottime) ), TP_fast_assign( WIPHY_ASSIGN; - CHAN_ASSIGN(channel); - __entry->scan_width = scan_width; + CHAN_ASSIGN(data->chan); + __entry->scan_width = data->scan_width; if (mgmt) memcpy(__get_dynamic_array(mgmt), mgmt, len); - __entry->signal = signal; + __entry->signal = data->signal; + __entry->ts_boottime = data->boottime_ns; ), - TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d", + TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d, tsb:%llu", WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, - __entry->signal) + __entry->signal, (unsigned long long)__entry->ts_boottime) ); DECLARE_EVENT_CLASS(cfg80211_bss_evt, diff --git a/kernel/net/wireless/util.c b/kernel/net/wireless/util.c index 7e4e3fffe..baf7218ce 100644 --- a/kernel/net/wireless/util.c +++ b/kernel/net/wireless/util.c @@ -887,7 +887,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, ev->dc.ie, ev->dc.ie_len, - ev->dc.reason, true); + ev->dc.reason, + !ev->dc.locally_generated); break; case EVENT_IBSS_JOINED: __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, diff --git a/kernel/net/wireless/wext-core.c b/kernel/net/wireless/wext-core.c index c8717c1d0..b50ee5d62 100644 --- a/kernel/net/wireless/wext-core.c +++ b/kernel/net/wireless/wext-core.c @@ -342,6 +342,40 @@ static const int compat_event_type_size[] = { /* IW event code */ +void wireless_nlevent_flush(void) +{ + struct sk_buff 
*skb; + struct net *net; + + ASSERT_RTNL(); + + for_each_net(net) { + while ((skb = skb_dequeue(&net->wext_nlevents))) + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, + GFP_KERNEL); + } +} +EXPORT_SYMBOL_GPL(wireless_nlevent_flush); + +static int wext_netdev_notifier_call(struct notifier_block *nb, + unsigned long state, void *ptr) +{ + /* + * When a netdev changes state in any way, flush all pending messages + * to avoid them going out in a strange order, e.g. RTM_NEWLINK after + * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close() + * or similar - all of which could otherwise happen due to delays from + * schedule_work(). + */ + wireless_nlevent_flush(); + + return NOTIFY_OK; +} + +static struct notifier_block wext_netdev_notifier = { + .notifier_call = wext_netdev_notifier_call, +}; + static int __net_init wext_pernet_init(struct net *net) { skb_queue_head_init(&net->wext_nlevents); @@ -360,7 +394,12 @@ static struct pernet_operations wext_pernet_ops = { static int __init wireless_nlevent_init(void) { - return register_pernet_subsys(&wext_pernet_ops); + int err = register_pernet_subsys(&wext_pernet_ops); + + if (err) + return err; + + return register_netdevice_notifier(&wext_netdev_notifier); } subsys_initcall(wireless_nlevent_init); @@ -368,17 +407,8 @@ subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. 
*/ static void wireless_nlevent_process(struct work_struct *work) { - struct sk_buff *skb; - struct net *net; - rtnl_lock(); - - for_each_net(net) { - while ((skb = skb_dequeue(&net->wext_nlevents))) - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, - GFP_KERNEL); - } - + wireless_nlevent_flush(); rtnl_unlock(); } diff --git a/kernel/net/x25/af_x25.c b/kernel/net/x25/af_x25.c index c3ab230e4..a750f330b 100644 --- a/kernel/net/x25/af_x25.c +++ b/kernel/net/x25/af_x25.c @@ -515,10 +515,10 @@ static struct proto x25_proto = { .obj_size = sizeof(struct x25_sock), }; -static struct sock *x25_alloc_socket(struct net *net) +static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; - struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto); + struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; @@ -553,7 +553,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOBUFS; - if ((sk = x25_alloc_socket(net)) == NULL) + if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); @@ -602,7 +602,7 @@ static struct sock *x25_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) goto out; - if ((sk = x25_alloc_socket(sock_net(osk))) == NULL) + if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); diff --git a/kernel/net/xfrm/xfrm_algo.c b/kernel/net/xfrm/xfrm_algo.c index 12e82a5e4..f07224d8b 100644 --- a/kernel/net/xfrm/xfrm_algo.c +++ b/kernel/net/xfrm/xfrm_algo.c @@ -31,6 +31,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqiv", .icv_truncbits = 64, } }, @@ -49,6 +50,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqiv", .icv_truncbits = 96, } }, @@ -67,6 +69,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqiv", .icv_truncbits = 128, } }, @@ -85,6 +88,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo 
= { .aead = { + .geniv = "seqiv", .icv_truncbits = 64, } }, @@ -103,6 +107,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqiv", .icv_truncbits = 96, } }, @@ -121,6 +126,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqiv", .icv_truncbits = 128, } }, @@ -139,6 +145,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqiv", .icv_truncbits = 128, } }, @@ -152,6 +159,18 @@ static struct xfrm_algo_desc aead_list[] = { .sadb_alg_maxbits = 256 } }, +{ + .name = "rfc7539esp(chacha20,poly1305)", + + .uinfo = { + .aead = { + .geniv = "seqiv", + .icv_truncbits = 128, + } + }, + + .pfkey_supported = 0, +}, }; static struct xfrm_algo_desc aalg_list[] = { @@ -353,6 +372,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 64, .defkeybits = 64, } @@ -373,6 +393,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 64, .defkeybits = 192, } @@ -393,6 +414,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 64, .defkeybits = 128, } @@ -413,6 +435,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 64, .defkeybits = 128, } @@ -433,6 +456,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } @@ -453,6 +477,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } @@ -473,6 +498,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } @@ -493,6 +519,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } @@ -512,6 +539,7 @@ static struct xfrm_algo_desc ealg_list[] = 
{ .uinfo = { .encr = { + .geniv = "seqiv", .blockbits = 128, .defkeybits = 160, /* 128-bit key + 32-bit nonce */ } diff --git a/kernel/net/xfrm/xfrm_input.c b/kernel/net/xfrm/xfrm_input.c index b58286ecd..ad7f5b3f9 100644 --- a/kernel/net/xfrm/xfrm_input.c +++ b/kernel/net/xfrm/xfrm_input.c @@ -31,7 +31,7 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_input_afinfo_lock); if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL)) - err = -ENOBUFS; + err = -EEXIST; else rcu_assign_pointer(xfrm_input_afinfo[afinfo->family], afinfo); spin_unlock_bh(&xfrm_input_afinfo_lock); @@ -254,13 +254,13 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->sp->xvec[skb->sp->len++] = x; spin_lock(&x->lock); - if (unlikely(x->km.state == XFRM_STATE_ACQ)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); - goto drop_unlock; - } if (unlikely(x->km.state != XFRM_STATE_VALID)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); + if (x->km.state == XFRM_STATE_ACQ) + XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); + else + XFRM_INC_STATS(net, + LINUX_MIB_XFRMINSTATEINVALID); goto drop_unlock; } @@ -330,8 +330,10 @@ resume: if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); - if (inner_mode == NULL) + if (inner_mode == NULL) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; + } } if (inner_mode->input(x, skb)) { diff --git a/kernel/net/xfrm/xfrm_output.c b/kernel/net/xfrm/xfrm_output.c index fbcedbe33..ff4a91fca 100644 --- a/kernel/net/xfrm/xfrm_output.c +++ b/kernel/net/xfrm/xfrm_output.c @@ -19,7 +19,7 @@ #include #include -static int xfrm_output2(struct sock *sk, struct sk_buff *skb); +static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb); static int xfrm_skb_check_space(struct sk_buff *skb) { @@ -38,6 +38,18 @@ static int xfrm_skb_check_space(struct sk_buff *skb) return pskb_expand_head(skb, 
nhead, ntail, GFP_ATOMIC); } +/* Children define the path of the packet through the + * Linux networking. Thus, destinations are stackable. + */ + +static struct dst_entry *skb_dst_pop(struct sk_buff *skb) +{ + struct dst_entry *child = dst_clone(skb_dst(skb)->child); + + skb_dst_drop(skb); + return child; +} + static int xfrm_output_one(struct sk_buff *skb, int err) { struct dst_entry *dst = skb_dst(skb); @@ -119,18 +131,20 @@ out: int xfrm_output_resume(struct sk_buff *skb, int err) { + struct net *net = xs_net(skb_dst(skb)->xfrm); + while (likely((err = xfrm_output_one(skb, err)) == 0)) { nf_reset(skb); - err = skb_dst(skb)->ops->local_out(skb); + err = skb_dst(skb)->ops->local_out(net, skb->sk, skb); if (unlikely(err != 1)) goto out; if (!skb_dst(skb)->xfrm) - return dst_output(skb); + return dst_output(net, skb->sk, skb); err = nf_hook(skb_dst(skb)->ops->family, - NF_INET_POST_ROUTING, skb->sk, skb, + NF_INET_POST_ROUTING, net, skb->sk, skb, NULL, skb_dst(skb)->dev, xfrm_output2); if (unlikely(err != 1)) goto out; @@ -144,15 +158,17 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output_resume); -static int xfrm_output2(struct sock *sk, struct sk_buff *skb) +static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { return xfrm_output_resume(skb, 1); } -static int xfrm_output_gso(struct sock *sk, struct sk_buff *skb) +static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb) { struct sk_buff *segs; + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, 0); kfree_skb(skb); if (IS_ERR(segs)) @@ -165,7 +181,7 @@ static int xfrm_output_gso(struct sock *sk, struct sk_buff *skb) int err; segs->next = NULL; - err = xfrm_output2(sk, segs); + err = xfrm_output2(net, sk, segs); if (unlikely(err)) { kfree_skb_list(nskb); @@ -184,7 +200,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) int err; if (skb_is_gso(skb)) - return xfrm_output_gso(sk, 
skb); + return xfrm_output_gso(net, sk, skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { err = skb_checksum_help(skb); @@ -195,7 +211,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) } } - return xfrm_output2(sk, skb); + return xfrm_output2(net, sk, skb); } EXPORT_SYMBOL_GPL(xfrm_output); diff --git a/kernel/net/xfrm/xfrm_policy.c b/kernel/net/xfrm/xfrm_policy.c index 638af0655..b5e665b3c 100644 --- a/kernel/net/xfrm/xfrm_policy.c +++ b/kernel/net/xfrm/xfrm_policy.c @@ -115,7 +115,8 @@ static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) rcu_read_unlock(); } -static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, +static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, + int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, int family) @@ -127,14 +128,15 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); - dst = afinfo->dst_lookup(net, tos, saddr, daddr); + dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr); xfrm_policy_put_afinfo(afinfo); return dst; } -static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, +static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, + int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family) @@ -153,7 +155,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, daddr = x->coaddr; } - dst = __xfrm_dst_lookup(net, tos, saddr, daddr, family); + dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family); if (!IS_ERR(dst)) { if (prev_saddr != saddr) @@ -301,6 +303,14 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) } EXPORT_SYMBOL(xfrm_policy_alloc); +static void xfrm_policy_destroy_rcu(struct rcu_head *head) +{ + struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu); + + security_xfrm_policy_free(policy->security); + kfree(policy); +} + 
/* Destroy xfrm_policy: descendant resources must be released to this moment. */ void xfrm_policy_destroy(struct xfrm_policy *policy) @@ -310,19 +320,10 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); - security_xfrm_policy_free(policy->security); - kfree(policy); + call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); -static void xfrm_queue_purge(struct sk_buff_head *list) -{ - struct sk_buff *skb; - - while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); -} - /* Rule must be locked. Release descentant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ @@ -335,7 +336,7 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) if (del_timer(&policy->polq.hold_timer)) xfrm_pol_put(policy); - xfrm_queue_purge(&policy->polq.hold_queue); + skb_queue_purge(&policy->polq.hold_queue); if (del_timer(&policy->timer)) xfrm_pol_put(policy); @@ -708,6 +709,9 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, struct xfrm_policy_queue *pq = &old->polq; struct sk_buff_head list; + if (skb_queue_empty(&pq->hold_queue)) + return; + __skb_queue_head_init(&list); spin_lock_bh(&pq->hold_queue.lock); @@ -716,9 +720,6 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); - if (skb_queue_empty(&list)) - return; - pq = &new->polq; spin_lock_bh(&pq->hold_queue.lock); @@ -1012,7 +1013,9 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else - x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); + x = list_first_entry(&walk->walk.all, + struct xfrm_policy_walk_entry, all); + list_for_each_entry_from(x, &net->xfrm.policy_all, all) { if (x->dead) continue; @@ -1120,6 +1123,9 @@ static struct xfrm_policy 
*xfrm_policy_lookup_bytype(struct net *net, u8 type, } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst) { + if ((pol->priority >= priority) && ret) + break; + err = xfrm_policy_match(pol, fl, type, family, dir); if (err) { if (err == -ESRCH) @@ -1128,13 +1134,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, ret = ERR_PTR(err); goto fail; } - } else if (pol->priority < priority) { + } else { ret = pol; break; } } - if (ret) - xfrm_pol_hold(ret); + + xfrm_pol_hold(ret); fail: read_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -1209,14 +1215,16 @@ static inline int policy_to_flow_dir(int dir) } } -static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, +static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl) { struct xfrm_policy *pol; struct net *net = sock_net(sk); + rcu_read_lock(); read_lock_bh(&net->xfrm.xfrm_policy_lock); - if ((pol = sk->sk_policy[dir]) != NULL) { + pol = rcu_dereference(sk->sk_policy[dir]); + if (pol != NULL) { bool match = xfrm_selector_match(&pol->selector, fl, sk->sk_family); int err = 0; @@ -1240,6 +1248,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, } out: read_unlock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_unlock(); return pol; } @@ -1308,13 +1317,14 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) #endif write_lock_bh(&net->xfrm.xfrm_policy_lock); - old_pol = sk->sk_policy[dir]; - sk->sk_policy[dir] = pol; + old_pol = rcu_dereference_protected(sk->sk_policy[dir], + lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = get_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } + rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); @@ -1362,29 +1372,38 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy 
*old, int dir) return newp; } -int __xfrm_sk_clone_policy(struct sock *sk) +int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { - struct xfrm_policy *p0 = sk->sk_policy[0], - *p1 = sk->sk_policy[1]; + const struct xfrm_policy *p; + struct xfrm_policy *np; + int i, ret = 0; - sk->sk_policy[0] = sk->sk_policy[1] = NULL; - if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL) - return -ENOMEM; - if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL) - return -ENOMEM; - return 0; + rcu_read_lock(); + for (i = 0; i < 2; i++) { + p = rcu_dereference(osk->sk_policy[i]); + if (p) { + np = clone_policy(p, i); + if (unlikely(!np)) { + ret = -ENOMEM; + break; + } + rcu_assign_pointer(sk->sk_policy[i], np); + } + } + rcu_read_unlock(); + return ret; } static int -xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote, - unsigned short family) +xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, + xfrm_address_t *remote, unsigned short family) { int err; struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; - err = afinfo->get_saddr(net, local, remote); + err = afinfo->get_saddr(net, oif, local, remote); xfrm_policy_put_afinfo(afinfo); return err; } @@ -1413,7 +1432,9 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { - error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family); + error = xfrm_get_saddr(net, fl->flowi_oif, + &tmp, remote, + tmpl->encap_family); if (error) goto fail; local = &tmp; @@ -1582,8 +1603,6 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); xdst->flo.ops = &xfrm_bundle_fc_ops; - if (afinfo->init_dst) - afinfo->init_dst(net, xdst); } else xdst = ERR_PTR(-ENOBUFS); @@ -1693,8 +1712,8 @@ static struct dst_entry 
*xfrm_bundle_create(struct xfrm_policy *policy, if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { family = xfrm[i]->props.family; - dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr, - family); + dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, + &saddr, &daddr, family); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; @@ -1888,6 +1907,7 @@ static void xfrm_policy_queue_process(unsigned long arg) struct sock *sk; struct dst_entry *dst; struct xfrm_policy *pol = (struct xfrm_policy *)arg; + struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; @@ -1904,8 +1924,7 @@ static void xfrm_policy_queue_process(unsigned long arg) spin_unlock(&pq->hold_queue.lock); dst_hold(dst->path); - dst = xfrm_lookup(xp_net(pol), dst->path, &fl, - sk, 0); + dst = xfrm_lookup(net, dst->path, &fl, sk, 0); if (IS_ERR(dst)) goto purge_queue; @@ -1935,8 +1954,7 @@ static void xfrm_policy_queue_process(unsigned long arg) xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); dst_hold(skb_dst(skb)->path); - dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path, - &fl, skb->sk, 0); + dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; @@ -1946,7 +1964,7 @@ static void xfrm_policy_queue_process(unsigned long arg) skb_dst_drop(skb); skb_dst_set(skb, dst); - dst_output(skb); + dst_output(net, skb->sk, skb); } out: @@ -1955,11 +1973,11 @@ out: purge_queue: pq->timeout = 0; - xfrm_queue_purge(&pq->hold_queue); + skb_queue_purge(&pq->hold_queue); xfrm_pol_put(pol); } -static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) +static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); @@ -2186,7 +2204,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, */ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, - struct 
sock *sk, int flags) + const struct sock *sk, int flags) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct flow_cache_object *flo; @@ -2200,6 +2218,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, xdst = NULL; route = NULL; + sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); @@ -2334,7 +2353,7 @@ EXPORT_SYMBOL(xfrm_lookup); */ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, - struct sock *sk, int flags) + const struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags | XFRM_LOOKUP_QUEUE | @@ -2479,6 +2498,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } pol = NULL; + sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl); if (IS_ERR(pol)) { @@ -2806,7 +2826,6 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) { - struct net *net; int err = 0; if (unlikely(afinfo == NULL)) return -EINVAL; @@ -2814,7 +2833,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) - err = -ENOBUFS; + err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) @@ -2837,26 +2856,6 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) } spin_unlock(&xfrm_policy_afinfo_lock); - rtnl_lock(); - for_each_net(net) { - struct dst_ops *xfrm_dst_ops; - - switch (afinfo->family) { - case AF_INET: - xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops; - break; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops; - break; -#endif - default: - BUG(); - } - *xfrm_dst_ops = *afinfo->dst_ops; - } - 
rtnl_unlock(); - return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); @@ -2892,22 +2891,6 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); -static void __net_init xfrm_dst_ops_init(struct net *net) -{ - struct xfrm_policy_afinfo *afinfo; - - rcu_read_lock(); - afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]); - if (afinfo) - net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops; -#if IS_ENABLED(CONFIG_IPV6) - afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]); - if (afinfo) - net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops; -#endif - rcu_read_unlock(); -} - static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); @@ -3056,7 +3039,6 @@ static int __net_init xfrm_net_init(struct net *net) rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; - xfrm_dst_ops_init(net); rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; @@ -3209,16 +3191,17 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst) { + if ((pol->priority >= priority) && ret) + break; + if (xfrm_migrate_selector_match(sel, &pol->selector) && - pol->type == type && - pol->priority < priority) { + pol->type == type) { ret = pol; break; } } - if (ret) - xfrm_pol_hold(ret); + xfrm_pol_hold(ret); read_unlock_bh(&net->xfrm.xfrm_policy_lock); diff --git a/kernel/net/xfrm/xfrm_state.c b/kernel/net/xfrm/xfrm_state.c index 96688cd0f..9895a8c56 100644 --- a/kernel/net/xfrm/xfrm_state.c +++ b/kernel/net/xfrm/xfrm_state.c @@ -1626,7 +1626,7 @@ int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, if (list_empty(&walk->all)) x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all); else - x = list_entry(&walk->all, struct xfrm_state_walk, all); + x = list_first_entry(&walk->all, struct xfrm_state_walk, all); 
list_for_each_entry_from(x, &net->xfrm.state_all, all) { if (x->state == XFRM_STATE_DEAD) continue; @@ -1908,7 +1908,7 @@ int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_state_afinfo_lock); if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) - err = -ENOBUFS; + err = -EEXIST; else rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo); spin_unlock_bh(&xfrm_state_afinfo_lock); diff --git a/kernel/net/xfrm/xfrm_user.c b/kernel/net/xfrm/xfrm_user.c index 209166429..805681a7d 100644 --- a/kernel/net/xfrm/xfrm_user.c +++ b/kernel/net/xfrm/xfrm_user.c @@ -31,6 +31,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include #endif +#include static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) { @@ -289,6 +290,31 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, return 0; } +static int attach_crypt(struct xfrm_state *x, struct nlattr *rta) +{ + struct xfrm_algo *p, *ualg; + struct xfrm_algo_desc *algo; + + if (!rta) + return 0; + + ualg = nla_data(rta); + + algo = xfrm_ealg_get_byname(ualg->alg_name, 1); + if (!algo) + return -ENOSYS; + x->props.ealgo = algo->desc.sadb_alg_id; + + p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL); + if (!p) + return -ENOMEM; + + strcpy(p->alg_name, algo->name); + x->ealg = p; + x->geniv = algo->uinfo.encr.geniv; + return 0; +} + static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, struct nlattr *rta) { @@ -349,8 +375,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, return 0; } -static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props, - struct nlattr *rta) +static int attach_aead(struct xfrm_state *x, struct nlattr *rta) { struct xfrm_algo_aead *p, *ualg; struct xfrm_algo_desc *algo; @@ -363,14 +388,15 @@ static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props, algo = xfrm_aead_get_byname(ualg->alg_name, ualg->alg_icv_len, 1); if (!algo) return -ENOSYS; - *props = algo->desc.sadb_alg_id; + 
x->props.ealgo = algo->desc.sadb_alg_id; p = kmemdup(ualg, aead_len(ualg), GFP_KERNEL); if (!p) return -ENOMEM; strcpy(p->alg_name, algo->name); - *algpp = p; + x->aead = p; + x->geniv = algo->uinfo.aead.geniv; return 0; } @@ -515,8 +541,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (attrs[XFRMA_SA_EXTRA_FLAGS]) x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); - if ((err = attach_aead(&x->aead, &x->props.ealgo, - attrs[XFRMA_ALG_AEAD]))) + if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD]))) goto error; if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo, attrs[XFRMA_ALG_AUTH_TRUNC]))) @@ -526,9 +551,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, attrs[XFRMA_ALG_AUTH]))) goto error; } - if ((err = attach_one_algo(&x->ealg, &x->props.ealgo, - xfrm_ealg_get_byname, - attrs[XFRMA_ALG_CRYPT]))) + if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT]))) goto error; if ((err = attach_one_algo(&x->calg, &x->props.calgo, xfrm_calg_get_byname, @@ -706,7 +729,9 @@ static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) memcpy(&p->sel, &x->sel, sizeof(p->sel)); memcpy(&p->lft, &x->lft, sizeof(p->lft)); memcpy(&p->curlft, &x->curlft, sizeof(p->curlft)); - memcpy(&p->stats, &x->stats, sizeof(p->stats)); + put_unaligned(x->stats.replay_window, &p->stats.replay_window); + put_unaligned(x->stats.replay, &p->stats.replay); + put_unaligned(x->stats.integrity_failed, &p->stats.integrity_failed); memcpy(&p->saddr, &x->props.saddr, sizeof(p->saddr)); p->mode = x->props.mode; p->replay_window = x->props.replay_window; @@ -903,12 +928,10 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) return err; if (attrs[XFRMA_ADDRESS_FILTER]) { - filter = kmalloc(sizeof(*filter), GFP_KERNEL); + filter = kmemdup(nla_data(attrs[XFRMA_ADDRESS_FILTER]), + sizeof(*filter), GFP_KERNEL); if (filter == NULL) return -ENOMEM; - - memcpy(filter, nla_data(attrs[XFRMA_ADDRESS_FILTER]), - 
sizeof(*filter)); } if (attrs[XFRMA_PROTO]) @@ -1908,8 +1931,10 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *rp = attrs[XFRMA_REPLAY_VAL]; struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL]; struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; + struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; + struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; - if (!lt && !rp && !re) + if (!lt && !rp && !re && !et && !rt) return err; /* pedantic mode - thou shalt sayeth replaceth */ @@ -2026,7 +2051,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_audit_policy_delete(xp, 1, true); } else { // reset the timers here? - WARN(1, "Dont know what to do with soft policy expire\n"); + WARN(1, "Don't know what to do with soft policy expire\n"); } km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid); -- cgit 1.2.3-korg