author     José Pekkarinen <jose.pekkarinen@nokia.com>   2016-04-11 10:41:07 +0300
committer  José Pekkarinen <jose.pekkarinen@nokia.com>   2016-04-13 08:17:18 +0300
commit     e09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
tree       d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/net
parent     f93b97fd65072de626c074dbe099a1fff05ce060 (diff)
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and the rt patch from the rt wiki download page.
During the rebase, the following patch collided: Force tick interrupt and get
rid of softirq magic (I70131fb85). The colliding change has been dropped
because its logic was already present in the source.

Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/net')
-rw-r--r--kernel/net/6lowpan/Makefile2
-rw-r--r--kernel/net/6lowpan/core.c45
-rw-r--r--kernel/net/6lowpan/iphc.c707
-rw-r--r--kernel/net/6lowpan/nhc.c16
-rw-r--r--kernel/net/6lowpan/nhc.h14
-rw-r--r--kernel/net/6lowpan/nhc_udp.c35
-rw-r--r--kernel/net/8021q/vlan.c96
-rw-r--r--kernel/net/8021q/vlan_core.c14
-rw-r--r--kernel/net/8021q/vlan_dev.c3
-rw-r--r--kernel/net/9p/client.c8
-rw-r--r--kernel/net/9p/trans_rdma.c34
-rw-r--r--kernel/net/9p/trans_virtio.c1
-rw-r--r--kernel/net/Kconfig11
-rw-r--r--kernel/net/Makefile3
-rw-r--r--kernel/net/appletalk/ddp.c2
-rw-r--r--kernel/net/atm/br2684.c9
-rw-r--r--kernel/net/atm/clip.c3
-rw-r--r--kernel/net/atm/common.c4
-rw-r--r--kernel/net/atm/common.h2
-rw-r--r--kernel/net/atm/pvc.c2
-rw-r--r--kernel/net/atm/svc.c2
-rw-r--r--kernel/net/ax25/af_ax25.c38
-rw-r--r--kernel/net/ax25/ax25_in.c3
-rw-r--r--kernel/net/ax25/ax25_ip.c1
-rw-r--r--kernel/net/ax25/ax25_out.c1
-rw-r--r--kernel/net/ax25/ax25_subr.c1
-rw-r--r--kernel/net/ax25/ax25_uid.c1
-rw-r--r--kernel/net/batman-adv/Makefile6
-rw-r--r--kernel/net/batman-adv/bat_algo.h2
-rw-r--r--kernel/net/batman-adv/bat_iv_ogm.c336
-rw-r--r--kernel/net/batman-adv/bitarray.c12
-rw-r--r--kernel/net/batman-adv/bitarray.h18
-rw-r--r--kernel/net/batman-adv/bridge_loop_avoidance.c144
-rw-r--r--kernel/net/batman-adv/bridge_loop_avoidance.h14
-rw-r--r--kernel/net/batman-adv/debugfs.c47
-rw-r--r--kernel/net/batman-adv/debugfs.h42
-rw-r--r--kernel/net/batman-adv/distributed-arp-table.c137
-rw-r--r--kernel/net/batman-adv/distributed-arp-table.h23
-rw-r--r--kernel/net/batman-adv/fragmentation.c57
-rw-r--r--kernel/net/batman-adv/fragmentation.h11
-rw-r--r--kernel/net/batman-adv/gateway_client.c124
-rw-r--r--kernel/net/batman-adv/gateway_client.h14
-rw-r--r--kernel/net/batman-adv/gateway_common.c80
-rw-r--r--kernel/net/batman-adv/gateway_common.h8
-rw-r--r--kernel/net/batman-adv/hard-interface.c84
-rw-r--r--kernel/net/batman-adv/hard-interface.h25
-rw-r--r--kernel/net/batman-adv/hash.c14
-rw-r--r--kernel/net/batman-adv/hash.h45
-rw-r--r--kernel/net/batman-adv/icmp_socket.c41
-rw-r--r--kernel/net/batman-adv/icmp_socket.h8
-rw-r--r--kernel/net/batman-adv/main.c187
-rw-r--r--kernel/net/batman-adv/main.h86
-rw-r--r--kernel/net/batman-adv/multicast.c145
-rw-r--r--kernel/net/batman-adv/multicast.h6
-rw-r--r--kernel/net/batman-adv/network-coding.c134
-rw-r--r--kernel/net/batman-adv/network-coding.h11
-rw-r--r--kernel/net/batman-adv/originator.c298
-rw-r--r--kernel/net/batman-adv/originator.h45
-rw-r--r--kernel/net/batman-adv/packet.h209
-rw-r--r--kernel/net/batman-adv/routing.c81
-rw-r--r--kernel/net/batman-adv/routing.h10
-rw-r--r--kernel/net/batman-adv/send.c51
-rw-r--r--kernel/net/batman-adv/send.h20
-rw-r--r--kernel/net/batman-adv/soft-interface.c112
-rw-r--r--kernel/net/batman-adv/soft-interface.h9
-rw-r--r--kernel/net/batman-adv/sysfs.c66
-rw-r--r--kernel/net/batman-adv/sysfs.h10
-rw-r--r--kernel/net/batman-adv/translation-table.c482
-rw-r--r--kernel/net/batman-adv/translation-table.h38
-rw-r--r--kernel/net/batman-adv/types.h161
-rw-r--r--kernel/net/bluetooth/6lowpan.c214
-rw-r--r--kernel/net/bluetooth/Kconfig5
-rw-r--r--kernel/net/bluetooth/Makefile6
-rw-r--r--kernel/net/bluetooth/a2mp.c17
-rw-r--r--kernel/net/bluetooth/a2mp.h19
-rw-r--r--kernel/net/bluetooth/af_bluetooth.c10
-rw-r--r--kernel/net/bluetooth/amp.c134
-rw-r--r--kernel/net/bluetooth/amp.h14
-rw-r--r--kernel/net/bluetooth/bnep/sock.c2
-rw-r--r--kernel/net/bluetooth/cmtp/capi.c8
-rw-r--r--kernel/net/bluetooth/cmtp/sock.c2
-rw-r--r--kernel/net/bluetooth/hci_conn.c427
-rw-r--r--kernel/net/bluetooth/hci_core.c401
-rw-r--r--kernel/net/bluetooth/hci_event.c372
-rw-r--r--kernel/net/bluetooth/hci_request.c137
-rw-r--r--kernel/net/bluetooth/hci_request.h4
-rw-r--r--kernel/net/bluetooth/hci_sock.c113
-rw-r--r--kernel/net/bluetooth/hidp/core.c15
-rw-r--r--kernel/net/bluetooth/hidp/sock.c2
-rw-r--r--kernel/net/bluetooth/l2cap_core.c43
-rw-r--r--kernel/net/bluetooth/l2cap_sock.c104
-rw-r--r--kernel/net/bluetooth/lib.c32
-rw-r--r--kernel/net/bluetooth/mgmt.c814
-rw-r--r--kernel/net/bluetooth/rfcomm/core.c2
-rw-r--r--kernel/net/bluetooth/rfcomm/sock.c28
-rw-r--r--kernel/net/bluetooth/sco.c63
-rw-r--r--kernel/net/bluetooth/smp.c253
-rw-r--r--kernel/net/bluetooth/smp.h1
-rw-r--r--kernel/net/bridge/Makefile2
-rw-r--r--kernel/net/bridge/br.c25
-rw-r--r--kernel/net/bridge/br_device.c16
-rw-r--r--kernel/net/bridge/br_fdb.c242
-rw-r--r--kernel/net/bridge/br_forward.c62
-rw-r--r--kernel/net/bridge/br_if.c8
-rw-r--r--kernel/net/bridge/br_input.c35
-rw-r--r--kernel/net/bridge/br_ioctl.c3
-rw-r--r--kernel/net/bridge/br_mdb.c159
-rw-r--r--kernel/net/bridge/br_multicast.c399
-rw-r--r--kernel/net/bridge/br_netfilter_hooks.c (renamed from kernel/net/bridge/br_netfilter.c)393
-rw-r--r--kernel/net/bridge/br_netfilter_ipv6.c244
-rw-r--r--kernel/net/bridge/br_netlink.c597
-rw-r--r--kernel/net/bridge/br_private.h240
-rw-r--r--kernel/net/bridge/br_stp.c42
-rw-r--r--kernel/net/bridge/br_stp_bpdu.c12
-rw-r--r--kernel/net/bridge/br_stp_if.c32
-rw-r--r--kernel/net/bridge/br_stp_timer.c4
-rw-r--r--kernel/net/bridge/br_sysfs_br.c11
-rw-r--r--kernel/net/bridge/br_sysfs_if.c2
-rw-r--r--kernel/net/bridge/br_vlan.c800
-rw-r--r--kernel/net/bridge/netfilter/ebt_log.c2
-rw-r--r--kernel/net/bridge/netfilter/ebt_nflog.c2
-rw-r--r--kernel/net/bridge/netfilter/ebt_stp.c6
-rw-r--r--kernel/net/bridge/netfilter/ebtable_broute.c8
-rw-r--r--kernel/net/bridge/netfilter/ebtable_filter.c13
-rw-r--r--kernel/net/bridge/netfilter/ebtable_nat.c13
-rw-r--r--kernel/net/bridge/netfilter/ebtables.c20
-rw-r--r--kernel/net/bridge/netfilter/nf_tables_bridge.c20
-rw-r--r--kernel/net/bridge/netfilter/nft_reject_bridge.c19
-rw-r--r--kernel/net/caif/caif_dev.c2
-rw-r--r--kernel/net/caif/caif_socket.c25
-rw-r--r--kernel/net/can/af_can.c2
-rw-r--r--kernel/net/can/bcm.c15
-rw-r--r--kernel/net/can/gw.c68
-rw-r--r--kernel/net/ceph/auth_x.c36
-rw-r--r--kernel/net/ceph/ceph_common.c87
-rw-r--r--kernel/net/ceph/crush/crush.c13
-rw-r--r--kernel/net/ceph/crush/crush_ln_table.h32
-rw-r--r--kernel/net/ceph/crush/hash.c8
-rw-r--r--kernel/net/ceph/crush/mapper.c148
-rw-r--r--kernel/net/ceph/crypto.c10
-rw-r--r--kernel/net/ceph/crypto.h4
-rw-r--r--kernel/net/ceph/messenger.c291
-rw-r--r--kernel/net/ceph/mon_client.c50
-rw-r--r--kernel/net/ceph/osd_client.c140
-rw-r--r--kernel/net/ceph/osdmap.c2
-rw-r--r--kernel/net/ceph/pagevec.c5
-rw-r--r--kernel/net/core/Makefile1
-rw-r--r--kernel/net/core/datagram.c2
-rw-r--r--kernel/net/core/dev.c491
-rw-r--r--kernel/net/core/dst.c127
-rw-r--r--kernel/net/core/ethtool.c15
-rw-r--r--kernel/net/core/fib_rules.c34
-rw-r--r--kernel/net/core/filter.c543
-rw-r--r--kernel/net/core/flow_dissector.c820
-rw-r--r--kernel/net/core/gen_estimator.c13
-rw-r--r--kernel/net/core/lwtunnel.c249
-rw-r--r--kernel/net/core/neighbour.c68
-rw-r--r--kernel/net/core/net-sysfs.c62
-rw-r--r--kernel/net/core/net-traces.c1
-rw-r--r--kernel/net/core/net_namespace.c133
-rw-r--r--kernel/net/core/netclassid_cgroup.c31
-rw-r--r--kernel/net/core/netevent.c5
-rw-r--r--kernel/net/core/netpoll.c31
-rw-r--r--kernel/net/core/netprio_cgroup.c9
-rw-r--r--kernel/net/core/pktgen.c133
-rw-r--r--kernel/net/core/ptp_classifier.c16
-rw-r--r--kernel/net/core/request_sock.c88
-rw-r--r--kernel/net/core/rtnetlink.c406
-rw-r--r--kernel/net/core/scm.c9
-rw-r--r--kernel/net/core/secure_seq.c2
-rw-r--r--kernel/net/core/skbuff.c421
-rw-r--r--kernel/net/core/sock.c176
-rw-r--r--kernel/net/core/sock_diag.c97
-rw-r--r--kernel/net/core/stream.c12
-rw-r--r--kernel/net/core/sysctl_net_core.c10
-rw-r--r--kernel/net/core/timestamping.c6
-rw-r--r--kernel/net/core/tso.c18
-rw-r--r--kernel/net/core/utils.c70
-rw-r--r--kernel/net/dcb/dcbnl.c30
-rw-r--r--kernel/net/dccp/ackvec.c12
-rw-r--r--kernel/net/dccp/ccid.c3
-rw-r--r--kernel/net/dccp/dccp.h16
-rw-r--r--kernel/net/dccp/diag.c1
-rw-r--r--kernel/net/dccp/ipv4.c94
-rw-r--r--kernel/net/dccp/ipv6.c175
-rw-r--r--kernel/net/dccp/minisocks.c22
-rw-r--r--kernel/net/dccp/output.c17
-rw-r--r--kernel/net/dccp/probe.c11
-rw-r--r--kernel/net/dccp/proto.c5
-rw-r--r--kernel/net/decnet/af_decnet.c19
-rw-r--r--kernel/net/decnet/dn_neigh.c23
-rw-r--r--kernel/net/decnet/dn_nsp_in.c7
-rw-r--r--kernel/net/decnet/dn_nsp_out.c4
-rw-r--r--kernel/net/decnet/dn_route.c38
-rw-r--r--kernel/net/decnet/dn_rules.c1
-rw-r--r--kernel/net/decnet/netfilter/dn_rtmsg.c2
-rw-r--r--kernel/net/dns_resolver/dns_key.c20
-rw-r--r--kernel/net/dns_resolver/dns_query.c9
-rw-r--r--kernel/net/dns_resolver/internal.h8
-rw-r--r--kernel/net/dsa/dsa.c216
-rw-r--r--kernel/net/dsa/dsa_priv.h8
-rw-r--r--kernel/net/dsa/slave.c479
-rw-r--r--kernel/net/dsa/tag_brcm.c15
-rw-r--r--kernel/net/dsa/tag_dsa.c12
-rw-r--r--kernel/net/dsa/tag_edsa.c12
-rw-r--r--kernel/net/dsa/tag_trailer.c14
-rw-r--r--kernel/net/ethernet/eth.c19
-rw-r--r--kernel/net/hsr/hsr_device.c4
-rw-r--r--kernel/net/ieee802154/6lowpan/6lowpan_i.h25
-rw-r--r--kernel/net/ieee802154/6lowpan/core.c199
-rw-r--r--kernel/net/ieee802154/6lowpan/reassembly.c174
-rw-r--r--kernel/net/ieee802154/6lowpan/rx.c378
-rw-r--r--kernel/net/ieee802154/6lowpan/tx.c100
-rw-r--r--kernel/net/ieee802154/Kconfig5
-rw-r--r--kernel/net/ieee802154/core.c14
-rw-r--r--kernel/net/ieee802154/core.h1
-rw-r--r--kernel/net/ieee802154/header_ops.c20
-rw-r--r--kernel/net/ieee802154/nl-mac.c39
-rw-r--r--kernel/net/ieee802154/nl-phy.c10
-rw-r--r--kernel/net/ieee802154/nl802154.c1494
-rw-r--r--kernel/net/ieee802154/rdev-ops.h165
-rw-r--r--kernel/net/ieee802154/socket.c30
-rw-r--r--kernel/net/ieee802154/sysfs.c38
-rw-r--r--kernel/net/ieee802154/trace.h79
-rw-r--r--kernel/net/ipv4/Kconfig34
-rw-r--r--kernel/net/ipv4/Makefile3
-rw-r--r--kernel/net/ipv4/af_inet.c89
-rw-r--r--kernel/net/ipv4/ah4.c4
-rw-r--r--kernel/net/ipv4/arp.c147
-rw-r--r--kernel/net/ipv4/datagram.c2
-rw-r--r--kernel/net/ipv4/devinet.c25
-rw-r--r--kernel/net/ipv4/esp4.c201
-rw-r--r--kernel/net/ipv4/fib_frontend.c130
-rw-r--r--kernel/net/ipv4/fib_lookup.h1
-rw-r--r--kernel/net/ipv4/fib_rules.c6
-rw-r--r--kernel/net/ipv4/fib_semantics.c520
-rw-r--r--kernel/net/ipv4/fib_trie.c90
-rw-r--r--kernel/net/ipv4/fou.c35
-rw-r--r--kernel/net/ipv4/geneve.c453
-rw-r--r--kernel/net/ipv4/gre_demux.c235
-rw-r--r--kernel/net/ipv4/gre_offload.c3
-rw-r--r--kernel/net/ipv4/icmp.c32
-rw-r--r--kernel/net/ipv4/igmp.c214
-rw-r--r--kernel/net/ipv4/inet_connection_sock.c297
-rw-r--r--kernel/net/ipv4/inet_diag.c152
-rw-r--r--kernel/net/ipv4/inet_fragment.c46
-rw-r--r--kernel/net/ipv4/inet_hashtables.c137
-rw-r--r--kernel/net/ipv4/inet_timewait_sock.c73
-rw-r--r--kernel/net/ipv4/inetpeer.c20
-rw-r--r--kernel/net/ipv4/ip_forward.c37
-rw-r--r--kernel/net/ipv4/ip_fragment.c120
-rw-r--r--kernel/net/ipv4/ip_gre.c464
-rw-r--r--kernel/net/ipv4/ip_input.c50
-rw-r--r--kernel/net/ipv4/ip_output.c204
-rw-r--r--kernel/net/ipv4/ip_sockglue.c54
-rw-r--r--kernel/net/ipv4/ip_tunnel.c37
-rw-r--r--kernel/net/ipv4/ip_tunnel_core.c258
-rw-r--r--kernel/net/ipv4/ip_vti.c2
-rw-r--r--kernel/net/ipv4/ipconfig.c34
-rw-r--r--kernel/net/ipv4/ipip.c8
-rw-r--r--kernel/net/ipv4/ipmr.c39
-rw-r--r--kernel/net/ipv4/netfilter.c16
-rw-r--r--kernel/net/ipv4/netfilter/Kconfig17
-rw-r--r--kernel/net/ipv4/netfilter/Makefile3
-rw-r--r--kernel/net/ipv4/netfilter/arp_tables.c131
-rw-r--r--kernel/net/ipv4/netfilter/arptable_filter.c7
-rw-r--r--kernel/net/ipv4/netfilter/ip_tables.c170
-rw-r--r--kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c17
-rw-r--r--kernel/net/ipv4/netfilter/ipt_ECN.c2
-rw-r--r--kernel/net/ipv4/netfilter/ipt_REJECT.c2
-rw-r--r--kernel/net/ipv4/netfilter/ipt_SYNPROXY.c35
-rw-r--r--kernel/net/ipv4/netfilter/ipt_ah.c2
-rw-r--r--kernel/net/ipv4/netfilter/ipt_rpfilter.c11
-rw-r--r--kernel/net/ipv4/netfilter/iptable_filter.c9
-rw-r--r--kernel/net/ipv4/netfilter/iptable_mangle.c19
-rw-r--r--kernel/net/ipv4/netfilter/iptable_nat.c26
-rw-r--r--kernel/net/ipv4/netfilter/iptable_raw.c9
-rw-r--r--kernel/net/ipv4/netfilter/iptable_security.c12
-rw-r--r--kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c20
-rw-r--r--kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c8
-rw-r--r--kernel/net/ipv4/netfilter/nf_defrag_ipv4.c42
-rw-r--r--kernel/net/ipv4/netfilter/nf_dup_ipv4.c106
-rw-r--r--kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c48
-rw-r--r--kernel/net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c2
-rw-r--r--kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c2
-rw-r--r--kernel/net/ipv4/netfilter/nf_reject_ipv4.c6
-rw-r--r--kernel/net/ipv4/netfilter/nf_tables_arp.c6
-rw-r--r--kernel/net/ipv4/netfilter/nf_tables_ipv4.c10
-rw-r--r--kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c22
-rw-r--r--kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c8
-rw-r--r--kernel/net/ipv4/netfilter/nft_dup_ipv4.c110
-rw-r--r--kernel/net/ipv4/netfilter/nft_masq_ipv4.c2
-rw-r--r--kernel/net/ipv4/netfilter/nft_redir_ipv4.c2
-rw-r--r--kernel/net/ipv4/netfilter/nft_reject_ipv4.c5
-rw-r--r--kernel/net/ipv4/ping.c7
-rw-r--r--kernel/net/ipv4/proc.c4
-rw-r--r--kernel/net/ipv4/raw.c28
-rw-r--r--kernel/net/ipv4/route.c354
-rw-r--r--kernel/net/ipv4/syncookies.c33
-rw-r--r--kernel/net/ipv4/sysctl_net_ipv4.c59
-rw-r--r--kernel/net/ipv4/tcp.c189
-rw-r--r--kernel/net/ipv4/tcp_bic.c2
-rw-r--r--kernel/net/ipv4/tcp_cdg.c433
-rw-r--r--kernel/net/ipv4/tcp_cong.c27
-rw-r--r--kernel/net/ipv4/tcp_cubic.c26
-rw-r--r--kernel/net/ipv4/tcp_dctcp.c26
-rw-r--r--kernel/net/ipv4/tcp_diag.c8
-rw-r--r--kernel/net/ipv4/tcp_fastopen.c75
-rw-r--r--kernel/net/ipv4/tcp_highspeed.c2
-rw-r--r--kernel/net/ipv4/tcp_htcp.c2
-rw-r--r--kernel/net/ipv4/tcp_hybla.c2
-rw-r--r--kernel/net/ipv4/tcp_illinois.c2
-rw-r--r--kernel/net/ipv4/tcp_input.c532
-rw-r--r--kernel/net/ipv4/tcp_ipv4.c273
-rw-r--r--kernel/net/ipv4/tcp_metrics.c83
-rw-r--r--kernel/net/ipv4/tcp_minisocks.c88
-rw-r--r--kernel/net/ipv4/tcp_offload.c4
-rw-r--r--kernel/net/ipv4/tcp_output.c278
-rw-r--r--kernel/net/ipv4/tcp_recovery.c109
-rw-r--r--kernel/net/ipv4/tcp_scalable.c2
-rw-r--r--kernel/net/ipv4/tcp_timer.c25
-rw-r--r--kernel/net/ipv4/tcp_vegas.c6
-rw-r--r--kernel/net/ipv4/tcp_veno.c2
-rw-r--r--kernel/net/ipv4/tcp_yeah.c2
-rw-r--r--kernel/net/ipv4/udp.c21
-rw-r--r--kernel/net/ipv4/udp_diag.c2
-rw-r--r--kernel/net/ipv4/udp_tunnel.c33
-rw-r--r--kernel/net/ipv4/xfrm4_input.c7
-rw-r--r--kernel/net/ipv4/xfrm4_output.c13
-rw-r--r--kernel/net/ipv4/xfrm4_policy.c113
-rw-r--r--kernel/net/ipv6/Kconfig30
-rw-r--r--kernel/net/ipv6/Makefile2
-rw-r--r--kernel/net/ipv6/addrconf.c512
-rw-r--r--kernel/net/ipv6/addrconf_core.c11
-rw-r--r--kernel/net/ipv6/addrlabel.c2
-rw-r--r--kernel/net/ipv6/af_inet6.c34
-rw-r--r--kernel/net/ipv6/ah6.c4
-rw-r--r--kernel/net/ipv6/datagram.c19
-rw-r--r--kernel/net/ipv6/esp6.c201
-rw-r--r--kernel/net/ipv6/exthdrs.c5
-rw-r--r--kernel/net/ipv6/fib6_rules.c25
-rw-r--r--kernel/net/ipv6/icmp.c44
-rw-r--r--kernel/net/ipv6/ila.c229
-rw-r--r--kernel/net/ipv6/inet6_connection_sock.c98
-rw-r--r--kernel/net/ipv6/inet6_hashtables.c19
-rw-r--r--kernel/net/ipv6/ip6_fib.c65
-rw-r--r--kernel/net/ipv6/ip6_flowlabel.c9
-rw-r--r--kernel/net/ipv6/ip6_gre.c106
-rw-r--r--kernel/net/ipv6/ip6_input.c20
-rw-r--r--kernel/net/ipv6/ip6_offload.c12
-rw-r--r--kernel/net/ipv6/ip6_output.c268
-rw-r--r--kernel/net/ipv6/ip6_tunnel.c149
-rw-r--r--kernel/net/ipv6/ip6_udp_tunnel.c13
-rw-r--r--kernel/net/ipv6/ip6_vti.c2
-rw-r--r--kernel/net/ipv6/ip6mr.c32
-rw-r--r--kernel/net/ipv6/ipv6_sockglue.c33
-rw-r--r--kernel/net/ipv6/mcast.c11
-rw-r--r--kernel/net/ipv6/mcast_snoop.c216
-rw-r--r--kernel/net/ipv6/mip6.c16
-rw-r--r--kernel/net/ipv6/ndisc.c94
-rw-r--r--kernel/net/ipv6/netfilter.c9
-rw-r--r--kernel/net/ipv6/netfilter/Kconfig17
-rw-r--r--kernel/net/ipv6/netfilter/Makefile3
-rw-r--r--kernel/net/ipv6/netfilter/ip6_tables.c162
-rw-r--r--kernel/net/ipv6/netfilter/ip6t_REJECT.c11
-rw-r--r--kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c31
-rw-r--r--kernel/net/ipv6/netfilter/ip6t_rpfilter.c6
-rw-r--r--kernel/net/ipv6/netfilter/ip6table_filter.c6
-rw-r--r--kernel/net/ipv6/netfilter/ip6table_mangle.c18
-rw-r--r--kernel/net/ipv6/netfilter/ip6table_nat.c26
-rw-r--r--kernel/net/ipv6/netfilter/ip6table_raw.c6
-rw-r--r--kernel/net/ipv6/netfilter/ip6table_security.c7
-rw-r--r--kernel/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c20
-rw-r--r--kernel/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c12
-rw-r--r--kernel/net/ipv6/netfilter/nf_conntrack_reasm.c33
-rw-r--r--kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c32
-rw-r--r--kernel/net/ipv6/netfilter/nf_dup_ipv6.c82
-rw-r--r--kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c48
-rw-r--r--kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c2
-rw-r--r--kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c2
-rw-r--r--kernel/net/ipv6/netfilter/nf_reject_ipv6.c6
-rw-r--r--kernel/net/ipv6/netfilter/nf_tables_ipv6.c10
-rw-r--r--kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c22
-rw-r--r--kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c14
-rw-r--r--kernel/net/ipv6/netfilter/nft_dup_ipv6.c108
-rw-r--r--kernel/net/ipv6/netfilter/nft_redir_ipv6.c3
-rw-r--r--kernel/net/ipv6/netfilter/nft_reject_ipv6.c7
-rw-r--r--kernel/net/ipv6/output_core.c38
-rw-r--r--kernel/net/ipv6/raw.c31
-rw-r--r--kernel/net/ipv6/reassembly.c30
-rw-r--r--kernel/net/ipv6/route.c932
-rw-r--r--kernel/net/ipv6/sit.c28
-rw-r--r--kernel/net/ipv6/syncookies.c34
-rw-r--r--kernel/net/ipv6/sysctl_net_ipv6.c23
-rw-r--r--kernel/net/ipv6/tcp_ipv6.c291
-rw-r--r--kernel/net/ipv6/tunnel6.c12
-rw-r--r--kernel/net/ipv6/udp.c22
-rw-r--r--kernel/net/ipv6/xfrm6_input.c4
-rw-r--r--kernel/net/ipv6/xfrm6_mode_tunnel.c5
-rw-r--r--kernel/net/ipv6/xfrm6_output.c40
-rw-r--r--kernel/net/ipv6/xfrm6_policy.c92
-rw-r--r--kernel/net/ipx/af_ipx.c2
-rw-r--r--kernel/net/irda/af_irda.c8
-rw-r--r--kernel/net/irda/ircomm/ircomm_tty.c31
-rw-r--r--kernel/net/irda/irlmp.c2
-rw-r--r--kernel/net/irda/timer.c4
-rw-r--r--kernel/net/iucv/af_iucv.c24
-rw-r--r--kernel/net/iucv/iucv.c12
-rw-r--r--kernel/net/key/af_key.c51
-rw-r--r--kernel/net/l2tp/l2tp_core.c26
-rw-r--r--kernel/net/l2tp/l2tp_core.h3
-rw-r--r--kernel/net/l2tp/l2tp_eth.c1
-rw-r--r--kernel/net/l2tp/l2tp_ip.c1
-rw-r--r--kernel/net/l2tp/l2tp_ip6.c9
-rw-r--r--kernel/net/l2tp/l2tp_netlink.c25
-rw-r--r--kernel/net/l2tp/l2tp_ppp.c5
-rw-r--r--kernel/net/l3mdev/Kconfig10
-rw-r--r--kernel/net/l3mdev/Makefile5
-rw-r--r--kernel/net/l3mdev/l3mdev.c92
-rw-r--r--kernel/net/llc/af_llc.c6
-rw-r--r--kernel/net/llc/llc_conn.c6
-rw-r--r--kernel/net/mac80211/Kconfig17
-rw-r--r--kernel/net/mac80211/Makefile2
-rw-r--r--kernel/net/mac80211/aes_ccm.c33
-rw-r--r--kernel/net/mac80211/aes_cmac.c17
-rw-r--r--kernel/net/mac80211/aes_gcm.c33
-rw-r--r--kernel/net/mac80211/aes_gmac.c14
-rw-r--r--kernel/net/mac80211/agg-rx.c10
-rw-r--r--kernel/net/mac80211/agg-tx.c22
-rw-r--r--kernel/net/mac80211/cfg.c489
-rw-r--r--kernel/net/mac80211/cfg.h9
-rw-r--r--kernel/net/mac80211/chan.c41
-rw-r--r--kernel/net/mac80211/debugfs.c179
-rw-r--r--kernel/net/mac80211/debugfs_key.c70
-rw-r--r--kernel/net/mac80211/debugfs_netdev.c75
-rw-r--r--kernel/net/mac80211/debugfs_sta.c95
-rw-r--r--kernel/net/mac80211/driver-ops.c309
-rw-r--r--kernel/net/mac80211/driver-ops.h341
-rw-r--r--kernel/net/mac80211/ethtool.c32
-rw-r--r--kernel/net/mac80211/event.c27
-rw-r--r--kernel/net/mac80211/ibss.c34
-rw-r--r--kernel/net/mac80211/ieee80211_i.h120
-rw-r--r--kernel/net/mac80211/iface.c134
-rw-r--r--kernel/net/mac80211/key.c97
-rw-r--r--kernel/net/mac80211/key.h10
-rw-r--r--kernel/net/mac80211/led.c268
-rw-r--r--kernel/net/mac80211/led.h44
-rw-r--r--kernel/net/mac80211/main.c64
-rw-r--r--kernel/net/mac80211/mesh.c98
-rw-r--r--kernel/net/mac80211/mesh.h14
-rw-r--r--kernel/net/mac80211/mesh_hwmp.c117
-rw-r--r--kernel/net/mac80211/mesh_pathtbl.c8
-rw-r--r--kernel/net/mac80211/mesh_plink.c345
-rw-r--r--kernel/net/mac80211/mesh_ps.c42
-rw-r--r--kernel/net/mac80211/mesh_sync.c16
-rw-r--r--kernel/net/mac80211/mlme.c704
-rw-r--r--kernel/net/mac80211/ocb.c4
-rw-r--r--kernel/net/mac80211/offchannel.c8
-rw-r--r--kernel/net/mac80211/pm.c32
-rw-r--r--kernel/net/mac80211/rate.c333
-rw-r--r--kernel/net/mac80211/rate.h66
-rw-r--r--kernel/net/mac80211/rc80211_minstrel.c2
-rw-r--r--kernel/net/mac80211/rc80211_minstrel_debugfs.c12
-rw-r--r--kernel/net/mac80211/rc80211_minstrel_ht.c15
-rw-r--r--kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c12
-rw-r--r--kernel/net/mac80211/rx.c435
-rw-r--r--kernel/net/mac80211/scan.c71
-rw-r--r--kernel/net/mac80211/sta_info.c175
-rw-r--r--kernel/net/mac80211/sta_info.h251
-rw-r--r--kernel/net/mac80211/status.c326
-rw-r--r--kernel/net/mac80211/tdls.c379
-rw-r--r--kernel/net/mac80211/trace.h94
-rw-r--r--kernel/net/mac80211/tx.c620
-rw-r--r--kernel/net/mac80211/util.c299
-rw-r--r--kernel/net/mac80211/vht.c59
-rw-r--r--kernel/net/mac80211/wpa.c102
-rw-r--r--kernel/net/mac802154/Kconfig1
-rw-r--r--kernel/net/mac802154/Makefile4
-rw-r--r--kernel/net/mac802154/cfg.c367
-rw-r--r--kernel/net/mac802154/driver-ops.h96
-rw-r--r--kernel/net/mac802154/ieee802154_i.h20
-rw-r--r--kernel/net/mac802154/iface.c296
-rw-r--r--kernel/net/mac802154/llsec.c65
-rw-r--r--kernel/net/mac802154/mac_cmd.c42
-rw-r--r--kernel/net/mac802154/main.c39
-rw-r--r--kernel/net/mac802154/mib.c63
-rw-r--r--kernel/net/mac802154/rx.c31
-rw-r--r--kernel/net/mac802154/trace.c9
-rw-r--r--kernel/net/mac802154/trace.h272
-rw-r--r--kernel/net/mac802154/tx.c34
-rw-r--r--kernel/net/mac802154/util.c13
-rw-r--r--kernel/net/mpls/Kconfig8
-rw-r--r--kernel/net/mpls/Makefile1
-rw-r--r--kernel/net/mpls/af_mpls.c796
-rw-r--r--kernel/net/mpls/internal.h81
-rw-r--r--kernel/net/mpls/mpls_gso.c2
-rw-r--r--kernel/net/mpls/mpls_iptunnel.c231
-rw-r--r--kernel/net/netfilter/Kconfig52
-rw-r--r--kernel/net/netfilter/Makefile3
-rw-r--r--kernel/net/netfilter/core.c233
-rw-r--r--kernel/net/netfilter/ipset/ip_set_bitmap_gen.h61
-rw-r--r--kernel/net/netfilter/ipset/ip_set_bitmap_ip.c58
-rw-r--r--kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c119
-rw-r--r--kernel/net/netfilter/ipset/ip_set_bitmap_port.c45
-rw-r--r--kernel/net/netfilter/ipset/ip_set_core.c404
-rw-r--r--kernel/net/netfilter/ipset/ip_set_getport.c19
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_gen.h756
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ip.c72
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ipmark.c87
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ipport.c98
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ipportip.c91
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c96
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_mac.c30
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_net.c73
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_netiface.c250
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_netnet.c158
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_netport.c86
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_netportnet.c188
-rw-r--r--kernel/net/netfilter/ipset/ip_set_list_set.c427
-rw-r--r--kernel/net/netfilter/ipset/pfxlen.c16
-rw-r--r--kernel/net/netfilter/ipvs/Kconfig11
-rw-r--r--kernel/net/netfilter/ipvs/Makefile1
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_app.c36
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_conn.c91
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_core.c566
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_ctl.c502
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_est.c20
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_ftp.c27
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_lblc.c3
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_lblcr.c3
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_nfct.c5
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_ovf.c86
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_pe_sip.c2
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto.c33
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c32
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c58
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c61
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto_udp.c49
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_sched.c14
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_sh.c45
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_sync.c374
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_xmit.c145
-rw-r--r--kernel/net/netfilter/nf_conntrack_core.c197
-rw-r--r--kernel/net/netfilter/nf_conntrack_expect.c22
-rw-r--r--kernel/net/netfilter/nf_conntrack_h323_main.c4
-rw-r--r--kernel/net/netfilter/nf_conntrack_labels.c34
-rw-r--r--kernel/net/netfilter/nf_conntrack_netlink.c331
-rw-r--r--kernel/net/netfilter/nf_conntrack_pptp.c3
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_dccp.c2
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_generic.c10
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_gre.c3
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_sctp.c103
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_tcp.c2
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_udp.c1
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_udplite.c1
-rw-r--r--kernel/net/netfilter/nf_conntrack_seqadj.c9
-rw-r--r--kernel/net/netfilter/nf_conntrack_standalone.c39
-rw-r--r--kernel/net/netfilter/nf_internals.h1
-rw-r--r--kernel/net/netfilter/nf_log.c9
-rw-r--r--kernel/net/netfilter/nf_nat_core.c28
-rw-r--r--kernel/net/netfilter/nf_nat_proto_dccp.c2
-rw-r--r--kernel/net/netfilter/nf_nat_proto_tcp.c2
-rw-r--r--kernel/net/netfilter/nf_nat_proto_udp.c2
-rw-r--r--kernel/net/netfilter/nf_nat_proto_udplite.c2
-rw-r--r--kernel/net/netfilter/nf_nat_redirect.c2
-rw-r--r--kernel/net/netfilter/nf_queue.c55
-rw-r--r--kernel/net/netfilter/nf_synproxy_core.c24
-rw-r--r--kernel/net/netfilter/nf_tables_api.c221
-rw-r--r--kernel/net/netfilter/nf_tables_core.c10
-rw-r--r--kernel/net/netfilter/nf_tables_netdev.c256
-rw-r--r--kernel/net/netfilter/nfnetlink.c54
-rw-r--r--kernel/net/netfilter/nfnetlink_acct.c71
-rw-r--r--kernel/net/netfilter/nfnetlink_cttimeout.c34
-rw-r--r--kernel/net/netfilter/nfnetlink_log.c91
-rw-r--r--kernel/net/netfilter/nfnetlink_queue.c (renamed from kernel/net/netfilter/nfnetlink_queue_core.c)144
-rw-r--r--kernel/net/netfilter/nfnetlink_queue_ct.c113
-rw-r--r--kernel/net/netfilter/nft_compat.c26
-rw-r--r--kernel/net/netfilter/nft_counter.c124
-rw-r--r--kernel/net/netfilter/nft_ct.c1
-rw-r--r--kernel/net/netfilter/nft_dynset.c5
-rw-r--r--kernel/net/netfilter/nft_limit.c188
-rw-r--r--kernel/net/netfilter/nft_log.c3
-rw-r--r--kernel/net/netfilter/nft_meta.c44
-rw-r--r--kernel/net/netfilter/nft_payload.c57
-rw-r--r--kernel/net/netfilter/nft_queue.c2
-rw-r--r--kernel/net/netfilter/nft_reject_inet.c19
-rw-r--r--kernel/net/netfilter/x_tables.c85
-rw-r--r--kernel/net/netfilter/xt_CT.c46
-rw-r--r--kernel/net/netfilter/xt_IDLETIMER.c1
-rw-r--r--kernel/net/netfilter/xt_LOG.c2
-rw-r--r--kernel/net/netfilter/xt_NFLOG.c2
-rw-r--r--kernel/net/netfilter/xt_TCPMSS.c16
-rw-r--r--kernel/net/netfilter/xt_TCPOPTSTRIP.c2
-rw-r--r--kernel/net/netfilter/xt_TEE.c168
-rw-r--r--kernel/net/netfilter/xt_TPROXY.c30
-rw-r--r--kernel/net/netfilter/xt_addrtype.c6
-rw-r--r--kernel/net/netfilter/xt_connlabel.c16
-rw-r--r--kernel/net/netfilter/xt_connlimit.c13
-rw-r--r--kernel/net/netfilter/xt_ipvs.c5
-rw-r--r--kernel/net/netfilter/xt_mark.c1
-rw-r--r--kernel/net/netfilter/xt_nfacct.c2
-rw-r--r--kernel/net/netfilter/xt_osf.c2
-rw-r--r--kernel/net/netfilter/xt_owner.c6
-rw-r--r--kernel/net/netfilter/xt_recent.c2
-rw-r--r--kernel/net/netfilter/xt_set.c47
-rw-r--r--kernel/net/netfilter/xt_socket.c73
-rw-r--r--kernel/net/netlink/af_netlink.c303
-rw-r--r--kernel/net/netlink/genetlink.c14
-rw-r--r--kernel/net/netrom/af_netrom.c4
-rw-r--r--kernel/net/netrom/nr_route.c1
-rw-r--r--kernel/net/nfc/af_nfc.c2
-rw-r--r--kernel/net/nfc/core.c4
-rw-r--r--kernel/net/nfc/digital_core.c3
-rw-r--r--kernel/net/nfc/hci/core.c3
-rw-r--r--kernel/net/nfc/hci/llc.c2
-rw-r--r--kernel/net/nfc/llcp.h2
-rw-r--r--kernel/net/nfc/llcp_core.c2
-rw-r--r--kernel/net/nfc/llcp_sock.c10
-rw-r--r--kernel/net/nfc/nci/Kconfig9
-rw-r--r--kernel/net/nfc/nci/Makefile6
-rw-r--r--kernel/net/nfc/nci/core.c217
-rw-r--r--kernel/net/nfc/nci/data.c13
-rw-r--r--kernel/net/nfc/nci/hci.c176
-rw-r--r--kernel/net/nfc/nci/ntf.c11
-rw-r--r--kernel/net/nfc/nci/rsp.c11
-rw-r--r--kernel/net/nfc/nci/spi.c11
-rw-r--r--kernel/net/nfc/nci/uart.c494
-rw-r--r--kernel/net/nfc/netlink.c146
-rw-r--r--kernel/net/nfc/nfc.h7
-rw-r--r--kernel/net/nfc/rawsock.c7
-rw-r--r--kernel/net/openvswitch/Kconfig4
-rw-r--r--kernel/net/openvswitch/Makefile6
-rw-r--r--kernel/net/openvswitch/actions.c306
-rw-r--r--kernel/net/openvswitch/conntrack.c790
-rw-r--r--kernel/net/openvswitch/conntrack.h93
-rw-r--r--kernel/net/openvswitch/datapath.c140
-rw-r--r--kernel/net/openvswitch/datapath.h23
-rw-r--r--kernel/net/openvswitch/dp_notify.c7
-rw-r--r--kernel/net/openvswitch/flow.c45
-rw-r--r--kernel/net/openvswitch/flow.h91
-rw-r--r--kernel/net/openvswitch/flow_netlink.c458
-rw-r--r--kernel/net/openvswitch/flow_netlink.h19
-rw-r--r--kernel/net/openvswitch/flow_table.c11
-rw-r--r--kernel/net/openvswitch/vport-geneve.c183
-rw-r--r--kernel/net/openvswitch/vport-gre.c248
-rw-r--r--kernel/net/openvswitch/vport-internal_dev.c145
-rw-r--r--kernel/net/openvswitch/vport-netdev.c154
-rw-r--r--kernel/net/openvswitch/vport-netdev.h15
-rw-r--r--kernel/net/openvswitch/vport-vxlan.c238
-rw-r--r--kernel/net/openvswitch/vport-vxlan.h11
-rw-r--r--kernel/net/openvswitch/vport.c224
-rw-r--r--kernel/net/openvswitch/vport.h80
-rw-r--r--kernel/net/packet/af_packet.c562
-rw-r--r--kernel/net/packet/internal.h18
-rw-r--r--kernel/net/phonet/af_phonet.c6
-rw-r--r--kernel/net/phonet/pep.c2
-rw-r--r--kernel/net/rds/af_rds.c77
-rw-r--r--kernel/net/rds/bind.c129
-rw-r--r--kernel/net/rds/connection.c47
-rw-r--r--kernel/net/rds/ib.c67
-rw-r--r--kernel/net/rds/ib.h113
-rw-r--r--kernel/net/rds/ib_cm.c185
-rw-r--r--kernel/net/rds/ib_rdma.c171
-rw-r--r--kernel/net/rds/ib_recv.c198
-rw-r--r--kernel/net/rds/ib_send.c245
-rw-r--r--kernel/net/rds/ib_stats.c22
-rw-r--r--kernel/net/rds/iw.c14
-rw-r--r--kernel/net/rds/iw.h9
-rw-r--r--kernel/net/rds/iw_cm.c14
-rw-r--r--kernel/net/rds/iw_rdma.c140
-rw-r--r--kernel/net/rds/iw_send.c177
-rw-r--r--kernel/net/rds/rdma.c9
-rw-r--r--kernel/net/rds/rdma_transport.c53
-rw-r--r--kernel/net/rds/rds.h43
-rw-r--r--kernel/net/rds/send.c80
-rw-r--r--kernel/net/rds/tcp.c181
-rw-r--r--kernel/net/rds/tcp.h7
-rw-r--r--kernel/net/rds/tcp_connect.c9
-rw-r--r--kernel/net/rds/tcp_listen.c65
-rw-r--r--kernel/net/rds/tcp_recv.c11
-rw-r--r--kernel/net/rds/tcp_send.c8
-rw-r--r--kernel/net/rds/threads.c2
-rw-r--r--kernel/net/rds/transport.c27
-rw-r--r--kernel/net/rfkill/Kconfig3
-rw-r--r--kernel/net/rfkill/core.c44
-rw-r--r--kernel/net/rfkill/rfkill-gpio.c25
-rw-r--r--kernel/net/rose/af_rose.c7
-rw-r--r--kernel/net/rose/rose_link.c1
-rw-r--r--kernel/net/rose/rose_route.c1
-rw-r--r--kernel/net/rxrpc/af_rxrpc.c4
-rw-r--r--kernel/net/rxrpc/ar-ack.c4
-rw-r--r--kernel/net/rxrpc/ar-connection.c6
-rw-r--r--kernel/net/rxrpc/ar-internal.h4
-rw-r--r--kernel/net/rxrpc/ar-key.c32
-rw-r--r--kernel/net/rxrpc/ar-local.c4
-rw-r--r--kernel/net/rxrpc/ar-output.c4
-rw-r--r--kernel/net/rxrpc/ar-security.c4
-rw-r--r--kernel/net/rxrpc/ar-transport.c4
-rw-r--r--kernel/net/rxrpc/rxkad.c16
-rw-r--r--kernel/net/sched/Kconfig11
-rw-r--r--kernel/net/sched/Makefile1
-rw-r--r--kernel/net/sched/act_api.c52
-rw-r--r--kernel/net/sched/act_bpf.c93
-rw-r--r--kernel/net/sched/act_connmark.c12
-rw-r--r--kernel/net/sched/act_csum.c3
-rw-r--r--kernel/net/sched/act_gact.c44
-rw-r--r--kernel/net/sched/act_ipt.c3
-rw-r--r--kernel/net/sched/act_mirred.c80
-rw-r--r--kernel/net/sched/act_nat.c10
-rw-r--r--kernel/net/sched/act_pedit.c13
-rw-r--r--kernel/net/sched/act_simple.c3
-rw-r--r--kernel/net/sched/act_skbedit.c3
-rw-r--r--kernel/net/sched/act_vlan.c3
-rw-r--r--kernel/net/sched/cls_bpf.c98
-rw-r--r--kernel/net/sched/cls_cgroup.c23
-rw-r--r--kernel/net/sched/cls_flow.c43
-rw-r--r--kernel/net/sched/cls_flower.c697
-rw-r--r--kernel/net/sched/cls_rsvp.h18
-rw-r--r--kernel/net/sched/cls_tcindex.c29
-rw-r--r--kernel/net/sched/em_ipset.c5
-rw-r--r--kernel/net/sched/em_meta.c138
-rw-r--r--kernel/net/sched/sch_api.c90
-rw-r--r--kernel/net/sched/sch_atm.c2
-rw-r--r--kernel/net/sched/sch_blackhole.c15
-rw-r--r--kernel/net/sched/sch_cbq.c2
-rw-r--r--kernel/net/sched/sch_choke.c94
-rw-r--r--kernel/net/sched/sch_codel.c15
-rw-r--r--kernel/net/sched/sch_drr.c2
-rw-r--r--kernel/net/sched/sch_dsmark.c65
-rw-r--r--kernel/net/sched/sch_fifo.c2
-rw-r--r--kernel/net/sched/sch_fq.c13
-rw-r--r--kernel/net/sched/sch_fq_codel.c61
-rw-r--r--kernel/net/sched/sch_generic.c60
-rw-r--r--kernel/net/sched/sch_gred.c26
-rw-r--r--kernel/net/sched/sch_hfsc.c2
-rw-r--r--kernel/net/sched/sch_hhf.c30
-rw-r--r--kernel/net/sched/sch_htb.c8
-rw-r--r--kernel/net/sched/sch_ingress.c59
-rw-r--r--kernel/net/sched/sch_mq.c4
-rw-r--r--kernel/net/sched/sch_mqprio.c4
-rw-r--r--kernel/net/sched/sch_multiq.c2
-rw-r--r--kernel/net/sched/sch_netem.c4
-rw-r--r--kernel/net/sched/sch_plug.c9
-rw-r--r--kernel/net/sched/sch_prio.c2
-rw-r--r--kernel/net/sched/sch_qfq.c6
-rw-r--r--kernel/net/sched/sch_sfb.c28
-rw-r--r--kernel/net/sched/sch_sfq.c31
-rw-r--r--kernel/net/sctp/associola.c22
-rw-r--r--kernel/net/sctp/auth.c4
-rw-r--r--kernel/net/sctp/ipv6.c31
-rw-r--r--kernel/net/sctp/outqueue.c2
-rw-r--r--kernel/net/sctp/protocol.c93
-rw-r--r--kernel/net/sctp/sm_make_chunk.c28
-rw-r--r--kernel/net/sctp/sm_sideeffect.c48
-rw-r--r--kernel/net/sctp/sm_statefuns.c13
-rw-r--r--kernel/net/sctp/socket.c90
-rw-r--r--kernel/net/sctp/sysctl.c2
-rw-r--r--kernel/net/sctp/transport.c2
-rw-r--r--kernel/net/socket.c36
-rw-r--r--kernel/net/sunrpc/Kconfig28
-rw-r--r--kernel/net/sunrpc/Makefile5
-rw-r--r--kernel/net/sunrpc/auth.c2
-rw-r--r--kernel/net/sunrpc/auth_gss/auth_gss.c13
-rw-r--r--kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c8
-rw-r--r--kernel/net/sunrpc/auth_unix.c2
-rw-r--r--kernel/net/sunrpc/backchannel_rqst.c156
-rw-r--r--kernel/net/sunrpc/bc_svc.c63
-rw-r--r--kernel/net/sunrpc/cache.c158
-rw-r--r--kernel/net/sunrpc/clnt.c114
-rw-r--r--kernel/net/sunrpc/debugfs.c78
-rw-r--r--kernel/net/sunrpc/sched.c18
-rw-r--r--kernel/net/sunrpc/svc.c169
-rw-r--r--kernel/net/sunrpc/svc_xprt.c10
-rw-r--r--kernel/net/sunrpc/svcsock.c40
-rw-r--r--kernel/net/sunrpc/sysctl.c23
-rw-r--r--kernel/net/sunrpc/xprt.c7
-rw-r--r--kernel/net/sunrpc/xprtrdma/Makefile15
-rw-r--r--kernel/net/sunrpc/xprtrdma/backchannel.c394
-rw-r--r--kernel/net/sunrpc/xprtrdma/fmr_ops.c120
-rw-r--r--kernel/net/sunrpc/xprtrdma/frwr_ops.c343
-rw-r--r--kernel/net/sunrpc/xprtrdma/module.c46
-rw-r--r--kernel/net/sunrpc/xprtrdma/physical_ops.c31
-rw-r--r--kernel/net/sunrpc/xprtrdma/rpc_rdma.c353
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma.c14
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c140
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c141
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c117
-rw-r--r--kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c256
-rw-r--r--kernel/net/sunrpc/xprtrdma/transport.c149
-rw-r--r--kernel/net/sunrpc/xprtrdma/verbs.c1029
-rw-r--r--kernel/net/sunrpc/xprtrdma/xprt_rdma.h129
-rw-r--r--kernel/net/sunrpc/xprtsock.c501
-rw-r--r--kernel/net/switchdev/switchdev.c1290
-rw-r--r--kernel/net/sysctl_net.c6
-rw-r--r--kernel/net/tipc/addr.c7
-rw-r--r--kernel/net/tipc/addr.h8
-rw-r--r--kernel/net/tipc/bcast.c963
-rw-r--r--kernel/net/tipc/bcast.h120
-rw-r--r--kernel/net/tipc/bearer.c142
-rw-r--r--kernel/net/tipc/bearer.h14
-rw-r--r--kernel/net/tipc/core.c13
-rw-r--r--kernel/net/tipc/core.h55
-rw-r--r--kernel/net/tipc/discover.c150
-rw-r--r--kernel/net/tipc/link.c2575
-rw-r--r--kernel/net/tipc/link.h234
-rw-r--r--kernel/net/tipc/msg.c201
-rw-r--r--kernel/net/tipc/msg.h133
-rw-r--r--kernel/net/tipc/name_distr.c6
-rw-r--r--kernel/net/tipc/name_table.c34
-rw-r--r--kernel/net/tipc/net.c7
-rw-r--r--kernel/net/tipc/netlink_compat.c139
-rw-r--r--kernel/net/tipc/node.c1131
-rw-r--r--kernel/net/tipc/node.h127
-rw-r--r--kernel/net/tipc/server.c6
-rw-r--r--kernel/net/tipc/socket.c409
-rw-r--r--kernel/net/tipc/socket.h2
-rw-r--r--kernel/net/tipc/subscr.c251
-rw-r--r--kernel/net/tipc/subscr.h18
-rw-r--r--kernel/net/tipc/udp_media.c29
-rw-r--r--kernel/net/unix/af_unix.c678
-rw-r--r--kernel/net/unix/diag.c2
-rw-r--r--kernel/net/unix/garbage.c17
-rw-r--r--kernel/net/vmw_vsock/af_vsock.c51
-rw-r--r--kernel/net/vmw_vsock/vmci_transport.c181
-rw-r--r--kernel/net/vmw_vsock/vmci_transport.h4
-rw-r--r--kernel/net/wimax/op-rfkill.c3
-rw-r--r--kernel/net/wireless/Kconfig10
-rw-r--r--kernel/net/wireless/chan.c100
-rw-r--r--kernel/net/wireless/core.c13
-rw-r--r--kernel/net/wireless/core.h7
-rw-r--r--kernel/net/wireless/mlme.c75
-rw-r--r--kernel/net/wireless/nl80211.c566
-rw-r--r--kernel/net/wireless/rdev-ops.h2
-rw-r--r--kernel/net/wireless/reg.c379
-rw-r--r--kernel/net/wireless/scan.c61
-rw-r--r--kernel/net/wireless/sme.c4
-rw-r--r--kernel/net/wireless/sysfs.c14
-rw-r--r--kernel/net/wireless/trace.h33
-rw-r--r--kernel/net/wireless/util.c3
-rw-r--r--kernel/net/wireless/wext-core.c52
-rw-r--r--kernel/net/x25/af_x25.c8
-rw-r--r--kernel/net/xfrm/xfrm_algo.c28
-rw-r--r--kernel/net/xfrm/xfrm_input.c16
-rw-r--r--kernel/net/xfrm/xfrm_output.c34
-rw-r--r--kernel/net/xfrm/xfrm_policy.c173
-rw-r--r--kernel/net/xfrm/xfrm_state.c4
-rw-r--r--kernel/net/xfrm/xfrm_user.c57
848 files changed, 49510 insertions, 27151 deletions
diff --git a/kernel/net/6lowpan/Makefile b/kernel/net/6lowpan/Makefile
index eb8baa72a..c6ffc55ee 100644
--- a/kernel/net/6lowpan/Makefile
+++ b/kernel/net/6lowpan/Makefile
@@ -1,6 +1,6 @@
obj-$(CONFIG_6LOWPAN) += 6lowpan.o
-6lowpan-y := iphc.o nhc.o
+6lowpan-y := core.o iphc.o nhc.o
#rfc6282 nhcs
obj-$(CONFIG_6LOWPAN_NHC_DEST) += nhc_dest.o
diff --git a/kernel/net/6lowpan/core.c b/kernel/net/6lowpan/core.c
new file mode 100644
index 000000000..83b19e072
--- /dev/null
+++ b/kernel/net/6lowpan/core.c
@@ -0,0 +1,45 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
+ */
+
+#include <linux/module.h>
+
+#include <net/6lowpan.h>
+
+void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype)
+{
+ dev->addr_len = EUI64_ADDR_LEN;
+ dev->type = ARPHRD_6LOWPAN;
+ dev->mtu = IPV6_MIN_MTU;
+ dev->priv_flags |= IFF_NO_QUEUE;
+
+ lowpan_priv(dev)->lltype = lltype;
+}
+EXPORT_SYMBOL(lowpan_netdev_setup);
+
+static int __init lowpan_module_init(void)
+{
+ request_module_nowait("ipv6");
+
+ request_module_nowait("nhc_dest");
+ request_module_nowait("nhc_fragment");
+ request_module_nowait("nhc_hop");
+ request_module_nowait("nhc_ipv6");
+ request_module_nowait("nhc_mobility");
+ request_module_nowait("nhc_routing");
+ request_module_nowait("nhc_udp");
+
+ return 0;
+}
+module_init(lowpan_module_init);
+
+MODULE_LICENSE("GPL");
diff --git a/kernel/net/6lowpan/iphc.c b/kernel/net/6lowpan/iphc.c
index 94a375c04..346b5c1a9 100644
--- a/kernel/net/6lowpan/iphc.c
+++ b/kernel/net/6lowpan/iphc.c
@@ -48,38 +48,179 @@
#include <linux/bitops.h>
#include <linux/if_arp.h>
-#include <linux/module.h>
#include <linux/netdevice.h>
+
#include <net/6lowpan.h>
#include <net/ipv6.h>
-#include <net/af_ieee802154.h>
+
+/* special link-layer handling */
+#include <net/mac802154.h>
#include "nhc.h"
+/* Values of fields within the IPHC encoding first byte */
+#define LOWPAN_IPHC_TF_MASK 0x18
+#define LOWPAN_IPHC_TF_00 0x00
+#define LOWPAN_IPHC_TF_01 0x08
+#define LOWPAN_IPHC_TF_10 0x10
+#define LOWPAN_IPHC_TF_11 0x18
+
+#define LOWPAN_IPHC_NH 0x04
+
+#define LOWPAN_IPHC_HLIM_MASK 0x03
+#define LOWPAN_IPHC_HLIM_00 0x00
+#define LOWPAN_IPHC_HLIM_01 0x01
+#define LOWPAN_IPHC_HLIM_10 0x02
+#define LOWPAN_IPHC_HLIM_11 0x03
+
+/* Values of fields within the IPHC encoding second byte */
+#define LOWPAN_IPHC_CID 0x80
+
+#define LOWPAN_IPHC_SAC 0x40
+
+#define LOWPAN_IPHC_SAM_MASK 0x30
+#define LOWPAN_IPHC_SAM_00 0x00
+#define LOWPAN_IPHC_SAM_01 0x10
+#define LOWPAN_IPHC_SAM_10 0x20
+#define LOWPAN_IPHC_SAM_11 0x30
+
+#define LOWPAN_IPHC_M 0x08
+
+#define LOWPAN_IPHC_DAC 0x04
+
+#define LOWPAN_IPHC_DAM_MASK 0x03
+#define LOWPAN_IPHC_DAM_00 0x00
+#define LOWPAN_IPHC_DAM_01 0x01
+#define LOWPAN_IPHC_DAM_10 0x02
+#define LOWPAN_IPHC_DAM_11 0x03
+
+/* ipv6 address based on mac
+ * second bit-flip (Universe/Local) is done according RFC2464
+ */
+#define is_addr_mac_addr_based(a, m) \
+ ((((a)->s6_addr[8]) == (((m)[0]) ^ 0x02)) && \
+ (((a)->s6_addr[9]) == (m)[1]) && \
+ (((a)->s6_addr[10]) == (m)[2]) && \
+ (((a)->s6_addr[11]) == (m)[3]) && \
+ (((a)->s6_addr[12]) == (m)[4]) && \
+ (((a)->s6_addr[13]) == (m)[5]) && \
+ (((a)->s6_addr[14]) == (m)[6]) && \
+ (((a)->s6_addr[15]) == (m)[7]))
+
+/* check whether we can compress the IID to 16 bits,
+ * it's possible for unicast addresses with first 49 bits are zero only.
+ */
+#define lowpan_is_iid_16_bit_compressable(a) \
+ ((((a)->s6_addr16[4]) == 0) && \
+ (((a)->s6_addr[10]) == 0) && \
+ (((a)->s6_addr[11]) == 0xff) && \
+ (((a)->s6_addr[12]) == 0xfe) && \
+ (((a)->s6_addr[13]) == 0))
+
+/* check whether the 112-bit gid of the multicast address is mappable to: */
+
+/* 48 bits, FFXX::00XX:XXXX:XXXX */
+#define lowpan_is_mcast_addr_compressable48(a) \
+ ((((a)->s6_addr16[1]) == 0) && \
+ (((a)->s6_addr16[2]) == 0) && \
+ (((a)->s6_addr16[3]) == 0) && \
+ (((a)->s6_addr16[4]) == 0) && \
+ (((a)->s6_addr[10]) == 0))
+
+/* 32 bits, FFXX::00XX:XXXX */
+#define lowpan_is_mcast_addr_compressable32(a) \
+ ((((a)->s6_addr16[1]) == 0) && \
+ (((a)->s6_addr16[2]) == 0) && \
+ (((a)->s6_addr16[3]) == 0) && \
+ (((a)->s6_addr16[4]) == 0) && \
+ (((a)->s6_addr16[5]) == 0) && \
+ (((a)->s6_addr[12]) == 0))
+
+/* 8 bits, FF02::00XX */
+#define lowpan_is_mcast_addr_compressable8(a) \
+ ((((a)->s6_addr[1]) == 2) && \
+ (((a)->s6_addr16[1]) == 0) && \
+ (((a)->s6_addr16[2]) == 0) && \
+ (((a)->s6_addr16[3]) == 0) && \
+ (((a)->s6_addr16[4]) == 0) && \
+ (((a)->s6_addr16[5]) == 0) && \
+ (((a)->s6_addr16[6]) == 0) && \
+ (((a)->s6_addr[14]) == 0))
+
+static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr,
+ const void *lladdr)
+{
+ /* fe:80::XXXX:XXXX:XXXX:XXXX
+ * \_________________/
+ * hwaddr
+ */
+ ipaddr->s6_addr[0] = 0xFE;
+ ipaddr->s6_addr[1] = 0x80;
+ memcpy(&ipaddr->s6_addr[8], lladdr, EUI64_ADDR_LEN);
+ /* second bit-flip (Universe/Local)
+ * is done according RFC2464
+ */
+ ipaddr->s6_addr[8] ^= 0x02;
+}
+
+static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr,
+ const void *lladdr)
+{
+ const struct ieee802154_addr *addr = lladdr;
+ u8 eui64[EUI64_ADDR_LEN] = { };
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_le64_to_be64(eui64, &addr->extended_addr);
+ iphc_uncompress_eui64_lladdr(ipaddr, eui64);
+ break;
+ case IEEE802154_ADDR_SHORT:
+ /* fe:80::ff:fe00:XXXX
+ * \__/
+ * short_addr
+ *
+ * Universe/Local bit is zero.
+ */
+ ipaddr->s6_addr[0] = 0xFE;
+ ipaddr->s6_addr[1] = 0x80;
+ ipaddr->s6_addr[11] = 0xFF;
+ ipaddr->s6_addr[12] = 0xFE;
+ ieee802154_le16_to_be16(&ipaddr->s6_addr16[7],
+ &addr->short_addr);
+ break;
+ default:
+ /* should never handled and filtered by 802154 6lowpan */
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
/* Uncompress address function for source and
* destination address(non-multicast).
*
- * address_mode is sam value or dam value.
+ * address_mode is the masked value for sam or dam value
*/
-static int uncompress_addr(struct sk_buff *skb,
- struct in6_addr *ipaddr, const u8 address_mode,
- const u8 *lladdr, const u8 addr_type,
- const u8 addr_len)
+static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev,
+ struct in6_addr *ipaddr, u8 address_mode,
+ const void *lladdr)
{
bool fail;
switch (address_mode) {
- case LOWPAN_IPHC_ADDR_00:
+ /* SAM and DAM are the same here */
+ case LOWPAN_IPHC_DAM_00:
/* for global link addresses */
fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16);
break;
- case LOWPAN_IPHC_ADDR_01:
+ case LOWPAN_IPHC_SAM_01:
+ case LOWPAN_IPHC_DAM_01:
/* fe:80::XXXX:XXXX:XXXX:XXXX */
ipaddr->s6_addr[0] = 0xFE;
ipaddr->s6_addr[1] = 0x80;
fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8);
break;
- case LOWPAN_IPHC_ADDR_02:
+ case LOWPAN_IPHC_SAM_10:
+ case LOWPAN_IPHC_DAM_10:
/* fe:80::ff:fe00:XXXX */
ipaddr->s6_addr[0] = 0xFE;
ipaddr->s6_addr[1] = 0x80;
@@ -87,38 +228,16 @@ static int uncompress_addr(struct sk_buff *skb,
ipaddr->s6_addr[12] = 0xFE;
fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2);
break;
- case LOWPAN_IPHC_ADDR_03:
+ case LOWPAN_IPHC_SAM_11:
+ case LOWPAN_IPHC_DAM_11:
fail = false;
- switch (addr_type) {
- case IEEE802154_ADDR_LONG:
- /* fe:80::XXXX:XXXX:XXXX:XXXX
- * \_________________/
- * hwaddr
- */
- ipaddr->s6_addr[0] = 0xFE;
- ipaddr->s6_addr[1] = 0x80;
- memcpy(&ipaddr->s6_addr[8], lladdr, addr_len);
- /* second bit-flip (Universe/Local)
- * is done according RFC2464
- */
- ipaddr->s6_addr[8] ^= 0x02;
- break;
- case IEEE802154_ADDR_SHORT:
- /* fe:80::ff:fe00:XXXX
- * \__/
- * short_addr
- *
- * Universe/Local bit is zero.
- */
- ipaddr->s6_addr[0] = 0xFE;
- ipaddr->s6_addr[1] = 0x80;
- ipaddr->s6_addr[11] = 0xFF;
- ipaddr->s6_addr[12] = 0xFE;
- ipaddr->s6_addr16[7] = htons(*((u16 *)lladdr));
+ switch (lowpan_priv(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ iphc_uncompress_802154_lladdr(ipaddr, lladdr);
break;
default:
- pr_debug("Invalid addr_type set\n");
- return -EINVAL;
+ iphc_uncompress_eui64_lladdr(ipaddr, lladdr);
+ break;
}
break;
default:
@@ -142,24 +261,25 @@ static int uncompress_addr(struct sk_buff *skb,
*/
static int uncompress_context_based_src_addr(struct sk_buff *skb,
struct in6_addr *ipaddr,
- const u8 sam)
+ u8 address_mode)
{
- switch (sam) {
- case LOWPAN_IPHC_ADDR_00:
+ switch (address_mode) {
+ case LOWPAN_IPHC_SAM_00:
/* unspec address ::
* Do nothing, address is already ::
*/
break;
- case LOWPAN_IPHC_ADDR_01:
+ case LOWPAN_IPHC_SAM_01:
/* TODO */
- case LOWPAN_IPHC_ADDR_02:
+ case LOWPAN_IPHC_SAM_10:
/* TODO */
- case LOWPAN_IPHC_ADDR_03:
+ case LOWPAN_IPHC_SAM_11:
/* TODO */
- netdev_warn(skb->dev, "SAM value 0x%x not supported\n", sam);
+ netdev_warn(skb->dev, "SAM value 0x%x not supported\n",
+ address_mode);
return -EINVAL;
default:
- pr_debug("Invalid sam value: 0x%x\n", sam);
+ pr_debug("Invalid sam value: 0x%x\n", address_mode);
return -EINVAL;
}
@@ -175,11 +295,11 @@ static int uncompress_context_based_src_addr(struct sk_buff *skb,
*/
static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb,
struct in6_addr *ipaddr,
- const u8 dam)
+ u8 address_mode)
{
bool fail;
- switch (dam) {
+ switch (address_mode) {
case LOWPAN_IPHC_DAM_00:
/* 00: 128 bits. The full address
* is carried in-line.
@@ -211,7 +331,7 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb,
fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[15], 1);
break;
default:
- pr_debug("DAM value has a wrong value: 0x%x\n", dam);
+ pr_debug("DAM value has a wrong value: 0x%x\n", address_mode);
return -EINVAL;
}
@@ -226,77 +346,142 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb,
return 0;
}
-/* TTL uncompression values */
-static const u8 lowpan_ttl_values[] = { 0, 1, 64, 255 };
-
-int
-lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev,
- const u8 *saddr, const u8 saddr_type,
- const u8 saddr_len, const u8 *daddr,
- const u8 daddr_type, const u8 daddr_len,
- u8 iphc0, u8 iphc1)
+/* get the ecn values from iphc tf format and set it to ipv6hdr */
+static inline void lowpan_iphc_tf_set_ecn(struct ipv6hdr *hdr, const u8 *tf)
{
- struct ipv6hdr hdr = {};
- u8 tmp, num_context = 0;
- int err;
+ /* get the two higher bits which is ecn */
+ u8 ecn = tf[0] & 0xc0;
- raw_dump_table(__func__, "raw skb data dump uncompressed",
- skb->data, skb->len);
+ /* ECN takes 0x30 in hdr->flow_lbl[0] */
+ hdr->flow_lbl[0] |= (ecn >> 2);
+}
- /* another if the CID flag is set */
- if (iphc1 & LOWPAN_IPHC_CID) {
- pr_debug("CID flag is set, increase header with one\n");
- if (lowpan_fetch_skb(skb, &num_context, sizeof(num_context)))
- return -EINVAL;
- }
+/* get the dscp values from iphc tf format and set it to ipv6hdr */
+static inline void lowpan_iphc_tf_set_dscp(struct ipv6hdr *hdr, const u8 *tf)
+{
+ /* DSCP is at place after ECN */
+ u8 dscp = tf[0] & 0x3f;
- hdr.version = 6;
+ /* The four highest bits need to be set at hdr->priority */
+ hdr->priority |= ((dscp & 0x3c) >> 2);
+ /* The two lower bits is part of hdr->flow_lbl[0] */
+ hdr->flow_lbl[0] |= ((dscp & 0x03) << 6);
+}
- /* Traffic Class and Flow Label */
- switch ((iphc0 & LOWPAN_IPHC_TF) >> 3) {
- /* Traffic Class and FLow Label carried in-line
- * ECN + DSCP + 4-bit Pad + Flow Label (4 bytes)
+/* get the flow label values from iphc tf format and set it to ipv6hdr */
+static inline void lowpan_iphc_tf_set_lbl(struct ipv6hdr *hdr, const u8 *lbl)
+{
+ /* flow label is always some array started with lower nibble of
+ * flow_lbl[0] and followed with two bytes afterwards. Inside inline
+ * data the flow_lbl position can be different, which will be handled
+ * by lbl pointer. E.g. case "01" vs "00" the traffic class is 8 bit
+ * shifted, the different lbl pointer will handle that.
+ *
+ * The flow label will started at lower nibble of flow_lbl[0], the
+ * higher nibbles are part of DSCP + ECN.
*/
- case 0: /* 00b */
- if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp)))
+ hdr->flow_lbl[0] |= lbl[0] & 0x0f;
+ memcpy(&hdr->flow_lbl[1], &lbl[1], 2);
+}
+
+/* lowpan_iphc_tf_decompress - decompress the traffic class.
+ * This function will return zero on success, a value lower than zero if
+ * failed.
+ */
+static int lowpan_iphc_tf_decompress(struct sk_buff *skb, struct ipv6hdr *hdr,
+ u8 val)
+{
+ u8 tf[4];
+
+ /* Traffic Class and Flow Label */
+ switch (val) {
+ case LOWPAN_IPHC_TF_00:
+ /* ECN + DSCP + 4-bit Pad + Flow Label (4 bytes) */
+ if (lowpan_fetch_skb(skb, tf, 4))
return -EINVAL;
- memcpy(&hdr.flow_lbl, &skb->data[0], 3);
- skb_pull(skb, 3);
- hdr.priority = ((tmp >> 2) & 0x0f);
- hdr.flow_lbl[0] = ((tmp >> 2) & 0x30) | (tmp << 6) |
- (hdr.flow_lbl[0] & 0x0f);
+ /* 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |ECN| DSCP | rsv | Flow Label |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+ lowpan_iphc_tf_set_ecn(hdr, tf);
+ lowpan_iphc_tf_set_dscp(hdr, tf);
+ lowpan_iphc_tf_set_lbl(hdr, &tf[1]);
break;
- /* Traffic class carried in-line
- * ECN + DSCP (1 byte), Flow Label is elided
- */
- case 2: /* 10b */
- if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp)))
+ case LOWPAN_IPHC_TF_01:
+ /* ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided. */
+ if (lowpan_fetch_skb(skb, tf, 3))
return -EINVAL;
- hdr.priority = ((tmp >> 2) & 0x0f);
- hdr.flow_lbl[0] = ((tmp << 6) & 0xC0) | ((tmp >> 2) & 0x30);
+ /* 1 2
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |ECN|rsv| Flow Label |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+ lowpan_iphc_tf_set_ecn(hdr, tf);
+ lowpan_iphc_tf_set_lbl(hdr, &tf[0]);
break;
- /* Flow Label carried in-line
- * ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided
- */
- case 1: /* 01b */
- if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp)))
+ case LOWPAN_IPHC_TF_10:
+ /* ECN + DSCP (1 byte), Flow Label is elided. */
+ if (lowpan_fetch_skb(skb, tf, 1))
return -EINVAL;
- hdr.flow_lbl[0] = (skb->data[0] & 0x0F) | ((tmp >> 2) & 0x30);
- memcpy(&hdr.flow_lbl[1], &skb->data[0], 2);
- skb_pull(skb, 2);
+ /* 0 1 2 3 4 5 6 7
+ * +-+-+-+-+-+-+-+-+
+ * |ECN| DSCP |
+ * +-+-+-+-+-+-+-+-+
+ */
+ lowpan_iphc_tf_set_ecn(hdr, tf);
+ lowpan_iphc_tf_set_dscp(hdr, tf);
break;
- /* Traffic Class and Flow Label are elided */
- case 3: /* 11b */
+ case LOWPAN_IPHC_TF_11:
+ /* Traffic Class and Flow Label are elided */
break;
default:
- break;
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
+ return 0;
+}
+
+/* TTL uncompression values */
+static const u8 lowpan_ttl_values[] = {
+ [LOWPAN_IPHC_HLIM_01] = 1,
+ [LOWPAN_IPHC_HLIM_10] = 64,
+ [LOWPAN_IPHC_HLIM_11] = 255,
+};
+
+int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
+ const void *daddr, const void *saddr)
+{
+ struct ipv6hdr hdr = {};
+ u8 iphc0, iphc1;
+ int err;
+
+ raw_dump_table(__func__, "raw skb data dump uncompressed",
+ skb->data, skb->len);
+
+ if (lowpan_fetch_skb(skb, &iphc0, sizeof(iphc0)) ||
+ lowpan_fetch_skb(skb, &iphc1, sizeof(iphc1)))
+ return -EINVAL;
+
+ /* another if the CID flag is set */
+ if (iphc1 & LOWPAN_IPHC_CID)
+ return -ENOTSUPP;
+
+ hdr.version = 6;
+
+ err = lowpan_iphc_tf_decompress(skb, &hdr,
+ iphc0 & LOWPAN_IPHC_TF_MASK);
+ if (err < 0)
+ return err;
+
/* Next Header */
- if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) {
+ if (!(iphc0 & LOWPAN_IPHC_NH)) {
/* Next header is carried inline */
if (lowpan_fetch_skb(skb, &hdr.nexthdr, sizeof(hdr.nexthdr)))
return -EINVAL;
@@ -306,35 +491,30 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev,
}
/* Hop Limit */
- if ((iphc0 & 0x03) != LOWPAN_IPHC_TTL_I) {
- hdr.hop_limit = lowpan_ttl_values[iphc0 & 0x03];
+ if ((iphc0 & LOWPAN_IPHC_HLIM_MASK) != LOWPAN_IPHC_HLIM_00) {
+ hdr.hop_limit = lowpan_ttl_values[iphc0 & LOWPAN_IPHC_HLIM_MASK];
} else {
if (lowpan_fetch_skb(skb, &hdr.hop_limit,
sizeof(hdr.hop_limit)))
return -EINVAL;
}
- /* Extract SAM to the tmp variable */
- tmp = ((iphc1 & LOWPAN_IPHC_SAM) >> LOWPAN_IPHC_SAM_BIT) & 0x03;
-
if (iphc1 & LOWPAN_IPHC_SAC) {
/* Source address context based uncompression */
pr_debug("SAC bit is set. Handle context based source address.\n");
- err = uncompress_context_based_src_addr(skb, &hdr.saddr, tmp);
+ err = uncompress_context_based_src_addr(skb, &hdr.saddr,
+ iphc1 & LOWPAN_IPHC_SAM_MASK);
} else {
/* Source address uncompression */
pr_debug("source address stateless compression\n");
- err = uncompress_addr(skb, &hdr.saddr, tmp, saddr,
- saddr_type, saddr_len);
+ err = uncompress_addr(skb, dev, &hdr.saddr,
+ iphc1 & LOWPAN_IPHC_SAM_MASK, saddr);
}
/* Check on error of previous branch */
if (err)
return -EINVAL;
- /* Extract DAM to the tmp variable */
- tmp = ((iphc1 & LOWPAN_IPHC_DAM_11) >> LOWPAN_IPHC_DAM_BIT) & 0x03;
-
/* check for Multicast Compression */
if (iphc1 & LOWPAN_IPHC_M) {
if (iphc1 & LOWPAN_IPHC_DAC) {
@@ -342,22 +522,22 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev,
/* TODO: implement this */
} else {
err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr,
- tmp);
+ iphc1 & LOWPAN_IPHC_DAM_MASK);
if (err)
return -EINVAL;
}
} else {
- err = uncompress_addr(skb, &hdr.daddr, tmp, daddr,
- daddr_type, daddr_len);
+ err = uncompress_addr(skb, dev, &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK, daddr);
pr_debug("dest: stateless compression mode %d dest %pI6c\n",
- tmp, &hdr.daddr);
+ iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr);
if (err)
return -EINVAL;
}
/* Next header data uncompression */
- if (iphc0 & LOWPAN_IPHC_NH_C) {
+ if (iphc0 & LOWPAN_IPHC_NH) {
err = lowpan_nhc_do_uncompression(skb, dev, &hdr);
if (err < 0)
return err;
@@ -367,7 +547,18 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev,
return err;
}
- hdr.payload_len = htons(skb->len);
+ switch (lowpan_priv(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_802154_cb(skb)->d_size)
+ hdr.payload_len = htons(lowpan_802154_cb(skb)->d_size -
+ sizeof(struct ipv6hdr));
+ else
+ hdr.payload_len = htons(skb->len);
+ break;
+ default:
+ hdr.payload_len = htons(skb->len);
+ break;
+ }
pr_debug("skb headroom size = %d, data length = %d\n",
skb_headroom(skb), skb->len);
@@ -387,42 +578,176 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev,
}
EXPORT_SYMBOL_GPL(lowpan_header_decompress);
-static u8 lowpan_compress_addr_64(u8 **hc_ptr, u8 shift,
- const struct in6_addr *ipaddr,
- const unsigned char *lladdr)
+static const u8 lowpan_iphc_dam_to_sam_value[] = {
+ [LOWPAN_IPHC_DAM_00] = LOWPAN_IPHC_SAM_00,
+ [LOWPAN_IPHC_DAM_01] = LOWPAN_IPHC_SAM_01,
+ [LOWPAN_IPHC_DAM_10] = LOWPAN_IPHC_SAM_10,
+ [LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11,
+};
+
+static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr,
+ const unsigned char *lladdr, bool sam)
{
- u8 val = 0;
+ u8 dam = LOWPAN_IPHC_DAM_00;
if (is_addr_mac_addr_based(ipaddr, lladdr)) {
- val = 3; /* 0-bits */
+ dam = LOWPAN_IPHC_DAM_11; /* 0-bits */
pr_debug("address compression 0 bits\n");
} else if (lowpan_is_iid_16_bit_compressable(ipaddr)) {
/* compress IID to 16 bits xxxx::XXXX */
lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2);
- val = 2; /* 16-bits */
+ dam = LOWPAN_IPHC_DAM_10; /* 16-bits */
raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)",
*hc_ptr - 2, 2);
} else {
/* do not compress IID => xxxx::IID */
lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8);
- val = 1; /* 64-bits */
+ dam = LOWPAN_IPHC_DAM_01; /* 64-bits */
raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)",
*hc_ptr - 8, 8);
}
- return rol8(val, shift);
+ if (sam)
+ return lowpan_iphc_dam_to_sam_value[dam];
+ else
+ return dam;
+}
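
The SAM/DAM lookup table above exists because, per RFC 6282, the same two-bit address mode occupies different bit positions of the second IPHC byte for source and destination addresses. A standalone userspace sketch of that idea (the shift macros are illustrative assumptions, not the kernel's constants):

#include <stdint.h>
#include <stdio.h>

/* illustrative IPHC byte-2 layout: SAM at bits 5-4, DAM at bits 1-0 */
#define IPHC_SAM_SHIFT	4
#define IPHC_DAM_SHIFT	0

static uint8_t place_addr_mode(uint8_t mode, int is_source)
{
	return (uint8_t)(mode << (is_source ? IPHC_SAM_SHIFT : IPHC_DAM_SHIFT));
}

int main(void)
{
	uint8_t mode = 0x2;	/* "16-bit IID carried inline" */

	printf("as SAM: 0x%02x, as DAM: 0x%02x\n",
	       place_addr_mode(mode, 1), place_addr_mode(mode, 0));
	return 0;
}
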
+
+/* lowpan_iphc_get_tc - get the ECN + DSCP fields in hc format */
+static inline u8 lowpan_iphc_get_tc(const struct ipv6hdr *hdr)
+{
+ u8 dscp, ecn;
+
+ /* hdr->priority contains the higher bits of dscp, lower are part of
+ * flow_lbl[0]. Note ECN, DSCP is swapped in the ipv6 hdr.
+ */
+ dscp = (hdr->priority << 2) | ((hdr->flow_lbl[0] & 0xc0) >> 6);
+ /* ECN is in the two lower bits of the first nibble of flow_lbl[0] */
+ ecn = (hdr->flow_lbl[0] & 0x30);
+ /* for pretty debug output, also shift ecn to get the ecn value */
+ pr_debug("ecn 0x%02x dscp 0x%02x\n", ecn >> 4, dscp);
+ /* ECN is at 0x30 now, shift it to have ECN + DSCP */
+ return (ecn << 2) | dscp;
+}
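
To make the bit shuffling above concrete, here is a standalone userspace sketch with made-up sample values (not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* the IPv6 header keeps Traffic Class as DSCP(6) | ECN(2), split
	 * across the 4-bit priority field and the top nibble of flow_lbl[0]
	 */
	uint8_t priority = 0x0b;	/* upper 4 bits of DSCP */
	uint8_t flow_lbl0 = 0x90;	/* lower 2 DSCP bits + ECN in top nibble */

	uint8_t dscp = (uint8_t)((priority << 2) | ((flow_lbl0 & 0xc0) >> 6));
	uint8_t ecn  = flow_lbl0 & 0x30;	/* still at bits 5-4 */
	uint8_t tc   = (uint8_t)((ecn << 2) | dscp);	/* hc order: ECN(hi) | DSCP(lo) */

	printf("dscp=0x%02x ecn=%u tc=0x%02x\n", dscp, ecn >> 4, tc);
	return 0;	/* prints dscp=0x2e ecn=1 tc=0x6e */
}
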
+
+/* lowpan_iphc_is_flow_lbl_zero - check if flow label is zero */
+static inline bool lowpan_iphc_is_flow_lbl_zero(const struct ipv6hdr *hdr)
+{
+ return ((!(hdr->flow_lbl[0] & 0x0f)) &&
+ !hdr->flow_lbl[1] && !hdr->flow_lbl[2]);
+}
+
+/* lowpan_iphc_tf_compress - compress the traffic class which is set by
+ * ipv6hdr. Return the corresponding format identifier which is used.
+ */
+static u8 lowpan_iphc_tf_compress(u8 **hc_ptr, const struct ipv6hdr *hdr)
+{
+ /* get ecn and dscp data in one byte as: ECN(hi) + DSCP(lo) */
+ u8 tc = lowpan_iphc_get_tc(hdr), tf[4], val;
+
+ /* print out the traffic class in hc format */
+ pr_debug("tc 0x%02x\n", tc);
+
+ if (lowpan_iphc_is_flow_lbl_zero(hdr)) {
+ if (!tc) {
+ /* 11: Traffic Class and Flow Label are elided. */
+ val = LOWPAN_IPHC_TF_11;
+ } else {
+ /* 10: ECN + DSCP (1 byte), Flow Label is elided.
+ *
+ * 0 1 2 3 4 5 6 7
+ * +-+-+-+-+-+-+-+-+
+ * |ECN| DSCP |
+ * +-+-+-+-+-+-+-+-+
+ */
+ lowpan_push_hc_data(hc_ptr, &tc, sizeof(tc));
+ val = LOWPAN_IPHC_TF_10;
+ }
+ } else {
+ /* check if dscp is zero; it sits after the first two bits */
+ if (!(tc & 0x3f)) {
+ /* 01: ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided
+ *
+ * 1 2
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |ECN|rsv| Flow Label |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+ memcpy(&tf[0], &hdr->flow_lbl[0], 3);
+ /* zero the highest 4 bits, which contain DSCP + ECN */
+ tf[0] &= ~0xf0;
+ /* set ECN */
+ tf[0] |= (tc & 0xc0);
+
+ lowpan_push_hc_data(hc_ptr, tf, 3);
+ val = LOWPAN_IPHC_TF_01;
+ } else {
+ /* 00: ECN + DSCP + 4-bit Pad + Flow Label (4 bytes)
+ *
+ * 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |ECN| DSCP | rsv | Flow Label |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+ memcpy(&tf[0], &tc, sizeof(tc));
+ /* highest nibble of flow_lbl[0] is part of DSCP + ECN
+ * which will be the 4-bit pad and will be filled with
+ * zeros afterwards.
+ */
+ memcpy(&tf[1], &hdr->flow_lbl[0], 3);
+ /* zero the 4-bit pad, which is reserved */
+ tf[1] &= ~0xf0;
+
+ lowpan_push_hc_data(hc_ptr, tf, 4);
+ val = LOWPAN_IPHC_TF_00;
+ }
+ }
+
+ return val;
}
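
Stripped of the buffer handling, the TF format selection above reduces to two questions: is the flow label zero, and is the DSCP zero? A hedged standalone sketch of that decision (the enum names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum tf_fmt { TF_00_4BYTES, TF_01_3BYTES, TF_10_1BYTE, TF_11_ELIDED };

static enum tf_fmt pick_tf_format(uint8_t tc, bool flow_lbl_zero)
{
	if (flow_lbl_zero)
		return tc ? TF_10_1BYTE : TF_11_ELIDED;
	/* flow label present: carry 3 bytes if DSCP is zero, else 4 */
	return (tc & 0x3f) ? TF_00_4BYTES : TF_01_3BYTES;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       pick_tf_format(0x00, true),	/* -> TF_11_ELIDED  */
	       pick_tf_format(0x2e, true),	/* -> TF_10_1BYTE   */
	       pick_tf_format(0x40, false),	/* -> TF_01_3BYTES  */
	       pick_tf_format(0x2e, false));	/* -> TF_00_4BYTES  */
	return 0;
}
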
-int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev,
- unsigned short type, const void *_daddr,
- const void *_saddr, unsigned int len)
+static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr,
+ const struct in6_addr *ipaddr)
{
- u8 tmp, iphc0, iphc1, *hc_ptr;
+ u8 val;
+
+ if (lowpan_is_mcast_addr_compressable8(ipaddr)) {
+ pr_debug("compressed to 1 octet\n");
+ /* use last byte */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[15], 1);
+ val = LOWPAN_IPHC_DAM_11;
+ } else if (lowpan_is_mcast_addr_compressable32(ipaddr)) {
+ pr_debug("compressed to 4 octets\n");
+ /* second byte + the last three */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[1], 1);
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[13], 3);
+ val = LOWPAN_IPHC_DAM_10;
+ } else if (lowpan_is_mcast_addr_compressable48(ipaddr)) {
+ pr_debug("compressed to 6 octets\n");
+ /* second byte + the last five */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[1], 1);
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[11], 5);
+ val = LOWPAN_IPHC_DAM_01;
+ } else {
+ pr_debug("using full address\n");
+ lowpan_push_hc_data(hc_ptr, ipaddr->s6_addr, 16);
+ val = LOWPAN_IPHC_DAM_00;
+ }
+
+ return val;
+}
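
A compact way to read the multicast modes above is by how many address bytes remain inline; a standalone sketch (the lookup array is illustrative, not part of the sources):

#include <stdio.h>

int main(void)
{
	/* RFC 6282 multicast DAM values and their inline lengths:
	 * 11 -> 1 octet (ff02::00XX), 10 -> 4, 01 -> 6, 00 -> full 16
	 */
	static const int mcast_inline_len[4] = { 16, 6, 4, 1 };
	int dam;

	for (dam = 0; dam < 4; dam++)
		printf("DAM=%d%d carries %d byte(s) inline\n",
		       (dam >> 1) & 1, dam & 1, mcast_inline_len[dam]);
	return 0;
}
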
+
+int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
+ const void *daddr, const void *saddr)
+{
+ u8 iphc0, iphc1, *hc_ptr;
struct ipv6hdr *hdr;
- u8 head[100] = {};
+ u8 head[LOWPAN_IPHC_MAX_HC_BUF_LEN] = {};
int ret, addr_type;
- if (type != ETH_P_IPV6)
+ if (skb->protocol != htons(ETH_P_IPV6))
return -EINVAL;
hdr = ipv6_hdr(skb);
@@ -446,63 +771,26 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev,
/* TODO: context lookup */
- raw_dump_inline(__func__, "saddr",
- (unsigned char *)_saddr, IEEE802154_ADDR_LEN);
- raw_dump_inline(__func__, "daddr",
- (unsigned char *)_daddr, IEEE802154_ADDR_LEN);
+ raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN);
+ raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN);
raw_dump_table(__func__, "sending raw skb network uncompressed packet",
skb->data, skb->len);
- /* Traffic class, flow label
- * If flow label is 0, compress it. If traffic class is 0, compress it
- * We have to process both in the same time as the offset of traffic
- * class depends on the presence of version and flow label
- */
-
- /* hc format of TC is ECN | DSCP , original one is DSCP | ECN */
- tmp = (hdr->priority << 4) | (hdr->flow_lbl[0] >> 4);
- tmp = ((tmp & 0x03) << 6) | (tmp >> 2);
-
- if (((hdr->flow_lbl[0] & 0x0F) == 0) &&
- (hdr->flow_lbl[1] == 0) && (hdr->flow_lbl[2] == 0)) {
- /* flow label can be compressed */
- iphc0 |= LOWPAN_IPHC_FL_C;
- if ((hdr->priority == 0) &&
- ((hdr->flow_lbl[0] & 0xF0) == 0)) {
- /* compress (elide) all */
- iphc0 |= LOWPAN_IPHC_TC_C;
- } else {
- /* compress only the flow label */
- *hc_ptr = tmp;
- hc_ptr += 1;
- }
- } else {
- /* Flow label cannot be compressed */
- if ((hdr->priority == 0) &&
- ((hdr->flow_lbl[0] & 0xF0) == 0)) {
- /* compress only traffic class */
- iphc0 |= LOWPAN_IPHC_TC_C;
- *hc_ptr = (tmp & 0xc0) | (hdr->flow_lbl[0] & 0x0F);
- memcpy(hc_ptr + 1, &hdr->flow_lbl[1], 2);
- hc_ptr += 3;
- } else {
- /* compress nothing */
- memcpy(hc_ptr, hdr, 4);
- /* replace the top byte with new ECN | DSCP format */
- *hc_ptr = tmp;
- hc_ptr += 4;
- }
- }
+ /* Traffic Class, Flow Label compression */
+ iphc0 |= lowpan_iphc_tf_compress(&hc_ptr, hdr);
/* NOTE: payload length is always compressed */
/* Check if we provide the nhc format for nexthdr and compression
* functionality. If not, nexthdr is handled inline and not compressed.
*/
- ret = lowpan_nhc_check_compression(skb, hdr, &hc_ptr, &iphc0);
- if (ret < 0)
- return ret;
+ ret = lowpan_nhc_check_compression(skb, hdr, &hc_ptr);
+ if (ret == -ENOENT)
+ lowpan_push_hc_data(&hc_ptr, &hdr->nexthdr,
+ sizeof(hdr->nexthdr));
+ else
+ iphc0 |= LOWPAN_IPHC_NH;
/* Hop limit
* if 1: compress, encoding is 01
@@ -512,13 +800,13 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev,
*/
switch (hdr->hop_limit) {
case 1:
- iphc0 |= LOWPAN_IPHC_TTL_1;
+ iphc0 |= LOWPAN_IPHC_HLIM_01;
break;
case 64:
- iphc0 |= LOWPAN_IPHC_TTL_64;
+ iphc0 |= LOWPAN_IPHC_HLIM_10;
break;
case 255:
- iphc0 |= LOWPAN_IPHC_TTL_255;
+ iphc0 |= LOWPAN_IPHC_HLIM_11;
break;
default:
lowpan_push_hc_data(&hc_ptr, &hdr->hop_limit,
@@ -532,9 +820,8 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev,
iphc1 |= LOWPAN_IPHC_SAC;
} else {
if (addr_type & IPV6_ADDR_LINKLOCAL) {
- iphc1 |= lowpan_compress_addr_64(&hc_ptr,
- LOWPAN_IPHC_SAM_BIT,
- &hdr->saddr, _saddr);
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->saddr,
+ saddr, true);
pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
&hdr->saddr, iphc1);
} else {
@@ -548,38 +835,12 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev,
if (addr_type & IPV6_ADDR_MULTICAST) {
pr_debug("destination address is multicast: ");
iphc1 |= LOWPAN_IPHC_M;
- if (lowpan_is_mcast_addr_compressable8(&hdr->daddr)) {
- pr_debug("compressed to 1 octet\n");
- iphc1 |= LOWPAN_IPHC_DAM_11;
- /* use last byte */
- lowpan_push_hc_data(&hc_ptr,
- &hdr->daddr.s6_addr[15], 1);
- } else if (lowpan_is_mcast_addr_compressable32(&hdr->daddr)) {
- pr_debug("compressed to 4 octets\n");
- iphc1 |= LOWPAN_IPHC_DAM_10;
- /* second byte + the last three */
- lowpan_push_hc_data(&hc_ptr,
- &hdr->daddr.s6_addr[1], 1);
- lowpan_push_hc_data(&hc_ptr,
- &hdr->daddr.s6_addr[13], 3);
- } else if (lowpan_is_mcast_addr_compressable48(&hdr->daddr)) {
- pr_debug("compressed to 6 octets\n");
- iphc1 |= LOWPAN_IPHC_DAM_01;
- /* second byte + the last five */
- lowpan_push_hc_data(&hc_ptr,
- &hdr->daddr.s6_addr[1], 1);
- lowpan_push_hc_data(&hc_ptr,
- &hdr->daddr.s6_addr[11], 5);
- } else {
- pr_debug("using full address\n");
- iphc1 |= LOWPAN_IPHC_DAM_00;
- lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16);
- }
+ iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr, &hdr->daddr);
} else {
if (addr_type & IPV6_ADDR_LINKLOCAL) {
/* TODO: context lookup */
- iphc1 |= lowpan_compress_addr_64(&hc_ptr,
- LOWPAN_IPHC_DAM_BIT, &hdr->daddr, _daddr);
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->daddr,
+ daddr, false);
pr_debug("dest address unicast link-local %pI6c "
"iphc1 0x%02x\n", &hdr->daddr, iphc1);
} else {
@@ -589,7 +850,7 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev,
}
/* next header compression */
- if (iphc0 & LOWPAN_IPHC_NH_C) {
+ if (iphc0 & LOWPAN_IPHC_NH) {
ret = lowpan_nhc_do_compression(skb, hdr, &hc_ptr);
if (ret < 0)
return ret;
@@ -610,19 +871,3 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev,
return 0;
}
EXPORT_SYMBOL_GPL(lowpan_header_compress);
-
-static int __init lowpan_module_init(void)
-{
- request_module_nowait("nhc_dest");
- request_module_nowait("nhc_fragment");
- request_module_nowait("nhc_hop");
- request_module_nowait("nhc_ipv6");
- request_module_nowait("nhc_mobility");
- request_module_nowait("nhc_routing");
- request_module_nowait("nhc_udp");
-
- return 0;
-}
-module_init(lowpan_module_init);
-
-MODULE_LICENSE("GPL");
diff --git a/kernel/net/6lowpan/nhc.c b/kernel/net/6lowpan/nhc.c
index fd20fc51a..7008d53e4 100644
--- a/kernel/net/6lowpan/nhc.c
+++ b/kernel/net/6lowpan/nhc.c
@@ -95,23 +95,20 @@ static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb)
}
int lowpan_nhc_check_compression(struct sk_buff *skb,
- const struct ipv6hdr *hdr, u8 **hc_ptr,
- u8 *iphc0)
+ const struct ipv6hdr *hdr, u8 **hc_ptr)
{
struct lowpan_nhc *nhc;
+ int ret = 0;
spin_lock_bh(&lowpan_nhc_lock);
nhc = lowpan_nexthdr_nhcs[hdr->nexthdr];
- if (nhc && nhc->compress)
- *iphc0 |= LOWPAN_IPHC_NH_C;
- else
- lowpan_push_hc_data(hc_ptr, &hdr->nexthdr,
- sizeof(hdr->nexthdr));
+ if (!(nhc && nhc->compress))
+ ret = -ENOENT;
spin_unlock_bh(&lowpan_nhc_lock);
- return 0;
+ return ret;
}
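
With this change the helper only reports whether a compressor is registered; the iphc.c caller now pushes the next header inline itself on -ENOENT, or sets the IPHC NH bit. A standalone userspace mock of that contract (all names and the 0x04 bit value here are illustrative only):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* mock: pretend only UDP (17) has an NHC compressor registered */
static int nhc_check(uint8_t nexthdr)
{
	return nexthdr == 17 ? 0 : -ENOENT;
}

static void compress_nexthdr(uint8_t nexthdr, uint8_t *iphc0,
			     uint8_t *out, int *len)
{
	if (nhc_check(nexthdr) == -ENOENT)
		out[(*len)++] = nexthdr;	/* carry it inline */
	else
		*iphc0 |= 0x04;			/* set the NH bit */
}

int main(void)
{
	uint8_t iphc0 = 0, buf[8];
	int len = 0;

	compress_nexthdr(17, &iphc0, buf, &len);	/* UDP: NH bit set */
	compress_nexthdr(58, &iphc0, buf, &len);	/* ICMPv6: inline */
	printf("iphc0=0x%02x inline_bytes=%d\n", iphc0, len);
	return 0;
}
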
int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr,
@@ -157,7 +154,8 @@ out:
return ret;
}
-int lowpan_nhc_do_uncompression(struct sk_buff *skb, struct net_device *dev,
+int lowpan_nhc_do_uncompression(struct sk_buff *skb,
+ const struct net_device *dev,
struct ipv6hdr *hdr)
{
struct lowpan_nhc *nhc;
diff --git a/kernel/net/6lowpan/nhc.h b/kernel/net/6lowpan/nhc.h
index ed44938eb..803041400 100644
--- a/kernel/net/6lowpan/nhc.h
+++ b/kernel/net/6lowpan/nhc.h
@@ -8,8 +8,6 @@
#include <net/6lowpan.h>
#include <net/ipv6.h>
-#define LOWPAN_NHC_MAX_ID_LEN 1
-
/**
* LOWPAN_NHC - helper macro to generate nh id fields and lowpan_nhc struct
*
@@ -88,19 +86,16 @@ struct lowpan_nhc *lowpan_nhc_by_nexthdr(u8 nexthdr);
/**
* lowpan_nhc_check_compression - checks if we support compression format. If
- * we support the nhc by nexthdr field, the 6LoWPAN iphc NHC bit will be
- * set. If we don't support nexthdr will be added as inline data to the
- * 6LoWPAN header.
+ * we support the nhc by nexthdr field, the function will return 0. If we
+ * don't support the nhc by nexthdr, this function will return -ENOENT.
*
* @skb: skb of 6LoWPAN header to read nhc and replace header.
* @hdr: ipv6hdr to check the nexthdr value
* @hc_ptr: pointer for 6LoWPAN header which should increment at the end of
* replaced header.
- * @iphc0: iphc0 pointer to set the 6LoWPAN NHC bit
*/
int lowpan_nhc_check_compression(struct sk_buff *skb,
- const struct ipv6hdr *hdr, u8 **hc_ptr,
- u8 *iphc0);
+ const struct ipv6hdr *hdr, u8 **hc_ptr);
/**
* lowpan_nhc_do_compression - calling compress callback for nhc
@@ -121,7 +116,8 @@ int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr,
* @dev: netdevice for print logging information.
* @hdr: ipv6hdr for setting nexthdr value.
*/
-int lowpan_nhc_do_uncompression(struct sk_buff *skb, struct net_device *dev,
+int lowpan_nhc_do_uncompression(struct sk_buff *skb,
+ const struct net_device *dev,
struct ipv6hdr *hdr);
/**
diff --git a/kernel/net/6lowpan/nhc_udp.c b/kernel/net/6lowpan/nhc_udp.c
index c6bcaeb42..69537a2ea 100644
--- a/kernel/net/6lowpan/nhc_udp.c
+++ b/kernel/net/6lowpan/nhc_udp.c
@@ -17,7 +17,27 @@
#include "nhc.h"
-#define LOWPAN_NHC_UDP_IDLEN 1
+#define LOWPAN_NHC_UDP_MASK 0xF8
+#define LOWPAN_NHC_UDP_ID 0xF0
+#define LOWPAN_NHC_UDP_IDLEN 1
+
+#define LOWPAN_NHC_UDP_4BIT_PORT 0xF0B0
+#define LOWPAN_NHC_UDP_4BIT_MASK 0xFFF0
+#define LOWPAN_NHC_UDP_8BIT_PORT 0xF000
+#define LOWPAN_NHC_UDP_8BIT_MASK 0xFF00
+
+/* values for port compression, _with checksum_ i.e. bit 5 set to 0 */
+
+/* all inline */
+#define LOWPAN_NHC_UDP_CS_P_00 0xF0
+/* source 16bit inline, dest = 0xF0 + 8 bit inline */
+#define LOWPAN_NHC_UDP_CS_P_01 0xF1
+/* source = 0xF0 + 8bit inline, dest = 16 bit inline */
+#define LOWPAN_NHC_UDP_CS_P_10 0xF2
+/* source & dest = 0xF0B + 4bit inline */
+#define LOWPAN_NHC_UDP_CS_P_11 0xF3
+/* checksum elided */
+#define LOWPAN_NHC_UDP_CS_C 0x04
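
These port constants drive a three-way choice during UDP header compression; a hedged standalone sketch of that decision, following RFC 6282 rather than the kernel function itself:

#include <stdint.h>
#include <stdio.h>

#define UDP_4BIT_PORT 0xF0B0
#define UDP_4BIT_MASK 0xFFF0
#define UDP_8BIT_PORT 0xF000
#define UDP_8BIT_MASK 0xFF00

static const char *udp_port_mode(uint16_t sport, uint16_t dport)
{
	if ((sport & UDP_4BIT_MASK) == UDP_4BIT_PORT &&
	    (dport & UDP_4BIT_MASK) == UDP_4BIT_PORT)
		return "P_11: both ports as 4-bit nibbles";
	if ((sport & UDP_8BIT_MASK) == UDP_8BIT_PORT)
		return "P_10: source as 8 bits, dest inline";
	if ((dport & UDP_8BIT_MASK) == UDP_8BIT_PORT)
		return "P_01: dest as 8 bits, source inline";
	return "P_00: both ports inline";
}

int main(void)
{
	printf("%s\n", udp_port_mode(0xF0B1, 0xF0B2));
	printf("%s\n", udp_port_mode(0xF042, 1234));
	printf("%s\n", udp_port_mode(5683, 5683));
	return 0;
}
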
static int udp_uncompress(struct sk_buff *skb, size_t needed)
{
@@ -71,7 +91,18 @@ static int udp_uncompress(struct sk_buff *skb, size_t needed)
* here, we obtain the hint from the remaining size of the
* frame
*/
- uh.len = htons(skb->len + sizeof(struct udphdr));
+ switch (lowpan_priv(skb->dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_802154_cb(skb)->d_size)
+ uh.len = htons(lowpan_802154_cb(skb)->d_size -
+ sizeof(struct ipv6hdr));
+ else
+ uh.len = htons(skb->len + sizeof(struct udphdr));
+ break;
+ default:
+ uh.len = htons(skb->len + sizeof(struct udphdr));
+ break;
+ }
pr_debug("uncompressed UDP length: src = %d", ntohs(uh.len));
/* replace the compressed UDP head by the uncompressed UDP
diff --git a/kernel/net/8021q/vlan.c b/kernel/net/8021q/vlan.c
index 59555f0f8..d2cd9de4b 100644
--- a/kernel/net/8021q/vlan.c
+++ b/kernel/net/8021q/vlan.c
@@ -618,6 +618,92 @@ out:
return err;
}
+static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct sk_buff *p, **pp = NULL;
+ struct vlan_hdr *vhdr;
+ unsigned int hlen, off_vlan;
+ const struct packet_offload *ptype;
+ __be16 type;
+ int flush = 1;
+
+ off_vlan = skb_gro_offset(skb);
+ hlen = off_vlan + sizeof(*vhdr);
+ vhdr = skb_gro_header_fast(skb, off_vlan);
+ if (skb_gro_header_hard(skb, hlen)) {
+ vhdr = skb_gro_header_slow(skb, hlen, off_vlan);
+ if (unlikely(!vhdr))
+ goto out;
+ }
+
+ type = vhdr->h_vlan_encapsulated_proto;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype)
+ goto out_unlock;
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ struct vlan_hdr *vhdr2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
+ if (compare_vlan_header(vhdr, vhdr2))
+ NAPI_GRO_CB(p)->same_flow = 0;
+ }
+
+ skb_gro_pull(skb, sizeof(*vhdr));
+ skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
+ __be16 type = vhdr->h_vlan_encapsulated_proto;
+ struct packet_offload *ptype;
+ int err = -ENOENT;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr));
+
+ rcu_read_unlock();
+ return err;
+}
+
+static struct packet_offload vlan_packet_offloads[] __read_mostly = {
+ {
+ .type = cpu_to_be16(ETH_P_8021Q),
+ .priority = 10,
+ .callbacks = {
+ .gro_receive = vlan_gro_receive,
+ .gro_complete = vlan_gro_complete,
+ },
+ },
+ {
+ .type = cpu_to_be16(ETH_P_8021AD),
+ .priority = 10,
+ .callbacks = {
+ .gro_receive = vlan_gro_receive,
+ .gro_complete = vlan_gro_complete,
+ },
+ },
+};
+
static int __net_init vlan_init_net(struct net *net)
{
struct vlan_net *vn = net_generic(net, vlan_net_id);
@@ -645,6 +731,7 @@ static struct pernet_operations vlan_net_ops = {
static int __init vlan_proto_init(void)
{
int err;
+ unsigned int i;
pr_info("%s v%s\n", vlan_fullname, vlan_version);
@@ -668,6 +755,9 @@ static int __init vlan_proto_init(void)
if (err < 0)
goto err5;
+ for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
+ dev_add_offload(&vlan_packet_offloads[i]);
+
vlan_ioctl_set(vlan_ioctl_handler);
return 0;
@@ -685,7 +775,13 @@ err0:
static void __exit vlan_cleanup_module(void)
{
+ unsigned int i;
+
vlan_ioctl_set(NULL);
+
+ for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
+ dev_remove_offload(&vlan_packet_offloads[i]);
+
vlan_netlink_fini();
unregister_netdevice_notifier(&vlan_notifier_block);
diff --git a/kernel/net/8021q/vlan_core.c b/kernel/net/8021q/vlan_core.c
index 61bf2a06e..e2ed69850 100644
--- a/kernel/net/8021q/vlan_core.c
+++ b/kernel/net/8021q/vlan_core.c
@@ -30,7 +30,9 @@ bool vlan_do_receive(struct sk_buff **skbp)
skb->pkt_type = PACKET_HOST;
}
- if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) {
+ if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) &&
+ !netif_is_macvlan_port(vlan_dev) &&
+ !netif_is_bridge_port(vlan_dev)) {
unsigned int offset = skb->data - skb_mac_header(skb);
/*
@@ -206,7 +208,10 @@ static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid,
return -ENOMEM;
if (vlan_hw_filter_capable(dev, vid_info)) {
- err = ops->ndo_vlan_rx_add_vid(dev, proto, vid);
+ if (netif_device_present(dev))
+ err = ops->ndo_vlan_rx_add_vid(dev, proto, vid);
+ else
+ err = -ENODEV;
if (err) {
kfree(vid_info);
return err;
@@ -264,7 +269,10 @@ static void __vlan_vid_del(struct vlan_info *vlan_info,
int err;
if (vlan_hw_filter_capable(dev, vid_info)) {
- err = ops->ndo_vlan_rx_kill_vid(dev, proto, vid);
+ if (netif_device_present(dev))
+ err = ops->ndo_vlan_rx_kill_vid(dev, proto, vid);
+ else
+ err = -ENODEV;
if (err) {
pr_warn("failed to kill vid %04x/%d for device %s\n",
proto, vid, dev->name);
diff --git a/kernel/net/8021q/vlan_dev.c b/kernel/net/8021q/vlan_dev.c
index 01d7ba840..fded86508 100644
--- a/kernel/net/8021q/vlan_dev.c
+++ b/kernel/net/8021q/vlan_dev.c
@@ -791,10 +791,9 @@ void vlan_setup(struct net_device *dev)
{
ether_setup(dev);
- dev->priv_flags |= IFF_802_1Q_VLAN;
+ dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
netif_keep_dst(dev);
- dev->tx_queue_len = 0;
dev->netdev_ops = &vlan_netdev_ops;
dev->destructor = vlan_dev_free;
diff --git a/kernel/net/9p/client.c b/kernel/net/9p/client.c
index fcf6fe063..ea79ee9a7 100644
--- a/kernel/net/9p/client.c
+++ b/kernel/net/9p/client.c
@@ -1584,6 +1584,10 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
p9_free_req(clnt, req);
break;
}
+ if (rsize < count) {
+ pr_err("bogus RREAD count (%d > %d)\n", count, rsize);
+ count = rsize;
+ }
p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
if (!count) {
@@ -1652,6 +1656,10 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
p9_free_req(clnt, req);
break;
}
+ if (rsize < count) {
+ pr_err("bogus RWRITE count (%d > %d)\n", count, rsize);
+ count = rsize;
+ }
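
Both hunks guard against a misbehaving server reporting more bytes than were requested; the same pattern in isolation (standalone userspace sketch, not 9p code):

#include <stdio.h>

/* clamp a length reported by an untrusted peer to what we asked for */
static int clamp_count(int count, int requested)
{
	if (count > requested) {
		fprintf(stderr, "bogus count (%d > %d)\n", count, requested);
		count = requested;
	}
	return count;
}

int main(void)
{
	printf("%d\n", clamp_count(8192, 4096));	/* -> 4096 */
	printf("%d\n", clamp_count(512, 4096));		/* -> 512  */
	return 0;
}
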
p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count);
diff --git a/kernel/net/9p/trans_rdma.c b/kernel/net/9p/trans_rdma.c
index 3533d2a53..52b4a2f99 100644
--- a/kernel/net/9p/trans_rdma.c
+++ b/kernel/net/9p/trans_rdma.c
@@ -94,8 +94,6 @@ struct p9_trans_rdma {
struct ib_pd *pd;
struct ib_qp *qp;
struct ib_cq *cq;
- struct ib_mr *dma_mr;
- u32 lkey;
long timeout;
int sq_depth;
struct semaphore sq_sem;
@@ -382,9 +380,6 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
if (!rdma)
return;
- if (rdma->dma_mr && !IS_ERR(rdma->dma_mr))
- ib_dereg_mr(rdma->dma_mr);
-
if (rdma->qp && !IS_ERR(rdma->qp))
ib_destroy_qp(rdma->qp);
@@ -415,7 +410,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
sge.addr = c->busa;
sge.length = client->msize;
- sge.lkey = rdma->lkey;
+ sge.lkey = rdma->pd->local_dma_lkey;
wr.next = NULL;
c->wc_op = IB_WC_RECV;
@@ -506,7 +501,7 @@ dont_need_post_recv:
sge.addr = c->busa;
sge.length = c->req->tc->size;
- sge.lkey = rdma->lkey;
+ sge.lkey = rdma->pd->local_dma_lkey;
wr.next = NULL;
c->wc_op = IB_WC_SEND;
@@ -647,7 +642,7 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
struct p9_trans_rdma *rdma;
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
- struct ib_device_attr devattr;
+ struct ib_cq_init_attr cq_attr = {};
/* Parse the transport specific mount options */
err = parse_opts(args, &opts);
@@ -660,8 +655,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
return -ENOMEM;
/* Create the RDMA CM ID */
- rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP,
- IB_QPT_RC);
+ rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(rdma->cm_id))
goto error;
@@ -699,15 +694,11 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
goto error;
- /* Query the device attributes */
- err = ib_query_device(rdma->cm_id->device, &devattr);
- if (err)
- goto error;
-
/* Create the Completion Queue */
+ cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1;
rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
cq_event_handler, client,
- opts.sq_depth + opts.rq_depth + 1, 0);
+ &cq_attr);
if (IS_ERR(rdma->cq))
goto error;
ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
@@ -717,17 +708,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
if (IS_ERR(rdma->pd))
goto error;
- /* Cache the DMA lkey in the transport */
- rdma->dma_mr = NULL;
- if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
- rdma->lkey = rdma->cm_id->device->local_dma_lkey;
- else {
- rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(rdma->dma_mr))
- goto error;
- rdma->lkey = rdma->dma_mr->lkey;
- }
-
/* Create the Queue Pair */
memset(&qp_attr, 0, sizeof qp_attr);
qp_attr.event_handler = qp_event_handler;
diff --git a/kernel/net/9p/trans_virtio.c b/kernel/net/9p/trans_virtio.c
index 9dd49ca67..6e70ddb15 100644
--- a/kernel/net/9p/trans_virtio.c
+++ b/kernel/net/9p/trans_virtio.c
@@ -704,6 +704,7 @@ static void p9_virtio_remove(struct virtio_device *vdev)
mutex_unlock(&virtio_9p_lock);
+ vdev->config->reset(vdev);
vdev->config->del_vqs(vdev);
sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
diff --git a/kernel/net/Kconfig b/kernel/net/Kconfig
index 44dd5786e..127da94ae 100644
--- a/kernel/net/Kconfig
+++ b/kernel/net/Kconfig
@@ -45,6 +45,9 @@ config COMPAT_NETLINK_MESSAGES
Newly written code should NEVER need this option but do
compat-independent messages instead!
+config NET_INGRESS
+ bool
+
menu "Networking options"
source "net/packet/Kconfig"
@@ -229,6 +232,7 @@ source "net/netlink/Kconfig"
source "net/mpls/Kconfig"
source "net/hsr/Kconfig"
source "net/switchdev/Kconfig"
+source "net/l3mdev/Kconfig"
config RPS
bool
@@ -371,6 +375,13 @@ source "net/caif/Kconfig"
source "net/ceph/Kconfig"
source "net/nfc/Kconfig"
+config LWTUNNEL
+ bool "Network light weight tunnels"
+ ---help---
+ This feature provides an infrastructure to support light weight
+ tunnels like mpls. There is no netdevice associated with a light
+ weight tunnel endpoint. Tunnel encapsulation parameters are stored
+ with light weight tunnel state associated with fib routes.
endif # if NET
diff --git a/kernel/net/Makefile b/kernel/net/Makefile
index 3995613e5..a5d04098d 100644
--- a/kernel/net/Makefile
+++ b/kernel/net/Makefile
@@ -74,3 +74,6 @@ obj-$(CONFIG_HSR) += hsr/
ifneq ($(CONFIG_NET_SWITCHDEV),)
obj-y += switchdev/
endif
+ifneq ($(CONFIG_NET_L3_MASTER_DEV),)
+obj-y += l3mdev/
+endif
diff --git a/kernel/net/appletalk/ddp.c b/kernel/net/appletalk/ddp.c
index 3b7ad43c7..d5871ac49 100644
--- a/kernel/net/appletalk/ddp.c
+++ b/kernel/net/appletalk/ddp.c
@@ -1030,7 +1030,7 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
goto out;
rc = -ENOMEM;
- sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto);
+ sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern);
if (!sk)
goto out;
rc = 0;
diff --git a/kernel/net/atm/br2684.c b/kernel/net/atm/br2684.c
index cc78538d1..aa0047c5c 100644
--- a/kernel/net/atm/br2684.c
+++ b/kernel/net/atm/br2684.c
@@ -802,13 +802,10 @@ static int br2684_seq_show(struct seq_file *seq, void *v)
(brdev->payload == p_bridged) ? "bridged" : "routed",
brvcc->copies_failed, brvcc->copies_needed);
#ifdef CONFIG_ATM_BR2684_IPFILTER
-#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte]
-#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3)
if (brvcc->filter.netmask != 0)
- seq_printf(seq, " filter=%d.%d.%d.%d/"
- "%d.%d.%d.%d\n", bs(prefix), bs(netmask));
-#undef bs
-#undef b1
+ seq_printf(seq, " filter=%pI4/%pI4\n",
+ &brvcc->filter.prefix,
+ &brvcc->filter.netmask);
#endif /* CONFIG_ATM_BR2684_IPFILTER */
}
return 0;
diff --git a/kernel/net/atm/clip.c b/kernel/net/atm/clip.c
index 17e55dfec..e07f551a8 100644
--- a/kernel/net/atm/clip.c
+++ b/kernel/net/atm/clip.c
@@ -317,6 +317,9 @@ static int clip_constructor(struct neighbour *neigh)
static int clip_encap(struct atm_vcc *vcc, int mode)
{
+ if (!CLIP_VCC(vcc))
+ return -EBADFD;
+
CLIP_VCC(vcc)->encap = mode;
return 0;
}
diff --git a/kernel/net/atm/common.c b/kernel/net/atm/common.c
index ed0466637..49a872db7 100644
--- a/kernel/net/atm/common.c
+++ b/kernel/net/atm/common.c
@@ -141,7 +141,7 @@ static struct proto vcc_proto = {
.release_cb = vcc_release_cb,
};
-int vcc_create(struct net *net, struct socket *sock, int protocol, int family)
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern)
{
struct sock *sk;
struct atm_vcc *vcc;
@@ -149,7 +149,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family)
sock->sk = NULL;
if (sock->type == SOCK_STREAM)
return -EINVAL;
- sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto);
+ sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto, kern);
if (!sk)
return -ENOMEM;
sock_init_data(sock, sk);
diff --git a/kernel/net/atm/common.h b/kernel/net/atm/common.h
index 4d6f5b206..959436b87 100644
--- a/kernel/net/atm/common.h
+++ b/kernel/net/atm/common.h
@@ -10,7 +10,7 @@
#include <linux/poll.h> /* for poll_table */
-int vcc_create(struct net *net, struct socket *sock, int protocol, int family);
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern);
int vcc_release(struct socket *sock);
int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/kernel/net/atm/pvc.c b/kernel/net/atm/pvc.c
index ae0324021..040207ec3 100644
--- a/kernel/net/atm/pvc.c
+++ b/kernel/net/atm/pvc.c
@@ -136,7 +136,7 @@ static int pvc_create(struct net *net, struct socket *sock, int protocol,
return -EAFNOSUPPORT;
sock->ops = &pvc_proto_ops;
- return vcc_create(net, sock, protocol, PF_ATMPVC);
+ return vcc_create(net, sock, protocol, PF_ATMPVC, kern);
}
static const struct net_proto_family pvc_family_ops = {
diff --git a/kernel/net/atm/svc.c b/kernel/net/atm/svc.c
index 1ba23f501..3fa0a9ee9 100644
--- a/kernel/net/atm/svc.c
+++ b/kernel/net/atm/svc.c
@@ -660,7 +660,7 @@ static int svc_create(struct net *net, struct socket *sock, int protocol,
return -EAFNOSUPPORT;
sock->ops = &svc_proto_ops;
- error = vcc_create(net, sock, protocol, AF_ATMSVC);
+ error = vcc_create(net, sock, protocol, AF_ATMSVC, kern);
if (error)
return error;
ATM_SD(sock)->local.sas_family = AF_ATMSVC;
diff --git a/kernel/net/ax25/af_ax25.c b/kernel/net/ax25/af_ax25.c
index 330c1f4a5..fbd0acf80 100644
--- a/kernel/net/ax25/af_ax25.c
+++ b/kernel/net/ax25/af_ax25.c
@@ -40,7 +40,6 @@
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
-#include <linux/netfilter.h>
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/spinlock.h>
@@ -58,7 +57,7 @@ static const struct proto_ops ax25_proto_ops;
static void ax25_free_sock(struct sock *sk)
{
- ax25_cb_put(ax25_sk(sk));
+ ax25_cb_put(sk_to_ax25(sk));
}
/*
@@ -307,7 +306,7 @@ void ax25_destroy_socket(ax25_cb *ax25)
while ((skb = skb_dequeue(&ax25->sk->sk_receive_queue)) != NULL) {
if (skb->sk != ax25->sk) {
/* A pending connection */
- ax25_cb *sax25 = ax25_sk(skb->sk);
+ ax25_cb *sax25 = sk_to_ax25(skb->sk);
/* Queue the unaccepted socket for death */
sock_orphan(skb->sk);
@@ -552,7 +551,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT;
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
switch (optname) {
case AX25_WINDOW:
@@ -698,7 +697,7 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname,
length = min_t(unsigned int, maxlen, sizeof(int));
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
switch (optname) {
case AX25_WINDOW:
@@ -797,7 +796,7 @@ out:
static struct proto ax25_proto = {
.name = "AX25",
.owner = THIS_MODULE,
- .obj_size = sizeof(struct sock),
+ .obj_size = sizeof(struct ax25_sock),
};
static int ax25_create(struct net *net, struct socket *sock, int protocol,
@@ -806,6 +805,9 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
struct sock *sk;
ax25_cb *ax25;
+ if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ return -EINVAL;
+
if (!net_eq(net, &init_net))
return -EAFNOSUPPORT;
@@ -855,11 +857,11 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
return -ESOCKTNOSUPPORT;
}
- sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto);
+ sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern);
if (sk == NULL)
return -ENOMEM;
- ax25 = sk->sk_protinfo = ax25_create_cb();
+ ax25 = ax25_sk(sk)->cb = ax25_create_cb();
if (!ax25) {
sk_free(sk);
return -ENOMEM;
@@ -881,7 +883,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
struct sock *sk;
ax25_cb *ax25, *oax25;
- sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot);
+ sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0);
if (sk == NULL)
return NULL;
@@ -911,7 +913,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
sk->sk_state = TCP_ESTABLISHED;
sock_copy_flags(sk, osk);
- oax25 = ax25_sk(osk);
+ oax25 = sk_to_ax25(osk);
ax25->modulus = oax25->modulus;
ax25->backoff = oax25->backoff;
@@ -939,7 +941,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
}
}
- sk->sk_protinfo = ax25;
+ ax25_sk(sk)->cb = ax25;
sk->sk_destruct = ax25_free_sock;
ax25->sk = sk;
@@ -957,7 +959,7 @@ static int ax25_release(struct socket *sock)
sock_hold(sk);
sock_orphan(sk);
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
if (sk->sk_type == SOCK_SEQPACKET) {
switch (ax25->state) {
@@ -1067,7 +1069,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
if (!sock_flag(sk, SOCK_ZAPPED)) {
err = -EINVAL;
goto out;
@@ -1114,7 +1116,7 @@ static int __must_check ax25_connect(struct socket *sock,
struct sockaddr *uaddr, int addr_len, int flags)
{
struct sock *sk = sock->sk;
- ax25_cb *ax25 = ax25_sk(sk), *ax25t;
+ ax25_cb *ax25 = sk_to_ax25(sk), *ax25t;
struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr;
ax25_digi *digi = NULL;
int ct = 0, err = 0;
@@ -1395,7 +1397,7 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr,
memset(fsa, 0, sizeof(*fsa));
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
if (peer != 0) {
if (sk->sk_state != TCP_ESTABLISHED) {
@@ -1447,7 +1449,7 @@ static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
return -EINVAL;
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
if (sock_flag(sk, SOCK_ZAPPED)) {
err = -EADDRNOTAVAIL;
@@ -1622,7 +1624,7 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
if (skb == NULL)
goto out;
- if (!ax25_sk(sk)->pidincl)
+ if (!sk_to_ax25(sk)->pidincl)
skb_pull(skb, 1); /* Remove PID */
skb_reset_transport_header(skb);
@@ -1763,7 +1765,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCAX25GETINFO:
case SIOCAX25GETINFOOLD: {
- ax25_cb *ax25 = ax25_sk(sk);
+ ax25_cb *ax25 = sk_to_ax25(sk);
struct ax25_info_struct ax25_info;
ax25_info.t1 = ax25->t1 / HZ;
diff --git a/kernel/net/ax25/ax25_in.c b/kernel/net/ax25/ax25_in.c
index 7ed8ab724..bb5a0e4e9 100644
--- a/kernel/net/ax25/ax25_in.c
+++ b/kernel/net/ax25/ax25_in.c
@@ -23,7 +23,6 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
-#include <linux/netfilter.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <asm/uaccess.h>
@@ -354,7 +353,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
return 0;
}
- ax25 = ax25_sk(make);
+ ax25 = sk_to_ax25(make);
skb_set_owner_r(skb, make);
skb_queue_head(&sk->sk_receive_queue, skb);
diff --git a/kernel/net/ax25/ax25_ip.c b/kernel/net/ax25/ax25_ip.c
index 7c646bb2c..b563a3f5f 100644
--- a/kernel/net/ax25/ax25_ip.c
+++ b/kernel/net/ax25/ax25_ip.c
@@ -31,7 +31,6 @@
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
-#include <linux/netfilter.h>
#include <linux/sysctl.h>
#include <net/ip.h>
#include <net/arp.h>
diff --git a/kernel/net/ax25/ax25_out.c b/kernel/net/ax25/ax25_out.c
index be2acab9b..8ddd41baa 100644
--- a/kernel/net/ax25/ax25_out.c
+++ b/kernel/net/ax25/ax25_out.c
@@ -24,7 +24,6 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
-#include <linux/netfilter.h>
#include <net/sock.h>
#include <asm/uaccess.h>
#include <linux/fcntl.h>
diff --git a/kernel/net/ax25/ax25_subr.c b/kernel/net/ax25/ax25_subr.c
index 1997538a5..3b78e8473 100644
--- a/kernel/net/ax25/ax25_subr.c
+++ b/kernel/net/ax25/ax25_subr.c
@@ -264,6 +264,7 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
{
ax25_clear_queues(ax25);
+ ax25_stop_heartbeat(ax25);
ax25_stop_t1timer(ax25);
ax25_stop_t2timer(ax25);
ax25_stop_t3timer(ax25);
diff --git a/kernel/net/ax25/ax25_uid.c b/kernel/net/ax25/ax25_uid.c
index 71c4badbc..4ad2fb7bc 100644
--- a/kernel/net/ax25/ax25_uid.c
+++ b/kernel/net/ax25/ax25_uid.c
@@ -34,7 +34,6 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
-#include <linux/netfilter.h>
#include <linux/sysctl.h>
#include <linux/export.h>
#include <net/ip.h>
diff --git a/kernel/net/batman-adv/Makefile b/kernel/net/batman-adv/Makefile
index eb7d8c038..21434ab79 100644
--- a/kernel/net/batman-adv/Makefile
+++ b/kernel/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
#
@@ -20,7 +20,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
batman-adv-y += bat_iv_ogm.o
batman-adv-y += bitarray.o
batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
-batman-adv-y += debugfs.o
+batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o
batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o
batman-adv-y += fragmentation.o
batman-adv-y += gateway_client.o
@@ -29,6 +29,7 @@ batman-adv-y += hard-interface.o
batman-adv-y += hash.o
batman-adv-y += icmp_socket.o
batman-adv-y += main.o
+batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o
batman-adv-y += originator.o
batman-adv-y += routing.o
@@ -36,4 +37,3 @@ batman-adv-y += send.o
batman-adv-y += soft-interface.o
batman-adv-y += sysfs.o
batman-adv-y += translation-table.o
-batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
diff --git a/kernel/net/batman-adv/bat_algo.h b/kernel/net/batman-adv/bat_algo.h
index 4e49666f8..4e59cf3eb 100644
--- a/kernel/net/batman-adv/bat_algo.h
+++ b/kernel/net/batman-adv/bat_algo.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/kernel/net/batman-adv/bat_iv_ogm.c b/kernel/net/batman-adv/bat_iv_ogm.c
index 00e00e09b..912d9c36f 100644
--- a/kernel/net/batman-adv/bat_iv_ogm.c
+++ b/kernel/net/batman-adv/bat_iv_ogm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -15,20 +15,50 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
+#include "bat_algo.h"
#include "main.h"
-#include "translation-table.h"
+
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "bitarray.h"
+#include "hard-interface.h"
+#include "hash.h"
+#include "network-coding.h"
#include "originator.h"
+#include "packet.h"
#include "routing.h"
-#include "gateway_common.h"
-#include "gateway_client.h"
-#include "hard-interface.h"
#include "send.h"
-#include "bat_algo.h"
-#include "network-coding.h"
+#include "translation-table.h"
/**
* enum batadv_dup_status - duplicate status
- * @BATADV_NO_DUP: the packet is a duplicate
+ * @BATADV_NO_DUP: the packet is no duplicate
* @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the
* neighbor)
* @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor
@@ -47,24 +77,25 @@ enum batadv_dup_status {
* @lq_index: index to store the value at
* @value: value to store in the ring buffer
*/
-static void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index,
- uint8_t value)
+static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value)
{
lq_recv[*lq_index] = value;
*lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE;
}
/**
- * batadv_ring_buffer_set - compute the average of all non-zero values stored
+ * batadv_ring_buffer_avg - compute the average of all non-zero values stored
* in the given ring buffer
* @lq_recv: pointer to the ring buffer
*
* Returns computed average value.
*/
-static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[])
+static u8 batadv_ring_buffer_avg(const u8 lq_recv[])
{
- const uint8_t *ptr;
- uint16_t count = 0, i = 0, sum = 0;
+ const u8 *ptr;
+ u16 count = 0;
+ u16 i = 0;
+ u16 sum = 0;
ptr = lq_recv;
@@ -81,7 +112,7 @@ static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[])
if (count == 0)
return 0;
- return (uint8_t)(sum / count);
+ return (u8)(sum / count);
}
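
A standalone illustration of the non-zero averaging performed above (userspace sketch; the window size and sample values are made up):

#include <stdint.h>
#include <stdio.h>

#define WINDOW_SIZE 8

/* average only the slots that actually hold a sample (non-zero) */
static uint8_t ring_buffer_avg(const uint8_t lq[WINDOW_SIZE])
{
	uint16_t sum = 0, count = 0;
	int i;

	for (i = 0; i < WINDOW_SIZE; i++) {
		if (lq[i]) {
			sum += lq[i];
			count++;
		}
	}
	return count ? (uint8_t)(sum / count) : 0;
}

int main(void)
{
	uint8_t lq[WINDOW_SIZE] = { 200, 180, 0, 0, 220, 0, 0, 0 };

	printf("avg tq = %u\n", ring_buffer_avg(lq));	/* (200+180+220)/3 = 200 */
	return 0;
}
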
/**
@@ -123,14 +154,14 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
kfree(orig_node->bat_iv.bcast_own);
orig_node->bat_iv.bcast_own = data_ptr;
- data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC);
+ data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
if (!data_ptr) {
kfree(orig_node->bat_iv.bcast_own);
goto unlock;
}
memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
- (max_if_num - 1) * sizeof(uint8_t));
+ (max_if_num - 1) * sizeof(u8));
kfree(orig_node->bat_iv.bcast_own_sum);
orig_node->bat_iv.bcast_own_sum = data_ptr;
@@ -183,19 +214,19 @@ free_bcast_own:
if (max_if_num == 0)
goto free_own_sum;
- data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC);
+ data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
if (!data_ptr) {
kfree(orig_node->bat_iv.bcast_own);
goto unlock;
}
memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
- del_if_num * sizeof(uint8_t));
+ del_if_num * sizeof(u8));
- if_offset = (del_if_num + 1) * sizeof(uint8_t);
- memcpy((char *)data_ptr + del_if_num * sizeof(uint8_t),
+ if_offset = (del_if_num + 1) * sizeof(u8);
+ memcpy((char *)data_ptr + del_if_num * sizeof(u8),
orig_node->bat_iv.bcast_own_sum + if_offset,
- (max_if_num - del_if_num) * sizeof(uint8_t));
+ (max_if_num - del_if_num) * sizeof(u8));
free_own_sum:
kfree(orig_node->bat_iv.bcast_own_sum);
@@ -218,7 +249,7 @@ unlock:
* If the object does not exist, it is created and initialised.
*/
static struct batadv_orig_node *
-batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr)
+batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
{
struct batadv_orig_node *orig_node;
int size, hash_added;
@@ -238,7 +269,7 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr)
if (!orig_node->bat_iv.bcast_own)
goto free_orig_node;
- size = bat_priv->num_ifaces * sizeof(uint8_t);
+ size = bat_priv->num_ifaces * sizeof(u8);
orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC);
if (!orig_node->bat_iv.bcast_own_sum)
goto free_orig_node;
@@ -261,43 +292,17 @@ free_orig_node:
static struct batadv_neigh_node *
batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface,
- const uint8_t *neigh_addr,
+ const u8 *neigh_addr,
struct batadv_orig_node *orig_node,
struct batadv_orig_node *orig_neigh)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- struct batadv_neigh_node *neigh_node, *tmp_neigh_node;
+ struct batadv_neigh_node *neigh_node;
- neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node);
+ neigh_node = batadv_neigh_node_new(orig_node, hard_iface, neigh_addr);
if (!neigh_node)
goto out;
- if (!atomic_inc_not_zero(&hard_iface->refcount)) {
- kfree(neigh_node);
- neigh_node = NULL;
- goto out;
- }
-
neigh_node->orig_node = orig_neigh;
- neigh_node->if_incoming = hard_iface;
-
- spin_lock_bh(&orig_node->neigh_list_lock);
- tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface,
- neigh_addr);
- if (!tmp_neigh_node) {
- hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
- } else {
- kfree(neigh_node);
- batadv_hardif_free_ref(hard_iface);
- neigh_node = tmp_neigh_node;
- }
- spin_unlock_bh(&orig_node->neigh_list_lock);
-
- if (!tmp_neigh_node)
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Creating new neighbor %pM for orig_node %pM on interface %s\n",
- neigh_addr, orig_node->orig,
- hard_iface->net_dev->name);
out:
return neigh_node;
@@ -307,8 +312,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
{
struct batadv_ogm_packet *batadv_ogm_packet;
unsigned char *ogm_buff;
- uint32_t random_seqno;
- int res = -ENOMEM;
+ u32 random_seqno;
/* randomize initial seqno to avoid collision */
get_random_bytes(&random_seqno, sizeof(random_seqno));
@@ -317,7 +321,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN;
ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC);
if (!ogm_buff)
- goto out;
+ return -ENOMEM;
hard_iface->bat_iv.ogm_buff = ogm_buff;
@@ -329,10 +333,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
batadv_ogm_packet->reserved = 0;
batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE;
- res = 0;
-
-out:
- return res;
+ return 0;
}
static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface)
@@ -383,8 +384,7 @@ static unsigned long batadv_iv_ogm_fwd_send_time(void)
}
/* apply hop penalty for a normal link */
-static uint8_t batadv_hop_penalty(uint8_t tq,
- const struct batadv_priv *bat_priv)
+static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv)
{
int hop_penalty = atomic_read(&bat_priv->hop_penalty);
int new_tq;
@@ -396,8 +396,8 @@ static uint8_t batadv_hop_penalty(uint8_t tq,
}
/* is there another aggregated packet here? */
-static int batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
- __be16 tvlv_len)
+static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
+ __be16 tvlv_len)
{
int next_buff_pos = 0;
@@ -413,12 +413,12 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
struct batadv_hard_iface *hard_iface)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- char *fwd_str;
- uint8_t packet_num;
- int16_t buff_pos;
+ const char *fwd_str;
+ u8 packet_num;
+ s16 buff_pos;
struct batadv_ogm_packet *batadv_ogm_packet;
struct sk_buff *skb;
- uint8_t *packet_pos;
+ u8 *packet_pos;
if (hard_iface->if_status != BATADV_IF_ACTIVE)
return;
@@ -451,7 +451,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
batadv_ogm_packet->orig,
ntohl(batadv_ogm_packet->seqno),
batadv_ogm_packet->tq, batadv_ogm_packet->ttl,
- (batadv_ogm_packet->flags & BATADV_DIRECTLINK ?
+ ((batadv_ogm_packet->flags & BATADV_DIRECTLINK) ?
"on" : "off"),
hard_iface->net_dev->name,
hard_iface->net_dev->dev_addr);
@@ -548,58 +548,62 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet,
* - the send time is within our MAX_AGGREGATION_MS time
* - the resulting packet won't be bigger than
* MAX_AGGREGATION_BYTES
+ * otherwise aggregation is not possible
*/
- if (time_before(send_time, forw_packet->send_time) &&
- time_after_eq(aggregation_end_time, forw_packet->send_time) &&
- (aggregated_bytes <= BATADV_MAX_AGGREGATION_BYTES)) {
- /* check aggregation compatibility
- * -> direct link packets are broadcasted on
- * their interface only
- * -> aggregate packet if the current packet is
- * a "global" packet as well as the base
- * packet
- */
- primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if)
- goto out;
-
- /* packet is not leaving on the same interface. */
- if (forw_packet->if_outgoing != if_outgoing)
- goto out;
+ if (!time_before(send_time, forw_packet->send_time) ||
+ !time_after_eq(aggregation_end_time, forw_packet->send_time))
+ return false;
+
+ if (aggregated_bytes > BATADV_MAX_AGGREGATION_BYTES)
+ return false;
+
+ /* packet is not leaving on the same interface. */
+ if (forw_packet->if_outgoing != if_outgoing)
+ return false;
+
+ /* check aggregation compatibility
+ * -> direct link packets are broadcasted on
+ * their interface only
+ * -> aggregate packet if the current packet is
+ * a "global" packet as well as the base
+ * packet
+ */
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return false;
- /* packets without direct link flag and high TTL
- * are flooded through the net
- */
- if ((!directlink) &&
- (!(batadv_ogm_packet->flags & BATADV_DIRECTLINK)) &&
- (batadv_ogm_packet->ttl != 1) &&
-
- /* own packets originating non-primary
- * interfaces leave only that interface
- */
- ((!forw_packet->own) ||
- (forw_packet->if_incoming == primary_if))) {
- res = true;
- goto out;
- }
+ /* packets without direct link flag and high TTL
+ * are flooded through the net
+ */
+ if (!directlink &&
+ !(batadv_ogm_packet->flags & BATADV_DIRECTLINK) &&
+ batadv_ogm_packet->ttl != 1 &&
+
+ /* own packets originating non-primary
+ * interfaces leave only that interface
+ */
+ (!forw_packet->own ||
+ forw_packet->if_incoming == primary_if)) {
+ res = true;
+ goto out;
+ }
- /* if the incoming packet is sent via this one
- * interface only - we still can aggregate
- */
- if ((directlink) &&
- (new_bat_ogm_packet->ttl == 1) &&
- (forw_packet->if_incoming == if_incoming) &&
-
- /* packets from direct neighbors or
- * own secondary interface packets
- * (= secondary interface packets in general)
- */
- (batadv_ogm_packet->flags & BATADV_DIRECTLINK ||
- (forw_packet->own &&
- forw_packet->if_incoming != primary_if))) {
- res = true;
- goto out;
- }
+ /* if the incoming packet is sent via this one
+ * interface only - we still can aggregate
+ */
+ if (directlink &&
+ new_bat_ogm_packet->ttl == 1 &&
+ forw_packet->if_incoming == if_incoming &&
+
+ /* packets from direct neighbors or
+ * own secondary interface packets
+ * (= secondary interface packets in general)
+ */
+ (batadv_ogm_packet->flags & BATADV_DIRECTLINK ||
+ (forw_packet->own &&
+ forw_packet->if_incoming != primary_if))) {
+ res = true;
+ goto out;
}
out:
@@ -642,19 +646,16 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) {
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"batman packet queue full\n");
- goto out;
+ goto out_free_outgoing;
}
}
forw_packet_aggr = kmalloc(sizeof(*forw_packet_aggr), GFP_ATOMIC);
- if (!forw_packet_aggr) {
- if (!own_packet)
- atomic_inc(&bat_priv->batman_queue_left);
- goto out;
- }
+ if (!forw_packet_aggr)
+ goto out_nomem;
- if ((atomic_read(&bat_priv->aggregated_ogms)) &&
- (packet_len < BATADV_MAX_AGGREGATION_BYTES))
+ if (atomic_read(&bat_priv->aggregated_ogms) &&
+ packet_len < BATADV_MAX_AGGREGATION_BYTES)
skb_size = BATADV_MAX_AGGREGATION_BYTES;
else
skb_size = packet_len;
@@ -662,12 +663,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
skb_size += ETH_HLEN;
forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size);
- if (!forw_packet_aggr->skb) {
- if (!own_packet)
- atomic_inc(&bat_priv->batman_queue_left);
- kfree(forw_packet_aggr);
- goto out;
- }
+ if (!forw_packet_aggr->skb)
+ goto out_free_forw_packet;
forw_packet_aggr->skb->priority = TC_PRIO_CONTROL;
skb_reserve(forw_packet_aggr->skb, ETH_HLEN);
@@ -699,7 +696,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
send_time - jiffies);
return;
-out:
+out_free_forw_packet:
+ kfree(forw_packet_aggr);
+out_nomem:
+ if (!own_packet)
+ atomic_inc(&bat_priv->batman_queue_left);
+out_free_outgoing:
batadv_hardif_free_ref(if_outgoing);
out_free_incoming:
batadv_hardif_free_ref(if_incoming);
@@ -752,13 +754,13 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv,
unsigned long max_aggregation_jiffies;
batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff;
- direct_link = batadv_ogm_packet->flags & BATADV_DIRECTLINK ? 1 : 0;
+ direct_link = !!(batadv_ogm_packet->flags & BATADV_DIRECTLINK);
max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS);
/* find position for the packet in the forward queue */
spin_lock_bh(&bat_priv->forw_bat_list_lock);
/* own packets are not to be aggregated */
- if ((atomic_read(&bat_priv->aggregated_ogms)) && (!own_packet)) {
+ if (atomic_read(&bat_priv->aggregated_ogms) && !own_packet) {
hlist_for_each_entry(forw_packet_pos,
&bat_priv->forw_bat_list, list) {
if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet,
@@ -807,7 +809,7 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *if_outgoing)
{
struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
- uint16_t tvlv_len;
+ u16 tvlv_len;
if (batadv_ogm_packet->ttl <= 1) {
batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n");
@@ -866,9 +868,9 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface)
struct hlist_head *head;
struct batadv_orig_node *orig_node;
unsigned long *word;
- uint32_t i;
+ u32 i;
size_t word_index;
- uint8_t *w;
+ u8 *w;
int if_num;
for (i = 0; i < hash->size; i++) {
@@ -897,8 +899,8 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
struct batadv_ogm_packet *batadv_ogm_packet;
struct batadv_hard_iface *primary_if, *tmp_hard_iface;
int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len;
- uint32_t seqno;
- uint16_t tvlv_len = 0;
+ u32 seqno;
+ u16 tvlv_len = 0;
unsigned long send_time;
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -917,7 +919,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
batadv_ogm_packet->tvlv_len = htons(tvlv_len);
/* change sequence number to network order */
- seqno = (uint32_t)atomic_read(&hard_iface->bat_iv.ogm_seqno);
+ seqno = (u32)atomic_read(&hard_iface->bat_iv.ogm_seqno);
batadv_ogm_packet->seqno = htonl(seqno);
atomic_inc(&hard_iface->bat_iv.ogm_seqno);
@@ -940,7 +942,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
rcu_read_lock();
list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) {
if (tmp_hard_iface->soft_iface != hard_iface->soft_iface)
- continue;
+ continue;
batadv_iv_ogm_queue_add(bat_priv, *ogm_buff,
*ogm_buff_len, hard_iface,
tmp_hard_iface, 1, send_time);
@@ -976,13 +978,14 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
{
struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
struct batadv_neigh_ifinfo *router_ifinfo = NULL;
- struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL;
+ struct batadv_neigh_node *neigh_node = NULL;
+ struct batadv_neigh_node *tmp_neigh_node = NULL;
struct batadv_neigh_node *router = NULL;
struct batadv_orig_node *orig_node_tmp;
int if_num;
- uint8_t sum_orig, sum_neigh;
- uint8_t *neigh_addr;
- uint8_t tq_avg;
+ u8 sum_orig, sum_neigh;
+ u8 *neigh_addr;
+ u8 tq_avg;
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"update_originator(): Searching and updating originator entry of received packet\n");
@@ -1034,9 +1037,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
batadv_orig_node_free_ref(orig_tmp);
if (!neigh_node)
goto unlock;
- } else
+ } else {
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Updating existing last-hop neighbor of originator\n");
+ }
rcu_read_unlock();
neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
@@ -1081,7 +1085,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
* won't consider it either
*/
if (router_ifinfo &&
- (neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg)) {
+ neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) {
orig_node_tmp = router->orig_node;
spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
if_num = router->if_incoming->if_num;
@@ -1133,8 +1137,8 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node;
struct batadv_neigh_ifinfo *neigh_ifinfo;
- uint8_t total_count;
- uint8_t orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
+ u8 total_count;
+ u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
unsigned int neigh_rq_inv_cube, neigh_rq_max_cube;
int tq_asym_penalty, inv_asym_penalty, if_num, ret = 0;
unsigned int combined_tq;
@@ -1280,13 +1284,13 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
struct batadv_neigh_node *neigh_node;
struct batadv_neigh_ifinfo *neigh_ifinfo;
int is_dup;
- int32_t seq_diff;
+ s32 seq_diff;
int need_update = 0;
int set_mark;
enum batadv_dup_status ret = BATADV_NO_DUP;
- uint32_t seqno = ntohl(batadv_ogm_packet->seqno);
- uint8_t *neigh_addr;
- uint8_t packet_count;
+ u32 seqno = ntohl(batadv_ogm_packet->seqno);
+ u8 *neigh_addr;
+ u8 packet_count;
unsigned long *bitmap;
orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig);
@@ -1356,8 +1360,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
out:
spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
batadv_orig_node_free_ref(orig_node);
- if (orig_ifinfo)
- batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ batadv_orig_ifinfo_free_ref(orig_ifinfo);
return ret;
}
@@ -1376,7 +1379,8 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
struct batadv_hard_iface *if_outgoing)
{
struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
- struct batadv_neigh_node *router = NULL, *router_router = NULL;
+ struct batadv_neigh_node *router = NULL;
+ struct batadv_neigh_node *router_router = NULL;
struct batadv_orig_node *orig_neigh_node;
struct batadv_orig_ifinfo *orig_ifinfo;
struct batadv_neigh_node *orig_neigh_router = NULL;
@@ -1388,7 +1392,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
bool sameseq, similar_ttl;
struct sk_buff *skb_priv;
struct ethhdr *ethhdr;
- uint8_t *prev_sender;
+ u8 *prev_sender;
int is_bidirect;
/* create a private copy of the skb, as some functions change tq value
@@ -1570,7 +1574,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
struct batadv_orig_node *orig_neigh_node, *orig_node;
struct batadv_hard_iface *hard_iface;
struct batadv_ogm_packet *ogm_packet;
- uint32_t if_incoming_seqno;
+ u32 if_incoming_seqno;
bool has_directlink_flag;
struct ethhdr *ethhdr;
bool is_my_oldorig = false;
@@ -1643,9 +1647,9 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
if (is_my_orig) {
unsigned long *word;
int offset;
- int32_t bit_pos;
- int16_t if_num;
- uint8_t *weight;
+ s32 bit_pos;
+ s16 if_num;
+ u8 *weight;
orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv,
ethhdr->h_source);
@@ -1721,7 +1725,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
{
struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
struct batadv_ogm_packet *ogm_packet;
- uint8_t *packet_pos;
+ u8 *packet_pos;
int ogm_offset;
bool ret;
@@ -1805,7 +1809,7 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv,
unsigned long last_seen_jiffies;
struct hlist_head *head;
int batman_count = 0;
- uint32_t i;
+ u32 i;
seq_printf(seq, " %-15s %s (%s/%i) %17s [%10s]: %20s ...\n",
"Originator", "last-seen", "#", BATADV_TQ_MAX_VALUE,
@@ -1873,7 +1877,7 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
struct batadv_hard_iface *if_outgoing2)
{
struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo;
- uint8_t tq1, tq2;
+ u8 tq1, tq2;
int diff;
neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
@@ -1915,7 +1919,7 @@ batadv_iv_ogm_neigh_is_eob(struct batadv_neigh_node *neigh1,
struct batadv_hard_iface *if_outgoing2)
{
struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo;
- uint8_t tq1, tq2;
+ u8 tq1, tq2;
bool ret;
neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
diff --git a/kernel/net/batman-adv/bitarray.c b/kernel/net/batman-adv/bitarray.c
index e3da07a64..25cbc36e9 100644
--- a/kernel/net/batman-adv/bitarray.c
+++ b/kernel/net/batman-adv/bitarray.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -15,13 +15,13 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
#include "bitarray.h"
+#include "main.h"
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
/* shift the packet array by n places. */
-static void batadv_bitmap_shift_left(unsigned long *seq_bits, int32_t n)
+static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n)
{
if (n <= 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE)
return;
@@ -35,8 +35,8 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, int32_t n)
* 1 if the window was moved (either new or very old)
* 0 if the window was not moved/shifted.
*/
-int batadv_bit_get_packet(void *priv, unsigned long *seq_bits,
- int32_t seq_num_diff, int set_mark)
+int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
+ int set_mark)
{
struct batadv_priv *bat_priv = priv;
diff --git a/kernel/net/batman-adv/bitarray.h b/kernel/net/batman-adv/bitarray.h
index 2acaafe60..0226b220f 100644
--- a/kernel/net/batman-adv/bitarray.h
+++ b/kernel/net/batman-adv/bitarray.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -18,13 +18,19 @@
#ifndef _NET_BATMAN_ADV_BITARRAY_H_
#define _NET_BATMAN_ADV_BITARRAY_H_
+#include "main.h"
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+
/* Returns 1 if the corresponding bit in the given seq_bits indicates true
* and curr_seqno is within range of last_seqno. Otherwise returns 0.
*/
static inline int batadv_test_bit(const unsigned long *seq_bits,
- uint32_t last_seqno, uint32_t curr_seqno)
+ u32 last_seqno, u32 curr_seqno)
{
- int32_t diff;
+ s32 diff;
diff = last_seqno - curr_seqno;
if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE)
@@ -33,7 +39,7 @@ static inline int batadv_test_bit(const unsigned long *seq_bits,
}
/* turn corresponding bit on, so we can remember that we got the packet */
-static inline void batadv_set_bit(unsigned long *seq_bits, int32_t n)
+static inline void batadv_set_bit(unsigned long *seq_bits, s32 n)
{
/* if too old, just drop it */
if (n < 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE)
@@ -45,7 +51,7 @@ static inline void batadv_set_bit(unsigned long *seq_bits, int32_t n)
/* receive and process one packet, returns 1 if received seq_num is considered
* new, 0 if old
*/
-int batadv_bit_get_packet(void *priv, unsigned long *seq_bits,
- int32_t seq_num_diff, int set_mark);
+int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
+ int set_mark);
#endif /* _NET_BATMAN_ADV_BITARRAY_H_ */
diff --git a/kernel/net/batman-adv/bridge_loop_avoidance.c b/kernel/net/batman-adv/bridge_loop_avoidance.c
index ac4b96ecc..f5d2fe5e3 100644
--- a/kernel/net/batman-adv/bridge_loop_avoidance.c
+++ b/kernel/net/batman-adv/bridge_loop_avoidance.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*
@@ -15,21 +15,43 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
-#include "hash.h"
-#include "hard-interface.h"
-#include "originator.h"
#include "bridge_loop_avoidance.h"
-#include "translation-table.h"
-#include "send.h"
+#include "main.h"
-#include <linux/etherdevice.h>
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
#include <linux/crc16.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
#include <linux/if_arp.h>
-#include <net/arp.h>
+#include <linux/if_ether.h>
#include <linux/if_vlan.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
+#include <net/arp.h>
-static const uint8_t batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05};
+#include "hard-interface.h"
+#include "hash.h"
+#include "originator.h"
+#include "packet.h"
+#include "translation-table.h"
+
+static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05};
static void batadv_bla_periodic_work(struct work_struct *work);
static void
@@ -37,34 +59,25 @@ batadv_bla_send_announce(struct batadv_priv *bat_priv,
struct batadv_bla_backbone_gw *backbone_gw);
/* return the index of the claim */
-static inline uint32_t batadv_choose_claim(const void *data, uint32_t size)
+static inline u32 batadv_choose_claim(const void *data, u32 size)
{
struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
- uint32_t hash = 0;
-
- hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr));
- hash = batadv_hash_bytes(hash, &claim->vid, sizeof(claim->vid));
+ u32 hash = 0;
- hash += (hash << 3);
- hash ^= (hash >> 11);
- hash += (hash << 15);
+ hash = jhash(&claim->addr, sizeof(claim->addr), hash);
+ hash = jhash(&claim->vid, sizeof(claim->vid), hash);
return hash % size;
}
/* return the index of the backbone gateway */
-static inline uint32_t batadv_choose_backbone_gw(const void *data,
- uint32_t size)
+static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
{
const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
- uint32_t hash = 0;
+ u32 hash = 0;
- hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr));
- hash = batadv_hash_bytes(hash, &claim->vid, sizeof(claim->vid));
-
- hash += (hash << 3);
- hash ^= (hash >> 11);
- hash += (hash << 15);
+ hash = jhash(&claim->addr, sizeof(claim->addr), hash);
+ hash = jhash(&claim->vid, sizeof(claim->vid), hash);
return hash % size;
}
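Both hash helpers above now derive the bucket index by feeding the claim's MAC address and VLAN id through jhash() instead of the removed open-coded byte mixing. A minimal standalone sketch of the same indexing scheme, using a hypothetical key struct rather than the batman-adv types:

#include <linux/jhash.h>
#include <linux/types.h>

/* hypothetical key with the same shape as a BLA claim: MAC + VLAN id */
struct example_key {
	u8 addr[6];
	unsigned short vid;
};

/* fold both fields through jhash() and reduce to a bucket index */
static inline u32 example_choose_bucket(const struct example_key *key, u32 size)
{
	u32 hash = 0;

	hash = jhash(&key->addr, sizeof(key->addr), hash);
	hash = jhash(&key->vid, sizeof(key->vid), hash);

	return hash % size;
}

jhash() already mixes its input thoroughly, which is why the extra shift/xor finalisation of the old helpers is no longer needed.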
@@ -75,7 +88,8 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node,
{
const void *data1 = container_of(node, struct batadv_bla_backbone_gw,
hash_entry);
- const struct batadv_bla_backbone_gw *gw1 = data1, *gw2 = data2;
+ const struct batadv_bla_backbone_gw *gw1 = data1;
+ const struct batadv_bla_backbone_gw *gw2 = data2;
if (!batadv_compare_eth(gw1->orig, gw2->orig))
return 0;
@@ -92,7 +106,8 @@ static int batadv_compare_claim(const struct hlist_node *node,
{
const void *data1 = container_of(node, struct batadv_bla_claim,
hash_entry);
- const struct batadv_bla_claim *cl1 = data1, *cl2 = data2;
+ const struct batadv_bla_claim *cl1 = data1;
+ const struct batadv_bla_claim *cl2 = data2;
if (!batadv_compare_eth(cl1->addr, cl2->addr))
return 0;
@@ -112,21 +127,17 @@ batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw)
}
/* finally deinitialize the claim */
-static void batadv_claim_free_rcu(struct rcu_head *rcu)
+static void batadv_claim_release(struct batadv_bla_claim *claim)
{
- struct batadv_bla_claim *claim;
-
- claim = container_of(rcu, struct batadv_bla_claim, rcu);
-
batadv_backbone_gw_free_ref(claim->backbone_gw);
- kfree(claim);
+ kfree_rcu(claim, rcu);
}
/* free a claim, call claim_free_rcu if its the last reference */
static void batadv_claim_free_ref(struct batadv_bla_claim *claim)
{
if (atomic_dec_and_test(&claim->refcount))
- call_rcu(&claim->rcu, batadv_claim_free_rcu);
+ batadv_claim_release(claim);
}
/**
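The claim release path above drops the backbone_gw reference immediately and defers only the final kfree() past the RCU grace period via kfree_rcu(), replacing the dedicated call_rcu() callback. A minimal sketch of that pattern with a hypothetical object type:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_obj {
	/* ... payload read under rcu_read_lock() by other CPUs ... */
	struct rcu_head rcu;		/* required by kfree_rcu() */
};

static void example_obj_release(struct example_obj *obj)
{
	/* drop references to other objects right away ... */

	/* ... and let RCU free the memory once all readers are done */
	kfree_rcu(obj, rcu);
}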
@@ -178,8 +189,8 @@ static struct batadv_bla_claim
* Returns claim if found or NULL otherwise.
*/
static struct batadv_bla_backbone_gw *
-batadv_backbone_hash_find(struct batadv_priv *bat_priv,
- uint8_t *addr, unsigned short vid)
+batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
+ unsigned short vid)
{
struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
struct hlist_head *head;
@@ -255,14 +266,14 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
* @vid: the VLAN ID
* @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...)
*/
-static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac,
+static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
unsigned short vid, int claimtype)
{
struct sk_buff *skb;
struct ethhdr *ethhdr;
struct batadv_hard_iface *primary_if;
struct net_device *soft_iface;
- uint8_t *hw_src;
+ u8 *hw_src;
struct batadv_bla_claim_dst local_claim_dest;
__be32 zeroip = 0;
@@ -290,13 +301,13 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac,
* with XX = claim type
* and YY:YY = group id
*/
- (uint8_t *)&local_claim_dest);
+ (u8 *)&local_claim_dest);
if (!skb)
goto out;
ethhdr = (struct ethhdr *)skb->data;
- hw_src = (uint8_t *)ethhdr + ETH_HLEN + sizeof(struct arphdr);
+ hw_src = (u8 *)ethhdr + ETH_HLEN + sizeof(struct arphdr);
/* now we pretend that the client would have sent this ... */
switch (claimtype) {
@@ -369,7 +380,7 @@ out:
* be found.
*/
static struct batadv_bla_backbone_gw *
-batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, uint8_t *orig,
+batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
unsigned short vid, bool own_backbone)
{
struct batadv_bla_backbone_gw *entry;
@@ -538,7 +549,7 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw)
static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
struct batadv_bla_backbone_gw *backbone_gw)
{
- uint8_t mac[ETH_ALEN];
+ u8 mac[ETH_ALEN];
__be16 crc;
memcpy(mac, batadv_announce_mac, 4);
@@ -557,7 +568,7 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
* @backbone_gw: the backbone gateway which claims it
*/
static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
- const uint8_t *mac, const unsigned short vid,
+ const u8 *mac, const unsigned short vid,
struct batadv_bla_backbone_gw *backbone_gw)
{
struct batadv_bla_claim *claim;
@@ -621,7 +632,7 @@ claim_free_ref:
* given mac address and vid.
*/
static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
- const uint8_t *mac, const unsigned short vid)
+ const u8 *mac, const unsigned short vid)
{
struct batadv_bla_claim search_claim, *claim;
@@ -645,12 +656,11 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
}
/* check for ANNOUNCE frame, return 1 if handled */
-static int batadv_handle_announce(struct batadv_priv *bat_priv,
- uint8_t *an_addr, uint8_t *backbone_addr,
- unsigned short vid)
+static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
+ u8 *backbone_addr, unsigned short vid)
{
struct batadv_bla_backbone_gw *backbone_gw;
- uint16_t crc;
+ u16 crc;
if (memcmp(an_addr, batadv_announce_mac, 4) != 0)
return 0;
@@ -694,8 +704,8 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv,
/* check for REQUEST frame, return 1 if handled */
static int batadv_handle_request(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
- uint8_t *backbone_addr,
- struct ethhdr *ethhdr, unsigned short vid)
+ u8 *backbone_addr, struct ethhdr *ethhdr,
+ unsigned short vid)
{
/* check for REQUEST frame */
if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest))
@@ -718,8 +728,8 @@ static int batadv_handle_request(struct batadv_priv *bat_priv,
/* check for UNCLAIM frame, return 1 if handled */
static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
- uint8_t *backbone_addr,
- uint8_t *claim_addr, unsigned short vid)
+ u8 *backbone_addr, u8 *claim_addr,
+ unsigned short vid)
{
struct batadv_bla_backbone_gw *backbone_gw;
@@ -747,7 +757,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
/* check for CLAIM frame, return 1 if handled */
static int batadv_handle_claim(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
- uint8_t *backbone_addr, uint8_t *claim_addr,
+ u8 *backbone_addr, u8 *claim_addr,
unsigned short vid)
{
struct batadv_bla_backbone_gw *backbone_gw;
@@ -791,10 +801,10 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv,
*/
static int batadv_check_claim_group(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
- uint8_t *hw_src, uint8_t *hw_dst,
+ u8 *hw_src, u8 *hw_dst,
struct ethhdr *ethhdr)
{
- uint8_t *backbone_addr;
+ u8 *backbone_addr;
struct batadv_orig_node *orig_node;
struct batadv_bla_claim_dst *bla_dst, *bla_dst_own;
@@ -863,7 +873,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
struct sk_buff *skb)
{
struct batadv_bla_claim_dst *bla_dst, *bla_dst_own;
- uint8_t *hw_src, *hw_dst;
+ u8 *hw_src, *hw_dst;
struct vlan_hdr *vhdr, vhdr_buf;
struct ethhdr *ethhdr;
struct arphdr *arphdr;
@@ -909,7 +919,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
/* pskb_may_pull() may have modified the pointers, get ethhdr again */
ethhdr = eth_hdr(skb);
- arphdr = (struct arphdr *)((uint8_t *)ethhdr + headlen);
+ arphdr = (struct arphdr *)((u8 *)ethhdr + headlen);
/* Check whether the ARP frame carries a valid
* IP information
@@ -923,7 +933,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
if (arphdr->ar_pln != 4)
return 0;
- hw_src = (uint8_t *)arphdr + sizeof(struct arphdr);
+ hw_src = (u8 *)arphdr + sizeof(struct arphdr);
hw_dst = hw_src + ETH_ALEN + 4;
bla_dst = (struct batadv_bla_claim_dst *)hw_dst;
bla_dst_own = &bat_priv->bla.claim_dest;
@@ -1224,9 +1234,9 @@ static struct lock_class_key batadv_backbone_hash_lock_class_key;
int batadv_bla_init(struct batadv_priv *bat_priv)
{
int i;
- uint8_t claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00};
+ u8 claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00};
struct batadv_hard_iface *primary_if;
- uint16_t crc;
+ u16 crc;
unsigned long entrytime;
spin_lock_init(&bat_priv->bla.bcast_duplist_lock);
@@ -1354,7 +1364,7 @@ out:
*
* Returns true if orig is a backbone for this vid, false otherwise.
*/
-bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig,
+bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
unsigned short vid)
{
struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
@@ -1633,9 +1643,9 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
struct batadv_bla_claim *claim;
struct batadv_hard_iface *primary_if;
struct hlist_head *head;
- uint32_t i;
+ u32 i;
bool is_own;
- uint8_t *primary_addr;
+ u8 *primary_addr;
primary_if = batadv_seq_print_text_primary_if_get(seq);
if (!primary_if)
@@ -1678,9 +1688,9 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
struct batadv_hard_iface *primary_if;
struct hlist_head *head;
int secs, msecs;
- uint32_t i;
+ u32 i;
bool is_own;
- uint8_t *primary_addr;
+ u8 *primary_addr;
primary_if = batadv_seq_print_text_primary_if_get(seq);
if (!primary_if)
diff --git a/kernel/net/batman-adv/bridge_loop_avoidance.h b/kernel/net/batman-adv/bridge_loop_avoidance.h
index 43c985d92..025152b34 100644
--- a/kernel/net/batman-adv/bridge_loop_avoidance.h
+++ b/kernel/net/batman-adv/bridge_loop_avoidance.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*
@@ -18,6 +18,13 @@
#ifndef _NET_BATMAN_ADV_BLA_H_
#define _NET_BATMAN_ADV_BLA_H_
+#include "main.h"
+
+#include <linux/types.h>
+
+struct seq_file;
+struct sk_buff;
+
#ifdef CONFIG_BATMAN_ADV_BLA
int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
unsigned short vid, bool is_bcast);
@@ -28,7 +35,7 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb,
int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset);
int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
void *offset);
-bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig,
+bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
unsigned short vid);
int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
struct sk_buff *skb);
@@ -74,8 +81,7 @@ static inline int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
}
static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv,
- uint8_t *orig,
- unsigned short vid)
+ u8 *orig, unsigned short vid)
{
return false;
}
diff --git a/kernel/net/batman-adv/debugfs.c b/kernel/net/batman-adv/debugfs.c
index a4972874c..c4c1e8030 100644
--- a/kernel/net/batman-adv/debugfs.c
+++ b/kernel/net/batman-adv/debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -15,21 +15,42 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
+#include "debugfs.h"
#include "main.h"
+#include <linux/compiler.h>
#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/fcntl.h>
+#include <linux/fs.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+#include <linux/printk.h>
+#include <linux/sched.h> /* for linux/wait.h */
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/stddef.h>
+#include <linux/stringify.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/wait.h>
+#include <stdarg.h>
-#include "debugfs.h"
-#include "translation-table.h"
-#include "originator.h"
-#include "hard-interface.h"
-#include "gateway_common.h"
-#include "gateway_client.h"
-#include "soft-interface.h"
-#include "icmp_socket.h"
#include "bridge_loop_avoidance.h"
#include "distributed-arp-table.h"
+#include "gateway_client.h"
+#include "icmp_socket.h"
#include "network-coding.h"
+#include "originator.h"
+#include "translation-table.h"
static struct dentry *batadv_debugfs;
@@ -482,11 +503,7 @@ rem_attr:
debugfs_remove_recursive(hard_iface->debug_dir);
hard_iface->debug_dir = NULL;
out:
-#ifdef CONFIG_DEBUG_FS
return -ENOMEM;
-#else
- return 0;
-#endif /* CONFIG_DEBUG_FS */
}
/**
@@ -541,11 +558,7 @@ rem_attr:
debugfs_remove_recursive(bat_priv->debug_dir);
bat_priv->debug_dir = NULL;
out:
-#ifdef CONFIG_DEBUG_FS
return -ENOMEM;
-#else
- return 0;
-#endif /* CONFIG_DEBUG_FS */
}
void batadv_debugfs_del_meshif(struct net_device *dev)
diff --git a/kernel/net/batman-adv/debugfs.h b/kernel/net/batman-adv/debugfs.h
index 37c4d6ddd..80ab8d6f0 100644
--- a/kernel/net/batman-adv/debugfs.h
+++ b/kernel/net/batman-adv/debugfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -18,8 +18,16 @@
#ifndef _NET_BATMAN_ADV_DEBUGFS_H_
#define _NET_BATMAN_ADV_DEBUGFS_H_
+#include "main.h"
+
+#include <linux/kconfig.h>
+
+struct net_device;
+
#define BATADV_DEBUGFS_SUBDIR "batman_adv"
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+
void batadv_debugfs_init(void);
void batadv_debugfs_destroy(void);
int batadv_debugfs_add_meshif(struct net_device *dev);
@@ -27,4 +35,36 @@ void batadv_debugfs_del_meshif(struct net_device *dev);
int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
+#else
+
+static inline void batadv_debugfs_init(void)
+{
+}
+
+static inline void batadv_debugfs_destroy(void)
+{
+}
+
+static inline int batadv_debugfs_add_meshif(struct net_device *dev)
+{
+ return 0;
+}
+
+static inline void batadv_debugfs_del_meshif(struct net_device *dev)
+{
+}
+
+static inline
+int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
+{
+ return 0;
+}
+
+static inline
+void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
+{
+}
+
+#endif
+
#endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */
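With the stub block above, callers can invoke the debugfs hooks unconditionally; when CONFIG_DEBUG_FS is disabled the inline no-ops compile away, so no #ifdef is needed at the call sites. A sketch of the same header pattern for a hypothetical feature:

#include <linux/kconfig.h>

struct net_device;

#if IS_ENABLED(CONFIG_EXAMPLE_FEATURE)

int example_feature_add(struct net_device *dev);

#else

static inline int example_feature_add(struct net_device *dev)
{
	return 0;	/* nothing to set up, report success */
}

#endif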
diff --git a/kernel/net/batman-adv/distributed-arp-table.c b/kernel/net/batman-adv/distributed-arp-table.c
index aad022dd1..a49c705fb 100644
--- a/kernel/net/batman-adv/distributed-arp-table.c
+++ b/kernel/net/batman-adv/distributed-arp-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*
@@ -15,18 +15,37 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include <linux/if_ether.h>
+#include "distributed-arp-table.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
#include <linux/if_arp.h>
+#include <linux/if_ether.h>
#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
#include <net/arp.h>
-#include "main.h"
-#include "hash.h"
-#include "distributed-arp-table.h"
#include "hard-interface.h"
+#include "hash.h"
#include "originator.h"
#include "send.h"
-#include "types.h"
#include "translation-table.h"
static void batadv_dat_purge(struct work_struct *work);
@@ -83,7 +102,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv,
struct batadv_dat_entry *dat_entry;
struct hlist_node *node_tmp;
struct hlist_head *head;
- uint32_t i;
+ u32 i;
if (!bat_priv->dat.hash)
return;
@@ -149,11 +168,11 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2)
*
* Returns the value of the hw_src field in the ARP packet.
*/
-static uint8_t *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
+static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
{
- uint8_t *addr;
+ u8 *addr;
- addr = (uint8_t *)(skb->data + hdr_size);
+ addr = (u8 *)(skb->data + hdr_size);
addr += ETH_HLEN + sizeof(struct arphdr);
return addr;
@@ -178,7 +197,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
*
* Returns the value of the hw_dst field in the ARP packet.
*/
-static uint8_t *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
+static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
{
return batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN + 4;
}
@@ -202,13 +221,26 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
*
* Returns the selected index in the hash table for the given data.
*/
-static uint32_t batadv_hash_dat(const void *data, uint32_t size)
+static u32 batadv_hash_dat(const void *data, u32 size)
{
- uint32_t hash = 0;
+ u32 hash = 0;
const struct batadv_dat_entry *dat = data;
+ const unsigned char *key;
+ u32 i;
+
+ key = (const unsigned char *)&dat->ip;
+ for (i = 0; i < sizeof(dat->ip); i++) {
+ hash += key[i];
+ hash += (hash << 10);
+ hash ^= (hash >> 6);
+ }
- hash = batadv_hash_bytes(hash, &dat->ip, sizeof(dat->ip));
- hash = batadv_hash_bytes(hash, &dat->vid, sizeof(dat->vid));
+ key = (const unsigned char *)&dat->vid;
+ for (i = 0; i < sizeof(dat->vid); i++) {
+ hash += key[i];
+ hash += (hash << 10);
+ hash ^= (hash >> 6);
+ }
hash += (hash << 3);
hash ^= (hash >> 11);
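batadv_hash_dat() above now mixes dat->ip and then dat->vid with an open-coded Jenkins one-at-a-time loop before the final avalanche, and the candidate-selection hunk further down feeds a full batadv_dat_entry (vid set to 0) through the same function. A minimal sketch of the per-field mixing step, as a hypothetical standalone helper:

#include <linux/types.h>

/* Jenkins one-at-a-time mixing over an arbitrary byte buffer; the caller
 * applies the final avalanche (<< 3, >> 11, << 15) as in the hunk above.
 */
static u32 example_oaat_mix(u32 hash, const void *data, size_t len)
{
	const unsigned char *key = data;
	size_t i;

	for (i = 0; i < len; i++) {
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}

	return hash;
}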
@@ -233,7 +265,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
struct hlist_head *head;
struct batadv_dat_entry to_find, *dat_entry, *dat_entry_tmp = NULL;
struct batadv_hashtable *hash = bat_priv->dat.hash;
- uint32_t index;
+ u32 index;
if (!hash)
return NULL;
@@ -268,7 +300,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
* @vid: VLAN identifier
*/
static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
- uint8_t *mac_addr, unsigned short vid)
+ u8 *mac_addr, unsigned short vid)
{
struct batadv_dat_entry *dat_entry;
int hash_added;
@@ -325,11 +357,11 @@ out:
* @msg: message to print together with the debugging information
*/
static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
- uint16_t type, int hdr_size, char *msg)
+ u16 type, int hdr_size, char *msg)
{
struct batadv_unicast_4addr_packet *unicast_4addr_packet;
struct batadv_bcast_packet *bcast_pkt;
- uint8_t *orig_addr;
+ u8 *orig_addr;
__be32 ip_src, ip_dst;
if (msg)
@@ -392,7 +424,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
#else
static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
- uint16_t type, int hdr_size, char *msg)
+ u16 type, int hdr_size, char *msg)
{
}
@@ -422,7 +454,7 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res,
int j;
/* check if orig node candidate is running DAT */
- if (!(candidate->capabilities & BATADV_ORIG_CAPA_HAS_DAT))
+ if (!test_bit(BATADV_ORIG_CAPA_HAS_DAT, &candidate->capabilities))
goto out;
/* Check if this node has already been selected... */
@@ -465,7 +497,8 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
int select, batadv_dat_addr_t ip_key,
batadv_dat_addr_t *last_max)
{
- batadv_dat_addr_t max = 0, tmp_max = 0;
+ batadv_dat_addr_t max = 0;
+ batadv_dat_addr_t tmp_max = 0;
struct batadv_orig_node *orig_node, *max_orig_node = NULL;
struct batadv_hashtable *hash = bat_priv->orig_hash;
struct hlist_head *head;
@@ -533,6 +566,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
int select;
batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key;
struct batadv_dat_candidate *res;
+ struct batadv_dat_entry dat;
if (!bat_priv->orig_hash)
return NULL;
@@ -542,7 +576,9 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
if (!res)
return NULL;
- ip_key = (batadv_dat_addr_t)batadv_hash_dat(&ip_dst,
+ dat.ip = ip_dst;
+ dat.vid = 0;
+ ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat,
BATADV_DAT_ADDR_MAX);
batadv_dbg(BATADV_DBG_DAT, bat_priv,
@@ -677,14 +713,13 @@ void batadv_dat_status_update(struct net_device *net_dev)
*/
static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t flags,
- void *tvlv_value,
- uint16_t tvlv_value_len)
+ u8 flags,
+ void *tvlv_value, u16 tvlv_value_len)
{
if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND)
- orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_DAT;
+ clear_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities);
else
- orig->capabilities |= BATADV_ORIG_CAPA_HAS_DAT;
+ set_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities);
}
/**
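The DAT hunks above (test_bit here and further up) stop treating orig->capabilities as a plain bitmask and use the atomic bit helpers instead, so concurrent handlers cannot lose each other's capability updates. A minimal sketch of the pattern with a hypothetical bit number:

#include <linux/bitops.h>
#include <linux/types.h>

#define EXAMPLE_CAPA_HAS_DAT	0	/* hypothetical bit number */

static void example_update_capa(unsigned long *capabilities, bool present)
{
	if (present)
		set_bit(EXAMPLE_CAPA_HAS_DAT, capabilities);	/* atomic set */
	else
		clear_bit(EXAMPLE_CAPA_HAS_DAT, capabilities);	/* atomic clear */
}

static bool example_has_capa(const unsigned long *capabilities)
{
	return test_bit(EXAMPLE_CAPA_HAS_DAT, capabilities);
}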
@@ -755,7 +790,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset)
struct hlist_head *head;
unsigned long last_seen_jiffies;
int last_seen_msecs, last_seen_secs, last_seen_mins;
- uint32_t i;
+ u32 i;
primary_if = batadv_seq_print_text_primary_if_get(seq);
if (!primary_if)
@@ -798,14 +833,14 @@ out:
*
* Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise.
*/
-static uint16_t batadv_arp_get_type(struct batadv_priv *bat_priv,
- struct sk_buff *skb, int hdr_size)
+static u16 batadv_arp_get_type(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
{
struct arphdr *arphdr;
struct ethhdr *ethhdr;
__be32 ip_src, ip_dst;
- uint8_t *hw_src, *hw_dst;
- uint16_t type = 0;
+ u8 *hw_src, *hw_dst;
+ u16 type = 0;
/* pull the ethernet header */
if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN)))
@@ -902,9 +937,9 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
struct sk_buff *skb)
{
- uint16_t type = 0;
+ u16 type = 0;
__be32 ip_dst, ip_src;
- uint8_t *hw_src;
+ u8 *hw_src;
bool ret = false;
struct batadv_dat_entry *dat_entry = NULL;
struct sk_buff *skb_new;
@@ -990,9 +1025,9 @@ out:
bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
struct sk_buff *skb, int hdr_size)
{
- uint16_t type;
+ u16 type;
__be32 ip_src, ip_dst;
- uint8_t *hw_src;
+ u8 *hw_src;
struct sk_buff *skb_new;
struct batadv_dat_entry *dat_entry = NULL;
bool ret = false;
@@ -1068,9 +1103,9 @@ out:
void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
struct sk_buff *skb)
{
- uint16_t type;
+ u16 type;
__be32 ip_src, ip_dst;
- uint8_t *hw_src, *hw_dst;
+ u8 *hw_src, *hw_dst;
int hdr_size = 0;
unsigned short vid;
@@ -1107,14 +1142,17 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @skb: packet to check
* @hdr_size: size of the encapsulation header
+ *
+ * Returns true if the packet was snooped and consumed by DAT. False if the
+ * packet has to be delivered to the interface
*/
bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
struct sk_buff *skb, int hdr_size)
{
- uint16_t type;
+ u16 type;
__be32 ip_src, ip_dst;
- uint8_t *hw_src, *hw_dst;
- bool ret = false;
+ u8 *hw_src, *hw_dst;
+ bool dropped = false;
unsigned short vid;
if (!atomic_read(&bat_priv->distributed_arp_table))
@@ -1143,12 +1181,17 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
/* if this REPLY is directed to a client of mine, let's deliver the
* packet to the interface
*/
- ret = !batadv_is_my_client(bat_priv, hw_dst, vid);
+ dropped = !batadv_is_my_client(bat_priv, hw_dst, vid);
+
+ /* if this REPLY is sent on behalf of a client of mine, let's drop the
+ * packet because the client will reply by itself
+ */
+ dropped |= batadv_is_my_client(bat_priv, hw_src, vid);
out:
- if (ret)
+ if (dropped)
kfree_skb(skb);
- /* if ret == false -> packet has to be delivered to the interface */
- return ret;
+ /* if dropped == false -> deliver to the interface */
+ return dropped;
}
/**
@@ -1162,7 +1205,7 @@ out:
bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
struct batadv_forw_packet *forw_packet)
{
- uint16_t type;
+ u16 type;
__be32 ip_dst;
struct batadv_dat_entry *dat_entry = NULL;
bool ret = false;
diff --git a/kernel/net/batman-adv/distributed-arp-table.h b/kernel/net/batman-adv/distributed-arp-table.h
index 2fe0764c6..26d4a525a 100644
--- a/kernel/net/batman-adv/distributed-arp-table.h
+++ b/kernel/net/batman-adv/distributed-arp-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*
@@ -18,12 +18,19 @@
#ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_
#define _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_
-#ifdef CONFIG_BATMAN_ADV_DAT
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
-#include "types.h"
#include "originator.h"
+#include "packet.h"
-#include <linux/if_arp.h>
+struct seq_file;
+struct sk_buff;
+
+#ifdef CONFIG_BATMAN_ADV_DAT
/* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */
#define BATADV_DAT_ADDR_MAX ((batadv_dat_addr_t)~(batadv_dat_addr_t)0)
@@ -47,7 +54,7 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
static inline void
batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node)
{
- uint32_t addr;
+ u32 addr;
addr = batadv_choose_orig(orig_node->orig, BATADV_DAT_ADDR_MAX);
orig_node->dat_addr = (batadv_dat_addr_t)addr;
@@ -62,7 +69,7 @@ static inline void
batadv_dat_init_own_addr(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if)
{
- uint32_t addr;
+ u32 addr;
addr = batadv_choose_orig(primary_if->net_dev->dev_addr,
BATADV_DAT_ADDR_MAX);
@@ -82,7 +89,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset);
* Updates the ethtool statistics for the received packet if it is a DAT subtype
*/
static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv,
- uint8_t subtype)
+ u8 subtype)
{
switch (subtype) {
case BATADV_P_DAT_DHT_GET:
@@ -162,7 +169,7 @@ static inline void batadv_dat_free(struct batadv_priv *bat_priv)
}
static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv,
- uint8_t subtype)
+ u8 subtype)
{
}
diff --git a/kernel/net/batman-adv/fragmentation.c b/kernel/net/batman-adv/fragmentation.c
index 3d1dcaa3e..700c96c82 100644
--- a/kernel/net/batman-adv/fragmentation.c
+++ b/kernel/net/batman-adv/fragmentation.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
*
@@ -15,12 +15,29 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
#include "fragmentation.h"
-#include "send.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+
+#include "hard-interface.h"
#include "originator.h"
+#include "packet.h"
#include "routing.h"
-#include "hard-interface.h"
+#include "send.h"
#include "soft-interface.h"
/**
@@ -50,7 +67,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
bool (*check_cb)(struct batadv_frag_table_entry *))
{
struct batadv_frag_table_entry *chain;
- uint8_t i;
+ u8 i;
for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
chain = &orig_node->fragments[i];
@@ -94,8 +111,10 @@ static int batadv_frag_size_limit(void)
* without searching for the right position.
*/
static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
- uint16_t seqno)
+ u16 seqno)
{
+ lockdep_assert_held(&chain->lock);
+
if (chain->seqno == seqno)
return false;
@@ -129,8 +148,8 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
struct batadv_frag_list_entry *frag_entry_new = NULL, *frag_entry_curr;
struct batadv_frag_list_entry *frag_entry_last = NULL;
struct batadv_frag_packet *frag_packet;
- uint8_t bucket;
- uint16_t seqno, hdr_size = sizeof(struct batadv_frag_packet);
+ u8 bucket;
+ u16 seqno, hdr_size = sizeof(struct batadv_frag_packet);
bool ret = false;
/* Linearize packet to avoid linearizing 16 packets in a row when doing
@@ -161,6 +180,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
hlist_add_head(&frag_entry_new->list, &chain->head);
chain->size = skb->len - hdr_size;
chain->timestamp = jiffies;
+ chain->total_size = ntohs(frag_packet->total_size);
ret = true;
goto out;
}
@@ -195,9 +215,11 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
out:
if (chain->size > batadv_frag_size_limit() ||
- ntohs(frag_packet->total_size) > batadv_frag_size_limit()) {
+ chain->total_size != ntohs(frag_packet->total_size) ||
+ chain->total_size > batadv_frag_size_limit()) {
/* Clear chain if total size of either the list or the packet
- * exceeds the maximum size of one merged packet.
+ * exceeds the maximum size of one merged packet. Don't allow
+ * packets to have different total_size.
*/
batadv_frag_clear_chain(&chain->head);
chain->size = 0;
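The fragment-insertion hunks above record the advertised total_size of the merged packet when a chain is created and clear the chain as soon as a later fragment disagrees with it or either size exceeds the per-packet limit. A simplified sketch of that consistency check, assuming a cut-down chain struct rather than the in-tree one:

#include <linux/types.h>

/* hypothetical, cut-down view of the per-chain bookkeeping */
struct example_frag_chain {
	unsigned int size;	/* bytes buffered so far */
	u16 total_size;		/* advertised size of the merged packet */
};

static bool example_chain_valid(const struct example_frag_chain *chain,
				u16 packet_total_size, int size_limit)
{
	if (chain->size > size_limit)
		return false;	/* buffered more than one merged packet can hold */

	if (chain->total_size != packet_total_size)
		return false;	/* fragments disagree about the final size */

	if (chain->total_size > size_limit)
		return false;	/* advertised size itself is out of bounds */

	return true;
}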
@@ -228,19 +250,13 @@ err:
* Returns the merged skb or NULL on error.
*/
static struct sk_buff *
-batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb)
+batadv_frag_merge_packets(struct hlist_head *chain)
{
struct batadv_frag_packet *packet;
struct batadv_frag_list_entry *entry;
struct sk_buff *skb_out = NULL;
int size, hdr_size = sizeof(struct batadv_frag_packet);
- /* Make sure incoming skb has non-bogus data. */
- packet = (struct batadv_frag_packet *)skb->data;
- size = ntohs(packet->total_size);
- if (size > batadv_frag_size_limit())
- goto free;
-
/* Remove first entry, as this is the destination for the rest of the
* fragments.
*/
@@ -249,6 +265,9 @@ batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb)
skb_out = entry->skb;
kfree(entry);
+ packet = (struct batadv_frag_packet *)skb_out->data;
+ size = ntohs(packet->total_size);
+
/* Make room for the rest of the fragments. */
if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) {
kfree_skb(skb_out);
@@ -304,7 +323,7 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb,
if (hlist_empty(&head))
goto out;
- skb_out = batadv_frag_merge_packets(&head, *skb);
+ skb_out = batadv_frag_merge_packets(&head);
if (!skb_out)
goto out_err;
@@ -335,7 +354,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
struct batadv_orig_node *orig_node_dst = NULL;
struct batadv_neigh_node *neigh_node = NULL;
struct batadv_frag_packet *packet;
- uint16_t total_size;
+ u16 total_size;
bool ret = false;
packet = (struct batadv_frag_packet *)skb->data;
diff --git a/kernel/net/batman-adv/fragmentation.h b/kernel/net/batman-adv/fragmentation.h
index d848cf667..8b9877e70 100644
--- a/kernel/net/batman-adv/fragmentation.h
+++ b/kernel/net/batman-adv/fragmentation.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
*
@@ -18,6 +18,15 @@
#ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_
#define _NET_BATMAN_ADV_FRAGMENTATION_H_
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+struct sk_buff;
+
void batadv_frag_purge_orig(struct batadv_orig_node *orig,
bool (*check_cb)(struct batadv_frag_table_entry *));
bool batadv_frag_skb_fwd(struct sk_buff *skb,
diff --git a/kernel/net/batman-adv/gateway_client.c b/kernel/net/batman-adv/gateway_client.c
index 090828cf1..e6c8382c7 100644
--- a/kernel/net/batman-adv/gateway_client.c
+++ b/kernel/net/batman-adv/gateway_client.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -15,18 +15,37 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
-#include "sysfs.h"
#include "gateway_client.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/udp.h>
+
#include "gateway_common.h"
#include "hard-interface.h"
#include "originator.h"
-#include "translation-table.h"
+#include "packet.h"
#include "routing.h"
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/udp.h>
-#include <linux/if_vlan.h>
+#include "sysfs.h"
+#include "translation-table.h"
/* These are the offsets of the "hw type" and "hw address length" in the dhcp
* packet starting at the beginning of the dhcp header
@@ -133,20 +152,14 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
struct batadv_neigh_node *router;
struct batadv_neigh_ifinfo *router_ifinfo;
struct batadv_gw_node *gw_node, *curr_gw = NULL;
- uint32_t max_gw_factor = 0, tmp_gw_factor = 0;
- uint32_t gw_divisor;
- uint8_t max_tq = 0;
- uint8_t tq_avg;
+ u64 max_gw_factor = 0;
+ u64 tmp_gw_factor = 0;
+ u8 max_tq = 0;
+ u8 tq_avg;
struct batadv_orig_node *orig_node;
- gw_divisor = BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE;
- gw_divisor *= 64;
-
rcu_read_lock();
hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
- if (gw_node->deleted)
- continue;
-
orig_node = gw_node->orig_node;
router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
if (!router)
@@ -167,7 +180,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
tmp_gw_factor = tq_avg * tq_avg;
tmp_gw_factor *= gw_node->bandwidth_down;
tmp_gw_factor *= 100 * 100;
- tmp_gw_factor /= gw_divisor;
+ tmp_gw_factor >>= 18;
if ((tmp_gw_factor > max_gw_factor) ||
((tmp_gw_factor == max_gw_factor) &&
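The divisor removed above was gw_divisor = BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE * 64; with the window size of 64 used by batman-adv that is

	64 * 64 * 64 = 262144 = 2^18,

so tmp_gw_factor >>= 18 divides by exactly the same constant. Widening tmp_gw_factor and max_gw_factor to u64 matters because tq_avg * tq_avg * bandwidth_down * 100 * 100 can exceed 32 bits for fast gateways before the shift is applied.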
@@ -247,7 +260,8 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv)
void batadv_gw_election(struct batadv_priv *bat_priv)
{
- struct batadv_gw_node *curr_gw = NULL, *next_gw = NULL;
+ struct batadv_gw_node *curr_gw = NULL;
+ struct batadv_gw_node *next_gw = NULL;
struct batadv_neigh_node *router = NULL;
struct batadv_neigh_ifinfo *router_ifinfo = NULL;
char gw_addr[18] = { '\0' };
@@ -331,8 +345,9 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv,
struct batadv_neigh_ifinfo *router_orig_tq = NULL;
struct batadv_neigh_ifinfo *router_gw_tq = NULL;
struct batadv_orig_node *curr_gw_orig;
- struct batadv_neigh_node *router_gw = NULL, *router_orig = NULL;
- uint8_t gw_tq_avg, orig_tq_avg;
+ struct batadv_neigh_node *router_gw = NULL;
+ struct batadv_neigh_node *router_orig = NULL;
+ u8 gw_tq_avg, orig_tq_avg;
curr_gw_orig = batadv_gw_get_selected_orig(bat_priv);
if (!curr_gw_orig)
@@ -419,6 +434,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
INIT_HLIST_NODE(&gw_node->list);
gw_node->orig_node = orig_node;
+ gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
+ gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
atomic_set(&gw_node->refcount, 1);
spin_lock_bh(&bat_priv->gw.list_lock);
@@ -452,9 +469,6 @@ batadv_gw_node_get(struct batadv_priv *bat_priv,
if (gw_node_tmp->orig_node != orig_node)
continue;
- if (gw_node_tmp->deleted)
- continue;
-
if (!atomic_inc_not_zero(&gw_node_tmp->refcount))
continue;
@@ -504,9 +518,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
- gw_node->deleted = 0;
if (ntohl(gateway->bandwidth_down) == 0) {
- gw_node->deleted = jiffies;
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Gateway %pM removed from gateway list\n",
orig_node->orig);
@@ -514,14 +526,21 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
/* Note: We don't need a NULL check here, since curr_gw never
* gets dereferenced.
*/
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ hlist_del_init_rcu(&gw_node->list);
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+
+ batadv_gw_node_free_ref(gw_node);
+
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
if (gw_node == curr_gw)
batadv_gw_reselect(bat_priv);
+
+ if (curr_gw)
+ batadv_gw_node_free_ref(curr_gw);
}
out:
- if (curr_gw)
- batadv_gw_node_free_ref(curr_gw);
if (gw_node)
batadv_gw_node_free_ref(gw_node);
}
@@ -537,39 +556,18 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv,
batadv_gw_node_update(bat_priv, orig_node, &gateway);
}
-void batadv_gw_node_purge(struct batadv_priv *bat_priv)
+void batadv_gw_node_free(struct batadv_priv *bat_priv)
{
- struct batadv_gw_node *gw_node, *curr_gw;
+ struct batadv_gw_node *gw_node;
struct hlist_node *node_tmp;
- unsigned long timeout = msecs_to_jiffies(2 * BATADV_PURGE_TIMEOUT);
- int do_reselect = 0;
-
- curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
spin_lock_bh(&bat_priv->gw.list_lock);
-
hlist_for_each_entry_safe(gw_node, node_tmp,
&bat_priv->gw.list, list) {
- if (((!gw_node->deleted) ||
- (time_before(jiffies, gw_node->deleted + timeout))) &&
- atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE)
- continue;
-
- if (curr_gw == gw_node)
- do_reselect = 1;
-
- hlist_del_rcu(&gw_node->list);
+ hlist_del_init_rcu(&gw_node->list);
batadv_gw_node_free_ref(gw_node);
}
-
spin_unlock_bh(&bat_priv->gw.list_lock);
-
- /* gw_reselect() needs to acquire the gw_list_lock */
- if (do_reselect)
- batadv_gw_reselect(bat_priv);
-
- if (curr_gw)
- batadv_gw_node_free_ref(curr_gw);
}
/* fails if orig_node has no router */
@@ -633,9 +631,6 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
rcu_read_lock();
hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
- if (gw_node->deleted)
- continue;
-
/* fails if orig_node has no router */
if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0)
continue;
@@ -670,7 +665,7 @@ out:
*/
enum batadv_dhcp_recipient
batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
- uint8_t *chaddr)
+ u8 *chaddr)
{
enum batadv_dhcp_recipient ret = BATADV_DHCP_NO;
struct ethhdr *ethhdr;
@@ -680,7 +675,7 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
struct vlan_ethhdr *vhdr;
int chaddr_offset;
__be16 proto;
- uint8_t *p;
+ u8 *p;
/* check for ethernet header */
if (!pskb_may_pull(skb, *header_len + ETH_HLEN))
@@ -733,11 +728,6 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr)))
return BATADV_DHCP_NO;
- /* skb->data might have been reallocated by pskb_may_pull() */
- ethhdr = eth_hdr(skb);
- if (ntohs(ethhdr->h_proto) == ETH_P_8021Q)
- ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN);
-
udphdr = (struct udphdr *)(skb->data + *header_len);
*header_len += sizeof(*udphdr);
@@ -795,13 +785,15 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
struct sk_buff *skb)
{
- struct batadv_neigh_node *neigh_curr = NULL, *neigh_old = NULL;
+ struct batadv_neigh_node *neigh_curr = NULL;
+ struct batadv_neigh_node *neigh_old = NULL;
struct batadv_orig_node *orig_dst_node = NULL;
- struct batadv_gw_node *gw_node = NULL, *curr_gw = NULL;
+ struct batadv_gw_node *gw_node = NULL;
+ struct batadv_gw_node *curr_gw = NULL;
struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo;
struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
bool out_of_range = false;
- uint8_t curr_tq_avg;
+ u8 curr_tq_avg;
unsigned short vid;
vid = batadv_get_vid(skb, 0);
diff --git a/kernel/net/batman-adv/gateway_client.h b/kernel/net/batman-adv/gateway_client.h
index 7ee53bb7d..fa9527785 100644
--- a/kernel/net/batman-adv/gateway_client.h
+++ b/kernel/net/batman-adv/gateway_client.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -18,6 +18,14 @@
#ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
#define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
+#include "main.h"
+
+#include <linux/types.h>
+
+struct batadv_tvlv_gateway_data;
+struct seq_file;
+struct sk_buff;
+
void batadv_gw_check_client_stop(struct batadv_priv *bat_priv);
void batadv_gw_reselect(struct batadv_priv *bat_priv);
void batadv_gw_election(struct batadv_priv *bat_priv);
@@ -30,11 +38,11 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
struct batadv_tvlv_gateway_data *gateway);
void batadv_gw_node_delete(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node);
-void batadv_gw_node_purge(struct batadv_priv *bat_priv);
+void batadv_gw_node_free(struct batadv_priv *bat_priv);
int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset);
bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb);
enum batadv_dhcp_recipient
batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
- uint8_t *chaddr);
+ u8 *chaddr);
#endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */
diff --git a/kernel/net/batman-adv/gateway_common.c b/kernel/net/batman-adv/gateway_common.c
index 88a1bc380..0cb5e6b6f 100644
--- a/kernel/net/batman-adv/gateway_common.c
+++ b/kernel/net/batman-adv/gateway_common.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -15,9 +15,20 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
#include "gateway_common.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/byteorder/generic.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/netdevice.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+
#include "gateway_client.h"
+#include "packet.h"
/**
* batadv_parse_gw_bandwidth - parse supplied string buffer to extract download
@@ -30,11 +41,11 @@
* Returns false on parse error and true otherwise.
*/
static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
- uint32_t *down, uint32_t *up)
+ u32 *down, u32 *up)
{
enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT;
char *slash_ptr, *tmp_ptr;
- long ldown, lup;
+ u64 ldown, lup;
int ret;
slash_ptr = strchr(buff, '/');
@@ -52,7 +63,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
*tmp_ptr = '\0';
}
- ret = kstrtol(buff, 10, &ldown);
+ ret = kstrtou64(buff, 10, &ldown);
if (ret) {
batadv_err(net_dev,
"Download speed of gateway mode invalid: %s\n",
@@ -62,14 +73,31 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
switch (bw_unit_type) {
case BATADV_BW_UNIT_MBIT:
- *down = ldown * 10;
+ /* prevent overflow */
+ if (U64_MAX / 10 < ldown) {
+ batadv_err(net_dev,
+ "Download speed of gateway mode too large: %s\n",
+ buff);
+ return false;
+ }
+
+ ldown *= 10;
break;
case BATADV_BW_UNIT_KBIT:
default:
- *down = ldown / 100;
+ ldown = div_u64(ldown, 100);
break;
}
+ if (U32_MAX < ldown) {
+ batadv_err(net_dev,
+ "Download speed of gateway mode too large: %s\n",
+ buff);
+ return false;
+ }
+
+ *down = ldown;
+
/* we also got some upload info */
if (slash_ptr) {
bw_unit_type = BATADV_BW_UNIT_KBIT;
@@ -85,7 +113,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
*tmp_ptr = '\0';
}
- ret = kstrtol(slash_ptr + 1, 10, &lup);
+ ret = kstrtou64(slash_ptr + 1, 10, &lup);
if (ret) {
batadv_err(net_dev,
"Upload speed of gateway mode invalid: %s\n",
@@ -95,13 +123,30 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
switch (bw_unit_type) {
case BATADV_BW_UNIT_MBIT:
- *up = lup * 10;
+ /* prevent overflow */
+ if (U64_MAX / 10 < lup) {
+ batadv_err(net_dev,
+ "Upload speed of gateway mode too large: %s\n",
+ slash_ptr + 1);
+ return false;
+ }
+
+ lup *= 10;
break;
case BATADV_BW_UNIT_KBIT:
default:
- *up = lup / 100;
+ lup = div_u64(lup, 100);
break;
}
+
+ if (U32_MAX < lup) {
+ batadv_err(net_dev,
+ "Upload speed of gateway mode too large: %s\n",
+ slash_ptr + 1);
+ return false;
+ }
+
+ *up = lup;
}
return true;
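The parser above now reads into a u64 with kstrtou64(), guards the Mbit-to-100-kbit conversion (the multiplication by 10) against u64 overflow, and rejects any result that does not fit the u32 finally stored in *down / *up. A minimal sketch of that pattern for a single value, as a hypothetical helper rather than the in-tree function:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/types.h>

/* parse one bandwidth value into multiples of 100 kbit/s */
static int example_parse_bw(const char *buff, bool is_mbit, u32 *out)
{
	u64 val;
	int ret;

	ret = kstrtou64(buff, 10, &val);
	if (ret)
		return ret;

	if (is_mbit) {
		if (U64_MAX / 10 < val)		/* the * 10 would overflow */
			return -ERANGE;
		val *= 10;			/* Mbit/s -> 100 kbit/s units */
	} else {
		val = div_u64(val, 100);	/* kbit/s -> 100 kbit/s units */
	}

	if (U32_MAX < val)			/* does not fit the stored u32 */
		return -ERANGE;

	*out = val;
	return 0;
}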
@@ -115,7 +160,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv)
{
struct batadv_tvlv_gateway_data gw;
- uint32_t down, up;
+ u32 down, up;
char gw_mode;
gw_mode = atomic_read(&bat_priv->gw_mode);
@@ -140,7 +185,10 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
size_t count)
{
struct batadv_priv *bat_priv = netdev_priv(net_dev);
- uint32_t down_curr, up_curr, down_new = 0, up_new = 0;
+ u32 down_curr;
+ u32 up_curr;
+ u32 down_new = 0;
+ u32 up_new = 0;
bool ret;
down_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_down);
@@ -148,7 +196,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
ret = batadv_parse_gw_bandwidth(net_dev, buff, &down_new, &up_new);
if (!ret)
- goto end;
+ return -EINVAL;
if (!down_new)
down_new = 1;
@@ -172,7 +220,6 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
atomic_set(&bat_priv->gw.bandwidth_up, up_new);
batadv_gw_tvlv_container_update(bat_priv);
-end:
return count;
}
@@ -186,9 +233,8 @@ end:
*/
static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t flags,
- void *tvlv_value,
- uint16_t tvlv_value_len)
+ u8 flags,
+ void *tvlv_value, u16 tvlv_value_len)
{
struct batadv_tvlv_gateway_data gateway, *gateway_ptr;
diff --git a/kernel/net/batman-adv/gateway_common.h b/kernel/net/batman-adv/gateway_common.h
index aa5116561..ab893e318 100644
--- a/kernel/net/batman-adv/gateway_common.h
+++ b/kernel/net/batman-adv/gateway_common.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -18,6 +18,12 @@
#ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_
#define _NET_BATMAN_ADV_GATEWAY_COMMON_H_
+#include "main.h"
+
+#include <linux/types.h>
+
+struct net_device;
+
enum batadv_gw_modes {
BATADV_GW_MODE_OFF,
BATADV_GW_MODE_CLIENT,
diff --git a/kernel/net/batman-adv/hard-interface.c b/kernel/net/batman-adv/hard-interface.c
index baf1f9843..f11345e16 100644
--- a/kernel/net/batman-adv/hard-interface.c
+++ b/kernel/net/batman-adv/hard-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -15,22 +15,36 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
-#include "distributed-arp-table.h"
#include "hard-interface.h"
-#include "soft-interface.h"
-#include "send.h"
-#include "translation-table.h"
-#include "routing.h"
-#include "sysfs.h"
-#include "debugfs.h"
-#include "originator.h"
-#include "hash.h"
-#include "bridge_loop_avoidance.h"
-#include "gateway_client.h"
+#include "main.h"
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
+#include <linux/if.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <net/net_namespace.h>
+
+#include "bridge_loop_avoidance.h"
+#include "debugfs.h"
+#include "distributed-arp-table.h"
+#include "gateway_client.h"
+#include "originator.h"
+#include "packet.h"
+#include "send.h"
+#include "soft-interface.h"
+#include "sysfs.h"
+#include "translation-table.h"
void batadv_hardif_free_rcu(struct rcu_head *rcu)
{
@@ -238,6 +252,44 @@ static void batadv_check_known_mac_addr(const struct net_device *net_dev)
rcu_read_unlock();
}
+/**
+ * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom
+ * @soft_iface: netdev struct of the mesh interface
+ */
+static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface)
+{
+ const struct batadv_hard_iface *hard_iface;
+ unsigned short lower_header_len = ETH_HLEN;
+ unsigned short lower_headroom = 0;
+ unsigned short lower_tailroom = 0;
+ unsigned short needed_headroom;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
+ continue;
+
+ if (hard_iface->soft_iface != soft_iface)
+ continue;
+
+ lower_header_len = max_t(unsigned short, lower_header_len,
+ hard_iface->net_dev->hard_header_len);
+
+ lower_headroom = max_t(unsigned short, lower_headroom,
+ hard_iface->net_dev->needed_headroom);
+
+ lower_tailroom = max_t(unsigned short, lower_tailroom,
+ hard_iface->net_dev->needed_tailroom);
+ }
+ rcu_read_unlock();
+
+ needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN);
+ needed_headroom += batadv_max_header_len();
+
+ soft_iface->needed_headroom = needed_headroom;
+ soft_iface->needed_tailroom = lower_tailroom;
+}
+
int batadv_hardif_min_mtu(struct net_device *soft_iface)
{
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
@@ -460,6 +512,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
"Not using interface %s (retrying later): interface not active\n",
hard_iface->net_dev->name);
+ batadv_hardif_recalc_extra_skbroom(soft_iface);
+
/* begin scheduling originator messages on that interface */
batadv_schedule_bat_ogm(hard_iface);
@@ -514,6 +568,9 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
batadv_purge_outstanding_packets(bat_priv, hard_iface);
dev_put(hard_iface->soft_iface);
+ netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface);
+ batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface);
+
/* nobody uses this interface anymore */
if (!bat_priv->num_ifaces) {
batadv_gw_check_client_stop(bat_priv);
@@ -522,7 +579,6 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
batadv_softif_destroy_sysfs(hard_iface->soft_iface);
}
- netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface);
hard_iface->soft_iface = NULL;
batadv_hardif_free_ref(hard_iface);
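batadv_hardif_recalc_extra_skbroom(), added above, walks all active lower interfaces and takes the worst-case header length, headroom and tailroom, so that skbs built for the soft interface do not need reallocation in the transmit path. A reduced sketch of the same aggregation, assuming a hypothetical array of lower devices instead of the RCU-protected hardif list:

static void example_recalc_room(struct net_device *soft_iface,
				struct net_device **lower, int n_lower)
{
	unsigned short header_len = ETH_HLEN;
	unsigned short headroom = 0;
	unsigned short tailroom = 0;
	int i;

	for (i = 0; i < n_lower; i++) {
		header_len = max_t(unsigned short, header_len,
				   lower[i]->hard_header_len);
		headroom = max_t(unsigned short, headroom,
				 lower[i]->needed_headroom);
		tailroom = max_t(unsigned short, tailroom,
				 lower[i]->needed_tailroom);
	}

	/* batman-adv's own encapsulation overhead comes on top */
	soft_iface->needed_headroom = headroom + (header_len - ETH_HLEN) +
				      batadv_max_header_len();
	soft_iface->needed_tailroom = tailroom;
}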
diff --git a/kernel/net/batman-adv/hard-interface.h b/kernel/net/batman-adv/hard-interface.h
index 1918cd50b..7b12ea8ea 100644
--- a/kernel/net/batman-adv/hard-interface.h
+++ b/kernel/net/batman-adv/hard-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -18,6 +18,17 @@
#ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_
#define _NET_BATMAN_ADV_HARD_INTERFACE_H_
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+struct net_device;
+
enum batadv_hard_if_state {
BATADV_IF_NOT_IN_USE,
BATADV_IF_TO_BE_REMOVED,
@@ -64,18 +75,6 @@ batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface)
call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu);
}
-/**
- * batadv_hardif_free_ref_now - decrement the hard interface refcounter and
- * possibly free it (without rcu callback)
- * @hard_iface: the hard interface to free
- */
-static inline void
-batadv_hardif_free_ref_now(struct batadv_hard_iface *hard_iface)
-{
- if (atomic_dec_and_test(&hard_iface->refcount))
- batadv_hardif_free_rcu(&hard_iface->rcu);
-}
-
static inline struct batadv_hard_iface *
batadv_primary_if_get_selected(struct batadv_priv *bat_priv)
{
diff --git a/kernel/net/batman-adv/hash.c b/kernel/net/batman-adv/hash.c
index 7c1c63080..2ea6a18d7 100644
--- a/kernel/net/batman-adv/hash.c
+++ b/kernel/net/batman-adv/hash.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -15,13 +15,17 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
#include "hash.h"
+#include "main.h"
+
+#include <linux/fs.h>
+#include <linux/lockdep.h>
+#include <linux/slab.h>
/* clears the hash */
static void batadv_hash_init(struct batadv_hashtable *hash)
{
- uint32_t i;
+ u32 i;
for (i = 0; i < hash->size; i++) {
INIT_HLIST_HEAD(&hash->table[i]);
@@ -38,7 +42,7 @@ void batadv_hash_destroy(struct batadv_hashtable *hash)
}
/* allocates and clears the hash */
-struct batadv_hashtable *batadv_hash_new(uint32_t size)
+struct batadv_hashtable *batadv_hash_new(u32 size)
{
struct batadv_hashtable *hash;
@@ -69,7 +73,7 @@ free_hash:
void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
struct lock_class_key *key)
{
- uint32_t i;
+ u32 i;
for (i = 0; i < hash->size; i++)
lockdep_set_class(&hash->list_locks[i], key);
diff --git a/kernel/net/batman-adv/hash.h b/kernel/net/batman-adv/hash.h
index 539fc1266..377626250 100644
--- a/kernel/net/batman-adv/hash.h
+++ b/kernel/net/batman-adv/hash.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -18,7 +18,16 @@
#ifndef _NET_BATMAN_ADV_HASH_H_
#define _NET_BATMAN_ADV_HASH_H_
+#include "main.h"
+
+#include <linux/compiler.h>
#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+struct lock_class_key;
/* callback to a compare function. should compare 2 element datas for their
* keys, return 0 if same and not 0 if not same
@@ -30,17 +39,17 @@ typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *,
* based on the key in the data of the first
* argument and the size the second
*/
-typedef uint32_t (*batadv_hashdata_choose_cb)(const void *, uint32_t);
+typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32);
typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *);
struct batadv_hashtable {
struct hlist_head *table; /* the hashtable itself with the buckets */
spinlock_t *list_locks; /* spinlock for each hash list entry */
- uint32_t size; /* size of hashtable */
+ u32 size; /* size of hashtable */
};
/* allocates and clears the hash */
-struct batadv_hashtable *batadv_hash_new(uint32_t size);
+struct batadv_hashtable *batadv_hash_new(u32 size);
/* set class key for all locks */
void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
@@ -60,7 +69,7 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash,
struct hlist_head *head;
struct hlist_node *node, *node_tmp;
spinlock_t *list_lock; /* spinlock to protect write access */
- uint32_t i;
+ u32 i;
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
@@ -80,28 +89,6 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash,
}
/**
- * batadv_hash_bytes - hash some bytes and add them to the previous hash
- * @hash: previous hash value
- * @data: data to be hashed
- * @size: number of bytes to be hashed
- *
- * Returns the new hash value.
- */
-static inline uint32_t batadv_hash_bytes(uint32_t hash, const void *data,
- uint32_t size)
-{
- const unsigned char *key = data;
- int i;
-
- for (i = 0; i < size; i++) {
- hash += key[i];
- hash += (hash << 10);
- hash ^= (hash >> 6);
- }
- return hash;
-}
-
-/**
* batadv_hash_add - adds data to the hashtable
* @hash: storage hash table
* @compare: callback to determine if 2 hash elements are identical
@@ -118,7 +105,7 @@ static inline int batadv_hash_add(struct batadv_hashtable *hash,
const void *data,
struct hlist_node *data_node)
{
- uint32_t index;
+ u32 index;
int ret = -1;
struct hlist_head *head;
struct hlist_node *node;
@@ -162,7 +149,7 @@ static inline void *batadv_hash_remove(struct batadv_hashtable *hash,
batadv_hashdata_choose_cb choose,
void *data)
{
- uint32_t index;
+ u32 index;
struct hlist_node *node;
struct hlist_head *head;
void *data_save = NULL;
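The batadv_hash_bytes() helper removed above was an open-coded Jenkins one-at-a-time hash; its remaining user, the network-coding path hash further down in this patch, now feeds the same bytes to the generic jhash() instead. A hedged sketch of the replacement pattern with hypothetical field names:

#include <linux/jhash.h>

static u32 example_choose_bucket(const u8 *first, const u8 *second,
				 u32 key_len, u32 table_size)
{
	u32 hash = 0;

	/* chain both keys through jhash, then fold into the table size */
	hash = jhash(first, key_len, hash);
	hash = jhash(second, key_len, hash);

	return hash % table_size;
}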
diff --git a/kernel/net/batman-adv/icmp_socket.c b/kernel/net/batman-adv/icmp_socket.c
index 161ef8f17..bcabb5e3f 100644
--- a/kernel/net/batman-adv/icmp_socket.c
+++ b/kernel/net/batman-adv/icmp_socket.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -15,14 +15,39 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
+#include "icmp_socket.h"
#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
#include <linux/debugfs.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/export.h>
+#include <linux/fcntl.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <linux/poll.h>
+#include <linux/printk.h>
+#include <linux/sched.h> /* for linux/wait.h */
+#include <linux/skbuff.h>
#include <linux/slab.h>
-#include "icmp_socket.h"
-#include "send.h"
-#include "hash.h"
-#include "originator.h"
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/wait.h>
+
#include "hard-interface.h"
+#include "originator.h"
+#include "packet.h"
+#include "send.h"
static struct batadv_socket_client *batadv_socket_client_hash[256];
@@ -158,7 +183,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff,
struct batadv_orig_node *orig_node = NULL;
struct batadv_neigh_node *neigh_node = NULL;
size_t packet_len = sizeof(struct batadv_icmp_packet);
- uint8_t *addr;
+ u8 *addr;
if (len < sizeof(struct batadv_icmp_header)) {
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -312,8 +337,8 @@ err:
}
/**
- * batadv_socket_receive_packet - schedule an icmp packet to be sent to userspace
- * on an icmp socket.
+ * batadv_socket_receive_packet - schedule an icmp packet to be sent to
+ * userspace on an icmp socket.
* @socket_client: the socket this packet belongs to
* @icmph: pointer to the header of the icmp packet
* @icmp_len: total length of the icmp packet
diff --git a/kernel/net/batman-adv/icmp_socket.h b/kernel/net/batman-adv/icmp_socket.h
index 0c33950aa..e937143f0 100644
--- a/kernel/net/batman-adv/icmp_socket.h
+++ b/kernel/net/batman-adv/icmp_socket.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -18,6 +18,12 @@
#ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_
#define _NET_BATMAN_ADV_ICMP_SOCKET_H_
+#include "main.h"
+
+#include <linux/types.h>
+
+struct batadv_icmp_header;
+
#define BATADV_ICMP_SOCKET "socket"
void batadv_socket_init(void);
diff --git a/kernel/net/batman-adv/main.c b/kernel/net/batman-adv/main.c
index 12fc77bef..d7f17c1aa 100644
--- a/kernel/net/batman-adv/main.c
+++ b/kernel/net/batman-adv/main.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -15,31 +15,54 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
#include <linux/crc32c.h>
-#include <linux/highmem.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
#include <linux/if_vlan.h>
-#include <net/ip.h>
-#include <net/ipv6.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
#include <net/dsfield.h>
-#include "main.h"
-#include "sysfs.h"
+#include <net/rtnetlink.h>
+
+#include "bat_algo.h"
+#include "bridge_loop_avoidance.h"
#include "debugfs.h"
+#include "distributed-arp-table.h"
+#include "gateway_client.h"
+#include "gateway_common.h"
+#include "hard-interface.h"
+#include "icmp_socket.h"
+#include "multicast.h"
+#include "network-coding.h"
+#include "originator.h"
+#include "packet.h"
#include "routing.h"
#include "send.h"
-#include "originator.h"
#include "soft-interface.h"
-#include "icmp_socket.h"
#include "translation-table.h"
-#include "hard-interface.h"
-#include "gateway_client.h"
-#include "bridge_loop_avoidance.h"
-#include "distributed-arp-table.h"
-#include "multicast.h"
-#include "gateway_common.h"
-#include "hash.h"
-#include "bat_algo.h"
-#include "network-coding.h"
-#include "fragmentation.h"
/* List manipulations on hardif_list have to be rtnl_lock()'ed,
* list traversals just rcu-locked
@@ -126,7 +149,7 @@ int batadv_mesh_init(struct net_device *soft_iface)
INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv6_list);
#endif
INIT_LIST_HEAD(&bat_priv->tt.changes_list);
- INIT_LIST_HEAD(&bat_priv->tt.req_list);
+ INIT_HLIST_HEAD(&bat_priv->tt.req_list);
INIT_LIST_HEAD(&bat_priv->tt.roam_list);
#ifdef CONFIG_BATMAN_ADV_MCAST
INIT_HLIST_HEAD(&bat_priv->mcast.mla_list);
@@ -176,7 +199,7 @@ void batadv_mesh_free(struct net_device *soft_iface)
batadv_purge_outstanding_packets(bat_priv, NULL);
- batadv_gw_node_purge(bat_priv);
+ batadv_gw_node_free(bat_priv);
batadv_nc_mesh_free(bat_priv);
batadv_dat_free(bat_priv);
batadv_bla_free(bat_priv);
@@ -209,10 +232,13 @@ void batadv_mesh_free(struct net_device *soft_iface)
* interfaces in the current mesh
* @bat_priv: the bat priv with all the soft interface information
* @addr: the address to check
+ *
+ * Returns 'true' if the mac address was found, false otherwise.
*/
-int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr)
+bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
{
const struct batadv_hard_iface *hard_iface;
+ bool is_my_mac = false;
rcu_read_lock();
list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
@@ -223,12 +249,12 @@ int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr)
continue;
if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) {
- rcu_read_unlock();
- return 1;
+ is_my_mac = true;
+ break;
}
}
rcu_read_unlock();
- return 0;
+ return is_my_mac;
}
/**
@@ -362,7 +388,7 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
struct batadv_priv *bat_priv;
struct batadv_ogm_packet *batadv_ogm_packet;
struct batadv_hard_iface *hard_iface;
- uint8_t idx;
+ u8 idx;
int ret;
hard_iface = container_of(ptype, struct batadv_hard_iface,
@@ -471,7 +497,7 @@ static void batadv_recv_handler_init(void)
}
int
-batadv_recv_handler_register(uint8_t packet_type,
+batadv_recv_handler_register(u8 packet_type,
int (*recv_handler)(struct sk_buff *,
struct batadv_hard_iface *))
{
@@ -487,7 +513,7 @@ batadv_recv_handler_register(uint8_t packet_type,
return 0;
}
-void batadv_recv_handler_unregister(uint8_t packet_type)
+void batadv_recv_handler_unregister(u8 packet_type)
{
batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet;
}
@@ -510,14 +536,12 @@ static struct batadv_algo_ops *batadv_algo_get(char *name)
int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
{
struct batadv_algo_ops *bat_algo_ops_tmp;
- int ret;
bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name);
if (bat_algo_ops_tmp) {
pr_info("Trying to register already registered routing algorithm: %s\n",
bat_algo_ops->name);
- ret = -EEXIST;
- goto out;
+ return -EEXIST;
}
/* all algorithms must implement all ops (for now) */
@@ -531,32 +555,26 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
!bat_algo_ops->bat_neigh_is_equiv_or_better) {
pr_info("Routing algo '%s' does not implement required ops\n",
bat_algo_ops->name);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
INIT_HLIST_NODE(&bat_algo_ops->list);
hlist_add_head(&bat_algo_ops->list, &batadv_algo_list);
- ret = 0;
-out:
- return ret;
+ return 0;
}
int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
{
struct batadv_algo_ops *bat_algo_ops;
- int ret = -EINVAL;
bat_algo_ops = batadv_algo_get(name);
if (!bat_algo_ops)
- goto out;
+ return -EINVAL;
bat_priv->bat_algo_ops = bat_algo_ops;
- ret = 0;
-out:
- return ret;
+ return 0;
}
int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
@@ -566,7 +584,7 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
seq_puts(seq, "Available routing algorithms:\n");
hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
- seq_printf(seq, "%s\n", bat_algo_ops->name);
+ seq_printf(seq, " * %s\n", bat_algo_ops->name);
}
return 0;
@@ -625,8 +643,7 @@ batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler)
* Returns tvlv handler if found or NULL otherwise.
*/
static struct batadv_tvlv_handler
-*batadv_tvlv_handler_get(struct batadv_priv *bat_priv,
- uint8_t type, uint8_t version)
+*batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version)
{
struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL;
@@ -674,8 +691,7 @@ static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv)
* Returns tvlv container if found or NULL otherwise.
*/
static struct batadv_tvlv_container
-*batadv_tvlv_container_get(struct batadv_priv *bat_priv,
- uint8_t type, uint8_t version)
+*batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
{
struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL;
@@ -706,10 +722,10 @@ static struct batadv_tvlv_container
*
* Returns size of all currently registered tvlv containers in bytes.
*/
-static uint16_t batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
+static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
{
struct batadv_tvlv_container *tvlv;
- uint16_t tvlv_len = 0;
+ u16 tvlv_len = 0;
hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
tvlv_len += sizeof(struct batadv_tvlv_hdr);
@@ -722,13 +738,17 @@ static uint16_t batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
/**
* batadv_tvlv_container_remove - remove tvlv container from the tvlv container
* list
+ * @bat_priv: the bat priv with all the soft interface information
* @tvlv: the to be removed tvlv container
*
* Has to be called with the appropriate locks being acquired
* (tvlv.container_list_lock).
*/
-static void batadv_tvlv_container_remove(struct batadv_tvlv_container *tvlv)
+static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_container *tvlv)
{
+ lockdep_assert_held(&bat_priv->tvlv.handler_list_lock);
+
if (!tvlv)
return;
@@ -747,13 +767,13 @@ static void batadv_tvlv_container_remove(struct batadv_tvlv_container *tvlv)
* @version: tvlv container type to unregister
*/
void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
- uint8_t type, uint8_t version)
+ u8 type, u8 version)
{
struct batadv_tvlv_container *tvlv;
spin_lock_bh(&bat_priv->tvlv.container_list_lock);
tvlv = batadv_tvlv_container_get(bat_priv, type, version);
- batadv_tvlv_container_remove(tvlv);
+ batadv_tvlv_container_remove(bat_priv, tvlv);
spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
}
@@ -770,8 +790,8 @@ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
* content is going to replace the old one.
*/
void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
- uint8_t type, uint8_t version,
- void *tvlv_value, uint16_t tvlv_value_len)
+ u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len)
{
struct batadv_tvlv_container *tvlv_old, *tvlv_new;
@@ -792,7 +812,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
spin_lock_bh(&bat_priv->tvlv.container_list_lock);
tvlv_old = batadv_tvlv_container_get(bat_priv, type, version);
- batadv_tvlv_container_remove(tvlv_old);
+ batadv_tvlv_container_remove(bat_priv, tvlv_old);
hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list);
spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
}
@@ -819,15 +839,15 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC);
/* keep old buffer if kmalloc should fail */
- if (new_buff) {
- memcpy(new_buff, *packet_buff, min_packet_len);
- kfree(*packet_buff);
- *packet_buff = new_buff;
- *packet_buff_len = min_packet_len + additional_packet_len;
- return true;
- }
+ if (!new_buff)
+ return false;
+
+ memcpy(new_buff, *packet_buff, min_packet_len);
+ kfree(*packet_buff);
+ *packet_buff = new_buff;
+ *packet_buff_len = min_packet_len + additional_packet_len;
- return false;
+ return true;
}
/**
@@ -844,14 +864,13 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
*
* Returns size of all appended tvlv containers in bytes.
*/
-uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
- unsigned char **packet_buff,
- int *packet_buff_len,
- int packet_min_len)
+u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+ unsigned char **packet_buff,
+ int *packet_buff_len, int packet_min_len)
{
struct batadv_tvlv_container *tvlv;
struct batadv_tvlv_hdr *tvlv_hdr;
- uint16_t tvlv_value_len;
+ u16 tvlv_value_len;
void *tvlv_value;
bool ret;
@@ -876,7 +895,7 @@ uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
tvlv_hdr->len = tvlv->tvlv_hdr.len;
tvlv_value = tvlv_hdr + 1;
memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len));
- tvlv_value = (uint8_t *)tvlv_value + ntohs(tvlv->tvlv_hdr.len);
+ tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len);
}
end:
@@ -903,8 +922,8 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
struct batadv_tvlv_handler *tvlv_handler,
bool ogm_source,
struct batadv_orig_node *orig_node,
- uint8_t *src, uint8_t *dst,
- void *tvlv_value, uint16_t tvlv_value_len)
+ u8 *src, u8 *dst,
+ void *tvlv_value, u16 tvlv_value_len)
{
if (!tvlv_handler)
return NET_RX_SUCCESS;
@@ -955,13 +974,13 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
bool ogm_source,
struct batadv_orig_node *orig_node,
- uint8_t *src, uint8_t *dst,
- void *tvlv_value, uint16_t tvlv_value_len)
+ u8 *src, u8 *dst,
+ void *tvlv_value, u16 tvlv_value_len)
{
struct batadv_tvlv_handler *tvlv_handler;
struct batadv_tvlv_hdr *tvlv_hdr;
- uint16_t tvlv_value_cont_len;
- uint8_t cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND;
+ u16 tvlv_value_cont_len;
+ u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND;
int ret = NET_RX_SUCCESS;
while (tvlv_value_len >= sizeof(*tvlv_hdr)) {
@@ -983,7 +1002,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
tvlv_value_cont_len);
if (tvlv_handler)
batadv_tvlv_handler_free_ref(tvlv_handler);
- tvlv_value = (uint8_t *)tvlv_value + tvlv_value_cont_len;
+ tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len;
tvlv_value_len -= tvlv_value_cont_len;
}
@@ -1017,7 +1036,7 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node)
{
void *tvlv_value;
- uint16_t tvlv_value_len;
+ u16 tvlv_value_len;
if (!batadv_ogm_packet)
return;
@@ -1049,14 +1068,14 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
void (*optr)(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t flags,
+ u8 flags,
void *tvlv_value,
- uint16_t tvlv_value_len),
+ u16 tvlv_value_len),
int (*uptr)(struct batadv_priv *bat_priv,
- uint8_t *src, uint8_t *dst,
+ u8 *src, u8 *dst,
void *tvlv_value,
- uint16_t tvlv_value_len),
- uint8_t type, uint8_t version, uint8_t flags)
+ u16 tvlv_value_len),
+ u8 type, u8 version, u8 flags)
{
struct batadv_tvlv_handler *tvlv_handler;
@@ -1091,7 +1110,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
* @version: tvlv handler version to be unregistered
*/
void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
- uint8_t type, uint8_t version)
+ u8 type, u8 version)
{
struct batadv_tvlv_handler *tvlv_handler;
@@ -1117,9 +1136,9 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
* @tvlv_value: tvlv content
* @tvlv_value_len: tvlv content length
*/
-void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src,
- uint8_t *dst, uint8_t type, uint8_t version,
- void *tvlv_value, uint16_t tvlv_value_len)
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
+ u8 *dst, u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len)
{
struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
struct batadv_tvlv_hdr *tvlv_hdr;
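Among the main.c changes above, batadv_is_my_mac() now records the result, breaks out of the RCU-protected loop and unlocks on a single exit path instead of calling rcu_read_unlock() from inside the loop. A self-contained sketch of that shape, with a hypothetical example_iface type:

struct example_iface {
	struct list_head list;
	struct net_device *dev;
};

static bool example_is_local_addr(struct list_head *iface_list, const u8 *addr)
{
	struct example_iface *iface;
	bool found = false;

	rcu_read_lock();
	list_for_each_entry_rcu(iface, iface_list, list) {
		if (!ether_addr_equal_unaligned(iface->dev->dev_addr, addr))
			continue;

		found = true;
		break;			/* single unlock below */
	}
	rcu_read_unlock();

	return found;
}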
diff --git a/kernel/net/batman-adv/main.h b/kernel/net/batman-adv/main.h
index 4d2318829..ebd8af0a1 100644
--- a/kernel/net/batman-adv/main.h
+++ b/kernel/net/batman-adv/main.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -24,7 +24,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2015.0"
+#define BATADV_SOURCE_VERSION "2015.2"
#endif
/* B.A.T.M.A.N. parameters */
@@ -44,7 +44,7 @@
#define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */
#define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */
#define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */
-#define BATADV_DAT_ENTRY_TIMEOUT (5*60000) /* 5 mins in milliseconds */
+#define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */
/* sliding packet range of received originator messages in sequence numbers
* (should be a multiple of our word size)
*/
@@ -163,28 +163,26 @@ enum batadv_uev_type {
/* Kernel headers */
-#include <linux/mutex.h> /* mutex */
-#include <linux/module.h> /* needed by all modules */
-#include <linux/netdevice.h> /* netdevice */
-#include <linux/etherdevice.h> /* ethernet address classification */
-#include <linux/if_ether.h> /* ethernet header */
-#include <linux/poll.h> /* poll_table */
-#include <linux/kthread.h> /* kernel threads */
-#include <linux/pkt_sched.h> /* schedule types */
-#include <linux/workqueue.h> /* workqueue */
+#include <linux/atomic.h>
+#include <linux/bitops.h> /* for packet.h */
+#include <linux/compiler.h>
+#include <linux/cpumask.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h> /* for packet.h */
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/types.h>
#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <net/sock.h> /* struct sock */
-#include <net/addrconf.h> /* ipv6 address stuff */
-#include <linux/ip.h>
-#include <net/rtnetlink.h>
#include <linux/jiffies.h>
-#include <linux/seq_file.h>
#include <linux/if_vlan.h>
#include "types.h"
-#define BATADV_PRINT_VID(vid) (vid & BATADV_VLAN_HAS_TAG ? \
+struct batadv_ogm_packet;
+struct seq_file;
+struct sk_buff;
+
+#define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? \
(int)(vid & VLAN_VID_MASK) : -1)
extern char batadv_routing_algo[];
@@ -195,7 +193,7 @@ extern struct workqueue_struct *batadv_event_workqueue;
int batadv_mesh_init(struct net_device *soft_iface);
void batadv_mesh_free(struct net_device *soft_iface);
-int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr);
+bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr);
struct batadv_hard_iface *
batadv_seq_print_text_primary_if_get(struct seq_file *seq);
int batadv_max_header_len(void);
@@ -204,10 +202,10 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *ptype,
struct net_device *orig_dev);
int
-batadv_recv_handler_register(uint8_t packet_type,
+batadv_recv_handler_register(u8 packet_type,
int (*recv_handler)(struct sk_buff *,
struct batadv_hard_iface *));
-void batadv_recv_handler_unregister(uint8_t packet_type);
+void batadv_recv_handler_unregister(u8 packet_type);
int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops);
int batadv_algo_select(struct batadv_priv *bat_priv, char *name);
int batadv_algo_seq_print_text(struct seq_file *seq, void *offset);
@@ -279,7 +277,7 @@ static inline void _batadv_dbg(int type __always_unused,
*
* note: can't use ether_addr_equal() as it requires aligned memory
*/
-static inline int batadv_compare_eth(const void *data1, const void *data2)
+static inline bool batadv_compare_eth(const void *data1, const void *data2)
{
return ether_addr_equal_unaligned(data1, data2);
}
@@ -306,7 +304,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp,
* they handle overflows/underflows and can correctly check for a
* predecessor/successor unless the variable sequence number has grown by
* more then 2**(bitwidth(x)-1)-1.
- * This means that for a uint8_t with the maximum value 255, it would think:
+ * This means that for a u8 with the maximum value 255, it would think:
* - when adding nothing - it is neither a predecessor nor a successor
* - before adding more than 127 to the starting value - it is a predecessor,
* - when adding 128 - it is neither a predecessor nor a successor,
@@ -329,10 +327,9 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
#define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
/* Sum and return the cpu-local counters for index 'idx' */
-static inline uint64_t batadv_sum_counter(struct batadv_priv *bat_priv,
- size_t idx)
+static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
{
- uint64_t *counters, sum = 0;
+ u64 *counters, sum = 0;
int cpu;
for_each_possible_cpu(cpu) {
@@ -350,39 +347,38 @@ static inline uint64_t batadv_sum_counter(struct batadv_priv *bat_priv,
#define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0]))
void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
- uint8_t type, uint8_t version,
- void *tvlv_value, uint16_t tvlv_value_len);
-uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
- unsigned char **packet_buff,
- int *packet_buff_len,
- int packet_min_len);
+ u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len);
+u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+ unsigned char **packet_buff,
+ int *packet_buff_len, int packet_min_len);
void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
struct batadv_ogm_packet *batadv_ogm_packet,
struct batadv_orig_node *orig_node);
void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
- uint8_t type, uint8_t version);
+ u8 type, u8 version);
void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
void (*optr)(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t flags,
+ u8 flags,
void *tvlv_value,
- uint16_t tvlv_value_len),
+ u16 tvlv_value_len),
int (*uptr)(struct batadv_priv *bat_priv,
- uint8_t *src, uint8_t *dst,
+ u8 *src, u8 *dst,
void *tvlv_value,
- uint16_t tvlv_value_len),
- uint8_t type, uint8_t version, uint8_t flags);
+ u16 tvlv_value_len),
+ u8 type, u8 version, u8 flags);
void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
- uint8_t type, uint8_t version);
+ u8 type, u8 version);
int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
bool ogm_source,
struct batadv_orig_node *orig_node,
- uint8_t *src, uint8_t *dst,
- void *tvlv_buff, uint16_t tvlv_buff_len);
-void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src,
- uint8_t *dst, uint8_t type, uint8_t version,
- void *tvlv_value, uint16_t tvlv_value_len);
+ u8 *src, u8 *dst,
+ void *tvlv_buff, u16 tvlv_buff_len);
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
+ u8 *dst, u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len);
unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len);
bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid);
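batadv_sum_counter() in the main.h hunk above is the usual per-CPU counter read: each CPU increments its own slot without atomics, and readers sum all slots on demand. A hedged sketch of the read side, assuming a per-CPU array of u64 counters as in bat_priv->bat_counters:

static u64 example_sum_counter(u64 __percpu *counters, size_t idx)
{
	u64 sum = 0;
	int cpu;

	/* walk every possible CPU and add up its private slot */
	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(counters, cpu)[idx];

	return sum;
}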
diff --git a/kernel/net/batman-adv/multicast.c b/kernel/net/batman-adv/multicast.c
index b24e4bb64..eb76386f8 100644
--- a/kernel/net/batman-adv/multicast.c
+++ b/kernel/net/batman-adv/multicast.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors:
*
* Linus Lüssing
*
@@ -15,10 +15,36 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
#include "multicast.h"
-#include "originator.h"
-#include "hard-interface.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+
+#include "packet.h"
#include "translation-table.h"
/**
@@ -64,7 +90,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev,
* Returns true if the given address is already in the given list.
* Otherwise returns false.
*/
-static bool batadv_mcast_mla_is_duplicate(uint8_t *mcast_addr,
+static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
struct hlist_head *mcast_list)
{
struct batadv_hw_addr *mcast_entry;
@@ -78,15 +104,19 @@ static bool batadv_mcast_mla_is_duplicate(uint8_t *mcast_addr,
/**
* batadv_mcast_mla_list_free - free a list of multicast addresses
+ * @bat_priv: the bat priv with all the soft interface information
* @mcast_list: the list to free
*
* Removes and frees all items in the given mcast_list.
*/
-static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list)
+static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv,
+ struct hlist_head *mcast_list)
{
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
+ lockdep_assert_held(&bat_priv->tt.commit_lock);
+
hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) {
hlist_del(&mcast_entry->list);
kfree(mcast_entry);
@@ -109,6 +139,8 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
+ lockdep_assert_held(&bat_priv->tt.commit_lock);
+
hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list,
list) {
if (mcast_list &&
@@ -139,6 +171,8 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
+ lockdep_assert_held(&bat_priv->tt.commit_lock);
+
if (!mcast_list)
return;
@@ -243,7 +277,7 @@ update:
batadv_mcast_mla_tt_add(bat_priv, &mcast_list);
out:
- batadv_mcast_mla_list_free(&mcast_list);
+ batadv_mcast_mla_list_free(bat_priv, &mcast_list);
}
/**
@@ -565,19 +599,28 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
*
* If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator,
* orig, has toggled then this method updates counter and list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
*/
static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t mcast_flags)
+ u8 mcast_flags)
{
+ struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
/* switched from flag unset to set */
if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
!(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) {
atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables);
spin_lock_bh(&bat_priv->mcast.want_lists_lock);
- hlist_add_head_rcu(&orig->mcast_want_all_unsnoopables_node,
- &bat_priv->mcast.want_all_unsnoopables_list);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
/* switched from flag set to unset */
} else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) &&
@@ -585,7 +628,10 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv,
atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables);
spin_lock_bh(&bat_priv->mcast.want_lists_lock);
- hlist_del_rcu(&orig->mcast_want_all_unsnoopables_node);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
}
}
@@ -598,19 +644,28 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv,
*
* If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has
* toggled then this method updates counter and list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
*/
static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t mcast_flags)
+ u8 mcast_flags)
{
+ struct hlist_node *node = &orig->mcast_want_all_ipv4_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
/* switched from flag unset to set */
if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 &&
!(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) {
atomic_inc(&bat_priv->mcast.num_want_all_ipv4);
spin_lock_bh(&bat_priv->mcast.want_lists_lock);
- hlist_add_head_rcu(&orig->mcast_want_all_ipv4_node,
- &bat_priv->mcast.want_all_ipv4_list);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
/* switched from flag set to unset */
} else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) &&
@@ -618,7 +673,10 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv,
atomic_dec(&bat_priv->mcast.num_want_all_ipv4);
spin_lock_bh(&bat_priv->mcast.want_lists_lock);
- hlist_del_rcu(&orig->mcast_want_all_ipv4_node);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
}
}
@@ -631,19 +689,28 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv,
*
* If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has
* toggled then this method updates counter and list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
*/
static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t mcast_flags)
+ u8 mcast_flags)
{
+ struct hlist_node *node = &orig->mcast_want_all_ipv6_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
/* switched from flag unset to set */
if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 &&
!(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) {
atomic_inc(&bat_priv->mcast.num_want_all_ipv6);
spin_lock_bh(&bat_priv->mcast.want_lists_lock);
- hlist_add_head_rcu(&orig->mcast_want_all_ipv6_node,
- &bat_priv->mcast.want_all_ipv6_list);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
/* switched from flag set to unset */
} else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) &&
@@ -651,7 +718,10 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
atomic_dec(&bat_priv->mcast.num_want_all_ipv6);
spin_lock_bh(&bat_priv->mcast.want_lists_lock);
- hlist_del_rcu(&orig->mcast_want_all_ipv6_node);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
}
}
@@ -666,47 +736,50 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
*/
static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t flags,
+ u8 flags,
void *tvlv_value,
- uint16_t tvlv_value_len)
+ u16 tvlv_value_len)
{
bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
- uint8_t mcast_flags = BATADV_NO_FLAGS;
+ u8 mcast_flags = BATADV_NO_FLAGS;
bool orig_initialized;
- orig_initialized = orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST;
+ if (orig_mcast_enabled && tvlv_value &&
+ (tvlv_value_len >= sizeof(mcast_flags)))
+ mcast_flags = *(u8 *)tvlv_value;
+
+ spin_lock_bh(&orig->mcast_handler_lock);
+ orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
+ &orig->capa_initialized);
/* If mcast support is turned on decrease the disabled mcast node
* counter only if we had increased it for this node before. If this
* is a completely new orig_node no need to decrease the counter.
*/
if (orig_mcast_enabled &&
- !(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST)) {
+ !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
if (orig_initialized)
atomic_dec(&bat_priv->mcast.num_disabled);
- orig->capabilities |= BATADV_ORIG_CAPA_HAS_MCAST;
+ set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
/* If mcast support is being switched off or if this is an initial
* OGM without mcast support then increase the disabled mcast
* node counter.
*/
} else if (!orig_mcast_enabled &&
- (orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST ||
+ (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) ||
!orig_initialized)) {
atomic_inc(&bat_priv->mcast.num_disabled);
- orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_MCAST;
+ clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
}
- orig->capa_initialized |= BATADV_ORIG_CAPA_HAS_MCAST;
-
- if (orig_mcast_enabled && tvlv_value &&
- (tvlv_value_len >= sizeof(mcast_flags)))
- mcast_flags = *(uint8_t *)tvlv_value;
+ set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized);
batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags);
batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags);
batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags);
orig->mcast_flags = mcast_flags;
+ spin_unlock_bh(&orig->mcast_handler_lock);
}
/**
@@ -740,11 +813,15 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
{
struct batadv_priv *bat_priv = orig->bat_priv;
- if (!(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST) &&
- orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST)
+ spin_lock_bh(&orig->mcast_handler_lock);
+
+ if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) &&
+ test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized))
atomic_dec(&bat_priv->mcast.num_disabled);
batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
+
+ spin_unlock_bh(&orig->mcast_handler_lock);
}
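The multicast changes above move the originator capability flags to atomic bitops and serialize the want-all list updates with orig->mcast_handler_lock, which is what makes the added hlist_unhashed() WARN_ON checks meaningful. A reduced sketch of that add/remove pattern, with hypothetical parameters:

static void example_update_membership(spinlock_t *list_lock,
				      struct hlist_head *head,
				      struct hlist_node *node,
				      bool want)
{
	spin_lock_bh(list_lock);

	if (want) {
		WARN_ON(!hlist_unhashed(node));	/* must not be listed yet */
		hlist_add_head_rcu(node, head);
	} else {
		WARN_ON(hlist_unhashed(node));	/* must still be listed */
		hlist_del_init_rcu(node);	/* re-init keeps unhashed() true */
	}

	spin_unlock_bh(list_lock);
}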
diff --git a/kernel/net/batman-adv/multicast.h b/kernel/net/batman-adv/multicast.h
index 3a44ebdb4..8f3cb04b9 100644
--- a/kernel/net/batman-adv/multicast.h
+++ b/kernel/net/batman-adv/multicast.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors:
*
* Linus Lüssing
*
@@ -18,6 +18,10 @@
#ifndef _NET_BATMAN_ADV_MULTICAST_H_
#define _NET_BATMAN_ADV_MULTICAST_H_
+#include "main.h"
+
+struct sk_buff;
+
/**
* batadv_forw_mode - the way a packet should be forwarded as
* @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic
diff --git a/kernel/net/batman-adv/network-coding.c b/kernel/net/batman-adv/network-coding.c
index 127cc4d73..d0956f726 100644
--- a/kernel/net/batman-adv/network-coding.c
+++ b/kernel/net/batman-adv/network-coding.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll, Jeppe Ledet-Pedersen
*
@@ -15,15 +15,45 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
+#include "network-coding.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
#include <linux/debugfs.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/init.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
-#include "main.h"
+#include "hard-interface.h"
#include "hash.h"
-#include "network-coding.h"
-#include "send.h"
#include "originator.h"
-#include "hard-interface.h"
+#include "packet.h"
#include "routing.h"
+#include "send.h"
static struct lock_class_key batadv_nc_coding_hash_lock_class_key;
static struct lock_class_key batadv_nc_decoding_hash_lock_class_key;
@@ -100,14 +130,13 @@ void batadv_nc_status_update(struct net_device *net_dev)
*/
static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t flags,
- void *tvlv_value,
- uint16_t tvlv_value_len)
+ u8 flags,
+ void *tvlv_value, u16 tvlv_value_len)
{
if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND)
- orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_NC;
+ clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities);
else
- orig->capabilities |= BATADV_ORIG_CAPA_HAS_NC;
+ set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities);
}
/**
@@ -155,7 +184,7 @@ err:
*/
void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
{
- atomic_set(&bat_priv->network_coding, 1);
+ atomic_set(&bat_priv->network_coding, 0);
bat_priv->nc.min_tq = 200;
bat_priv->nc.max_fwd_delay = 10;
bat_priv->nc.max_buffer_time = 200;
@@ -174,28 +203,25 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
}
/**
- * batadv_nc_node_free_rcu - rcu callback to free an nc node and remove
- * its refcount on the orig_node
- * @rcu: rcu pointer of the nc node
+ * batadv_nc_node_release - release nc_node from lists and queue for free after
+ * rcu grace period
+ * @nc_node: the nc node to free
*/
-static void batadv_nc_node_free_rcu(struct rcu_head *rcu)
+static void batadv_nc_node_release(struct batadv_nc_node *nc_node)
{
- struct batadv_nc_node *nc_node;
-
- nc_node = container_of(rcu, struct batadv_nc_node, rcu);
batadv_orig_node_free_ref(nc_node->orig_node);
- kfree(nc_node);
+ kfree_rcu(nc_node, rcu);
}
/**
- * batadv_nc_node_free_ref - decrements the nc node refcounter and possibly
- * frees it
+ * batadv_nc_node_free_ref - decrement the nc node refcounter and possibly
+ * release it
* @nc_node: the nc node to free
*/
static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node)
{
if (atomic_dec_and_test(&nc_node->refcount))
- call_rcu(&nc_node->rcu, batadv_nc_node_free_rcu);
+ batadv_nc_node_release(nc_node);
}
/**
@@ -275,7 +301,7 @@ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv,
* max_buffer time
*/
return batadv_has_timed_out(nc_path->last_valid,
- bat_priv->nc.max_buffer_time*10);
+ bat_priv->nc.max_buffer_time * 10);
}
/**
@@ -352,7 +378,7 @@ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv)
struct batadv_hashtable *hash = bat_priv->orig_hash;
struct hlist_head *head;
struct batadv_orig_node *orig_node;
- uint32_t i;
+ u32 i;
if (!hash)
return;
@@ -388,7 +414,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv,
struct hlist_node *node_tmp;
struct batadv_nc_path *nc_path;
spinlock_t *lock; /* Protects lists in hash */
- uint32_t i;
+ u32 i;
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
@@ -448,19 +474,13 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src,
*
* Returns the selected index in the hash table for the given data.
*/
-static uint32_t batadv_nc_hash_choose(const void *data, uint32_t size)
+static u32 batadv_nc_hash_choose(const void *data, u32 size)
{
const struct batadv_nc_path *nc_path = data;
- uint32_t hash = 0;
-
- hash = batadv_hash_bytes(hash, &nc_path->prev_hop,
- sizeof(nc_path->prev_hop));
- hash = batadv_hash_bytes(hash, &nc_path->next_hop,
- sizeof(nc_path->next_hop));
+ u32 hash = 0;
- hash += (hash << 3);
- hash ^= (hash >> 11);
- hash += (hash << 15);
+ hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash);
+ hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash);
return hash % size;
}
@@ -563,6 +583,8 @@ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
unsigned long timeout = bat_priv->nc.max_buffer_time;
bool res = false;
+ lockdep_assert_held(&nc_path->packet_list_lock);
+
/* Packets are added to tail, so the remaining packets did not time
* out and we can stop processing the current queue
*/
@@ -599,6 +621,8 @@ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv,
{
unsigned long timeout = bat_priv->nc.max_fwd_delay;
+ lockdep_assert_held(&nc_path->packet_list_lock);
+
/* Packets are added to tail, so the remaining packets did not time
* out and we can stop processing the current queue
*/
@@ -720,8 +744,8 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
struct batadv_ogm_packet *ogm_packet)
{
struct batadv_orig_ifinfo *orig_ifinfo;
- uint32_t last_real_seqno;
- uint8_t last_ttl;
+ u32 last_real_seqno;
+ u8 last_ttl;
orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT);
if (!orig_ifinfo)
@@ -849,8 +873,8 @@ free:
}
/**
- * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node structs
- * (best called on incoming OGMs)
+ * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node
+ * structs (best called on incoming OGMs)
* @bat_priv: the bat priv with all the soft interface information
* @orig_node: orig node originating the ogm packet
* @orig_neigh_node: neighboring orig node from which we received the ogm packet
@@ -864,14 +888,15 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
struct batadv_ogm_packet *ogm_packet,
int is_single_hop_neigh)
{
- struct batadv_nc_node *in_nc_node = NULL, *out_nc_node = NULL;
+ struct batadv_nc_node *in_nc_node = NULL;
+ struct batadv_nc_node *out_nc_node = NULL;
/* Check if network coding is enabled */
if (!atomic_read(&bat_priv->network_coding))
goto out;
/* check if orig node is network coding enabled */
- if (!(orig_node->capabilities & BATADV_ORIG_CAPA_HAS_NC))
+ if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities))
goto out;
/* accept ogms from 'good' neighbors and single hop neighbors */
@@ -914,8 +939,8 @@ out:
*/
static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
struct batadv_hashtable *hash,
- uint8_t *src,
- uint8_t *dst)
+ u8 *src,
+ u8 *dst)
{
int hash_added;
struct batadv_nc_path *nc_path, nc_path_key;
@@ -967,9 +992,9 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
* selection of a receiver with slightly lower TQ than the other
* @tq: to be weighted tq value
*/
-static uint8_t batadv_nc_random_weight_tq(uint8_t tq)
+static u8 batadv_nc_random_weight_tq(u8 tq)
{
- uint8_t rand_val, rand_tq;
+ u8 rand_val, rand_tq;
get_random_bytes(&rand_val, sizeof(rand_val));
@@ -1014,7 +1039,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
struct batadv_nc_packet *nc_packet,
struct batadv_neigh_node *neigh_node)
{
- uint8_t tq_weighted_neigh, tq_weighted_coding, tq_tmp;
+ u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp;
struct sk_buff *skb_dest, *skb_src;
struct batadv_unicast_packet *packet1;
struct batadv_unicast_packet *packet2;
@@ -1023,7 +1048,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
struct batadv_neigh_node *router_coding = NULL;
struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL;
struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL;
- uint8_t *first_source, *first_dest, *second_source, *second_dest;
+ u8 *first_source, *first_dest, *second_source, *second_dest;
__be32 packet_id1, packet_id2;
size_t count;
bool res = false;
@@ -1207,8 +1232,7 @@ out:
*
* Returns true if coding of a decoded packet is allowed.
*/
-static bool batadv_nc_skb_coding_possible(struct sk_buff *skb,
- uint8_t *dst, uint8_t *src)
+static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src)
{
if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src))
return false;
@@ -1231,7 +1255,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv,
struct batadv_nc_node *in_nc_node,
struct batadv_nc_node *out_nc_node,
struct sk_buff *skb,
- uint8_t *eth_dst)
+ u8 *eth_dst)
{
struct batadv_nc_path *nc_path, nc_path_key;
struct batadv_nc_packet *nc_packet_out = NULL;
@@ -1297,8 +1321,8 @@ batadv_nc_path_search(struct batadv_priv *bat_priv,
static struct batadv_nc_packet *
batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
struct sk_buff *skb,
- uint8_t *eth_dst,
- uint8_t *eth_src,
+ u8 *eth_dst,
+ u8 *eth_src,
struct batadv_nc_node *in_nc_node)
{
struct batadv_orig_node *orig_node;
@@ -1338,7 +1362,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
*/
static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
struct sk_buff *skb,
- uint8_t *eth_dst_new)
+ u8 *eth_dst_new)
{
struct ethhdr *ethhdr;
@@ -1614,7 +1638,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
struct batadv_unicast_packet *unicast_packet;
struct batadv_coded_packet coded_packet_tmp;
struct ethhdr *ethhdr, ethhdr_tmp;
- uint8_t *orig_dest, ttl, ttvn;
+ u8 *orig_dest, ttl, ttvn;
unsigned int coding_len;
int err;
@@ -1706,7 +1730,7 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
struct batadv_hashtable *hash = bat_priv->nc.decoding_hash;
struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL;
struct batadv_nc_path *nc_path, nc_path_key;
- uint8_t *dest, *source;
+ u8 *dest, *source;
__be32 packet_id;
int index;
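Several *_free_rcu() callbacks in this patch (nc_node above, neigh_ifinfo and neigh_node in originator.c below) are turned into *_release() functions: references held on other objects are dropped immediately and only the final kfree is deferred with kfree_rcu(), which is why the *_free_ref_now() variants can go away. A hedged sketch of the resulting pattern:

struct example_node {
	atomic_t refcount;
	struct rcu_head rcu;
	/* ... payload ... */
};

static void example_node_release(struct example_node *node)
{
	/* drop references this object holds here, then defer the free */
	kfree_rcu(node, rcu);
}

static void example_node_free_ref(struct example_node *node)
{
	if (atomic_dec_and_test(&node->refcount))
		example_node_release(node);
}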
diff --git a/kernel/net/batman-adv/network-coding.h b/kernel/net/batman-adv/network-coding.h
index 358c0d686..8f6d4ad87 100644
--- a/kernel/net/batman-adv/network-coding.h
+++ b/kernel/net/batman-adv/network-coding.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll, Jeppe Ledet-Pedersen
*
@@ -18,6 +18,15 @@
#ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_
#define _NET_BATMAN_ADV_NETWORK_CODING_H_
+#include "main.h"
+
+#include <linux/types.h>
+
+struct batadv_ogm_packet;
+struct net_device;
+struct seq_file;
+struct sk_buff;
+
#ifdef CONFIG_BATMAN_ADV_NC
void batadv_nc_status_update(struct net_device *net_dev);
diff --git a/kernel/net/batman-adv/originator.c b/kernel/net/batman-adv/originator.c
index 90e805aba..17851d3aa 100644
--- a/kernel/net/batman-adv/originator.c
+++ b/kernel/net/batman-adv/originator.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -15,19 +15,32 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
+#include "originator.h"
#include "main.h"
+
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+
#include "distributed-arp-table.h"
-#include "originator.h"
-#include "hash.h"
-#include "translation-table.h"
-#include "routing.h"
+#include "fragmentation.h"
#include "gateway_client.h"
#include "hard-interface.h"
-#include "soft-interface.h"
-#include "bridge_loop_avoidance.h"
-#include "network-coding.h"
-#include "fragmentation.h"
+#include "hash.h"
#include "multicast.h"
+#include "network-coding.h"
+#include "routing.h"
+#include "translation-table.h"
/* hash class keys */
static struct lock_class_key batadv_orig_hash_lock_class_key;
@@ -58,7 +71,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
struct batadv_orig_node_vlan *vlan = NULL, *tmp;
rcu_read_lock();
- list_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) {
+ hlist_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) {
if (tmp->vid != vid)
continue;
@@ -106,7 +119,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
atomic_set(&vlan->refcount, 2);
vlan->vid = vid;
- list_add_rcu(&vlan->list, &orig_node->vlan_list);
+ hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list);
out:
spin_unlock_bh(&orig_node->vlan_list_lock);
@@ -150,86 +163,66 @@ err:
}
/**
- * batadv_neigh_ifinfo_free_rcu - free the neigh_ifinfo object
- * @rcu: rcu pointer of the neigh_ifinfo object
- */
-static void batadv_neigh_ifinfo_free_rcu(struct rcu_head *rcu)
-{
- struct batadv_neigh_ifinfo *neigh_ifinfo;
-
- neigh_ifinfo = container_of(rcu, struct batadv_neigh_ifinfo, rcu);
-
- if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
- batadv_hardif_free_ref_now(neigh_ifinfo->if_outgoing);
-
- kfree(neigh_ifinfo);
-}
-
-/**
- * batadv_neigh_ifinfo_free_now - decrement the refcounter and possibly free
- * the neigh_ifinfo (without rcu callback)
+ * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for
+ * free after rcu grace period
* @neigh_ifinfo: the neigh_ifinfo object to release
*/
static void
-batadv_neigh_ifinfo_free_ref_now(struct batadv_neigh_ifinfo *neigh_ifinfo)
+batadv_neigh_ifinfo_release(struct batadv_neigh_ifinfo *neigh_ifinfo)
{
- if (atomic_dec_and_test(&neigh_ifinfo->refcount))
- batadv_neigh_ifinfo_free_rcu(&neigh_ifinfo->rcu);
+ if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
+ batadv_hardif_free_ref(neigh_ifinfo->if_outgoing);
+
+ kfree_rcu(neigh_ifinfo, rcu);
}
/**
- * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly free
+ * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly release
* the neigh_ifinfo
* @neigh_ifinfo: the neigh_ifinfo object to release
*/
void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo)
{
if (atomic_dec_and_test(&neigh_ifinfo->refcount))
- call_rcu(&neigh_ifinfo->rcu, batadv_neigh_ifinfo_free_rcu);
+ batadv_neigh_ifinfo_release(neigh_ifinfo);
}
/**
* batadv_neigh_node_free_rcu - free the neigh_node
- * @rcu: rcu pointer of the neigh_node
+ * batadv_neigh_node_release - release neigh_node from lists and queue for
+ * free after rcu grace period
+ * @neigh_node: neigh neighbor to free
*/
-static void batadv_neigh_node_free_rcu(struct rcu_head *rcu)
+static void batadv_neigh_node_release(struct batadv_neigh_node *neigh_node)
{
struct hlist_node *node_tmp;
- struct batadv_neigh_node *neigh_node;
struct batadv_neigh_ifinfo *neigh_ifinfo;
+ struct batadv_algo_ops *bao;
- neigh_node = container_of(rcu, struct batadv_neigh_node, rcu);
+ bao = neigh_node->orig_node->bat_priv->bat_algo_ops;
hlist_for_each_entry_safe(neigh_ifinfo, node_tmp,
&neigh_node->ifinfo_list, list) {
- batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo);
+ batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
}
- batadv_hardif_free_ref_now(neigh_node->if_incoming);
- kfree(neigh_node);
-}
+ if (bao->bat_neigh_free)
+ bao->bat_neigh_free(neigh_node);
-/**
- * batadv_neigh_node_free_ref_now - decrement the neighbors refcounter
- * and possibly free it (without rcu callback)
- * @neigh_node: neigh neighbor to free
- */
-static void
-batadv_neigh_node_free_ref_now(struct batadv_neigh_node *neigh_node)
-{
- if (atomic_dec_and_test(&neigh_node->refcount))
- batadv_neigh_node_free_rcu(&neigh_node->rcu);
+ batadv_hardif_free_ref(neigh_node->if_incoming);
+
+ kfree_rcu(neigh_node, rcu);
}
/**
* batadv_neigh_node_free_ref - decrement the neighbors refcounter
- * and possibly free it
+ * and possibly release it
* @neigh_node: neigh neighbor to free
*/
void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node)
{
if (atomic_dec_and_test(&neigh_node->refcount))
- call_rcu(&neigh_node->rcu, batadv_neigh_node_free_rcu);
+ batadv_neigh_node_release(neigh_node);
}
/**
@@ -424,41 +417,6 @@ out:
}
/**
- * batadv_neigh_node_new - create and init a new neigh_node object
- * @hard_iface: the interface where the neighbour is connected to
- * @neigh_addr: the mac address of the neighbour interface
- * @orig_node: originator object representing the neighbour
- *
- * Allocates a new neigh_node object and initialises all the generic fields.
- * Returns the new object or NULL on failure.
- */
-struct batadv_neigh_node *
-batadv_neigh_node_new(struct batadv_hard_iface *hard_iface,
- const uint8_t *neigh_addr,
- struct batadv_orig_node *orig_node)
-{
- struct batadv_neigh_node *neigh_node;
-
- neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC);
- if (!neigh_node)
- goto out;
-
- INIT_HLIST_NODE(&neigh_node->list);
- INIT_HLIST_HEAD(&neigh_node->ifinfo_list);
- spin_lock_init(&neigh_node->ifinfo_lock);
-
- ether_addr_copy(neigh_node->addr, neigh_addr);
- neigh_node->if_incoming = hard_iface;
- neigh_node->orig_node = orig_node;
-
- /* extra reference for return */
- atomic_set(&neigh_node->refcount, 2);
-
-out:
- return neigh_node;
-}
-
-/**
* batadv_neigh_node_get - retrieve a neighbour from the list
* @orig_node: originator which the neighbour belongs to
* @hard_iface: the interface where this neighbour is connected to
@@ -468,10 +426,10 @@ out:
* which is connected through the provided hard interface.
* Returns NULL if the neighbour is not found.
*/
-struct batadv_neigh_node *
+static struct batadv_neigh_node *
batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
const struct batadv_hard_iface *hard_iface,
- const uint8_t *addr)
+ const u8 *addr)
{
struct batadv_neigh_node *tmp_neigh_node, *res = NULL;
@@ -495,108 +453,152 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
}
/**
- * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object
- * @rcu: rcu pointer of the orig_ifinfo object
+ * batadv_neigh_node_new - create and init a new neigh_node object
+ * @orig_node: originator object representing the neighbour
+ * @hard_iface: the interface where the neighbour is connected to
+ * @neigh_addr: the mac address of the neighbour interface
+ *
+ * Allocates a new neigh_node object and initialises all the generic fields.
+ * Returns the new object or NULL on failure.
*/
-static void batadv_orig_ifinfo_free_rcu(struct rcu_head *rcu)
+struct batadv_neigh_node *
+batadv_neigh_node_new(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
{
- struct batadv_orig_ifinfo *orig_ifinfo;
- struct batadv_neigh_node *router;
+ struct batadv_neigh_node *neigh_node;
- orig_ifinfo = container_of(rcu, struct batadv_orig_ifinfo, rcu);
+ neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
+ if (neigh_node)
+ goto out;
- if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
- batadv_hardif_free_ref_now(orig_ifinfo->if_outgoing);
+ neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC);
+ if (!neigh_node)
+ goto out;
- /* this is the last reference to this object */
- router = rcu_dereference_protected(orig_ifinfo->router, true);
- if (router)
- batadv_neigh_node_free_ref_now(router);
- kfree(orig_ifinfo);
+ if (!atomic_inc_not_zero(&hard_iface->refcount)) {
+ kfree(neigh_node);
+ neigh_node = NULL;
+ goto out;
+ }
+
+ INIT_HLIST_NODE(&neigh_node->list);
+ INIT_HLIST_HEAD(&neigh_node->ifinfo_list);
+ spin_lock_init(&neigh_node->ifinfo_lock);
+
+ ether_addr_copy(neigh_node->addr, neigh_addr);
+ neigh_node->if_incoming = hard_iface;
+ neigh_node->orig_node = orig_node;
+
+ /* extra reference for return */
+ atomic_set(&neigh_node->refcount, 2);
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
+ "Creating new neighbor %pM for orig_node %pM on interface %s\n",
+ neigh_addr, orig_node->orig, hard_iface->net_dev->name);
+
+out:
+ return neigh_node;
}
/**
- * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free
- * the orig_ifinfo (without rcu callback)
+ * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for
+ * free after rcu grace period
* @orig_ifinfo: the orig_ifinfo object to release
*/
-static void
-batadv_orig_ifinfo_free_ref_now(struct batadv_orig_ifinfo *orig_ifinfo)
+static void batadv_orig_ifinfo_release(struct batadv_orig_ifinfo *orig_ifinfo)
{
- if (atomic_dec_and_test(&orig_ifinfo->refcount))
- batadv_orig_ifinfo_free_rcu(&orig_ifinfo->rcu);
+ struct batadv_neigh_node *router;
+
+ if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
+ batadv_hardif_free_ref(orig_ifinfo->if_outgoing);
+
+ /* this is the last reference to this object */
+ router = rcu_dereference_protected(orig_ifinfo->router, true);
+ if (router)
+ batadv_neigh_node_free_ref(router);
+
+ kfree_rcu(orig_ifinfo, rcu);
}
/**
- * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free
+ * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly release
* the orig_ifinfo
* @orig_ifinfo: the orig_ifinfo object to release
*/
void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo)
{
if (atomic_dec_and_test(&orig_ifinfo->refcount))
- call_rcu(&orig_ifinfo->rcu, batadv_orig_ifinfo_free_rcu);
+ batadv_orig_ifinfo_release(orig_ifinfo);
}
+/**
+ * batadv_orig_node_free_rcu - free the orig_node
+ * @rcu: rcu pointer of the orig_node
+ */
static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
{
- struct hlist_node *node_tmp;
- struct batadv_neigh_node *neigh_node;
struct batadv_orig_node *orig_node;
- struct batadv_orig_ifinfo *orig_ifinfo;
orig_node = container_of(rcu, struct batadv_orig_node, rcu);
+ batadv_mcast_purge_orig(orig_node);
+
+ batadv_frag_purge_orig(orig_node, NULL);
+
+ if (orig_node->bat_priv->bat_algo_ops->bat_orig_free)
+ orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node);
+
+ kfree(orig_node->tt_buff);
+ kfree(orig_node);
+}
+
+/**
+ * batadv_orig_node_release - release orig_node from lists and queue for
+ * free after rcu grace period
+ * @orig_node: the orig node to free
+ */
+static void batadv_orig_node_release(struct batadv_orig_node *orig_node)
+{
+ struct hlist_node *node_tmp;
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_orig_ifinfo *orig_ifinfo;
+
spin_lock_bh(&orig_node->neigh_list_lock);
/* for all neighbors towards this originator ... */
hlist_for_each_entry_safe(neigh_node, node_tmp,
&orig_node->neigh_list, list) {
hlist_del_rcu(&neigh_node->list);
- batadv_neigh_node_free_ref_now(neigh_node);
+ batadv_neigh_node_free_ref(neigh_node);
}
hlist_for_each_entry_safe(orig_ifinfo, node_tmp,
&orig_node->ifinfo_list, list) {
hlist_del_rcu(&orig_ifinfo->list);
- batadv_orig_ifinfo_free_ref_now(orig_ifinfo);
+ batadv_orig_ifinfo_free_ref(orig_ifinfo);
}
spin_unlock_bh(&orig_node->neigh_list_lock);
- batadv_mcast_purge_orig(orig_node);
-
/* Free nc_nodes */
batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL);
- batadv_frag_purge_orig(orig_node, NULL);
-
- if (orig_node->bat_priv->bat_algo_ops->bat_orig_free)
- orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node);
-
- kfree(orig_node->tt_buff);
- kfree(orig_node);
+ call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
}
/**
* batadv_orig_node_free_ref - decrement the orig node refcounter and possibly
- * schedule an rcu callback for freeing it
+ * release it
* @orig_node: the orig node to free
*/
void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node)
{
if (atomic_dec_and_test(&orig_node->refcount))
- call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
-}
-
-/**
- * batadv_orig_node_free_ref_now - decrement the orig node refcounter and
- * possibly free it (without rcu callback)
- * @orig_node: the orig node to free
- */
-void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node)
-{
- if (atomic_dec_and_test(&orig_node->refcount))
- batadv_orig_node_free_rcu(&orig_node->rcu);
+ batadv_orig_node_release(orig_node);
}
void batadv_originator_free(struct batadv_priv *bat_priv)
@@ -606,7 +608,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv)
struct hlist_head *head;
spinlock_t *list_lock; /* spinlock to protect write access */
struct batadv_orig_node *orig_node;
- uint32_t i;
+ u32 i;
if (!hash)
return;
@@ -641,7 +643,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv)
* Returns the newly created object or NULL on failure.
*/
struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
- const uint8_t *addr)
+ const u8 *addr)
{
struct batadv_orig_node *orig_node;
struct batadv_orig_node_vlan *vlan;
@@ -656,7 +658,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
return NULL;
INIT_HLIST_HEAD(&orig_node->neigh_list);
- INIT_LIST_HEAD(&orig_node->vlan_list);
+ INIT_HLIST_HEAD(&orig_node->vlan_list);
INIT_HLIST_HEAD(&orig_node->ifinfo_list);
spin_lock_init(&orig_node->bcast_seqno_lock);
spin_lock_init(&orig_node->neigh_list_lock);
@@ -678,8 +680,13 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
orig_node->last_seen = jiffies;
reset_time = jiffies - 1 - msecs_to_jiffies(BATADV_RESET_PROTECTION_MS);
orig_node->bcast_seqno_reset = reset_time;
+
#ifdef CONFIG_BATMAN_ADV_MCAST
orig_node->mcast_flags = BATADV_NO_FLAGS;
+ INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node);
+ INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node);
+ INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node);
+ spin_lock_init(&orig_node->mcast_handler_lock);
#endif
/* create a vlan object for the "untagged" LAN */
@@ -958,7 +965,7 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv)
struct hlist_head *head;
spinlock_t *list_lock; /* spinlock to protect write access */
struct batadv_orig_node *orig_node;
- uint32_t i;
+ u32 i;
if (!hash)
return;
@@ -987,7 +994,6 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv)
spin_unlock_bh(list_lock);
}
- batadv_gw_node_purge(bat_priv);
batadv_gw_election(bat_priv);
}
@@ -1092,7 +1098,7 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
struct batadv_hashtable *hash = bat_priv->orig_hash;
struct hlist_head *head;
struct batadv_orig_node *orig_node;
- uint32_t i;
+ u32 i;
int ret;
/* resize all orig nodes because orig_node->bcast_own(_sum) depend on
@@ -1129,7 +1135,7 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
struct batadv_hard_iface *hard_iface_tmp;
struct batadv_orig_node *orig_node;
struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
- uint32_t i;
+ u32 i;
int ret;
/* resize all orig nodes because orig_node->bcast_own(_sum) depend on
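The originator.c hunks above replace the free_rcu()/free_ref_now() callback chains with *_release() helpers: the last reference drop releases child references immediately and defers only the final kfree() via kfree_rcu(). A minimal sketch of the pattern under that assumption; struct foo, foo_child_put() and foo_put() are hypothetical stand-ins, not batman-adv code:

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	atomic_t refcount;
	struct foo_child *child;
	struct rcu_head rcu;
};

/* runs exactly once, when the last reference is dropped */
static void foo_release(struct foo *f)
{
	foo_child_put(f->child);	/* hypothetical: drop the child ref now ... */
	kfree_rcu(f, rcu);		/* ... and defer only the kfree() */
}

void foo_put(struct foo *f)
{
	if (atomic_dec_and_test(&f->refcount))
		foo_release(f);
}

Deferring only the kfree() avoids scheduling call_rcu() from inside another rcu callback, which rcu_barrier() cannot wait for - the exact problem described in the comment removed from batadv_tt_orig_list_entry_free_rcu() further down in translation-table.c.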
diff --git a/kernel/net/batman-adv/originator.h b/kernel/net/batman-adv/originator.h
index aa4a43696..a5c37882b 100644
--- a/kernel/net/batman-adv/originator.h
+++ b/kernel/net/batman-adv/originator.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -18,24 +18,32 @@
#ifndef _NET_BATMAN_ADV_ORIGINATOR_H_
#define _NET_BATMAN_ADV_ORIGINATOR_H_
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/if_ether.h>
+#include <linux/jhash.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
#include "hash.h"
+struct seq_file;
+
int batadv_compare_orig(const struct hlist_node *node, const void *data2);
int batadv_originator_init(struct batadv_priv *bat_priv);
void batadv_originator_free(struct batadv_priv *bat_priv);
void batadv_purge_orig_ref(struct batadv_priv *bat_priv);
void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node);
-void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node);
struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
- const uint8_t *addr);
+ const u8 *addr);
struct batadv_neigh_node *
-batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
- const struct batadv_hard_iface *hard_iface,
- const uint8_t *addr);
-struct batadv_neigh_node *
-batadv_neigh_node_new(struct batadv_hard_iface *hard_iface,
- const uint8_t *neigh_addr,
- struct batadv_orig_node *orig_node);
+batadv_neigh_node_new(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr);
void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node);
struct batadv_neigh_node *
batadv_orig_router_get(struct batadv_orig_node *orig_node,
@@ -73,22 +81,11 @@ void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan);
/* hashfunction to choose an entry in a hash table of given size
* hash algorithm from http://en.wikipedia.org/wiki/Hash_table
*/
-static inline uint32_t batadv_choose_orig(const void *data, uint32_t size)
+static inline u32 batadv_choose_orig(const void *data, u32 size)
{
- const unsigned char *key = data;
- uint32_t hash = 0;
- size_t i;
-
- for (i = 0; i < 6; i++) {
- hash += key[i];
- hash += (hash << 10);
- hash ^= (hash >> 6);
- }
-
- hash += (hash << 3);
- hash ^= (hash >> 11);
- hash += (hash << 15);
+ u32 hash = 0;
+ hash = jhash(data, ETH_ALEN, hash);
return hash % size;
}
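batadv_choose_orig now feeds the 6-byte MAC to jhash() instead of the open-coded one-at-a-time hash it replaces. A hedged usage sketch of the same bucket selection; mac_bucket and table_size are illustrative names only:

#include <linux/if_ether.h>
#include <linux/jhash.h>
#include <linux/types.h>

static inline u32 mac_bucket(const u8 *mac, u32 table_size)
{
	/* jhash(key, length, initval) mixes all ETH_ALEN bytes in one call */
	return jhash(mac, ETH_ALEN, 0) % table_size;
}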
diff --git a/kernel/net/batman-adv/packet.h b/kernel/net/batman-adv/packet.h
index b81fbbf21..11f996b39 100644
--- a/kernel/net/batman-adv/packet.h
+++ b/kernel/net/batman-adv/packet.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -18,6 +18,9 @@
#ifndef _NET_BATMAN_ADV_PACKET_H_
#define _NET_BATMAN_ADV_PACKET_H_
+#include <asm/byteorder.h>
+#include <linux/types.h>
+
/**
* enum batadv_packettype - types for batman-adv encapsulated packets
* @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV
@@ -194,8 +197,8 @@ enum batadv_tvlv_type {
* transport the claim type and the group id
*/
struct batadv_bla_claim_dst {
- uint8_t magic[3]; /* FF:43:05 */
- uint8_t type; /* bla_claimframe */
+ u8 magic[3]; /* FF:43:05 */
+ u8 type; /* bla_claimframe */
__be16 group; /* group id */
};
@@ -210,16 +213,16 @@ struct batadv_bla_claim_dst {
* @tvlv_len: length of tvlv data following the ogm header
*/
struct batadv_ogm_packet {
- uint8_t packet_type;
- uint8_t version;
- uint8_t ttl;
- uint8_t flags;
- __be32 seqno;
- uint8_t orig[ETH_ALEN];
- uint8_t prev_sender[ETH_ALEN];
- uint8_t reserved;
- uint8_t tq;
- __be16 tvlv_len;
+ u8 packet_type;
+ u8 version;
+ u8 ttl;
+ u8 flags;
+ __be32 seqno;
+ u8 orig[ETH_ALEN];
+ u8 prev_sender[ETH_ALEN];
+ u8 reserved;
+ u8 tq;
+ __be16 tvlv_len;
/* __packed is not needed as the struct size is divisible by 4,
* and the largest data type in this struct has a size of 4.
*/
@@ -243,14 +246,14 @@ struct batadv_ogm_packet {
* members are padded the same way as they are in real packets.
*/
struct batadv_icmp_header {
- uint8_t packet_type;
- uint8_t version;
- uint8_t ttl;
- uint8_t msg_type; /* see ICMP message types above */
- uint8_t dst[ETH_ALEN];
- uint8_t orig[ETH_ALEN];
- uint8_t uid;
- uint8_t align[3];
+ u8 packet_type;
+ u8 version;
+ u8 ttl;
+ u8 msg_type; /* see ICMP message types above */
+ u8 dst[ETH_ALEN];
+ u8 orig[ETH_ALEN];
+ u8 uid;
+ u8 align[3];
};
/**
@@ -266,15 +269,15 @@ struct batadv_icmp_header {
* @seqno: ICMP sequence number
*/
struct batadv_icmp_packet {
- uint8_t packet_type;
- uint8_t version;
- uint8_t ttl;
- uint8_t msg_type; /* see ICMP message types above */
- uint8_t dst[ETH_ALEN];
- uint8_t orig[ETH_ALEN];
- uint8_t uid;
- uint8_t reserved;
- __be16 seqno;
+ u8 packet_type;
+ u8 version;
+ u8 ttl;
+ u8 msg_type; /* see ICMP message types above */
+ u8 dst[ETH_ALEN];
+ u8 orig[ETH_ALEN];
+ u8 uid;
+ u8 reserved;
+ __be16 seqno;
};
#define BATADV_RR_LEN 16
@@ -293,16 +296,16 @@ struct batadv_icmp_packet {
* @rr: route record array
*/
struct batadv_icmp_packet_rr {
- uint8_t packet_type;
- uint8_t version;
- uint8_t ttl;
- uint8_t msg_type; /* see ICMP message types above */
- uint8_t dst[ETH_ALEN];
- uint8_t orig[ETH_ALEN];
- uint8_t uid;
- uint8_t rr_cur;
- __be16 seqno;
- uint8_t rr[BATADV_RR_LEN][ETH_ALEN];
+ u8 packet_type;
+ u8 version;
+ u8 ttl;
+ u8 msg_type; /* see ICMP message types above */
+ u8 dst[ETH_ALEN];
+ u8 orig[ETH_ALEN];
+ u8 uid;
+ u8 rr_cur;
+ __be16 seqno;
+ u8 rr[BATADV_RR_LEN][ETH_ALEN];
};
#define BATADV_ICMP_MAX_PACKET_SIZE sizeof(struct batadv_icmp_packet_rr)
@@ -328,11 +331,11 @@ struct batadv_icmp_packet_rr {
* @dest: originator destination of the unicast packet
*/
struct batadv_unicast_packet {
- uint8_t packet_type;
- uint8_t version;
- uint8_t ttl;
- uint8_t ttvn; /* destination translation table version number */
- uint8_t dest[ETH_ALEN];
+ u8 packet_type;
+ u8 version;
+ u8 ttl;
+ u8 ttvn; /* destination translation table version number */
+ u8 dest[ETH_ALEN];
/* "4 bytes boundary + 2 bytes" long to make the payload after the
* following ethernet header again 4 bytes boundary aligned
*/
@@ -346,9 +349,9 @@ struct batadv_unicast_packet {
*/
struct batadv_unicast_4addr_packet {
struct batadv_unicast_packet u;
- uint8_t src[ETH_ALEN];
- uint8_t subtype;
- uint8_t reserved;
+ u8 src[ETH_ALEN];
+ u8 subtype;
+ u8 reserved;
/* "4 bytes boundary + 2 bytes" long to make the payload after the
* following ethernet header again 4 bytes boundary aligned
*/
@@ -367,22 +370,22 @@ struct batadv_unicast_4addr_packet {
* @total_size: size of the merged packet
*/
struct batadv_frag_packet {
- uint8_t packet_type;
- uint8_t version; /* batman version field */
- uint8_t ttl;
+ u8 packet_type;
+ u8 version; /* batman version field */
+ u8 ttl;
#if defined(__BIG_ENDIAN_BITFIELD)
- uint8_t no:4;
- uint8_t reserved:4;
+ u8 no:4;
+ u8 reserved:4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- uint8_t reserved:4;
- uint8_t no:4;
+ u8 reserved:4;
+ u8 no:4;
#else
#error "unknown bitfield endianness"
#endif
- uint8_t dest[ETH_ALEN];
- uint8_t orig[ETH_ALEN];
- __be16 seqno;
- __be16 total_size;
+ u8 dest[ETH_ALEN];
+ u8 orig[ETH_ALEN];
+ __be16 seqno;
+ __be16 total_size;
};
/**
@@ -395,12 +398,12 @@ struct batadv_frag_packet {
* @orig: originator of the broadcast packet
*/
struct batadv_bcast_packet {
- uint8_t packet_type;
- uint8_t version; /* batman version field */
- uint8_t ttl;
- uint8_t reserved;
- __be32 seqno;
- uint8_t orig[ETH_ALEN];
+ u8 packet_type;
+ u8 version; /* batman version field */
+ u8 ttl;
+ u8 reserved;
+ __be32 seqno;
+ u8 orig[ETH_ALEN];
/* "4 bytes boundary + 2 bytes" long to make the payload after the
* following ethernet header again 4 bytes boundary aligned
*/
@@ -425,21 +428,21 @@ struct batadv_bcast_packet {
* @coded_len: length of network coded part of the payload
*/
struct batadv_coded_packet {
- uint8_t packet_type;
- uint8_t version; /* batman version field */
- uint8_t ttl;
- uint8_t first_ttvn;
- /* uint8_t first_dest[ETH_ALEN]; - saved in mac header destination */
- uint8_t first_source[ETH_ALEN];
- uint8_t first_orig_dest[ETH_ALEN];
- __be32 first_crc;
- uint8_t second_ttl;
- uint8_t second_ttvn;
- uint8_t second_dest[ETH_ALEN];
- uint8_t second_source[ETH_ALEN];
- uint8_t second_orig_dest[ETH_ALEN];
- __be32 second_crc;
- __be16 coded_len;
+ u8 packet_type;
+ u8 version; /* batman version field */
+ u8 ttl;
+ u8 first_ttvn;
+ /* u8 first_dest[ETH_ALEN]; - saved in mac header destination */
+ u8 first_source[ETH_ALEN];
+ u8 first_orig_dest[ETH_ALEN];
+ __be32 first_crc;
+ u8 second_ttl;
+ u8 second_ttvn;
+ u8 second_dest[ETH_ALEN];
+ u8 second_source[ETH_ALEN];
+ u8 second_orig_dest[ETH_ALEN];
+ __be32 second_crc;
+ __be16 coded_len;
};
#pragma pack()
@@ -456,14 +459,14 @@ struct batadv_coded_packet {
* @align: 2 bytes to align the header to a 4 byte boundary
*/
struct batadv_unicast_tvlv_packet {
- uint8_t packet_type;
- uint8_t version; /* batman version field */
- uint8_t ttl;
- uint8_t reserved;
- uint8_t dst[ETH_ALEN];
- uint8_t src[ETH_ALEN];
- __be16 tvlv_len;
- uint16_t align;
+ u8 packet_type;
+ u8 version; /* batman version field */
+ u8 ttl;
+ u8 reserved;
+ u8 dst[ETH_ALEN];
+ u8 src[ETH_ALEN];
+ __be16 tvlv_len;
+ u16 align;
};
/**
@@ -473,9 +476,9 @@ struct batadv_unicast_tvlv_packet {
* @len: tvlv container length
*/
struct batadv_tvlv_hdr {
- uint8_t type;
- uint8_t version;
- __be16 len;
+ u8 type;
+ u8 version;
+ __be16 len;
};
/**
@@ -497,9 +500,9 @@ struct batadv_tvlv_gateway_data {
* one batadv_tvlv_tt_vlan_data object per announced vlan
*/
struct batadv_tvlv_tt_data {
- uint8_t flags;
- uint8_t ttvn;
- __be16 num_vlan;
+ u8 flags;
+ u8 ttvn;
+ __be16 num_vlan;
};
/**
@@ -510,9 +513,9 @@ struct batadv_tvlv_tt_data {
* @reserved: unused, useful for alignment purposes
*/
struct batadv_tvlv_tt_vlan_data {
- __be32 crc;
- __be16 vid;
- uint16_t reserved;
+ __be32 crc;
+ __be16 vid;
+ u16 reserved;
};
/**
@@ -524,9 +527,9 @@ struct batadv_tvlv_tt_vlan_data {
* @vid: VLAN identifier
*/
struct batadv_tvlv_tt_change {
- uint8_t flags;
- uint8_t reserved[3];
- uint8_t addr[ETH_ALEN];
+ u8 flags;
+ u8 reserved[3];
+ u8 addr[ETH_ALEN];
__be16 vid;
};
@@ -536,7 +539,7 @@ struct batadv_tvlv_tt_change {
* @vid: VLAN identifier
*/
struct batadv_tvlv_roam_adv {
- uint8_t client[ETH_ALEN];
+ u8 client[ETH_ALEN];
__be16 vid;
};
@@ -546,8 +549,8 @@ struct batadv_tvlv_roam_adv {
* @reserved: reserved field
*/
struct batadv_tvlv_mcast_data {
- uint8_t flags;
- uint8_t reserved[3];
+ u8 flags;
+ u8 reserved[3];
};
#endif /* _NET_BATMAN_ADV_PACKET_H_ */
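The packet.h hunks are a pure spelling change from uint8_t/uint16_t to the kernel's u8/u16; field widths and therefore the on-wire layout stay identical. A compile-time check of that claim might look like the sketch below - the expected value 10 is derived from the batadv_unicast_packet field list above (four single-byte fields plus one 6-byte address), not quoted from the tree:

#include <linux/bug.h>

static inline void check_unicast_packet_layout(void)
{
	/* breaks the build if the type rename ever changed the wire format */
	BUILD_BUG_ON(sizeof(struct batadv_unicast_packet) != 10);
}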
diff --git a/kernel/net/batman-adv/routing.c b/kernel/net/batman-adv/routing.c
index da83982bf..3207667e6 100644
--- a/kernel/net/batman-adv/routing.c
+++ b/kernel/net/batman-adv/routing.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -15,20 +15,36 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
#include "routing.h"
-#include "send.h"
-#include "soft-interface.h"
-#include "hard-interface.h"
-#include "icmp_socket.h"
-#include "translation-table.h"
-#include "originator.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+
+#include "bitarray.h"
#include "bridge_loop_avoidance.h"
#include "distributed-arp-table.h"
-#include "network-coding.h"
#include "fragmentation.h"
-
-#include <linux/if_vlan.h>
+#include "hard-interface.h"
+#include "icmp_socket.h"
+#include "network-coding.h"
+#include "originator.h"
+#include "packet.h"
+#include "send.h"
+#include "soft-interface.h"
+#include "translation-table.h"
static int batadv_route_unicast_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if);
@@ -129,7 +145,7 @@ out:
* 0 if the packet is to be accepted
* 1 if the packet is to be ignored.
*/
-int batadv_window_protected(struct batadv_priv *bat_priv, int32_t seq_num_diff,
+int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
unsigned long *last_reset)
{
if (seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE ||
@@ -637,19 +653,19 @@ out:
static bool
batadv_reroute_unicast_packet(struct batadv_priv *bat_priv,
struct batadv_unicast_packet *unicast_packet,
- uint8_t *dst_addr, unsigned short vid)
+ u8 *dst_addr, unsigned short vid)
{
struct batadv_orig_node *orig_node = NULL;
struct batadv_hard_iface *primary_if = NULL;
bool ret = false;
- uint8_t *orig_addr, orig_ttvn;
+ u8 *orig_addr, orig_ttvn;
if (batadv_is_my_client(bat_priv, dst_addr, vid)) {
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
goto out;
orig_addr = primary_if->net_dev->dev_addr;
- orig_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn);
+ orig_ttvn = (u8)atomic_read(&bat_priv->tt.vn);
} else {
orig_node = batadv_transtable_search(bat_priv, NULL, dst_addr,
vid);
@@ -660,7 +676,7 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv,
goto out;
orig_addr = orig_node->orig;
- orig_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn);
+ orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn);
}
/* update the packet header */
@@ -682,7 +698,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
struct batadv_unicast_packet *unicast_packet;
struct batadv_hard_iface *primary_if;
struct batadv_orig_node *orig_node;
- uint8_t curr_ttvn, old_ttvn;
+ u8 curr_ttvn, old_ttvn;
struct ethhdr *ethhdr;
unsigned short vid;
int is_old_ttvn;
@@ -724,7 +740,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
* value is used later to check if the node which sent (or re-routed
* last time) the packet had an updated information or not
*/
- curr_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn);
+ curr_ttvn = (u8)atomic_read(&bat_priv->tt.vn);
if (!batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
orig_node = batadv_orig_hash_find(bat_priv,
unicast_packet->dest);
@@ -735,7 +751,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
if (!orig_node)
return 0;
- curr_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn);
+ curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn);
batadv_orig_node_free_ref(orig_node);
}
@@ -817,9 +833,10 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
struct batadv_unicast_packet *unicast_packet;
struct batadv_unicast_4addr_packet *unicast_4addr_packet;
- uint8_t *orig_addr;
+ u8 *orig_addr;
struct batadv_orig_node *orig_node = NULL;
int check, hdr_size = sizeof(*unicast_packet);
+ enum batadv_subtype subtype;
bool is4addr;
unicast_packet = (struct batadv_unicast_packet *)skb->data;
@@ -847,10 +864,20 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
/* packet for me */
if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
if (is4addr) {
- batadv_dat_inc_counter(bat_priv,
- unicast_4addr_packet->subtype);
- orig_addr = unicast_4addr_packet->src;
- orig_node = batadv_orig_hash_find(bat_priv, orig_addr);
+ subtype = unicast_4addr_packet->subtype;
+ batadv_dat_inc_counter(bat_priv, subtype);
+
+ /* Only payload data should be considered for speedy
+ * join. For example, DAT also uses unicast 4addr
+ * types, but those packets should not be considered
+ * for speedy join, since the clients do not actually
+ * reside at the sending originator.
+ */
+ if (subtype == BATADV_P_DATA) {
+ orig_addr = unicast_4addr_packet->src;
+ orig_node = batadv_orig_hash_find(bat_priv,
+ orig_addr);
+ }
}
if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb,
@@ -888,7 +915,7 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
unsigned char *tvlv_buff;
- uint16_t tvlv_buff_len;
+ u16 tvlv_buff_len;
int hdr_size = sizeof(*unicast_tvlv_packet);
int ret = NET_RX_DROP;
@@ -991,8 +1018,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
struct ethhdr *ethhdr;
int hdr_size = sizeof(*bcast_packet);
int ret = NET_RX_DROP;
- int32_t seq_diff;
- uint32_t seqno;
+ s32 seq_diff;
+ u32 seqno;
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
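batadv_recv_bcast_packet above keeps the sequence-number delta in an s32 (seq_diff) while the counters themselves stay u32: interpreting the unsigned difference as signed is what lets the window check in batadv_window_protected() handle wrap-around. A small sketch of that idiom; seqno_is_newer is an illustrative helper, not a batman-adv function, and like the kernel code it relies on two's-complement wrap-around:

#include <linux/types.h>

static bool seqno_is_newer(u32 seqno, u32 last_seqno)
{
	s32 diff = seqno - last_seqno;	/* wraps cleanly across 0xffffffff */

	return diff > 0;
}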
diff --git a/kernel/net/batman-adv/routing.h b/kernel/net/batman-adv/routing.h
index 557d3d12a..204bbe495 100644
--- a/kernel/net/batman-adv/routing.h
+++ b/kernel/net/batman-adv/routing.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -18,6 +18,12 @@
#ifndef _NET_BATMAN_ADV_ROUTING_H_
#define _NET_BATMAN_ADV_ROUTING_H_
+#include "main.h"
+
+#include <linux/types.h>
+
+struct sk_buff;
+
bool batadv_check_management_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
int header_len);
@@ -45,7 +51,7 @@ struct batadv_neigh_node *
batadv_find_router(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
struct batadv_hard_iface *recv_if);
-int batadv_window_protected(struct batadv_priv *bat_priv, int32_t seq_num_diff,
+int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
unsigned long *last_reset);
#endif /* _NET_BATMAN_ADV_ROUTING_H_ */
diff --git a/kernel/net/batman-adv/send.c b/kernel/net/batman-adv/send.c
index 3d64ed20c..f66432480 100644
--- a/kernel/net/batman-adv/send.c
+++ b/kernel/net/batman-adv/send.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -15,19 +15,37 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
+#include "send.h"
#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/if.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/workqueue.h>
+
#include "distributed-arp-table.h"
-#include "send.h"
-#include "routing.h"
-#include "translation-table.h"
-#include "soft-interface.h"
-#include "hard-interface.h"
-#include "gateway_common.h"
+#include "fragmentation.h"
#include "gateway_client.h"
-#include "originator.h"
+#include "hard-interface.h"
#include "network-coding.h"
-#include "fragmentation.h"
-#include "multicast.h"
+#include "originator.h"
+#include "routing.h"
+#include "soft-interface.h"
+#include "translation-table.h"
static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
@@ -36,7 +54,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
*/
int batadv_send_skb_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
- const uint8_t *dst_addr)
+ const u8 *dst_addr)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
struct ethhdr *ethhdr;
@@ -154,7 +172,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size,
struct batadv_orig_node *orig_node)
{
struct batadv_unicast_packet *unicast_packet;
- uint8_t ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn);
+ u8 ttvn = (u8)atomic_read(&orig_node->last_ttvn);
if (batadv_skb_head_push(skb, hdr_size) < 0)
return false;
@@ -255,8 +273,8 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
unsigned short vid)
{
- struct ethhdr *ethhdr;
struct batadv_unicast_packet *unicast_packet;
+ struct ethhdr *ethhdr;
int ret = NET_XMIT_DROP;
if (!orig_node)
@@ -325,12 +343,12 @@ out:
*/
int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
struct sk_buff *skb, int packet_type,
- int packet_subtype, uint8_t *dst_hint,
+ int packet_subtype, u8 *dst_hint,
unsigned short vid)
{
struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
struct batadv_orig_node *orig_node;
- uint8_t *src, *dst;
+ u8 *src, *dst;
src = ethhdr->h_source;
dst = ethhdr->h_dest;
@@ -598,7 +616,8 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
* we delete only packets belonging to the given interface
*/
if ((hard_iface) &&
- (forw_packet->if_incoming != hard_iface))
+ (forw_packet->if_incoming != hard_iface) &&
+ (forw_packet->if_outgoing != hard_iface))
continue;
spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
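The purge hunk above tightens the skip condition: when a specific hard_iface is passed in, a queued broadcast is now purged if it matches either the incoming or the outgoing interface, not just the incoming one. The predicate, pulled out as a hypothetical helper for readability (not a function in send.c):

static bool forw_packet_matches_iface(const struct batadv_forw_packet *forw_packet,
				      const struct batadv_hard_iface *hard_iface)
{
	if (!hard_iface)
		return true;	/* NULL means "purge everything" */

	return forw_packet->if_incoming == hard_iface ||
	       forw_packet->if_outgoing == hard_iface;
}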
diff --git a/kernel/net/batman-adv/send.h b/kernel/net/batman-adv/send.h
index 38d0ec183..82059f259 100644
--- a/kernel/net/batman-adv/send.h
+++ b/kernel/net/batman-adv/send.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -18,9 +18,19 @@
#ifndef _NET_BATMAN_ADV_SEND_H_
#define _NET_BATMAN_ADV_SEND_H_
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+#include "packet.h"
+
+struct sk_buff;
+struct work_struct;
+
int batadv_send_skb_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
- const uint8_t *dst_addr);
+ const u8 *dst_addr);
int batadv_send_skb_to_orig(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
struct batadv_hard_iface *recv_if);
@@ -43,7 +53,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
unsigned short vid);
int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
struct sk_buff *skb, int packet_type,
- int packet_subtype, uint8_t *dst_hint,
+ int packet_subtype, u8 *dst_hint,
unsigned short vid);
int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
unsigned short vid);
@@ -62,7 +72,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
* Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
*/
static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
- struct sk_buff *skb, uint8_t *dst_hint,
+ struct sk_buff *skb, u8 *dst_hint,
unsigned short vid)
{
return batadv_send_skb_via_tt_generic(bat_priv, skb, BATADV_UNICAST, 0,
@@ -87,7 +97,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv,
struct sk_buff *skb,
int packet_subtype,
- uint8_t *dst_hint,
+ u8 *dst_hint,
unsigned short vid)
{
return batadv_send_skb_via_tt_generic(bat_priv, skb,
diff --git a/kernel/net/batman-adv/soft-interface.c b/kernel/net/batman-adv/soft-interface.c
index 5ec31d7de..ac4d08de5 100644
--- a/kernel/net/batman-adv/soft-interface.c
+++ b/kernel/net/batman-adv/soft-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -15,26 +15,50 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
#include "soft-interface.h"
-#include "hard-interface.h"
-#include "distributed-arp-table.h"
-#include "routing.h"
-#include "send.h"
-#include "debugfs.h"
-#include "translation-table.h"
-#include "hash.h"
-#include "gateway_common.h"
-#include "gateway_client.h"
-#include "sysfs.h"
-#include "originator.h"
-#include <linux/slab.h>
-#include <linux/ethtool.h>
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
#include <linux/if_vlan.h>
-#include "multicast.h"
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
#include "bridge_loop_avoidance.h"
+#include "debugfs.h"
+#include "distributed-arp-table.h"
+#include "gateway_client.h"
+#include "gateway_common.h"
+#include "hard-interface.h"
+#include "multicast.h"
#include "network-coding.h"
+#include "packet.h"
+#include "send.h"
+#include "sysfs.h"
+#include "translation-table.h"
static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd);
static void batadv_get_drvinfo(struct net_device *dev,
@@ -105,8 +129,9 @@ static struct net_device_stats *batadv_interface_stats(struct net_device *dev)
static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
{
struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_softif_vlan *vlan;
struct sockaddr *addr = p;
- uint8_t old_addr[ETH_ALEN];
+ u8 old_addr[ETH_ALEN];
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
@@ -115,12 +140,17 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
ether_addr_copy(dev->dev_addr, addr->sa_data);
/* only modify transtable if it has been initialized before */
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE) {
- batadv_tt_local_remove(bat_priv, old_addr, BATADV_NO_FLAGS,
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ return 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ batadv_tt_local_remove(bat_priv, old_addr, vlan->vid,
"mac address changed", false);
- batadv_tt_local_add(dev, addr->sa_data, BATADV_NO_FLAGS,
+ batadv_tt_local_add(dev, addr->sa_data, vlan->vid,
BATADV_NULL_IFINDEX, BATADV_NO_MARK);
}
+ rcu_read_unlock();
return 0;
}
@@ -156,22 +186,23 @@ static int batadv_interface_tx(struct sk_buff *skb,
struct batadv_hard_iface *primary_if = NULL;
struct batadv_bcast_packet *bcast_packet;
__be16 ethertype = htons(ETH_P_BATMAN);
- static const uint8_t stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00,
- 0x00, 0x00};
- static const uint8_t ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00,
- 0x00, 0x00};
+ static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00,
+ 0x00, 0x00};
+ static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00,
+ 0x00, 0x00};
enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO;
- uint8_t *dst_hint = NULL, chaddr[ETH_ALEN];
+ u8 *dst_hint = NULL, chaddr[ETH_ALEN];
struct vlan_ethhdr *vhdr;
unsigned int header_len = 0;
int data_len = skb->len, ret;
unsigned long brd_delay = 1;
bool do_bcast = false, client_added;
unsigned short vid;
- uint32_t seqno;
+ u32 seqno;
int gw_mode;
enum batadv_forw_mode forw_mode;
struct batadv_orig_node *mcast_single_orig = NULL;
+ int network_offset = ETH_HLEN;
if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
goto dropped;
@@ -184,14 +215,18 @@ static int batadv_interface_tx(struct sk_buff *skb,
case ETH_P_8021Q:
vhdr = vlan_eth_hdr(skb);
- if (vhdr->h_vlan_encapsulated_proto != ethertype)
+ if (vhdr->h_vlan_encapsulated_proto != ethertype) {
+ network_offset += VLAN_HLEN;
break;
+ }
/* fall through */
case ETH_P_BATMAN:
goto dropped;
}
+ skb_set_network_header(skb, network_offset);
+
if (batadv_bla_tx(bat_priv, skb, vid))
goto dropped;
@@ -449,6 +484,9 @@ out:
*/
void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan)
{
+ if (!vlan)
+ return;
+
if (atomic_dec_and_test(&vlan->refcount)) {
spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock);
hlist_del_rcu(&vlan->list);
@@ -712,9 +750,9 @@ static void batadv_softif_destroy_finish(struct work_struct *work)
static int batadv_softif_init_late(struct net_device *dev)
{
struct batadv_priv *bat_priv;
- uint32_t random_seqno;
+ u32 random_seqno;
int ret;
- size_t cnt_len = sizeof(uint64_t) * BATADV_CNT_NUM;
+ size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM;
batadv_set_lockdep_class(dev);
@@ -725,14 +763,14 @@ static int batadv_softif_init_late(struct net_device *dev)
/* batadv_interface_stats() needs to be available as soon as
* register_netdevice() has been called
*/
- bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(uint64_t));
+ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64));
if (!bat_priv->bat_counters)
return -ENOMEM;
atomic_set(&bat_priv->aggregated_ogms, 1);
atomic_set(&bat_priv->bonding, 0);
#ifdef CONFIG_BATMAN_ADV_BLA
- atomic_set(&bat_priv->bridge_loop_avoidance, 0);
+ atomic_set(&bat_priv->bridge_loop_avoidance, 1);
#endif
#ifdef CONFIG_BATMAN_ADV_DAT
atomic_set(&bat_priv->distributed_arp_table, 1);
@@ -818,7 +856,7 @@ static int batadv_softif_slave_add(struct net_device *dev,
int ret = -EINVAL;
hard_iface = batadv_hardif_get_by_netdev(slave_dev);
- if (!hard_iface || hard_iface->soft_iface != NULL)
+ if (!hard_iface || hard_iface->soft_iface)
goto out;
ret = batadv_hardif_enable_interface(hard_iface, dev->name);
@@ -903,14 +941,12 @@ static void batadv_softif_init_early(struct net_device *dev)
dev->netdev_ops = &batadv_netdev_ops;
dev->destructor = batadv_softif_free;
dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
- dev->tx_queue_len = 0;
+ dev->priv_flags |= IFF_NO_QUEUE;
/* can't call min_mtu, because the needed variables
* have not been initialized yet
*/
dev->mtu = ETH_DATA_LEN;
- /* reserve more space in the skbuff for our header */
- dev->hard_header_len = batadv_max_header_len();
/* generate random address */
eth_hw_addr_random(dev);
@@ -1079,8 +1115,7 @@ static const struct {
#endif
};
-static void batadv_get_strings(struct net_device *dev, uint32_t stringset,
- uint8_t *data)
+static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
if (stringset == ETH_SS_STATS)
memcpy(data, batadv_counters_strings,
@@ -1088,8 +1123,7 @@ static void batadv_get_strings(struct net_device *dev, uint32_t stringset,
}
static void batadv_get_ethtool_stats(struct net_device *dev,
- struct ethtool_stats *stats,
- uint64_t *data)
+ struct ethtool_stats *stats, u64 *data)
{
struct batadv_priv *bat_priv = netdev_priv(dev);
int i;
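The set_mac_addr hunk above walks bat_priv->softif_vlan_list under RCU so the local TT entry is migrated on every configured VLAN instead of only the untagged one. The locking pattern it relies on, reduced to a generic sketch (struct item, the list head and process() are placeholders):

#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct item {
	struct hlist_node list;
	int value;
};

static void walk_items(struct hlist_head *head)
{
	struct item *it;

	rcu_read_lock();			/* readers never block writers */
	hlist_for_each_entry_rcu(it, head, list)
		process(it);			/* placeholder for per-entry work */
	rcu_read_unlock();
}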
diff --git a/kernel/net/batman-adv/soft-interface.h b/kernel/net/batman-adv/soft-interface.h
index dbab22fd8..8e82176f4 100644
--- a/kernel/net/batman-adv/soft-interface.h
+++ b/kernel/net/batman-adv/soft-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -18,6 +18,13 @@
#ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_
#define _NET_BATMAN_ADV_SOFT_INTERFACE_H_
+#include "main.h"
+
+#include <net/rtnetlink.h>
+
+struct net_device;
+struct sk_buff;
+
int batadv_skb_head_push(struct sk_buff *skb, unsigned int len);
void batadv_interface_rx(struct net_device *soft_iface,
struct sk_buff *skb, struct batadv_hard_iface *recv_if,
diff --git a/kernel/net/batman-adv/sysfs.c b/kernel/net/batman-adv/sysfs.c
index a75dc12f9..9de3c8804 100644
--- a/kernel/net/batman-adv/sysfs.c
+++ b/kernel/net/batman-adv/sysfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -15,16 +15,35 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
#include "sysfs.h"
-#include "translation-table.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/stringify.h>
+
#include "distributed-arp-table.h"
-#include "network-coding.h"
-#include "originator.h"
+#include "gateway_client.h"
+#include "gateway_common.h"
#include "hard-interface.h"
+#include "network-coding.h"
+#include "packet.h"
#include "soft-interface.h"
-#include "gateway_common.h"
-#include "gateway_client.h"
static struct net_device *batadv_kobj_to_netdev(struct kobject *obj)
{
@@ -151,7 +170,7 @@ ssize_t batadv_show_##_name(struct kobject *kobj, \
static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
batadv_store_##_name)
-#define BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func) \
+#define BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func) \
ssize_t batadv_store_##_name(struct kobject *kobj, \
struct attribute *attr, char *buff, \
size_t count) \
@@ -161,24 +180,24 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \
\
return __batadv_store_uint_attr(buff, count, _min, _max, \
_post_func, attr, \
- &bat_priv->_name, net_dev); \
+ &bat_priv->_var, net_dev); \
}
-#define BATADV_ATTR_SIF_SHOW_UINT(_name) \
+#define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \
ssize_t batadv_show_##_name(struct kobject *kobj, \
struct attribute *attr, char *buff) \
{ \
struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \
\
- return sprintf(buff, "%i\n", atomic_read(&bat_priv->_name)); \
+ return sprintf(buff, "%i\n", atomic_read(&bat_priv->_var)); \
} \
/* Use this, if you are going to set [name] in the soft-interface
* (bat_priv) to an unsigned integer value
*/
-#define BATADV_ATTR_SIF_UINT(_name, _mode, _min, _max, _post_func) \
- static BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func)\
- static BATADV_ATTR_SIF_SHOW_UINT(_name) \
+#define BATADV_ATTR_SIF_UINT(_name, _var, _mode, _min, _max, _post_func)\
+ static BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func)\
+ static BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \
static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
batadv_store_##_name)
@@ -438,7 +457,7 @@ static ssize_t batadv_show_gw_bwidth(struct kobject *kobj,
struct attribute *attr, char *buff)
{
struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
- uint32_t down, up;
+ u32 down, up;
down = atomic_read(&bat_priv->gw.bandwidth_down);
up = atomic_read(&bat_priv->gw.bandwidth_up);
@@ -493,7 +512,7 @@ static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
{
struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
struct batadv_priv *bat_priv = netdev_priv(net_dev);
- uint32_t mark, mask;
+ u32 mark, mask;
char *mask_ptr;
/* parse the mask if it has been specified, otherwise assume the mask is
@@ -540,19 +559,20 @@ BATADV_ATTR_SIF_BOOL(fragmentation, S_IRUGO | S_IWUSR, batadv_update_min_mtu);
static BATADV_ATTR(routing_algo, S_IRUGO, batadv_show_bat_algo, NULL);
static BATADV_ATTR(gw_mode, S_IRUGO | S_IWUSR, batadv_show_gw_mode,
batadv_store_gw_mode);
-BATADV_ATTR_SIF_UINT(orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER,
- INT_MAX, NULL);
-BATADV_ATTR_SIF_UINT(hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE,
- NULL);
-BATADV_ATTR_SIF_UINT(gw_sel_class, S_IRUGO | S_IWUSR, 1, BATADV_TQ_MAX_VALUE,
- batadv_post_gw_reselect);
+BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR,
+ 2 * BATADV_JITTER, INT_MAX, NULL);
+BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0,
+ BATADV_TQ_MAX_VALUE, NULL);
+BATADV_ATTR_SIF_UINT(gw_sel_class, gw_sel_class, S_IRUGO | S_IWUSR, 1,
+ BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect);
static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth,
batadv_store_gw_bwidth);
#ifdef CONFIG_BATMAN_ADV_MCAST
BATADV_ATTR_SIF_BOOL(multicast_mode, S_IRUGO | S_IWUSR, NULL);
#endif
#ifdef CONFIG_BATMAN_ADV_DEBUG
-BATADV_ATTR_SIF_UINT(log_level, S_IRUGO | S_IWUSR, 0, BATADV_DBG_ALL, NULL);
+BATADV_ATTR_SIF_UINT(log_level, log_level, S_IRUGO | S_IWUSR, 0,
+ BATADV_DBG_ALL, NULL);
#endif
#ifdef CONFIG_BATMAN_ADV_NC
BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR,
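The sysfs macro rework above splits the single _name parameter into _name (the sysfs file plus the show/store symbol names) and _var (the atomic_t member inside bat_priv), so the two no longer have to coincide. A hypothetical use of the decoupling - the file name ogm_interval below is invented; every existing caller in this patch simply passes the same token twice:

BATADV_ATTR_SIF_UINT(ogm_interval, orig_interval, S_IRUGO | S_IWUSR,
		     2 * BATADV_JITTER, INT_MAX, NULL);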
diff --git a/kernel/net/batman-adv/sysfs.h b/kernel/net/batman-adv/sysfs.h
index b715b60db..61974428a 100644
--- a/kernel/net/batman-adv/sysfs.h
+++ b/kernel/net/batman-adv/sysfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -18,6 +18,14 @@
#ifndef _NET_BATMAN_ADV_SYSFS_H_
#define _NET_BATMAN_ADV_SYSFS_H_
+#include "main.h"
+
+#include <linux/sysfs.h>
+#include <linux/types.h>
+
+struct kobject;
+struct net_device;
+
#define BATADV_SYSFS_IF_MESH_SUBDIR "mesh"
#define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv"
/**
diff --git a/kernel/net/batman-adv/translation-table.c b/kernel/net/batman-adv/translation-table.c
index 07b263a43..83b0ca27a 100644
--- a/kernel/net/batman-adv/translation-table.c
+++ b/kernel/net/batman-adv/translation-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
*
@@ -15,24 +15,48 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "main.h"
#include "translation-table.h"
-#include "soft-interface.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
+#include <linux/crc32c.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
+#include <net/net_namespace.h>
+
+#include "bridge_loop_avoidance.h"
#include "hard-interface.h"
-#include "send.h"
#include "hash.h"
-#include "originator.h"
-#include "routing.h"
-#include "bridge_loop_avoidance.h"
#include "multicast.h"
-
-#include <linux/crc32c.h>
+#include "originator.h"
+#include "packet.h"
+#include "soft-interface.h"
/* hash class keys */
static struct lock_class_key batadv_tt_local_hash_lock_class_key;
static struct lock_class_key batadv_tt_global_hash_lock_class_key;
-static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client,
+static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client,
unsigned short vid,
struct batadv_orig_node *orig_node);
static void batadv_tt_purge(struct work_struct *work);
@@ -44,13 +68,15 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv,
unsigned short vid, const char *message,
bool roaming);
-/* returns 1 if they are the same mac addr */
+/* returns 1 if they are the same mac addr and vid */
static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
{
const void *data1 = container_of(node, struct batadv_tt_common_entry,
hash_entry);
+ const struct batadv_tt_common_entry *tt1 = data1;
+ const struct batadv_tt_common_entry *tt2 = data2;
- return batadv_compare_eth(data1, data2);
+ return (tt1->vid == tt2->vid) && batadv_compare_eth(data1, data2);
}
/**
@@ -61,18 +87,14 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
* Returns the hash index where the object represented by 'data' should be
* stored at.
*/
-static inline uint32_t batadv_choose_tt(const void *data, uint32_t size)
+static inline u32 batadv_choose_tt(const void *data, u32 size)
{
struct batadv_tt_common_entry *tt;
- uint32_t hash = 0;
+ u32 hash = 0;
tt = (struct batadv_tt_common_entry *)data;
- hash = batadv_hash_bytes(hash, &tt->addr, ETH_ALEN);
- hash = batadv_hash_bytes(hash, &tt->vid, sizeof(tt->vid));
-
- hash += (hash << 3);
- hash ^= (hash >> 11);
- hash += (hash << 15);
+ hash = jhash(&tt->addr, ETH_ALEN, hash);
+ hash = jhash(&tt->vid, sizeof(tt->vid), hash);
return hash % size;
}
@@ -87,12 +109,12 @@ static inline uint32_t batadv_choose_tt(const void *data, uint32_t size)
* found, NULL otherwise.
*/
static struct batadv_tt_common_entry *
-batadv_tt_hash_find(struct batadv_hashtable *hash, const uint8_t *addr,
+batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr,
unsigned short vid)
{
struct hlist_head *head;
struct batadv_tt_common_entry to_search, *tt, *tt_tmp = NULL;
- uint32_t index;
+ u32 index;
if (!hash)
return NULL;
@@ -132,7 +154,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const uint8_t *addr,
* found, NULL otherwise.
*/
static struct batadv_tt_local_entry *
-batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr,
+batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
unsigned short vid)
{
struct batadv_tt_common_entry *tt_common_entry;
@@ -157,7 +179,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr,
* is found, NULL otherwise.
*/
static struct batadv_tt_global_entry *
-batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr,
+batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
unsigned short vid)
{
struct batadv_tt_common_entry *tt_common_entry;
@@ -203,7 +225,7 @@ batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry)
* (excluding ourself).
*/
int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
- const uint8_t *addr, unsigned short vid)
+ const u8 *addr, unsigned short vid)
{
struct batadv_tt_global_entry *tt_global_entry;
int count;
@@ -218,20 +240,6 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
return count;
}
-static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu)
-{
- struct batadv_tt_orig_list_entry *orig_entry;
-
- orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu);
-
- /* We are in an rcu callback here, therefore we cannot use
- * batadv_orig_node_free_ref() and its call_rcu():
- * An rcu_barrier() wouldn't wait for that to finish
- */
- batadv_orig_node_free_ref_now(orig_entry->orig_node);
- kfree(orig_entry);
-}
-
/**
* batadv_tt_local_size_mod - change the size by v of the local table identified
* by vid
@@ -295,7 +303,7 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node,
if (atomic_add_return(v, &vlan->tt.num_entries) == 0) {
spin_lock_bh(&orig_node->vlan_list_lock);
- list_del_rcu(&vlan->list);
+ hlist_del_init_rcu(&vlan->list);
spin_unlock_bh(&orig_node->vlan_list_lock);
batadv_orig_node_vlan_free_ref(vlan);
}
@@ -327,13 +335,25 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node,
batadv_tt_global_size_mod(orig_node, vid, -1);
}
+/**
+ * batadv_tt_orig_list_entry_release - release tt orig entry from lists and
+ * queue it for freeing after the RCU grace period
+ * @orig_entry: tt orig entry to be freed
+ */
+static void
+batadv_tt_orig_list_entry_release(struct batadv_tt_orig_list_entry *orig_entry)
+{
+ batadv_orig_node_free_ref(orig_entry->orig_node);
+ kfree_rcu(orig_entry, rcu);
+}
+
static void
batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry)
{
if (!atomic_dec_and_test(&orig_entry->refcount))
return;
- call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu);
+ batadv_tt_orig_list_entry_release(orig_entry);
}
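Because the orig_node reference is now dropped before the grace period rather than inside an RCU callback, the dedicated call_rcu() handler removed above is no longer needed and the final free can be deferred with kfree_rcu(). A reduced sketch of the refcount/release split, with names invented for illustration:

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct orig_list_entry {
	atomic_t refcount;
	struct rcu_head rcu;
	/* payload omitted */
};

/* Release runs in process context: drop any other references here
 * synchronously, then let kfree_rcu() defer only the memory free until
 * after the RCU grace period (no custom callback needed).
 */
static void orig_list_entry_release(struct orig_list_entry *entry)
{
	kfree_rcu(entry, rcu);
}

static void orig_list_entry_put(struct orig_list_entry *entry)
{
	if (atomic_dec_and_test(&entry->refcount))
		orig_list_entry_release(entry);
}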
/**
@@ -344,11 +364,11 @@ batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry)
*/
static void batadv_tt_local_event(struct batadv_priv *bat_priv,
struct batadv_tt_local_entry *tt_local_entry,
- uint8_t event_flags)
+ u8 event_flags)
{
struct batadv_tt_change_node *tt_change_node, *entry, *safe;
struct batadv_tt_common_entry *common = &tt_local_entry->common;
- uint8_t flags = common->flags | event_flags;
+ u8 flags = common->flags | event_flags;
bool event_removed = false;
bool del_op_requested, del_op_entry;
@@ -428,7 +448,7 @@ static int batadv_tt_len(int changes_num)
*
* Returns the number of entries.
*/
-static uint16_t batadv_tt_entries(uint16_t tt_len)
+static u16 batadv_tt_entries(u16 tt_len)
{
return tt_len / batadv_tt_len(1);
}
@@ -442,7 +462,8 @@ static uint16_t batadv_tt_entries(uint16_t tt_len)
*/
static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv)
{
- uint16_t num_vlan = 0, tt_local_entries = 0;
+ u16 num_vlan = 0;
+ u16 tt_local_entries = 0;
struct batadv_softif_vlan *vlan;
int hdr_size;
@@ -505,8 +526,8 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv,
*
* Returns true if the client was successfully added, false otherwise.
*/
-bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr,
- unsigned short vid, int ifindex, uint32_t mark)
+bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
+ unsigned short vid, int ifindex, u32 mark)
{
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
struct batadv_tt_local_entry *tt_local;
@@ -516,9 +537,10 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr,
struct hlist_head *head;
struct batadv_tt_orig_list_entry *orig_entry;
int hash_added, table_size, packet_size_max;
- bool ret = false, roamed_back = false;
- uint8_t remote_flags;
- uint32_t match_mark;
+ bool ret = false;
+ bool roamed_back = false;
+ u8 remote_flags;
+ u32 match_mark;
if (ifindex != BATADV_NULL_IFINDEX)
in_dev = dev_get_by_index(&init_net, ifindex);
@@ -575,11 +597,17 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr,
/* increase the refcounter of the related vlan */
vlan = batadv_softif_vlan_get(bat_priv, vid);
+ if (WARN(!vlan, "adding TT local entry %pM to non-existent VLAN %d",
+ addr, BATADV_PRINT_VID(vid))) {
+ kfree(tt_local);
+ tt_local = NULL;
+ goto out;
+ }
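WARN() evaluates to the tested condition, so a single if can emit the one-time backtrace and take the error path, as the added VLAN check above does. A self-contained sketch of that idiom; all names and the stubbed lookup are illustrative, not batman-adv API:

#include <linux/bug.h>
#include <linux/slab.h>

struct vlan_state {
	int vid;
};

struct client {
	int vid;
};

/* Stub lookup: pretend the VLAN was never registered. */
static struct vlan_state *vlan_state_lookup(int vid)
{
	return NULL;
}

static struct client *client_attach(int vid)
{
	struct client *client = kzalloc(sizeof(*client), GFP_ATOMIC);
	struct vlan_state *vlan;

	if (!client)
		return NULL;

	vlan = vlan_state_lookup(vid);
	/* WARN() returns true when the condition holds, so the splat and
	 * the cleanup share one branch.
	 */
	if (WARN(!vlan, "attaching client to non-existent VLAN %d", vid)) {
		kfree(client);
		return NULL;
	}

	client->vid = vid;
	return client;
}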
batadv_dbg(BATADV_DBG_TT, bat_priv,
"Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n",
addr, BATADV_PRINT_VID(vid),
- (uint8_t)atomic_read(&bat_priv->tt.vn));
+ (u8)atomic_read(&bat_priv->tt.vn));
ether_addr_copy(tt_local->common.addr, addr);
 /* The local entry has to be marked as NEW to avoid sending it in
@@ -698,19 +726,22 @@ out:
*
* Return the size of the allocated buffer or 0 in case of failure.
*/
-static uint16_t
+static u16
batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
struct batadv_tvlv_tt_data **tt_data,
struct batadv_tvlv_tt_change **tt_change,
- int32_t *tt_len)
+ s32 *tt_len)
{
- uint16_t num_vlan = 0, num_entries = 0, change_offset, tvlv_len;
+ u16 num_vlan = 0;
+ u16 num_entries = 0;
+ u16 change_offset;
+ u16 tvlv_len;
struct batadv_tvlv_tt_vlan_data *tt_vlan;
struct batadv_orig_node_vlan *vlan;
- uint8_t *tt_change_ptr;
+ u8 *tt_change_ptr;
rcu_read_lock();
- list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
num_vlan++;
num_entries += atomic_read(&vlan->tt.num_entries);
}
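orig_node->vlan_list is converted from a list_head to an hlist_head, so RCU read-side walks now use hlist_for_each_entry_rcu() under rcu_read_lock(), as in the counting loop above. A hedged sketch of that read-side pattern with invented structure names:

#include <linux/atomic.h>
#include <linux/rculist.h>

struct vlan_entry {
	struct hlist_node list;
	atomic_t num_entries;
};

/* Sum per-VLAN entry counts. The hlist is RCU protected, so readers only
 * need rcu_read_lock(); writers still serialize with their own spinlock.
 */
static unsigned int count_vlan_entries(struct hlist_head *vlan_list)
{
	struct vlan_entry *vlan;
	unsigned int total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(vlan, vlan_list, list)
		total += atomic_read(&vlan->num_entries);
	rcu_read_unlock();

	return total;
}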
@@ -736,14 +767,14 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
(*tt_data)->num_vlan = htons(num_vlan);
tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1);
- list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
tt_vlan->vid = htons(vlan->vid);
tt_vlan->crc = htonl(vlan->tt.crc);
tt_vlan++;
}
- tt_change_ptr = (uint8_t *)*tt_data + change_offset;
+ tt_change_ptr = (u8 *)*tt_data + change_offset;
*tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr;
out:
@@ -769,16 +800,18 @@ out:
*
* Return the size of the allocated buffer or 0 in case of failure.
*/
-static uint16_t
+static u16
batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_data **tt_data,
struct batadv_tvlv_tt_change **tt_change,
- int32_t *tt_len)
+ s32 *tt_len)
{
struct batadv_tvlv_tt_vlan_data *tt_vlan;
struct batadv_softif_vlan *vlan;
- uint16_t num_vlan = 0, num_entries = 0, tvlv_len;
- uint8_t *tt_change_ptr;
+ u16 num_vlan = 0;
+ u16 num_entries = 0;
+ u16 tvlv_len;
+ u8 *tt_change_ptr;
int change_offset;
rcu_read_lock();
@@ -815,7 +848,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
tt_vlan++;
}
- tt_change_ptr = (uint8_t *)*tt_data + change_offset;
+ tt_change_ptr = (u8 *)*tt_data + change_offset;
*tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr;
out:
@@ -834,8 +867,9 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
struct batadv_tvlv_tt_data *tt_data;
struct batadv_tvlv_tt_change *tt_change;
int tt_diff_len, tt_change_len = 0;
- int tt_diff_entries_num = 0, tt_diff_entries_count = 0;
- uint16_t tvlv_len;
+ int tt_diff_entries_num = 0;
+ int tt_diff_entries_count = 0;
+ u16 tvlv_len;
tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes);
tt_diff_len = batadv_tt_len(tt_diff_entries_num);
@@ -909,12 +943,12 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
struct batadv_softif_vlan *vlan;
struct hlist_head *head;
unsigned short vid;
- uint32_t i;
+ u32 i;
int last_seen_secs;
int last_seen_msecs;
unsigned long last_seen_jiffies;
bool no_purge;
- uint16_t np_flag = BATADV_TT_CLIENT_NOPURGE;
+ u16 np_flag = BATADV_TT_CLIENT_NOPURGE;
primary_if = batadv_seq_print_text_primary_if_get(seq);
if (!primary_if)
@@ -922,7 +956,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
seq_printf(seq,
"Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n",
- net_dev->name, (uint8_t)atomic_read(&bat_priv->tt.vn));
+ net_dev->name, (u8)atomic_read(&bat_priv->tt.vn));
seq_printf(seq, " %-13s %s %-8s %-9s (%-10s)\n", "Client", "VID",
"Flags", "Last seen", "CRC");
@@ -954,17 +988,17 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
" * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n",
tt_common_entry->addr,
BATADV_PRINT_VID(tt_common_entry->vid),
- (tt_common_entry->flags &
- BATADV_TT_CLIENT_ROAM ? 'R' : '.'),
+ ((tt_common_entry->flags &
+ BATADV_TT_CLIENT_ROAM) ? 'R' : '.'),
no_purge ? 'P' : '.',
- (tt_common_entry->flags &
- BATADV_TT_CLIENT_NEW ? 'N' : '.'),
- (tt_common_entry->flags &
- BATADV_TT_CLIENT_PENDING ? 'X' : '.'),
- (tt_common_entry->flags &
- BATADV_TT_CLIENT_WIFI ? 'W' : '.'),
- (tt_common_entry->flags &
- BATADV_TT_CLIENT_ISOLA ? 'I' : '.'),
+ ((tt_common_entry->flags &
+ BATADV_TT_CLIENT_NEW) ? 'N' : '.'),
+ ((tt_common_entry->flags &
+ BATADV_TT_CLIENT_PENDING) ? 'X' : '.'),
+ ((tt_common_entry->flags &
+ BATADV_TT_CLIENT_WIFI) ? 'W' : '.'),
+ ((tt_common_entry->flags &
+ BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
no_purge ? 0 : last_seen_secs,
no_purge ? 0 : last_seen_msecs,
vlan->tt.crc);
@@ -982,7 +1016,7 @@ out:
static void
batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
struct batadv_tt_local_entry *tt_local_entry,
- uint16_t flags, const char *message)
+ u16 flags, const char *message)
{
batadv_tt_local_event(bat_priv, tt_local_entry, flags);
@@ -1008,13 +1042,14 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
*
* Returns the flags assigned to the local entry before being deleted
*/
-uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv,
- const uint8_t *addr, unsigned short vid,
- const char *message, bool roaming)
+u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
+ unsigned short vid, const char *message,
+ bool roaming)
{
struct batadv_tt_local_entry *tt_local_entry;
- uint16_t flags, curr_flags = BATADV_NO_FLAGS;
+ u16 flags, curr_flags = BATADV_NO_FLAGS;
struct batadv_softif_vlan *vlan;
+ void *tt_entry_exists;
tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid);
if (!tt_local_entry)
@@ -1042,11 +1077,22 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv,
* immediately purge it
*/
batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL);
- hlist_del_rcu(&tt_local_entry->common.hash_entry);
+
+ tt_entry_exists = batadv_hash_remove(bat_priv->tt.local_hash,
+ batadv_compare_tt,
+ batadv_choose_tt,
+ &tt_local_entry->common);
+ if (!tt_entry_exists)
+ goto out;
+
+ /* extra call to free the local tt entry */
batadv_tt_local_entry_free_ref(tt_local_entry);
/* decrease the reference held for this vlan */
vlan = batadv_softif_vlan_get(bat_priv, vid);
+ if (!vlan)
+ goto out;
+
batadv_softif_vlan_free_ref(vlan);
batadv_softif_vlan_free_ref(vlan);
@@ -1104,7 +1150,7 @@ static void batadv_tt_local_purge(struct batadv_priv *bat_priv,
struct batadv_hashtable *hash = bat_priv->tt.local_hash;
struct hlist_head *head;
spinlock_t *list_lock; /* protects write access to the hash lists */
- uint32_t i;
+ u32 i;
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
@@ -1125,7 +1171,7 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv)
struct batadv_softif_vlan *vlan;
struct hlist_node *node_tmp;
struct hlist_head *head;
- uint32_t i;
+ u32 i;
if (!bat_priv->tt.local_hash)
return;
@@ -1147,8 +1193,10 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv)
/* decrease the reference held for this vlan */
vlan = batadv_softif_vlan_get(bat_priv,
tt_common_entry->vid);
- batadv_softif_vlan_free_ref(vlan);
- batadv_softif_vlan_free_ref(vlan);
+ if (vlan) {
+ batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_free_ref(vlan);
+ }
batadv_tt_local_entry_free_ref(tt_local);
}
@@ -1298,15 +1346,14 @@ out:
static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
const unsigned char *tt_addr,
- unsigned short vid, uint16_t flags,
- uint8_t ttvn)
+ unsigned short vid, u16 flags, u8 ttvn)
{
struct batadv_tt_global_entry *tt_global_entry;
struct batadv_tt_local_entry *tt_local_entry;
bool ret = false;
int hash_added;
struct batadv_tt_common_entry *common;
- uint16_t local_flags;
+ u16 local_flags;
/* ignore global entries from backbone nodes */
if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid))
@@ -1380,9 +1427,15 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
}
 /* if the client was temporarily added before receiving the first
- * OGM announcing it, we have to clear the TEMP flag
+ * OGM announcing it, we have to clear the TEMP flag. Also,
+ * remove the previous temporary orig node and re-add it
+ * if required. If the orig entry changed, the new, non-temporary
+ * entry is preferred.
*/
- common->flags &= ~BATADV_TT_CLIENT_TEMP;
+ if (common->flags & BATADV_TT_CLIENT_TEMP) {
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ common->flags &= ~BATADV_TT_CLIENT_TEMP;
+ }
/* the change can carry possible "attribute" flags like the
* TT_CLIENT_WIFI, therefore they have to be copied in the
@@ -1503,8 +1556,8 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv,
struct batadv_tt_common_entry *tt_common_entry;
struct batadv_orig_node_vlan *vlan;
struct hlist_head *head;
- uint8_t last_ttvn;
- uint16_t flags;
+ u8 last_ttvn;
+ u16 flags;
tt_common_entry = &tt_global_entry->common;
flags = tt_common_entry->flags;
@@ -1528,10 +1581,10 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv,
BATADV_PRINT_VID(tt_global_entry->common.vid),
best_entry->ttvn, best_entry->orig_node->orig,
last_ttvn, vlan->tt.crc,
- (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'),
- (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'),
- (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'),
- (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.'));
+ ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'),
+ ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'),
+ ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
+ ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
batadv_orig_node_vlan_free_ref(vlan);
}
@@ -1560,10 +1613,10 @@ print_list:
BATADV_PRINT_VID(tt_global_entry->common.vid),
orig_entry->ttvn, orig_entry->orig_node->orig,
last_ttvn, vlan->tt.crc,
- (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'),
- (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'),
- (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'),
- (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.'));
+ ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'),
+ ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'),
+ ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
+ ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
batadv_orig_node_vlan_free_ref(vlan);
}
@@ -1578,7 +1631,7 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset)
struct batadv_tt_global_entry *tt_global;
struct batadv_hard_iface *primary_if;
struct hlist_head *head;
- uint32_t i;
+ u32 i;
primary_if = batadv_seq_print_text_primary_if_get(seq);
if (!primary_if)
@@ -1611,20 +1664,28 @@ out:
}
/**
- * batadv_tt_global_del_orig_entry - remove and free an orig_entry
+ * _batadv_tt_global_del_orig_entry - remove and free an orig_entry
* @tt_global_entry: the global entry to remove the orig_entry from
* @orig_entry: the orig entry to remove and free
*
* Remove an orig_entry from its list in the given tt_global_entry and
* free this orig_entry afterwards.
+ *
+ * Caller must hold tt_global_entry->list_lock and ensure orig_entry->list is
+ * part of a list.
*/
static void
-batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry,
- struct batadv_tt_orig_list_entry *orig_entry)
+_batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry,
+ struct batadv_tt_orig_list_entry *orig_entry)
{
+ lockdep_assert_held(&tt_global_entry->list_lock);
+
batadv_tt_global_size_dec(orig_entry->orig_node,
tt_global_entry->common.vid);
atomic_dec(&tt_global_entry->orig_list_count);
+ /* requires holding tt_global_entry->list_lock and orig_entry->list
+ * being part of a list
+ */
hlist_del_rcu(&orig_entry->list);
batadv_tt_orig_list_entry_free_ref(orig_entry);
}
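The leading underscore plus lockdep_assert_held() marks the helper above as lock-context sensitive: with CONFIG_PROVE_LOCKING it splats if a caller forgets list_lock, and otherwise it compiles away. A generic sketch of that locked/unlocked helper split, with names assumed:

#include <linux/lockdep.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>

struct entry_list {
	spinlock_t lock;
	struct hlist_head head;
};

/* Internal variant: caller must hold list->lock. The assertion documents
 * the requirement and enforces it when lockdep is enabled.
 */
static void __entry_unlink(struct entry_list *list, struct hlist_node *node)
{
	lockdep_assert_held(&list->lock);
	hlist_del_rcu(node);
}

/* Public variant takes the lock itself. */
static void entry_unlink(struct entry_list *list, struct hlist_node *node)
{
	spin_lock_bh(&list->lock);
	__entry_unlink(list, node);
	spin_unlock_bh(&list->lock);
}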
@@ -1640,7 +1701,7 @@ batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry)
spin_lock_bh(&tt_global_entry->list_lock);
head = &tt_global_entry->orig_list;
hlist_for_each_entry_safe(orig_entry, safe, head, list)
- batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry);
+ _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry);
spin_unlock_bh(&tt_global_entry->list_lock);
}
@@ -1675,8 +1736,8 @@ batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv,
orig_node->orig,
tt_global_entry->common.addr,
BATADV_PRINT_VID(vid), message);
- batadv_tt_global_del_orig_entry(tt_global_entry,
- orig_entry);
+ _batadv_tt_global_del_orig_entry(tt_global_entry,
+ orig_entry);
}
}
spin_unlock_bh(&tt_global_entry->list_lock);
@@ -1798,12 +1859,12 @@ out:
*/
void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
- int32_t match_vid,
+ s32 match_vid,
const char *message)
{
struct batadv_tt_global_entry *tt_global;
struct batadv_tt_common_entry *tt_common_entry;
- uint32_t i;
+ u32 i;
struct batadv_hashtable *hash = bat_priv->tt.global_hash;
struct hlist_node *safe;
struct hlist_head *head;
@@ -1843,7 +1904,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
}
spin_unlock_bh(list_lock);
}
- orig_node->capa_initialized &= ~BATADV_ORIG_CAPA_HAS_TT;
+ clear_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized);
}
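capa_initialized (and capabilities) become unsigned long bitmaps, so flags are flipped with the atomic set_bit()/clear_bit()/test_bit() helpers instead of non-atomic |=/&= on a u8, and the capability enum values later in this patch turn into bit numbers rather than BIT() masks. A small sketch of the idiom with a hypothetical capability flag:

#include <linux/bitops.h>
#include <linux/types.h>

enum node_capabilities {
	NODE_CAPA_HAS_TT,	/* a bit number now, not a mask */
};

struct node_state {
	unsigned long capa_initialized;
};

/* set_bit()/clear_bit() are atomic, so concurrent updaters of different
 * capability bits can no longer lose each other's updates.
 */
static void node_set_tt_initialized(struct node_state *node, bool init)
{
	if (init)
		set_bit(NODE_CAPA_HAS_TT, &node->capa_initialized);
	else
		clear_bit(NODE_CAPA_HAS_TT, &node->capa_initialized);
}

static bool node_tt_initialized(const struct node_state *node)
{
	return test_bit(NODE_CAPA_HAS_TT, &node->capa_initialized);
}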
static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global,
@@ -1874,7 +1935,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv)
struct hlist_head *head;
struct hlist_node *node_tmp;
spinlock_t *list_lock; /* protects write access to the hash lists */
- uint32_t i;
+ u32 i;
char *msg = NULL;
struct batadv_tt_common_entry *tt_common;
struct batadv_tt_global_entry *tt_global;
@@ -1915,7 +1976,7 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv)
struct batadv_tt_global_entry *tt_global;
struct hlist_node *node_tmp;
struct hlist_head *head;
- uint32_t i;
+ u32 i;
if (!bat_priv->tt.global_hash)
return;
@@ -1976,8 +2037,8 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry,
* If the two clients are AP isolated the function returns NULL.
*/
struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
- const uint8_t *src,
- const uint8_t *addr,
+ const u8 *src,
+ const u8 *addr,
unsigned short vid)
{
struct batadv_tt_local_entry *tt_local_entry = NULL;
@@ -2045,16 +2106,16 @@ out:
*
* Returns the checksum of the global table of a given originator.
*/
-static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- unsigned short vid)
+static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ unsigned short vid)
{
struct batadv_hashtable *hash = bat_priv->tt.global_hash;
struct batadv_tt_common_entry *tt_common;
struct batadv_tt_global_entry *tt_global;
struct hlist_head *head;
- uint32_t i, crc_tmp, crc = 0;
- uint8_t flags;
+ u32 i, crc_tmp, crc = 0;
+ u8 flags;
__be16 tmp_vid;
for (i = 0; i < hash->size; i++) {
@@ -2122,14 +2183,14 @@ static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv,
*
* Returns the checksum of the local table
*/
-static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv,
- unsigned short vid)
+static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv,
+ unsigned short vid)
{
struct batadv_hashtable *hash = bat_priv->tt.local_hash;
struct batadv_tt_common_entry *tt_common;
struct hlist_head *head;
- uint32_t i, crc_tmp, crc = 0;
- uint8_t flags;
+ u32 i, crc_tmp, crc = 0;
+ u8 flags;
__be16 tmp_vid;
for (i = 0; i < hash->size; i++) {
@@ -2171,12 +2232,13 @@ static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv,
static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
{
- struct batadv_tt_req_node *node, *safe;
+ struct batadv_tt_req_node *node;
+ struct hlist_node *safe;
spin_lock_bh(&bat_priv->tt.req_list_lock);
- list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
- list_del(&node->list);
+ hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
+ hlist_del_init(&node->list);
kfree(node);
}
@@ -2186,7 +2248,7 @@ static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
const void *tt_buff,
- uint16_t tt_buff_len)
+ u16 tt_buff_len)
{
/* Replace the old buffer only if I received something in the
* last OGM (the OGM could carry no changes)
@@ -2206,30 +2268,36 @@ static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv,
static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
{
- struct batadv_tt_req_node *node, *safe;
+ struct batadv_tt_req_node *node;
+ struct hlist_node *safe;
spin_lock_bh(&bat_priv->tt.req_list_lock);
- list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
+ hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
if (batadv_has_timed_out(node->issued_at,
BATADV_TT_REQUEST_TIMEOUT)) {
- list_del(&node->list);
+ hlist_del_init(&node->list);
kfree(node);
}
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
}
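tt.req_list moves from a list_head to an hlist_head; purge and teardown therefore walk it with hlist_for_each_entry_safe() and unlink with hlist_del_init(), which leaves the node self-terminated so a later delete of the same node is harmless. A condensed sketch under those assumptions:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct req_node {
	struct hlist_node list;
	unsigned long issued_at;
};

/* Drop every pending request. The _safe iterator tolerates freeing the
 * node we are standing on; hlist_del_init() re-initializes the node so
 * other paths can tell it has already been unlinked.
 */
static void req_list_free(struct hlist_head *req_list, spinlock_t *lock)
{
	struct req_node *node;
	struct hlist_node *tmp;

	spin_lock_bh(lock);
	hlist_for_each_entry_safe(node, tmp, req_list, list) {
		hlist_del_init(&node->list);
		kfree(node);
	}
	spin_unlock_bh(lock);
}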
-/* returns the pointer to the new tt_req_node struct if no request
- * has already been issued for this orig_node, NULL otherwise
+/**
+ * batadv_tt_req_node_new - search and possibly create a tt_req_node object
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node this request is being issued for
+ *
+ * Returns the pointer to the new tt_req_node struct if no request
+ * has already been issued for this orig_node, NULL otherwise.
*/
static struct batadv_tt_req_node *
-batadv_new_tt_req_node(struct batadv_priv *bat_priv,
+batadv_tt_req_node_new(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node)
{
struct batadv_tt_req_node *tt_req_node_tmp, *tt_req_node = NULL;
spin_lock_bh(&bat_priv->tt.req_list_lock);
- list_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) {
+ hlist_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) {
if (batadv_compare_eth(tt_req_node_tmp, orig_node) &&
!batadv_has_timed_out(tt_req_node_tmp->issued_at,
BATADV_TT_REQUEST_TIMEOUT))
@@ -2243,7 +2311,7 @@ batadv_new_tt_req_node(struct batadv_priv *bat_priv,
ether_addr_copy(tt_req_node->addr, orig_node->orig);
tt_req_node->issued_at = jiffies;
- list_add(&tt_req_node->list, &bat_priv->tt.req_list);
+ hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list);
unlock:
spin_unlock_bh(&bat_priv->tt.req_list_lock);
return tt_req_node;
@@ -2295,15 +2363,15 @@ static int batadv_tt_global_valid(const void *entry_ptr,
*/
static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
struct batadv_hashtable *hash,
- void *tvlv_buff, uint16_t tt_len,
+ void *tvlv_buff, u16 tt_len,
int (*valid_cb)(const void *, const void *),
void *cb_data)
{
struct batadv_tt_common_entry *tt_common_entry;
struct batadv_tvlv_tt_change *tt_change;
struct hlist_head *head;
- uint16_t tt_tot, tt_num_entries = 0;
- uint32_t i;
+ u16 tt_tot, tt_num_entries = 0;
+ u32 i;
tt_tot = batadv_tt_entries(tt_len);
tt_change = (struct batadv_tvlv_tt_change *)tvlv_buff;
@@ -2345,11 +2413,11 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
*/
static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
struct batadv_tvlv_tt_vlan_data *tt_vlan,
- uint16_t num_vlan)
+ u16 num_vlan)
{
struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp;
struct batadv_orig_node_vlan *vlan;
- uint32_t crc;
+ u32 crc;
int i;
/* check if each received CRC matches the locally stored one */
@@ -2404,11 +2472,11 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node)
{
struct batadv_orig_node_vlan *vlan;
- uint32_t crc;
+ u32 crc;
/* recompute the global CRC for each VLAN */
rcu_read_lock();
- list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
/* if orig_node is a backbone node for this VLAN, don't compute
* the CRC as we ignore all the global entries over it
*/
@@ -2434,9 +2502,9 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
*/
static int batadv_send_tt_request(struct batadv_priv *bat_priv,
struct batadv_orig_node *dst_orig_node,
- uint8_t ttvn,
+ u8 ttvn,
struct batadv_tvlv_tt_vlan_data *tt_vlan,
- uint16_t num_vlan, bool full_table)
+ u16 num_vlan, bool full_table)
{
struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
struct batadv_tt_req_node *tt_req_node = NULL;
@@ -2452,7 +2520,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv,
/* The new tt_req will be issued only if I'm not waiting for a
* reply from the same orig_node yet
*/
- tt_req_node = batadv_new_tt_req_node(bat_priv, dst_orig_node);
+ tt_req_node = batadv_tt_req_node_new(bat_priv, dst_orig_node);
if (!tt_req_node)
goto out;
@@ -2494,7 +2562,8 @@ out:
batadv_hardif_free_ref(primary_if);
if (ret && tt_req_node) {
spin_lock_bh(&bat_priv->tt.req_list_lock);
- list_del(&tt_req_node->list);
+ /* hlist_del_init() verifies tt_req_node is still in the list */
+ hlist_del_init(&tt_req_node->list);
spin_unlock_bh(&bat_priv->tt.req_list_lock);
kfree(tt_req_node);
}
@@ -2514,7 +2583,7 @@ out:
*/
static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_data *tt_data,
- uint8_t *req_src, uint8_t *req_dst)
+ u8 *req_src, u8 *req_dst)
{
struct batadv_orig_node *req_dst_orig_node;
struct batadv_orig_node *res_dst_orig_node = NULL;
@@ -2522,14 +2591,14 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
struct batadv_tvlv_tt_vlan_data *tt_vlan;
bool ret = false, full_table;
- uint8_t orig_ttvn, req_ttvn;
- uint16_t tvlv_len;
- int32_t tt_len;
+ u8 orig_ttvn, req_ttvn;
+ u16 tvlv_len;
+ s32 tt_len;
batadv_dbg(BATADV_DBG_TT, bat_priv,
"Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n",
req_src, tt_data->ttvn, req_dst,
- (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.'));
+ ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.'));
/* Let's get the orig node of the REAL destination */
req_dst_orig_node = batadv_orig_hash_find(bat_priv, req_dst);
@@ -2540,7 +2609,7 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
if (!res_dst_orig_node)
goto out;
- orig_ttvn = (uint8_t)atomic_read(&req_dst_orig_node->last_ttvn);
+ orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn);
req_ttvn = tt_data->ttvn;
tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1);
@@ -2646,25 +2715,25 @@ out:
*/
static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_data *tt_data,
- uint8_t *req_src)
+ u8 *req_src)
{
struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
struct batadv_hard_iface *primary_if = NULL;
struct batadv_tvlv_tt_change *tt_change;
struct batadv_orig_node *orig_node;
- uint8_t my_ttvn, req_ttvn;
- uint16_t tvlv_len;
+ u8 my_ttvn, req_ttvn;
+ u16 tvlv_len;
bool full_table;
- int32_t tt_len;
+ s32 tt_len;
batadv_dbg(BATADV_DBG_TT, bat_priv,
"Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n",
req_src, tt_data->ttvn,
- (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.'));
+ ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.'));
spin_lock_bh(&bat_priv->tt.commit_lock);
- my_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn);
+ my_ttvn = (u8)atomic_read(&bat_priv->tt.vn);
req_ttvn = tt_data->ttvn;
orig_node = batadv_orig_hash_find(bat_priv, req_src);
@@ -2703,7 +2772,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
bat_priv->tt.last_changeset_len);
spin_unlock_bh(&bat_priv->tt.last_changeset_lock);
} else {
- req_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn);
+ req_ttvn = (u8)atomic_read(&bat_priv->tt.vn);
/* allocate the tvlv, put the tt_data and all the tt_vlan_data
* in the initial part
@@ -2764,7 +2833,7 @@ out:
*/
static bool batadv_send_tt_response(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_data *tt_data,
- uint8_t *req_src, uint8_t *req_dst)
+ u8 *req_src, u8 *req_dst)
{
if (batadv_is_my_mac(bat_priv, req_dst))
return batadv_send_my_tt_response(bat_priv, tt_data, req_src);
@@ -2775,7 +2844,7 @@ static bool batadv_send_tt_response(struct batadv_priv *bat_priv,
static void _batadv_tt_update_changes(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
struct batadv_tvlv_tt_change *tt_change,
- uint16_t tt_num_changes, uint8_t ttvn)
+ u16 tt_num_changes, u8 ttvn)
{
int i;
int roams;
@@ -2802,13 +2871,13 @@ static void _batadv_tt_update_changes(struct batadv_priv *bat_priv,
return;
}
}
- orig_node->capa_initialized |= BATADV_ORIG_CAPA_HAS_TT;
+ set_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized);
}
static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_change *tt_change,
- uint8_t ttvn, uint8_t *resp_src,
- uint16_t num_entries)
+ u8 ttvn, u8 *resp_src,
+ u16 num_entries)
{
struct batadv_orig_node *orig_node;
@@ -2838,7 +2907,7 @@ out:
static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
- uint16_t tt_num_changes, uint8_t ttvn,
+ u16 tt_num_changes, u8 ttvn,
struct batadv_tvlv_tt_change *tt_change)
{
_batadv_tt_update_changes(bat_priv, orig_node, tt_change,
@@ -2857,7 +2926,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
*
* Returns true if the client is served by this node, false otherwise.
*/
-bool batadv_is_my_client(struct batadv_priv *bat_priv, const uint8_t *addr,
+bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr,
unsigned short vid)
{
struct batadv_tt_local_entry *tt_local_entry;
@@ -2888,18 +2957,19 @@ out:
*/
static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_data *tt_data,
- uint8_t *resp_src, uint16_t num_entries)
+ u8 *resp_src, u16 num_entries)
{
- struct batadv_tt_req_node *node, *safe;
+ struct batadv_tt_req_node *node;
+ struct hlist_node *safe;
struct batadv_orig_node *orig_node = NULL;
struct batadv_tvlv_tt_change *tt_change;
- uint8_t *tvlv_ptr = (uint8_t *)tt_data;
- uint16_t change_offset;
+ u8 *tvlv_ptr = (u8 *)tt_data;
+ u16 change_offset;
batadv_dbg(BATADV_DBG_TT, bat_priv,
"Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n",
resp_src, tt_data->ttvn, num_entries,
- (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.'));
+ ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.'));
orig_node = batadv_orig_hash_find(bat_priv, resp_src);
if (!orig_node)
@@ -2928,10 +2998,10 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
/* Delete the tt_req_node from pending tt_requests list */
spin_lock_bh(&bat_priv->tt.req_list_lock);
- list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
+ hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
if (!batadv_compare_eth(node->addr, resp_src))
continue;
- list_del(&node->list);
+ hlist_del_init(&node->list);
kfree(node);
}
@@ -2977,8 +3047,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
*
* returns true if the ROAMING_ADV can be sent, false otherwise
*/
-static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv,
- uint8_t *client)
+static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client)
{
struct batadv_tt_roam_node *tt_roam_node;
bool ret = false;
@@ -3033,7 +3102,7 @@ unlock:
* for this particular roamed client has to be forwarded to the sender of the
* roaming message.
*/
-static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client,
+static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client,
unsigned short vid,
struct batadv_orig_node *orig_node)
{
@@ -3111,14 +3180,14 @@ void batadv_tt_free(struct batadv_priv *bat_priv)
* @enable: whether to set or unset the flag
* @count: whether to increase the TT size by the number of changed entries
*/
-static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv,
- uint16_t flags, bool enable, bool count)
+static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
+ bool enable, bool count)
{
struct batadv_hashtable *hash = bat_priv->tt.local_hash;
struct batadv_tt_common_entry *tt_common_entry;
- uint16_t changed_num = 0;
+ u16 changed_num = 0;
struct hlist_head *head;
- uint32_t i;
+ u32 i;
if (!hash)
return;
@@ -3160,7 +3229,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
struct hlist_node *node_tmp;
struct hlist_head *head;
spinlock_t *list_lock; /* protects write access to the hash lists */
- uint32_t i;
+ u32 i;
if (!hash)
return;
@@ -3188,8 +3257,10 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
/* decrease the reference held for this vlan */
vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid);
- batadv_softif_vlan_free_ref(vlan);
- batadv_softif_vlan_free_ref(vlan);
+ if (vlan) {
+ batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_free_ref(vlan);
+ }
batadv_tt_local_entry_free_ref(tt_local);
}
@@ -3206,6 +3277,8 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
*/
static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
{
+ lockdep_assert_held(&bat_priv->tt.commit_lock);
+
/* Update multicast addresses in local translation table */
batadv_mcast_mla_update(bat_priv);
@@ -3224,7 +3297,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
atomic_inc(&bat_priv->tt.vn);
batadv_dbg(BATADV_DBG_TT, bat_priv,
"Local changes committed, updating to ttvn %u\n",
- (uint8_t)atomic_read(&bat_priv->tt.vn));
+ (u8)atomic_read(&bat_priv->tt.vn));
/* reset the sending counter */
atomic_set(&bat_priv->tt.ogm_append_cnt, BATADV_TT_OGM_APPEND_MAX);
@@ -3243,8 +3316,8 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv)
spin_unlock_bh(&bat_priv->tt.commit_lock);
}
-bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, uint8_t *src,
- uint8_t *dst, unsigned short vid)
+bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
+ unsigned short vid)
{
struct batadv_tt_local_entry *tt_local_entry = NULL;
struct batadv_tt_global_entry *tt_global_entry = NULL;
@@ -3292,17 +3365,18 @@ out:
*/
static void batadv_tt_update_orig(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
- const void *tt_buff, uint16_t tt_num_vlan,
+ const void *tt_buff, u16 tt_num_vlan,
struct batadv_tvlv_tt_change *tt_change,
- uint16_t tt_num_changes, uint8_t ttvn)
+ u16 tt_num_changes, u8 ttvn)
{
- uint8_t orig_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn);
+ u8 orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn);
struct batadv_tvlv_tt_vlan_data *tt_vlan;
bool full_table = true;
bool has_tt_init;
tt_vlan = (struct batadv_tvlv_tt_vlan_data *)tt_buff;
- has_tt_init = orig_node->capa_initialized & BATADV_ORIG_CAPA_HAS_TT;
+ has_tt_init = test_bit(BATADV_ORIG_CAPA_HAS_TT,
+ &orig_node->capa_initialized);
/* orig table not initialised AND first diff is in the OGM OR the ttvn
* increased by one -> we can apply the attached changes
@@ -3374,7 +3448,7 @@ request_table:
* deleted later by a DEL or because of timeout
*/
bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
- uint8_t *addr, unsigned short vid)
+ u8 *addr, unsigned short vid)
{
struct batadv_tt_global_entry *tt_global_entry;
bool ret = false;
@@ -3400,7 +3474,7 @@ out:
* to keep the latter consistent with the node TTVN
*/
bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv,
- uint8_t *addr, unsigned short vid)
+ u8 *addr, unsigned short vid)
{
struct batadv_tt_local_entry *tt_local_entry;
bool ret = false;
@@ -3486,13 +3560,13 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface)
*/
static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t flags, void *tvlv_value,
- uint16_t tvlv_value_len)
+ u8 flags, void *tvlv_value,
+ u16 tvlv_value_len)
{
struct batadv_tvlv_tt_vlan_data *tt_vlan;
struct batadv_tvlv_tt_change *tt_change;
struct batadv_tvlv_tt_data *tt_data;
- uint16_t num_entries, num_vlan;
+ u16 num_entries, num_vlan;
if (tvlv_value_len < sizeof(*tt_data))
return;
@@ -3528,12 +3602,12 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
* otherwise.
*/
static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
- uint8_t *src, uint8_t *dst,
+ u8 *src, u8 *dst,
void *tvlv_value,
- uint16_t tvlv_value_len)
+ u16 tvlv_value_len)
{
struct batadv_tvlv_tt_data *tt_data;
- uint16_t tt_vlan_len, tt_num_entries;
+ u16 tt_vlan_len, tt_num_entries;
char tt_flag;
bool ret;
@@ -3609,9 +3683,9 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
* otherwise.
*/
static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
- uint8_t *src, uint8_t *dst,
+ u8 *src, u8 *dst,
void *tvlv_value,
- uint16_t tvlv_value_len)
+ u16 tvlv_value_len)
{
struct batadv_tvlv_roam_adv *roaming_adv;
struct batadv_orig_node *orig_node = NULL;
@@ -3693,7 +3767,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv)
* otherwise
*/
bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
- const uint8_t *addr, unsigned short vid)
+ const u8 *addr, unsigned short vid)
{
struct batadv_tt_global_entry *tt;
bool ret;
diff --git a/kernel/net/batman-adv/translation-table.h b/kernel/net/batman-adv/translation-table.h
index ad84d7b89..abd8e116e 100644
--- a/kernel/net/batman-adv/translation-table.h
+++ b/kernel/net/batman-adv/translation-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
*
@@ -18,39 +18,45 @@
#ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
#define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
+#include "main.h"
+
+#include <linux/types.h>
+
+struct net_device;
+struct seq_file;
+
int batadv_tt_init(struct batadv_priv *bat_priv);
-bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr,
- unsigned short vid, int ifindex, uint32_t mark);
-uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv,
- const uint8_t *addr, unsigned short vid,
- const char *message, bool roaming);
+bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
+ unsigned short vid, int ifindex, u32 mark);
+u16 batadv_tt_local_remove(struct batadv_priv *bat_priv,
+ const u8 *addr, unsigned short vid,
+ const char *message, bool roaming);
int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset);
int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset);
void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
- int32_t match_vid, const char *message);
+ s32 match_vid, const char *message);
int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
- const uint8_t *addr, unsigned short vid);
+ const u8 *addr, unsigned short vid);
struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
- const uint8_t *src,
- const uint8_t *addr,
+ const u8 *src, const u8 *addr,
unsigned short vid);
void batadv_tt_free(struct batadv_priv *bat_priv);
-bool batadv_is_my_client(struct batadv_priv *bat_priv, const uint8_t *addr,
+bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr,
unsigned short vid);
-bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, uint8_t *src,
- uint8_t *dst, unsigned short vid);
+bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
+ unsigned short vid);
void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv);
bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
- uint8_t *addr, unsigned short vid);
+ u8 *addr, unsigned short vid);
bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv,
- uint8_t *addr, unsigned short vid);
+ u8 *addr, unsigned short vid);
void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface);
bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
const unsigned char *addr,
unsigned short vid);
bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
- const uint8_t *addr, unsigned short vid);
+ const u8 *addr, unsigned short vid);
#endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
diff --git a/kernel/net/batman-adv/types.h b/kernel/net/batman-adv/types.h
index 9398c3fb4..d260efd70 100644
--- a/kernel/net/batman-adv/types.h
+++ b/kernel/net/batman-adv/types.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -18,9 +18,23 @@
#ifndef _NET_BATMAN_ADV_TYPES_H_
#define _NET_BATMAN_ADV_TYPES_H_
+#ifndef _NET_BATMAN_ADV_MAIN_H_
+#error only "main.h" can be included directly
+#endif
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h> /* for linux/wait.h */
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+
#include "packet.h"
-#include "bitarray.h"
-#include <linux/kernel.h>
+
+struct seq_file;
#ifdef CONFIG_BATMAN_ADV_DAT
@@ -30,7 +44,7 @@
*
* *Please be careful: batadv_dat_addr_t must be UNSIGNED*
*/
-#define batadv_dat_addr_t uint16_t
+#define batadv_dat_addr_t u16
#endif /* CONFIG_BATMAN_ADV_DAT */
@@ -89,10 +103,10 @@ struct batadv_hard_iface_bat_iv {
*/
struct batadv_hard_iface {
struct list_head list;
- int16_t if_num;
+ s16 if_num;
char if_status;
struct net_device *net_dev;
- uint8_t num_bcasts;
+ u8 num_bcasts;
struct kobject *hardif_obj;
atomic_t refcount;
struct packet_type batman_adv_ptype;
@@ -118,8 +132,8 @@ struct batadv_orig_ifinfo {
struct hlist_node list;
struct batadv_hard_iface *if_outgoing;
struct batadv_neigh_node __rcu *router; /* rcu protected pointer */
- uint32_t last_real_seqno;
- uint8_t last_ttl;
+ u32 last_real_seqno;
+ u8 last_ttl;
unsigned long batman_seqno_reset;
atomic_t refcount;
struct rcu_head rcu;
@@ -132,13 +146,15 @@ struct batadv_orig_ifinfo {
* @timestamp: time (jiffie) of last received fragment
* @seqno: sequence number of the fragments in the list
* @size: accumulated size of packets in list
+ * @total_size: expected size of the assembled packet
*/
struct batadv_frag_table_entry {
struct hlist_head head;
spinlock_t lock; /* protects head */
unsigned long timestamp;
- uint16_t seqno;
- uint16_t size;
+ u16 seqno;
+ u16 size;
+ u16 total_size;
};
/**
@@ -150,7 +166,7 @@ struct batadv_frag_table_entry {
struct batadv_frag_list_entry {
struct hlist_node list;
struct sk_buff *skb;
- uint8_t no;
+ u8 no;
};
/**
@@ -159,7 +175,7 @@ struct batadv_frag_list_entry {
* @num_entries: number of TT entries for this VLAN
*/
struct batadv_vlan_tt {
- uint32_t crc;
+ u32 crc;
atomic_t num_entries;
};
@@ -174,22 +190,23 @@ struct batadv_vlan_tt {
struct batadv_orig_node_vlan {
unsigned short vid;
struct batadv_vlan_tt tt;
- struct list_head list;
+ struct hlist_node list;
atomic_t refcount;
struct rcu_head rcu;
};
/**
* struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members
- * @bcast_own: bitfield containing the number of our OGMs this orig_node
- * rebroadcasted "back" to us (relative to last_real_seqno)
- * @bcast_own_sum: counted result of bcast_own
+ * @bcast_own: set of bitfields (one per hard interface) where each one counts
+ * the number of our OGMs this orig_node rebroadcasted "back" to us (relative
+ * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long.
+ * @bcast_own_sum: sum of bcast_own
* @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum,
* neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count
*/
struct batadv_orig_bat_iv {
unsigned long *bcast_own;
- uint8_t *bcast_own_sum;
+ u8 *bcast_own_sum;
/* ogm_cnt_lock protects: bcast_own, bcast_own_sum,
* neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count
*/
@@ -204,6 +221,7 @@ struct batadv_orig_bat_iv {
* @batadv_dat_addr_t: address of the orig node in the distributed hash
* @last_seen: time when last packet from this node was received
* @bcast_seqno_reset: time when the broadcast seqno window was reset
+ * @mcast_handler_lock: synchronizes mcast-capability and -flag changes
* @mcast_flags: multicast flags announced by the orig node
* @mcast_want_all_unsnoop_node: a list node for the
* mcast.want_all_unsnoopables list
@@ -242,7 +260,7 @@ struct batadv_orig_bat_iv {
* @bat_iv: B.A.T.M.A.N. IV private structure
*/
struct batadv_orig_node {
- uint8_t orig[ETH_ALEN];
+ u8 orig[ETH_ALEN];
struct hlist_head ifinfo_list;
struct batadv_orig_ifinfo *last_bonding_candidate;
#ifdef CONFIG_BATMAN_ADV_DAT
@@ -251,21 +269,23 @@ struct batadv_orig_node {
unsigned long last_seen;
unsigned long bcast_seqno_reset;
#ifdef CONFIG_BATMAN_ADV_MCAST
- uint8_t mcast_flags;
+ /* synchronizes mcast tvlv specific orig changes */
+ spinlock_t mcast_handler_lock;
+ u8 mcast_flags;
struct hlist_node mcast_want_all_unsnoopables_node;
struct hlist_node mcast_want_all_ipv4_node;
struct hlist_node mcast_want_all_ipv6_node;
#endif
- uint8_t capabilities;
- uint8_t capa_initialized;
+ unsigned long capabilities;
+ unsigned long capa_initialized;
atomic_t last_ttvn;
unsigned char *tt_buff;
- int16_t tt_buff_len;
+ s16 tt_buff_len;
spinlock_t tt_buff_lock; /* protects tt_buff & tt_buff_len */
/* prevents from changing the table while reading it */
spinlock_t tt_lock;
DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
- uint32_t last_bcast_seqno;
+ u32 last_bcast_seqno;
struct hlist_head neigh_list;
/* neigh_list_lock protects: neigh_list and router */
spinlock_t neigh_list_lock;
@@ -282,7 +302,7 @@ struct batadv_orig_node {
spinlock_t out_coding_list_lock; /* Protects out_coding_list */
#endif
struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT];
- struct list_head vlan_list;
+ struct hlist_head vlan_list;
spinlock_t vlan_list_lock; /* protects vlan_list */
struct batadv_orig_bat_iv bat_iv;
};
@@ -296,10 +316,10 @@ struct batadv_orig_node {
* (= orig node announces a tvlv of type BATADV_TVLV_MCAST)
*/
enum batadv_orig_capabilities {
- BATADV_ORIG_CAPA_HAS_DAT = BIT(0),
- BATADV_ORIG_CAPA_HAS_NC = BIT(1),
- BATADV_ORIG_CAPA_HAS_TT = BIT(2),
- BATADV_ORIG_CAPA_HAS_MCAST = BIT(3),
+ BATADV_ORIG_CAPA_HAS_DAT,
+ BATADV_ORIG_CAPA_HAS_NC,
+ BATADV_ORIG_CAPA_HAS_TT,
+ BATADV_ORIG_CAPA_HAS_MCAST,
};
/**
@@ -308,16 +328,14 @@ enum batadv_orig_capabilities {
* @orig_node: pointer to corresponding orig node
* @bandwidth_down: advertised uplink download bandwidth
* @bandwidth_up: advertised uplink upload bandwidth
- * @deleted: this struct is scheduled for deletion
* @refcount: number of contexts the object is used
* @rcu: struct used for freeing in an RCU-safe manner
*/
struct batadv_gw_node {
struct hlist_node list;
struct batadv_orig_node *orig_node;
- uint32_t bandwidth_down;
- uint32_t bandwidth_up;
- unsigned long deleted;
+ u32 bandwidth_down;
+ u32 bandwidth_up;
atomic_t refcount;
struct rcu_head rcu;
};
@@ -338,7 +356,7 @@ struct batadv_gw_node {
struct batadv_neigh_node {
struct hlist_node list;
struct batadv_orig_node *orig_node;
- uint8_t addr[ETH_ALEN];
+ u8 addr[ETH_ALEN];
struct hlist_head ifinfo_list;
spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */
struct batadv_hard_iface *if_incoming;
@@ -358,11 +376,11 @@ struct batadv_neigh_node {
* @real_packet_count: counted result of real_bits
*/
struct batadv_neigh_ifinfo_bat_iv {
- uint8_t tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE];
- uint8_t tq_index;
- uint8_t tq_avg;
+ u8 tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE];
+ u8 tq_index;
+ u8 tq_avg;
DECLARE_BITMAP(real_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
- uint8_t real_packet_count;
+ u8 real_packet_count;
};
/**
@@ -378,7 +396,7 @@ struct batadv_neigh_ifinfo {
struct hlist_node list;
struct batadv_hard_iface *if_outgoing;
struct batadv_neigh_ifinfo_bat_iv bat_iv;
- uint8_t last_ttl;
+ u8 last_ttl;
atomic_t refcount;
struct rcu_head rcu;
};
@@ -391,7 +409,7 @@ struct batadv_neigh_ifinfo {
*/
#ifdef CONFIG_BATMAN_ADV_BLA
struct batadv_bcast_duplist_entry {
- uint8_t orig[ETH_ALEN];
+ u8 orig[ETH_ALEN];
__be32 crc;
unsigned long entrytime;
};
@@ -517,13 +535,13 @@ struct batadv_priv_tt {
struct list_head changes_list;
struct batadv_hashtable *local_hash;
struct batadv_hashtable *global_hash;
- struct list_head req_list;
+ struct hlist_head req_list;
struct list_head roam_list;
spinlock_t changes_list_lock; /* protects changes */
spinlock_t req_list_lock; /* protects req_list */
spinlock_t roam_list_lock; /* protects roam_list */
unsigned char *last_changeset;
- int16_t last_changeset_len;
+ s16 last_changeset_len;
/* protects last_changeset & last_changeset_len */
spinlock_t last_changeset_lock;
/* prevents from executing a commit while reading the table */
@@ -643,7 +661,7 @@ struct batadv_priv_mcast {
struct hlist_head want_all_unsnoopables_list;
struct hlist_head want_all_ipv4_list;
struct hlist_head want_all_ipv6_list;
- uint8_t flags;
+ u8 flags;
bool enabled;
atomic_t num_disabled;
atomic_t num_want_all_unsnoopables;
@@ -761,7 +779,7 @@ struct batadv_priv {
atomic_t mesh_state;
struct net_device *soft_iface;
struct net_device_stats stats;
- uint64_t __percpu *bat_counters; /* Per cpu counters */
+ u64 __percpu *bat_counters; /* Per cpu counters */
atomic_t aggregated_ogms;
atomic_t bonding;
atomic_t fragmentation;
@@ -783,8 +801,8 @@ struct batadv_priv {
#ifdef CONFIG_BATMAN_ADV_DEBUG
atomic_t log_level;
#endif
- uint32_t isolation_mark;
- uint32_t isolation_mark_mask;
+ u32 isolation_mark;
+ u32 isolation_mark_mask;
atomic_t bcast_seqno;
atomic_t bcast_queue_left;
atomic_t batman_queue_left;
@@ -850,7 +868,7 @@ struct batadv_socket_client {
struct batadv_socket_packet {
struct list_head list;
size_t icmp_len;
- uint8_t icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE];
+ u8 icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE];
};
/**
@@ -871,14 +889,14 @@ struct batadv_socket_packet {
*/
#ifdef CONFIG_BATMAN_ADV_BLA
struct batadv_bla_backbone_gw {
- uint8_t orig[ETH_ALEN];
+ u8 orig[ETH_ALEN];
unsigned short vid;
struct hlist_node hash_entry;
struct batadv_priv *bat_priv;
unsigned long lasttime;
atomic_t wait_periods;
atomic_t request_sent;
- uint16_t crc;
+ u16 crc;
atomic_t refcount;
struct rcu_head rcu;
};
@@ -894,7 +912,7 @@ struct batadv_bla_backbone_gw {
* @rcu: struct used for freeing in an RCU-safe manner
*/
struct batadv_bla_claim {
- uint8_t addr[ETH_ALEN];
+ u8 addr[ETH_ALEN];
unsigned short vid;
struct batadv_bla_backbone_gw *backbone_gw;
unsigned long lasttime;
@@ -916,10 +934,10 @@ struct batadv_bla_claim {
* @rcu: struct used for freeing in an RCU-safe manner
*/
struct batadv_tt_common_entry {
- uint8_t addr[ETH_ALEN];
+ u8 addr[ETH_ALEN];
unsigned short vid;
struct hlist_node hash_entry;
- uint16_t flags;
+ u16 flags;
unsigned long added_at;
atomic_t refcount;
struct rcu_head rcu;
@@ -961,7 +979,7 @@ struct batadv_tt_global_entry {
*/
struct batadv_tt_orig_list_entry {
struct batadv_orig_node *orig_node;
- uint8_t ttvn;
+ u8 ttvn;
struct hlist_node list;
atomic_t refcount;
struct rcu_head rcu;
@@ -984,9 +1002,9 @@ struct batadv_tt_change_node {
* @list: list node for batadv_priv_tt::req_list
*/
struct batadv_tt_req_node {
- uint8_t addr[ETH_ALEN];
+ u8 addr[ETH_ALEN];
unsigned long issued_at;
- struct list_head list;
+ struct hlist_node list;
};
/**
@@ -998,7 +1016,7 @@ struct batadv_tt_req_node {
* @list: list node for batadv_priv_tt::roam_list
*/
struct batadv_tt_roam_node {
- uint8_t addr[ETH_ALEN];
+ u8 addr[ETH_ALEN];
atomic_t counter;
unsigned long first_time;
struct list_head list;
@@ -1015,7 +1033,7 @@ struct batadv_tt_roam_node {
*/
struct batadv_nc_node {
struct list_head list;
- uint8_t addr[ETH_ALEN];
+ u8 addr[ETH_ALEN];
atomic_t refcount;
struct rcu_head rcu;
struct batadv_orig_node *orig_node;
@@ -1039,8 +1057,8 @@ struct batadv_nc_path {
atomic_t refcount;
struct list_head packet_list;
spinlock_t packet_list_lock; /* Protects packet_list */
- uint8_t next_hop[ETH_ALEN];
- uint8_t prev_hop[ETH_ALEN];
+ u8 next_hop[ETH_ALEN];
+ u8 prev_hop[ETH_ALEN];
unsigned long last_valid;
};
@@ -1092,11 +1110,11 @@ struct batadv_skb_cb {
struct batadv_forw_packet {
struct hlist_node list;
unsigned long send_time;
- uint8_t own;
+ u8 own;
struct sk_buff *skb;
- uint16_t packet_len;
- uint32_t direct_link_flags;
- uint8_t num_packets;
+ u16 packet_len;
+ u32 direct_link_flags;
+ u8 num_packets;
struct delayed_work delayed_work;
struct batadv_hard_iface *if_incoming;
struct batadv_hard_iface *if_outgoing;
@@ -1118,6 +1136,8 @@ struct batadv_forw_packet {
* @bat_neigh_is_equiv_or_better: check if neigh1 is equally good or better
* than neigh2 for their respective outgoing interface from the metric
* prospective
+ * @bat_neigh_free: free the resources allocated by the routing algorithm for a
+ * neigh_node object
* @bat_orig_print: print the originator table (optional)
* @bat_orig_free: free the resources allocated by the routing algorithm for an
* orig_node object
@@ -1135,6 +1155,7 @@ struct batadv_algo_ops {
void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface);
void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface);
void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet);
+ /* neigh_node handling API */
int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1,
struct batadv_hard_iface *if_outgoing1,
struct batadv_neigh_node *neigh2,
@@ -1144,6 +1165,7 @@ struct batadv_algo_ops {
struct batadv_hard_iface *if_outgoing1,
struct batadv_neigh_node *neigh2,
struct batadv_hard_iface *if_outgoing2);
+ void (*bat_neigh_free)(struct batadv_neigh_node *neigh);
/* orig_node handling API */
void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq,
struct batadv_hard_iface *hard_iface);
@@ -1167,7 +1189,7 @@ struct batadv_algo_ops {
*/
struct batadv_dat_entry {
__be32 ip;
- uint8_t mac_addr[ETH_ALEN];
+ u8 mac_addr[ETH_ALEN];
unsigned short vid;
unsigned long last_update;
struct hlist_node hash_entry;
@@ -1229,14 +1251,13 @@ struct batadv_tvlv_handler {
struct hlist_node list;
void (*ogm_handler)(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig,
- uint8_t flags,
- void *tvlv_value, uint16_t tvlv_value_len);
+ u8 flags, void *tvlv_value, u16 tvlv_value_len);
int (*unicast_handler)(struct batadv_priv *bat_priv,
- uint8_t *src, uint8_t *dst,
- void *tvlv_value, uint16_t tvlv_value_len);
- uint8_t type;
- uint8_t version;
- uint8_t flags;
+ u8 *src, u8 *dst,
+ void *tvlv_value, u16 tvlv_value_len);
+ u8 type;
+ u8 version;
+ u8 flags;
atomic_t refcount;
struct rcu_head rcu;
};
diff --git a/kernel/net/bluetooth/6lowpan.c b/kernel/net/bluetooth/6lowpan.c
index 1742b849f..795ddd8b2 100644
--- a/kernel/net/bluetooth/6lowpan.c
+++ b/kernel/net/bluetooth/6lowpan.c
@@ -21,8 +21,6 @@
#include <net/ip6_route.h>
#include <net/addrconf.h>
-#include <net/af_ieee802154.h> /* to get the address type */
-
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
@@ -35,7 +33,6 @@ static struct dentry *lowpan_enable_debugfs;
static struct dentry *lowpan_control_debugfs;
#define IFACE_NAME_TEMPLATE "bt%d"
-#define EUI64_ADDR_LEN 8
struct skb_cb {
struct in6_addr addr;
@@ -85,7 +82,7 @@ struct lowpan_dev {
static inline struct lowpan_dev *lowpan_dev(const struct net_device *netdev)
{
- return netdev_priv(netdev);
+ return (struct lowpan_dev *)lowpan_priv(netdev)->priv;
}
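With the generic 6lowpan rework this driver now builds on, netdev_priv() points at the shared struct lowpan_priv and the Bluetooth-specific state sits behind its trailing priv member; allocation sizes the area with LOWPAN_PRIV_SIZE() and lowpan_netdev_setup() records the link-layer type. A sketch of the accessor pattern, treating the exact struct layout as an assumption:

#include <linux/netdevice.h>
#include <net/6lowpan.h>

/* Driver-private state appended behind the generic lowpan area. */
struct btle_lowpan_dev {
	struct net_device *netdev;
	/* peers, hdev, ... omitted */
};

static inline struct btle_lowpan_dev *btle_lowpan_dev(const struct net_device *netdev)
{
	/* netdev_priv() is struct lowpan_priv; our block starts at ->priv */
	return (struct btle_lowpan_dev *)lowpan_priv(netdev)->priv;
}

/* Matching allocation (sketch):
 *   netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct btle_lowpan_dev)),
 *                         "bt%d", NET_NAME_UNKNOWN, setup_cb);
 *   lowpan_netdev_setup(netdev, LOWPAN_LLTYPE_BTLE);
 */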
static inline void peer_add(struct lowpan_dev *dev, struct lowpan_peer *peer)
@@ -192,7 +189,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev,
if (ipv6_addr_any(nexthop))
return NULL;
} else {
- nexthop = rt6_nexthop(rt);
+ nexthop = rt6_nexthop(rt, daddr);
/* We need to remember the address because it is needed
* by bt_xmit() when sending the packet. In bt_xmit(), the
@@ -266,14 +263,13 @@ static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev)
if (!skb_cp)
return NET_RX_DROP;
- return netif_rx(skb_cp);
+ return netif_rx_ni(skb_cp);
}
static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev,
struct l2cap_chan *chan)
{
const u8 *saddr, *daddr;
- u8 iphc0, iphc1;
struct lowpan_dev *dev;
struct lowpan_peer *peer;
@@ -288,22 +284,7 @@ static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev,
saddr = peer->eui64_addr;
daddr = dev->netdev->dev_addr;
- /* at least two bytes will be used for the encoding */
- if (skb->len < 2)
- return -EINVAL;
-
- if (lowpan_fetch_skb_u8(skb, &iphc0))
- return -EINVAL;
-
- if (lowpan_fetch_skb_u8(skb, &iphc1))
- return -EINVAL;
-
- return lowpan_header_decompress(skb, netdev,
- saddr, IEEE802154_ADDR_LONG,
- EUI64_ADDR_LEN, daddr,
- IEEE802154_ADDR_LONG, EUI64_ADDR_LEN,
- iphc0, iphc1);
-
+ return lowpan_header_decompress(skb, netdev, daddr, saddr);
}
static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
@@ -315,15 +296,20 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
if (!netif_running(dev))
goto drop;
- if (dev->type != ARPHRD_6LOWPAN)
+ if (dev->type != ARPHRD_6LOWPAN || !skb->len)
goto drop;
+ skb_reset_network_header(skb);
+
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
goto drop;
/* check that it's our buffer */
- if (skb->data[0] == LOWPAN_DISPATCH_IPV6) {
+ if (lowpan_is_ipv6(*skb_network_header(skb))) {
+ /* Pull off the 1-byte of 6lowpan header. */
+ skb_pull(skb, 1);
+
/* Copy the packet so that the IPv6 header is
* properly aligned.
*/
@@ -334,8 +320,8 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
local_skb->protocol = htons(ETH_P_IPV6);
local_skb->pkt_type = PACKET_HOST;
+ local_skb->dev = dev;
- skb_reset_network_header(local_skb);
skb_set_transport_header(local_skb, sizeof(struct ipv6hdr));
if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) {
@@ -348,38 +334,35 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
consume_skb(local_skb);
consume_skb(skb);
- } else {
- switch (skb->data[0] & 0xe0) {
- case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */
- local_skb = skb_clone(skb, GFP_ATOMIC);
- if (!local_skb)
- goto drop;
-
- ret = iphc_decompress(local_skb, dev, chan);
- if (ret < 0) {
- kfree_skb(local_skb);
- goto drop;
- }
+ } else if (lowpan_is_iphc(*skb_network_header(skb))) {
+ local_skb = skb_clone(skb, GFP_ATOMIC);
+ if (!local_skb)
+ goto drop;
- local_skb->protocol = htons(ETH_P_IPV6);
- local_skb->pkt_type = PACKET_HOST;
- local_skb->dev = dev;
+ local_skb->dev = dev;
- if (give_skb_to_upper(local_skb, dev)
- != NET_RX_SUCCESS) {
- kfree_skb(local_skb);
- goto drop;
- }
+ ret = iphc_decompress(local_skb, dev, chan);
+ if (ret < 0) {
+ kfree_skb(local_skb);
+ goto drop;
+ }
- dev->stats.rx_bytes += skb->len;
- dev->stats.rx_packets++;
+ local_skb->protocol = htons(ETH_P_IPV6);
+ local_skb->pkt_type = PACKET_HOST;
- consume_skb(local_skb);
- consume_skb(skb);
- break;
- default:
- break;
+ if (give_skb_to_upper(local_skb, dev)
+ != NET_RX_SUCCESS) {
+ kfree_skb(local_skb);
+ goto drop;
}
+
+ dev->stats.rx_bytes += skb->len;
+ dev->stats.rx_packets++;
+
+ consume_skb(local_skb);
+ consume_skb(skb);
+ } else {
+ goto drop;
}
return NET_RX_SUCCESS;
@@ -493,8 +476,7 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev,
status = 1;
}
- lowpan_header_compress(skb, netdev, ETH_P_IPV6, daddr,
- dev->netdev->dev_addr, skb->len);
+ lowpan_header_compress(skb, netdev, daddr, dev->netdev->dev_addr);
err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0);
if (err < 0)
@@ -674,13 +656,8 @@ static struct header_ops header_ops = {
static void netdev_setup(struct net_device *dev)
{
- dev->addr_len = EUI64_ADDR_LEN;
- dev->type = ARPHRD_6LOWPAN;
-
dev->hard_header_len = 0;
dev->needed_tailroom = 0;
- dev->mtu = IPV6_MIN_MTU;
- dev->tx_queue_len = 0;
dev->flags = IFF_RUNNING | IFF_POINTOPOINT |
IFF_MULTICAST;
dev->watchdog_timeo = 0;
@@ -775,24 +752,7 @@ static struct l2cap_chan *chan_create(void)
chan->chan_type = L2CAP_CHAN_CONN_ORIENTED;
chan->mode = L2CAP_MODE_LE_FLOWCTL;
- chan->omtu = 65535;
- chan->imtu = chan->omtu;
-
- return chan;
-}
-
-static struct l2cap_chan *chan_open(struct l2cap_chan *pchan)
-{
- struct l2cap_chan *chan;
-
- chan = chan_create();
- if (!chan)
- return NULL;
-
- chan->remote_mps = chan->omtu;
- chan->mps = chan->omtu;
-
- chan->state = BT_CONNECTED;
+ chan->imtu = 1280;
return chan;
}
@@ -848,20 +808,36 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
struct net_device *netdev;
int err = 0;
- netdev = alloc_netdev(sizeof(struct lowpan_dev), IFACE_NAME_TEMPLATE,
- NET_NAME_UNKNOWN, netdev_setup);
+ netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev)),
+ IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN,
+ netdev_setup);
if (!netdev)
return -ENOMEM;
set_dev_addr(netdev, &chan->src, chan->src_type);
netdev->netdev_ops = &netdev_ops;
- SET_NETDEV_DEV(netdev, &chan->conn->hcon->dev);
+ SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev);
SET_NETDEV_DEVTYPE(netdev, &bt_type);
+ *dev = lowpan_dev(netdev);
+ (*dev)->netdev = netdev;
+ (*dev)->hdev = chan->conn->hcon->hdev;
+ INIT_LIST_HEAD(&(*dev)->peers);
+
+ spin_lock(&devices_lock);
+ INIT_LIST_HEAD(&(*dev)->list);
+ list_add_rcu(&(*dev)->list, &bt_6lowpan_devices);
+ spin_unlock(&devices_lock);
+
+ lowpan_netdev_setup(netdev, LOWPAN_LLTYPE_BTLE);
+
err = register_netdev(netdev);
if (err < 0) {
BT_INFO("register_netdev failed %d", err);
+ spin_lock(&devices_lock);
+ list_del_rcu(&(*dev)->list);
+ spin_unlock(&devices_lock);
free_netdev(netdev);
goto out;
}
@@ -871,16 +847,6 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
&chan->src, chan->src_type);
set_bit(__LINK_STATE_PRESENT, &netdev->state);
- *dev = netdev_priv(netdev);
- (*dev)->netdev = netdev;
- (*dev)->hdev = chan->conn->hcon->hdev;
- INIT_LIST_HEAD(&(*dev)->peers);
-
- spin_lock(&devices_lock);
- INIT_LIST_HEAD(&(*dev)->list);
- list_add_rcu(&(*dev)->list, &bt_6lowpan_devices);
- spin_unlock(&devices_lock);
-
return 0;
out:
@@ -913,7 +879,10 @@ static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan)
{
struct l2cap_chan *chan;
- chan = chan_open(pchan);
+ chan = chan_create();
+ if (!chan)
+ return NULL;
+
chan->ops = pchan->ops;
BT_DBG("chan %p pchan %p", chan, pchan);
@@ -928,7 +897,7 @@ static void delete_netdev(struct work_struct *work)
unregister_netdev(entry->netdev);
- /* The entry pointer is deleted in device_event() */
+ /* The entry pointer is deleted by the netdev destructor. */
}
static void chan_close_cb(struct l2cap_chan *chan)
@@ -937,7 +906,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
struct lowpan_dev *dev = NULL;
struct lowpan_peer *peer;
int err = -ENOENT;
- bool last = false, removed = true;
+ bool last = false, remove = true;
BT_DBG("chan %p conn %p", chan, chan->conn);
@@ -948,7 +917,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
/* If conn is set, then the netdev is also there and we should
* not remove it.
*/
- removed = false;
+ remove = false;
}
spin_lock(&devices_lock);
@@ -977,7 +946,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
ifdown(dev->netdev);
- if (!removed) {
+ if (remove) {
INIT_WORK(&entry->delete_netdev, delete_netdev);
schedule_work(&entry->delete_netdev);
}
@@ -1059,34 +1028,23 @@ static inline __u8 bdaddr_type(__u8 type)
return BDADDR_LE_RANDOM;
}
-static struct l2cap_chan *chan_get(void)
-{
- struct l2cap_chan *pchan;
-
- pchan = chan_create();
- if (!pchan)
- return NULL;
-
- pchan->ops = &bt_6lowpan_chan_ops;
-
- return pchan;
-}
-
static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type)
{
- struct l2cap_chan *pchan;
+ struct l2cap_chan *chan;
int err;
- pchan = chan_get();
- if (!pchan)
+ chan = chan_create();
+ if (!chan)
return -EINVAL;
- err = l2cap_chan_connect(pchan, cpu_to_le16(L2CAP_PSM_IPSP), 0,
+ chan->ops = &bt_6lowpan_chan_ops;
+
+ err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0,
addr, dst_type);
- BT_DBG("chan %p err %d", pchan, err);
+ BT_DBG("chan %p err %d", chan, err);
if (err < 0)
- l2cap_chan_put(pchan);
+ l2cap_chan_put(chan);
return err;
}
@@ -1111,31 +1069,32 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type)
static struct l2cap_chan *bt_6lowpan_listen(void)
{
bdaddr_t *addr = BDADDR_ANY;
- struct l2cap_chan *pchan;
+ struct l2cap_chan *chan;
int err;
if (!enable_6lowpan)
return NULL;
- pchan = chan_get();
- if (!pchan)
+ chan = chan_create();
+ if (!chan)
return NULL;
- pchan->state = BT_LISTEN;
- pchan->src_type = BDADDR_LE_PUBLIC;
+ chan->ops = &bt_6lowpan_chan_ops;
+ chan->state = BT_LISTEN;
+ chan->src_type = BDADDR_LE_PUBLIC;
- atomic_set(&pchan->nesting, L2CAP_NESTING_PARENT);
+ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT);
- BT_DBG("chan %p src type %d", pchan, pchan->src_type);
+ BT_DBG("chan %p src type %d", chan, chan->src_type);
- err = l2cap_add_psm(pchan, addr, cpu_to_le16(L2CAP_PSM_IPSP));
+ err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP));
if (err) {
- l2cap_chan_put(pchan);
+ l2cap_chan_put(chan);
BT_ERR("psm cannot be added err %d", err);
return NULL;
}
- return pchan;
+ return chan;
}
static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
@@ -1159,7 +1118,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
return -ENOENT;
hci_dev_lock(hdev);
- hcon = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr);
+ hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type);
hci_dev_unlock(hdev);
if (!hcon)
@@ -1208,8 +1167,6 @@ static void disconnect_all_peers(void)
list_del_rcu(&peer->list);
kfree_rcu(peer, rcu);
-
- module_put(THIS_MODULE);
}
spin_unlock(&devices_lock);
}
@@ -1418,7 +1375,6 @@ static int device_event(struct notifier_block *unused,
BT_DBG("Unregistered netdev %s %p",
netdev->name, netdev);
list_del(&entry->list);
- kfree(entry);
break;
}
}
diff --git a/kernel/net/bluetooth/Kconfig b/kernel/net/bluetooth/Kconfig
index b8c794b87..95d1a66ba 100644
--- a/kernel/net/bluetooth/Kconfig
+++ b/kernel/net/bluetooth/Kconfig
@@ -53,6 +53,11 @@ source "net/bluetooth/cmtp/Kconfig"
source "net/bluetooth/hidp/Kconfig"
+config BT_HS
+ bool "Bluetooth High Speed (HS) features"
+ depends on BT_BREDR
+ default y
+
config BT_LE
bool "Bluetooth Low Energy (LE) features"
depends on BT
diff --git a/kernel/net/bluetooth/Makefile b/kernel/net/bluetooth/Makefile
index 9a8ea232d..2b15ae8c1 100644
--- a/kernel/net/bluetooth/Makefile
+++ b/kernel/net/bluetooth/Makefile
@@ -12,9 +12,11 @@ obj-$(CONFIG_BT_6LOWPAN) += bluetooth_6lowpan.o
bluetooth_6lowpan-y := 6lowpan.o
bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
- hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o sco.o lib.o \
- a2mp.o amp.o ecc.o hci_request.o mgmt_util.o
+ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \
+ ecc.o hci_request.o mgmt_util.o
+bluetooth-$(CONFIG_BT_BREDR) += sco.o
+bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
diff --git a/kernel/net/bluetooth/a2mp.c b/kernel/net/bluetooth/a2mp.c
index 5a04eb1a7..5f123c332 100644
--- a/kernel/net/bluetooth/a2mp.c
+++ b/kernel/net/bluetooth/a2mp.c
@@ -16,6 +16,7 @@
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
+#include "hci_request.h"
#include "a2mp.h"
#include "amp.h"
@@ -286,11 +287,21 @@ static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb,
return 0;
}
+static void read_local_amp_info_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ a2mp_send_getinfo_rsp(hdev);
+}
+
static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb,
struct a2mp_cmd *hdr)
{
struct a2mp_info_req *req = (void *) skb->data;
struct hci_dev *hdev;
+ struct hci_request hreq;
+ int err = 0;
if (le16_to_cpu(hdr->len) < sizeof(*req))
return -EINVAL;
@@ -311,7 +322,11 @@ static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb,
}
set_bit(READ_LOC_AMP_INFO, &mgr->state);
- hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL);
+ hci_req_init(&hreq, hdev);
+ hci_req_add(&hreq, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL);
+ err = hci_req_run(&hreq, read_local_amp_info_complete);
+ if (err < 0)
+ a2mp_send_getinfo_rsp(hdev);
done:
if (hdev)
diff --git a/kernel/net/bluetooth/a2mp.h b/kernel/net/bluetooth/a2mp.h
index 296f665ad..a4ff3ea9b 100644
--- a/kernel/net/bluetooth/a2mp.h
+++ b/kernel/net/bluetooth/a2mp.h
@@ -130,10 +130,29 @@ struct a2mp_physlink_rsp {
#define A2MP_STATUS_SECURITY_VIOLATION 0x06
struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr);
+
+#if IS_ENABLED(CONFIG_BT_HS)
int amp_mgr_put(struct amp_mgr *mgr);
struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
struct sk_buff *skb);
void a2mp_discover_amp(struct l2cap_chan *chan);
+#else
+static inline int amp_mgr_put(struct amp_mgr *mgr)
+{
+ return 0;
+}
+
+static inline struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ return NULL;
+}
+
+static inline void a2mp_discover_amp(struct l2cap_chan *chan)
+{
+}
+#endif
+
void a2mp_send_getinfo_rsp(struct hci_dev *hdev);
void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status);
void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status);
diff --git a/kernel/net/bluetooth/af_bluetooth.c b/kernel/net/bluetooth/af_bluetooth.c
index 70f9d945f..70306cc9d 100644
--- a/kernel/net/bluetooth/af_bluetooth.c
+++ b/kernel/net/bluetooth/af_bluetooth.c
@@ -33,7 +33,7 @@
#include "selftest.h"
-#define VERSION "2.20"
+#define VERSION "2.21"
/* Bluetooth sockets */
#define BT_MAX_PROTO 8
@@ -221,7 +221,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
BT_DBG("sock %p sk %p len %zu", sock, sk, len);
- if (flags & (MSG_OOB))
+ if (flags & MSG_OOB)
return -EOPNOTSUPP;
skb = skb_recv_datagram(sk, flags, noblock, &err);
@@ -271,11 +271,11 @@ static long bt_sock_data_wait(struct sock *sk, long timeo)
if (signal_pending(current) || !timeo)
break;
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
release_sock(sk);
timeo = schedule_timeout(timeo);
lock_sock(sk);
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
}
__set_current_state(TASK_RUNNING);
@@ -441,7 +441,7 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock,
if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
diff --git a/kernel/net/bluetooth/amp.c b/kernel/net/bluetooth/amp.c
index ee016f039..e32f34189 100644
--- a/kernel/net/bluetooth/amp.c
+++ b/kernel/net/bluetooth/amp.c
@@ -16,6 +16,7 @@
#include <net/bluetooth/hci_core.h>
#include <crypto/hash.h>
+#include "hci_request.h"
#include "a2mp.h"
#include "amp.h"
@@ -220,10 +221,49 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type)
return hmac_sha256(gamp_key, HCI_AMP_LINK_KEY_SIZE, "802b", 4, data);
}
+static void read_local_amp_assoc_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode, struct sk_buff *skb)
+{
+ struct hci_rp_read_local_amp_assoc *rp = (void *)skb->data;
+ struct amp_assoc *assoc = &hdev->loc_assoc;
+ size_t rem_len, frag_len;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ goto send_rsp;
+
+ frag_len = skb->len - sizeof(*rp);
+ rem_len = __le16_to_cpu(rp->rem_len);
+
+ if (rem_len > frag_len) {
+ BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len);
+
+ memcpy(assoc->data + assoc->offset, rp->frag, frag_len);
+ assoc->offset += frag_len;
+
+ /* Read other fragments */
+ amp_read_loc_assoc_frag(hdev, rp->phy_handle);
+
+ return;
+ }
+
+ memcpy(assoc->data + assoc->offset, rp->frag, rem_len);
+ assoc->len = assoc->offset + rem_len;
+ assoc->offset = 0;
+
+send_rsp:
+ /* Send A2MP Rsp when all fragments are received */
+ a2mp_send_getampassoc_rsp(hdev, rp->status);
+ a2mp_send_create_phy_link_req(hdev, rp->status);
+}
+
void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle)
{
struct hci_cp_read_local_amp_assoc cp;
struct amp_assoc *loc_assoc = &hdev->loc_assoc;
+ struct hci_request req;
+ int err = 0;
BT_DBG("%s handle %d", hdev->name, phy_handle);
@@ -231,12 +271,18 @@ void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle)
cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
cp.len_so_far = cpu_to_le16(loc_assoc->offset);
- hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ err = hci_req_run_skb(&req, read_local_amp_assoc_complete);
+ if (err < 0)
+ a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
}
void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr)
{
struct hci_cp_read_local_amp_assoc cp;
+ struct hci_request req;
+ int err = 0;
memset(&hdev->loc_assoc, 0, sizeof(struct amp_assoc));
memset(&cp, 0, sizeof(cp));
@@ -244,7 +290,11 @@ void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr)
cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
set_bit(READ_LOC_AMP_ASSOC, &mgr->state);
- hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_run_skb(&req, read_local_amp_assoc_complete);
+ if (err < 0)
+ a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
}
void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
@@ -252,6 +302,8 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
{
struct hci_cp_read_local_amp_assoc cp;
struct amp_mgr *mgr = hcon->amp_mgr;
+ struct hci_request req;
+ int err = 0;
cp.phy_handle = hcon->handle;
cp.len_so_far = cpu_to_le16(0);
@@ -260,7 +312,25 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
set_bit(READ_LOC_AMP_ASSOC_FINAL, &mgr->state);
/* Read Local AMP Assoc final link information data */
- hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_run_skb(&req, read_local_amp_assoc_complete);
+ if (err < 0)
+ a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
+}
+
+static void write_remote_amp_assoc_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode, struct sk_buff *skb)
+{
+ struct hci_rp_write_remote_amp_assoc *rp = (void *)skb->data;
+
+ BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x",
+ hdev->name, rp->status, rp->phy_handle);
+
+ if (rp->status)
+ return;
+
+ amp_write_rem_assoc_continue(hdev, rp->phy_handle);
}
/* Write AMP Assoc data fragments, returns true with last fragment written*/
@@ -270,6 +340,7 @@ static bool amp_write_rem_assoc_frag(struct hci_dev *hdev,
struct hci_cp_write_remote_amp_assoc *cp;
struct amp_mgr *mgr = hcon->amp_mgr;
struct amp_ctrl *ctrl;
+ struct hci_request req;
u16 frag_len, len;
ctrl = amp_ctrl_lookup(mgr, hcon->remote_id);
@@ -307,7 +378,9 @@ static bool amp_write_rem_assoc_frag(struct hci_dev *hdev,
amp_ctrl_put(ctrl);
- hci_send_cmd(hdev, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp);
+ hci_req_run_skb(&req, write_remote_amp_assoc_complete);
kfree(cp);
@@ -344,10 +417,37 @@ void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle)
amp_write_rem_assoc_frag(hdev, hcon);
}
+static void create_phylink_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct hci_cp_create_phy_link *cp;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (status) {
+ struct hci_conn *hcon;
+
+ hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle);
+ if (hcon)
+ hci_conn_del(hcon);
+ } else {
+ amp_write_remote_assoc(hdev, cp->phy_handle);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
struct hci_conn *hcon)
{
struct hci_cp_create_phy_link cp;
+ struct hci_request req;
cp.phy_handle = hcon->handle;
@@ -360,13 +460,33 @@ void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
return;
}
- hci_send_cmd(hdev, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp);
+ hci_req_run(&req, create_phylink_complete);
+}
+
+static void accept_phylink_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct hci_cp_accept_phy_link *cp;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK);
+ if (!cp)
+ return;
+
+ amp_write_remote_assoc(hdev, cp->phy_handle);
}
void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
struct hci_conn *hcon)
{
struct hci_cp_accept_phy_link cp;
+ struct hci_request req;
cp.phy_handle = hcon->handle;
@@ -379,7 +499,9 @@ void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
return;
}
- hci_send_cmd(hdev, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp);
+ hci_req_run(&req, accept_phylink_complete);
}
void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon)
diff --git a/kernel/net/bluetooth/amp.h b/kernel/net/bluetooth/amp.h
index 7ea3db77b..8848f8158 100644
--- a/kernel/net/bluetooth/amp.h
+++ b/kernel/net/bluetooth/amp.h
@@ -44,6 +44,20 @@ void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
struct hci_conn *hcon);
void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
struct hci_conn *hcon);
+
+#if IS_ENABLED(CONFIG_BT_HS)
+void amp_create_logical_link(struct l2cap_chan *chan);
+void amp_disconnect_logical_link(struct hci_chan *hchan);
+#else
+static inline void amp_create_logical_link(struct l2cap_chan *chan)
+{
+}
+
+static inline void amp_disconnect_logical_link(struct hci_chan *hchan)
+{
+}
+#endif
+
void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle);
void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle);
void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon);
diff --git a/kernel/net/bluetooth/bnep/sock.c b/kernel/net/bluetooth/bnep/sock.c
index bde2bdd9e..b5116fa98 100644
--- a/kernel/net/bluetooth/bnep/sock.c
+++ b/kernel/net/bluetooth/bnep/sock.c
@@ -202,7 +202,7 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_RAW)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto);
+ sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern);
if (!sk)
return -ENOMEM;
diff --git a/kernel/net/bluetooth/cmtp/capi.c b/kernel/net/bluetooth/cmtp/capi.c
index b0c6c6af7..9a5033877 100644
--- a/kernel/net/bluetooth/cmtp/capi.c
+++ b/kernel/net/bluetooth/cmtp/capi.c
@@ -100,9 +100,9 @@ static void cmtp_application_del(struct cmtp_session *session, struct cmtp_appli
static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value)
{
struct cmtp_application *app;
- struct list_head *p, *n;
+ struct list_head *p;
- list_for_each_safe(p, n, &session->applications) {
+ list_for_each(p, &session->applications) {
app = list_entry(p, struct cmtp_application, list);
switch (pattern) {
case CMTP_MSGNUM:
@@ -511,13 +511,13 @@ static int cmtp_proc_show(struct seq_file *m, void *v)
struct capi_ctr *ctrl = m->private;
struct cmtp_session *session = ctrl->driverdata;
struct cmtp_application *app;
- struct list_head *p, *n;
+ struct list_head *p;
seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl));
seq_printf(m, "addr %s\n", session->name);
seq_printf(m, "ctrl %d\n", session->num);
- list_for_each_safe(p, n, &session->applications) {
+ list_for_each(p, &session->applications) {
app = list_entry(p, struct cmtp_application, list);
seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping);
}
diff --git a/kernel/net/bluetooth/cmtp/sock.c b/kernel/net/bluetooth/cmtp/sock.c
index d82787d41..ce86a7bae 100644
--- a/kernel/net/bluetooth/cmtp/sock.c
+++ b/kernel/net/bluetooth/cmtp/sock.c
@@ -205,7 +205,7 @@ static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_RAW)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto);
+ sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern);
if (!sk)
return -ENOMEM;
diff --git a/kernel/net/bluetooth/hci_conn.c b/kernel/net/bluetooth/hci_conn.c
index ee5e59839..24e941092 100644
--- a/kernel/net/bluetooth/hci_conn.c
+++ b/kernel/net/bluetooth/hci_conn.c
@@ -59,9 +59,126 @@ static const struct sco_param esco_param_msbc[] = {
{ EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */
};
-static void hci_le_create_connection_cancel(struct hci_conn *conn)
+/* This function requires the caller holds hdev->lock */
+static void hci_connect_le_scan_cleanup(struct hci_conn *conn)
+{
+ struct hci_conn_params *params;
+ struct hci_dev *hdev = conn->hdev;
+ struct smp_irk *irk;
+ bdaddr_t *bdaddr;
+ u8 bdaddr_type;
+
+ bdaddr = &conn->dst;
+ bdaddr_type = conn->dst_type;
+
+ /* Check if we need to convert to identity address */
+ irk = hci_get_irk(hdev, bdaddr, bdaddr_type);
+ if (irk) {
+ bdaddr = &irk->bdaddr;
+ bdaddr_type = irk->addr_type;
+ }
+
+ params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr,
+ bdaddr_type);
+ if (!params || !params->explicit_connect)
+ return;
+
+ /* The connection attempt was doing scan for new RPA, and is
+ * in scan phase. If params are not associated with any other
+ * autoconnect action, remove them completely. If they are, just unmark
+ * them as waiting for connection, by clearing explicit_connect field.
+ */
+ params->explicit_connect = false;
+
+ list_del_init(&params->action);
+
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_EXPLICIT:
+ hci_conn_params_del(hdev, bdaddr, bdaddr_type);
+ /* return instead of break to avoid duplicate scan update */
+ return;
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ list_add(&params->action, &hdev->pend_le_conns);
+ break;
+ case HCI_AUTO_CONN_REPORT:
+ list_add(&params->action, &hdev->pend_le_reports);
+ break;
+ default:
+ break;
+ }
+
+ hci_update_background_scan(hdev);
+}
+
+static void hci_conn_cleanup(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags))
+ hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type);
+
+ hci_chan_list_flush(conn);
+
+ hci_conn_hash_del(hdev, conn);
+
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_CONN_DEL);
+
+ hci_conn_del_sysfs(conn);
+
+ debugfs_remove_recursive(conn->debugfs);
+
+ hci_dev_put(hdev);
+
+ hci_conn_put(conn);
+}
+
+static void le_scan_cleanup(struct work_struct *work)
+{
+ struct hci_conn *conn = container_of(work, struct hci_conn,
+ le_scan_cleanup);
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_conn *c = NULL;
+
+ BT_DBG("%s hcon %p", hdev->name, conn);
+
+ hci_dev_lock(hdev);
+
+ /* Check that the hci_conn is still around */
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &hdev->conn_hash.list, list) {
+ if (c == conn)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (c == conn) {
+ hci_connect_le_scan_cleanup(conn);
+ hci_conn_cleanup(conn);
+ }
+
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+ hci_conn_put(conn);
+}
+
+static void hci_connect_le_scan_remove(struct hci_conn *conn)
{
- hci_send_cmd(conn->hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL);
+ BT_DBG("%s hcon %p", conn->hdev->name, conn);
+
+ /* We can't call hci_conn_del/hci_conn_cleanup here since that
+ * could deadlock with another hci_conn_del() call that's holding
+ * hci_dev_lock and doing cancel_delayed_work_sync(&conn->disc_work).
+ * Instead, grab temporary extra references to the hci_dev and
+ * hci_conn and perform the necessary cleanup in a separate work
+ * callback.
+ */
+
+ hci_dev_hold(conn->hdev);
+ hci_conn_get(conn);
+
+ schedule_work(&conn->le_scan_cleanup);
}
static void hci_acl_create_connection(struct hci_conn *conn)
@@ -107,33 +224,8 @@ static void hci_acl_create_connection(struct hci_conn *conn)
hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp);
}
-static void hci_acl_create_connection_cancel(struct hci_conn *conn)
-{
- struct hci_cp_create_conn_cancel cp;
-
- BT_DBG("hcon %p", conn);
-
- if (conn->hdev->hci_ver < BLUETOOTH_VER_1_2)
- return;
-
- bacpy(&cp.bdaddr, &conn->dst);
- hci_send_cmd(conn->hdev, HCI_OP_CREATE_CONN_CANCEL, sizeof(cp), &cp);
-}
-
-static void hci_reject_sco(struct hci_conn *conn)
-{
- struct hci_cp_reject_sync_conn_req cp;
-
- cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
- bacpy(&cp.bdaddr, &conn->dst);
-
- hci_send_cmd(conn->hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp);
-}
-
int hci_disconnect(struct hci_conn *conn, __u8 reason)
{
- struct hci_cp_disconnect cp;
-
BT_DBG("hcon %p", conn);
/* When we are master of an established connection and it enters
@@ -141,7 +233,8 @@ int hci_disconnect(struct hci_conn *conn, __u8 reason)
* current clock offset. Processing of the result is done
* within the event handling and hci_clock_offset_evt function.
*/
- if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER) {
+ if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER &&
+ (conn->state == BT_CONNECTED || conn->state == BT_CONFIG)) {
struct hci_dev *hdev = conn->hdev;
struct hci_cp_read_clock_offset clkoff_cp;
@@ -150,25 +243,7 @@ int hci_disconnect(struct hci_conn *conn, __u8 reason)
&clkoff_cp);
}
- conn->state = BT_DISCONN;
-
- cp.handle = cpu_to_le16(conn->handle);
- cp.reason = reason;
- return hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp);
-}
-
-static void hci_amp_disconn(struct hci_conn *conn)
-{
- struct hci_cp_disconn_phy_link cp;
-
- BT_DBG("hcon %p", conn);
-
- conn->state = BT_DISCONN;
-
- cp.phy_handle = HCI_PHY_HANDLE(conn->handle);
- cp.reason = hci_proto_disconn_ind(conn);
- hci_send_cmd(conn->hdev, HCI_OP_DISCONN_PHY_LINK,
- sizeof(cp), &cp);
+ return hci_abort_conn(conn, reason);
}
static void hci_add_sco(struct hci_conn *conn, __u16 handle)
@@ -276,7 +351,7 @@ u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
}
void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
- __u8 ltk[16])
+ __u8 ltk[16], __u8 key_size)
{
struct hci_dev *hdev = conn->hdev;
struct hci_cp_le_start_enc cp;
@@ -288,7 +363,7 @@ void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
cp.handle = cpu_to_le16(conn->handle);
cp.rand = rand;
cp.ediv = ediv;
- memcpy(cp.ltk, ltk, sizeof(cp.ltk));
+ memcpy(cp.ltk, ltk, key_size);
hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp);
}
@@ -334,31 +409,14 @@ static void hci_conn_timeout(struct work_struct *work)
if (refcnt > 0)
return;
- switch (conn->state) {
- case BT_CONNECT:
- case BT_CONNECT2:
- if (conn->out) {
- if (conn->type == ACL_LINK)
- hci_acl_create_connection_cancel(conn);
- else if (conn->type == LE_LINK)
- hci_le_create_connection_cancel(conn);
- } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) {
- hci_reject_sco(conn);
- }
- break;
- case BT_CONFIG:
- case BT_CONNECTED:
- if (conn->type == AMP_LINK) {
- hci_amp_disconn(conn);
- } else {
- __u8 reason = hci_proto_disconn_ind(conn);
- hci_disconnect(conn, reason);
- }
- break;
- default:
- conn->state = BT_CLOSED;
- break;
+ /* LE connections in scanning state need special handling */
+ if (conn->state == BT_CONNECT && conn->type == LE_LINK &&
+ test_bit(HCI_CONN_SCANNING, &conn->flags)) {
+ hci_connect_le_scan_remove(conn);
+ return;
}
+
+ hci_abort_conn(conn, hci_proto_disconn_ind(conn));
}
/* Enter sniff mode */
@@ -426,7 +484,7 @@ static void le_conn_timeout(struct work_struct *work)
return;
}
- hci_le_create_connection_cancel(conn);
+ hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
}
struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
@@ -489,6 +547,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept);
INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle);
INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout);
+ INIT_WORK(&conn->le_scan_cleanup, le_scan_cleanup);
atomic_set(&conn->refcnt, 0);
@@ -535,27 +594,17 @@ int hci_conn_del(struct hci_conn *conn)
}
}
- hci_chan_list_flush(conn);
-
if (conn->amp_mgr)
amp_mgr_put(conn->amp_mgr);
- hci_conn_hash_del(hdev, conn);
- if (hdev->notify)
- hdev->notify(hdev, HCI_NOTIFY_CONN_DEL);
-
skb_queue_purge(&conn->data_q);
- hci_conn_del_sysfs(conn);
-
- debugfs_remove_recursive(conn->debugfs);
-
- if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags))
- hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type);
-
- hci_dev_put(hdev);
-
- hci_conn_put(conn);
+ /* Remove the connection from the list and cleanup its remaining
+ * state. This is a separate function since for some cases like
+ * BT_CONNECT_SCAN we *only* want the cleanup part without the
+ * rest of hci_conn_del.
+ */
+ hci_conn_cleanup(conn);
return 0;
}
@@ -637,15 +686,18 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
{
struct hci_conn *conn;
- if (status == 0)
- return;
+ hci_dev_lock(hdev);
+
+ conn = hci_lookup_le_connect(hdev);
+
+ if (!status) {
+ hci_connect_le_scan_cleanup(conn);
+ goto done;
+ }
BT_ERR("HCI request failed to create LE connection: status 0x%2.2x",
status);
- hci_dev_lock(hdev);
-
- conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
if (!conn)
goto done;
@@ -670,8 +722,12 @@ static void hci_req_add_le_create_conn(struct hci_request *req,
if (hci_update_random_address(req, false, &own_addr_type))
return;
+ /* Set window to be the same value as the interval to enable
+ * continuous scanning.
+ */
cp.scan_interval = cpu_to_le16(hdev->le_scan_interval);
- cp.scan_window = cpu_to_le16(hdev->le_scan_window);
+ cp.scan_window = cp.scan_interval;
+
bacpy(&cp.peer_addr, &conn->dst);
cp.peer_addr_type = conn->dst_type;
cp.own_address_type = own_addr_type;
@@ -685,6 +741,7 @@ static void hci_req_add_le_create_conn(struct hci_request *req,
hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp);
conn->state = BT_CONNECT;
+ clear_bit(HCI_CONN_SCANNING, &conn->flags);
}
static void hci_req_directed_advertising(struct hci_request *req,
@@ -728,7 +785,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
u8 role)
{
struct hci_conn_params *params;
- struct hci_conn *conn;
+ struct hci_conn *conn, *conn_unfinished;
struct smp_irk *irk;
struct hci_request req;
int err;
@@ -750,27 +807,30 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
* attempt, we simply update pending_sec_level and auth_type fields
* and return the object found.
*/
- conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, dst);
+ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type);
+ conn_unfinished = NULL;
if (conn) {
- conn->pending_sec_level = sec_level;
- goto done;
+ if (conn->state == BT_CONNECT &&
+ test_bit(HCI_CONN_SCANNING, &conn->flags)) {
+ BT_DBG("will continue unfinished conn %pMR", dst);
+ conn_unfinished = conn;
+ } else {
+ if (conn->pending_sec_level < sec_level)
+ conn->pending_sec_level = sec_level;
+ goto done;
+ }
}
/* Since the controller supports only one LE connection attempt at a
* time, we return -EBUSY if there is any connection attempt running.
*/
- conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
- if (conn)
+ if (hci_lookup_le_connect(hdev))
return ERR_PTR(-EBUSY);
/* When given an identity address with existing identity
* resolving key, the connection needs to be established
* to a resolvable random address.
*
- * This uses the cached random resolvable address from
- * a previous scan. When no cached address is available,
- * try connecting to the identity address instead.
- *
* Storing the resolvable random address is required here
* to handle connection failures. The address will later
* be resolved back into the original identity address
@@ -782,15 +842,23 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
dst_type = ADDR_LE_DEV_RANDOM;
}
- conn = hci_conn_add(hdev, LE_LINK, dst, role);
+ if (conn_unfinished) {
+ conn = conn_unfinished;
+ bacpy(&conn->dst, dst);
+ } else {
+ conn = hci_conn_add(hdev, LE_LINK, dst, role);
+ }
+
if (!conn)
return ERR_PTR(-ENOMEM);
conn->dst_type = dst_type;
conn->sec_level = BT_SECURITY_LOW;
- conn->pending_sec_level = sec_level;
conn->conn_timeout = conn_timeout;
+ if (!conn_unfinished)
+ conn->pending_sec_level = sec_level;
+
hci_req_init(&req, hdev);
/* Disable advertising if we're active. For master role
@@ -855,6 +923,149 @@ create_conn:
}
done:
+ /* If this is continuation of connect started by hci_connect_le_scan,
+ * it already called hci_conn_hold and calling it again would mess the
+ * counter.
+ */
+ if (!conn_unfinished)
+ hci_conn_hold(conn);
+
+ return conn;
+}
+
+static void hci_connect_le_scan_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct hci_conn *conn;
+
+ if (!status)
+ return;
+
+ BT_ERR("Failed to add device to auto conn whitelist: status 0x%2.2x",
+ status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+ if (conn)
+ hci_le_conn_failed(conn, status);
+
+ hci_dev_unlock(hdev);
+}
+
+static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
+{
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_le(hdev, addr, type);
+ if (!conn)
+ return false;
+
+ if (conn->state != BT_CONNECTED)
+ return false;
+
+ return true;
+}
+
+/* This function requires the caller holds hdev->lock */
+static int hci_explicit_conn_params_set(struct hci_request *req,
+ bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_conn_params *params;
+
+ if (is_connected(hdev, addr, addr_type))
+ return -EISCONN;
+
+ params = hci_conn_params_lookup(hdev, addr, addr_type);
+ if (!params) {
+ params = hci_conn_params_add(hdev, addr, addr_type);
+ if (!params)
+ return -ENOMEM;
+
+ /* If we created new params, mark them to be deleted in
+ * hci_connect_le_scan_cleanup. It's different case than
+ * existing disabled params, those will stay after cleanup.
+ */
+ params->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+ }
+
+ /* We're trying to connect, so make sure params are at pend_le_conns */
+ if (params->auto_connect == HCI_AUTO_CONN_DISABLED ||
+ params->auto_connect == HCI_AUTO_CONN_REPORT ||
+ params->auto_connect == HCI_AUTO_CONN_EXPLICIT) {
+ list_del_init(&params->action);
+ list_add(&params->action, &hdev->pend_le_conns);
+ }
+
+ params->explicit_connect = true;
+ __hci_update_background_scan(req);
+
+ BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type,
+ params->auto_connect);
+
+ return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
+ u8 dst_type, u8 sec_level,
+ u16 conn_timeout, u8 role)
+{
+ struct hci_conn *conn;
+ struct hci_request req;
+ int err;
+
+ /* Let's make sure that le is enabled.*/
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ if (lmp_le_capable(hdev))
+ return ERR_PTR(-ECONNREFUSED);
+
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ /* Some devices send ATT messages as soon as the physical link is
+ * established. To be able to handle these ATT messages, the user-
+ * space first establishes the connection and then starts the pairing
+ * process.
+ *
+ * So if a hci_conn object already exists for the following connection
+ * attempt, we simply update pending_sec_level and auth_type fields
+ * and return the object found.
+ */
+ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type);
+ if (conn) {
+ if (conn->pending_sec_level < sec_level)
+ conn->pending_sec_level = sec_level;
+ goto done;
+ }
+
+ BT_DBG("requesting refresh of dst_addr");
+
+ conn = hci_conn_add(hdev, LE_LINK, dst, role);
+ if (!conn)
+ return ERR_PTR(-ENOMEM);
+
+ hci_req_init(&req, hdev);
+
+ if (hci_explicit_conn_params_set(&req, dst, dst_type) < 0)
+ return ERR_PTR(-EBUSY);
+
+ conn->state = BT_CONNECT;
+ set_bit(HCI_CONN_SCANNING, &conn->flags);
+
+ err = hci_req_run(&req, hci_connect_le_scan_complete);
+ if (err && err != -ENODATA) {
+ hci_conn_del(conn);
+ return ERR_PTR(err);
+ }
+
+ conn->dst_type = dst_type;
+ conn->sec_level = BT_SECURITY_LOW;
+ conn->pending_sec_level = sec_level;
+ conn->conn_timeout = conn_timeout;
+
+done:
hci_conn_hold(conn);
return conn;
}
diff --git a/kernel/net/bluetooth/hci_core.c b/kernel/net/bluetooth/hci_core.c
index c4802f3bd..62edbf1b1 100644
--- a/kernel/net/bluetooth/hci_core.c
+++ b/kernel/net/bluetooth/hci_core.c
@@ -65,13 +65,6 @@ static DEFINE_IDA(hci_index_ida);
#define hci_req_lock(d) mutex_lock(&d->req_lock)
#define hci_req_unlock(d) mutex_unlock(&d->req_lock)
-/* ---- HCI notifications ---- */
-
-static void hci_notify(struct hci_dev *hdev, int event)
-{
- hci_sock_dev_event(hdev, event);
-}
-
/* ---- HCI debugfs entries ---- */
static ssize_t dut_mode_read(struct file *file, char __user *user_buf,
@@ -94,7 +87,6 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
char buf[32];
size_t buf_size = min(count, (sizeof(buf)-1));
bool enable;
- int err;
if (!test_bit(HCI_UP, &hdev->flags))
return -ENETDOWN;
@@ -121,12 +113,8 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
if (IS_ERR(skb))
return PTR_ERR(skb);
- err = -bt_to_errno(skb->data[0]);
kfree_skb(skb);
- if (err < 0)
- return err;
-
hci_dev_change_flag(hdev, HCI_DUT_MODE);
return count;
@@ -139,6 +127,77 @@ static const struct file_operations dut_mode_fops = {
.llseek = default_llseek,
};
+static ssize_t vendor_diag_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y': 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[32];
+ size_t buf_size = min(count, (sizeof(buf)-1));
+ bool enable;
+ int err;
+
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+ if (strtobool(buf, &enable))
+ return -EINVAL;
+
+ /* When the diagnostic flags are not persistent and the transport
+ * is not active, then there is no need for the vendor callback.
+ *
+ * Instead just store the desired value. If needed the setting
+ * will be programmed when the controller gets powered on.
+ */
+ if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) &&
+ !test_bit(HCI_RUNNING, &hdev->flags))
+ goto done;
+
+ hci_req_lock(hdev);
+ err = hdev->set_diag(hdev, enable);
+ hci_req_unlock(hdev);
+
+ if (err < 0)
+ return err;
+
+done:
+ if (enable)
+ hci_dev_set_flag(hdev, HCI_VENDOR_DIAG);
+ else
+ hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG);
+
+ return count;
+}
+
+static const struct file_operations vendor_diag_fops = {
+ .open = simple_open,
+ .read = vendor_diag_read,
+ .write = vendor_diag_write,
+ .llseek = default_llseek,
+};
+
+static void hci_debugfs_create_basic(struct hci_dev *hdev)
+{
+ debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev,
+ &dut_mode_fops);
+
+ if (hdev->set_diag)
+ debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev,
+ &vendor_diag_fops);
+}
+
/* ---- HCI requests ---- */
static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
@@ -449,12 +508,6 @@ static void le_setup(struct hci_request *req)
/* Read LE Supported States */
hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL);
- /* Read LE White List Size */
- hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, 0, NULL);
-
- /* Clear LE White List */
- hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL);
-
/* LE-only controllers have LE implicitly enabled */
if (!lmp_bredr_capable(hdev))
hci_dev_set_flag(hdev, HCI_LE_ENABLED);
@@ -698,7 +751,8 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt)
hci_setup_event_mask(req);
- if (hdev->commands[6] & 0x20) {
+ if (hdev->commands[6] & 0x20 &&
+ !test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) {
struct hci_cp_read_stored_link_key cp;
bacpy(&cp.bdaddr, BDADDR_ANY);
@@ -772,6 +826,17 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL);
}
+ if (hdev->commands[26] & 0x40) {
+ /* Read LE White List Size */
+ hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE,
+ 0, NULL);
+ }
+
+ if (hdev->commands[26] & 0x80) {
+ /* Clear LE White List */
+ hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL);
+ }
+
if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
/* Read LE Maximum Data Length */
hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL);
@@ -854,13 +919,8 @@ static int __hci_init(struct hci_dev *hdev)
if (err < 0)
return err;
- /* The Device Under Test (DUT) mode is special and available for
- * all controller types. So just create it early on.
- */
- if (hci_dev_test_flag(hdev, HCI_SETUP)) {
- debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev,
- &dut_mode_fops);
- }
+ if (hci_dev_test_flag(hdev, HCI_SETUP))
+ hci_debugfs_create_basic(hdev);
err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT);
if (err < 0)
@@ -937,6 +997,9 @@ static int __hci_unconf_init(struct hci_dev *hdev)
if (err < 0)
return err;
+ if (hci_dev_test_flag(hdev, HCI_SETUP))
+ hci_debugfs_create_basic(hdev);
+
return 0;
}
@@ -1389,10 +1452,15 @@ static int hci_dev_do_open(struct hci_dev *hdev)
goto done;
}
+ set_bit(HCI_RUNNING, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_OPEN);
+
atomic_set(&hdev->cmd_cnt, 1);
set_bit(HCI_INIT, &hdev->flags);
if (hci_dev_test_flag(hdev, HCI_SETUP)) {
+ hci_sock_dev_event(hdev, HCI_DEV_SETUP);
+
if (hdev->setup)
ret = hdev->setup(hdev);
@@ -1433,17 +1501,28 @@ static int hci_dev_do_open(struct hci_dev *hdev)
if (!ret) {
if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
- !hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
ret = __hci_init(hdev);
+ if (!ret && hdev->post_init)
+ ret = hdev->post_init(hdev);
+ }
}
+ /* If the HCI Reset command is clearing all diagnostic settings,
+ * then they need to be reprogrammed after the init procedure
+ * completed.
+ */
+ if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) &&
+ hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag)
+ ret = hdev->set_diag(hdev, true);
+
clear_bit(HCI_INIT, &hdev->flags);
if (!ret) {
hci_dev_hold(hdev);
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
set_bit(HCI_UP, &hdev->flags);
- hci_notify(hdev, HCI_DEV_UP);
+ hci_sock_dev_event(hdev, HCI_DEV_UP);
if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
!hci_dev_test_flag(hdev, HCI_CONFIG) &&
!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
@@ -1470,6 +1549,9 @@ static int hci_dev_do_open(struct hci_dev *hdev)
hdev->sent_cmd = NULL;
}
+ clear_bit(HCI_RUNNING, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
+
hdev->close(hdev);
hdev->flags &= BIT(HCI_RAW);
}
@@ -1553,11 +1635,14 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev)
BT_DBG("All LE pending actions cleared");
}
-static int hci_dev_do_close(struct hci_dev *hdev)
+int hci_dev_do_close(struct hci_dev *hdev)
{
+ bool auto_off;
+
BT_DBG("%s %p", hdev->name, hdev);
if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
test_bit(HCI_UP, &hdev->flags)) {
/* Execute vendor specific shutdown routine */
if (hdev->shutdown)
@@ -1595,6 +1680,11 @@ static int hci_dev_do_close(struct hci_dev *hdev)
if (hci_dev_test_flag(hdev, HCI_MGMT))
cancel_delayed_work_sync(&hdev->rpa_expired);
+ if (hdev->adv_instance_timeout) {
+ cancel_delayed_work_sync(&hdev->adv_instance_expire);
+ hdev->adv_instance_timeout = 0;
+ }
+
/* Avoid potential lockdep warnings from the *_flush() calls by
* ensuring the workqueue is empty up front.
*/
@@ -1604,10 +1694,10 @@ static int hci_dev_do_close(struct hci_dev *hdev)
hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
- if (!hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
- if (hdev->dev_type == HCI_BREDR)
- mgmt_powered(hdev, 0);
- }
+ auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
+
+ if (!auto_off && hdev->dev_type == HCI_BREDR)
+ mgmt_powered(hdev, 0);
hci_inquiry_cache_flush(hdev);
hci_pend_le_actions_clear(hdev);
@@ -1616,7 +1706,7 @@ static int hci_dev_do_close(struct hci_dev *hdev)
smp_unregister(hdev);
- hci_notify(hdev, HCI_DEV_DOWN);
+ hci_sock_dev_event(hdev, HCI_DEV_DOWN);
if (hdev->flush)
hdev->flush(hdev);
@@ -1624,9 +1714,8 @@ static int hci_dev_do_close(struct hci_dev *hdev)
/* Reset device */
skb_queue_purge(&hdev->cmd_q);
atomic_set(&hdev->cmd_cnt, 1);
- if (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) &&
- !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
- test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) {
+ if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) &&
+ !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
set_bit(HCI_INIT, &hdev->flags);
__hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT);
clear_bit(HCI_INIT, &hdev->flags);
@@ -1647,6 +1736,9 @@ static int hci_dev_do_close(struct hci_dev *hdev)
hdev->sent_cmd = NULL;
}
+ clear_bit(HCI_RUNNING, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
+
/* After this point our queues are empty
* and no tasks are scheduled. */
hdev->close(hdev);
@@ -2151,6 +2243,17 @@ static void hci_discov_off(struct work_struct *work)
mgmt_discoverable_timeout(hdev);
}
+static void hci_adv_timeout_expire(struct work_struct *work)
+{
+ struct hci_dev *hdev;
+
+ hdev = container_of(work, struct hci_dev, adv_instance_expire.work);
+
+ BT_DBG("%s", hdev->name);
+
+ mgmt_adv_timeout_expired(hdev);
+}
+
void hci_uuids_clear(struct hci_dev *hdev)
{
struct bt_uuid *uuid, *tmp;
@@ -2614,6 +2717,130 @@ int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr,
return 0;
}
+/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *adv_instance;
+
+ list_for_each_entry(adv_instance, &hdev->adv_instances, list) {
+ if (adv_instance->instance == instance)
+ return adv_instance;
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *cur_instance;
+
+ cur_instance = hci_find_adv_instance(hdev, instance);
+ if (!cur_instance)
+ return NULL;
+
+ if (cur_instance == list_last_entry(&hdev->adv_instances,
+ struct adv_info, list))
+ return list_first_entry(&hdev->adv_instances,
+ struct adv_info, list);
+ else
+ return list_next_entry(cur_instance, list);
+}
+
+/* This function requires the caller holds hdev->lock */
+int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *adv_instance;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return -ENOENT;
+
+ BT_DBG("%s removing %dMR", hdev->name, instance);
+
+ if (hdev->cur_adv_instance == instance && hdev->adv_instance_timeout) {
+ cancel_delayed_work(&hdev->adv_instance_expire);
+ hdev->adv_instance_timeout = 0;
+ }
+
+ list_del(&adv_instance->list);
+ kfree(adv_instance);
+
+ hdev->adv_instance_cnt--;
+
+ return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_adv_instances_clear(struct hci_dev *hdev)
+{
+ struct adv_info *adv_instance, *n;
+
+ if (hdev->adv_instance_timeout) {
+ cancel_delayed_work(&hdev->adv_instance_expire);
+ hdev->adv_instance_timeout = 0;
+ }
+
+ list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
+ list_del(&adv_instance->list);
+ kfree(adv_instance);
+ }
+
+ hdev->adv_instance_cnt = 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
+ u16 adv_data_len, u8 *adv_data,
+ u16 scan_rsp_len, u8 *scan_rsp_data,
+ u16 timeout, u16 duration)
+{
+ struct adv_info *adv_instance;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (adv_instance) {
+ memset(adv_instance->adv_data, 0,
+ sizeof(adv_instance->adv_data));
+ memset(adv_instance->scan_rsp_data, 0,
+ sizeof(adv_instance->scan_rsp_data));
+ } else {
+ if (hdev->adv_instance_cnt >= HCI_MAX_ADV_INSTANCES ||
+ instance < 1 || instance > HCI_MAX_ADV_INSTANCES)
+ return -EOVERFLOW;
+
+ adv_instance = kzalloc(sizeof(*adv_instance), GFP_KERNEL);
+ if (!adv_instance)
+ return -ENOMEM;
+
+ adv_instance->pending = true;
+ adv_instance->instance = instance;
+ list_add(&adv_instance->list, &hdev->adv_instances);
+ hdev->adv_instance_cnt++;
+ }
+
+ adv_instance->flags = flags;
+ adv_instance->adv_data_len = adv_data_len;
+ adv_instance->scan_rsp_len = scan_rsp_len;
+
+ if (adv_data_len)
+ memcpy(adv_instance->adv_data, adv_data, adv_data_len);
+
+ if (scan_rsp_len)
+ memcpy(adv_instance->scan_rsp_data,
+ scan_rsp_data, scan_rsp_len);
+
+ adv_instance->timeout = timeout;
+ adv_instance->remaining_time = timeout;
+
+ if (duration == 0)
+ adv_instance->duration = HCI_DEFAULT_ADV_DURATION;
+ else
+ adv_instance->duration = duration;
+
+ BT_DBG("%s for %dMR", hdev->name, instance);
+
+ return 0;
+}
+
struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list,
bdaddr_t *bdaddr, u8 type)
{
@@ -2686,10 +2913,6 @@ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
{
struct hci_conn_params *params;
- /* The conn params list only contains identity addresses */
- if (!hci_is_identity_address(addr, addr_type))
- return NULL;
-
list_for_each_entry(params, &hdev->le_conn_params, list) {
if (bacmp(&params->addr, addr) == 0 &&
params->addr_type == addr_type) {
@@ -2706,10 +2929,6 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
{
struct hci_conn_params *param;
- /* The list only contains identity addresses */
- if (!hci_is_identity_address(addr, addr_type))
- return NULL;
-
list_for_each_entry(param, list, action) {
if (bacmp(&param->addr, addr) == 0 &&
param->addr_type == addr_type)
@@ -2725,9 +2944,6 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev,
{
struct hci_conn_params *params;
- if (!hci_is_identity_address(addr, addr_type))
- return NULL;
-
params = hci_conn_params_lookup(hdev, addr, addr_type);
if (params)
return params;
@@ -2791,6 +3007,15 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev)
list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) {
if (params->auto_connect != HCI_AUTO_CONN_DISABLED)
continue;
+
+ /* If trying to establish one time connection to disabled
+ * device, leave the params, but mark them as just once.
+ */
+ if (params->explicit_connect) {
+ params->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+ continue;
+ }
+
list_del(&params->list);
kfree(params);
}
@@ -3019,6 +3244,9 @@ struct hci_dev *hci_alloc_dev(void)
hdev->manufacturer = 0xffff; /* Default to internal use */
hdev->inq_tx_power = HCI_TX_POWER_INVALID;
hdev->adv_tx_power = HCI_TX_POWER_INVALID;
+ hdev->adv_instance_cnt = 0;
+ hdev->cur_adv_instance = 0x00;
+ hdev->adv_instance_timeout = 0;
hdev->sniff_max_interval = 800;
hdev->sniff_min_interval = 80;
@@ -3060,6 +3288,7 @@ struct hci_dev *hci_alloc_dev(void)
INIT_LIST_HEAD(&hdev->pend_le_conns);
INIT_LIST_HEAD(&hdev->pend_le_reports);
INIT_LIST_HEAD(&hdev->conn_hash.list);
+ INIT_LIST_HEAD(&hdev->adv_instances);
INIT_WORK(&hdev->rx_work, hci_rx_work);
INIT_WORK(&hdev->cmd_work, hci_cmd_work);
@@ -3071,6 +3300,7 @@ struct hci_dev *hci_alloc_dev(void)
INIT_DELAYED_WORK(&hdev->discov_off, hci_discov_off);
INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work);
INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work);
+ INIT_DELAYED_WORK(&hdev->adv_instance_expire, hci_adv_timeout_expire);
skb_queue_head_init(&hdev->rx_q);
skb_queue_head_init(&hdev->cmd_q);
@@ -3082,7 +3312,6 @@ struct hci_dev *hci_alloc_dev(void)
hci_init_sysfs(hdev);
discovery_init(hdev);
- adv_info_init(hdev);
return hdev;
}
@@ -3183,7 +3412,7 @@ int hci_register_dev(struct hci_dev *hdev)
if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
- hci_notify(hdev, HCI_DEV_REG);
+ hci_sock_dev_event(hdev, HCI_DEV_REG);
hci_dev_hold(hdev);
queue_work(hdev->req_workqueue, &hdev->power_on);
@@ -3231,7 +3460,7 @@ void hci_unregister_dev(struct hci_dev *hdev)
* pending list */
BUG_ON(!list_empty(&hdev->mgmt_pending));
- hci_notify(hdev, HCI_DEV_UNREG);
+ hci_sock_dev_event(hdev, HCI_DEV_UNREG);
if (hdev->rfkill) {
rfkill_unregister(hdev->rfkill);
@@ -3253,6 +3482,7 @@ void hci_unregister_dev(struct hci_dev *hdev)
hci_smp_ltks_clear(hdev);
hci_smp_irks_clear(hdev);
hci_remote_oob_data_clear(hdev);
+ hci_adv_instances_clear(hdev);
hci_bdaddr_list_clear(&hdev->le_white_list);
hci_conn_params_clear_all(hdev);
hci_discovery_filter_clear(hdev);
@@ -3267,7 +3497,7 @@ EXPORT_SYMBOL(hci_unregister_dev);
/* Suspend HCI device */
int hci_suspend_dev(struct hci_dev *hdev)
{
- hci_notify(hdev, HCI_DEV_SUSPEND);
+ hci_sock_dev_event(hdev, HCI_DEV_SUSPEND);
return 0;
}
EXPORT_SYMBOL(hci_suspend_dev);
@@ -3275,7 +3505,7 @@ EXPORT_SYMBOL(hci_suspend_dev);
/* Resume HCI device */
int hci_resume_dev(struct hci_dev *hdev)
{
- hci_notify(hdev, HCI_DEV_RESUME);
+ hci_sock_dev_event(hdev, HCI_DEV_RESUME);
return 0;
}
EXPORT_SYMBOL(hci_resume_dev);
@@ -3307,6 +3537,13 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
return -ENXIO;
}
+ if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT &&
+ bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
+ bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
/* Incoming skb */
bt_cb(skb)->incoming = 1;
@@ -3320,6 +3557,22 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
}
EXPORT_SYMBOL(hci_recv_frame);
+/* Receive diagnostic message from HCI drivers */
+int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ /* Mark as diagnostic packet */
+ bt_cb(skb)->pkt_type = HCI_DIAG_PKT;
+
+ /* Time stamp */
+ __net_timestamp(skb);
+
+ skb_queue_tail(&hdev->rx_q, skb);
+ queue_work(hdev->workqueue, &hdev->rx_work);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_recv_diag);
+
/* ---- Interface to upper protocols ---- */
int hci_register_cb(struct hci_cb *cb)
@@ -3366,6 +3619,11 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
/* Get rid of skb owner, prior to sending to the driver. */
skb_orphan(skb);
+ if (!test_bit(HCI_RUNNING, &hdev->flags)) {
+ kfree_skb(skb);
+ return;
+ }
+
err = hdev->send(hdev, skb);
if (err < 0) {
BT_ERR("%s sending frame failed (%d)", hdev->name, err);
@@ -3390,7 +3648,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
/* Stand-alone HCI commands must be flagged as
* single-command requests.
*/
- bt_cb(skb)->req.start = true;
+ bt_cb(skb)->hci.req_start = true;
skb_queue_tail(&hdev->cmd_q, skb);
queue_work(hdev->workqueue, &hdev->cmd_work);
@@ -3416,6 +3674,25 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE;
}
+/* Send HCI command and wait for command complete event */
+struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u32 timeout)
+{
+ struct sk_buff *skb;
+
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return ERR_PTR(-ENETDOWN);
+
+ bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen);
+
+ hci_req_lock(hdev);
+ skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout);
+ hci_req_unlock(hdev);
+
+ return skb;
+}
+EXPORT_SYMBOL(hci_cmd_sync);
+
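Editorial sketch (not part of this patch): a minimal example of driving the new synchronous command helper. It assumes the existing HCI_OP_READ_LOCAL_VERSION opcode, struct hci_rp_read_local_version and HCI_CMD_TIMEOUT from the HCI headers; the wrapper function itself is hypothetical.

static int example_read_local_version(struct hci_dev *hdev)
{
	struct hci_rp_read_local_version *rp;
	struct sk_buff *skb;

	/* Blocks until the Command Complete event arrives or the
	 * timeout expires; returns an ERR_PTR() on failure.
	 */
	skb = hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL,
			   HCI_CMD_TIMEOUT);
	if (IS_ERR(skb))
		return PTR_ERR(skb);

	if (skb->len < sizeof(*rp)) {
		kfree_skb(skb);
		return -EIO;
	}

	rp = (struct hci_rp_read_local_version *)skb->data;
	BT_DBG("%s hci_ver %u", hdev->name, rp->hci_ver);

	kfree_skb(skb);
	return 0;
}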
/* Send ACL data */
static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags)
{
@@ -4068,7 +4345,7 @@ static bool hci_req_is_complete(struct hci_dev *hdev)
if (!skb)
return true;
- return bt_cb(skb)->req.start;
+ return bt_cb(skb)->hci.req_start;
}
static void hci_resend_last(struct hci_dev *hdev)
@@ -4128,26 +4405,26 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
* callback would be found in hdev->sent_cmd instead of the
* command queue (hdev->cmd_q).
*/
- if (bt_cb(hdev->sent_cmd)->req.complete) {
- *req_complete = bt_cb(hdev->sent_cmd)->req.complete;
+ if (bt_cb(hdev->sent_cmd)->hci.req_complete) {
+ *req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete;
return;
}
- if (bt_cb(hdev->sent_cmd)->req.complete_skb) {
- *req_complete_skb = bt_cb(hdev->sent_cmd)->req.complete_skb;
+ if (bt_cb(hdev->sent_cmd)->hci.req_complete_skb) {
+ *req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb;
return;
}
/* Remove all pending commands belonging to this request */
spin_lock_irqsave(&hdev->cmd_q.lock, flags);
while ((skb = __skb_dequeue(&hdev->cmd_q))) {
- if (bt_cb(skb)->req.start) {
+ if (bt_cb(skb)->hci.req_start) {
__skb_queue_head(&hdev->cmd_q, skb);
break;
}
- *req_complete = bt_cb(skb)->req.complete;
- *req_complete_skb = bt_cb(skb)->req.complete_skb;
+ *req_complete = bt_cb(skb)->hci.req_complete;
+ *req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
kfree_skb(skb);
}
spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
diff --git a/kernel/net/bluetooth/hci_event.c b/kernel/net/bluetooth/hci_event.c
index 7b61be736..d57c11c1c 100644
--- a/kernel/net/bluetooth/hci_event.c
+++ b/kernel/net/bluetooth/hci_event.c
@@ -55,7 +55,12 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb)
wake_up_bit(&hdev->flags, HCI_INQUIRY);
hci_dev_lock(hdev);
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ /* Set discovery state to stopped if we're not doing LE active
+ * scanning.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
+ hdev->le_scan_type != LE_SCAN_ACTIVE)
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
hci_dev_unlock(hdev);
hci_conn_check_pending(hdev);
@@ -823,7 +828,7 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev,
BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
if (rp->status)
- goto a2mp_rsp;
+ return;
hdev->amp_status = rp->amp_status;
hdev->amp_total_bw = __le32_to_cpu(rp->total_bw);
@@ -835,46 +840,6 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev,
hdev->amp_assoc_size = __le16_to_cpu(rp->max_assoc_size);
hdev->amp_be_flush_to = __le32_to_cpu(rp->be_flush_to);
hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to);
-
-a2mp_rsp:
- a2mp_send_getinfo_rsp(hdev);
-}
-
-static void hci_cc_read_local_amp_assoc(struct hci_dev *hdev,
- struct sk_buff *skb)
-{
- struct hci_rp_read_local_amp_assoc *rp = (void *) skb->data;
- struct amp_assoc *assoc = &hdev->loc_assoc;
- size_t rem_len, frag_len;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
-
- if (rp->status)
- goto a2mp_rsp;
-
- frag_len = skb->len - sizeof(*rp);
- rem_len = __le16_to_cpu(rp->rem_len);
-
- if (rem_len > frag_len) {
- BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len);
-
- memcpy(assoc->data + assoc->offset, rp->frag, frag_len);
- assoc->offset += frag_len;
-
- /* Read other fragments */
- amp_read_loc_assoc_frag(hdev, rp->phy_handle);
-
- return;
- }
-
- memcpy(assoc->data + assoc->offset, rp->frag, rem_len);
- assoc->len = assoc->offset + rem_len;
- assoc->offset = 0;
-
-a2mp_rsp:
- /* Send A2MP Rsp when all fragments are received */
- a2mp_send_getampassoc_rsp(hdev, rp->status);
- a2mp_send_create_phy_link_req(hdev, rp->status);
}
static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev,
@@ -1099,7 +1064,7 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_set_flag(hdev, HCI_LE_ADV);
- conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+ conn = hci_lookup_le_connect(hdev);
if (conn)
queue_delayed_work(hdev->workqueue,
&conn->le_conn_timeout,
@@ -1409,20 +1374,6 @@ static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
-static void hci_cc_write_remote_amp_assoc(struct hci_dev *hdev,
- struct sk_buff *skb)
-{
- struct hci_rp_write_remote_amp_assoc *rp = (void *) skb->data;
-
- BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x",
- hdev->name, rp->status, rp->phy_handle);
-
- if (rp->status)
- return;
-
- amp_write_rem_assoc_continue(hdev, rp->phy_handle);
-}
-
static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_rp_read_rssi *rp = (void *) skb->data;
@@ -1944,47 +1895,6 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
hci_dev_unlock(hdev);
}
-static void hci_cs_create_phylink(struct hci_dev *hdev, u8 status)
-{
- struct hci_cp_create_phy_link *cp;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK);
- if (!cp)
- return;
-
- hci_dev_lock(hdev);
-
- if (status) {
- struct hci_conn *hcon;
-
- hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle);
- if (hcon)
- hci_conn_del(hcon);
- } else {
- amp_write_remote_assoc(hdev, cp->phy_handle);
- }
-
- hci_dev_unlock(hdev);
-}
-
-static void hci_cs_accept_phylink(struct hci_dev *hdev, u8 status)
-{
- struct hci_cp_accept_phy_link *cp;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- if (status)
- return;
-
- cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK);
- if (!cp)
- return;
-
- amp_write_remote_assoc(hdev, cp->phy_handle);
-}
-
static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
{
struct hci_cp_le_create_conn *cp;
@@ -2005,7 +1915,8 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->peer_addr);
+ conn = hci_conn_hash_lookup_le(hdev, &cp->peer_addr,
+ cp->peer_addr_type);
if (!conn)
goto unlock;
@@ -2603,6 +2514,63 @@ unlock:
hci_dev_unlock(hdev);
}
+static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode, struct sk_buff *skb)
+{
+ const struct hci_rp_read_enc_key_size *rp;
+ struct hci_conn *conn;
+ u16 handle;
+
+ BT_DBG("%s status 0x%02x", hdev->name, status);
+
+ if (!skb || skb->len < sizeof(*rp)) {
+ BT_ERR("%s invalid HCI Read Encryption Key Size response",
+ hdev->name);
+ return;
+ }
+
+ rp = (void *)skb->data;
+ handle = le16_to_cpu(rp->handle);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!conn)
+ goto unlock;
+
+	/* If we fail to read the encryption key size, assume maximum
+	 * (which is the same as what we do when this HCI command isn't
+	 * supported).
+	 */
+ if (rp->status) {
+ BT_ERR("%s failed to read key size for handle %u", hdev->name,
+ handle);
+ conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ } else {
+ conn->enc_key_size = rp->key_size;
+ }
+
+ if (conn->state == BT_CONFIG) {
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, 0);
+ hci_conn_drop(conn);
+ } else {
+ u8 encrypt;
+
+ if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags))
+ encrypt = 0x00;
+ else if (test_bit(HCI_CONN_AES_CCM, &conn->flags))
+ encrypt = 0x02;
+ else
+ encrypt = 0x01;
+
+ hci_encrypt_cfm(conn, 0, encrypt);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_ev_encrypt_change *ev = (void *) skb->data;
@@ -2650,22 +2618,51 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
goto unlock;
}
- if (conn->state == BT_CONFIG) {
- if (!ev->status)
- conn->state = BT_CONNECTED;
+ /* In Secure Connections Only mode, do not allow any connections
+ * that are not encrypted with AES-CCM using a P-256 authenticated
+ * combination key.
+ */
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY) &&
+ (!test_bit(HCI_CONN_AES_CCM, &conn->flags) ||
+ conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) {
+ hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ /* Try reading the encryption key size for encrypted ACL links */
+ if (!ev->status && ev->encrypt && conn->type == ACL_LINK) {
+ struct hci_cp_read_enc_key_size cp;
+ struct hci_request req;
- /* In Secure Connections Only mode, do not allow any
- * connections that are not encrypted with AES-CCM
- * using a P-256 authenticated combination key.
+ /* Only send HCI_Read_Encryption_Key_Size if the
+ * controller really supports it. If it doesn't, assume
+ * the default size (16).
*/
- if (hci_dev_test_flag(hdev, HCI_SC_ONLY) &&
- (!test_bit(HCI_CONN_AES_CCM, &conn->flags) ||
- conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) {
- hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE);
- hci_conn_drop(conn);
- goto unlock;
+ if (!(hdev->commands[20] & 0x10)) {
+ conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ goto notify;
+ }
+
+ hci_req_init(&req, hdev);
+
+ cp.handle = cpu_to_le16(conn->handle);
+ hci_req_add(&req, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp);
+
+ if (hci_req_run_skb(&req, read_enc_key_size_complete)) {
+ BT_ERR("Sending HCI Read Encryption Key Size failed");
+ conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ goto notify;
}
+ goto unlock;
+ }
+
+notify:
+ if (conn->state == BT_CONFIG) {
+ if (!ev->status)
+ conn->state = BT_CONNECTED;
+
hci_connect_cfm(conn, ev->status);
hci_conn_drop(conn);
} else
@@ -2912,10 +2909,6 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_read_clock(hdev, skb);
break;
- case HCI_OP_READ_LOCAL_AMP_ASSOC:
- hci_cc_read_local_amp_assoc(hdev, skb);
- break;
-
case HCI_OP_READ_INQ_RSP_TX_POWER:
hci_cc_read_inq_rsp_tx_power(hdev, skb);
break;
@@ -3020,10 +3013,6 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_set_adv_param(hdev, skb);
break;
- case HCI_OP_WRITE_REMOTE_AMP_ASSOC:
- hci_cc_write_remote_amp_assoc(hdev, skb);
- break;
-
case HCI_OP_READ_RSSI:
hci_cc_read_rssi(hdev, skb);
break;
@@ -3107,14 +3096,6 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cs_setup_sync_conn(hdev, ev->status);
break;
- case HCI_OP_CREATE_PHY_LINK:
- hci_cs_create_phylink(hdev, ev->status);
- break;
-
- case HCI_OP_ACCEPT_PHY_LINK:
- hci_cs_accept_phylink(hdev, ev->status);
- break;
-
case HCI_OP_SNIFF_MODE:
hci_cs_sniff_mode(hdev, ev->status);
break;
@@ -3157,7 +3138,7 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
* complete event).
*/
if (ev->status ||
- (hdev->sent_cmd && !bt_cb(hdev->sent_cmd)->req.event))
+ (hdev->sent_cmd && !bt_cb(hdev->sent_cmd)->hci.req_event))
hci_req_cmd_complete(hdev, *opcode, ev->status, req_complete,
req_complete_skb);
@@ -3751,17 +3732,25 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev,
if (ev->link_type == ESCO_LINK)
goto unlock;
+ /* When the link type in the event indicates SCO connection
+ * and lookup of the connection object fails, then check
+ * if an eSCO connection object exists.
+ *
+	 * The core limits the synchronous connections to either
+	 * SCO or eSCO. The eSCO connection is preferred and attempted
+	 * first; until it is successfully established, the link type
+	 * will be hinted as eSCO.
+ */
conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr);
if (!conn)
goto unlock;
-
- conn->type = SCO_LINK;
}
switch (ev->status) {
case 0x00:
conn->handle = __le16_to_cpu(ev->handle);
conn->state = BT_CONNECTED;
+ conn->type = ev->link_type;
hci_debugfs_create_conn(conn);
hci_conn_add_sysfs(conn);
@@ -4313,6 +4302,23 @@ unlock:
hci_dev_unlock(hdev);
}
+#if IS_ENABLED(CONFIG_BT_HS)
+static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_channel_selected *ev = (void *)skb->data;
+ struct hci_conn *hcon;
+
+ BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle);
+
+ skb_pull(skb, sizeof(*ev));
+
+ hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
+ if (!hcon)
+ return;
+
+ amp_read_loc_assoc_final_data(hdev, hcon);
+}
+
static void hci_phy_link_complete_evt(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -4436,6 +4442,7 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev,
hci_dev_unlock(hdev);
}
+#endif
static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
@@ -4454,7 +4461,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
*/
hci_dev_clear_flag(hdev, HCI_LE_ADV);
- conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+ conn = hci_lookup_le_connect(hdev);
if (!conn) {
conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role);
if (!conn) {
@@ -4647,42 +4654,49 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
/* If we're not connectable only connect devices that we have in
* our pend_le_conns list.
*/
- params = hci_pend_le_action_lookup(&hdev->pend_le_conns,
- addr, addr_type);
+ params = hci_pend_le_action_lookup(&hdev->pend_le_conns, addr,
+ addr_type);
if (!params)
return NULL;
- switch (params->auto_connect) {
- case HCI_AUTO_CONN_DIRECT:
- /* Only devices advertising with ADV_DIRECT_IND are
- * triggering a connection attempt. This is allowing
- * incoming connections from slave devices.
- */
- if (adv_type != LE_ADV_DIRECT_IND)
+ if (!params->explicit_connect) {
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_DIRECT:
+ /* Only devices advertising with ADV_DIRECT_IND are
+ * triggering a connection attempt. This is allowing
+ * incoming connections from slave devices.
+ */
+ if (adv_type != LE_ADV_DIRECT_IND)
+ return NULL;
+ break;
+ case HCI_AUTO_CONN_ALWAYS:
+ /* Devices advertising with ADV_IND or ADV_DIRECT_IND
+ * are triggering a connection attempt. This means
+			 * that incoming connections from slave devices are
+ * accepted and also outgoing connections to slave
+ * devices are established when found.
+ */
+ break;
+ default:
return NULL;
- break;
- case HCI_AUTO_CONN_ALWAYS:
- /* Devices advertising with ADV_IND or ADV_DIRECT_IND
- * are triggering a connection attempt. This means
- * that incoming connectioms from slave device are
- * accepted and also outgoing connections to slave
- * devices are established when found.
- */
- break;
- default:
- return NULL;
+ }
}
conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW,
HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER);
if (!IS_ERR(conn)) {
- /* Store the pointer since we don't really have any
+		/* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned
+		 * by the higher layer that tried to connect. If not, then
+		 * store the pointer since we don't really have any
* other owner of the object besides the params that
* triggered it. This way we can abort the connection if
* the parameters get removed and keep the reference
* count consistent once the connection is established.
*/
- params->conn = hci_conn_get(conn);
+
+ if (!params->explicit_connect)
+ params->conn = hci_conn_get(conn);
+
return conn;
}
@@ -4711,6 +4725,27 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
struct hci_conn *conn;
bool match;
u32 flags;
+ u8 *ptr, real_len;
+
+ /* Find the end of the data in case the report contains padded zero
+ * bytes at the end causing an invalid length value.
+ *
+	 * When data is NULL, len is 0, so there is no need for an extra ptr
+	 * check as 'ptr < data + 0' is already false in that case.
+ */
+ for (ptr = data; ptr < data + len && *ptr; ptr += *ptr + 1) {
+ if (ptr + 1 + *ptr > data + len)
+ break;
+ }
+
+ real_len = ptr - data;
+
+ /* Adjust for actual length */
+ if (len != real_len) {
+ BT_ERR_RATELIMITED("%s advertising data length corrected",
+ hdev->name);
+ len = real_len;
+ }
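Editorial worked example for the length-correction scan above (not part of the patch): for a report padded with zeros such as data = { 0x02, 0x01, 0x06, 0x00, 0x00 } and len = 5, the loop walks the first AD structure (length octet 0x02 plus its two payload octets) and then stops at the zero-length octet, so real_len becomes 3 and len is corrected from 5 to 3 before the report is processed further.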
/* If the direct address is present, then this report is from
* a LE Direct Advertising Report event. In that case it is
@@ -4955,7 +4990,8 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
goto not_found;
}
- memcpy(cp.ltk, ltk->val, sizeof(ltk->val));
+ memcpy(cp.ltk, ltk->val, ltk->enc_size);
+ memset(cp.ltk + ltk->enc_size, 0, sizeof(cp.ltk) - ltk->enc_size);
cp.handle = cpu_to_le16(conn->handle);
conn->pending_sec_level = smp_ltk_sec_level(ltk);
@@ -5119,22 +5155,6 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
}
}
-static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb)
-{
- struct hci_ev_channel_selected *ev = (void *) skb->data;
- struct hci_conn *hcon;
-
- BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle);
-
- skb_pull(skb, sizeof(*ev));
-
- hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
- if (!hcon)
- return;
-
- amp_read_loc_assoc_final_data(hdev, hcon);
-}
-
static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode,
u8 event, struct sk_buff *skb)
{
@@ -5189,7 +5209,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
u8 status = 0, event = hdr->evt, req_evt = 0;
u16 opcode = HCI_OP_NOP;
- if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->req.event == event) {
+ if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->hci.req_event == event) {
struct hci_command_hdr *cmd_hdr = (void *) hdev->sent_cmd->data;
opcode = __le16_to_cpu(cmd_hdr->opcode);
hci_req_cmd_complete(hdev, opcode, status, &req_complete,
@@ -5355,14 +5375,15 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
hci_le_meta_evt(hdev, skb);
break;
- case HCI_EV_CHANNEL_SELECTED:
- hci_chan_selected_evt(hdev, skb);
- break;
-
case HCI_EV_REMOTE_OOB_DATA_REQUEST:
hci_remote_oob_data_request_evt(hdev, skb);
break;
+#if IS_ENABLED(CONFIG_BT_HS)
+ case HCI_EV_CHANNEL_SELECTED:
+ hci_chan_selected_evt(hdev, skb);
+ break;
+
case HCI_EV_PHY_LINK_COMPLETE:
hci_phy_link_complete_evt(hdev, skb);
break;
@@ -5378,6 +5399,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
case HCI_EV_DISCONN_PHY_LINK_COMPLETE:
hci_disconn_phylink_complete_evt(hdev, skb);
break;
+#endif
case HCI_EV_NUM_COMP_BLOCKS:
hci_num_comp_blocks_evt(hdev, skb);
diff --git a/kernel/net/bluetooth/hci_request.c b/kernel/net/bluetooth/hci_request.c
index d6025d6e6..02778c5bc 100644
--- a/kernel/net/bluetooth/hci_request.c
+++ b/kernel/net/bluetooth/hci_request.c
@@ -56,8 +56,8 @@ static int req_run(struct hci_request *req, hci_req_complete_t complete,
return -ENODATA;
skb = skb_peek_tail(&req->cmd_q);
- bt_cb(skb)->req.complete = complete;
- bt_cb(skb)->req.complete_skb = complete_skb;
+ bt_cb(skb)->hci.req_complete = complete;
+ bt_cb(skb)->hci.req_complete_skb = complete_skb;
spin_lock_irqsave(&hdev->cmd_q.lock, flags);
skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q);
@@ -99,7 +99,7 @@ struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
BT_DBG("skb len %d", skb->len);
bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
- bt_cb(skb)->opcode = opcode;
+ bt_cb(skb)->hci.opcode = opcode;
return skb;
}
@@ -128,9 +128,9 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
}
if (skb_queue_empty(&req->cmd_q))
- bt_cb(skb)->req.start = true;
+ bt_cb(skb)->hci.req_start = true;
- bt_cb(skb)->req.event = event;
+ bt_cb(skb)->hci.req_event = event;
skb_queue_tail(&req->cmd_q, skb);
}
@@ -175,21 +175,29 @@ static u8 update_white_list(struct hci_request *req)
* command to remove it from the controller.
*/
list_for_each_entry(b, &hdev->le_white_list, list) {
- struct hci_cp_le_del_from_white_list cp;
+ /* If the device is neither in pend_le_conns nor
+ * pend_le_reports then remove it from the whitelist.
+ */
+ if (!hci_pend_le_action_lookup(&hdev->pend_le_conns,
+ &b->bdaddr, b->bdaddr_type) &&
+ !hci_pend_le_action_lookup(&hdev->pend_le_reports,
+ &b->bdaddr, b->bdaddr_type)) {
+ struct hci_cp_le_del_from_white_list cp;
+
+ cp.bdaddr_type = b->bdaddr_type;
+ bacpy(&cp.bdaddr, &b->bdaddr);
- if (hci_pend_le_action_lookup(&hdev->pend_le_conns,
- &b->bdaddr, b->bdaddr_type) ||
- hci_pend_le_action_lookup(&hdev->pend_le_reports,
- &b->bdaddr, b->bdaddr_type)) {
- white_list_entries++;
+ hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST,
+ sizeof(cp), &cp);
continue;
}
- cp.bdaddr_type = b->bdaddr_type;
- bacpy(&cp.bdaddr, &b->bdaddr);
+ if (hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) {
+		/* White list cannot be used with RPAs */
+ return 0x00;
+ }
- hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST,
- sizeof(cp), &cp);
+ white_list_entries++;
}
/* Since all no longer valid white list entries have been
@@ -317,7 +325,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
* address be updated at the next cycle.
*/
if (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
- hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) {
+ hci_lookup_le_connect(hdev)) {
BT_DBG("Deferring random address update");
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
return;
@@ -479,7 +487,6 @@ void hci_update_page_scan(struct hci_dev *hdev)
void __hci_update_background_scan(struct hci_request *req)
{
struct hci_dev *hdev = req->hdev;
- struct hci_conn *conn;
if (!test_bit(HCI_UP, &hdev->flags) ||
test_bit(HCI_INIT, &hdev->flags) ||
@@ -529,8 +536,7 @@ void __hci_update_background_scan(struct hci_request *req)
* since some controllers are not able to scan and connect at
* the same time.
*/
- conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
- if (conn)
+ if (hci_lookup_le_connect(hdev))
return;
/* If controller is currently scanning, we stop it to ensure we
@@ -566,3 +572,96 @@ void hci_update_background_scan(struct hci_dev *hdev)
if (err && err != -ENODATA)
BT_ERR("Failed to run HCI request: err %d", err);
}
+
+void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
+ u8 reason)
+{
+ switch (conn->state) {
+ case BT_CONNECTED:
+ case BT_CONFIG:
+ if (conn->type == AMP_LINK) {
+ struct hci_cp_disconn_phy_link cp;
+
+ cp.phy_handle = HCI_PHY_HANDLE(conn->handle);
+ cp.reason = reason;
+ hci_req_add(req, HCI_OP_DISCONN_PHY_LINK, sizeof(cp),
+ &cp);
+ } else {
+ struct hci_cp_disconnect dc;
+
+ dc.handle = cpu_to_le16(conn->handle);
+ dc.reason = reason;
+ hci_req_add(req, HCI_OP_DISCONNECT, sizeof(dc), &dc);
+ }
+
+ conn->state = BT_DISCONN;
+
+ break;
+ case BT_CONNECT:
+ if (conn->type == LE_LINK) {
+ if (test_bit(HCI_CONN_SCANNING, &conn->flags))
+ break;
+ hci_req_add(req, HCI_OP_LE_CREATE_CONN_CANCEL,
+ 0, NULL);
+ } else if (conn->type == ACL_LINK) {
+ if (req->hdev->hci_ver < BLUETOOTH_VER_1_2)
+ break;
+ hci_req_add(req, HCI_OP_CREATE_CONN_CANCEL,
+ 6, &conn->dst);
+ }
+ break;
+ case BT_CONNECT2:
+ if (conn->type == ACL_LINK) {
+ struct hci_cp_reject_conn_req rej;
+
+ bacpy(&rej.bdaddr, &conn->dst);
+ rej.reason = reason;
+
+ hci_req_add(req, HCI_OP_REJECT_CONN_REQ,
+ sizeof(rej), &rej);
+ } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) {
+ struct hci_cp_reject_sync_conn_req rej;
+
+ bacpy(&rej.bdaddr, &conn->dst);
+
+ /* SCO rejection has its own limited set of
+ * allowed error values (0x0D-0x0F) which isn't
+ * compatible with most values passed to this
+			 * function. To be safe, hard-code one of the
+ * values that's suitable for SCO.
+ */
+ rej.reason = HCI_ERROR_REMOTE_LOW_RESOURCES;
+
+ hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ,
+ sizeof(rej), &rej);
+ }
+ break;
+ default:
+ conn->state = BT_CLOSED;
+ break;
+ }
+}
+
+static void abort_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ if (status)
+ BT_DBG("Failed to abort connection: status 0x%2.2x", status);
+}
+
+int hci_abort_conn(struct hci_conn *conn, u8 reason)
+{
+ struct hci_request req;
+ int err;
+
+ hci_req_init(&req, conn->hdev);
+
+ __hci_abort_conn(&req, conn, reason);
+
+ err = hci_req_run(&req, abort_conn_complete);
+ if (err && err != -ENODATA) {
+ BT_ERR("Failed to run HCI request: err %d", err);
+ return err;
+ }
+
+ return 0;
+}
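Editorial sketch (not part of this patch) showing how the new helper is meant to be called from connection handling code; the calling function is hypothetical and HCI_ERROR_REMOTE_USER_TERM comes from the existing HCI error definitions.

static void example_drop_link(struct hci_conn *conn)
{
	/* Builds and runs a request that either disconnects, cancels
	 * the pending connection or rejects it, depending on
	 * conn->state, then reports any request failure.
	 */
	int err = hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);

	if (err)
		BT_ERR("abort failed for handle %u (%d)", conn->handle, err);
}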
diff --git a/kernel/net/bluetooth/hci_request.h b/kernel/net/bluetooth/hci_request.h
index bf6df92f4..25c7f1305 100644
--- a/kernel/net/bluetooth/hci_request.h
+++ b/kernel/net/bluetooth/hci_request.h
@@ -55,3 +55,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy,
void hci_update_background_scan(struct hci_dev *hdev);
void __hci_update_background_scan(struct hci_request *req);
+
+int hci_abort_conn(struct hci_conn *conn, u8 reason);
+void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
+ u8 reason);
diff --git a/kernel/net/bluetooth/hci_sock.c b/kernel/net/bluetooth/hci_sock.c
index e11a5cfda..b1eb8c09a 100644
--- a/kernel/net/bluetooth/hci_sock.c
+++ b/kernel/net/bluetooth/hci_sock.c
@@ -120,10 +120,7 @@ static bool is_filtered_packet(struct sock *sk, struct sk_buff *skb)
/* Apply filter */
flt = &hci_pi(sk)->filter;
- if (bt_cb(skb)->pkt_type == HCI_VENDOR_PKT)
- flt_type = 0;
- else
- flt_type = bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS;
+ flt_type = bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS;
if (!test_bit(flt_type, &flt->type_mask))
return true;
@@ -173,6 +170,11 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
continue;
if (hci_pi(sk)->channel == HCI_CHANNEL_RAW) {
+ if (bt_cb(skb)->pkt_type != HCI_COMMAND_PKT &&
+ bt_cb(skb)->pkt_type != HCI_EVENT_PKT &&
+ bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
+ bt_cb(skb)->pkt_type != HCI_SCODATA_PKT)
+ continue;
if (is_filtered_packet(sk, skb))
continue;
} else if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
@@ -279,6 +281,9 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
else
opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT);
break;
+ case HCI_DIAG_PKT:
+ opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG);
+ break;
default:
return;
}
@@ -303,6 +308,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
{
struct hci_mon_hdr *hdr;
struct hci_mon_new_index *ni;
+ struct hci_mon_index_info *ii;
struct sk_buff *skb;
__le16 opcode;
@@ -312,7 +318,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
if (!skb)
return NULL;
- ni = (void *) skb_put(skb, HCI_MON_NEW_INDEX_SIZE);
+ ni = (void *)skb_put(skb, HCI_MON_NEW_INDEX_SIZE);
ni->type = hdev->dev_type;
ni->bus = hdev->bus;
bacpy(&ni->bdaddr, &hdev->bdaddr);
@@ -329,6 +335,40 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
opcode = cpu_to_le16(HCI_MON_DEL_INDEX);
break;
+ case HCI_DEV_SETUP:
+ if (hdev->manufacturer == 0xffff)
+ return NULL;
+
+ /* fall through */
+
+ case HCI_DEV_UP:
+ skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ ii = (void *)skb_put(skb, HCI_MON_INDEX_INFO_SIZE);
+ bacpy(&ii->bdaddr, &hdev->bdaddr);
+ ii->manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ opcode = cpu_to_le16(HCI_MON_INDEX_INFO);
+ break;
+
+ case HCI_DEV_OPEN:
+ skb = bt_skb_alloc(0, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ opcode = cpu_to_le16(HCI_MON_OPEN_INDEX);
+ break;
+
+ case HCI_DEV_CLOSE:
+ skb = bt_skb_alloc(0, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ opcode = cpu_to_le16(HCI_MON_CLOSE_INDEX);
+ break;
+
default:
return NULL;
}
@@ -358,6 +398,28 @@ static void send_monitor_replay(struct sock *sk)
if (sock_queue_rcv_skb(sk, skb))
kfree_skb(skb);
+
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ continue;
+
+ skb = create_monitor_event(hdev, HCI_DEV_OPEN);
+ if (!skb)
+ continue;
+
+ if (sock_queue_rcv_skb(sk, skb))
+ kfree_skb(skb);
+
+ if (test_bit(HCI_UP, &hdev->flags))
+ skb = create_monitor_event(hdev, HCI_DEV_UP);
+ else if (hci_dev_test_flag(hdev, HCI_SETUP))
+ skb = create_monitor_event(hdev, HCI_DEV_SETUP);
+ else
+ skb = NULL;
+
+ if (skb) {
+ if (sock_queue_rcv_skb(sk, skb))
+ kfree_skb(skb);
+ }
}
read_unlock(&hci_dev_list_lock);
@@ -392,14 +454,12 @@ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
void hci_sock_dev_event(struct hci_dev *hdev, int event)
{
- struct hci_ev_si_device ev;
-
BT_DBG("hdev %s event %d", hdev->name, event);
- /* Send event to monitor */
if (atomic_read(&monitor_promisc)) {
struct sk_buff *skb;
+ /* Send event to monitor */
skb = create_monitor_event(hdev, event);
if (skb) {
hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
@@ -408,10 +468,14 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event)
}
}
- /* Send event to sockets */
- ev.event = event;
- ev.dev_id = hdev->id;
- hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev);
+ if (event <= HCI_DEV_DOWN) {
+ struct hci_ev_si_device ev;
+
+ /* Send event to sockets */
+ ev.event = event;
+ ev.dev_id = hdev->id;
+ hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev);
+ }
if (event == HCI_DEV_UNREG) {
struct sock *sk;
@@ -503,9 +567,18 @@ static int hci_sock_release(struct socket *sock)
if (hdev) {
if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
- mgmt_index_added(hdev);
+			/* When releasing exclusive access to a user channel,
+ * call hci_dev_do_close directly instead of calling
+ * hci_dev_close to ensure the exclusive access will
+ * be released and the controller brought back down.
+ *
+ * The checking of HCI_AUTO_OFF is not needed in this
+ * case since it will have been cleared already when
+ * opening the user channel.
+ */
+ hci_dev_do_close(hdev);
hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
- hci_dev_close(hdev->id);
+ mgmt_index_added(hdev);
}
atomic_dec(&hdev->promisc);
@@ -928,7 +1001,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
BT_DBG("sock %p, sk %p", sock, sk);
- if (flags & (MSG_OOB))
+ if (flags & MSG_OOB)
return -EOPNOTSUPP;
if (sk->sk_state == BT_CLOSED)
@@ -1176,7 +1249,7 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
/* Stand-alone HCI commands must be flagged as
* single-command requests.
*/
- bt_cb(skb)->req.start = true;
+ bt_cb(skb)->hci.req_start = true;
skb_queue_tail(&hdev->cmd_q, skb);
queue_work(hdev->workqueue, &hdev->cmd_work);
@@ -1187,6 +1260,12 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
goto drop;
}
+ if (bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
+ bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) {
+ err = -EINVAL;
+ goto drop;
+ }
+
skb_queue_tail(&hdev->raw_q, skb);
queue_work(hdev->workqueue, &hdev->tx_work);
}
@@ -1389,7 +1468,7 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol,
sock->ops = &hci_sock_ops;
- sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto);
+ sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern);
if (!sk)
return -ENOMEM;
diff --git a/kernel/net/bluetooth/hidp/core.c b/kernel/net/bluetooth/hidp/core.c
index 9070dfd6b..0bec4588c 100644
--- a/kernel/net/bluetooth/hidp/core.c
+++ b/kernel/net/bluetooth/hidp/core.c
@@ -401,6 +401,20 @@ static void hidp_idle_timeout(unsigned long arg)
{
struct hidp_session *session = (struct hidp_session *) arg;
+ /* The HIDP user-space API only contains calls to add and remove
+ * devices. There is no way to forward events of any kind. Therefore,
+ * we have to forcefully disconnect a device on idle-timeouts. This is
+ * unfortunate and weird API design, but it is spec-compliant and
+ * required for backwards-compatibility. Hence, on idle-timeout, we
+ * signal driver-detach events, so poll() will be woken up with an
+ * error-condition on both sockets.
+ */
+
+ session->intr_sock->sk->sk_err = EUNATCH;
+ session->ctrl_sock->sk->sk_err = EUNATCH;
+ wake_up_interruptible(sk_sleep(session->intr_sock->sk));
+ wake_up_interruptible(sk_sleep(session->ctrl_sock->sk));
+
hidp_session_terminate(session);
}
@@ -915,6 +929,7 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr,
session->conn = l2cap_conn_get(conn);
session->user.probe = hidp_session_probe;
session->user.remove = hidp_session_remove;
+ INIT_LIST_HEAD(&session->user.list);
session->ctrl_sock = ctrl_sock;
session->intr_sock = intr_sock;
skb_queue_head_init(&session->ctrl_transmit);
diff --git a/kernel/net/bluetooth/hidp/sock.c b/kernel/net/bluetooth/hidp/sock.c
index cb3fdde19..008ba439b 100644
--- a/kernel/net/bluetooth/hidp/sock.c
+++ b/kernel/net/bluetooth/hidp/sock.c
@@ -235,7 +235,7 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_RAW)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto);
+ sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern);
if (!sk)
return -ENOMEM;
diff --git a/kernel/net/bluetooth/l2cap_core.c b/kernel/net/bluetooth/l2cap_core.c
index dad419782..66e8b6ee1 100644
--- a/kernel/net/bluetooth/l2cap_core.c
+++ b/kernel/net/bluetooth/l2cap_core.c
@@ -239,7 +239,7 @@ static u16 l2cap_alloc_cid(struct l2cap_conn *conn)
else
dyn_end = L2CAP_CID_DYN_END;
- for (cid = L2CAP_CID_DYN_START; cid < dyn_end; cid++) {
+ for (cid = L2CAP_CID_DYN_START; cid <= dyn_end; cid++) {
if (!__l2cap_get_chan_by_scid(conn, cid))
return cid;
}
@@ -1601,7 +1601,7 @@ int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user)
hci_dev_lock(hdev);
- if (user->list.next || user->list.prev) {
+ if (!list_empty(&user->list)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -1631,12 +1631,10 @@ void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user)
hci_dev_lock(hdev);
- if (!user->list.next || !user->list.prev)
+ if (list_empty(&user->list))
goto out_unlock;
- list_del(&user->list);
- user->list.next = NULL;
- user->list.prev = NULL;
+ list_del_init(&user->list);
user->remove(conn, user);
out_unlock:
@@ -1650,9 +1648,7 @@ static void l2cap_unregister_all_users(struct l2cap_conn *conn)
while (!list_empty(&conn->users)) {
user = list_first_entry(&conn->users, struct l2cap_user, list);
- list_del(&user->list);
- user->list.next = NULL;
- user->list.prev = NULL;
+ list_del_init(&user->list);
user->remove(conn, user);
}
}
@@ -5254,7 +5250,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
credits = __le16_to_cpu(rsp->credits);
result = __le16_to_cpu(rsp->result);
- if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23))
+ if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23 ||
+ dcid < L2CAP_CID_DYN_START ||
+ dcid > L2CAP_CID_LE_DYN_END))
return -EPROTO;
BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x",
@@ -5274,6 +5272,11 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
switch (result) {
case L2CAP_CR_SUCCESS:
+ if (__l2cap_get_chan_by_dcid(conn, dcid)) {
+ err = -EBADSLT;
+ break;
+ }
+
chan->ident = 0;
chan->dcid = dcid;
chan->omtu = mtu;
@@ -5441,9 +5444,16 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
goto response_unlock;
}
+ /* Check for valid dynamic CID range */
+ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
+ result = L2CAP_CR_INVALID_SCID;
+ chan = NULL;
+ goto response_unlock;
+ }
+
/* Check if we already have channel with that dcid */
if (__l2cap_get_chan_by_dcid(conn, scid)) {
- result = L2CAP_CR_NO_MEM;
+ result = L2CAP_CR_SCID_IN_USE;
chan = NULL;
goto response_unlock;
}
@@ -7117,8 +7127,10 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
else
role = HCI_ROLE_MASTER;
- hcon = hci_connect_le(hdev, dst, dst_type, chan->sec_level,
- HCI_LE_CONN_TIMEOUT, role);
+ hcon = hci_connect_le_scan(hdev, dst, dst_type,
+ chan->sec_level,
+ HCI_LE_CONN_TIMEOUT,
+ role);
} else {
u8 auth_type = l2cap_get_auth_type(chan);
hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type);
@@ -7442,7 +7454,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
mutex_unlock(&conn->chan_lock);
}
-int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
+void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
{
struct l2cap_conn *conn = hcon->l2cap_data;
struct l2cap_hdr *hdr;
@@ -7485,7 +7497,7 @@ int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
if (len == skb->len) {
/* Complete frame received */
l2cap_recv_frame(conn, skb);
- return 0;
+ return;
}
BT_DBG("Start: total len %d, frag len %d", len, skb->len);
@@ -7544,7 +7556,6 @@ int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
drop:
kfree_skb(skb);
- return 0;
}
static struct hci_cb l2cap_cb = {
diff --git a/kernel/net/bluetooth/l2cap_sock.c b/kernel/net/bluetooth/l2cap_sock.c
index a7278f05e..1bb551527 100644
--- a/kernel/net/bluetooth/l2cap_sock.c
+++ b/kernel/net/bluetooth/l2cap_sock.c
@@ -43,7 +43,7 @@ static struct bt_sock_list l2cap_sk_list = {
static const struct proto_ops l2cap_sock_ops;
static void l2cap_sock_init(struct sock *sk, struct sock *parent);
static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
- int proto, gfp_t prio);
+ int proto, gfp_t prio, int kern);
bool l2cap_is_socket(struct socket *sock)
{
@@ -1054,18 +1054,23 @@ static void l2cap_sock_kill(struct sock *sk)
sock_put(sk);
}
-static int __l2cap_wait_ack(struct sock *sk)
+static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan)
{
- struct l2cap_chan *chan = l2cap_pi(sk)->chan;
DECLARE_WAITQUEUE(wait, current);
int err = 0;
- int timeo = HZ/5;
+ int timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
+ /* Timeout to prevent infinite loop */
+ unsigned long timeout = jiffies + L2CAP_WAIT_ACK_TIMEOUT;
add_wait_queue(sk_sleep(sk), &wait);
set_current_state(TASK_INTERRUPTIBLE);
- while (chan->unacked_frames > 0 && chan->conn) {
+ do {
+ BT_DBG("Waiting for %d ACKs, timeout %04d ms",
+ chan->unacked_frames, time_after(jiffies, timeout) ? 0 :
+ jiffies_to_msecs(timeout - jiffies));
+
if (!timeo)
- timeo = HZ/5;
+ timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
if (signal_pending(current)) {
err = sock_intr_errno(timeo);
@@ -1080,7 +1085,15 @@ static int __l2cap_wait_ack(struct sock *sk)
err = sock_error(sk);
if (err)
break;
- }
+
+ if (time_after(jiffies, timeout)) {
+ err = -ENOLINK;
+ break;
+ }
+
+ } while (chan->unacked_frames > 0 &&
+ chan->state == BT_CONNECTED);
+
set_current_state(TASK_RUNNING);
remove_wait_queue(sk_sleep(sk), &wait);
return err;
@@ -1098,41 +1111,76 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
if (!sk)
return 0;
+ lock_sock(sk);
+
+ if (sk->sk_shutdown)
+ goto shutdown_already;
+
+ BT_DBG("Handling sock shutdown");
+
+ /* prevent sk structure from being freed whilst unlocked */
+ sock_hold(sk);
+
chan = l2cap_pi(sk)->chan;
- conn = chan->conn;
+ /* prevent chan structure from being freed whilst unlocked */
+ l2cap_chan_hold(chan);
BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
+ if (chan->mode == L2CAP_MODE_ERTM &&
+ chan->unacked_frames > 0 &&
+ chan->state == BT_CONNECTED) {
+ err = __l2cap_wait_ack(sk, chan);
+
+ /* After waiting for ACKs, check whether shutdown
+ * has already been actioned to close the L2CAP
+ * link such as by l2cap_disconnection_req().
+ */
+ if (sk->sk_shutdown)
+ goto has_shutdown;
+ }
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ release_sock(sk);
+
+ l2cap_chan_lock(chan);
+ conn = chan->conn;
+ if (conn)
+ /* prevent conn structure from being freed */
+ l2cap_conn_get(conn);
+ l2cap_chan_unlock(chan);
+
if (conn)
+ /* mutex lock must be taken before l2cap_chan_lock() */
mutex_lock(&conn->chan_lock);
l2cap_chan_lock(chan);
- lock_sock(sk);
+ l2cap_chan_close(chan, 0);
+ l2cap_chan_unlock(chan);
- if (!sk->sk_shutdown) {
- if (chan->mode == L2CAP_MODE_ERTM)
- err = __l2cap_wait_ack(sk);
+ if (conn) {
+ mutex_unlock(&conn->chan_lock);
+ l2cap_conn_put(conn);
+ }
- sk->sk_shutdown = SHUTDOWN_MASK;
+ lock_sock(sk);
- release_sock(sk);
- l2cap_chan_close(chan, 0);
- lock_sock(sk);
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ !(current->flags & PF_EXITING))
+ err = bt_sock_wait_state(sk, BT_CLOSED,
+ sk->sk_lingertime);
- if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
- !(current->flags & PF_EXITING))
- err = bt_sock_wait_state(sk, BT_CLOSED,
- sk->sk_lingertime);
- }
+has_shutdown:
+ l2cap_chan_put(chan);
+ sock_put(sk);
+shutdown_already:
if (!err && sk->sk_err)
err = -sk->sk_err;
release_sock(sk);
- l2cap_chan_unlock(chan);
- if (conn)
- mutex_unlock(&conn->chan_lock);
+ BT_DBG("Sock shutdown complete err: %d", err);
return err;
}
@@ -1193,7 +1241,7 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
}
sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP,
- GFP_ATOMIC);
+ GFP_ATOMIC, 0);
if (!sk) {
release_sock(parent);
return NULL;
@@ -1523,12 +1571,12 @@ static struct proto l2cap_proto = {
};
static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
- int proto, gfp_t prio)
+ int proto, gfp_t prio, int kern)
{
struct sock *sk;
struct l2cap_chan *chan;
- sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto);
+ sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern);
if (!sk)
return NULL;
@@ -1574,7 +1622,7 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol,
sock->ops = &l2cap_sock_ops;
- sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+ sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
if (!sk)
return -ENOMEM;
diff --git a/kernel/net/bluetooth/lib.c b/kernel/net/bluetooth/lib.c
index b36bc0415..aa4cf64e3 100644
--- a/kernel/net/bluetooth/lib.c
+++ b/kernel/net/bluetooth/lib.c
@@ -151,6 +151,22 @@ void bt_info(const char *format, ...)
}
EXPORT_SYMBOL(bt_info);
+void bt_warn(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_warn("%pV", &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_warn);
+
void bt_err(const char *format, ...)
{
struct va_format vaf;
@@ -166,3 +182,19 @@ void bt_err(const char *format, ...)
va_end(args);
}
EXPORT_SYMBOL(bt_err);
+
+void bt_err_ratelimited(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_err_ratelimited("%pV", &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_err_ratelimited);
diff --git a/kernel/net/bluetooth/mgmt.c b/kernel/net/bluetooth/mgmt.c
index 7fd87e713..7f2211927 100644
--- a/kernel/net/bluetooth/mgmt.c
+++ b/kernel/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
#include "mgmt_util.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 9
+#define MGMT_REVISION 10
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -268,6 +268,14 @@ static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len,
HCI_SOCK_TRUSTED, skip_sk);
}
+static u8 le_addr_type(u8 mgmt_addr_type)
+{
+ if (mgmt_addr_type == BDADDR_LE_PUBLIC)
+ return ADDR_LE_DEV_PUBLIC;
+ else
+ return ADDR_LE_DEV_RANDOM;
+}
+
static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
u16 data_len)
{
@@ -832,6 +840,20 @@ static struct mgmt_pending_cmd *pending_find_data(u16 opcode,
return mgmt_pending_find_data(HCI_CHANNEL_CONTROL, opcode, hdev, data);
}
+static u8 get_current_adv_instance(struct hci_dev *hdev)
+{
+ /* The "Set Advertising" setting supersedes the "Add Advertising"
+ * setting. Here we set the advertising data based on which
+	 * setting was set. When neither applies, default to the global settings,
+ * represented by instance "0".
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
+ !hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ return hdev->cur_adv_instance;
+
+ return 0x00;
+}
+
static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
{
u8 ad_len = 0;
@@ -858,19 +880,25 @@ static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
return ad_len;
}
-static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
+ u8 *ptr)
{
+ struct adv_info *adv_instance;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return 0;
+
/* TODO: Set the appropriate entries based on advertising instance flags
* here once flags other than 0 are supported.
*/
- memcpy(ptr, hdev->adv_instance.scan_rsp_data,
- hdev->adv_instance.scan_rsp_len);
+ memcpy(ptr, adv_instance->scan_rsp_data,
+ adv_instance->scan_rsp_len);
- return hdev->adv_instance.scan_rsp_len;
+ return adv_instance->scan_rsp_len;
}
-static void update_scan_rsp_data_for_instance(struct hci_request *req,
- u8 instance)
+static void update_inst_scan_rsp_data(struct hci_request *req, u8 instance)
{
struct hci_dev *hdev = req->hdev;
struct hci_cp_le_set_scan_rsp_data cp;
@@ -882,7 +910,7 @@ static void update_scan_rsp_data_for_instance(struct hci_request *req,
memset(&cp, 0, sizeof(cp));
if (instance)
- len = create_instance_scan_rsp_data(hdev, cp.data);
+ len = create_instance_scan_rsp_data(hdev, instance, cp.data);
else
len = create_default_scan_rsp_data(hdev, cp.data);
@@ -900,21 +928,7 @@ static void update_scan_rsp_data_for_instance(struct hci_request *req,
static void update_scan_rsp_data(struct hci_request *req)
{
- struct hci_dev *hdev = req->hdev;
- u8 instance;
-
- /* The "Set Advertising" setting supersedes the "Add Advertising"
- * setting. Here we set the scan response data based on which
- * setting was set. When neither apply, default to the global settings,
- * represented by instance "0".
- */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
- !hci_dev_test_flag(hdev, HCI_ADVERTISING))
- instance = 0x01;
- else
- instance = 0x00;
-
- update_scan_rsp_data_for_instance(req, instance);
+ update_inst_scan_rsp_data(req, get_current_adv_instance(req->hdev));
}
static u8 get_adv_discov_flags(struct hci_dev *hdev)
@@ -941,20 +955,6 @@ static u8 get_adv_discov_flags(struct hci_dev *hdev)
return 0;
}
-static u8 get_current_adv_instance(struct hci_dev *hdev)
-{
- /* The "Set Advertising" setting supersedes the "Add Advertising"
- * setting. Here we set the advertising data based on which
- * setting was set. When neither apply, default to the global settings,
- * represented by instance "0".
- */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
- !hci_dev_test_flag(hdev, HCI_ADVERTISING))
- return 0x01;
-
- return 0x00;
-}
-
static bool get_connectable(struct hci_dev *hdev)
{
struct mgmt_pending_cmd *cmd;
@@ -975,41 +975,65 @@ static bool get_connectable(struct hci_dev *hdev)
static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
{
u32 flags;
+ struct adv_info *adv_instance;
- if (instance > 0x01)
- return 0;
+ if (instance == 0x00) {
+ /* Instance 0 always manages the "Tx Power" and "Flags"
+ * fields
+ */
+ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS;
- if (instance == 0x01)
- return hdev->adv_instance.flags;
+ /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting
+ * corresponds to the "connectable" instance flag.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
+ flags |= MGMT_ADV_FLAG_CONNECTABLE;
- /* Instance 0 always manages the "Tx Power" and "Flags" fields */
- flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS;
+ return flags;
+ }
- /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting corresponds
- * to the "connectable" instance flag.
- */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
- flags |= MGMT_ADV_FLAG_CONNECTABLE;
+ adv_instance = hci_find_adv_instance(hdev, instance);
- return flags;
+ /* Return 0 when we got an invalid instance identifier. */
+ if (!adv_instance)
+ return 0;
+
+ return adv_instance->flags;
}
-static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
+static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
{
- /* Ignore instance 0 and other unsupported instances */
- if (instance != 0x01)
+ u8 instance = get_current_adv_instance(hdev);
+ struct adv_info *adv_instance;
+
+ /* Ignore instance 0 */
+ if (instance == 0x00)
+ return 0;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
return 0;
/* TODO: Take into account the "appearance" and "local-name" flags here.
* These are currently being ignored as they are not supported.
*/
- return hdev->adv_instance.scan_rsp_len;
+ return adv_instance->scan_rsp_len;
}
static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
{
+ struct adv_info *adv_instance = NULL;
u8 ad_len = 0, flags = 0;
- u32 instance_flags = get_adv_instance_flags(hdev, instance);
+ u32 instance_flags;
+
+ /* Return 0 when the current instance identifier is invalid. */
+ if (instance) {
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return 0;
+ }
+
+ instance_flags = get_adv_instance_flags(hdev, instance);
/* The Add Advertising command allows userspace to set both the general
* and limited discoverable flags.
@@ -1043,12 +1067,11 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
}
}
- if (instance) {
- memcpy(ptr, hdev->adv_instance.adv_data,
- hdev->adv_instance.adv_data_len);
-
- ad_len += hdev->adv_instance.adv_data_len;
- ptr += hdev->adv_instance.adv_data_len;
+ if (adv_instance) {
+ memcpy(ptr, adv_instance->adv_data,
+ adv_instance->adv_data_len);
+ ad_len += adv_instance->adv_data_len;
+ ptr += adv_instance->adv_data_len;
}
/* Provide Tx Power only if we can provide a valid value for it */
@@ -1065,7 +1088,7 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
return ad_len;
}
-static void update_adv_data_for_instance(struct hci_request *req, u8 instance)
+static void update_inst_adv_data(struct hci_request *req, u8 instance)
{
struct hci_dev *hdev = req->hdev;
struct hci_cp_le_set_adv_data cp;
@@ -1093,10 +1116,7 @@ static void update_adv_data_for_instance(struct hci_request *req, u8 instance)
static void update_adv_data(struct hci_request *req)
{
- struct hci_dev *hdev = req->hdev;
- u8 instance = get_current_adv_instance(hdev);
-
- update_adv_data_for_instance(req, instance);
+ update_inst_adv_data(req, get_current_adv_instance(req->hdev));
}
int mgmt_update_adv_data(struct hci_dev *hdev)
@@ -1277,7 +1297,7 @@ static void enable_advertising(struct hci_request *req)
if (connectable)
cp.type = LE_ADV_IND;
- else if (get_adv_instance_scan_rsp_len(hdev, instance))
+ else if (get_cur_adv_instance_scan_rsp_len(hdev))
cp.type = LE_ADV_SCAN_IND;
else
cp.type = LE_ADV_NONCONN_IND;
@@ -1459,27 +1479,141 @@ static void advertising_removed(struct sock *sk, struct hci_dev *hdev,
mgmt_event(MGMT_EV_ADVERTISING_REMOVED, hdev, &ev, sizeof(ev), sk);
}
-static void clear_adv_instance(struct hci_dev *hdev)
+static int schedule_adv_instance(struct hci_request *req, u8 instance,
+				 bool force)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct adv_info *adv_instance = NULL;
+ u16 timeout;
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
+ return -EPERM;
+
+ if (hdev->adv_instance_timeout)
+ return -EBUSY;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return -ENOENT;
+
+ /* A zero timeout means unlimited advertising. As long as there is
+ * only one instance, duration should be ignored. We still set a timeout
+ * in case further instances are being added later on.
+ *
+ * If the remaining lifetime of the instance is more than the duration
+ * then the timeout corresponds to the duration, otherwise it will be
+ * reduced to the remaining instance lifetime.
+ */
+ if (adv_instance->timeout == 0 ||
+ adv_instance->duration <= adv_instance->remaining_time)
+ timeout = adv_instance->duration;
+ else
+ timeout = adv_instance->remaining_time;
+
+ /* The remaining time is being reduced unless the instance is being
+ * advertised without time limit.
+ */
+ if (adv_instance->timeout)
+ adv_instance->remaining_time =
+ adv_instance->remaining_time - timeout;
+
+ hdev->adv_instance_timeout = timeout;
+ queue_delayed_work(hdev->workqueue,
+ &hdev->adv_instance_expire,
+ msecs_to_jiffies(timeout * 1000));
+
+ /* If we're just re-scheduling the same instance again then do not
+ * execute any HCI commands. This happens when a single instance is
+ * being advertised.
+ */
+ if (!force && hdev->cur_adv_instance == instance &&
+ hci_dev_test_flag(hdev, HCI_LE_ADV))
+ return 0;
+
+ hdev->cur_adv_instance = instance;
+ update_adv_data(req);
+ update_scan_rsp_data(req);
+ enable_advertising(req);
+
+ return 0;
+}
+
+static void cancel_adv_timeout(struct hci_dev *hdev)
{
- struct hci_request req;
+ if (hdev->adv_instance_timeout) {
+ hdev->adv_instance_timeout = 0;
+ cancel_delayed_work(&hdev->adv_instance_expire);
+ }
+}
- if (!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
- return;
+/* For a single instance:
+ * - force == true: The instance will be removed even when its remaining
+ * lifetime is not zero.
+ * - force == false: the instance will be deactivated but kept stored unless
+ * the remaining lifetime is zero.
+ *
+ * For instance == 0x00:
+ * - force == true: All instances will be removed regardless of their timeout
+ * setting.
+ * - force == false: Only instances that have a timeout will be removed.
+ */
+static void clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
+ u8 instance, bool force)
+{
+ struct adv_info *adv_instance, *n, *next_instance = NULL;
+ int err;
+ u8 rem_inst;
- if (hdev->adv_instance.timeout)
- cancel_delayed_work(&hdev->adv_instance.timeout_exp);
+ /* Cancel any timeout concerning the removed instance(s). */
+ if (!instance || hdev->cur_adv_instance == instance)
+ cancel_adv_timeout(hdev);
- memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance));
- advertising_removed(NULL, hdev, 1);
- hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE);
+ /* Get the next instance to advertise BEFORE we remove
+ * the current one. This can be the same instance again
+ * if there is only one instance.
+ */
+ if (instance && hdev->cur_adv_instance == instance)
+ next_instance = hci_get_next_instance(hdev, instance);
- if (!hdev_is_powered(hdev) ||
+ if (instance == 0x00) {
+ list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances,
+ list) {
+ if (!(force || adv_instance->timeout))
+ continue;
+
+ rem_inst = adv_instance->instance;
+ err = hci_remove_adv_instance(hdev, rem_inst);
+ if (!err)
+ advertising_removed(NULL, hdev, rem_inst);
+ }
+ hdev->cur_adv_instance = 0x00;
+ } else {
+ adv_instance = hci_find_adv_instance(hdev, instance);
+
+ if (force || (adv_instance && adv_instance->timeout &&
+ !adv_instance->remaining_time)) {
+ /* Don't advertise a removed instance. */
+ if (next_instance &&
+ next_instance->instance == instance)
+ next_instance = NULL;
+
+ err = hci_remove_adv_instance(hdev, instance);
+ if (!err)
+ advertising_removed(NULL, hdev, instance);
+ }
+ }
+
+ if (list_empty(&hdev->adv_instances)) {
+ hdev->cur_adv_instance = 0x00;
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE);
+ }
+
+ if (!req || !hdev_is_powered(hdev) ||
hci_dev_test_flag(hdev, HCI_ADVERTISING))
return;
- hci_req_init(&req, hdev);
- disable_advertising(&req);
- hci_req_run(&req, NULL);
+ if (next_instance)
+ schedule_adv_instance(req, next_instance->instance, false);
}
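Editorial worked example for the timeout selection in schedule_adv_instance() above (not part of the patch): for an instance with a duration of 10 s but a remaining lifetime of only 7 s, duration <= remaining_time is false, so the scheduled timeout becomes 7 s and remaining_time drops to 0, making the instance eligible for removal by clear_adv_instance() when it expires. For an instance configured with timeout == 0 (unlimited advertising), the duration is used on every cycle and remaining_time is never reduced, so the instance keeps being re-scheduled.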
static int clean_up_hci_state(struct hci_dev *hdev)
@@ -1497,8 +1631,7 @@ static int clean_up_hci_state(struct hci_dev *hdev)
hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
}
- if (hdev->adv_instance.timeout)
- clear_adv_instance(hdev);
+ clear_adv_instance(hdev, NULL, 0x00, false);
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
disable_advertising(&req);
@@ -1506,35 +1639,8 @@ static int clean_up_hci_state(struct hci_dev *hdev)
discov_stopped = hci_stop_discovery(&req);
list_for_each_entry(conn, &hdev->conn_hash.list, list) {
- struct hci_cp_disconnect dc;
- struct hci_cp_reject_conn_req rej;
-
- switch (conn->state) {
- case BT_CONNECTED:
- case BT_CONFIG:
- dc.handle = cpu_to_le16(conn->handle);
- dc.reason = 0x15; /* Terminated due to Power Off */
- hci_req_add(&req, HCI_OP_DISCONNECT, sizeof(dc), &dc);
- break;
- case BT_CONNECT:
- if (conn->type == LE_LINK)
- hci_req_add(&req, HCI_OP_LE_CREATE_CONN_CANCEL,
- 0, NULL);
- else if (conn->type == ACL_LINK)
- hci_req_add(&req, HCI_OP_CREATE_CONN_CANCEL,
- 6, &conn->dst);
- break;
- case BT_CONNECT2:
- bacpy(&rej.bdaddr, &conn->dst);
- rej.reason = 0x15; /* Terminated due to Power Off */
- if (conn->type == ACL_LINK)
- hci_req_add(&req, HCI_OP_REJECT_CONN_REQ,
- sizeof(rej), &rej);
- else if (conn->type == SCO_LINK)
- hci_req_add(&req, HCI_OP_REJECT_SYNC_CONN_REQ,
- sizeof(rej), &rej);
- break;
- }
+ /* 0x15 == Terminated due to Power Off */
+ __hci_abort_conn(&req, conn, 0x15);
}
err = hci_req_run(&req, clean_up_hci_complete);
@@ -2453,6 +2559,9 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
val = !!cp->val;
enabled = lmp_host_le_capable(hdev);
+ if (!val)
+ clear_adv_instance(hdev, NULL, 0x00, true);
+
if (!hdev_is_powered(hdev) || val == enabled) {
bool changed = false;
@@ -2916,9 +3025,10 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_cp_unpair_device *cp = data;
struct mgmt_rp_unpair_device rp;
- struct hci_cp_disconnect dc;
+ struct hci_conn_params *params;
struct mgmt_pending_cmd *cmd;
struct hci_conn *conn;
+ u8 addr_type;
int err;
memset(&rp, 0, sizeof(rp));
@@ -2959,36 +3069,23 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
conn = NULL;
err = hci_remove_link_key(hdev, &cp->addr.bdaddr);
- } else {
- u8 addr_type;
-
- conn = hci_conn_hash_lookup_ba(hdev, LE_LINK,
- &cp->addr.bdaddr);
- if (conn) {
- /* Defer clearing up the connection parameters
- * until closing to give a chance of keeping
- * them if a repairing happens.
- */
- set_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags);
-
- /* If disconnection is not requested, then
- * clear the connection variable so that the
- * link is not terminated.
- */
- if (!cp->disconnect)
- conn = NULL;
+ if (err < 0) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_NOT_PAIRED, &rp,
+ sizeof(rp));
+ goto unlock;
}
- if (cp->addr.type == BDADDR_LE_PUBLIC)
- addr_type = ADDR_LE_DEV_PUBLIC;
- else
- addr_type = ADDR_LE_DEV_RANDOM;
+ goto done;
+ }
- hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type);
+ /* LE address type */
+ addr_type = le_addr_type(cp->addr.type);
- err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type);
- }
+ hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type);
+ err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type);
if (err < 0) {
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
MGMT_STATUS_NOT_PAIRED, &rp,
@@ -2996,6 +3093,36 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
goto unlock;
}
+ conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, addr_type);
+ if (!conn) {
+ hci_conn_params_del(hdev, &cp->addr.bdaddr, addr_type);
+ goto done;
+ }
+
+ /* Abort any ongoing SMP pairing */
+ smp_cancel_pairing(conn);
+
+ /* Defer clearing up the connection parameters until closing to
+ * give a chance of keeping them if a re-pairing happens.
+ */
+ set_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags);
+
+ /* Disable auto-connection parameters if present */
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type);
+ if (params) {
+ if (params->explicit_connect)
+ params->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+ else
+ params->auto_connect = HCI_AUTO_CONN_DISABLED;
+ }
+
+ /* If disconnection is not requested, then clear the connection
+ * variable so that the link is not terminated.
+ */
+ if (!cp->disconnect)
+ conn = NULL;
+
+done:
/* If the connection variable is set, then termination of the
* link is requested.
*/
@@ -3015,9 +3142,7 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
cmd->cmd_complete = addr_cmd_complete;
- dc.handle = cpu_to_le16(conn->handle);
- dc.reason = 0x13; /* Remote User Terminated Connection */
- err = hci_send_cmd(hdev, HCI_OP_DISCONNECT, sizeof(dc), &dc);
+ err = hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
if (err < 0)
mgmt_pending_remove(cmd);
@@ -3065,7 +3190,8 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data,
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
&cp->addr.bdaddr);
else
- conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr);
+ conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
if (!conn || conn->state == BT_OPEN || conn->state == BT_CLOSED) {
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
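Several hunks in this file swap the open-coded BDADDR_LE_PUBLIC/ADDR_LE_DEV_PUBLIC branches for le_addr_type(). For reference, the helper (defined in the Bluetooth core headers) reduces to the mapping being removed here; a sketch, not a verbatim copy of the header:

	static inline u8 le_addr_type(u8 mgmt_addr_type)
	{
		/* Same conversion the removed branches performed inline. */
		if (mgmt_addr_type == BDADDR_LE_PUBLIC)
			return ADDR_LE_DEV_PUBLIC;
		else
			return ADDR_LE_DEV_RANDOM;
	}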
@@ -3416,14 +3542,8 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level,
auth_type);
} else {
- u8 addr_type;
-
- /* Convert from L2CAP channel address type to HCI address type
- */
- if (cp->addr.type == BDADDR_LE_PUBLIC)
- addr_type = ADDR_LE_DEV_PUBLIC;
- else
- addr_type = ADDR_LE_DEV_RANDOM;
+ u8 addr_type = le_addr_type(cp->addr.type);
+ struct hci_conn_params *p;
/* When pairing a new device, it is expected to remember
* this device for future connections. Adding the connection
@@ -3434,11 +3554,15 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
* If connection parameters already exist, then they
* will be kept and this function does nothing.
*/
- hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type);
+ p = hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type);
+
+ if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT)
+ p->auto_connect = HCI_AUTO_CONN_DISABLED;
- conn = hci_connect_le(hdev, &cp->addr.bdaddr, addr_type,
- sec_level, HCI_LE_CONN_TIMEOUT,
- HCI_ROLE_MASTER);
+ conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr,
+ addr_type, sec_level,
+ HCI_LE_CONN_TIMEOUT,
+ HCI_ROLE_MASTER);
}
if (IS_ERR(conn)) {
@@ -3564,7 +3688,8 @@ static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev,
if (addr->type == BDADDR_BREDR)
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &addr->bdaddr);
else
- conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &addr->bdaddr);
+ conn = hci_conn_hash_lookup_le(hdev, &addr->bdaddr,
+ le_addr_type(addr->type));
if (!conn) {
err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
@@ -4082,11 +4207,12 @@ static bool trigger_le_scan(struct hci_request *req, u16 interval, u8 *status)
/* Don't let discovery abort an outgoing connection attempt
* that's using directed advertising.
*/
- if (hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) {
+ if (hci_lookup_le_connect(hdev)) {
*status = MGMT_STATUS_REJECTED;
return false;
}
+ cancel_adv_timeout(hdev);
disable_advertising(req);
}
@@ -4669,6 +4795,9 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status,
{
struct cmd_lookup match = { NULL, hdev };
struct hci_request req;
+ u8 instance;
+ struct adv_info *adv_instance;
+ int err;
hci_dev_lock(hdev);
@@ -4694,18 +4823,31 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status,
sock_put(match.sk);
/* If "Set Advertising" was just disabled and instance advertising was
- * set up earlier, then enable the advertising instance.
+ * set up earlier, then re-enable multi-instance advertising.
*/
if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
+ !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) ||
+ list_empty(&hdev->adv_instances))
goto unlock;
+ instance = hdev->cur_adv_instance;
+ if (!instance) {
+ adv_instance = list_first_entry_or_null(&hdev->adv_instances,
+ struct adv_info, list);
+ if (!adv_instance)
+ goto unlock;
+
+ instance = adv_instance->instance;
+ }
+
hci_req_init(&req, hdev);
- update_adv_data(&req);
- enable_advertising(&req);
+ err = schedule_adv_instance(&req, instance, true);
+
+ if (!err)
+ err = hci_req_run(&req, enable_advertising_instance);
- if (hci_req_run(&req, enable_advertising_instance) < 0)
+ if (err)
BT_ERR("Failed to re-configure advertising");
unlock:
@@ -4790,10 +4932,15 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
else
hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ cancel_adv_timeout(hdev);
+
if (val) {
- /* Switch to instance "0" for the Set Advertising setting. */
- update_adv_data_for_instance(&req, 0);
- update_scan_rsp_data_for_instance(&req, 0);
+ /* Switch to instance "0" for the Set Advertising setting.
+ * We cannot use update_[adv|scan_rsp]_data() here as the
+ * HCI_ADVERTISING flag is not yet set.
+ */
+ update_inst_adv_data(&req, 0x00);
+ update_inst_scan_rsp_data(&req, 0x00);
enable_advertising(&req);
} else {
disable_advertising(&req);
@@ -5445,14 +5592,9 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
for (i = 0; i < irk_count; i++) {
struct mgmt_irk_info *irk = &cp->irks[i];
- u8 addr_type;
-
- if (irk->addr.type == BDADDR_LE_PUBLIC)
- addr_type = ADDR_LE_DEV_PUBLIC;
- else
- addr_type = ADDR_LE_DEV_RANDOM;
- hci_add_irk(hdev, &irk->addr.bdaddr, addr_type, irk->val,
+ hci_add_irk(hdev, &irk->addr.bdaddr,
+ le_addr_type(irk->addr.type), irk->val,
BDADDR_ANY);
}
@@ -5532,12 +5674,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
for (i = 0; i < key_count; i++) {
struct mgmt_ltk_info *key = &cp->keys[i];
- u8 type, addr_type, authenticated;
-
- if (key->addr.type == BDADDR_LE_PUBLIC)
- addr_type = ADDR_LE_DEV_PUBLIC;
- else
- addr_type = ADDR_LE_DEV_RANDOM;
+ u8 type, authenticated;
switch (key->type) {
case MGMT_LTK_UNAUTHENTICATED:
@@ -5563,9 +5700,9 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
continue;
}
- hci_add_ltk(hdev, &key->addr.bdaddr, addr_type, type,
- authenticated, key->val, key->enc_size, key->ediv,
- key->rand);
+ hci_add_ltk(hdev, &key->addr.bdaddr,
+ le_addr_type(key->addr.type), type, authenticated,
+ key->val, key->enc_size, key->ediv, key->rand);
}
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, 0,
@@ -5957,17 +6094,30 @@ static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr,
switch (auto_connect) {
case HCI_AUTO_CONN_DISABLED:
case HCI_AUTO_CONN_LINK_LOSS:
+ /* If auto connect is being disabled when we're trying to
+ * connect to a device, keep connecting.
+ */
+ if (params->explicit_connect)
+ list_add(&params->action, &hdev->pend_le_conns);
+
__hci_update_background_scan(req);
break;
case HCI_AUTO_CONN_REPORT:
- list_add(&params->action, &hdev->pend_le_reports);
+ if (params->explicit_connect)
+ list_add(&params->action, &hdev->pend_le_conns);
+ else
+ list_add(&params->action, &hdev->pend_le_reports);
__hci_update_background_scan(req);
break;
case HCI_AUTO_CONN_DIRECT:
case HCI_AUTO_CONN_ALWAYS:
if (!is_connected(hdev, addr, addr_type)) {
list_add(&params->action, &hdev->pend_le_conns);
- __hci_update_background_scan(req);
+ /* If we are in scan phase of connecting, we were
+ * already added to pend_le_conns and scanning.
+ */
+ if (params->auto_connect != HCI_AUTO_CONN_EXPLICIT)
+ __hci_update_background_scan(req);
}
break;
}
@@ -6064,10 +6214,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
goto added;
}
- if (cp->addr.type == BDADDR_LE_PUBLIC)
- addr_type = ADDR_LE_DEV_PUBLIC;
- else
- addr_type = ADDR_LE_DEV_RANDOM;
+ addr_type = le_addr_type(cp->addr.type);
if (cp->action == 0x02)
auto_conn = HCI_AUTO_CONN_ALWAYS;
@@ -6076,6 +6223,17 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
else
auto_conn = HCI_AUTO_CONN_REPORT;
+ /* Kernel internally uses conn_params with resolvable private
+ * address, but Add Device allows only identity addresses.
+ * Make sure it is enforced before calling
+ * hci_conn_params_lookup.
+ */
+ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) {
+ err = cmd->cmd_complete(cmd, MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
/* If the connection parameters don't exist for this device,
* they will be created and configured with defaults.
*/
@@ -6185,10 +6343,19 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
goto complete;
}
- if (cp->addr.type == BDADDR_LE_PUBLIC)
- addr_type = ADDR_LE_DEV_PUBLIC;
- else
- addr_type = ADDR_LE_DEV_RANDOM;
+ addr_type = le_addr_type(cp->addr.type);
+
+ /* Kernel internally uses conn_params with resolvable private
+ * address, but Remove Device allows only identity addresses.
+ * Make sure it is enforced before calling
+ * hci_conn_params_lookup.
+ */
+ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) {
+ err = cmd->cmd_complete(cmd,
+ MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
addr_type);
@@ -6199,7 +6366,8 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- if (params->auto_connect == HCI_AUTO_CONN_DISABLED) {
+ if (params->auto_connect == HCI_AUTO_CONN_DISABLED ||
+ params->auto_connect == HCI_AUTO_CONN_EXPLICIT) {
err = cmd->cmd_complete(cmd,
MGMT_STATUS_INVALID_PARAMS);
mgmt_pending_remove(cmd);
@@ -6235,6 +6403,10 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
if (p->auto_connect == HCI_AUTO_CONN_DISABLED)
continue;
device_removed(sk, hdev, &p->addr, p->addr_type);
+ if (p->explicit_connect) {
+ p->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+ continue;
+ }
list_del(&p->action);
list_del(&p->list);
kfree(p);
@@ -6781,8 +6953,9 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_rp_read_adv_features *rp;
size_t rp_len;
- int err;
+ int err, i;
bool instance;
+ struct adv_info *adv_instance;
u32 supported_flags;
BT_DBG("%s", hdev->name);
@@ -6795,12 +6968,9 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
rp_len = sizeof(*rp);
- /* Currently only one instance is supported, so just add 1 to the
- * response length.
- */
instance = hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE);
if (instance)
- rp_len++;
+ rp_len += hdev->adv_instance_cnt;
rp = kmalloc(rp_len, GFP_ATOMIC);
if (!rp) {
@@ -6813,14 +6983,18 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
rp->supported_flags = cpu_to_le32(supported_flags);
rp->max_adv_data_len = HCI_MAX_AD_LENGTH;
rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH;
- rp->max_instances = 1;
+ rp->max_instances = HCI_MAX_ADV_INSTANCES;
- /* Currently only one instance is supported, so simply return the
- * current instance number.
- */
if (instance) {
- rp->num_instances = 1;
- rp->instance[0] = 1;
+ i = 0;
+ list_for_each_entry(adv_instance, &hdev->adv_instances, list) {
+ if (i >= hdev->adv_instance_cnt)
+ break;
+
+ rp->instance[i] = adv_instance->instance;
+ i++;
+ }
+ rp->num_instances = hdev->adv_instance_cnt;
} else {
rp->num_instances = 0;
}
@@ -6882,7 +7056,10 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status,
u16 opcode)
{
struct mgmt_pending_cmd *cmd;
+ struct mgmt_cp_add_advertising *cp;
struct mgmt_rp_add_advertising rp;
+ struct adv_info *adv_instance, *n;
+ u8 instance;
BT_DBG("status %d", status);
@@ -6890,16 +7067,32 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status,
cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev);
- if (status) {
+ if (status)
hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE);
- memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance));
- advertising_removed(cmd ? cmd->sk : NULL, hdev, 1);
+
+ list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
+ if (!adv_instance->pending)
+ continue;
+
+ if (!status) {
+ adv_instance->pending = false;
+ continue;
+ }
+
+ instance = adv_instance->instance;
+
+ if (hdev->cur_adv_instance == instance)
+ cancel_adv_timeout(hdev);
+
+ hci_remove_adv_instance(hdev, instance);
+ advertising_removed(cmd ? cmd->sk : NULL, hdev, instance);
}
if (!cmd)
goto unlock;
- rp.instance = 0x01;
+ cp = cmd->param;
+ rp.instance = cp->instance;
if (status)
mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
@@ -6914,15 +7107,28 @@ unlock:
hci_dev_unlock(hdev);
}
-static void adv_timeout_expired(struct work_struct *work)
+void mgmt_adv_timeout_expired(struct hci_dev *hdev)
{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- adv_instance.timeout_exp.work);
+ u8 instance;
+ struct hci_request req;
- hdev->adv_instance.timeout = 0;
+ hdev->adv_instance_timeout = 0;
+
+ instance = get_current_adv_instance(hdev);
+ if (instance == 0x00)
+ return;
hci_dev_lock(hdev);
- clear_adv_instance(hdev);
+ hci_req_init(&req, hdev);
+
+ clear_adv_instance(hdev, &req, instance, false);
+
+ if (list_empty(&hdev->adv_instances))
+ disable_advertising(&req);
+
+ if (!skb_queue_empty(&req.cmd_q))
+ hci_req_run(&req, NULL);
+
hci_dev_unlock(hdev);
}
@@ -6934,7 +7140,10 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
u32 flags;
u32 supported_flags;
u8 status;
- u16 timeout;
+ u16 timeout, duration;
+ unsigned int prev_instance_cnt = hdev->adv_instance_cnt;
+ u8 schedule_instance = 0;
+ struct adv_info *next_instance;
int err;
struct mgmt_pending_cmd *cmd;
struct hci_request req;
@@ -6948,12 +7157,13 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
flags = __le32_to_cpu(cp->flags);
timeout = __le16_to_cpu(cp->timeout);
+ duration = __le16_to_cpu(cp->duration);
- /* The current implementation only supports adding one instance and only
- * a subset of the specified flags.
+ /* The current implementation only supports a subset of the specified
+ * flags.
*/
supported_flags = get_supported_adv_flags(hdev);
- if (cp->instance != 0x01 || (flags & ~supported_flags))
+ if (flags & ~supported_flags)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
@@ -6981,38 +7191,51 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- INIT_DELAYED_WORK(&hdev->adv_instance.timeout_exp, adv_timeout_expired);
-
- hdev->adv_instance.flags = flags;
- hdev->adv_instance.adv_data_len = cp->adv_data_len;
- hdev->adv_instance.scan_rsp_len = cp->scan_rsp_len;
-
- if (cp->adv_data_len)
- memcpy(hdev->adv_instance.adv_data, cp->data, cp->adv_data_len);
-
- if (cp->scan_rsp_len)
- memcpy(hdev->adv_instance.scan_rsp_data,
- cp->data + cp->adv_data_len, cp->scan_rsp_len);
-
- if (hdev->adv_instance.timeout)
- cancel_delayed_work(&hdev->adv_instance.timeout_exp);
+ err = hci_add_adv_instance(hdev, cp->instance, flags,
+ cp->adv_data_len, cp->data,
+ cp->scan_rsp_len,
+ cp->data + cp->adv_data_len,
+ timeout, duration);
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_FAILED);
+ goto unlock;
+ }
- hdev->adv_instance.timeout = timeout;
+ /* Only trigger an advertising added event if a new instance was
+ * actually added.
+ */
+ if (hdev->adv_instance_cnt > prev_instance_cnt)
+ advertising_added(sk, hdev, cp->instance);
- if (timeout)
- queue_delayed_work(hdev->workqueue,
- &hdev->adv_instance.timeout_exp,
- msecs_to_jiffies(timeout * 1000));
+ hci_dev_set_flag(hdev, HCI_ADVERTISING_INSTANCE);
- if (!hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING_INSTANCE))
- advertising_added(sk, hdev, 1);
+ if (hdev->cur_adv_instance == cp->instance) {
+ /* If the currently advertised instance is being changed then
+ * cancel the current advertising and schedule the next
+ * instance. If there is only one instance then the overridden
+ * advertising data will be visible right away.
+ */
+ cancel_adv_timeout(hdev);
+
+ next_instance = hci_get_next_instance(hdev, cp->instance);
+ if (next_instance)
+ schedule_instance = next_instance->instance;
+ } else if (!hdev->adv_instance_timeout) {
+ /* Immediately advertise the new instance if no other
+ * instance is currently being advertised.
+ */
+ schedule_instance = cp->instance;
+ }
- /* If the HCI_ADVERTISING flag is set or the device isn't powered then
- * we have no HCI communication to make. Simply return.
+ /* If the HCI_ADVERTISING flag is set or the device isn't powered or
+ * there is no instance to be advertised then we have no HCI
+ * communication to make. Simply return.
*/
if (!hdev_is_powered(hdev) ||
- hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
- rp.instance = 0x01;
+ hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ !schedule_instance) {
+ rp.instance = cp->instance;
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
goto unlock;
@@ -7030,11 +7253,11 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
hci_req_init(&req, hdev);
- update_adv_data(&req);
- update_scan_rsp_data(&req);
- enable_advertising(&req);
+ err = schedule_adv_instance(&req, schedule_instance, true);
+
+ if (!err)
+ err = hci_req_run(&req, add_advertising_complete);
- err = hci_req_run(&req, add_advertising_complete);
if (err < 0)
mgmt_pending_remove(cmd);
@@ -7048,6 +7271,7 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status,
u16 opcode)
{
struct mgmt_pending_cmd *cmd;
+ struct mgmt_cp_remove_advertising *cp;
struct mgmt_rp_remove_advertising rp;
BT_DBG("status %d", status);
@@ -7062,7 +7286,8 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status,
if (!cmd)
goto unlock;
- rp.instance = 1;
+ cp = cmd->param;
+ rp.instance = cp->instance;
mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, MGMT_STATUS_SUCCESS,
&rp, sizeof(rp));
@@ -7077,21 +7302,21 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_cp_remove_advertising *cp = data;
struct mgmt_rp_remove_advertising rp;
- int err;
struct mgmt_pending_cmd *cmd;
struct hci_request req;
+ int err;
BT_DBG("%s", hdev->name);
- /* The current implementation only allows modifying instance no 1. A
- * value of 0 indicates that all instances should be cleared.
- */
- if (cp->instance > 1)
- return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING,
- MGMT_STATUS_INVALID_PARAMS);
-
hci_dev_lock(hdev);
+ if (cp->instance && !hci_find_adv_instance(hdev, cp->instance)) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) ||
pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) ||
pending_find(MGMT_OP_SET_LE, hdev)) {
@@ -7106,21 +7331,21 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- if (hdev->adv_instance.timeout)
- cancel_delayed_work(&hdev->adv_instance.timeout_exp);
-
- memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance));
+ hci_req_init(&req, hdev);
- advertising_removed(sk, hdev, 1);
+ clear_adv_instance(hdev, &req, cp->instance, true);
- hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE);
+ if (list_empty(&hdev->adv_instances))
+ disable_advertising(&req);
- /* If the HCI_ADVERTISING flag is set or the device isn't powered then
- * we have no HCI communication to make. Simply return.
+ /* If no HCI commands have been collected so far or the HCI_ADVERTISING
+ * flag is set or the device isn't powered then we have no HCI
+ * communication to make. Simply return.
*/
- if (!hdev_is_powered(hdev) ||
+ if (skb_queue_empty(&req.cmd_q) ||
+ !hdev_is_powered(hdev) ||
hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
- rp.instance = 1;
+ rp.instance = cp->instance;
err = mgmt_cmd_complete(sk, hdev->id,
MGMT_OP_REMOVE_ADVERTISING,
MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
@@ -7134,9 +7359,6 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- hci_req_init(&req, hdev);
- disable_advertising(&req);
-
err = hci_req_run(&req, remove_advertising_complete);
if (err < 0)
mgmt_pending_remove(cmd);
@@ -7361,6 +7583,7 @@ static void powered_complete(struct hci_dev *hdev, u8 status, u16 opcode)
static int powered_update_hci(struct hci_dev *hdev)
{
struct hci_request req;
+ struct adv_info *adv_instance;
u8 link_sec;
hci_req_init(&req, hdev);
@@ -7400,14 +7623,27 @@ static int powered_update_hci(struct hci_dev *hdev)
* advertising data. This also applies to the case
* where BR/EDR was toggled during the AUTO_OFF phase.
*/
- if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
+ (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))) {
update_adv_data(&req);
update_scan_rsp_data(&req);
}
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
+ hdev->cur_adv_instance == 0x00 &&
+ !list_empty(&hdev->adv_instances)) {
+ adv_instance = list_first_entry(&hdev->adv_instances,
+ struct adv_info, list);
+ hdev->cur_adv_instance = adv_instance->instance;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
enable_advertising(&req);
+ else if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
+ hdev->cur_adv_instance)
+ schedule_adv_instance(&req, hdev->cur_adv_instance,
+ true);
restart_le_actions(&req);
}
@@ -7577,7 +7813,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent)
memset(&ev, 0, sizeof(ev));
/* Devices using resolvable or non-resolvable random addresses
- * without providing an indentity resolving key don't require
+ * without providing an identity resolving key don't require
* to store long term keys. Their addresses will change the
* next time around.
*
@@ -7603,32 +7839,23 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent)
if (key->type == SMP_LTK)
ev.key.master = 1;
- memcpy(ev.key.val, key->val, sizeof(key->val));
+ /* Make sure we copy only the significant bytes based on the
+ * encryption key size, and set the rest of the value to zeroes.
+ */
+ memcpy(ev.key.val, key->val, key->enc_size);
+ memset(ev.key.val + key->enc_size, 0,
+ sizeof(ev.key.val) - key->enc_size);
mgmt_event(MGMT_EV_NEW_LONG_TERM_KEY, hdev, &ev, sizeof(ev), NULL);
}
-void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk)
+void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk, bool persistent)
{
struct mgmt_ev_new_irk ev;
memset(&ev, 0, sizeof(ev));
- /* For identity resolving keys from devices that are already
- * using a public address or static random address, do not
- * ask for storing this key. The identity resolving key really
- * is only mandatory for devices using resovlable random
- * addresses.
- *
- * Storing all identity resolving keys has the downside that
- * they will be also loaded on next boot of they system. More
- * identity resolving keys, means more time during scanning is
- * needed to actually resolve these addresses.
- */
- if (bacmp(&irk->rpa, BDADDR_ANY))
- ev.store_hint = 0x01;
- else
- ev.store_hint = 0x00;
+ ev.store_hint = persistent;
bacpy(&ev.rpa, &irk->rpa);
bacpy(&ev.irk.addr.bdaddr, &irk->bdaddr);
@@ -7646,7 +7873,7 @@ void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk,
memset(&ev, 0, sizeof(ev));
/* Devices using resolvable or non-resolvable random addresses
- * without providing an indentity resolving key don't require
+ * without providing an identity resolving key don't require
* to store signature resolving keys. Their addresses will change
* the next time around.
*
@@ -8387,13 +8614,24 @@ static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
void mgmt_reenable_advertising(struct hci_dev *hdev)
{
struct hci_request req;
+ u8 instance;
if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
return;
+ instance = get_current_adv_instance(hdev);
+
hci_req_init(&req, hdev);
- enable_advertising(&req);
+
+ if (instance) {
+ schedule_adv_instance(&req, instance, true);
+ } else {
+ update_adv_data(&req);
+ update_scan_rsp_data(&req);
+ enable_advertising(&req);
+ }
+
hci_req_run(&req, adv_enable_complete);
}
diff --git a/kernel/net/bluetooth/rfcomm/core.c b/kernel/net/bluetooth/rfcomm/core.c
index 4fea24275..29709fbfd 100644
--- a/kernel/net/bluetooth/rfcomm/core.c
+++ b/kernel/net/bluetooth/rfcomm/core.c
@@ -200,7 +200,7 @@ static int rfcomm_l2sock_create(struct socket **sock)
BT_DBG("");
- err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock);
+ err = sock_create_kern(&init_net, PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock);
if (!err) {
struct sock *sk = (*sock)->sk;
sk->sk_data_ready = rfcomm_l2data_ready;
diff --git a/kernel/net/bluetooth/rfcomm/sock.c b/kernel/net/bluetooth/rfcomm/sock.c
index 825e8fb51..7511df723 100644
--- a/kernel/net/bluetooth/rfcomm/sock.c
+++ b/kernel/net/bluetooth/rfcomm/sock.c
@@ -269,12 +269,12 @@ static struct proto rfcomm_proto = {
.obj_size = sizeof(struct rfcomm_pinfo)
};
-static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern)
{
struct rfcomm_dlc *d;
struct sock *sk;
- sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto);
+ sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern);
if (!sk)
return NULL;
@@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock,
sock->ops = &rfcomm_sock_ops;
- sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+ sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
if (!sk)
return -ENOMEM;
@@ -334,16 +334,19 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock,
static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
- struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
+ struct sockaddr_rc sa;
struct sock *sk = sock->sk;
- int chan = sa->rc_channel;
- int err = 0;
-
- BT_DBG("sk %p %pMR", sk, &sa->rc_bdaddr);
+ int len, err = 0;
if (!addr || addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ memset(&sa, 0, sizeof(sa));
+ len = min_t(unsigned int, sizeof(sa), addr_len);
+ memcpy(&sa, addr, len);
+
+ BT_DBG("sk %p %pMR", sk, &sa.rc_bdaddr);
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
@@ -358,12 +361,13 @@ static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr
write_lock(&rfcomm_sk_list.lock);
- if (chan && __rfcomm_get_listen_sock_by_addr(chan, &sa->rc_bdaddr)) {
+ if (sa.rc_channel &&
+ __rfcomm_get_listen_sock_by_addr(sa.rc_channel, &sa.rc_bdaddr)) {
err = -EADDRINUSE;
} else {
/* Save source address */
- bacpy(&rfcomm_pi(sk)->src, &sa->rc_bdaddr);
- rfcomm_pi(sk)->channel = chan;
+ bacpy(&rfcomm_pi(sk)->src, &sa.rc_bdaddr);
+ rfcomm_pi(sk)->channel = sa.rc_channel;
sk->sk_state = BT_BOUND;
}
@@ -969,7 +973,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
goto done;
}
- sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC);
+ sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0);
if (!sk)
goto done;
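The rewritten rfcomm_sock_bind() above no longer dereferences the caller-supplied sockaddr directly: it copies at most addr_len bytes into a zeroed local, so a short address passed from userspace cannot be read past its end. The idiom in isolation (same names as in the hunk):

	struct sockaddr_rc sa;
	int len;

	memset(&sa, 0, sizeof(sa));				/* fields the caller did not supply read back as zero */
	len = min_t(unsigned int, sizeof(sa), addr_len);	/* never copy beyond the caller's buffer */
	memcpy(&sa, addr, len);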
diff --git a/kernel/net/bluetooth/sco.c b/kernel/net/bluetooth/sco.c
index 4322c833e..f52bcbf2e 100644
--- a/kernel/net/bluetooth/sco.c
+++ b/kernel/net/bluetooth/sco.c
@@ -74,7 +74,7 @@ struct sco_pinfo {
static void sco_sock_timeout(unsigned long arg)
{
- struct sock *sk = (struct sock *) arg;
+ struct sock *sk = (struct sock *)arg;
BT_DBG("sock %p state %d", sk, sk->sk_state);
@@ -154,13 +154,13 @@ static void sco_chan_del(struct sock *sk, int err)
sock_set_flag(sk, SOCK_ZAPPED);
}
-static int sco_conn_del(struct hci_conn *hcon, int err)
+static void sco_conn_del(struct hci_conn *hcon, int err)
{
struct sco_conn *conn = hcon->sco_data;
struct sock *sk;
if (!conn)
- return 0;
+ return;
BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
@@ -170,19 +170,21 @@ static int sco_conn_del(struct hci_conn *hcon, int err)
sco_conn_unlock(conn);
if (sk) {
+ sock_hold(sk);
bh_lock_sock(sk);
sco_sock_clear_timer(sk);
sco_chan_del(sk, err);
bh_unlock_sock(sk);
sco_sock_kill(sk);
+ sock_put(sk);
}
hcon->sco_data = NULL;
kfree(conn);
- return 0;
}
-static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent)
+static void __sco_chan_add(struct sco_conn *conn, struct sock *sk,
+ struct sock *parent)
{
BT_DBG("conn %p", conn);
@@ -415,8 +417,10 @@ static void __sco_sock_close(struct sock *sk)
if (sco_pi(sk)->conn->hcon) {
sk->sk_state = BT_DISCONN;
sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT);
+ sco_conn_lock(sco_pi(sk)->conn);
hci_conn_drop(sco_pi(sk)->conn->hcon);
sco_pi(sk)->conn->hcon = NULL;
+ sco_conn_unlock(sco_pi(sk)->conn);
} else
sco_chan_del(sk, ECONNRESET);
break;
@@ -460,11 +464,12 @@ static struct proto sco_proto = {
.obj_size = sizeof(struct sco_pinfo)
};
-static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+static struct sock *sco_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio, int kern)
{
struct sock *sk;
- sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto);
+ sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern);
if (!sk)
return NULL;
@@ -501,7 +506,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
sock->ops = &sco_sock_ops;
- sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+ sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
if (!sk)
return -ENOMEM;
@@ -509,7 +514,8 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
return 0;
}
-static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int sco_sock_bind(struct socket *sock, struct sockaddr *addr,
+ int addr_len)
{
struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
struct sock *sk = sock->sk;
@@ -520,6 +526,9 @@ static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_le
if (!addr || addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_sco))
+ return -EINVAL;
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
@@ -616,7 +625,8 @@ done:
return err;
}
-static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+static int sco_sock_accept(struct socket *sock, struct socket *newsock,
+ int flags)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk, *ch;
@@ -670,7 +680,8 @@ done:
return err;
}
-static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
+static int sco_sock_getname(struct socket *sock, struct sockaddr *addr,
+ int *len, int peer)
{
struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
struct sock *sk = sock->sk;
@@ -780,7 +791,8 @@ static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg,
return bt_sock_recvmsg(sock, msg, len, flags);
}
-static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+static int sco_sock_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
int len, err = 0;
@@ -820,7 +832,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char
voice.setting = sco_pi(sk)->setting;
len = min_t(unsigned int, sizeof(voice), optlen);
- if (copy_from_user((char *) &voice, optval, len)) {
+ if (copy_from_user((char *)&voice, optval, len)) {
err = -EFAULT;
break;
}
@@ -844,7 +856,8 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char
return err;
}
-static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
+static int sco_sock_getsockopt_old(struct socket *sock, int optname,
+ char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct sco_options opts;
@@ -904,7 +917,8 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user
return err;
}
-static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
int len, err = 0;
@@ -929,7 +943,7 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char
}
if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
- (u32 __user *) optval))
+ (u32 __user *)optval))
err = -EFAULT;
break;
@@ -962,7 +976,9 @@ static int sco_sock_shutdown(struct socket *sock, int how)
if (!sk)
return 0;
+ sock_hold(sk);
lock_sock(sk);
+
if (!sk->sk_shutdown) {
sk->sk_shutdown = SHUTDOWN_MASK;
sco_sock_clear_timer(sk);
@@ -973,7 +989,10 @@ static int sco_sock_shutdown(struct socket *sock, int how)
err = bt_sock_wait_state(sk, BT_CLOSED,
sk->sk_lingertime);
}
+
release_sock(sk);
+ sock_put(sk);
+
return err;
}
@@ -1017,6 +1036,11 @@ static void sco_conn_ready(struct sco_conn *conn)
} else {
sco_conn_lock(conn);
+ if (!conn->hcon) {
+ sco_conn_unlock(conn);
+ return;
+ }
+
parent = sco_get_sock_listen(&conn->hcon->src);
if (!parent) {
sco_conn_unlock(conn);
@@ -1026,7 +1050,7 @@ static void sco_conn_ready(struct sco_conn *conn)
bh_lock_sock(parent);
sk = sco_sock_alloc(sock_net(parent), NULL,
- BTPROTO_SCO, GFP_ATOMIC);
+ BTPROTO_SCO, GFP_ATOMIC, 0);
if (!sk) {
bh_unlock_sock(parent);
sco_conn_unlock(conn);
@@ -1110,7 +1134,7 @@ static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
sco_conn_del(hcon, bt_to_errno(reason));
}
-int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
{
struct sco_conn *conn = hcon->sco_data;
@@ -1121,12 +1145,11 @@ int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
if (skb->len) {
sco_recv_frame(conn, skb);
- return 0;
+ return;
}
drop:
kfree_skb(skb);
- return 0;
}
static struct hci_cb sco_cb = {
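Both sco_conn_del() and sco_sock_shutdown() above now pin the socket with an explicit reference around their locked sections so it cannot be freed underneath them. The pattern, restated in isolation (sk is any struct sock the caller may race with teardown on):

	sock_hold(sk);		/* keep sk alive across the critical section         */
	lock_sock(sk);		/* bh_lock_sock() is used in the sco_conn_del() path */
	/* ... timer clearing / channel teardown that may drop references ... */
	release_sock(sk);
	sock_put(sk);		/* balance the hold taken above */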
diff --git a/kernel/net/bluetooth/smp.c b/kernel/net/bluetooth/smp.c
index 7b815bcc8..4b175df35 100644
--- a/kernel/net/bluetooth/smp.c
+++ b/kernel/net/bluetooth/smp.c
@@ -33,6 +33,9 @@
#include "ecc.h"
#include "smp.h"
+#define SMP_DEV(hdev) \
+ ((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data)
+
/* Low-level debug macros to be used for stuff that we don't want
 * accidentally in dmesg, i.e. the values of the various crypto keys
* and the inputs & outputs of crypto functions.
@@ -81,6 +84,9 @@ struct smp_dev {
u8 local_rand[16];
bool debug_key;
+ u8 min_key_size;
+ u8 max_key_size;
+
struct crypto_blkcipher *tfm_aes;
struct crypto_hash *tfm_cmac;
};
@@ -371,6 +377,8 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
uint8_t tmp[16], data[16];
int err;
+ SMP_DBG("k %16phN r %16phN", k, r);
+
if (!tfm) {
BT_ERR("tfm %p", tfm);
return -EINVAL;
@@ -400,6 +408,8 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
/* Most significant octet of encryptedData corresponds to data[0] */
swap_buf(data, r, 16);
+ SMP_DBG("r %16phN", r);
+
return err;
}
@@ -410,6 +420,10 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
u8 p1[16], p2[16];
int err;
+ SMP_DBG("k %16phN r %16phN", k, r);
+ SMP_DBG("iat %u ia %6phN rat %u ra %6phN", _iat, ia, _rat, ra);
+ SMP_DBG("preq %7phN pres %7phN", preq, pres);
+
memset(p1, 0, 16);
/* p1 = pres || preq || _rat || _iat */
@@ -418,10 +432,7 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
memcpy(p1 + 2, preq, 7);
memcpy(p1 + 9, pres, 7);
- /* p2 = padding || ia || ra */
- memcpy(p2, ra, 6);
- memcpy(p2 + 6, ia, 6);
- memset(p2 + 12, 0, 4);
+ SMP_DBG("p1 %16phN", p1);
/* res = r XOR p1 */
u128_xor((u128 *) res, (u128 *) r, (u128 *) p1);
@@ -433,6 +444,13 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
return err;
}
+ /* p2 = padding || ia || ra */
+ memcpy(p2, ra, 6);
+ memcpy(p2 + 6, ia, 6);
+ memset(p2 + 12, 0, 4);
+
+ SMP_DBG("p2 %16phN", p2);
+
/* res = res XOR p2 */
u128_xor((u128 *) res, (u128 *) res, (u128 *) p2);
@@ -477,7 +495,7 @@ static int smp_ah(struct crypto_blkcipher *tfm, const u8 irk[16],
}
/* The output of the random address function ah is:
- * ah(h, r) = e(k, r') mod 2^24
+ * ah(k, r) = e(k, r') mod 2^24
* The output of the security function e is then truncated to 24 bits
* by taking the least significant 24 bits of the output of e as the
* result of ah.
@@ -696,7 +714,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
if (rsp == NULL) {
req->io_capability = conn->hcon->io_capability;
req->oob_flag = oob_flag;
- req->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ req->max_key_size = SMP_DEV(hdev)->max_key_size;
req->init_key_dist = local_dist;
req->resp_key_dist = remote_dist;
req->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -707,7 +725,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
rsp->io_capability = conn->hcon->io_capability;
rsp->oob_flag = oob_flag;
- rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ rsp->max_key_size = SMP_DEV(hdev)->max_key_size;
rsp->init_key_dist = req->init_key_dist & remote_dist;
rsp->resp_key_dist = req->resp_key_dist & local_dist;
rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -718,10 +736,11 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size)
{
struct l2cap_chan *chan = conn->smp;
+ struct hci_dev *hdev = conn->hcon->hdev;
struct smp_chan *smp = chan->data;
- if ((max_key_size > SMP_MAX_ENC_KEY_SIZE) ||
- (max_key_size < SMP_MIN_ENC_KEY_SIZE))
+ if (max_key_size > SMP_DEV(hdev)->max_key_size ||
+ max_key_size < SMP_MIN_ENC_KEY_SIZE)
return SMP_ENC_KEY_SIZE;
smp->enc_key_size = max_key_size;
@@ -792,7 +811,6 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason)
smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason),
&reason);
- clear_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags);
mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE);
if (chan->data)
@@ -985,13 +1003,10 @@ static u8 smp_random(struct smp_chan *smp)
smp_s1(smp->tfm_aes, smp->tk, smp->rrnd, smp->prnd, stk);
- memset(stk + smp->enc_key_size, 0,
- SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size);
-
if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags))
return SMP_UNSPECIFIED;
- hci_le_start_enc(hcon, ediv, rand, stk);
+ hci_le_start_enc(hcon, ediv, rand, stk, smp->enc_key_size);
hcon->enc_key_size = smp->enc_key_size;
set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags);
} else {
@@ -1004,9 +1019,6 @@ static u8 smp_random(struct smp_chan *smp)
smp_s1(smp->tfm_aes, smp->tk, smp->prnd, smp->rrnd, stk);
- memset(stk + smp->enc_key_size, 0,
- SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size);
-
if (hcon->pending_sec_level == BT_SECURITY_HIGH)
auth = 1;
else
@@ -1033,35 +1045,6 @@ static void smp_notify_keys(struct l2cap_conn *conn)
struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1];
bool persistent;
- if (smp->remote_irk) {
- mgmt_new_irk(hdev, smp->remote_irk);
- /* Now that user space can be considered to know the
- * identity address track the connection based on it
- * from now on (assuming this is an LE link).
- */
- if (hcon->type == LE_LINK) {
- bacpy(&hcon->dst, &smp->remote_irk->bdaddr);
- hcon->dst_type = smp->remote_irk->addr_type;
- queue_work(hdev->workqueue, &conn->id_addr_update_work);
- }
-
- /* When receiving an indentity resolving key for
- * a remote device that does not use a resolvable
- * private address, just remove the key so that
- * it is possible to use the controller white
- * list for scanning.
- *
- * Userspace will have been told to not store
- * this key at this point. So it is safe to
- * just remove it.
- */
- if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) {
- list_del_rcu(&smp->remote_irk->list);
- kfree_rcu(smp->remote_irk, rcu);
- smp->remote_irk = NULL;
- }
- }
-
if (hcon->type == ACL_LINK) {
if (hcon->key_type == HCI_LK_DEBUG_COMBINATION)
persistent = false;
@@ -1069,13 +1052,27 @@ static void smp_notify_keys(struct l2cap_conn *conn)
persistent = !test_bit(HCI_CONN_FLUSH_KEY,
&hcon->flags);
} else {
- /* The LTKs and CSRKs should be persistent only if both sides
- * had the bonding bit set in their authentication requests.
+ /* The LTKs, IRKs and CSRKs should be persistent only if
+ * both sides had the bonding bit set in their
+ * authentication requests.
*/
persistent = !!((req->auth_req & rsp->auth_req) &
SMP_AUTH_BONDING);
}
+ if (smp->remote_irk) {
+ mgmt_new_irk(hdev, smp->remote_irk, persistent);
+
+ /* Now that user space can be considered to know the
+ * identity address track the connection based on it
+ * from now on (assuming this is an LE link).
+ */
+ if (hcon->type == LE_LINK) {
+ bacpy(&hcon->dst, &smp->remote_irk->bdaddr);
+ hcon->dst_type = smp->remote_irk->addr_type;
+ queue_work(hdev->workqueue, &conn->id_addr_update_work);
+ }
+ }
if (smp->csrk) {
smp->csrk->bdaddr_type = hcon->dst_type;
@@ -1144,9 +1141,6 @@ static void sc_add_ltk(struct smp_chan *smp)
else
auth = 0;
- memset(smp->tk + smp->enc_key_size, 0,
- SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size);
-
smp->ltk = hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type,
key_type, auth, smp->tk, smp->enc_key_size,
0, 0);
@@ -1268,7 +1262,14 @@ static void smp_distribute_keys(struct smp_chan *smp)
__le16 ediv;
__le64 rand;
- get_random_bytes(enc.ltk, sizeof(enc.ltk));
+ /* Make sure we generate only the significant amount of
+ * bytes based on the encryption key size, and set the rest
+ * of the value to zeroes.
+ */
+ get_random_bytes(enc.ltk, smp->enc_key_size);
+ memset(enc.ltk + smp->enc_key_size, 0,
+ sizeof(enc.ltk) - smp->enc_key_size);
+
get_random_bytes(&ediv, sizeof(ediv));
get_random_bytes(&rand, sizeof(rand));
@@ -1688,7 +1689,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
req->init_key_dist = local_dist;
req->resp_key_dist = remote_dist;
- req->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ req->max_key_size = conn->hcon->enc_key_size;
smp->remote_key_dist = remote_dist;
@@ -1697,7 +1698,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
memset(rsp, 0, sizeof(*rsp));
- rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ rsp->max_key_size = conn->hcon->enc_key_size;
rsp->init_key_dist = req->init_key_dist & remote_dist;
rsp->resp_key_dist = req->resp_key_dist & local_dist;
@@ -2190,7 +2191,7 @@ static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level)
if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags))
return true;
- hci_le_start_enc(hcon, key->ediv, key->rand, key->val);
+ hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size);
hcon->enc_key_size = key->enc_size;
/* We never store STKs for master role, so clear this flag */
@@ -2294,12 +2295,6 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
if (!conn)
return 1;
- chan = conn->smp;
- if (!chan) {
- BT_ERR("SMP security requested but not available");
- return 1;
- }
-
if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED))
return 1;
@@ -2313,6 +2308,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
if (smp_ltk_encrypt(conn, hcon->pending_sec_level))
return 0;
+ chan = conn->smp;
+ if (!chan) {
+ BT_ERR("SMP security requested but not available");
+ return 1;
+ }
+
l2cap_chan_lock(chan);
/* If SMP is already in progress ignore this request */
@@ -2363,6 +2364,32 @@ unlock:
return ret;
}
+void smp_cancel_pairing(struct hci_conn *hcon)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_chan *chan;
+ struct smp_chan *smp;
+
+ if (!conn)
+ return;
+
+ chan = conn->smp;
+ if (!chan)
+ return;
+
+ l2cap_chan_lock(chan);
+
+ smp = chan->data;
+ if (smp) {
+ if (test_bit(SMP_FLAG_COMPLETE, &smp->flags))
+ smp_failure(conn, 0);
+ else
+ smp_failure(conn, SMP_UNSPECIFIED);
+ }
+
+ l2cap_chan_unlock(chan);
+}
+
static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb)
{
struct smp_cmd_encrypt_info *rp = (void *) skb->data;
@@ -2742,7 +2769,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb)
sc_add_ltk(smp);
if (hcon->out) {
- hci_le_start_enc(hcon, 0, 0, smp->tk);
+ hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size);
hcon->enc_key_size = smp->enc_key_size;
}
@@ -2984,8 +3011,13 @@ static void smp_ready_cb(struct l2cap_chan *chan)
BT_DBG("chan %p", chan);
+ /* No need to call l2cap_chan_hold() here since we already own
+ * the reference taken in smp_new_conn_cb(). This is just the
+ * first time that we tie it to a specific pointer. The code in
+ * l2cap_core.c ensures that there's no risk this function won't
+ * get called if smp_new_conn_cb was previously called.
+ */
conn->smp = chan;
- l2cap_chan_hold(chan);
if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
bredr_pairing(chan);
@@ -3124,6 +3156,8 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
smp->tfm_aes = tfm_aes;
smp->tfm_cmac = tfm_cmac;
+ smp->min_key_size = SMP_MIN_ENC_KEY_SIZE;
+ smp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
create_chan:
chan = l2cap_chan_create();
@@ -3246,6 +3280,94 @@ static const struct file_operations force_bredr_smp_fops = {
.llseek = default_llseek,
};
+static ssize_t le_min_key_size_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[4];
+
+ snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
+}
+
+static ssize_t le_min_key_size_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[32];
+ size_t buf_size = min(count, (sizeof(buf) - 1));
+ u8 key_size;
+
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+
+ sscanf(buf, "%hhu", &key_size);
+
+ if (key_size > SMP_DEV(hdev)->max_key_size ||
+ key_size < SMP_MIN_ENC_KEY_SIZE)
+ return -EINVAL;
+
+ SMP_DEV(hdev)->min_key_size = key_size;
+
+ return count;
+}
+
+static const struct file_operations le_min_key_size_fops = {
+ .open = simple_open,
+ .read = le_min_key_size_read,
+ .write = le_min_key_size_write,
+ .llseek = default_llseek,
+};
+
+static ssize_t le_max_key_size_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[4];
+
+ snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->max_key_size);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
+}
+
+static ssize_t le_max_key_size_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[32];
+ size_t buf_size = min(count, (sizeof(buf) - 1));
+ u8 key_size;
+
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+
+ sscanf(buf, "%hhu", &key_size);
+
+ if (key_size > SMP_MAX_ENC_KEY_SIZE ||
+ key_size < SMP_DEV(hdev)->min_key_size)
+ return -EINVAL;
+
+ SMP_DEV(hdev)->max_key_size = key_size;
+
+ return count;
+}
+
+static const struct file_operations le_max_key_size_fops = {
+ .open = simple_open,
+ .read = le_max_key_size_read,
+ .write = le_max_key_size_write,
+ .llseek = default_llseek,
+};
+
int smp_register(struct hci_dev *hdev)
{
struct l2cap_chan *chan;
@@ -3270,6 +3392,11 @@ int smp_register(struct hci_dev *hdev)
hdev->smp_data = chan;
+ debugfs_create_file("le_min_key_size", 0644, hdev->debugfs, hdev,
+ &le_min_key_size_fops);
+ debugfs_create_file("le_max_key_size", 0644, hdev->debugfs, hdev,
+ &le_max_key_size_fops);
+
/* If the controller does not support BR/EDR Secure Connections
* feature, then the BR/EDR SMP channel shall not be present.
*
diff --git a/kernel/net/bluetooth/smp.h b/kernel/net/bluetooth/smp.h
index 6cf872563..ffcc70b6b 100644
--- a/kernel/net/bluetooth/smp.h
+++ b/kernel/net/bluetooth/smp.h
@@ -180,6 +180,7 @@ enum smp_key_pref {
};
/* SMP Commands */
+void smp_cancel_pairing(struct hci_conn *hcon);
bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level,
enum smp_key_pref key_pref);
int smp_conn_security(struct hci_conn *hcon, __u8 sec_level);
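The two debugfs write handlers added to smp.c only accept values that keep the configured range well-ordered; the invariant they jointly maintain, restated as a sketch (the helper name is illustrative, not part of the patch):

	static bool le_key_size_range_valid(struct smp_dev *smp)
	{
		return smp->min_key_size >= SMP_MIN_ENC_KEY_SIZE &&
		       smp->min_key_size <= smp->max_key_size &&
		       smp->max_key_size <= SMP_MAX_ENC_KEY_SIZE;
	}

Both values start out at SMP_MIN_ENC_KEY_SIZE and SMP_MAX_ENC_KEY_SIZE respectively, per the smp_add_cid() initialisation above.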
diff --git a/kernel/net/bridge/Makefile b/kernel/net/bridge/Makefile
index fd7ee03c5..a1cda5d47 100644
--- a/kernel/net/bridge/Makefile
+++ b/kernel/net/bridge/Makefile
@@ -12,6 +12,8 @@ bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o
+br_netfilter-y := br_netfilter_hooks.o
+br_netfilter-$(subst m,y,$(CONFIG_IPV6)) += br_netfilter_ipv6.o
obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
diff --git a/kernel/net/bridge/br.c b/kernel/net/bridge/br.c
index 02c24cf63..3addc05b9 100644
--- a/kernel/net/bridge/br.c
+++ b/kernel/net/bridge/br.c
@@ -121,16 +121,16 @@ static struct notifier_block br_device_notifier = {
.notifier_call = br_device_event
};
-static int br_netdev_switch_event(struct notifier_block *unused,
- unsigned long event, void *ptr)
+/* called with RTNL */
+static int br_switchdev_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
{
- struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr);
+ struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
struct net_bridge_port *p;
struct net_bridge *br;
- struct netdev_switch_notifier_fdb_info *fdb_info;
+ struct switchdev_notifier_fdb_info *fdb_info;
int err = NOTIFY_DONE;
- rtnl_lock();
p = br_port_get_rtnl(dev);
if (!p)
goto out;
@@ -138,14 +138,14 @@ static int br_netdev_switch_event(struct notifier_block *unused,
br = p->br;
switch (event) {
- case NETDEV_SWITCH_FDB_ADD:
+ case SWITCHDEV_FDB_ADD:
fdb_info = ptr;
err = br_fdb_external_learn_add(br, p, fdb_info->addr,
fdb_info->vid);
if (err)
err = notifier_from_errno(err);
break;
- case NETDEV_SWITCH_FDB_DEL:
+ case SWITCHDEV_FDB_DEL:
fdb_info = ptr;
err = br_fdb_external_learn_del(br, p, fdb_info->addr,
fdb_info->vid);
@@ -155,12 +155,11 @@ static int br_netdev_switch_event(struct notifier_block *unused,
}
out:
- rtnl_unlock();
return err;
}
-static struct notifier_block br_netdev_switch_notifier = {
- .notifier_call = br_netdev_switch_event,
+static struct notifier_block br_switchdev_notifier = {
+ .notifier_call = br_switchdev_event,
};
static void __net_exit br_net_exit(struct net *net)
@@ -214,7 +213,7 @@ static int __init br_init(void)
if (err)
goto err_out3;
- err = register_netdev_switch_notifier(&br_netdev_switch_notifier);
+ err = register_switchdev_notifier(&br_switchdev_notifier);
if (err)
goto err_out4;
@@ -235,7 +234,7 @@ static int __init br_init(void)
return 0;
err_out5:
- unregister_netdev_switch_notifier(&br_netdev_switch_notifier);
+ unregister_switchdev_notifier(&br_switchdev_notifier);
err_out4:
unregister_netdevice_notifier(&br_device_notifier);
err_out3:
@@ -253,7 +252,7 @@ static void __exit br_deinit(void)
{
stp_proto_unregister(&br_stp_proto);
br_netlink_fini();
- unregister_netdev_switch_notifier(&br_netdev_switch_notifier);
+ unregister_switchdev_notifier(&br_switchdev_notifier);
unregister_netdevice_notifier(&br_device_notifier);
brioctl_set(NULL);
unregister_pernet_subsys(&br_net_ops);
diff --git a/kernel/net/bridge/br_device.c b/kernel/net/bridge/br_device.c
index 4ff77a169..2c8095a5d 100644
--- a/kernel/net/bridge/br_device.c
+++ b/kernel/net/bridge/br_device.c
@@ -28,6 +28,8 @@
const struct nf_br_ops __rcu *nf_br_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_br_ops);
+static struct lock_class_key bridge_netdev_addr_lock_key;
+
/* net device transmit always called with BH disabled */
netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
@@ -56,7 +58,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
skb_reset_mac_header(skb);
skb_pull(skb, ETH_HLEN);
- if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid))
+ if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
goto out;
if (is_broadcast_ether_addr(dest))
@@ -87,6 +89,11 @@ out:
return NETDEV_TX_OK;
}
+static void br_set_lockdep_class(struct net_device *dev)
+{
+ lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key);
+}
+
static int br_dev_init(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
@@ -99,6 +106,7 @@ static int br_dev_init(struct net_device *dev)
err = br_vlan_init(br);
if (err)
free_percpu(br->stats);
+ br_set_lockdep_class(dev);
return err;
}
@@ -339,6 +347,7 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_bridge_getlink = br_getlink,
.ndo_bridge_setlink = br_setlink,
.ndo_bridge_dellink = br_dellink,
+ .ndo_features_check = passthru_features_check,
};
static void br_dev_free(struct net_device *dev)
@@ -364,8 +373,7 @@ void br_dev_setup(struct net_device *dev)
dev->destructor = br_dev_free;
dev->ethtool_ops = &br_ethtool_ops;
SET_NETDEV_DEVTYPE(dev, &br_type);
- dev->tx_queue_len = 0;
- dev->priv_flags = IFF_EBRIDGE;
+ dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL |
NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
@@ -391,7 +399,7 @@ void br_dev_setup(struct net_device *dev)
br->bridge_max_age = br->max_age = 20 * HZ;
br->bridge_hello_time = br->hello_time = 2 * HZ;
br->bridge_forward_delay = br->forward_delay = 15 * HZ;
- br->ageing_time = 300 * HZ;
+ br->ageing_time = BR_DEFAULT_AGEING_TIME;
br_netfilter_rtable_init(br);
br_stp_timer_init(br);
diff --git a/kernel/net/bridge/br_fdb.c b/kernel/net/bridge/br_fdb.c
index 659fb9667..a642bb829 100644
--- a/kernel/net/bridge/br_fdb.c
+++ b/kernel/net/bridge/br_fdb.c
@@ -24,6 +24,7 @@
#include <linux/atomic.h>
#include <asm/unaligned.h>
#include <linux/if_vlan.h>
+#include <net/switchdev.h>
#include "br_private.h"
static struct kmem_cache *br_fdb_cache __read_mostly;
@@ -130,11 +131,28 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr)
}
}
+static void fdb_del_external_learn(struct net_bridge_fdb_entry *f)
+{
+ struct switchdev_obj_port_fdb fdb = {
+ .obj = {
+ .id = SWITCHDEV_OBJ_ID_PORT_FDB,
+ .flags = SWITCHDEV_F_DEFER,
+ },
+ .vid = f->vlan_id,
+ };
+
+ ether_addr_copy(fdb.addr, f->addr.addr);
+ switchdev_port_obj_del(f->dst->dev, &fdb.obj);
+}
+
static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
{
if (f->is_static)
fdb_del_hw_addr(br, f->addr.addr);
+ if (f->added_by_external_learn)
+ fdb_del_external_learn(f);
+
hlist_del_rcu(&f->hlist);
fdb_notify(br, f, RTM_DELNEIGH);
call_rcu(&f->rcu, fdb_rcu_free);
@@ -146,22 +164,27 @@ static void fdb_delete_local(struct net_bridge *br,
struct net_bridge_fdb_entry *f)
{
const unsigned char *addr = f->addr.addr;
- u16 vid = f->vlan_id;
+ struct net_bridge_vlan_group *vg;
+ const struct net_bridge_vlan *v;
struct net_bridge_port *op;
+ u16 vid = f->vlan_id;
/* Maybe another port has same hw addr? */
list_for_each_entry(op, &br->port_list, list) {
+ vg = nbp_vlan_group(op);
if (op != p && ether_addr_equal(op->dev->dev_addr, addr) &&
- (!vid || nbp_vlan_find(op, vid))) {
+ (!vid || br_vlan_find(vg, vid))) {
f->dst = op;
f->added_by_user = 0;
return;
}
}
+ vg = br_vlan_group(br);
+ v = br_vlan_find(vg, vid);
/* Maybe bridge device has same hw addr? */
if (p && ether_addr_equal(br->dev->dev_addr, addr) &&
- (!vid || br_vlan_find(br, vid))) {
+ (!vid || (v && br_vlan_should_use(v)))) {
f->dst = NULL;
f->added_by_user = 0;
return;
@@ -186,14 +209,14 @@ void br_fdb_find_delete_local(struct net_bridge *br,
void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
{
+ struct net_bridge_vlan_group *vg;
struct net_bridge *br = p->br;
- struct net_port_vlans *pv = nbp_get_vlan_info(p);
- bool no_vlan = !pv;
+ struct net_bridge_vlan *v;
int i;
- u16 vid;
spin_lock_bh(&br->hash_lock);
+ vg = nbp_vlan_group(p);
/* Search all chains since old address/hash is unknown */
for (i = 0; i < BR_HASH_SIZE; i++) {
struct hlist_node *h;
@@ -209,7 +232,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
* configured, we can safely be done at
* this point.
*/
- if (no_vlan)
+ if (!vg || !vg->num_vlans)
goto insert;
}
}
@@ -219,15 +242,15 @@ insert:
/* insert new address, may fail if invalid address or dup. */
fdb_insert(br, p, newaddr, 0);
- if (no_vlan)
+ if (!vg || !vg->num_vlans)
goto done;
/* Now add entries for every VLAN configured on the port.
* This function runs under RTNL so the bitmap will not change
* from under us.
*/
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
- fdb_insert(br, p, newaddr, vid);
+ list_for_each_entry(v, &vg->vlan_list, vlist)
+ fdb_insert(br, p, newaddr, v->vid);
done:
spin_unlock_bh(&br->hash_lock);
@@ -235,9 +258,9 @@ done:
void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
{
+ struct net_bridge_vlan_group *vg;
struct net_bridge_fdb_entry *f;
- struct net_port_vlans *pv;
- u16 vid = 0;
+ struct net_bridge_vlan *v;
spin_lock_bh(&br->hash_lock);
@@ -247,20 +270,18 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
fdb_delete_local(br, NULL, f);
fdb_insert(br, NULL, newaddr, 0);
-
+ vg = br_vlan_group(br);
+ if (!vg || !vg->num_vlans)
+ goto out;
/* Now remove and add entries for every VLAN configured on the
* bridge. This function runs under RTNL so the bitmap will not
* change from under us.
*/
- pv = br_get_vlan_info(br);
- if (!pv)
- goto out;
-
- for_each_set_bit_from(vid, pv->vlan_bitmap, VLAN_N_VID) {
- f = __br_fdb_get(br, br->dev->dev_addr, vid);
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ f = __br_fdb_get(br, br->dev->dev_addr, v->vid);
if (f && f->is_local && !f->dst)
fdb_delete_local(br, NULL, f);
- fdb_insert(br, NULL, newaddr, vid);
+ fdb_insert(br, NULL, newaddr, v->vid);
}
out:
spin_unlock_bh(&br->hash_lock);
@@ -282,6 +303,8 @@ void br_fdb_cleanup(unsigned long _data)
unsigned long this_timer;
if (f->is_static)
continue;
+ if (f->added_by_external_learn)
+ continue;
this_timer = f->updated + delay;
if (time_before_eq(this_timer, jiffies))
fdb_delete(br, f);
@@ -313,9 +336,11 @@ void br_fdb_flush(struct net_bridge *br)
/* Flush all entries referring to a specific port.
* if do_all is set also flush static entries
+ * if vid is set delete all entries that match the vlan_id
*/
void br_fdb_delete_by_port(struct net_bridge *br,
const struct net_bridge_port *p,
+ u16 vid,
int do_all)
{
int i;
@@ -330,8 +355,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
if (f->dst != p)
continue;
- if (f->is_static && !do_all)
- continue;
+ if (!do_all)
+ if (f->is_static || (vid && f->vlan_id != vid))
+ continue;
if (f->is_local)
fdb_delete_local(br, p, f);
@@ -469,7 +495,9 @@ static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
struct net_bridge_port *source,
const unsigned char *addr,
- __u16 vid)
+ __u16 vid,
+ unsigned char is_local,
+ unsigned char is_static)
{
struct net_bridge_fdb_entry *fdb;
@@ -478,8 +506,8 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
memcpy(fdb->addr.addr, addr, ETH_ALEN);
fdb->dst = source;
fdb->vlan_id = vid;
- fdb->is_local = 0;
- fdb->is_static = 0;
+ fdb->is_local = is_local;
+ fdb->is_static = is_static;
fdb->added_by_user = 0;
fdb->added_by_external_learn = 0;
fdb->updated = fdb->used = jiffies;
@@ -510,11 +538,10 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
fdb_delete(br, fdb);
}
- fdb = fdb_create(head, source, addr, vid);
+ fdb = fdb_create(head, source, addr, vid, 1, 1);
if (!fdb)
return -ENOMEM;
- fdb->is_local = fdb->is_static = 1;
fdb_add_hw_addr(br, addr);
fdb_notify(br, fdb, RTM_NEWNEIGH);
return 0;
@@ -571,7 +598,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
} else {
spin_lock(&br->hash_lock);
if (likely(!fdb_find(head, addr, vid))) {
- fdb = fdb_create(head, source, addr, vid);
+ fdb = fdb_create(head, source, addr, vid, 0, 0);
if (fdb) {
if (unlikely(added_by_user))
fdb->added_by_user = 1;
@@ -585,13 +612,14 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
}
}
-static int fdb_to_nud(const struct net_bridge_fdb_entry *fdb)
+static int fdb_to_nud(const struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb)
{
if (fdb->is_local)
return NUD_PERMANENT;
else if (fdb->is_static)
return NUD_NOARP;
- else if (has_expired(fdb->dst->br, fdb))
+ else if (has_expired(br, fdb))
return NUD_STALE;
else
return NUD_REACHABLE;
@@ -617,7 +645,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
ndm->ndm_flags = fdb->added_by_external_learn ? NTF_EXT_LEARNED : 0;
ndm->ndm_type = 0;
ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex;
- ndm->ndm_state = fdb_to_nud(fdb);
+ ndm->ndm_state = fdb_to_nud(br, fdb);
if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr))
goto nla_put_failure;
@@ -736,12 +764,18 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
struct net_bridge_fdb_entry *fdb;
bool modified = false;
+ /* If the port cannot learn allow only local and static entries */
+ if (!(state & NUD_PERMANENT) && !(state & NUD_NOARP) &&
+ !(source->state == BR_STATE_LEARNING ||
+ source->state == BR_STATE_FORWARDING))
+ return -EPERM;
+
fdb = fdb_find(head, addr, vid);
if (fdb == NULL) {
if (!(flags & NLM_F_CREATE))
return -ENOENT;
- fdb = fdb_create(head, source, addr, vid);
+ fdb = fdb_create(head, source, addr, vid, 0, 0);
if (!fdb)
return -ENOMEM;
@@ -756,7 +790,7 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
}
}
- if (fdb_to_nud(fdb) != state) {
+ if (fdb_to_nud(br, fdb) != state) {
if (state & NUD_PERMANENT) {
fdb->is_local = 1;
if (!fdb->is_static) {
@@ -816,9 +850,11 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr, u16 vid, u16 nlh_flags)
{
- struct net_bridge_port *p;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
+ struct net_bridge_vlan *v;
+ struct net_bridge *br = NULL;
int err = 0;
- struct net_port_vlans *pv;
if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) {
pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
@@ -830,34 +866,51 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
return -EINVAL;
}
- p = br_port_get_rtnl(dev);
- if (p == NULL) {
- pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n",
- dev->name);
- return -EINVAL;
+ if (dev->priv_flags & IFF_EBRIDGE) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (!p) {
+ pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n",
+ dev->name);
+ return -EINVAL;
+ }
+ vg = nbp_vlan_group(p);
}
- pv = nbp_get_vlan_info(p);
if (vid) {
- if (!pv || !test_bit(vid, pv->vlan_bitmap)) {
- pr_info("bridge: RTM_NEWNEIGH with unconfigured "
- "vlan %d on port %s\n", vid, dev->name);
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_should_use(v)) {
+ pr_info("bridge: RTM_NEWNEIGH with unconfigured vlan %d on %s\n", vid, dev->name);
return -EINVAL;
}
/* VID was specified, so use it. */
- err = __br_fdb_add(ndm, p, addr, nlh_flags, vid);
+ if (dev->priv_flags & IFF_EBRIDGE)
+ err = br_fdb_insert(br, NULL, addr, vid);
+ else
+ err = __br_fdb_add(ndm, p, addr, nlh_flags, vid);
} else {
- err = __br_fdb_add(ndm, p, addr, nlh_flags, 0);
- if (err || !pv)
+ if (dev->priv_flags & IFF_EBRIDGE)
+ err = br_fdb_insert(br, NULL, addr, 0);
+ else
+ err = __br_fdb_add(ndm, p, addr, nlh_flags, 0);
+ if (err || !vg || !vg->num_vlans)
goto out;
/* We have vlans configured on this port and user didn't
* specify a VLAN. To be nice, add/update entry for every
* vlan on this port.
*/
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
- err = __br_fdb_add(ndm, p, addr, nlh_flags, vid);
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+ if (dev->priv_flags & IFF_EBRIDGE)
+ err = br_fdb_insert(br, NULL, addr, v->vid);
+ else
+ err = __br_fdb_add(ndm, p, addr, nlh_flags,
+ v->vid);
if (err)
goto out;
}
@@ -867,13 +920,41 @@ out:
return err;
}
-static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vlan)
+static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr,
+ u16 vid)
+{
+ struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+ struct net_bridge_fdb_entry *fdb;
+
+ fdb = fdb_find(head, addr, vid);
+ if (!fdb)
+ return -ENOENT;
+
+ fdb_delete(br, fdb);
+ return 0;
+}
+
+static int __br_fdb_delete_by_addr(struct net_bridge *br,
+ const unsigned char *addr, u16 vid)
+{
+ int err;
+
+ spin_lock_bh(&br->hash_lock);
+ err = fdb_delete_by_addr(br, addr, vid);
+ spin_unlock_bh(&br->hash_lock);
+
+ return err;
+}
+
+static int fdb_delete_by_addr_and_port(struct net_bridge_port *p,
+ const u8 *addr, u16 vlan)
{
+ struct net_bridge *br = p->br;
struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)];
struct net_bridge_fdb_entry *fdb;
fdb = fdb_find(head, addr, vlan);
- if (!fdb)
+ if (!fdb || fdb->dst != p)
return -ENOENT;
fdb_delete(br, fdb);
@@ -886,7 +967,7 @@ static int __br_fdb_delete(struct net_bridge_port *p,
int err;
spin_lock_bh(&p->br->hash_lock);
- err = fdb_delete_by_addr(p->br, addr, vid);
+ err = fdb_delete_by_addr_and_port(p, addr, vid);
spin_unlock_bh(&p->br->hash_lock);
return err;
@@ -897,38 +978,53 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr, u16 vid)
{
- struct net_bridge_port *p;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
+ struct net_bridge_vlan *v;
+ struct net_bridge *br = NULL;
int err;
- struct net_port_vlans *pv;
- p = br_port_get_rtnl(dev);
- if (p == NULL) {
- pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n",
- dev->name);
- return -EINVAL;
+ if (dev->priv_flags & IFF_EBRIDGE) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (!p) {
+ pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n",
+ dev->name);
+ return -EINVAL;
+ }
+ vg = nbp_vlan_group(p);
}
- pv = nbp_get_vlan_info(p);
if (vid) {
- if (!pv || !test_bit(vid, pv->vlan_bitmap)) {
- pr_info("bridge: RTM_DELNEIGH with unconfigured "
- "vlan %d on port %s\n", vid, dev->name);
+ v = br_vlan_find(vg, vid);
+ if (!v) {
+ pr_info("bridge: RTM_DELNEIGH with unconfigured vlan %d on %s\n", vid, dev->name);
return -EINVAL;
}
- err = __br_fdb_delete(p, addr, vid);
+ if (dev->priv_flags & IFF_EBRIDGE)
+ err = __br_fdb_delete_by_addr(br, addr, vid);
+ else
+ err = __br_fdb_delete(p, addr, vid);
} else {
err = -ENOENT;
- err &= __br_fdb_delete(p, addr, 0);
- if (!pv)
+ if (dev->priv_flags & IFF_EBRIDGE)
+ err = __br_fdb_delete_by_addr(br, addr, 0);
+ else
+ err &= __br_fdb_delete(p, addr, 0);
+
+ if (!vg || !vg->num_vlans)
goto out;
- /* We have vlans configured on this port and user didn't
- * specify a VLAN. To be nice, add/update entry for every
- * vlan on this port.
- */
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
- err &= __br_fdb_delete(p, addr, vid);
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+ if (dev->priv_flags & IFF_EBRIDGE)
+ err = __br_fdb_delete_by_addr(br, addr, v->vid);
+ else
+ err &= __br_fdb_delete(p, addr, v->vid);
}
}
out:
@@ -1004,7 +1100,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
head = &br->hash[br_mac_hash(addr, vid)];
fdb = fdb_find(head, addr, vid);
if (!fdb) {
- fdb = fdb_create(head, p, addr, vid);
+ fdb = fdb_create(head, p, addr, vid, 0, 0);
if (!fdb) {
err = -ENOMEM;
goto err_unlock;
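Two themes run through the br_fdb.c changes above: FDB entries learned externally (on offloaded switch ports) are pushed back to the hardware with a deferred switchdev object delete when they leave the software table, and per-port VLAN walks move from a bitmap to the vlan_list of the port's VLAN group. A hedged sketch of the deferred-delete pattern, reusing the structures the patch itself touches; dev, mac and vid stand in for the real entry's fields:

#include <linux/etherdevice.h>
#include <net/switchdev.h>

/* Queue a port FDB delete; SWITCHDEV_F_DEFER lets it run later from
 * switchdev_deferred_process() under rtnl, as fdb_del_external_learn()
 * above does. */
static void my_hw_fdb_del(struct net_device *dev,
			  const unsigned char *mac, u16 vid)
{
	struct switchdev_obj_port_fdb fdb = {
		.obj = {
			.id = SWITCHDEV_OBJ_ID_PORT_FDB,
			.flags = SWITCHDEV_F_DEFER,
		},
		.vid = vid,
	};

	ether_addr_copy(fdb.addr, mac);
	switchdev_port_obj_del(dev, &fdb.obj);
}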
diff --git a/kernel/net/bridge/br_forward.c b/kernel/net/bridge/br_forward.c
index 0ff6e1bbc..fcdb86dd5 100644
--- a/kernel/net/bridge/br_forward.c
+++ b/kernel/net/bridge/br_forward.c
@@ -30,30 +30,47 @@ static int deliver_clone(const struct net_bridge_port *prev,
static inline int should_deliver(const struct net_bridge_port *p,
const struct sk_buff *skb)
{
+ struct net_bridge_vlan_group *vg;
+
+ vg = nbp_vlan_group_rcu(p);
return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
- br_allowed_egress(p->br, nbp_get_vlan_info(p), skb) &&
- p->state == BR_STATE_FORWARDING;
+ br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING;
}
-int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb)
+int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- if (!is_skb_forwardable(skb->dev, skb)) {
- kfree_skb(skb);
- } else {
- skb_push(skb, ETH_HLEN);
- br_drop_fake_rtable(skb);
- skb_sender_cpu_clear(skb);
- dev_queue_xmit(skb);
+ if (!is_skb_forwardable(skb->dev, skb))
+ goto drop;
+
+ skb_push(skb, ETH_HLEN);
+ br_drop_fake_rtable(skb);
+ skb_sender_cpu_clear(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (skb->protocol == htons(ETH_P_8021Q) ||
+ skb->protocol == htons(ETH_P_8021AD))) {
+ int depth;
+
+ if (!__vlan_get_protocol(skb, skb->protocol, &depth))
+ goto drop;
+
+ skb_set_network_header(skb, depth);
}
+ dev_queue_xmit(skb);
+
+ return 0;
+
+drop:
+ kfree_skb(skb);
return 0;
}
EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
-int br_forward_finish(struct sock *sk, struct sk_buff *skb)
+int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, sk, skb,
- NULL, skb->dev,
+ return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING,
+ net, sk, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
}
@@ -61,7 +78,10 @@ EXPORT_SYMBOL_GPL(br_forward_finish);
static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
{
- skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
+ struct net_bridge_vlan_group *vg;
+
+ vg = nbp_vlan_group_rcu(to);
+ skb = br_handle_vlan(to->br, vg, skb);
if (!skb)
return;
@@ -77,13 +97,14 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
return;
}
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb,
- NULL, skb->dev,
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
+ dev_net(skb->dev), NULL, skb, NULL, skb->dev,
br_forward_finish);
}
static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
+ struct net_bridge_vlan_group *vg;
struct net_device *indev;
if (skb_warn_if_lro(skb)) {
@@ -91,7 +112,8 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
return;
}
- skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
+ vg = nbp_vlan_group_rcu(to);
+ skb = br_handle_vlan(to->br, vg, skb);
if (!skb)
return;
@@ -99,8 +121,8 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
skb->dev = to->dev;
skb_forward_csum(skb);
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, NULL, skb,
- indev, skb->dev,
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD,
+ dev_net(indev), NULL, skb, indev, skb->dev,
br_forward_finish);
}
@@ -119,7 +141,7 @@ EXPORT_SYMBOL_GPL(br_deliver);
/* called with rcu_read_lock */
void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0)
{
- if (should_deliver(to, skb)) {
+ if (to && should_deliver(to, skb)) {
if (skb0)
deliver_clone(to, skb, __br_forward);
else
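Most of the churn in br_forward.c (and in br_input.c further down) follows the netfilter API in this kernel: NF_HOOK() now carries a struct net, and the continuation ("okfn") it invokes takes (net, sk, skb) rather than (sk, skb). A minimal sketch of the calling convention, with my_finish and my_deliver as illustrative stand-ins:

#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/netfilter_bridge.h>

/* okfn prototype in this tree:
 * int (*)(struct net *, struct sock *, struct sk_buff *) */
static int my_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	dev_queue_xmit(skb);
	return 0;
}

static void my_deliver(struct sk_buff *skb)
{
	NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
		dev_net(skb->dev), NULL, skb, NULL, skb->dev,
		my_finish);
}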
diff --git a/kernel/net/bridge/br_if.c b/kernel/net/bridge/br_if.c
index 1849d96b3..ec02f5869 100644
--- a/kernel/net/bridge/br_if.c
+++ b/kernel/net/bridge/br_if.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/if_vlan.h>
+#include <net/switchdev.h>
#include "br_private.h"
@@ -249,7 +250,9 @@ static void del_nbp(struct net_bridge_port *p)
list_del_rcu(&p->list);
nbp_vlan_flush(p);
- br_fdb_delete_by_port(br, p, 1);
+ br_fdb_delete_by_port(br, p, 0, 1);
+ switchdev_deferred_process();
+
nbp_update_port_count(br);
netdev_upper_dev_unlink(dev, br->dev);
@@ -278,9 +281,10 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
del_nbp(p);
}
- br_fdb_delete_by_port(br, NULL, 1);
+ br_fdb_delete_by_port(br, NULL, 0, 1);
br_vlan_flush(br);
+ br_multicast_dev_del(br);
del_timer_sync(&br->gc_timer);
br_sysfs_delbr(br->dev);
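The br_if.c changes above extend br_fdb_delete_by_port() with a VLAN filter argument and, once a port's FDB entries are flushed, drain any switchdev work queued with SWITCHDEV_F_DEFER while rtnl is still held. A short sketch of that teardown ordering; my_del_port is illustrative, not the patch's del_nbp():

#include <net/switchdev.h>
#include "br_private.h"

static void my_del_port(struct net_bridge *br, struct net_bridge_port *p)
{
	/* vid == 0: flush entries for every VLAN; do_all == 1 also drops
	 * static entries. */
	br_fdb_delete_by_port(br, p, 0, 1);

	/* Execute deferred switchdev deletions before the port disappears. */
	switchdev_deferred_process();
}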
diff --git a/kernel/net/bridge/br_input.c b/kernel/net/bridge/br_input.c
index f921a5dce..f7fba7410 100644
--- a/kernel/net/bridge/br_input.c
+++ b/kernel/net/bridge/br_input.c
@@ -26,38 +26,44 @@
br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
EXPORT_SYMBOL(br_should_route_hook);
+static int
+br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ return netif_receive_skb(skb);
+}
+
static int br_pass_frame_up(struct sk_buff *skb)
{
struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
struct net_bridge *br = netdev_priv(brdev);
+ struct net_bridge_vlan_group *vg;
struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
- struct net_port_vlans *pv;
u64_stats_update_begin(&brstats->syncp);
brstats->rx_packets++;
brstats->rx_bytes += skb->len;
u64_stats_update_end(&brstats->syncp);
+ vg = br_vlan_group_rcu(br);
/* Bridge is just like any other port. Make sure the
* packet is allowed except in promisc mode when someone
* may be running packet capture.
*/
- pv = br_get_vlan_info(br);
if (!(brdev->flags & IFF_PROMISC) &&
- !br_allowed_egress(br, pv, skb)) {
+ !br_allowed_egress(vg, skb)) {
kfree_skb(skb);
return NET_RX_DROP;
}
indev = skb->dev;
skb->dev = brdev;
- skb = br_handle_vlan(br, pv, skb);
+ skb = br_handle_vlan(br, vg, skb);
if (!skb)
return NET_RX_DROP;
- return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb,
- indev, NULL,
- netif_receive_skb_sk);
+ return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
+ dev_net(indev), NULL, skb, indev, NULL,
+ br_netif_receive_skb);
}
static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
@@ -120,7 +126,7 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
}
/* note: already called with rcu_read_lock */
-int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb)
+int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
@@ -134,7 +140,7 @@ int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb)
if (!p || p->state == BR_STATE_DISABLED)
goto drop;
- if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid))
+ if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid))
goto out;
/* insert into forwarding database after filtering to avoid spoofing */
@@ -208,7 +214,7 @@ drop:
EXPORT_SYMBOL_GPL(br_handle_frame_finish);
/* note: already called with rcu_read_lock */
-static int br_handle_local_finish(struct sock *sk, struct sk_buff *skb)
+static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
u16 vid = 0;
@@ -278,8 +284,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
}
/* Deliver packet to local host only */
- if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb,
- skb->dev, NULL, br_handle_local_finish)) {
+ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ br_handle_local_finish)) {
return RX_HANDLER_CONSUMED; /* consumed by filter */
} else {
*pskb = skb;
@@ -303,8 +310,8 @@ forward:
if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, NULL, skb,
- skb->dev, NULL,
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
br_handle_frame_finish);
break;
default:
diff --git a/kernel/net/bridge/br_ioctl.c b/kernel/net/bridge/br_ioctl.c
index 8d423bc64..263b4de4d 100644
--- a/kernel/net/bridge/br_ioctl.c
+++ b/kernel/net/bridge/br_ioctl.c
@@ -200,8 +200,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- br->ageing_time = clock_t_to_jiffies(args[1]);
- return 0;
+ return br_set_ageing_time(br, args[1]);
case BRCTL_GET_PORT_INFO:
{
diff --git a/kernel/net/bridge/br_mdb.c b/kernel/net/bridge/br_mdb.c
index d1f910c0d..cd8deea2d 100644
--- a/kernel/net/bridge/br_mdb.c
+++ b/kernel/net/bridge/br_mdb.c
@@ -85,6 +85,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
memset(&e, 0, sizeof(e));
e.ifindex = port->dev->ifindex;
e.state = p->state;
+ e.vid = p->addr.vid;
if (p->addr.proto == htons(ETH_P_IP))
e.addr.u.ip4 = p->addr.u.ip4;
#if IS_ENABLED(CONFIG_IPV6)
@@ -230,7 +231,7 @@ errout:
}
void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type)
+ struct br_ip *group, int type, u8 state)
{
struct br_mdb_entry entry;
@@ -241,9 +242,78 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
#if IS_ENABLED(CONFIG_IPV6)
entry.addr.u.ip6 = group->u.ip6;
#endif
+ entry.state = state;
+ entry.vid = group->vid;
__br_mdb_notify(dev, &entry, type);
}
+static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ int ifindex, u32 pid,
+ u32 seq, int type, unsigned int flags)
+{
+ struct br_port_msg *bpm;
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->family = AF_BRIDGE;
+ bpm->ifindex = dev->ifindex;
+ nest = nla_nest_start(skb, MDBA_ROUTER);
+ if (!nest)
+ goto cancel;
+
+ if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex))
+ goto end;
+
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+end:
+ nla_nest_end(skb, nest);
+cancel:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t rtnl_rtr_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct br_port_msg))
+ + nla_total_size(sizeof(__u32));
+}
+
+void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
+ int type)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ int ifindex;
+
+ ifindex = port ? port->dev->ifindex : 0;
+ skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
{
if (entry->ifindex == 0)
@@ -263,6 +333,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
return false;
if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
return false;
+ if (entry->vid >= VLAN_VID_MASK)
+ return false;
return true;
}
@@ -323,6 +395,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
struct net_bridge_mdb_htable *mdb;
+ unsigned long now = jiffies;
int err;
mdb = mlock_dereference(br->mdb, br);
@@ -347,6 +420,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
if (unlikely(!p))
return -ENOMEM;
rcu_assign_pointer(*pp, p);
+ if (state == MDB_TEMPORARY)
+ mod_timer(&p->timer, now + br->multicast_membership_interval);
return 0;
}
@@ -371,6 +446,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
return -EINVAL;
memset(&ip, 0, sizeof(ip));
+ ip.vid = entry->vid;
ip.proto = entry->addr.proto;
if (ip.proto == htons(ETH_P_IP))
ip.u.ip4 = entry->addr.u.ip4;
@@ -388,8 +464,11 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
+ struct net_bridge_vlan_group *vg;
+ struct net_device *dev, *pdev;
struct br_mdb_entry *entry;
- struct net_device *dev;
+ struct net_bridge_port *p;
+ struct net_bridge_vlan *v;
struct net_bridge *br;
int err;
@@ -399,9 +478,32 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
br = netdev_priv(dev);
- err = __br_mdb_add(net, br, entry);
- if (!err)
- __br_mdb_notify(dev, entry, RTM_NEWMDB);
+ /* If vlan filtering is enabled and VLAN is not specified
+ * install mdb entry on all vlans configured on the port.
+ */
+ pdev = __dev_get_by_index(net, entry->ifindex);
+ if (!pdev)
+ return -ENODEV;
+
+ p = br_port_get_rtnl(pdev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+
+ vg = nbp_vlan_group(p);
+ if (br_vlan_enabled(br) && vg && entry->vid == 0) {
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ entry->vid = v->vid;
+ err = __br_mdb_add(net, br, entry);
+ if (err)
+ break;
+ __br_mdb_notify(dev, entry, RTM_NEWMDB);
+ }
+ } else {
+ err = __br_mdb_add(net, br, entry);
+ if (!err)
+ __br_mdb_notify(dev, entry, RTM_NEWMDB);
+ }
+
return err;
}
@@ -418,20 +520,14 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
return -EINVAL;
memset(&ip, 0, sizeof(ip));
+ ip.vid = entry->vid;
ip.proto = entry->addr.proto;
- if (ip.proto == htons(ETH_P_IP)) {
- if (timer_pending(&br->ip4_other_query.timer))
- return -EBUSY;
-
+ if (ip.proto == htons(ETH_P_IP))
ip.u.ip4 = entry->addr.u.ip4;
#if IS_ENABLED(CONFIG_IPV6)
- } else {
- if (timer_pending(&br->ip6_other_query.timer))
- return -EBUSY;
-
+ else
ip.u.ip6 = entry->addr.u.ip6;
#endif
- }
spin_lock_bh(&br->multicast_lock);
mdb = mlock_dereference(br->mdb, br);
@@ -449,6 +545,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
if (p->port->state == BR_STATE_DISABLED)
goto unlock;
+ entry->state = p->state;
rcu_assign_pointer(*pp, p->next);
hlist_del_init(&p->mglist);
del_timer(&p->timer);
@@ -468,8 +565,12 @@ unlock:
static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- struct net_device *dev;
+ struct net *net = sock_net(skb->sk);
+ struct net_bridge_vlan_group *vg;
+ struct net_device *dev, *pdev;
struct br_mdb_entry *entry;
+ struct net_bridge_port *p;
+ struct net_bridge_vlan *v;
struct net_bridge *br;
int err;
@@ -479,9 +580,31 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
br = netdev_priv(dev);
- err = __br_mdb_del(br, entry);
- if (!err)
- __br_mdb_notify(dev, entry, RTM_DELMDB);
+ /* If vlan filtering is enabled and VLAN is not specified
+ * delete mdb entry on all vlans configured on the port.
+ */
+ pdev = __dev_get_by_index(net, entry->ifindex);
+ if (!pdev)
+ return -ENODEV;
+
+ p = br_port_get_rtnl(pdev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+
+ vg = nbp_vlan_group(p);
+ if (br_vlan_enabled(br) && vg && entry->vid == 0) {
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ entry->vid = v->vid;
+ err = __br_mdb_del(br, entry);
+ if (!err)
+ __br_mdb_notify(dev, entry, RTM_DELMDB);
+ }
+ } else {
+ err = __br_mdb_del(br, entry);
+ if (!err)
+ __br_mdb_notify(dev, entry, RTM_DELMDB);
+ }
+
return err;
}
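The br_mdb.c changes above make MDB entries VLAN-aware: the VID travels in the netlink entry, and when VLAN filtering is enabled and no VID is given, br_mdb_add()/br_mdb_del() repeat the operation for every VLAN configured on the target port. Router-port changes are additionally announced through br_rtr_notify(). A hedged sketch of the fan-out loop; my_mdb_fanout and do_one are stand-ins for the patch's inline code and __br_mdb_add/__br_mdb_del:

#include "br_private.h"

static int my_mdb_fanout(struct net_bridge *br, struct net_bridge_port *p,
			 struct br_mdb_entry *entry,
			 int (*do_one)(struct net_bridge *, struct br_mdb_entry *))
{
	struct net_bridge_vlan_group *vg = nbp_vlan_group(p);
	struct net_bridge_vlan *v;
	int err = 0;

	/* With filtering off, no VLANs, or an explicit VID: run once. */
	if (!br_vlan_enabled(br) || !vg || entry->vid)
		return do_one(br, entry);

	list_for_each_entry(v, &vg->vlan_list, vlist) {
		entry->vid = v->vid;
		err = do_one(br, entry);
		if (err)
			break;
	}
	return err;
}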
diff --git a/kernel/net/bridge/br_multicast.c b/kernel/net/bridge/br_multicast.c
index 9ba383f5b..03661d974 100644
--- a/kernel/net/bridge/br_multicast.c
+++ b/kernel/net/bridge/br_multicast.c
@@ -37,6 +37,18 @@
static void br_multicast_start_querier(struct net_bridge *br,
struct bridge_mcast_own_query *query);
+static void br_multicast_add_router(struct net_bridge *br,
+ struct net_bridge_port *port);
+static void br_ip4_multicast_leave_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ __be32 group,
+ __u16 vid);
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_leave_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ const struct in6_addr *group,
+ __u16 vid);
+#endif
unsigned int br_mdb_rehash_seq;
static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b)
@@ -271,6 +283,8 @@ static void br_multicast_del_pg(struct net_bridge *br,
rcu_assign_pointer(*pp, p->next);
hlist_del_init(&p->mglist);
del_timer(&p->timer);
+ br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
+ p->state);
call_rcu_bh(&p->rcu, br_multicast_free_pg);
if (!mp->ports && !mp->mglist &&
@@ -692,7 +706,7 @@ static int br_multicast_add_group(struct net_bridge *br,
if (unlikely(!p))
goto err;
rcu_assign_pointer(*pp, p);
- br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
+ br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY);
found:
mod_timer(&p->timer, now + br->multicast_membership_interval);
@@ -752,6 +766,7 @@ static void br_multicast_router_expired(unsigned long data)
goto out;
hlist_del_init_rcu(&port->rlist);
+ br_rtr_notify(br->dev, port, RTM_DELMDB);
out:
spin_unlock(&br->multicast_lock);
@@ -814,8 +829,8 @@ static void __br_multicast_send_query(struct net_bridge *br,
if (port) {
skb->dev = port->dev;
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb,
- NULL, skb->dev,
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
+ dev_net(port->dev), NULL, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
} else {
br_multicast_select_own_querier(br, ip, skb);
@@ -912,6 +927,15 @@ void br_multicast_add_port(struct net_bridge_port *port)
void br_multicast_del_port(struct net_bridge_port *port)
{
+ struct net_bridge *br = port->br;
+ struct net_bridge_port_group *pg;
+ struct hlist_node *n;
+
+ /* Take care of the remaining groups, only perm ones should be left */
+ spin_lock_bh(&br->multicast_lock);
+ hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
+ br_multicast_del_pg(br, pg);
+ spin_unlock_bh(&br->multicast_lock);
del_timer_sync(&port->multicast_router_timer);
}
@@ -936,6 +960,8 @@ void br_multicast_enable_port(struct net_bridge_port *port)
#if IS_ENABLED(CONFIG_IPV6)
br_multicast_enable(&port->ip6_own_query);
#endif
+ if (port->multicast_router == 2 && hlist_unhashed(&port->rlist))
+ br_multicast_add_router(br, port);
out:
spin_unlock(&br->multicast_lock);
@@ -949,10 +975,13 @@ void br_multicast_disable_port(struct net_bridge_port *port)
spin_lock(&br->multicast_lock);
hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
- br_multicast_del_pg(br, pg);
+ if (pg->state == MDB_TEMPORARY)
+ br_multicast_del_pg(br, pg);
- if (!hlist_unhashed(&port->rlist))
+ if (!hlist_unhashed(&port->rlist)) {
hlist_del_init_rcu(&port->rlist);
+ br_rtr_notify(br->dev, port, RTM_DELMDB);
+ }
del_timer(&port->multicast_router_timer);
del_timer(&port->ip4_own_query.timer);
#if IS_ENABLED(CONFIG_IPV6)
@@ -975,9 +1004,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
int err = 0;
__be32 group;
- if (!pskb_may_pull(skb, sizeof(*ih)))
- return -EINVAL;
-
ih = igmpv3_report_hdr(skb);
num = ntohs(ih->ngrec);
len = skb_transport_offset(skb) + sizeof(*ih);
@@ -1009,9 +1035,15 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
continue;
}
- err = br_ip4_multicast_add_group(br, port, group, vid);
- if (err)
- break;
+ if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
+ type == IGMPV3_MODE_IS_INCLUDE) &&
+ ntohs(grec->grec_nsrcs) == 0) {
+ br_ip4_multicast_leave_group(br, port, group, vid);
+ } else {
+ err = br_ip4_multicast_add_group(br, port, group, vid);
+ if (err)
+ break;
+ }
}
return err;
@@ -1070,10 +1102,17 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
continue;
}
- err = br_ip6_multicast_add_group(br, port, &grec->grec_mca,
- vid);
- if (err)
- break;
+ if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
+ grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
+ ntohs(*nsrcs) == 0) {
+ br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
+ vid);
+ } else {
+ err = br_ip6_multicast_add_group(br, port,
+ &grec->grec_mca, vid);
+ if (!err)
+ break;
+ }
}
return err;
@@ -1180,6 +1219,7 @@ static void br_multicast_add_router(struct net_bridge *br,
hlist_add_behind_rcu(&port->rlist, slot);
else
hlist_add_head_rcu(&port->rlist, &br->router_list);
+ br_rtr_notify(br->dev, port, RTM_NEWMDB);
}
static void br_multicast_mark_router(struct net_bridge *br,
@@ -1247,25 +1287,14 @@ static int br_ip4_multicast_query(struct net_bridge *br,
max_delay = 10 * HZ;
group = 0;
}
- } else {
- if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) {
- err = -EINVAL;
- goto out;
- }
-
+ } else if (skb->len >= sizeof(*ih3)) {
ih3 = igmpv3_query_hdr(skb);
if (ih3->nsrcs)
goto out;
max_delay = ih3->code ?
IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1;
- }
-
- /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
- * all-systems destination addresses (224.0.0.1) for general queries
- */
- if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) {
- err = -EINVAL;
+ } else {
goto out;
}
@@ -1328,12 +1357,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
(port && port->state == BR_STATE_DISABLED))
goto out;
- /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
- if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) {
- err = -EINVAL;
- goto out;
- }
-
if (skb->len == sizeof(*mld)) {
if (!pskb_may_pull(skb, sizeof(*mld))) {
err = -EINVAL;
@@ -1357,14 +1380,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
is_general_query = group && ipv6_addr_any(group);
- /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer
- * all-nodes destination address (ff02::1) for general queries
- */
- if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) {
- err = -EINVAL;
- goto out;
- }
-
if (is_general_query) {
saddr.proto = htons(ETH_P_IPV6);
saddr.u.ip6 = ip6h->saddr;
@@ -1417,8 +1432,7 @@ br_multicast_leave_group(struct net_bridge *br,
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) ||
- (port && port->state == BR_STATE_DISABLED) ||
- timer_pending(&other_query->timer))
+ (port && port->state == BR_STATE_DISABLED))
goto out;
mdb = mlock_dereference(br->mdb, br);
@@ -1426,6 +1440,32 @@ br_multicast_leave_group(struct net_bridge *br,
if (!mp)
goto out;
+ if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) {
+ struct net_bridge_port_group __rcu **pp;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p->port != port)
+ continue;
+
+ rcu_assign_pointer(*pp, p->next);
+ hlist_del_init(&p->mglist);
+ del_timer(&p->timer);
+ call_rcu_bh(&p->rcu, br_multicast_free_pg);
+ br_mdb_notify(br->dev, port, group, RTM_DELMDB,
+ p->state);
+
+ if (!mp->ports && !mp->mglist &&
+ netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+ }
+ goto out;
+ }
+
+ if (timer_pending(&other_query->timer))
+ goto out;
+
if (br->multicast_querier) {
__br_multicast_send_query(br, port, &mp->addr);
@@ -1451,28 +1491,6 @@ br_multicast_leave_group(struct net_bridge *br,
}
}
- if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) {
- struct net_bridge_port_group __rcu **pp;
-
- for (pp = &mp->ports;
- (p = mlock_dereference(*pp, br)) != NULL;
- pp = &p->next) {
- if (p->port != port)
- continue;
-
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- call_rcu_bh(&p->rcu, br_multicast_free_pg);
- br_mdb_notify(br->dev, port, group, RTM_DELMDB);
-
- if (!mp->ports && !mp->mglist &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
- }
- goto out;
- }
-
now = jiffies;
time = now + br->multicast_last_member_count *
br->multicast_last_member_interval;
@@ -1556,74 +1574,22 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
struct sk_buff *skb,
u16 vid)
{
- struct sk_buff *skb2 = skb;
- const struct iphdr *iph;
+ struct sk_buff *skb_trimmed = NULL;
struct igmphdr *ih;
- unsigned int len;
- unsigned int offset;
int err;
- /* We treat OOM as packet loss for now. */
- if (!pskb_may_pull(skb, sizeof(*iph)))
- return -EINVAL;
-
- iph = ip_hdr(skb);
-
- if (iph->ihl < 5 || iph->version != 4)
- return -EINVAL;
-
- if (!pskb_may_pull(skb, ip_hdrlen(skb)))
- return -EINVAL;
-
- iph = ip_hdr(skb);
+ err = ip_mc_check_igmp(skb, &skb_trimmed);
- if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
- return -EINVAL;
-
- if (iph->protocol != IPPROTO_IGMP) {
- if (!ipv4_is_local_multicast(iph->daddr))
+ if (err == -ENOMSG) {
+ if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr))
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
return 0;
+ } else if (err < 0) {
+ return err;
}
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < ip_hdrlen(skb))
- return -EINVAL;
-
- if (skb->len > len) {
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (!skb2)
- return -ENOMEM;
-
- err = pskb_trim_rcsum(skb2, len);
- if (err)
- goto err_out;
- }
-
- len -= ip_hdrlen(skb2);
- offset = skb_network_offset(skb2) + ip_hdrlen(skb2);
- __skb_pull(skb2, offset);
- skb_reset_transport_header(skb2);
-
- err = -EINVAL;
- if (!pskb_may_pull(skb2, sizeof(*ih)))
- goto out;
-
- switch (skb2->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb2->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb2->csum = 0;
- if (skb_checksum_complete(skb2))
- goto out;
- }
-
- err = 0;
-
BR_INPUT_SKB_CB(skb)->igmp = 1;
- ih = igmp_hdr(skb2);
+ ih = igmp_hdr(skb);
switch (ih->type) {
case IGMP_HOST_MEMBERSHIP_REPORT:
@@ -1632,21 +1598,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
err = br_ip4_multicast_add_group(br, port, ih->group, vid);
break;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
- err = br_ip4_multicast_igmp3_report(br, port, skb2, vid);
+ err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
break;
case IGMP_HOST_MEMBERSHIP_QUERY:
- err = br_ip4_multicast_query(br, port, skb2, vid);
+ err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
break;
case IGMP_HOST_LEAVE_MESSAGE:
br_ip4_multicast_leave_group(br, port, ih->group, vid);
break;
}
-out:
- __skb_push(skb2, offset);
-err_out:
- if (skb2 != skb)
- kfree_skb(skb2);
+ if (skb_trimmed && skb_trimmed != skb)
+ kfree_skb(skb_trimmed);
+
return err;
}
@@ -1656,138 +1620,42 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
struct sk_buff *skb,
u16 vid)
{
- struct sk_buff *skb2;
- const struct ipv6hdr *ip6h;
- u8 icmp6_type;
- u8 nexthdr;
- __be16 frag_off;
- unsigned int len;
- int offset;
+ struct sk_buff *skb_trimmed = NULL;
+ struct mld_msg *mld;
int err;
- if (!pskb_may_pull(skb, sizeof(*ip6h)))
- return -EINVAL;
-
- ip6h = ipv6_hdr(skb);
-
- /*
- * We're interested in MLD messages only.
- * - Version is 6
- * - MLD has always Router Alert hop-by-hop option
- * - But we do not support jumbrograms.
- */
- if (ip6h->version != 6)
- return 0;
-
- /* Prevent flooding this packet if there is no listener present */
- if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr))
- BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
-
- if (ip6h->nexthdr != IPPROTO_HOPOPTS ||
- ip6h->payload_len == 0)
- return 0;
-
- len = ntohs(ip6h->payload_len) + sizeof(*ip6h);
- if (skb->len < len)
- return -EINVAL;
-
- nexthdr = ip6h->nexthdr;
- offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off);
+ err = ipv6_mc_check_mld(skb, &skb_trimmed);
- if (offset < 0 || nexthdr != IPPROTO_ICMPV6)
+ if (err == -ENOMSG) {
+ if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
return 0;
-
- /* Okay, we found ICMPv6 header */
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (!skb2)
- return -ENOMEM;
-
- err = -EINVAL;
- if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr)))
- goto out;
-
- len -= offset - skb_network_offset(skb2);
-
- __skb_pull(skb2, offset);
- skb_reset_transport_header(skb2);
- skb_postpull_rcsum(skb2, skb_network_header(skb2),
- skb_network_header_len(skb2));
-
- icmp6_type = icmp6_hdr(skb2)->icmp6_type;
-
- switch (icmp6_type) {
- case ICMPV6_MGM_QUERY:
- case ICMPV6_MGM_REPORT:
- case ICMPV6_MGM_REDUCTION:
- case ICMPV6_MLD2_REPORT:
- break;
- default:
- err = 0;
- goto out;
- }
-
- /* Okay, we found MLD message. Check further. */
- if (skb2->len > len) {
- err = pskb_trim_rcsum(skb2, len);
- if (err)
- goto out;
- err = -EINVAL;
- }
-
- ip6h = ipv6_hdr(skb2);
-
- switch (skb2->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb2->len,
- IPPROTO_ICMPV6, skb2->csum))
- break;
- /*FALLTHROUGH*/
- case CHECKSUM_NONE:
- skb2->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
- &ip6h->daddr,
- skb2->len,
- IPPROTO_ICMPV6, 0));
- if (__skb_checksum_complete(skb2))
- goto out;
+ } else if (err < 0) {
+ return err;
}
- err = 0;
-
BR_INPUT_SKB_CB(skb)->igmp = 1;
+ mld = (struct mld_msg *)skb_transport_header(skb);
- switch (icmp6_type) {
+ switch (mld->mld_type) {
case ICMPV6_MGM_REPORT:
- {
- struct mld_msg *mld;
- if (!pskb_may_pull(skb2, sizeof(*mld))) {
- err = -EINVAL;
- goto out;
- }
- mld = (struct mld_msg *)skb_transport_header(skb2);
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid);
break;
- }
case ICMPV6_MLD2_REPORT:
- err = br_ip6_multicast_mld2_report(br, port, skb2, vid);
+ err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
break;
case ICMPV6_MGM_QUERY:
- err = br_ip6_multicast_query(br, port, skb2, vid);
+ err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
break;
case ICMPV6_MGM_REDUCTION:
- {
- struct mld_msg *mld;
- if (!pskb_may_pull(skb2, sizeof(*mld))) {
- err = -EINVAL;
- goto out;
- }
- mld = (struct mld_msg *)skb_transport_header(skb2);
br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid);
- }
+ break;
}
-out:
- kfree_skb(skb2);
+ if (skb_trimmed && skb_trimmed != skb)
+ kfree_skb(skb_trimmed);
+
return err;
}
#endif
@@ -1903,12 +1771,6 @@ void br_multicast_open(struct net_bridge *br)
void br_multicast_stop(struct net_bridge *br)
{
- struct net_bridge_mdb_htable *mdb;
- struct net_bridge_mdb_entry *mp;
- struct hlist_node *n;
- u32 ver;
- int i;
-
del_timer_sync(&br->multicast_router_timer);
del_timer_sync(&br->ip4_other_query.timer);
del_timer_sync(&br->ip4_own_query.timer);
@@ -1916,6 +1778,15 @@ void br_multicast_stop(struct net_bridge *br)
del_timer_sync(&br->ip6_other_query.timer);
del_timer_sync(&br->ip6_own_query.timer);
#endif
+}
+
+void br_multicast_dev_del(struct net_bridge *br)
+{
+ struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
+ struct hlist_node *n;
+ u32 ver;
+ int i;
spin_lock_bh(&br->multicast_lock);
mdb = mlock_dereference(br->mdb, br);
@@ -1949,11 +1820,9 @@ out:
int br_multicast_set_router(struct net_bridge *br, unsigned long val)
{
- int err = -ENOENT;
+ int err = -EINVAL;
spin_lock_bh(&br->multicast_lock);
- if (!netif_running(br->dev))
- goto unlock;
switch (val) {
case 0:
@@ -1964,13 +1833,8 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
br->multicast_router = val;
err = 0;
break;
-
- default:
- err = -EINVAL;
- break;
}
-unlock:
spin_unlock_bh(&br->multicast_lock);
return err;
@@ -1979,11 +1843,9 @@ unlock:
int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
{
struct net_bridge *br = p->br;
- int err = -ENOENT;
+ int err = -EINVAL;
spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) || p->state == BR_STATE_DISABLED)
- goto unlock;
switch (val) {
case 0:
@@ -1992,8 +1854,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
p->multicast_router = val;
err = 0;
- if (val < 2 && !hlist_unhashed(&p->rlist))
+ if (val < 2 && !hlist_unhashed(&p->rlist)) {
hlist_del_init_rcu(&p->rlist);
+ br_rtr_notify(br->dev, p, RTM_DELMDB);
+ }
if (val == 1)
break;
@@ -2005,13 +1869,8 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
br_multicast_add_router(br, p);
break;
-
- default:
- err = -EINVAL;
- break;
}
-unlock:
spin_unlock(&br->multicast_lock);
return err;
@@ -2116,15 +1975,11 @@ unlock:
int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val)
{
- int err = -ENOENT;
+ int err = -EINVAL;
u32 old;
struct net_bridge_mdb_htable *mdb;
spin_lock_bh(&br->multicast_lock);
- if (!netif_running(br->dev))
- goto unlock;
-
- err = -EINVAL;
if (!is_power_of_2(val))
goto unlock;
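The largest br_multicast.c simplification above replaces the open-coded IGMP/MLD header, length and checksum validation with the ip_mc_check_igmp()/ipv6_mc_check_mld() helpers. They return 0 with an optionally trimmed clone in *skb_trimmed, -ENOMSG for a valid packet that simply is not IGMP/MLD, and a negative error for malformed input. A minimal sketch of the IPv4 side of that flow; my_igmp_rcv is illustrative only:

#include <linux/igmp.h>
#include <linux/skbuff.h>
#include <net/ip.h>

static int my_igmp_rcv(struct sk_buff *skb)
{
	struct sk_buff *skb_trimmed = NULL;
	int err;

	err = ip_mc_check_igmp(skb, &skb_trimmed);
	if (err == -ENOMSG)
		return 0;	/* valid IP, just not IGMP: let it flood */
	if (err < 0)
		return err;	/* malformed, caller drops */

	/* ... dispatch on igmp_hdr(skb)->type, parsing reports/queries
	 * from skb_trimmed as br_multicast_ipv4_rcv() does ... */

	if (skb_trimmed && skb_trimmed != skb)
		kfree_skb(skb_trimmed);
	return 0;
}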
diff --git a/kernel/net/bridge/br_netfilter.c b/kernel/net/bridge/br_netfilter_hooks.c
index 60ddfbeb4..7ddbe7ec8 100644
--- a/kernel/net/bridge/br_netfilter.c
+++ b/kernel/net/bridge/br_netfilter_hooks.c
@@ -34,6 +34,7 @@
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/addrconf.h>
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
@@ -48,9 +49,9 @@ static struct ctl_table_header *brnf_sysctl_header;
static int brnf_call_iptables __read_mostly = 1;
static int brnf_call_ip6tables __read_mostly = 1;
static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly = 0;
-static int brnf_filter_pppoe_tagged __read_mostly = 0;
-static int brnf_pass_vlan_indev __read_mostly = 0;
+static int brnf_filter_vlan_tagged __read_mostly;
+static int brnf_filter_pppoe_tagged __read_mostly;
+static int brnf_pass_vlan_indev __read_mostly;
#else
#define brnf_call_iptables 1
#define brnf_call_ip6tables 1
@@ -110,27 +111,22 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
/* largest possible L2 header, see br_nf_dev_queue_xmit() */
#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
struct brnf_frag_data {
char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
u8 encap_size;
u8 size;
+ u16 vlan_tci;
+ __be16 vlan_proto;
};
static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage);
-#endif
-static struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb)
+static void nf_bridge_info_free(struct sk_buff *skb)
{
- return skb->nf_bridge;
-}
-
-static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
-{
- struct net_bridge_port *port;
-
- port = br_port_get_rcu(dev);
- return port ? &port->br->fake_rtable : NULL;
+ if (skb->nf_bridge) {
+ nf_bridge_put(skb->nf_bridge);
+ skb->nf_bridge = NULL;
+ }
}
static inline struct net_device *bridge_parent(const struct net_device *dev)
@@ -141,15 +137,6 @@ static inline struct net_device *bridge_parent(const struct net_device *dev)
return port ? port->br->dev : NULL;
}
-static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
-{
- skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC);
- if (likely(skb->nf_bridge))
- atomic_set(&(skb->nf_bridge->use), 1);
-
- return skb->nf_bridge;
-}
-
static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
{
struct nf_bridge_info *nf_bridge = skb->nf_bridge;
@@ -167,7 +154,7 @@ static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
return nf_bridge;
}
-static unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
+unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
{
switch (skb->protocol) {
case __cpu_to_be16(ETH_P_8021Q):
@@ -179,14 +166,6 @@ static unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
}
}
-static inline void nf_bridge_push_encap_header(struct sk_buff *skb)
-{
- unsigned int len = nf_bridge_encap_header_len(skb);
-
- skb_push(skb, len);
- skb->network_header -= len;
-}
-
static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
{
unsigned int len = nf_bridge_encap_header_len(skb);
@@ -208,10 +187,9 @@ static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
* expected format
*/
-static int br_parse_ip_options(struct sk_buff *skb)
+static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
{
const struct iphdr *iph;
- struct net_device *dev = skb->dev;
u32 len;
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
@@ -232,13 +210,13 @@ static int br_parse_ip_options(struct sk_buff *skb)
len = ntohs(iph->tot_len);
if (skb->len < len) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -251,12 +229,12 @@ static int br_parse_ip_options(struct sk_buff *skb)
return 0;
inhdr_error:
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
drop:
return -1;
}
-static void nf_bridge_update_protocol(struct sk_buff *skb)
+void nf_bridge_update_protocol(struct sk_buff *skb)
{
switch (skb->nf_bridge->orig_proto) {
case BRNF_PROTO_8021Q:
@@ -270,43 +248,12 @@ static void nf_bridge_update_protocol(struct sk_buff *skb)
}
}
-/* PF_BRIDGE/PRE_ROUTING *********************************************/
-/* Undo the changes made for ip6tables PREROUTING and continue the
- * bridge PRE_ROUTING hook. */
-static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
-{
- struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
- struct rtable *rt;
-
- if (nf_bridge->pkt_otherhost) {
- skb->pkt_type = PACKET_OTHERHOST;
- nf_bridge->pkt_otherhost = false;
- }
- nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
-
- rt = bridge_parent_rtable(nf_bridge->physindev);
- if (!rt) {
- kfree_skb(skb);
- return 0;
- }
- skb_dst_set_noref(skb, &rt->dst);
-
- skb->dev = nf_bridge->physindev;
- nf_bridge_update_protocol(skb);
- nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
- skb->dev, NULL,
- br_handle_frame_finish, 1);
-
- return 0;
-}
-
/* Obtain the correct destination MAC address, while preserving the original
* source MAC address. If we already know this address, we just copy it. If we
* don't, we use the neighbour framework to find out. In both cases, we make
* sure that br_handle_frame_finish() is called afterwards.
*/
-static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
+int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct neighbour *neigh;
struct dst_entry *dst;
@@ -323,7 +270,7 @@ static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
if (neigh->hh.hh_len) {
neigh_hh_bridge(&neigh->hh, skb);
skb->dev = nf_bridge->physindev;
- ret = br_handle_frame_finish(sk, skb);
+ ret = br_handle_frame_finish(net, sk, skb);
} else {
/* the neighbour function below overwrites the complete
* MAC header, so we save the Ethernet source address and
@@ -334,7 +281,7 @@ static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
nf_bridge->neigh_header,
ETH_HLEN-ETH_ALEN);
/* tell br_dev_xmit to continue with forwarding */
- nf_bridge->mask |= BRNF_BRIDGED_DNAT;
+ nf_bridge->bridged_dnat = 1;
/* FIXME Need to refragment */
ret = neigh->output(neigh, skb);
}
@@ -346,8 +293,9 @@ free_skb:
return 0;
}
-static bool daddr_was_changed(const struct sk_buff *skb,
- const struct nf_bridge_info *nf_bridge)
+static inline bool
+br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb,
+ const struct nf_bridge_info *nf_bridge)
{
return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
}
@@ -391,24 +339,22 @@ static bool daddr_was_changed(const struct sk_buff *skb,
* device, we proceed as if ip_route_input() succeeded. If it differs from the
* logical bridge port or if ip_route_output_key() fails we drop the packet.
*/
-static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
+static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct iphdr *iph = ip_hdr(skb);
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
struct rtable *rt;
int err;
- int frag_max_size;
- frag_max_size = IPCB(skb)->frag_max_size;
- BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
+ nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
if (nf_bridge->pkt_otherhost) {
skb->pkt_type = PACKET_OTHERHOST;
nf_bridge->pkt_otherhost = false;
}
- nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
- if (daddr_was_changed(skb, nf_bridge)) {
+ nf_bridge->in_prerouting = 0;
+ if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -422,7 +368,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
goto free_skb;
- rt = ip_route_output(dev_net(dev), iph->daddr, 0,
+ rt = ip_route_output(net, iph->daddr, 0,
RT_TOS(iph->tos), 0);
if (!IS_ERR(rt)) {
/* - Bridged-and-DNAT'ed traffic doesn't
@@ -444,7 +390,7 @@ bridged_dnat:
nf_bridge_push_encap_header(skb);
NF_HOOK_THRESH(NFPROTO_BRIDGE,
NF_BR_PRE_ROUTING,
- sk, skb, skb->dev, NULL,
+ net, sk, skb, skb->dev, NULL,
br_nf_pre_routing_finish_bridge,
1);
return 0;
@@ -464,7 +410,7 @@ bridged_dnat:
skb->dev = nf_bridge->physindev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
+ NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb,
skb->dev, NULL,
br_handle_frame_finish, 1);
@@ -486,7 +432,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
}
/* Some common code for IPv4/IPv6 */
-static struct net_device *setup_pre_routing(struct sk_buff *skb)
+struct net_device *setup_pre_routing(struct sk_buff *skb)
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
@@ -495,7 +441,7 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
nf_bridge->pkt_otherhost = true;
}
- nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
+ nf_bridge->in_prerouting = 1;
nf_bridge->physindev = skb->dev;
skb->dev = brnf_get_logical_dev(skb, skb->dev);
@@ -509,113 +455,13 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
return skb->dev;
}
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */
-static int check_hbh_len(struct sk_buff *skb)
-{
- unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
- u32 pkt_len;
- const unsigned char *nh = skb_network_header(skb);
- int off = raw - nh;
- int len = (raw[1] + 1) << 3;
-
- if ((raw + len) - skb->data > skb_headlen(skb))
- goto bad;
-
- off += 2;
- len -= 2;
-
- while (len > 0) {
- int optlen = nh[off + 1] + 2;
-
- switch (nh[off]) {
- case IPV6_TLV_PAD1:
- optlen = 1;
- break;
-
- case IPV6_TLV_PADN:
- break;
-
- case IPV6_TLV_JUMBO:
- if (nh[off + 1] != 4 || (off & 3) != 2)
- goto bad;
- pkt_len = ntohl(*(__be32 *) (nh + off + 2));
- if (pkt_len <= IPV6_MAXPLEN ||
- ipv6_hdr(skb)->payload_len)
- goto bad;
- if (pkt_len > skb->len - sizeof(struct ipv6hdr))
- goto bad;
- if (pskb_trim_rcsum(skb,
- pkt_len + sizeof(struct ipv6hdr)))
- goto bad;
- nh = skb_network_header(skb);
- break;
- default:
- if (optlen > len)
- goto bad;
- break;
- }
- off += optlen;
- len -= optlen;
- }
- if (len == 0)
- return 0;
-bad:
- return -1;
-
-}
-
-/* Replicate the checks that IPv6 does on packet reception and pass the packet
- * to ip6tables, which doesn't support NAT, so things are fairly simple. */
-static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- const struct ipv6hdr *hdr;
- u32 pkt_len;
-
- if (skb->len < sizeof(struct ipv6hdr))
- return NF_DROP;
-
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- return NF_DROP;
-
- hdr = ipv6_hdr(skb);
-
- if (hdr->version != 6)
- return NF_DROP;
-
- pkt_len = ntohs(hdr->payload_len);
-
- if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
- if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
- return NF_DROP;
- if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
- return NF_DROP;
- }
- if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
- return NF_DROP;
-
- nf_bridge_put(skb->nf_bridge);
- if (!nf_bridge_alloc(skb))
- return NF_DROP;
- if (!setup_pre_routing(skb))
- return NF_DROP;
-
- skb->protocol = htons(ETH_P_IPV6);
- NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb,
- skb->dev, NULL,
- br_nf_pre_routing_finish_ipv6);
-
- return NF_STOLEN;
-}
-
/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
* Replicate the checks that IPv4 does on packet reception.
* Set skb->dev to the bridge device (i.e. parent of the
* receiving device) to make netfilter happy, the REDIRECT
* target in particular. Save the original destination IP
* address to be able to detect DNAT afterwards. */
-static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
+static unsigned int br_nf_pre_routing(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -637,7 +483,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
return NF_ACCEPT;
nf_bridge_pull_encap_header_rcsum(skb);
- return br_nf_pre_routing_ipv6(ops, skb, state);
+ return br_nf_pre_routing_ipv6(priv, skb, state);
}
if (!brnf_call_iptables && !br->nf_call_iptables)
@@ -648,7 +494,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
nf_bridge_pull_encap_header_rcsum(skb);
- if (br_parse_ip_options(skb))
+ if (br_validate_ipv4(state->net, skb))
return NF_DROP;
nf_bridge_put(skb->nf_bridge);
@@ -662,7 +508,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
skb->protocol = htons(ETH_P_IP);
- NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb,
skb->dev, NULL,
br_nf_pre_routing_finish);
@@ -677,7 +523,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
* took place when the packet entered the bridge), but we
* register an IPv4 PRE_ROUTING 'sabotage' hook that will
* prevent this from happening. */
-static unsigned int br_nf_local_in(const struct nf_hook_ops *ops,
+static unsigned int br_nf_local_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -686,18 +532,18 @@ static unsigned int br_nf_local_in(const struct nf_hook_ops *ops,
}
/* PF_BRIDGE/FORWARD *************************************************/
-static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
+static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
struct net_device *in;
if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
- int frag_max_size;
- if (skb->protocol == htons(ETH_P_IP)) {
- frag_max_size = IPCB(skb)->frag_max_size;
- BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
- }
+ if (skb->protocol == htons(ETH_P_IP))
+ nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
in = nf_bridge->physindev;
if (nf_bridge->pkt_otherhost) {
@@ -710,7 +556,7 @@ static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
}
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb,
+ NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, net, sk, skb,
in, skb->dev, br_forward_finish, 1);
return 0;
}
@@ -721,7 +567,7 @@ static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
* but we are still able to filter on the 'real' indev/outdev
* because of the physdev module. For ARP, indev and outdev are the
* bridge ports. */
-static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
+static unsigned int br_nf_forward_ip(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -760,12 +606,15 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
}
if (pf == NFPROTO_IPV4) {
- int frag_max = BR_INPUT_SKB_CB(skb)->frag_max_size;
-
- if (br_parse_ip_options(skb))
+ if (br_validate_ipv4(state->net, skb))
return NF_DROP;
+ IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
+ }
- IPCB(skb)->frag_max_size = frag_max;
+ if (pf == NFPROTO_IPV6) {
+ if (br_validate_ipv6(state->net, skb))
+ return NF_DROP;
+ IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
}
nf_bridge->physoutdev = skb->dev;
@@ -774,14 +623,14 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
else
skb->protocol = htons(ETH_P_IPV6);
- NF_HOOK(pf, NF_INET_FORWARD, NULL, skb,
+ NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
brnf_get_logical_dev(skb, state->in),
parent, br_nf_forward_finish);
return NF_STOLEN;
}
-static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
+static unsigned int br_nf_forward_arp(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -809,14 +658,13 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
return NF_ACCEPT;
}
*d = state->in;
- NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb,
+ NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb,
state->in, state->out, br_nf_forward_finish);
return NF_STOLEN;
}
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
-static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb)
+static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct brnf_frag_data *data;
int err;
@@ -829,59 +677,117 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb)
return 0;
}
+ if (data->vlan_tci) {
+ skb->vlan_tci = data->vlan_tci;
+ skb->vlan_proto = data->vlan_proto;
+ }
+
skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size);
__skb_push(skb, data->encap_size);
- return br_dev_queue_push_xmit(sk, skb);
+ nf_bridge_info_free(skb);
+ return br_dev_queue_push_xmit(net, sk, skb);
}
-static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
+static int
+br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
- int ret;
- int frag_max_size;
- unsigned int mtu_reserved;
+ unsigned int mtu = ip_skb_dst_mtu(skb);
+ struct iphdr *iph = ip_hdr(skb);
+
+ if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return ip_do_fragment(net, sk, skb, output);
+}
+
+static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
+{
+ if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
+ return PPPOE_SES_HLEN;
+ return 0;
+}
- if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP))
- return br_dev_queue_push_xmit(sk, skb);
+static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge;
+ unsigned int mtu_reserved;
mtu_reserved = nf_bridge_mtu_reduction(skb);
+
+ if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) {
+ nf_bridge_info_free(skb);
+ return br_dev_queue_push_xmit(net, sk, skb);
+ }
+
+ nf_bridge = nf_bridge_info_get(skb);
+
/* This is wrong! We should preserve the original fragment
* boundaries by preserving frag_list rather than refragmenting.
*/
- if (skb->len + mtu_reserved > skb->dev->mtu) {
+ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) &&
+ skb->protocol == htons(ETH_P_IP)) {
struct brnf_frag_data *data;
- frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
- if (br_parse_ip_options(skb))
- /* Drop invalid packet */
- return NF_DROP;
- IPCB(skb)->frag_max_size = frag_max_size;
+ if (br_validate_ipv4(net, skb))
+ goto drop;
+
+ IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
nf_bridge_update_protocol(skb);
data = this_cpu_ptr(&brnf_frag_data_storage);
+
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
data->encap_size = nf_bridge_encap_header_len(skb);
data->size = ETH_HLEN + data->encap_size;
skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
data->size);
- ret = ip_fragment(sk, skb, br_nf_push_frag_xmit);
- } else {
- ret = br_dev_queue_push_xmit(sk, skb);
+ return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
}
+ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) &&
+ skb->protocol == htons(ETH_P_IPV6)) {
+ const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+ struct brnf_frag_data *data;
- return ret;
-}
-#else
-static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
-{
- return br_dev_queue_push_xmit(sk, skb);
+ if (br_validate_ipv6(net, skb))
+ goto drop;
+
+ IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
+
+ nf_bridge_update_protocol(skb);
+
+ data = this_cpu_ptr(&brnf_frag_data_storage);
+ data->encap_size = nf_bridge_encap_header_len(skb);
+ data->size = ETH_HLEN + data->encap_size;
+
+ skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
+ data->size);
+
+ if (v6ops)
+ return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
+
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+ nf_bridge_info_free(skb);
+ return br_dev_queue_push_xmit(net, sk, skb);
+ drop:
+ kfree_skb(skb);
+ return 0;
}
-#endif
/* PF_BRIDGE/POST_ROUTING ********************************************/
-static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
+static unsigned int br_nf_post_routing(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -920,7 +826,7 @@ static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
else
skb->protocol = htons(ETH_P_IPV6);
- NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb,
+ NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb,
NULL, realoutdev,
br_nf_dev_queue_xmit);
@@ -930,14 +836,12 @@ static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
/* IP/SABOTAGE *****************************************************/
/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
* for the second time. */
-static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
+static unsigned int ip_sabotage_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (skb->nf_bridge &&
- !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) {
+ if (skb->nf_bridge && !skb->nf_bridge->in_prerouting)
return NF_STOP;
- }
return NF_ACCEPT;
}
@@ -956,7 +860,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
skb_pull(skb, ETH_HLEN);
- nf_bridge->mask &= ~BRNF_BRIDGED_DNAT;
+ nf_bridge->bridged_dnat = 0;
BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));
@@ -964,12 +868,14 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
nf_bridge->neigh_header,
ETH_HLEN - ETH_ALEN);
skb->dev = nf_bridge->physindev;
- br_handle_frame_finish(NULL, skb);
+
+ nf_bridge->physoutdev = NULL;
+ br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
}
static int br_nf_dev_xmit(struct sk_buff *skb)
{
- if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
+ if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) {
br_nf_pre_routing_finish_bridge_slow(skb);
return 1;
}
@@ -990,49 +896,42 @@ EXPORT_SYMBOL_GPL(br_netfilter_enable);
static struct nf_hook_ops br_nf_ops[] __read_mostly = {
{
.hook = br_nf_pre_routing,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_BRNF,
},
{
.hook = br_nf_local_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_IN,
.priority = NF_BR_PRI_BRNF,
},
{
.hook = br_nf_forward_ip,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_BRNF - 1,
},
{
.hook = br_nf_forward_arp,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_BRNF,
},
{
.hook = br_nf_post_routing,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,
.priority = NF_BR_PRI_LAST,
},
{
.hook = ip_sabotage_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_FIRST,
},
{
.hook = ip_sabotage_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP6_PRI_FIRST,
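
The hunks above adapt br_netfilter to the 4.4-era netfilter API: hook functions now take void *priv rather than const struct nf_hook_ops *, okfn continuations gain an explicit struct net * argument, and struct nf_hook_ops no longer carries an .owner field. A minimal sketch of the new shapes follows; it is illustrative only and not part of the patch, and example_hook/example_okfn are made-up names:

#include <linux/netfilter.h>
#include <linux/netfilter_bridge.h>
#include <linux/skbuff.h>

/* okfn continuations receive the network namespace explicitly. */
static int example_okfn(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* transmit or further process skb within "net" */
	return 0;
}

/* Hook entry points get their private data as void *priv; the namespace
 * and socket are reached through the hook state.
 */
static unsigned int example_hook(void *priv, struct sk_buff *skb,
				 const struct nf_hook_state *state)
{
	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk,
		skb, skb->dev, NULL, example_okfn);
	return NF_STOLEN;
}
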
diff --git a/kernel/net/bridge/br_netfilter_ipv6.c b/kernel/net/bridge/br_netfilter_ipv6.c
new file mode 100644
index 000000000..d61f56efc
--- /dev/null
+++ b/kernel/net/bridge/br_netfilter_ipv6.c
@@ -0,0 +1,244 @@
+/*
+ * Handle firewalling
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Lennert dedicates this file to Kerstin Wurdinger.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_arp.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/route.h>
+#include <net/netfilter/br_netfilter.h>
+
+#include <asm/uaccess.h>
+#include "br_private.h"
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+/* We only check the length. A bridge shouldn't do any hop-by-hop stuff
+ * anyway
+ */
+static int br_nf_check_hbh_len(struct sk_buff *skb)
+{
+ unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+ u32 pkt_len;
+ const unsigned char *nh = skb_network_header(skb);
+ int off = raw - nh;
+ int len = (raw[1] + 1) << 3;
+
+ if ((raw + len) - skb->data > skb_headlen(skb))
+ goto bad;
+
+ off += 2;
+ len -= 2;
+
+ while (len > 0) {
+ int optlen = nh[off + 1] + 2;
+
+ switch (nh[off]) {
+ case IPV6_TLV_PAD1:
+ optlen = 1;
+ break;
+
+ case IPV6_TLV_PADN:
+ break;
+
+ case IPV6_TLV_JUMBO:
+ if (nh[off + 1] != 4 || (off & 3) != 2)
+ goto bad;
+ pkt_len = ntohl(*(__be32 *)(nh + off + 2));
+ if (pkt_len <= IPV6_MAXPLEN ||
+ ipv6_hdr(skb)->payload_len)
+ goto bad;
+ if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+ goto bad;
+ if (pskb_trim_rcsum(skb,
+ pkt_len + sizeof(struct ipv6hdr)))
+ goto bad;
+ nh = skb_network_header(skb);
+ break;
+ default:
+ if (optlen > len)
+ goto bad;
+ break;
+ }
+ off += optlen;
+ len -= optlen;
+ }
+ if (len == 0)
+ return 0;
+bad:
+ return -1;
+}
+
+int br_validate_ipv6(struct net *net, struct sk_buff *skb)
+{
+ const struct ipv6hdr *hdr;
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+ u32 pkt_len;
+ u8 ip6h_len = sizeof(struct ipv6hdr);
+
+ if (!pskb_may_pull(skb, ip6h_len))
+ goto inhdr_error;
+
+ if (skb->len < ip6h_len)
+ goto drop;
+
+ hdr = ipv6_hdr(skb);
+
+ if (hdr->version != 6)
+ goto inhdr_error;
+
+ pkt_len = ntohs(hdr->payload_len);
+
+ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+ if (pkt_len + ip6h_len > skb->len) {
+ IP6_INC_STATS_BH(net, idev,
+ IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ }
+ if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+ IP6_INC_STATS_BH(net, idev,
+ IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+ }
+ if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb))
+ goto drop;
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+ /* No IP options in IPv6 header; however it should be
+ * checked if some next headers need special treatment
+ */
+ return 0;
+
+inhdr_error:
+ IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+ return -1;
+}
+
+static inline bool
+br_nf_ipv6_daddr_was_changed(const struct sk_buff *skb,
+ const struct nf_bridge_info *nf_bridge)
+{
+ return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr,
+ sizeof(ipv6_hdr(skb)->daddr)) != 0;
+}
+
+/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables
+ * PREROUTING and continue the bridge PRE_ROUTING hook. See comment
+ * for br_nf_pre_routing_finish(), same logic is used here but
+ * equivalent IPv6 function ip6_route_input() called indirectly.
+ */
+static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct rtable *rt;
+ struct net_device *dev = skb->dev;
+ const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+
+ nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
+
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+ nf_bridge->in_prerouting = 0;
+ if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) {
+ skb_dst_drop(skb);
+ v6ops->route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (skb_dst(skb)->dev == dev) {
+ skb->dev = nf_bridge->physindev;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+ NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
+ net, sk, skb, skb->dev, NULL,
+ br_nf_pre_routing_finish_bridge,
+ 1);
+ return 0;
+ }
+ ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
+ skb->pkt_type = PACKET_HOST;
+ } else {
+ rt = bridge_parent_rtable(nf_bridge->physindev);
+ if (!rt) {
+ kfree_skb(skb);
+ return 0;
+ }
+ skb_dst_set_noref(skb, &rt->dst);
+ }
+
+ skb->dev = nf_bridge->physindev;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+ NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb,
+ skb->dev, NULL,
+ br_handle_frame_finish, 1);
+
+ return 0;
+}
+
+/* Replicate the checks that IPv6 does on packet reception and pass the packet
+ * to ip6tables.
+ */
+unsigned int br_nf_pre_routing_ipv6(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_bridge_info *nf_bridge;
+
+ if (br_validate_ipv6(state->net, skb))
+ return NF_DROP;
+
+ nf_bridge_put(skb->nf_bridge);
+ if (!nf_bridge_alloc(skb))
+ return NF_DROP;
+ if (!setup_pre_routing(skb))
+ return NF_DROP;
+
+ nf_bridge = nf_bridge_info_get(skb);
+ nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->net, state->sk, skb,
+ skb->dev, NULL,
+ br_nf_pre_routing_finish_ipv6);
+
+ return NF_STOLEN;
+}
diff --git a/kernel/net/bridge/br_netlink.c b/kernel/net/bridge/br_netlink.c
index a7559ef31..40197ff89 100644
--- a/kernel/net/bridge/br_netlink.c
+++ b/kernel/net/bridge/br_netlink.c
@@ -16,42 +16,40 @@
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>
-#include <net/switchdev.h>
#include <uapi/linux/if_bridge.h>
#include "br_private.h"
#include "br_private_stp.h"
-static int br_get_num_vlan_infos(const struct net_port_vlans *pv,
- u32 filter_mask)
+static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
+ u32 filter_mask)
{
- u16 vid_range_start = 0, vid_range_end = 0;
- u16 vid_range_flags = 0;
- u16 pvid, vid, flags;
+ struct net_bridge_vlan *v;
+ u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0;
+ u16 flags, pvid;
int num_vlans = 0;
- if (filter_mask & RTEXT_FILTER_BRVLAN)
- return pv->num_vlans;
-
if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED))
return 0;
- /* Count number of vlan info's
- */
- pvid = br_get_pvid(pv);
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ pvid = br_get_pvid(vg);
+ /* Count number of vlan infos */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
flags = 0;
- if (vid == pvid)
+ /* only a context, bridge vlan not activated */
+ if (!br_vlan_should_use(v))
+ continue;
+ if (v->vid == pvid)
flags |= BRIDGE_VLAN_INFO_PVID;
- if (test_bit(vid, pv->untagged_bitmap))
+ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
flags |= BRIDGE_VLAN_INFO_UNTAGGED;
if (vid_range_start == 0) {
goto initvars;
- } else if ((vid - vid_range_end) == 1 &&
+ } else if ((v->vid - vid_range_end) == 1 &&
flags == vid_range_flags) {
- vid_range_end = vid;
+ vid_range_end = v->vid;
continue;
} else {
if ((vid_range_end - vid_range_start) > 0)
@@ -60,8 +58,8 @@ static int br_get_num_vlan_infos(const struct net_port_vlans *pv,
num_vlans += 1;
}
initvars:
- vid_range_start = vid;
- vid_range_end = vid;
+ vid_range_start = v->vid;
+ vid_range_end = v->vid;
vid_range_flags = flags;
}
@@ -75,28 +73,43 @@ initvars:
return num_vlans;
}
+static int br_get_num_vlan_infos(struct net_bridge_vlan_group *vg,
+ u32 filter_mask)
+{
+ int num_vlans;
+
+ if (!vg)
+ return 0;
+
+ if (filter_mask & RTEXT_FILTER_BRVLAN)
+ return vg->num_vlans;
+
+ rcu_read_lock();
+ num_vlans = __get_num_vlan_infos(vg, filter_mask);
+ rcu_read_unlock();
+
+ return num_vlans;
+}
+
static size_t br_get_link_af_size_filtered(const struct net_device *dev,
u32 filter_mask)
{
- struct net_port_vlans *pv;
+ struct net_bridge_vlan_group *vg = NULL;
+ struct net_bridge_port *p;
+ struct net_bridge *br;
int num_vlan_infos;
rcu_read_lock();
- if (br_port_exists(dev))
- pv = nbp_get_vlan_info(br_port_get_rcu(dev));
- else if (dev->priv_flags & IFF_EBRIDGE)
- pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev));
- else
- pv = NULL;
- if (pv)
- num_vlan_infos = br_get_num_vlan_infos(pv, filter_mask);
- else
- num_vlan_infos = 0;
+ if (br_port_exists(dev)) {
+ p = br_port_get_rcu(dev);
+ vg = nbp_vlan_group_rcu(p);
+ } else if (dev->priv_flags & IFF_EBRIDGE) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group_rcu(br);
+ }
+ num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask);
rcu_read_unlock();
- if (!num_vlan_infos)
- return 0;
-
/* Each VLAN is returned in bridge_vlan_info along with flags */
return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
}
@@ -114,6 +127,20 @@ static inline size_t br_port_info_size(void)
+ nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */
+ nla_total_size(1) /* IFLA_BRPORT_PROXYARP */
+ nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */
+ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
+ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_ID */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */
+ + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */
+ + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */
+ + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */
+#endif
+ 0;
}
@@ -135,6 +162,7 @@ static int br_port_fill_attrs(struct sk_buff *skb,
const struct net_bridge_port *p)
{
u8 mode = !!(p->flags & BR_HAIRPIN_MODE);
+ u64 timerval;
if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) ||
@@ -147,9 +175,36 @@ static int br_port_fill_attrs(struct sk_buff *skb,
nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) ||
nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) ||
nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI,
- !!(p->flags & BR_PROXYARP_WIFI)))
+ !!(p->flags & BR_PROXYARP_WIFI)) ||
+ nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id),
+ &p->designated_root) ||
+ nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id),
+ &p->designated_bridge) ||
+ nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) ||
+ nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) ||
+ nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) ||
+ nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) ||
+ nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK,
+ p->topology_change_ack) ||
+ nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending))
+ return -EMSGSIZE;
+
+ timerval = br_timer_value(&p->message_age_timer);
+ if (nla_put_u64(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval))
+ return -EMSGSIZE;
+ timerval = br_timer_value(&p->forward_delay_timer);
+ if (nla_put_u64(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval))
+ return -EMSGSIZE;
+ timerval = br_timer_value(&p->hold_timer);
+ if (nla_put_u64(skb, IFLA_BRPORT_HOLD_TIMER, timerval))
return -EMSGSIZE;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER,
+ p->multicast_router))
+ return -EMSGSIZE;
+#endif
+
return 0;
}
@@ -166,8 +221,6 @@ static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start,
sizeof(vinfo), &vinfo))
goto nla_put_failure;
- vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN;
-
vinfo.vid = vid_end;
vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END;
if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
@@ -188,31 +241,33 @@ nla_put_failure:
}
static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb,
- const struct net_port_vlans *pv)
+ struct net_bridge_vlan_group *vg)
{
- u16 vid_range_start = 0, vid_range_end = 0;
- u16 vid_range_flags = 0;
- u16 pvid, vid, flags;
+ struct net_bridge_vlan *v;
+ u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0;
+ u16 flags, pvid;
int err = 0;
/* Pack IFLA_BRIDGE_VLAN_INFO's for every vlan
* and mark vlan info with begin and end flags
* if vlaninfo represents a range
*/
- pvid = br_get_pvid(pv);
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ pvid = br_get_pvid(vg);
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
flags = 0;
- if (vid == pvid)
+ if (!br_vlan_should_use(v))
+ continue;
+ if (v->vid == pvid)
flags |= BRIDGE_VLAN_INFO_PVID;
- if (test_bit(vid, pv->untagged_bitmap))
+ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
flags |= BRIDGE_VLAN_INFO_UNTAGGED;
if (vid_range_start == 0) {
goto initvars;
- } else if ((vid - vid_range_end) == 1 &&
+ } else if ((v->vid - vid_range_end) == 1 &&
flags == vid_range_flags) {
- vid_range_end = vid;
+ vid_range_end = v->vid;
continue;
} else {
err = br_fill_ifvlaninfo_range(skb, vid_range_start,
@@ -223,8 +278,8 @@ static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb,
}
initvars:
- vid_range_start = vid;
- vid_range_end = vid;
+ vid_range_start = v->vid;
+ vid_range_end = v->vid;
vid_range_flags = flags;
}
@@ -241,19 +296,23 @@ initvars:
}
static int br_fill_ifvlaninfo(struct sk_buff *skb,
- const struct net_port_vlans *pv)
+ struct net_bridge_vlan_group *vg)
{
struct bridge_vlan_info vinfo;
- u16 pvid, vid;
+ struct net_bridge_vlan *v;
+ u16 pvid;
+
+ pvid = br_get_pvid(vg);
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
- pvid = br_get_pvid(pv);
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
- vinfo.vid = vid;
+ vinfo.vid = v->vid;
vinfo.flags = 0;
- if (vid == pvid)
+ if (v->vid == pvid)
vinfo.flags |= BRIDGE_VLAN_INFO_PVID;
- if (test_bit(vid, pv->untagged_bitmap))
+ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED;
if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
@@ -272,11 +331,11 @@ nla_put_failure:
* Contains port and master info as well as carrier and bridge state.
*/
static int br_fill_ifinfo(struct sk_buff *skb,
- const struct net_bridge_port *port,
+ struct net_bridge_port *port,
u32 pid, u32 seq, int event, unsigned int flags,
u32 filter_mask, const struct net_device *dev)
{
- const struct net_bridge *br;
+ struct net_bridge *br;
struct ifinfomsg *hdr;
struct nlmsghdr *nlh;
u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
@@ -323,26 +382,31 @@ static int br_fill_ifinfo(struct sk_buff *skb,
/* Check if the VID information is requested */
if ((filter_mask & RTEXT_FILTER_BRVLAN) ||
(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) {
- const struct net_port_vlans *pv;
+ struct net_bridge_vlan_group *vg;
struct nlattr *af;
int err;
+ /* RCU needed because of the VLAN locking rules (rcu || rtnl) */
+ rcu_read_lock();
if (port)
- pv = nbp_get_vlan_info(port);
+ vg = nbp_vlan_group_rcu(port);
else
- pv = br_get_vlan_info(br);
+ vg = br_vlan_group_rcu(br);
- if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID))
+ if (!vg || !vg->num_vlans) {
+ rcu_read_unlock();
goto done;
-
+ }
af = nla_nest_start(skb, IFLA_AF_SPEC);
- if (!af)
+ if (!af) {
+ rcu_read_unlock();
goto nla_put_failure;
-
+ }
if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)
- err = br_fill_ifvlaninfo_compressed(skb, pv);
+ err = br_fill_ifvlaninfo_compressed(skb, vg);
else
- err = br_fill_ifvlaninfo(skb, pv);
+ err = br_fill_ifvlaninfo(skb, vg);
+ rcu_read_unlock();
if (err)
goto nla_put_failure;
nla_nest_end(skb, af);
@@ -416,14 +480,14 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
switch (cmd) {
case RTM_SETLINK:
if (p) {
+ /* if the MASTER flag is set this will act on the global
+ * per-VLAN entry as well
+ */
err = nbp_vlan_add(p, vinfo->vid, vinfo->flags);
if (err)
break;
-
- if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER)
- err = br_vlan_add(p->br, vinfo->vid,
- vinfo->flags);
} else {
+ vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY;
err = br_vlan_add(br, vinfo->vid, vinfo->flags);
}
break;
@@ -459,10 +523,15 @@ static int br_afspec(struct net_bridge *br,
if (nla_len(attr) != sizeof(struct bridge_vlan_info))
return -EINVAL;
vinfo = nla_data(attr);
+ if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK)
+ return -EINVAL;
if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
if (vinfo_start)
return -EINVAL;
vinfo_start = vinfo;
+ /* don't allow range of pvids */
+ if (vinfo_start->flags & BRIDGE_VLAN_INFO_PVID)
+ return -EINVAL;
continue;
}
@@ -508,6 +577,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 },
[IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 },
[IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 },
+ [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 },
};
/* Change the state of the port and notify spanning tree */
@@ -579,6 +649,18 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
return err;
}
+ if (tb[IFLA_BRPORT_FLUSH])
+ br_fdb_delete_by_port(p->br, p, 0, 0);
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) {
+ u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]);
+
+ err = br_multicast_set_port_router(p, mcast_router);
+ if (err)
+ return err;
+ }
+#endif
br_port_flags_change(p, old_flags ^ p->flags);
return 0;
}
@@ -590,7 +672,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
struct nlattr *afspec;
struct net_bridge_port *p;
struct nlattr *tb[IFLA_BRPORT_MAX + 1];
- int err = 0, ret_offload = 0;
+ int err = 0;
protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO);
afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
@@ -632,16 +714,6 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
afspec, RTM_SETLINK);
}
- if (p && !(flags & BRIDGE_FLAGS_SELF)) {
- /* set bridge attributes in hardware if supported
- */
- ret_offload = netdev_switch_port_bridge_setlink(dev, nlh,
- flags);
- if (ret_offload && ret_offload != -EOPNOTSUPP)
- br_warn(p->br, "error setting attrs on port %u(%s)\n",
- (unsigned int)p->port_no, p->dev->name);
- }
-
if (err == 0)
br_ifinfo_notify(RTM_NEWLINK, p);
out:
@@ -653,7 +725,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
{
struct nlattr *afspec;
struct net_bridge_port *p;
- int err = 0, ret_offload = 0;
+ int err = 0;
afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
if (!afspec)
@@ -672,16 +744,6 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
*/
br_ifinfo_notify(RTM_NEWLINK, p);
- if (p && !(flags & BRIDGE_FLAGS_SELF)) {
- /* del bridge attributes in hardware
- */
- ret_offload = netdev_switch_port_bridge_dellink(dev, nlh,
- flags);
- if (ret_offload && ret_offload != -EOPNOTSUPP)
- br_warn(p->br, "error deleting attrs on port %u (%s)\n",
- (unsigned int)p->port_no, p->dev->name);
- }
-
return err;
}
static int br_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -693,6 +755,21 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[])
return -EADDRNOTAVAIL;
}
+ if (!data)
+ return 0;
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ if (data[IFLA_BR_VLAN_PROTOCOL]) {
+ switch (nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])) {
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+ }
+#endif
+
return 0;
}
@@ -748,6 +825,29 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_AGEING_TIME] = { .type = NLA_U32 },
[IFLA_BR_STP_STATE] = { .type = NLA_U32 },
[IFLA_BR_PRIORITY] = { .type = NLA_U16 },
+ [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 },
+ [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 },
+ [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 },
+ [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY,
+ .len = ETH_ALEN },
+ [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 },
+ [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 },
+ [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 },
+ [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 },
+ [IFLA_BR_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_QUERIER_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 },
+ [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 },
+ [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 },
+ [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -778,9 +878,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
}
if (data[IFLA_BR_AGEING_TIME]) {
- u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]);
-
- br->ageing_time = clock_t_to_jiffies(ageing_time);
+ err = br_set_ageing_time(br, nla_get_u32(data[IFLA_BR_AGEING_TIME]));
+ if (err)
+ return err;
}
if (data[IFLA_BR_STP_STATE]) {
@@ -795,6 +895,176 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
br_stp_set_bridge_priority(br, priority);
}
+ if (data[IFLA_BR_VLAN_FILTERING]) {
+ u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]);
+
+ err = __br_vlan_filter_toggle(br, vlan_filter);
+ if (err)
+ return err;
+ }
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ if (data[IFLA_BR_VLAN_PROTOCOL]) {
+ __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]);
+
+ err = __br_vlan_set_proto(br, vlan_proto);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
+ __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);
+
+ err = __br_vlan_set_default_pvid(br, defpvid);
+ if (err)
+ return err;
+ }
+#endif
+
+ if (data[IFLA_BR_GROUP_FWD_MASK]) {
+ u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]);
+
+ if (fwd_mask & BR_GROUPFWD_RESTRICTED)
+ return -EINVAL;
+ br->group_fwd_mask = fwd_mask;
+ }
+
+ if (data[IFLA_BR_GROUP_ADDR]) {
+ u8 new_addr[ETH_ALEN];
+
+ if (nla_len(data[IFLA_BR_GROUP_ADDR]) != ETH_ALEN)
+ return -EINVAL;
+ memcpy(new_addr, nla_data(data[IFLA_BR_GROUP_ADDR]), ETH_ALEN);
+ if (!is_link_local_ether_addr(new_addr))
+ return -EINVAL;
+ if (new_addr[5] == 1 || /* 802.3x Pause address */
+ new_addr[5] == 2 || /* 802.3ad Slow protocols */
+ new_addr[5] == 3) /* 802.1X PAE address */
+ return -EINVAL;
+ spin_lock_bh(&br->lock);
+ memcpy(br->group_addr, new_addr, sizeof(br->group_addr));
+ spin_unlock_bh(&br->lock);
+ br->group_addr_set = true;
+ br_recalculate_fwd_mask(br);
+ }
+
+ if (data[IFLA_BR_FDB_FLUSH])
+ br_fdb_flush(br);
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (data[IFLA_BR_MCAST_ROUTER]) {
+ u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]);
+
+ err = br_multicast_set_router(br, multicast_router);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_MCAST_SNOOPING]) {
+ u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]);
+
+ err = br_multicast_toggle(br, mcast_snooping);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) {
+ u8 val;
+
+ val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]);
+ br->multicast_query_use_ifaddr = !!val;
+ }
+
+ if (data[IFLA_BR_MCAST_QUERIER]) {
+ u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]);
+
+ err = br_multicast_set_querier(br, mcast_querier);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) {
+ u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]);
+
+ br->hash_elasticity = val;
+ }
+
+ if (data[IFLA_BR_MCAST_HASH_MAX]) {
+ u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]);
+
+ err = br_multicast_set_hash_max(br, hash_max);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) {
+ u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]);
+
+ br->multicast_last_member_count = val;
+ }
+
+ if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) {
+ u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]);
+
+ br->multicast_startup_query_count = val;
+ }
+
+ if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]);
+
+ br->multicast_last_member_interval = clock_t_to_jiffies(val);
+ }
+
+ if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]);
+
+ br->multicast_membership_interval = clock_t_to_jiffies(val);
+ }
+
+ if (data[IFLA_BR_MCAST_QUERIER_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]);
+
+ br->multicast_querier_interval = clock_t_to_jiffies(val);
+ }
+
+ if (data[IFLA_BR_MCAST_QUERY_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]);
+
+ br->multicast_query_interval = clock_t_to_jiffies(val);
+ }
+
+ if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]);
+
+ br->multicast_query_response_interval = clock_t_to_jiffies(val);
+ }
+
+ if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]);
+
+ br->multicast_startup_query_interval = clock_t_to_jiffies(val);
+ }
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (data[IFLA_BR_NF_CALL_IPTABLES]) {
+ u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]);
+
+ br->nf_call_iptables = val ? true : false;
+ }
+
+ if (data[IFLA_BR_NF_CALL_IP6TABLES]) {
+ u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]);
+
+ br->nf_call_ip6tables = val ? true : false;
+ }
+
+ if (data[IFLA_BR_NF_CALL_ARPTABLES]) {
+ u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]);
+
+ br->nf_call_arptables = val ? true : false;
+ }
+#endif
+
return 0;
}
@@ -806,6 +1076,44 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */
nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */
nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */
+ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */
+#endif
+ nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */
+ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */
+ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */
+ nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_GC_TIMER */
+ nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */
+ nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */
+#endif
0;
}
@@ -818,46 +1126,105 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
u32 ageing_time = jiffies_to_clock_t(br->ageing_time);
u32 stp_enabled = br->stp_enabled;
u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1];
+ u8 vlan_enabled = br_vlan_enabled(br);
+ u64 clockval;
+
+ clockval = br_timer_value(&br->hello_timer);
+ if (nla_put_u64(skb, IFLA_BR_HELLO_TIMER, clockval))
+ return -EMSGSIZE;
+ clockval = br_timer_value(&br->tcn_timer);
+ if (nla_put_u64(skb, IFLA_BR_TCN_TIMER, clockval))
+ return -EMSGSIZE;
+ clockval = br_timer_value(&br->topology_change_timer);
+ if (nla_put_u64(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval))
+ return -EMSGSIZE;
+ clockval = br_timer_value(&br->gc_timer);
+ if (nla_put_u64(skb, IFLA_BR_GC_TIMER, clockval))
+ return -EMSGSIZE;
if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) ||
nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) ||
nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) ||
nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) ||
nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) ||
- nla_put_u16(skb, IFLA_BR_PRIORITY, priority))
+ nla_put_u16(skb, IFLA_BR_PRIORITY, priority) ||
+ nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) ||
+ nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, br->group_fwd_mask) ||
+ nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(struct ifla_bridge_id),
+ &br->bridge_id) ||
+ nla_put(skb, IFLA_BR_ROOT_ID, sizeof(struct ifla_bridge_id),
+ &br->designated_root) ||
+ nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) ||
+ nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) ||
+ nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) ||
+ nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED,
+ br->topology_change_detected) ||
+ nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr))
return -EMSGSIZE;
- return 0;
-}
-
-static size_t br_get_link_af_size(const struct net_device *dev)
-{
- struct net_port_vlans *pv;
-
- if (br_port_exists(dev))
- pv = nbp_get_vlan_info(br_port_get_rtnl(dev));
- else if (dev->priv_flags & IFF_EBRIDGE)
- pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev));
- else
- return 0;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) ||
+ nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid))
+ return -EMSGSIZE;
+#endif
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
+ br->multicast_query_use_ifaddr) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
+ br->hash_elasticity) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
+ br->multicast_last_member_count) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT,
+ br->multicast_startup_query_count))
+ return -EMSGSIZE;
- if (!pv)
- return 0;
+ clockval = jiffies_to_clock_t(br->multicast_last_member_interval);
+ if (nla_put_u64(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_membership_interval);
+ if (nla_put_u64(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_querier_interval);
+ if (nla_put_u64(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_query_interval);
+ if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_query_response_interval);
+ if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_startup_query_interval);
+ if (nla_put_u64(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval))
+ return -EMSGSIZE;
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES,
+ br->nf_call_iptables ? 1 : 0) ||
+ nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES,
+ br->nf_call_ip6tables ? 1 : 0) ||
+ nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES,
+ br->nf_call_arptables ? 1 : 0))
+ return -EMSGSIZE;
+#endif
- /* Each VLAN is returned in bridge_vlan_info along with flags */
- return pv->num_vlans * nla_total_size(sizeof(struct bridge_vlan_info));
+ return 0;
}
+
static struct rtnl_af_ops br_af_ops __read_mostly = {
.family = AF_BRIDGE,
- .get_link_af_size = br_get_link_af_size,
+ .get_link_af_size = br_get_link_af_size_filtered,
};
struct rtnl_link_ops br_link_ops __read_mostly = {
.kind = "bridge",
.priv_size = sizeof(struct net_bridge),
.setup = br_dev_setup,
- .maxtype = IFLA_BRPORT_MAX,
+ .maxtype = IFLA_BR_MAX,
.policy = br_policy,
.validate = br_validate,
.newlink = br_dev_newlink,
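
The br_netlink.c changes above move the VLAN dump path from the old per-port bitmap (net_port_vlans) to the list-based net_bridge_vlan_group, walking vlan_list under RCU while keeping the same compressed netlink encoding: runs of consecutive VIDs with identical flags are emitted as BRIDGE_VLAN_INFO_RANGE_BEGIN/RANGE_END pairs. Reduced to a standalone sketch over a plain array (illustrative only, nothing here is kernel code):

#include <stdio.h>

struct vlan_entry { unsigned short vid, flags; };

/* Print consecutive VIDs that share flags as "start-end" ranges, which is
 * conceptually what the compressed IFLA_BRIDGE_VLAN_INFO dump does.
 */
static void dump_ranges(const struct vlan_entry *v, int n)
{
	int i, start = -1, end = -1;
	unsigned short range_flags = 0;

	for (i = 0; i < n; i++) {
		if (start >= 0 && v[i].vid == end + 1 &&
		    v[i].flags == range_flags) {
			end = v[i].vid;		/* extend the current range */
			continue;
		}
		if (start >= 0)
			printf("%d-%d flags 0x%x\n", start, end, range_flags);
		start = end = v[i].vid;		/* open a new range */
		range_flags = v[i].flags;
	}
	if (start >= 0)
		printf("%d-%d flags 0x%x\n", start, end, range_flags);
}

int main(void)
{
	const struct vlan_entry v[] = { {1, 0}, {2, 0}, {3, 0}, {10, 2}, {11, 2} };

	dump_ranges(v, 5);	/* prints "1-3 flags 0x0" and "10-11 flags 0x2" */
	return 0;
}
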
diff --git a/kernel/net/bridge/br_private.h b/kernel/net/bridge/br_private.h
index 3362c2940..216018c76 100644
--- a/kernel/net/bridge/br_private.h
+++ b/kernel/net/bridge/br_private.h
@@ -18,7 +18,9 @@
#include <linux/netpoll.h>
#include <linux/u64_stats_sync.h>
#include <net/route.h>
+#include <net/ip6_fib.h>
#include <linux/if_vlan.h>
+#include <linux/rhashtable.h>
#define BR_HASH_BITS 8
#define BR_HASH_SIZE (1 << BR_HASH_BITS)
@@ -27,14 +29,13 @@
#define BR_PORT_BITS 10
#define BR_MAX_PORTS (1<<BR_PORT_BITS)
-#define BR_VLAN_BITMAP_LEN BITS_TO_LONGS(VLAN_N_VID)
#define BR_VERSION "2.3"
/* Control of forwarding link local multicast */
#define BR_GROUPFWD_DEFAULT 0
-/* Don't allow forwarding control protocols like STP and LLDP */
-#define BR_GROUPFWD_RESTRICTED 0x4007u
+/* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */
+#define BR_GROUPFWD_RESTRICTED 0x0007u
/* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */
#define BR_GROUPFWD_8021AD 0xB801u
@@ -76,17 +77,61 @@ struct bridge_mcast_querier {
};
#endif
-struct net_port_vlans {
- u16 port_idx;
- u16 pvid;
+/**
+ * struct net_bridge_vlan - per-vlan entry
+ *
+ * @vnode: rhashtable member
+ * @vid: VLAN id
+ * @flags: bridge vlan flags
+ * @br: if MASTER flag set, this points to a bridge struct
+ * @port: if MASTER flag unset, this points to a port struct
+ * @refcnt: if MASTER flag set, this is bumped for each port referencing it
+ * @brvlan: if MASTER flag unset, this points to the global per-VLAN context
+ * for this VLAN entry
+ * @vlist: sorted list of VLAN entries
+ * @rcu: used for entry destruction
+ *
+ * This structure is shared between the global per-VLAN entries contained in
+ * the bridge rhashtable and the local per-port per-VLAN entries contained in
+ * the port's rhashtable. The union entries should be interpreted depending on
+ * the entry flags that are set.
+ */
+struct net_bridge_vlan {
+ struct rhash_head vnode;
+ u16 vid;
+ u16 flags;
+ union {
+ struct net_bridge *br;
+ struct net_bridge_port *port;
+ };
union {
- struct net_bridge_port *port;
- struct net_bridge *br;
- } parent;
+ atomic_t refcnt;
+ struct net_bridge_vlan *brvlan;
+ };
+ struct list_head vlist;
+
struct rcu_head rcu;
- unsigned long vlan_bitmap[BR_VLAN_BITMAP_LEN];
- unsigned long untagged_bitmap[BR_VLAN_BITMAP_LEN];
+};
+
+/**
+ * struct net_bridge_vlan_group
+ *
+ * @vlan_hash: VLAN entry rhashtable
+ * @vlan_list: sorted VLAN entry list
+ * @num_vlans: number of total VLAN entries
+ * @pvid: PVID VLAN id
+ *
+ * IMPORTANT: Be careful when checking if there're VLAN entries using list
+ * primitives because the bridge can have entries in its list which
+ * are just for global context but not for filtering, i.e. they have
+ * the master flag set but not the brentry flag. If you have to check
+ * if there're "real" entries in the bridge please test @num_vlans
+ */
+struct net_bridge_vlan_group {
+ struct rhashtable vlan_hash;
+ struct list_head vlan_list;
u16 num_vlans;
+ u16 pvid;
};
struct net_bridge_fdb_entry
@@ -94,15 +139,15 @@ struct net_bridge_fdb_entry
struct hlist_node hlist;
struct net_bridge_port *dst;
- struct rcu_head rcu;
unsigned long updated;
unsigned long used;
mac_addr addr;
+ __u16 vlan_id;
unsigned char is_local:1,
is_static:1,
added_by_user:1,
added_by_external_learn:1;
- __u16 vlan_id;
+ struct rcu_head rcu;
};
struct net_bridge_port_group {
@@ -184,7 +229,7 @@ struct net_bridge_port
struct netpoll *np;
#endif
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
- struct net_port_vlans __rcu *vlan_info;
+ struct net_bridge_vlan_group __rcu *vlgrp;
#endif
};
@@ -214,7 +259,10 @@ struct net_bridge
spinlock_t hash_lock;
struct hlist_head hash[BR_HASH_SIZE];
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- struct rtable fake_rtable;
+ union {
+ struct rtable fake_rtable;
+ struct rt6_info fake_rt6_info;
+ };
bool nf_call_iptables;
bool nf_call_ip6tables;
bool nf_call_arptables;
@@ -289,10 +337,10 @@ struct net_bridge
struct kobject *ifobj;
u32 auto_cnt;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ struct net_bridge_vlan_group __rcu *vlgrp;
u8 vlan_enabled;
__be16 vlan_proto;
u16 default_pvid;
- struct net_port_vlans __rcu *vlan_info;
#endif
};
@@ -304,7 +352,6 @@ struct br_input_skb_cb {
int mrouters_only;
#endif
- u16 frag_max_size;
bool proxyarp_replied;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
@@ -341,6 +388,31 @@ static inline int br_is_root_bridge(const struct net_bridge *br)
return !memcmp(&br->bridge_id, &br->designated_root, 8);
}
+/* check if a VLAN entry is global */
+static inline bool br_vlan_is_master(const struct net_bridge_vlan *v)
+{
+ return v->flags & BRIDGE_VLAN_INFO_MASTER;
+}
+
+/* check if a VLAN entry is used by the bridge */
+static inline bool br_vlan_is_brentry(const struct net_bridge_vlan *v)
+{
+ return v->flags & BRIDGE_VLAN_INFO_BRENTRY;
+}
+
+/* check if we should use the vlan entry, returns false if it's only context */
+static inline bool br_vlan_should_use(const struct net_bridge_vlan *v)
+{
+ if (br_vlan_is_master(v)) {
+ if (br_vlan_is_brentry(v))
+ return true;
+ else
+ return false;
+ }
+
+ return true;
+}
+
/* br_device.c */
void br_dev_setup(struct net_device *dev);
void br_dev_delete(struct net_device *dev, struct list_head *list);
@@ -384,7 +456,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
void br_fdb_cleanup(unsigned long arg);
void br_fdb_delete_by_port(struct net_bridge *br,
- const struct net_bridge_port *p, int do_all);
+ const struct net_bridge_port *p, u16 vid, int do_all);
struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
const unsigned char *addr, __u16 vid);
int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
@@ -410,10 +482,10 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
/* br_forward.c */
void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb);
-int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb);
+int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_forward(const struct net_bridge_port *to,
struct sk_buff *skb, struct sk_buff *skb0);
-int br_forward_finish(struct sock *sk, struct sk_buff *skb);
+int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast);
void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
struct sk_buff *skb2, bool unicast);
@@ -431,7 +503,7 @@ void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
void br_manage_promisc(struct net_bridge *br);
/* br_input.c */
-int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb);
+int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
rx_handler_result_t br_handle_frame(struct sk_buff **pskb);
static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
@@ -463,6 +535,7 @@ void br_multicast_disable_port(struct net_bridge_port *port);
void br_multicast_init(struct net_bridge *br);
void br_multicast_open(struct net_bridge *br);
void br_multicast_stop(struct net_bridge *br);
+void br_multicast_dev_del(struct net_bridge *br);
void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb);
void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
@@ -485,7 +558,9 @@ br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
void br_mdb_init(void);
void br_mdb_uninit(void);
void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type);
+ struct br_ip *group, int type, u8 state);
+void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
+ int type);
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
@@ -562,6 +637,10 @@ static inline void br_multicast_stop(struct net_bridge *br)
{
}
+static inline void br_multicast_dev_del(struct net_bridge *br)
+{
+}
+
static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb)
{
@@ -591,39 +670,55 @@ static inline void br_mdb_uninit(void)
/* br_vlan.c */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
- struct sk_buff *skb, u16 *vid);
-bool br_allowed_egress(struct net_bridge *br, const struct net_port_vlans *v,
+bool br_allowed_ingress(const struct net_bridge *br,
+ struct net_bridge_vlan_group *vg, struct sk_buff *skb,
+ u16 *vid);
+bool br_allowed_egress(struct net_bridge_vlan_group *vg,
const struct sk_buff *skb);
bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
struct sk_buff *br_handle_vlan(struct net_bridge *br,
- const struct net_port_vlans *v,
+ struct net_bridge_vlan_group *vg,
struct sk_buff *skb);
int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags);
int br_vlan_delete(struct net_bridge *br, u16 vid);
void br_vlan_flush(struct net_bridge *br);
-bool br_vlan_find(struct net_bridge *br, u16 vid);
+struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid);
void br_recalculate_fwd_mask(struct net_bridge *br);
+int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
+int __br_vlan_set_proto(struct net_bridge *br, __be16 proto);
int br_vlan_set_proto(struct net_bridge *br, unsigned long val);
int br_vlan_init(struct net_bridge *br);
int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val);
+int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid);
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags);
int nbp_vlan_delete(struct net_bridge_port *port, u16 vid);
void nbp_vlan_flush(struct net_bridge_port *port);
-bool nbp_vlan_find(struct net_bridge_port *port, u16 vid);
int nbp_vlan_init(struct net_bridge_port *port);
+int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask);
-static inline struct net_port_vlans *br_get_vlan_info(
- const struct net_bridge *br)
+static inline struct net_bridge_vlan_group *br_vlan_group(
+ const struct net_bridge *br)
{
- return rcu_dereference_rtnl(br->vlan_info);
+ return rtnl_dereference(br->vlgrp);
}
-static inline struct net_port_vlans *nbp_get_vlan_info(
- const struct net_bridge_port *p)
+static inline struct net_bridge_vlan_group *nbp_vlan_group(
+ const struct net_bridge_port *p)
{
- return rcu_dereference_rtnl(p->vlan_info);
+ return rtnl_dereference(p->vlgrp);
+}
+
+static inline struct net_bridge_vlan_group *br_vlan_group_rcu(
+ const struct net_bridge *br)
+{
+ return rcu_dereference(br->vlgrp);
+}
+
+static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu(
+ const struct net_bridge_port *p)
+{
+ return rcu_dereference(p->vlgrp);
}
/* Since bridge now depends on 8021Q module, but the time bridge sees the
@@ -633,9 +728,9 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
{
int err = 0;
- if (skb_vlan_tag_present(skb))
+ if (skb_vlan_tag_present(skb)) {
*vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK;
- else {
+ } else {
*vid = 0;
err = -EINVAL;
}
@@ -643,13 +738,13 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
return err;
}
-static inline u16 br_get_pvid(const struct net_port_vlans *v)
+static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
{
- if (!v)
+ if (!vg)
return 0;
smp_rmb();
- return v->pvid;
+ return vg->pvid;
}
static inline int br_vlan_enabled(struct net_bridge *br)
@@ -657,16 +752,15 @@ static inline int br_vlan_enabled(struct net_bridge *br)
return br->vlan_enabled;
}
#else
-static inline bool br_allowed_ingress(struct net_bridge *br,
- struct net_port_vlans *v,
+static inline bool br_allowed_ingress(const struct net_bridge *br,
+ struct net_bridge_vlan_group *vg,
struct sk_buff *skb,
u16 *vid)
{
return true;
}
-static inline bool br_allowed_egress(struct net_bridge *br,
- const struct net_port_vlans *v,
+static inline bool br_allowed_egress(struct net_bridge_vlan_group *vg,
const struct sk_buff *skb)
{
return true;
@@ -679,7 +773,7 @@ static inline bool br_should_learn(struct net_bridge_port *p,
}
static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
- const struct net_port_vlans *v,
+ struct net_bridge_vlan_group *vg,
struct sk_buff *skb)
{
return skb;
@@ -699,11 +793,6 @@ static inline void br_vlan_flush(struct net_bridge *br)
{
}
-static inline bool br_vlan_find(struct net_bridge *br, u16 vid)
-{
- return false;
-}
-
static inline void br_recalculate_fwd_mask(struct net_bridge *br)
{
}
@@ -727,40 +816,68 @@ static inline void nbp_vlan_flush(struct net_bridge_port *port)
{
}
-static inline struct net_port_vlans *br_get_vlan_info(
- const struct net_bridge *br)
+static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg,
+ u16 vid)
{
return NULL;
}
-static inline struct net_port_vlans *nbp_get_vlan_info(
- const struct net_bridge_port *p)
+
+static inline int nbp_vlan_init(struct net_bridge_port *port)
{
- return NULL;
+ return 0;
}
-static inline bool nbp_vlan_find(struct net_bridge_port *port, u16 vid)
+static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag)
{
- return false;
+ return 0;
}
-static inline int nbp_vlan_init(struct net_bridge_port *port)
+static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
{
return 0;
}
-static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag)
+static inline int br_vlan_enabled(struct net_bridge *br)
{
return 0;
}
-static inline u16 br_get_pvid(const struct net_port_vlans *v)
+
+static inline int __br_vlan_filter_toggle(struct net_bridge *br,
+ unsigned long val)
{
- return 0;
+ return -EOPNOTSUPP;
}
-static inline int br_vlan_enabled(struct net_bridge *br)
+static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p,
+ u32 filter_mask)
{
return 0;
}
+
+static inline struct net_bridge_vlan_group *br_vlan_group(
+ const struct net_bridge *br)
+{
+ return NULL;
+}
+
+static inline struct net_bridge_vlan_group *nbp_vlan_group(
+ const struct net_bridge_port *p)
+{
+ return NULL;
+}
+
+static inline struct net_bridge_vlan_group *br_vlan_group_rcu(
+ const struct net_bridge *br)
+{
+ return NULL;
+}
+
+static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu(
+ const struct net_bridge_port *p)
+{
+ return NULL;
+}
+
#endif
struct nf_br_ops {
@@ -790,6 +907,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t);
int br_set_forward_delay(struct net_bridge *br, unsigned long x);
int br_set_hello_time(struct net_bridge *br, unsigned long x);
int br_set_max_age(struct net_bridge *br, unsigned long x);
+int br_set_ageing_time(struct net_bridge *br, u32 ageing_time);
/* br_stp_if.c */
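The stub block above is what lets every br_vlan_*() call site stay free of #ifdef CONFIG_BRIDGE_VLAN_FILTERING: the disabled configuration supplies inline no-op accessors. A minimal userspace sketch of the same compile-out pattern, with HAVE_VLAN_FILTERING as an invented stand-in for the real config symbol:

/* Userspace sketch only: the ifdef-stub pattern, so callers never need
 * their own #ifdef when the feature is compiled out.
 */
#include <stdio.h>
#include <stddef.h>

struct vlan_group { int num_vlans; };
struct bridge { struct vlan_group *vlgrp; };

#ifdef HAVE_VLAN_FILTERING
static inline struct vlan_group *bridge_vlan_group(const struct bridge *br)
{
	return br->vlgrp;
}
#else
/* Feature compiled out: stub keeps call sites identical. */
static inline struct vlan_group *bridge_vlan_group(const struct bridge *br)
{
	(void)br;
	return NULL;
}
#endif

int main(void)
{
	struct bridge br = { .vlgrp = NULL };
	struct vlan_group *vg = bridge_vlan_group(&br);

	printf("vlan group: %s\n", vg ? "present" : "absent");
	return 0;
}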
diff --git a/kernel/net/bridge/br_stp.c b/kernel/net/bridge/br_stp.c
index fb3ebe615..5f3f64553 100644
--- a/kernel/net/bridge/br_stp.c
+++ b/kernel/net/bridge/br_stp.c
@@ -39,10 +39,15 @@ void br_log_state(const struct net_bridge_port *p)
void br_set_state(struct net_bridge_port *p, unsigned int state)
{
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+ .flags = SWITCHDEV_F_DEFER,
+ .u.stp_state = state,
+ };
int err;
p->state = state;
- err = netdev_switch_port_stp_update(p->dev, state);
+ err = switchdev_port_attr_set(p->dev, &attr);
if (err && err != -EOPNOTSUPP)
br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
(unsigned int) p->port_no, p->dev->name);
@@ -205,8 +210,9 @@ void br_transmit_config(struct net_bridge_port *p)
br_send_config_bpdu(p, &bpdu);
p->topology_change_ack = 0;
p->config_pending = 0;
- mod_timer(&p->hold_timer,
- round_jiffies(jiffies + BR_HOLD_TIME));
+ if (p->br->stp_enabled == BR_KERNEL_STP)
+ mod_timer(&p->hold_timer,
+ round_jiffies(jiffies + BR_HOLD_TIME));
}
}
@@ -424,7 +430,6 @@ static void br_make_forwarding(struct net_bridge_port *p)
else
br_set_state(p, BR_STATE_LEARNING);
- br_multicast_enable_port(p);
br_log_state(p);
br_ifinfo_notify(RTM_NEWLINK, p);
@@ -458,6 +463,12 @@ void br_port_state_selection(struct net_bridge *br)
}
}
+ if (p->state != BR_STATE_BLOCKING)
+ br_multicast_enable_port(p);
+ /* Multicast is not disabled for the port when it goes in
+ * blocking state because the timers will expire and stop by
+ * themselves without sending more queries.
+ */
if (p->state == BR_STATE_FORWARDING)
++liveports;
}
@@ -556,6 +567,29 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
}
+int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
+{
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
+ .u.ageing_time = ageing_time,
+ };
+ unsigned long t = clock_t_to_jiffies(ageing_time);
+ int err;
+
+ if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME)
+ return -ERANGE;
+
+ err = switchdev_port_attr_set(br->dev, &attr);
+ if (err)
+ return err;
+
+ br->ageing_time = t;
+ mod_timer(&br->gc_timer, jiffies);
+
+ return 0;
+}
+
void __br_set_forward_delay(struct net_bridge *br, unsigned long t)
{
br->bridge_forward_delay = t;
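br_set_ageing_time() above validates a user-supplied value given in clock_t units (centiseconds), converts it to jiffies, and only then updates hardware and the gc timer. A standalone sketch of that convert-and-bound step, assuming HZ=100 and illustrative MIN/MAX limits in place of BR_MIN_AGEING_TIME/BR_MAX_AGEING_TIME:

/* Standalone sketch: convert centiseconds to internal ticks and bound it.
 * HZ, MIN_AGEING and MAX_AGEING are illustrative stand-ins.
 */
#include <stdio.h>
#include <errno.h>

#define HZ		100UL		/* ticks per second (assumed) */
#define MIN_AGEING	(10 * HZ)	/* 10 s, illustrative lower bound */
#define MAX_AGEING	(1000000 * HZ)	/* ~11.5 days, illustrative upper bound */

static unsigned long clock_t_to_ticks(unsigned long centisecs)
{
	return centisecs * HZ / 100;
}

static int set_ageing_time(unsigned long *ageing_ticks, unsigned long centisecs)
{
	unsigned long t = clock_t_to_ticks(centisecs);

	if (t < MIN_AGEING || t > MAX_AGEING)
		return -ERANGE;

	*ageing_ticks = t;
	return 0;
}

int main(void)
{
	unsigned long ageing = 0;

	printf("30000 cs -> %d (ageing=%lu ticks)\n",
	       set_ageing_time(&ageing, 30000), ageing);
	printf("1 cs     -> %d (rejected as too small)\n",
	       set_ageing_time(&ageing, 1));
	return 0;
}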
diff --git a/kernel/net/bridge/br_stp_bpdu.c b/kernel/net/bridge/br_stp_bpdu.c
index 534fc4cd2..5881fbc11 100644
--- a/kernel/net/bridge/br_stp_bpdu.c
+++ b/kernel/net/bridge/br_stp_bpdu.c
@@ -30,6 +30,12 @@
#define LLC_RESERVE sizeof(struct llc_pdu_un)
+static int br_send_bpdu_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+
static void br_send_bpdu(struct net_bridge_port *p,
const unsigned char *data, int length)
{
@@ -54,9 +60,9 @@ static void br_send_bpdu(struct net_bridge_port *p,
skb_reset_mac_header(skb);
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb,
- NULL, skb->dev,
- dev_queue_xmit_sk);
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
+ dev_net(p->dev), NULL, skb, NULL, skb->dev,
+ br_send_bpdu_finish);
}
static inline void br_set_ticks(unsigned char *dest, int j)
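The NF_HOOK change above switches from the old dev_queue_xmit_sk okfn to an explicit per-caller finish function plus the namespace argument. A toy model of that hook-then-finish flow, with all names invented for the sketch:

/* Toy model (not the netfilter API): a verdict function may consume the
 * packet or hand it to the caller-supplied finish callback.
 */
#include <stdio.h>

struct pkt { const char *data; };

enum verdict { VERDICT_ACCEPT, VERDICT_DROP };

typedef int (*finish_fn)(struct pkt *p);

static enum verdict run_hooks(struct pkt *p)
{
	/* Pretend every packet passes the LOCAL_OUT hooks. */
	(void)p;
	return VERDICT_ACCEPT;
}

static int hook_then_finish(struct pkt *p, finish_fn finish)
{
	if (run_hooks(p) == VERDICT_DROP) {
		printf("dropped: %s\n", p->data);
		return 0;
	}
	return finish(p);	/* e.g. queue for transmit */
}

static int xmit_finish(struct pkt *p)
{
	printf("transmitting: %s\n", p->data);
	return 0;
}

int main(void)
{
	struct pkt bpdu = { .data = "config BPDU" };

	return hook_then_finish(&bpdu, xmit_finish);
}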
diff --git a/kernel/net/bridge/br_stp_if.c b/kernel/net/bridge/br_stp_if.c
index 7832d07f4..8a7ada8bb 100644
--- a/kernel/net/bridge/br_stp_if.c
+++ b/kernel/net/bridge/br_stp_if.c
@@ -15,6 +15,7 @@
#include <linux/kmod.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
+#include <net/switchdev.h>
#include "br_private.h"
#include "br_private_stp.h"
@@ -35,11 +36,22 @@ static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
/* called under bridge lock */
void br_init_port(struct net_bridge_port *p)
{
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
+ .u.ageing_time = jiffies_to_clock_t(p->br->ageing_time),
+ };
+ int err;
+
p->port_id = br_make_port_id(p->priority, p->port_no);
br_become_designated_port(p);
br_set_state(p, BR_STATE_BLOCKING);
p->topology_change_ack = 0;
p->config_pending = 0;
+
+ err = switchdev_port_attr_set(p->dev, &attr);
+ if (err && err != -EOPNOTSUPP)
+ netdev_err(p->dev, "failed to set HW ageing time\n");
}
/* called under bridge lock */
@@ -48,7 +60,8 @@ void br_stp_enable_bridge(struct net_bridge *br)
struct net_bridge_port *p;
spin_lock_bh(&br->lock);
- mod_timer(&br->hello_timer, jiffies + br->hello_time);
+ if (br->stp_enabled == BR_KERNEL_STP)
+ mod_timer(&br->hello_timer, jiffies + br->hello_time);
mod_timer(&br->gc_timer, jiffies + HZ/10);
br_config_bpdu_generation(br);
@@ -111,7 +124,7 @@ void br_stp_disable_port(struct net_bridge_port *p)
del_timer(&p->forward_delay_timer);
del_timer(&p->hold_timer);
- br_fdb_delete_by_port(br, p, 0);
+ br_fdb_delete_by_port(br, p, 0, 0);
br_multicast_disable_port(p);
br_configuration_update(br);
@@ -127,8 +140,12 @@ static void br_stp_start(struct net_bridge *br)
int r;
char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
char *envp[] = { NULL };
+ struct net_bridge_port *p;
- r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ if (net_eq(dev_net(br->dev), &init_net))
+ r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ else
+ r = -ENOENT;
spin_lock_bh(&br->lock);
@@ -140,6 +157,10 @@ static void br_stp_start(struct net_bridge *br)
if (r == 0) {
br->stp_enabled = BR_USER_STP;
br_debug(br, "userspace STP started\n");
+ /* Stop hello and hold timers */
+ del_timer(&br->hello_timer);
+ list_for_each_entry(p, &br->port_list, list)
+ del_timer(&p->hold_timer);
} else {
br->stp_enabled = BR_KERNEL_STP;
br_debug(br, "using kernel STP\n");
@@ -156,12 +177,17 @@ static void br_stp_stop(struct net_bridge *br)
int r;
char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL };
char *envp[] = { NULL };
+ struct net_bridge_port *p;
if (br->stp_enabled == BR_USER_STP) {
r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
br_info(br, "userspace STP stopped, return code %d\n", r);
/* To start timers on any ports left in blocking */
+ mod_timer(&br->hello_timer, jiffies + br->hello_time);
+ list_for_each_entry(p, &br->port_list, list)
+ mod_timer(&p->hold_timer,
+ round_jiffies(jiffies + BR_HOLD_TIME));
spin_lock_bh(&br->lock);
br_port_state_selection(br);
spin_unlock_bh(&br->lock);
diff --git a/kernel/net/bridge/br_stp_timer.c b/kernel/net/bridge/br_stp_timer.c
index 7caf7fae2..5f0f5af0e 100644
--- a/kernel/net/bridge/br_stp_timer.c
+++ b/kernel/net/bridge/br_stp_timer.c
@@ -40,7 +40,9 @@ static void br_hello_timer_expired(unsigned long arg)
if (br->dev->flags & IFF_UP) {
br_config_bpdu_generation(br);
- mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time));
+ if (br->stp_enabled != BR_USER_STP)
+ mod_timer(&br->hello_timer,
+ round_jiffies(jiffies + br->hello_time));
}
spin_unlock(&br->lock);
}
diff --git a/kernel/net/bridge/br_sysfs_br.c b/kernel/net/bridge/br_sysfs_br.c
index 4c97fc50f..8365bd53c 100644
--- a/kernel/net/bridge/br_sysfs_br.c
+++ b/kernel/net/bridge/br_sysfs_br.c
@@ -102,8 +102,15 @@ static ssize_t ageing_time_show(struct device *d,
static int set_ageing_time(struct net_bridge *br, unsigned long val)
{
- br->ageing_time = clock_t_to_jiffies(val);
- return 0;
+ int ret;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ ret = br_set_ageing_time(br, val);
+ rtnl_unlock();
+
+ return ret;
}
static ssize_t ageing_time_store(struct device *d,
diff --git a/kernel/net/bridge/br_sysfs_if.c b/kernel/net/bridge/br_sysfs_if.c
index 4905845a9..efe415ad8 100644
--- a/kernel/net/bridge/br_sysfs_if.c
+++ b/kernel/net/bridge/br_sysfs_if.c
@@ -160,7 +160,7 @@ static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL);
static int store_flush(struct net_bridge_port *p, unsigned long v)
{
- br_fdb_delete_by_port(p->br, p, 0); // Don't delete local entry
+ br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry
return 0;
}
static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush);
diff --git a/kernel/net/bridge/br_vlan.c b/kernel/net/bridge/br_vlan.c
index 13013fe8d..1394da636 100644
--- a/kernel/net/bridge/br_vlan.c
+++ b/kernel/net/bridge/br_vlan.c
@@ -2,59 +2,209 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <net/switchdev.h>
#include "br_private.h"
-static void __vlan_add_pvid(struct net_port_vlans *v, u16 vid)
+static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
{
- if (v->pvid == vid)
+ const struct net_bridge_vlan *vle = ptr;
+ u16 vid = *(u16 *)arg->key;
+
+ return vle->vid != vid;
+}
+
+static const struct rhashtable_params br_vlan_rht_params = {
+ .head_offset = offsetof(struct net_bridge_vlan, vnode),
+ .key_offset = offsetof(struct net_bridge_vlan, vid),
+ .key_len = sizeof(u16),
+ .nelem_hint = 3,
+ .locks_mul = 1,
+ .max_size = VLAN_N_VID,
+ .obj_cmpfn = br_vlan_cmp,
+ .automatic_shrinking = true,
+};
+
+static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid)
+{
+ return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params);
+}
+
+static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid)
+{
+ if (vg->pvid == vid)
return;
smp_wmb();
- v->pvid = vid;
+ vg->pvid = vid;
}
-static void __vlan_delete_pvid(struct net_port_vlans *v, u16 vid)
+static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid)
{
- if (v->pvid != vid)
+ if (vg->pvid != vid)
return;
smp_wmb();
- v->pvid = 0;
+ vg->pvid = 0;
}
-static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags)
+static void __vlan_add_flags(struct net_bridge_vlan *v, u16 flags)
{
+ struct net_bridge_vlan_group *vg;
+
+ if (br_vlan_is_master(v))
+ vg = br_vlan_group(v->br);
+ else
+ vg = nbp_vlan_group(v->port);
+
if (flags & BRIDGE_VLAN_INFO_PVID)
- __vlan_add_pvid(v, vid);
+ __vlan_add_pvid(vg, v->vid);
else
- __vlan_delete_pvid(v, vid);
+ __vlan_delete_pvid(vg, v->vid);
if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
- set_bit(vid, v->untagged_bitmap);
+ v->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
else
- clear_bit(vid, v->untagged_bitmap);
+ v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED;
}
-static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags)
+static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
+ u16 vid, u16 flags)
{
- struct net_bridge_port *p = NULL;
- struct net_bridge *br;
- struct net_device *dev;
+ struct switchdev_obj_port_vlan v = {
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .flags = flags,
+ .vid_begin = vid,
+ .vid_end = vid,
+ };
+ int err;
+
+ /* Try switchdev op first. In case it is not supported, fallback to
+ * 8021q add.
+ */
+ err = switchdev_port_obj_add(dev, &v.obj);
+ if (err == -EOPNOTSUPP)
+ return vlan_vid_add(dev, br->vlan_proto, vid);
+ return err;
+}
+
+static void __vlan_add_list(struct net_bridge_vlan *v)
+{
+ struct net_bridge_vlan_group *vg;
+ struct list_head *headp, *hpos;
+ struct net_bridge_vlan *vent;
+
+ if (br_vlan_is_master(v))
+ vg = br_vlan_group(v->br);
+ else
+ vg = nbp_vlan_group(v->port);
+
+ headp = &vg->vlan_list;
+ list_for_each_prev(hpos, headp) {
+ vent = list_entry(hpos, struct net_bridge_vlan, vlist);
+ if (v->vid < vent->vid)
+ continue;
+ else
+ break;
+ }
+ list_add_rcu(&v->vlist, hpos);
+}
+
+static void __vlan_del_list(struct net_bridge_vlan *v)
+{
+ list_del_rcu(&v->vlist);
+}
+
+static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
+ u16 vid)
+{
+ struct switchdev_obj_port_vlan v = {
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .vid_begin = vid,
+ .vid_end = vid,
+ };
int err;
- if (test_bit(vid, v->vlan_bitmap)) {
- __vlan_add_flags(v, vid, flags);
+ /* Try switchdev op first. In case it is not supported, fallback to
+ * 8021q del.
+ */
+ err = switchdev_port_obj_del(dev, &v.obj);
+ if (err == -EOPNOTSUPP) {
+ vlan_vid_del(dev, br->vlan_proto, vid);
return 0;
}
+ return err;
+}
+
+/* Returns a master vlan, if it didn't exist it gets created. In all cases a
+ * reference is taken to the master vlan before returning.
+ */
+static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *masterv;
+
+ vg = br_vlan_group(br);
+ masterv = br_vlan_find(vg, vid);
+ if (!masterv) {
+ /* missing global ctx, create it now */
+ if (br_vlan_add(br, vid, 0))
+ return NULL;
+ masterv = br_vlan_find(vg, vid);
+ if (WARN_ON(!masterv))
+ return NULL;
+ }
+ atomic_inc(&masterv->refcnt);
+
+ return masterv;
+}
+
+static void br_vlan_put_master(struct net_bridge_vlan *masterv)
+{
+ struct net_bridge_vlan_group *vg;
+
+ if (!br_vlan_is_master(masterv))
+ return;
+
+ vg = br_vlan_group(masterv->br);
+ if (atomic_dec_and_test(&masterv->refcnt)) {
+ rhashtable_remove_fast(&vg->vlan_hash,
+ &masterv->vnode, br_vlan_rht_params);
+ __vlan_del_list(masterv);
+ kfree_rcu(masterv, rcu);
+ }
+}
- if (v->port_idx) {
- p = v->parent.port;
+/* This is the shared VLAN add function which works for both ports and bridge
+ * devices. There are four possible calls to this function in terms of the
+ * vlan entry type:
+ * 1. vlan is being added on a port (no master flags, global entry exists)
+ * 2. vlan is being added on a bridge (both master and brentry flags)
+ * 3. vlan is being added on a port, but a global entry didn't exist which
+ * is being created right now (master flag set, brentry flag unset), the
+ * global entry is used for global per-vlan features, but not for filtering
+ * 4. same as 3 but with both master and brentry flags set so the entry
+ * will be used for filtering in both the port and the bridge
+ */
+static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
+{
+ struct net_bridge_vlan *masterv = NULL;
+ struct net_bridge_port *p = NULL;
+ struct net_bridge_vlan_group *vg;
+ struct net_device *dev;
+ struct net_bridge *br;
+ int err;
+
+ if (br_vlan_is_master(v)) {
+ br = v->br;
+ dev = br->dev;
+ vg = br_vlan_group(br);
+ } else {
+ p = v->port;
br = p->br;
dev = p->dev;
- } else {
- br = v->parent.br;
- dev = br->dev;
+ vg = nbp_vlan_group(p);
}
if (p) {
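__vlan_add_list() in the hunk above keeps the per-group VLAN list sorted by vid by walking backwards from the tail and linking the new entry after the first vid that is not larger. A userspace sketch of the same ordered insert, using plain prev/next pointers instead of list_head/RCU:

/* Userspace sketch of the ordered insert done by __vlan_add_list(). */
#include <stdio.h>
#include <stdlib.h>

struct vlan {
	unsigned short vid;
	struct vlan *prev, *next;
};

struct vlan_list { struct vlan *head, *tail; };

static void vlan_add_sorted(struct vlan_list *l, struct vlan *v)
{
	struct vlan *pos = l->tail;

	while (pos && pos->vid > v->vid)	/* walk backwards, like list_for_each_prev() */
		pos = pos->prev;

	v->prev = pos;
	v->next = pos ? pos->next : l->head;
	if (v->next)
		v->next->prev = v;
	else
		l->tail = v;
	if (pos)
		pos->next = v;
	else
		l->head = v;
}

int main(void)
{
	struct vlan_list l = { NULL, NULL };
	unsigned short vids[] = { 100, 10, 4094, 1 };
	struct vlan *v;

	for (size_t i = 0; i < sizeof(vids) / sizeof(vids[0]); i++) {
		v = calloc(1, sizeof(*v));
		v->vid = vids[i];
		vlan_add_sorted(&l, v);
	}
	for (v = l.head; v; v = v->next)
		printf("%u ", v->vid);
	printf("\n");	/* prints: 1 10 100 4094 */
	return 0;
}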
@@ -62,83 +212,140 @@ static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags)
* This ensures tagged traffic enters the bridge when
* promiscuous mode is disabled by br_manage_promisc().
*/
- err = vlan_vid_add(dev, br->vlan_proto, vid);
+ err = __vlan_vid_add(dev, br, v->vid, flags);
if (err)
- return err;
+ goto out;
+
+ /* need to work on the master vlan too */
+ if (flags & BRIDGE_VLAN_INFO_MASTER) {
+ err = br_vlan_add(br, v->vid, flags |
+ BRIDGE_VLAN_INFO_BRENTRY);
+ if (err)
+ goto out_filt;
+ }
+
+ masterv = br_vlan_get_master(br, v->vid);
+ if (!masterv)
+ goto out_filt;
+ v->brvlan = masterv;
}
- err = br_fdb_insert(br, p, dev->dev_addr, vid);
- if (err) {
- br_err(br, "failed insert local address into bridge "
- "forwarding table\n");
- goto out_filt;
+ /* Add the dev mac and count the vlan only if it's usable */
+ if (br_vlan_should_use(v)) {
+ err = br_fdb_insert(br, p, dev->dev_addr, v->vid);
+ if (err) {
+ br_err(br, "failed insert local address into bridge forwarding table\n");
+ goto out_filt;
+ }
+ vg->num_vlans++;
}
- set_bit(vid, v->vlan_bitmap);
- v->num_vlans++;
- __vlan_add_flags(v, vid, flags);
+ err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode,
+ br_vlan_rht_params);
+ if (err)
+ goto out_fdb_insert;
- return 0;
+ __vlan_add_list(v);
+ __vlan_add_flags(v, flags);
+out:
+ return err;
+
+out_fdb_insert:
+ if (br_vlan_should_use(v)) {
+ br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid);
+ vg->num_vlans--;
+ }
out_filt:
- if (p)
- vlan_vid_del(dev, br->vlan_proto, vid);
- return err;
+ if (p) {
+ __vlan_vid_del(dev, br, v->vid);
+ if (masterv) {
+ br_vlan_put_master(masterv);
+ v->brvlan = NULL;
+ }
+ }
+
+ goto out;
}
-static int __vlan_del(struct net_port_vlans *v, u16 vid)
+static int __vlan_del(struct net_bridge_vlan *v)
{
- if (!test_bit(vid, v->vlan_bitmap))
- return -EINVAL;
+ struct net_bridge_vlan *masterv = v;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
+ int err = 0;
+
+ if (br_vlan_is_master(v)) {
+ vg = br_vlan_group(v->br);
+ } else {
+ p = v->port;
+ vg = nbp_vlan_group(v->port);
+ masterv = v->brvlan;
+ }
- __vlan_delete_pvid(v, vid);
- clear_bit(vid, v->untagged_bitmap);
+ __vlan_delete_pvid(vg, v->vid);
+ if (p) {
+ err = __vlan_vid_del(p->dev, p->br, v->vid);
+ if (err)
+ goto out;
+ }
- if (v->port_idx) {
- struct net_bridge_port *p = v->parent.port;
- vlan_vid_del(p->dev, p->br->vlan_proto, vid);
+ if (br_vlan_should_use(v)) {
+ v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY;
+ vg->num_vlans--;
}
- clear_bit(vid, v->vlan_bitmap);
- v->num_vlans--;
- if (bitmap_empty(v->vlan_bitmap, VLAN_N_VID)) {
- if (v->port_idx)
- RCU_INIT_POINTER(v->parent.port->vlan_info, NULL);
- else
- RCU_INIT_POINTER(v->parent.br->vlan_info, NULL);
+ if (masterv != v) {
+ rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
+ br_vlan_rht_params);
+ __vlan_del_list(v);
kfree_rcu(v, rcu);
}
- return 0;
+
+ br_vlan_put_master(masterv);
+out:
+ return err;
}
-static void __vlan_flush(struct net_port_vlans *v)
+static void __vlan_group_free(struct net_bridge_vlan_group *vg)
{
- smp_wmb();
- v->pvid = 0;
- bitmap_zero(v->vlan_bitmap, VLAN_N_VID);
- if (v->port_idx)
- RCU_INIT_POINTER(v->parent.port->vlan_info, NULL);
- else
- RCU_INIT_POINTER(v->parent.br->vlan_info, NULL);
- kfree_rcu(v, rcu);
+ WARN_ON(!list_empty(&vg->vlan_list));
+ rhashtable_destroy(&vg->vlan_hash);
+ kfree(vg);
+}
+
+static void __vlan_flush(struct net_bridge_vlan_group *vg)
+{
+ struct net_bridge_vlan *vlan, *tmp;
+
+ __vlan_delete_pvid(vg, vg->pvid);
+ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist)
+ __vlan_del(vlan);
}
struct sk_buff *br_handle_vlan(struct net_bridge *br,
- const struct net_port_vlans *pv,
+ struct net_bridge_vlan_group *vg,
struct sk_buff *skb)
{
+ struct net_bridge_vlan *v;
u16 vid;
/* If this packet was not filtered at input, let it pass */
if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
goto out;
- /* Vlan filter table must be configured at this point. The
+ /* At this point, we know that the frame was filtered and contains
+ * a valid vlan id. If the vlan id has untagged flag set,
+ * send untagged; otherwise, send tagged.
+ */
+ br_vlan_get_tag(skb, &vid);
+ v = br_vlan_find(vg, vid);
+ /* Vlan entry must be configured at this point. The
* only exception is the bridge is set in promisc mode and the
* packet is destined for the bridge device. In this case
* pass the packet as is.
*/
- if (!pv) {
+ if (!v || !br_vlan_should_use(v)) {
if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) {
goto out;
} else {
@@ -146,13 +353,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
return NULL;
}
}
-
- /* At this point, we know that the frame was filtered and contains
- * a valid vlan id. If the vlan id is set in the untagged bitmap,
- * send untagged; otherwise, send tagged.
- */
- br_vlan_get_tag(skb, &vid);
- if (test_bit(vid, pv->untagged_bitmap))
+ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
skb->vlan_tci = 0;
out:
@@ -160,29 +361,13 @@ out:
}
/* Called under RCU */
-bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
- struct sk_buff *skb, u16 *vid)
+static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto,
+ struct sk_buff *skb, u16 *vid)
{
+ const struct net_bridge_vlan *v;
bool tagged;
- __be16 proto;
-
- /* If VLAN filtering is disabled on the bridge, all packets are
- * permitted.
- */
- if (!br->vlan_enabled) {
- BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
- return true;
- }
-
- /* If there are no vlan in the permitted list, all packets are
- * rejected.
- */
- if (!v)
- goto drop;
BR_INPUT_SKB_CB(skb)->vlan_filtered = true;
- proto = br->vlan_proto;
-
/* If vlan tx offload is disabled on bridge device and frame was
* sent from vlan device on the bridge device, it does not have
* HW accelerated vlan tag.
@@ -217,7 +402,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
}
if (!*vid) {
- u16 pvid = br_get_pvid(v);
+ u16 pvid = br_get_pvid(vg);
/* Frame had a tag with VID 0 or did not have a tag.
* See if pvid is set on this port. That tells us which
@@ -245,29 +430,43 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
}
/* Frame had a valid vlan tag. See if vlan is allowed */
- if (test_bit(*vid, v->vlan_bitmap))
+ v = br_vlan_find(vg, *vid);
+ if (v && br_vlan_should_use(v))
return true;
drop:
kfree_skb(skb);
return false;
}
+bool br_allowed_ingress(const struct net_bridge *br,
+ struct net_bridge_vlan_group *vg, struct sk_buff *skb,
+ u16 *vid)
+{
+ /* If VLAN filtering is disabled on the bridge, all packets are
+ * permitted.
+ */
+ if (!br->vlan_enabled) {
+ BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
+ return true;
+ }
+
+ return __allowed_ingress(vg, br->vlan_proto, skb, vid);
+}
+
/* Called under RCU. */
-bool br_allowed_egress(struct net_bridge *br,
- const struct net_port_vlans *v,
+bool br_allowed_egress(struct net_bridge_vlan_group *vg,
const struct sk_buff *skb)
{
+ const struct net_bridge_vlan *v;
u16 vid;
/* If this packet was not filtered at input, let it pass */
if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
return true;
- if (!v)
- return false;
-
br_vlan_get_tag(skb, &vid);
- if (test_bit(vid, v->vlan_bitmap))
+ v = br_vlan_find(vg, vid);
+ if (v && br_vlan_should_use(v))
return true;
return false;
@@ -276,29 +475,29 @@ bool br_allowed_egress(struct net_bridge *br,
/* Called under RCU */
bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
{
+ struct net_bridge_vlan_group *vg;
struct net_bridge *br = p->br;
- struct net_port_vlans *v;
/* If filtering was disabled at input, let it pass. */
if (!br->vlan_enabled)
return true;
- v = rcu_dereference(p->vlan_info);
- if (!v)
+ vg = nbp_vlan_group_rcu(p);
+ if (!vg || !vg->num_vlans)
return false;
if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto)
*vid = 0;
if (!*vid) {
- *vid = br_get_pvid(v);
+ *vid = br_get_pvid(vg);
if (!*vid)
return false;
return true;
}
- if (test_bit(*vid, v->vlan_bitmap))
+ if (br_vlan_find(vg, *vid))
return true;
return false;
@@ -309,31 +508,49 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
*/
int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags)
{
- struct net_port_vlans *pv = NULL;
- int err;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+ int ret;
ASSERT_RTNL();
- pv = rtnl_dereference(br->vlan_info);
- if (pv)
- return __vlan_add(pv, vid, flags);
+ vg = br_vlan_group(br);
+ vlan = br_vlan_find(vg, vid);
+ if (vlan) {
+ if (!br_vlan_is_brentry(vlan)) {
+ /* Trying to change flags of non-existent bridge vlan */
+ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
+ return -EINVAL;
+ /* It was only kept for port vlans, now make it real */
+ ret = br_fdb_insert(br, NULL, br->dev->dev_addr,
+ vlan->vid);
+ if (ret) {
+ br_err(br, "failed insert local address into bridge forwarding table\n");
+ return ret;
+ }
+ atomic_inc(&vlan->refcnt);
+ vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
+ vg->num_vlans++;
+ }
+ __vlan_add_flags(vlan, flags);
+ return 0;
+ }
- /* Create port vlan infomration
- */
- pv = kzalloc(sizeof(*pv), GFP_KERNEL);
- if (!pv)
+ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
+ if (!vlan)
return -ENOMEM;
- pv->parent.br = br;
- err = __vlan_add(pv, vid, flags);
- if (err)
- goto out;
-
- rcu_assign_pointer(br->vlan_info, pv);
- return 0;
-out:
- kfree(pv);
- return err;
+ vlan->vid = vid;
+ vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER;
+ vlan->flags &= ~BRIDGE_VLAN_INFO_PVID;
+ vlan->br = br;
+ if (flags & BRIDGE_VLAN_INFO_BRENTRY)
+ atomic_set(&vlan->refcnt, 1);
+ ret = __vlan_add(vlan, flags);
+ if (ret)
+ kfree(vlan);
+
+ return ret;
}
/* Must be protected by RTNL.
@@ -341,49 +558,41 @@ out:
*/
int br_vlan_delete(struct net_bridge *br, u16 vid)
{
- struct net_port_vlans *pv;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
ASSERT_RTNL();
- pv = rtnl_dereference(br->vlan_info);
- if (!pv)
- return -EINVAL;
+ vg = br_vlan_group(br);
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_is_brentry(v))
+ return -ENOENT;
br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
+ br_fdb_delete_by_port(br, NULL, vid, 0);
- __vlan_del(pv, vid);
- return 0;
+ return __vlan_del(v);
}
void br_vlan_flush(struct net_bridge *br)
{
- struct net_port_vlans *pv;
+ struct net_bridge_vlan_group *vg;
ASSERT_RTNL();
- pv = rtnl_dereference(br->vlan_info);
- if (!pv)
- return;
- __vlan_flush(pv);
+ vg = br_vlan_group(br);
+ __vlan_flush(vg);
+ RCU_INIT_POINTER(br->vlgrp, NULL);
+ synchronize_rcu();
+ __vlan_group_free(vg);
}
-bool br_vlan_find(struct net_bridge *br, u16 vid)
+struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid)
{
- struct net_port_vlans *pv;
- bool found = false;
-
- rcu_read_lock();
- pv = rcu_dereference(br->vlan_info);
+ if (!vg)
+ return NULL;
- if (!pv)
- goto out;
-
- if (test_bit(vid, pv->vlan_bitmap))
- found = true;
-
-out:
- rcu_read_unlock();
- return found;
+ return br_vlan_lookup(&vg->vlan_hash, vid);
}
/* Must be protected by RTNL. */
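br_vlan_find() above replaces the old test_bit() check on a 4096-bit bitmap with a hash lookup that returns a per-VLAN object, so flags such as untagged travel with the entry. A rough stand-in using a tiny fixed-size hash table instead of rhashtable:

/* Stand-in only: insert-only open addressing, no deletion handling. */
#include <stdio.h>

#define FLAG_UNTAGGED	0x1
#define HASH_SIZE	64

struct vlan_entry {
	unsigned short vid;
	unsigned short flags;
	int in_use;
};

static struct vlan_entry table[HASH_SIZE];

static struct vlan_entry *vlan_find(unsigned short vid)
{
	unsigned int i, slot = vid % HASH_SIZE;

	for (i = 0; i < HASH_SIZE; i++) {
		struct vlan_entry *e = &table[(slot + i) % HASH_SIZE];

		if (!e->in_use)
			return NULL;
		if (e->vid == vid)
			return e;
	}
	return NULL;
}

static int vlan_add(unsigned short vid, unsigned short flags)
{
	unsigned int i, slot = vid % HASH_SIZE;

	for (i = 0; i < HASH_SIZE; i++) {
		struct vlan_entry *e = &table[(slot + i) % HASH_SIZE];

		if (!e->in_use) {
			e->vid = vid;
			e->flags = flags;
			e->in_use = 1;
			return 0;
		}
	}
	return -1;	/* table full */
}

int main(void)
{
	struct vlan_entry *e;

	vlan_add(1, FLAG_UNTAGGED);
	vlan_add(100, 0);

	e = vlan_find(1);
	printf("vid 1:    %s, %s\n", e ? "found" : "missing",
	       e && (e->flags & FLAG_UNTAGGED) ? "untagged" : "tagged");
	e = vlan_find(4094);
	printf("vid 4094: %s\n", e ? "found" : "missing");
	return 0;
}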
@@ -413,50 +622,46 @@ void br_recalculate_fwd_mask(struct net_bridge *br)
~(1u << br->group_addr[5]);
}
-int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
+int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
{
- if (!rtnl_trylock())
- return restart_syscall();
-
if (br->vlan_enabled == val)
- goto unlock;
+ return 0;
br->vlan_enabled = val;
br_manage_promisc(br);
recalculate_group_addr(br);
br_recalculate_fwd_mask(br);
-unlock:
+ return 0;
+}
+
+int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
+{
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ __br_vlan_filter_toggle(br, val);
rtnl_unlock();
+
return 0;
}
-int br_vlan_set_proto(struct net_bridge *br, unsigned long val)
+int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
{
int err = 0;
struct net_bridge_port *p;
- struct net_port_vlans *pv;
- __be16 proto, oldproto;
- u16 vid, errvid;
+ struct net_bridge_vlan *vlan;
+ struct net_bridge_vlan_group *vg;
+ __be16 oldproto;
- if (val != ETH_P_8021Q && val != ETH_P_8021AD)
- return -EPROTONOSUPPORT;
-
- if (!rtnl_trylock())
- return restart_syscall();
-
- proto = htons(val);
if (br->vlan_proto == proto)
- goto unlock;
+ return 0;
/* Add VLANs for the new proto to the device filter. */
list_for_each_entry(p, &br->port_list, list) {
- pv = rtnl_dereference(p->vlan_info);
- if (!pv)
- continue;
-
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
- err = vlan_vid_add(p->dev, proto, vid);
+ vg = nbp_vlan_group(p);
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ err = vlan_vid_add(p->dev, proto, vlan->vid);
if (err)
goto err_filt;
}
@@ -470,38 +675,55 @@ int br_vlan_set_proto(struct net_bridge *br, unsigned long val)
/* Delete VLANs for the old proto from the device filter. */
list_for_each_entry(p, &br->port_list, list) {
- pv = rtnl_dereference(p->vlan_info);
- if (!pv)
- continue;
-
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
- vlan_vid_del(p->dev, oldproto, vid);
+ vg = nbp_vlan_group(p);
+ list_for_each_entry(vlan, &vg->vlan_list, vlist)
+ vlan_vid_del(p->dev, oldproto, vlan->vid);
}
-unlock:
- rtnl_unlock();
- return err;
+ return 0;
err_filt:
- errvid = vid;
- for_each_set_bit(vid, pv->vlan_bitmap, errvid)
- vlan_vid_del(p->dev, proto, vid);
+ list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist)
+ vlan_vid_del(p->dev, proto, vlan->vid);
list_for_each_entry_continue_reverse(p, &br->port_list, list) {
- pv = rtnl_dereference(p->vlan_info);
- if (!pv)
- continue;
-
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
- vlan_vid_del(p->dev, proto, vid);
+ vg = nbp_vlan_group(p);
+ list_for_each_entry(vlan, &vg->vlan_list, vlist)
+ vlan_vid_del(p->dev, proto, vlan->vid);
}
- goto unlock;
+ return err;
+}
+
+int br_vlan_set_proto(struct net_bridge *br, unsigned long val)
+{
+ int err;
+
+ if (val != ETH_P_8021Q && val != ETH_P_8021AD)
+ return -EPROTONOSUPPORT;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ err = __br_vlan_set_proto(br, htons(val));
+ rtnl_unlock();
+
+ return err;
}
-static bool vlan_default_pvid(struct net_port_vlans *pv, u16 vid)
+static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid)
{
- return pv && vid == pv->pvid && test_bit(vid, pv->untagged_bitmap);
+ struct net_bridge_vlan *v;
+
+ if (vid != vg->pvid)
+ return false;
+
+ v = br_vlan_lookup(&vg->vlan_hash, vid);
+ if (v && br_vlan_should_use(v) &&
+ (v->flags & BRIDGE_VLAN_INFO_UNTAGGED))
+ return true;
+
+ return false;
}
static void br_vlan_disable_default_pvid(struct net_bridge *br)
@@ -512,24 +734,31 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br)
/* Disable default_pvid on all ports where it is still
* configured.
*/
- if (vlan_default_pvid(br_get_vlan_info(br), pvid))
+ if (vlan_default_pvid(br_vlan_group(br), pvid))
br_vlan_delete(br, pvid);
list_for_each_entry(p, &br->port_list, list) {
- if (vlan_default_pvid(nbp_get_vlan_info(p), pvid))
+ if (vlan_default_pvid(nbp_vlan_group(p), pvid))
nbp_vlan_delete(p, pvid);
}
br->default_pvid = 0;
}
-static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
+int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
{
+ const struct net_bridge_vlan *pvent;
+ struct net_bridge_vlan_group *vg;
struct net_bridge_port *p;
u16 old_pvid;
int err = 0;
unsigned long *changed;
+ if (!pvid) {
+ br_vlan_disable_default_pvid(br);
+ return 0;
+ }
+
changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long),
GFP_KERNEL);
if (!changed)
@@ -540,11 +769,14 @@ static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
/* Update default_pvid config only if we do not conflict with
* user configuration.
*/
- if ((!old_pvid || vlan_default_pvid(br_get_vlan_info(br), old_pvid)) &&
- !br_vlan_find(br, pvid)) {
+ vg = br_vlan_group(br);
+ pvent = br_vlan_find(vg, pvid);
+ if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) &&
+ (!pvent || !br_vlan_should_use(pvent))) {
err = br_vlan_add(br, pvid,
BRIDGE_VLAN_INFO_PVID |
- BRIDGE_VLAN_INFO_UNTAGGED);
+ BRIDGE_VLAN_INFO_UNTAGGED |
+ BRIDGE_VLAN_INFO_BRENTRY);
if (err)
goto out;
br_vlan_delete(br, old_pvid);
@@ -555,9 +787,10 @@ static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
/* Update default_pvid config only if we do not conflict with
* user configuration.
*/
+ vg = nbp_vlan_group(p);
if ((old_pvid &&
- !vlan_default_pvid(nbp_get_vlan_info(p), old_pvid)) ||
- nbp_vlan_find(p, pvid))
+ !vlan_default_pvid(vg, old_pvid)) ||
+ br_vlan_find(vg, pvid))
continue;
err = nbp_vlan_add(p, pvid,
@@ -591,7 +824,8 @@ err_port:
if (old_pvid)
br_vlan_add(br, old_pvid,
BRIDGE_VLAN_INFO_PVID |
- BRIDGE_VLAN_INFO_UNTAGGED);
+ BRIDGE_VLAN_INFO_UNTAGGED |
+ BRIDGE_VLAN_INFO_BRENTRY);
br_vlan_delete(br, pvid);
}
goto out;
@@ -617,12 +851,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
err = -EPERM;
goto unlock;
}
-
- if (!pvid)
- br_vlan_disable_default_pvid(br);
- else
- err = __br_vlan_set_default_pvid(br, pvid);
-
+ err = __br_vlan_set_default_pvid(br, pvid);
unlock:
rtnl_unlock();
return err;
@@ -630,10 +859,68 @@ unlock:
int br_vlan_init(struct net_bridge *br)
{
+ struct net_bridge_vlan_group *vg;
+ int ret = -ENOMEM;
+
+ vg = kzalloc(sizeof(*vg), GFP_KERNEL);
+ if (!vg)
+ goto out;
+ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
+ if (ret)
+ goto err_rhtbl;
+ INIT_LIST_HEAD(&vg->vlan_list);
br->vlan_proto = htons(ETH_P_8021Q);
br->default_pvid = 1;
- return br_vlan_add(br, 1,
- BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED);
+ rcu_assign_pointer(br->vlgrp, vg);
+ ret = br_vlan_add(br, 1,
+ BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED |
+ BRIDGE_VLAN_INFO_BRENTRY);
+ if (ret)
+ goto err_vlan_add;
+
+out:
+ return ret;
+
+err_vlan_add:
+ rhashtable_destroy(&vg->vlan_hash);
+err_rhtbl:
+ kfree(vg);
+
+ goto out;
+}
+
+int nbp_vlan_init(struct net_bridge_port *p)
+{
+ struct net_bridge_vlan_group *vg;
+ int ret = -ENOMEM;
+
+ vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL);
+ if (!vg)
+ goto out;
+
+ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
+ if (ret)
+ goto err_rhtbl;
+ INIT_LIST_HEAD(&vg->vlan_list);
+ rcu_assign_pointer(p->vlgrp, vg);
+ if (p->br->default_pvid) {
+ ret = nbp_vlan_add(p, p->br->default_pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED);
+ if (ret)
+ goto err_vlan_add;
+ }
+out:
+ return ret;
+
+err_vlan_add:
+ RCU_INIT_POINTER(p->vlgrp, NULL);
+ synchronize_rcu();
+ rhashtable_destroy(&vg->vlan_hash);
+err_rhtbl:
+ kfree(vg);
+
+ goto out;
}
/* Must be protected by RTNL.
@@ -641,35 +928,28 @@ int br_vlan_init(struct net_bridge *br)
*/
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags)
{
- struct net_port_vlans *pv = NULL;
- int err;
+ struct net_bridge_vlan *vlan;
+ int ret;
ASSERT_RTNL();
- pv = rtnl_dereference(port->vlan_info);
- if (pv)
- return __vlan_add(pv, vid, flags);
-
- /* Create port vlan infomration
- */
- pv = kzalloc(sizeof(*pv), GFP_KERNEL);
- if (!pv) {
- err = -ENOMEM;
- goto clean_up;
+ vlan = br_vlan_find(nbp_vlan_group(port), vid);
+ if (vlan) {
+ __vlan_add_flags(vlan, flags);
+ return 0;
}
- pv->port_idx = port->port_no;
- pv->parent.port = port;
- err = __vlan_add(pv, vid, flags);
- if (err)
- goto clean_up;
+ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
+ if (!vlan)
+ return -ENOMEM;
- rcu_assign_pointer(port->vlan_info, pv);
- return 0;
+ vlan->vid = vid;
+ vlan->port = port;
+ ret = __vlan_add(vlan, flags);
+ if (ret)
+ kfree(vlan);
-clean_up:
- kfree(pv);
- return err;
+ return ret;
}
/* Must be protected by RTNL.
@@ -677,60 +957,28 @@ clean_up:
*/
int nbp_vlan_delete(struct net_bridge_port *port, u16 vid)
{
- struct net_port_vlans *pv;
+ struct net_bridge_vlan *v;
ASSERT_RTNL();
- pv = rtnl_dereference(port->vlan_info);
- if (!pv)
- return -EINVAL;
-
+ v = br_vlan_find(nbp_vlan_group(port), vid);
+ if (!v)
+ return -ENOENT;
br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid);
+ br_fdb_delete_by_port(port->br, port, vid, 0);
- return __vlan_del(pv, vid);
+ return __vlan_del(v);
}
void nbp_vlan_flush(struct net_bridge_port *port)
{
- struct net_port_vlans *pv;
- u16 vid;
+ struct net_bridge_vlan_group *vg;
ASSERT_RTNL();
- pv = rtnl_dereference(port->vlan_info);
- if (!pv)
- return;
-
- for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
- vlan_vid_del(port->dev, port->br->vlan_proto, vid);
-
- __vlan_flush(pv);
-}
-
-bool nbp_vlan_find(struct net_bridge_port *port, u16 vid)
-{
- struct net_port_vlans *pv;
- bool found = false;
-
- rcu_read_lock();
- pv = rcu_dereference(port->vlan_info);
-
- if (!pv)
- goto out;
-
- if (test_bit(vid, pv->vlan_bitmap))
- found = true;
-
-out:
- rcu_read_unlock();
- return found;
-}
-
-int nbp_vlan_init(struct net_bridge_port *p)
-{
- return p->br->default_pvid ?
- nbp_vlan_add(p, p->br->default_pvid,
- BRIDGE_VLAN_INFO_PVID |
- BRIDGE_VLAN_INFO_UNTAGGED) :
- 0;
+ vg = nbp_vlan_group(port);
+ __vlan_flush(vg);
+ RCU_INIT_POINTER(port->vlgrp, NULL);
+ synchronize_rcu();
+ __vlan_group_free(vg);
}
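The br_vlan.c rework above pairs br_vlan_get_master() and br_vlan_put_master() so that port VLANs share one refcounted bridge-level entry. A simplified model of that get/put lifetime, with a plain int standing in for atomic_t and kfree_rcu, and the brentry/filtering details left out:

/* Simplified model: first user creates the entry, later users bump the
 * refcount, last put frees it.
 */
#include <stdio.h>
#include <stdlib.h>

struct master_vlan {
	unsigned short vid;
	int refcnt;
};

static struct master_vlan *master;	/* single slot, sketch only */

static struct master_vlan *get_master(unsigned short vid)
{
	if (!master) {
		master = calloc(1, sizeof(*master));
		if (!master)
			return NULL;
		master->vid = vid;
	}
	master->refcnt++;
	return master;
}

static void put_master(struct master_vlan *m)
{
	if (--m->refcnt == 0) {
		printf("vid %u: last reference dropped, freeing\n", m->vid);
		free(m);
		master = NULL;
	}
}

int main(void)
{
	struct master_vlan *a = get_master(100);	/* port 1 */
	struct master_vlan *b = get_master(100);	/* port 2 shares it */

	printf("vid %u refcnt=%d\n", a->vid, a->refcnt);
	put_master(a);
	put_master(b);
	return 0;
}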
diff --git a/kernel/net/bridge/netfilter/ebt_log.c b/kernel/net/bridge/netfilter/ebt_log.c
index 17f2e4bc2..0ad639a96 100644
--- a/kernel/net/bridge/netfilter/ebt_log.c
+++ b/kernel/net/bridge/netfilter/ebt_log.c
@@ -180,7 +180,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_log_info *info = par->targinfo;
struct nf_loginfo li;
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = info->loglevel;
diff --git a/kernel/net/bridge/netfilter/ebt_nflog.c b/kernel/net/bridge/netfilter/ebt_nflog.c
index 59ac79520..548161506 100644
--- a/kernel/net/bridge/netfilter/ebt_nflog.c
+++ b/kernel/net/bridge/netfilter/ebt_nflog.c
@@ -24,7 +24,7 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nflog_info *info = par->targinfo;
struct nf_loginfo li;
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
diff --git a/kernel/net/bridge/netfilter/ebt_stp.c b/kernel/net/bridge/netfilter/ebt_stp.c
index 071d87214..0c4057006 100644
--- a/kernel/net/bridge/netfilter/ebt_stp.c
+++ b/kernel/net/bridge/netfilter/ebt_stp.c
@@ -164,8 +164,10 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
!(info->bitmask & EBT_STP_MASK))
return -EINVAL;
/* Make sure the match only receives stp frames */
- if (!ether_addr_equal(e->destmac, bridge_ula) ||
- !ether_addr_equal(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC))
+ if (!par->nft_compat &&
+ (!ether_addr_equal(e->destmac, bridge_ula) ||
+ !ether_addr_equal(e->destmsk, msk) ||
+ !(e->bitmask & EBT_DESTMAC)))
return -EINVAL;
return 0;
diff --git a/kernel/net/bridge/netfilter/ebtable_broute.c b/kernel/net/bridge/netfilter/ebtable_broute.c
index d2cdf5d6e..ec94c6f1a 100644
--- a/kernel/net/bridge/netfilter/ebtable_broute.c
+++ b/kernel/net/bridge/netfilter/ebtable_broute.c
@@ -50,10 +50,14 @@ static const struct ebt_table broute_table = {
static int ebt_broute(struct sk_buff *skb)
{
+ struct nf_hook_state state;
int ret;
- ret = ebt_do_table(NF_BR_BROUTING, skb, skb->dev, NULL,
- dev_net(skb->dev)->xt.broute_table);
+ nf_hook_state_init(&state, NULL, NF_BR_BROUTING, INT_MIN,
+ NFPROTO_BRIDGE, skb->dev, NULL, NULL,
+ dev_net(skb->dev), NULL);
+
+ ret = ebt_do_table(skb, &state, state.net->xt.broute_table);
if (ret == NF_DROP)
return 1; /* route it */
return 0; /* bridge it */
diff --git a/kernel/net/bridge/netfilter/ebtable_filter.c b/kernel/net/bridge/netfilter/ebtable_filter.c
index 8a3f63b2e..32eccd101 100644
--- a/kernel/net/bridge/netfilter/ebtable_filter.c
+++ b/kernel/net/bridge/netfilter/ebtable_filter.c
@@ -57,39 +57,34 @@ static const struct ebt_table frame_filter = {
};
static unsigned int
-ebt_in_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ebt_in_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ebt_do_table(ops->hooknum, skb, state->in, state->out,
- dev_net(state->in)->xt.frame_filter);
+ return ebt_do_table(skb, state, state->net->xt.frame_filter);
}
static unsigned int
-ebt_out_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ebt_out_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ebt_do_table(ops->hooknum, skb, state->in, state->out,
- dev_net(state->out)->xt.frame_filter);
+ return ebt_do_table(skb, state, state->net->xt.frame_filter);
}
static struct nf_hook_ops ebt_ops_filter[] __read_mostly = {
{
.hook = ebt_in_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_IN,
.priority = NF_BR_PRI_FILTER_BRIDGED,
},
{
.hook = ebt_in_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_FILTER_BRIDGED,
},
{
.hook = ebt_out_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_FILTER_OTHER,
diff --git a/kernel/net/bridge/netfilter/ebtable_nat.c b/kernel/net/bridge/netfilter/ebtable_nat.c
index c5ef5b1ab..ec55358f0 100644
--- a/kernel/net/bridge/netfilter/ebtable_nat.c
+++ b/kernel/net/bridge/netfilter/ebtable_nat.c
@@ -57,39 +57,34 @@ static struct ebt_table frame_nat = {
};
static unsigned int
-ebt_nat_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ebt_nat_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ebt_do_table(ops->hooknum, skb, state->in, state->out,
- dev_net(state->in)->xt.frame_nat);
+ return ebt_do_table(skb, state, state->net->xt.frame_nat);
}
static unsigned int
-ebt_nat_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ebt_nat_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ebt_do_table(ops->hooknum, skb, state->in, state->out,
- dev_net(state->out)->xt.frame_nat);
+ return ebt_do_table(skb, state, state->net->xt.frame_nat);
}
static struct nf_hook_ops ebt_ops_nat[] __read_mostly = {
{
.hook = ebt_nat_out,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_NAT_DST_OTHER,
},
{
.hook = ebt_nat_out,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,
.priority = NF_BR_PRI_NAT_SRC,
},
{
.hook = ebt_nat_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_NAT_DST_BRIDGED,
diff --git a/kernel/net/bridge/netfilter/ebtables.c b/kernel/net/bridge/netfilter/ebtables.c
index 91180a7fc..f46ca417b 100644
--- a/kernel/net/bridge/netfilter/ebtables.c
+++ b/kernel/net/bridge/netfilter/ebtables.c
@@ -6,7 +6,7 @@
*
* ebtables.c,v 2.0, July, 2002
*
- * This code is stongly inspired on the iptables code which is
+ * This code is strongly inspired by the iptables code which is
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
*
* This program is free software; you can redistribute it and/or
@@ -139,7 +139,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
ethproto = h->h_proto;
if (e->bitmask & EBT_802_3) {
- if (FWINV2(ntohs(ethproto) >= ETH_P_802_3_MIN, EBT_IPROTO))
+ if (FWINV2(eth_proto_is_802_3(ethproto), EBT_IPROTO))
return 1;
} else if (!(e->bitmask & EBT_NOPROTO) &&
FWINV2(e->ethproto != ethproto, EBT_IPROTO))
@@ -176,17 +176,18 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
return 0;
}
-static inline __pure
+static inline
struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry)
{
return (void *)entry + entry->next_offset;
}
/* Do some firewalling */
-unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- struct ebt_table *table)
+unsigned int ebt_do_table(struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct ebt_table *table)
{
+ unsigned int hook = state->hook;
int i, nentries;
struct ebt_entry *point;
struct ebt_counter *counter_base, *cb_base;
@@ -199,8 +200,9 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
struct xt_action_param acpar;
acpar.family = NFPROTO_BRIDGE;
- acpar.in = in;
- acpar.out = out;
+ acpar.net = state->net;
+ acpar.in = state->in;
+ acpar.out = state->out;
acpar.hotdrop = false;
acpar.hooknum = hook;
@@ -220,7 +222,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
base = private->entries;
i = 0;
while (i < nentries) {
- if (ebt_basic_match(point, skb, in, out))
+ if (ebt_basic_match(point, skb, state->in, state->out))
goto letscontinue;
if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0)
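ebt_do_table() above now takes a single nf_hook_state instead of separate hook/in/out arguments, and the hooks pull net, in and out from it. A minimal model of bundling the call context into one struct, with invented types:

/* Minimal model: thread one state struct through instead of loose args. */
#include <stdio.h>

struct net_device { const char *name; };

struct hook_state {
	unsigned int hook;
	const struct net_device *in;
	const struct net_device *out;
};

static unsigned int do_table(const struct hook_state *state, const char *table)
{
	printf("table %s: hook %u, in %s, out %s\n",
	       table, state->hook,
	       state->in ? state->in->name : "-",
	       state->out ? state->out->name : "-");
	return 0;	/* accept */
}

static unsigned int filter_in_hook(const struct hook_state *state)
{
	return do_table(state, "frame_filter");
}

int main(void)
{
	struct net_device eth0 = { "eth0" };
	struct hook_state state = { .hook = 1, .in = &eth0, .out = NULL };

	return filter_in_hook(&state);
}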
diff --git a/kernel/net/bridge/netfilter/nf_tables_bridge.c b/kernel/net/bridge/netfilter/nf_tables_bridge.c
index a343e6244..62f6b1b19 100644
--- a/kernel/net/bridge/netfilter/nf_tables_bridge.c
+++ b/kernel/net/bridge/netfilter/nf_tables_bridge.c
@@ -65,31 +65,29 @@ int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate);
static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
- const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
if (nft_bridge_iphdr_validate(skb))
- nft_set_pktinfo_ipv4(pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(pkt, skb, state);
else
- nft_set_pktinfo(pkt, ops, skb, state);
+ nft_set_pktinfo(pkt, skb, state);
}
static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
- const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
#if IS_ENABLED(CONFIG_IPV6)
if (nft_bridge_ip6hdr_validate(skb) &&
- nft_set_pktinfo_ipv6(pkt, ops, skb, state) == 0)
+ nft_set_pktinfo_ipv6(pkt, skb, state) == 0)
return;
#endif
- nft_set_pktinfo(pkt, ops, skb, state);
+ nft_set_pktinfo(pkt, skb, state);
}
static unsigned int
-nft_do_chain_bridge(const struct nf_hook_ops *ops,
+nft_do_chain_bridge(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -97,17 +95,17 @@ nft_do_chain_bridge(const struct nf_hook_ops *ops,
switch (eth_hdr(skb)->h_proto) {
case htons(ETH_P_IP):
- nft_bridge_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_bridge_set_pktinfo_ipv4(&pkt, skb, state);
break;
case htons(ETH_P_IPV6):
- nft_bridge_set_pktinfo_ipv6(&pkt, ops, skb, state);
+ nft_bridge_set_pktinfo_ipv6(&pkt, skb, state);
break;
default:
- nft_set_pktinfo(&pkt, ops, skb, state);
+ nft_set_pktinfo(&pkt, skb, state);
break;
}
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
static struct nft_af_info nft_af_bridge __read_mostly = {
diff --git a/kernel/net/bridge/netfilter/nft_reject_bridge.c b/kernel/net/bridge/netfilter/nft_reject_bridge.c
index 858d84856..fdba3d9fb 100644
--- a/kernel/net/bridge/netfilter/nft_reject_bridge.c
+++ b/kernel/net/bridge/netfilter/nft_reject_bridge.c
@@ -261,7 +261,6 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
struct nft_reject *priv = nft_expr_priv(expr);
- struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out);
const unsigned char *dest = eth_hdr(pkt->skb)->h_dest;
if (is_broadcast_ether_addr(dest) ||
@@ -273,16 +272,16 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
nft_reject_br_send_v4_unreach(pkt->skb, pkt->in,
- pkt->ops->hooknum,
+ pkt->hook,
priv->icmp_code);
break;
case NFT_REJECT_TCP_RST:
nft_reject_br_send_v4_tcp_reset(pkt->skb, pkt->in,
- pkt->ops->hooknum);
+ pkt->hook);
break;
case NFT_REJECT_ICMPX_UNREACH:
nft_reject_br_send_v4_unreach(pkt->skb, pkt->in,
- pkt->ops->hooknum,
+ pkt->hook,
nft_reject_icmp_code(priv->icmp_code));
break;
}
@@ -290,17 +289,17 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
case htons(ETH_P_IPV6):
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in,
- pkt->ops->hooknum,
+ nft_reject_br_send_v6_unreach(pkt->net, pkt->skb,
+ pkt->in, pkt->hook,
priv->icmp_code);
break;
case NFT_REJECT_TCP_RST:
- nft_reject_br_send_v6_tcp_reset(net, pkt->skb, pkt->in,
- pkt->ops->hooknum);
+ nft_reject_br_send_v6_tcp_reset(pkt->net, pkt->skb,
+ pkt->in, pkt->hook);
break;
case NFT_REJECT_ICMPX_UNREACH:
- nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in,
- pkt->ops->hooknum,
+ nft_reject_br_send_v6_unreach(pkt->net, pkt->skb,
+ pkt->in, pkt->hook,
nft_reject_icmpv6_code(priv->icmp_code));
break;
}
diff --git a/kernel/net/caif/caif_dev.c b/kernel/net/caif/caif_dev.c
index edbca468f..d730a0f68 100644
--- a/kernel/net/caif/caif_dev.c
+++ b/kernel/net/caif/caif_dev.c
@@ -177,7 +177,7 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt)
skb->protocol = htons(ETH_P_CAIF);
/* Check if we need to handle xoff */
- if (likely(caifd->netdev->tx_queue_len == 0))
+ if (likely(caifd->netdev->priv_flags & IFF_NO_QUEUE))
goto noxoff;
if (unlikely(caifd->xoff))
diff --git a/kernel/net/caif/caif_socket.c b/kernel/net/caif/caif_socket.c
index 112ad7848..aa209b106 100644
--- a/kernel/net/caif/caif_socket.c
+++ b/kernel/net/caif/caif_socket.c
@@ -121,12 +121,13 @@ static void caif_flow_ctrl(struct sock *sk, int mode)
* Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are
* not dropped, but CAIF is sending flow off instead.
*/
-static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static void caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int err;
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ bool queued = false;
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
(unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) {
@@ -139,7 +140,8 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
err = sk_filter(sk, skb);
if (err)
- return err;
+ goto out;
+
if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) {
set_rx_flow_off(cf_sk);
net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n");
@@ -147,21 +149,16 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
skb->dev = NULL;
skb_set_owner_r(skb, sk);
- /* Cache the SKB length before we tack it onto the receive
- * queue. Once it is added it no longer belongs to us and
- * may be freed by other threads of control pulling packets
- * from the queue.
- */
spin_lock_irqsave(&list->lock, flags);
- if (!sock_flag(sk, SOCK_DEAD))
+ queued = !sock_flag(sk, SOCK_DEAD);
+ if (queued)
__skb_queue_tail(list, skb);
spin_unlock_irqrestore(&list->lock, flags);
-
- if (!sock_flag(sk, SOCK_DEAD))
+out:
+ if (queued)
sk->sk_data_ready(sk);
else
kfree_skb(skb);
- return 0;
}
/* Packet Receive Callback function called from CAIF Stack */
@@ -326,7 +323,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo)
!timeo)
break;
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
release_sock(sk);
timeo = schedule_timeout(timeo);
lock_sock(sk);
@@ -334,7 +331,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo)
if (sock_flag(sk, SOCK_DEAD))
break;
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
}
finish_wait(sk_sleep(sk), &wait);
@@ -1055,7 +1052,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol,
* is really not used at all in the net/core or socket.c but the
* initialization makes sure that sock->state is not uninitialized.
*/
- sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot);
+ sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern);
if (!sk)
return -ENOMEM;
diff --git a/kernel/net/can/af_can.c b/kernel/net/can/af_can.c
index 62c635f2b..166d43619 100644
--- a/kernel/net/can/af_can.c
+++ b/kernel/net/can/af_can.c
@@ -181,7 +181,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol,
sock->ops = cp->ops;
- sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot);
+ sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern);
if (!sk) {
err = -ENOMEM;
goto errout;
diff --git a/kernel/net/can/bcm.c b/kernel/net/can/bcm.c
index a1ba6875c..6863310d6 100644
--- a/kernel/net/can/bcm.c
+++ b/kernel/net/can/bcm.c
@@ -96,7 +96,7 @@ struct bcm_op {
canid_t can_id;
u32 flags;
unsigned long frames_abs, frames_filtered;
- struct timeval ival1, ival2;
+ struct bcm_timeval ival1, ival2;
struct hrtimer timer, thrtimer;
struct tasklet_struct tsklet, thrtsklet;
ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
@@ -131,6 +131,11 @@ static inline struct bcm_sock *bcm_sk(const struct sock *sk)
return (struct bcm_sock *)sk;
}
+static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv)
+{
+ return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);
+}
+
#define CFSIZ sizeof(struct can_frame)
#define OPSIZ sizeof(struct bcm_op)
#define MHSIZ sizeof(struct bcm_msg_head)
@@ -953,8 +958,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->count = msg_head->count;
op->ival1 = msg_head->ival1;
op->ival2 = msg_head->ival2;
- op->kt_ival1 = timeval_to_ktime(msg_head->ival1);
- op->kt_ival2 = timeval_to_ktime(msg_head->ival2);
+ op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
+ op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
/* disable an active timer due to zero values? */
if (!op->kt_ival1.tv64 && !op->kt_ival2.tv64)
@@ -1134,8 +1139,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
/* set timer value */
op->ival1 = msg_head->ival1;
op->ival2 = msg_head->ival2;
- op->kt_ival1 = timeval_to_ktime(msg_head->ival1);
- op->kt_ival2 = timeval_to_ktime(msg_head->ival2);
+ op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
+ op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
/* disable an active timer due to zero value? */
if (!op->kt_ival1.tv64)
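bcm_timeval_to_ktime() above replaces timeval_to_ktime() so the broadcast manager's intervals come from the uapi struct bcm_timeval rather than struct timeval. A standalone sketch of the underlying arithmetic, turning a {sec, usec} pair into nanoseconds:

/* Standalone sketch of the conversion; the struct layout mirrors the uapi
 * bcm_timeval (two longs).
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000LL
#define NSEC_PER_SEC	1000000000LL

struct bcm_timeval {
	long tv_sec;
	long tv_usec;
};

static int64_t bcm_timeval_to_ns(struct bcm_timeval tv)
{
	return (int64_t)tv.tv_sec * NSEC_PER_SEC +
	       (int64_t)tv.tv_usec * NSEC_PER_USEC;
}

int main(void)
{
	struct bcm_timeval iv = { .tv_sec = 1, .tv_usec = 500000 };

	printf("%lld ns\n", (long long)bcm_timeval_to_ns(iv));	/* 1500000000 */
	return 0;
}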
diff --git a/kernel/net/can/gw.c b/kernel/net/can/gw.c
index a6f448e18..455168718 100644
--- a/kernel/net/can/gw.c
+++ b/kernel/net/can/gw.c
@@ -110,6 +110,7 @@ struct cf_mod {
void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor);
void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8);
} csumfunc;
+ u32 uid;
};
@@ -548,6 +549,11 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
goto cancel;
}
+ if (gwj->mod.uid) {
+ if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0)
+ goto cancel;
+ }
+
if (gwj->mod.csumfunc.crc8) {
if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN,
&gwj->mod.csum.crc8) < 0)
@@ -619,6 +625,7 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = {
[CGW_DST_IF] = { .type = NLA_U32 },
[CGW_FILTER] = { .len = sizeof(struct can_filter) },
[CGW_LIM_HOPS] = { .type = NLA_U8 },
+ [CGW_MOD_UID] = { .type = NLA_U32 },
};
/* check for common and gwtype specific attributes */
@@ -761,6 +768,10 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
else
mod->csumfunc.xor = cgw_csum_xor_neg;
}
+
+ if (tb[CGW_MOD_UID]) {
+ nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32));
+ }
}
if (gwtype == CGW_TYPE_CAN_CAN) {
@@ -802,6 +813,8 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct rtcanmsg *r;
struct cgw_job *gwj;
+ struct cf_mod mod;
+ struct can_can_gw ccgw;
u8 limhops = 0;
int err = 0;
@@ -819,6 +832,36 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh)
if (r->gwtype != CGW_TYPE_CAN_CAN)
return -EINVAL;
+ err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
+ if (err < 0)
+ return err;
+
+ if (mod.uid) {
+
+ ASSERT_RTNL();
+
+ /* check for updating an existing job with identical uid */
+ hlist_for_each_entry(gwj, &cgw_list, list) {
+
+ if (gwj->mod.uid != mod.uid)
+ continue;
+
+ /* interfaces & filters must be identical */
+ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw)))
+ return -EINVAL;
+
+ /* update modifications with disabled softirq & quit */
+ local_bh_disable();
+ memcpy(&gwj->mod, &mod, sizeof(mod));
+ local_bh_enable();
+ return 0;
+ }
+ }
+
+ /* ifindex == 0 is not allowed for job creation */
+ if (!ccgw.src_idx || !ccgw.dst_idx)
+ return -ENODEV;
+
gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL);
if (!gwj)
return -ENOMEM;
@@ -828,18 +871,14 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh)
gwj->deleted_frames = 0;
gwj->flags = r->flags;
gwj->gwtype = r->gwtype;
+ gwj->limit_hops = limhops;
- err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw,
- &limhops);
- if (err < 0)
- goto out;
+ /* insert already parsed information */
+ memcpy(&gwj->mod, &mod, sizeof(mod));
+ memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw));
err = -ENODEV;
- /* ifindex == 0 is not allowed for job creation */
- if (!gwj->ccgw.src_idx || !gwj->ccgw.dst_idx)
- goto out;
-
gwj->src.dev = __dev_get_by_index(&init_net, gwj->ccgw.src_idx);
if (!gwj->src.dev)
@@ -856,8 +895,6 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh)
if (gwj->dst.dev->type != ARPHRD_CAN)
goto out;
- gwj->limit_hops = limhops;
-
ASSERT_RTNL();
err = cgw_register_filter(gwj);
@@ -931,8 +968,15 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh)
if (gwj->limit_hops != limhops)
continue;
- if (memcmp(&gwj->mod, &mod, sizeof(mod)))
- continue;
+ /* we have a match when uid is enabled and identical */
+ if (gwj->mod.uid || mod.uid) {
+ if (gwj->mod.uid != mod.uid)
+ continue;
+ } else {
+ /* no uid => check for identical modifications */
+ if (memcmp(&gwj->mod, &mod, sizeof(mod)))
+ continue;
+ }
/* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */
if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw)))
diff --git a/kernel/net/ceph/auth_x.c b/kernel/net/ceph/auth_x.c
index ba6eb1722..10d87753e 100644
--- a/kernel/net/ceph/auth_x.c
+++ b/kernel/net/ceph/auth_x.c
@@ -8,6 +8,7 @@
#include <linux/ceph/decode.h>
#include <linux/ceph/auth.h>
+#include <linux/ceph/libceph.h>
#include <linux/ceph/messenger.h>
#include "crypto.h"
@@ -279,6 +280,15 @@ bad:
return -EINVAL;
}
+static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au)
+{
+ ceph_crypto_key_destroy(&au->session_key);
+ if (au->buf) {
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ }
+}
+
static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
struct ceph_x_ticket_handler *th,
struct ceph_x_authorizer *au)
@@ -297,7 +307,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
ceph_crypto_key_destroy(&au->session_key);
ret = ceph_crypto_key_clone(&au->session_key, &th->session_key);
if (ret)
- return ret;
+ goto out_au;
maxlen = sizeof(*msg_a) + sizeof(msg_b) +
ceph_x_encrypt_buflen(ticket_blob_len);
@@ -309,8 +319,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
if (!au->buf) {
au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
if (!au->buf) {
- ceph_crypto_key_destroy(&au->session_key);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_au;
}
}
au->service = th->service;
@@ -340,7 +350,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b),
p, end - p);
if (ret < 0)
- goto out_buf;
+ goto out_au;
p += ret;
au->buf->vec.iov_len = p - au->buf->vec.iov_base;
dout(" built authorizer nonce %llx len %d\n", au->nonce,
@@ -348,9 +358,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
BUG_ON(au->buf->vec.iov_len > maxlen);
return 0;
-out_buf:
- ceph_buffer_put(au->buf);
- au->buf = NULL;
+out_au:
+ ceph_x_authorizer_cleanup(au);
return ret;
}
@@ -624,8 +633,7 @@ static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
{
struct ceph_x_authorizer *au = (void *)a;
- ceph_crypto_key_destroy(&au->session_key);
- ceph_buffer_put(au->buf);
+ ceph_x_authorizer_cleanup(au);
kfree(au);
}
@@ -653,8 +661,7 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
remove_ticket_handler(ac, th);
}
- if (xi->auth_authorizer.buf)
- ceph_buffer_put(xi->auth_authorizer.buf);
+ ceph_x_authorizer_cleanup(&xi->auth_authorizer);
kfree(ac->private);
ac->private = NULL;
@@ -691,8 +698,10 @@ static int ceph_x_sign_message(struct ceph_auth_handshake *auth,
struct ceph_msg *msg)
{
int ret;
- if (!auth->authorizer)
+
+ if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
+
ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
msg, &msg->footer.sig);
if (ret < 0)
@@ -707,8 +716,9 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
__le64 sig_check;
int ret;
- if (!auth->authorizer)
+ if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
+
ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
msg, &sig_check);
if (ret < 0)
diff --git a/kernel/net/ceph/ceph_common.c b/kernel/net/ceph/ceph_common.c
index 3f76eb84b..bcbec33c6 100644
--- a/kernel/net/ceph/ceph_common.c
+++ b/kernel/net/ceph/ceph_common.c
@@ -9,6 +9,7 @@
#include <keys/ceph-type.h>
#include <linux/module.h>
#include <linux/mount.h>
+#include <linux/nsproxy.h>
#include <linux/parser.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
@@ -16,8 +17,6 @@
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
-#include <linux/nsproxy.h>
-#include <net/net_namespace.h>
#include <linux/ceph/ceph_features.h>
@@ -131,6 +130,13 @@ int ceph_compare_options(struct ceph_options *new_opt,
int i;
int ret;
+ /*
+ * Don't bother comparing options if network namespaces don't
+ * match.
+ */
+ if (!net_eq(current->nsproxy->net_ns, read_pnet(&client->msgr.net)))
+ return -1;
+
ret = memcmp(opt1, opt2, ofs);
if (ret)
return ret;
@@ -239,6 +245,8 @@ enum {
Opt_nocrc,
Opt_cephx_require_signatures,
Opt_nocephx_require_signatures,
+ Opt_cephx_sign_messages,
+ Opt_nocephx_sign_messages,
Opt_tcp_nodelay,
Opt_notcp_nodelay,
};
@@ -261,6 +269,8 @@ static match_table_t opt_tokens = {
{Opt_nocrc, "nocrc"},
{Opt_cephx_require_signatures, "cephx_require_signatures"},
{Opt_nocephx_require_signatures, "nocephx_require_signatures"},
+ {Opt_cephx_sign_messages, "cephx_sign_messages"},
+ {Opt_nocephx_sign_messages, "nocephx_sign_messages"},
{Opt_tcp_nodelay, "tcp_nodelay"},
{Opt_notcp_nodelay, "notcp_nodelay"},
{-1, NULL}
@@ -312,7 +322,7 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) {
goto out;
}
- ckey = ukey->payload.data;
+ ckey = ukey->payload.data[0];
err = ceph_crypto_key_clone(dst, ckey);
if (err)
goto out_key;
@@ -335,9 +345,6 @@ ceph_parse_options(char *options, const char *dev_name,
int err = -ENOMEM;
substring_t argstr[MAX_OPT_ARGS];
- if (current->nsproxy->net_ns != &init_net)
- return ERR_PTR(-EINVAL);
-
opt = kzalloc(sizeof(*opt), GFP_KERNEL);
if (!opt)
return ERR_PTR(-ENOMEM);
@@ -352,8 +359,9 @@ ceph_parse_options(char *options, const char *dev_name,
/* start with defaults */
opt->flags = CEPH_OPT_DEFAULT;
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
- opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
- opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
+ opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
+ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+ opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
/* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */
@@ -439,13 +447,32 @@ ceph_parse_options(char *options, const char *dev_name,
pr_warn("ignoring deprecated osdtimeout option\n");
break;
case Opt_osdkeepalivetimeout:
- opt->osd_keepalive_timeout = intval;
+ /* 0 isn't well defined right now, reject it */
+ if (intval < 1 || intval > INT_MAX / 1000) {
+ pr_err("osdkeepalive out of range\n");
+ err = -EINVAL;
+ goto out;
+ }
+ opt->osd_keepalive_timeout =
+ msecs_to_jiffies(intval * 1000);
break;
case Opt_osd_idle_ttl:
- opt->osd_idle_ttl = intval;
+ /* 0 isn't well defined right now, reject it */
+ if (intval < 1 || intval > INT_MAX / 1000) {
+ pr_err("osd_idle_ttl out of range\n");
+ err = -EINVAL;
+ goto out;
+ }
+ opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
break;
case Opt_mount_timeout:
- opt->mount_timeout = intval;
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (intval < 0 || intval > INT_MAX / 1000) {
+ pr_err("mount_timeout out of range\n");
+ err = -EINVAL;
+ goto out;
+ }
+ opt->mount_timeout = msecs_to_jiffies(intval * 1000);
break;
case Opt_share:
@@ -468,6 +495,12 @@ ceph_parse_options(char *options, const char *dev_name,
case Opt_nocephx_require_signatures:
opt->flags |= CEPH_OPT_NOMSGAUTH;
break;
+ case Opt_cephx_sign_messages:
+ opt->flags &= ~CEPH_OPT_NOMSGSIGN;
+ break;
+ case Opt_nocephx_sign_messages:
+ opt->flags |= CEPH_OPT_NOMSGSIGN;
+ break;
case Opt_tcp_nodelay:
opt->flags |= CEPH_OPT_TCP_NODELAY;
@@ -511,16 +544,20 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
seq_puts(m, "nocrc,");
if (opt->flags & CEPH_OPT_NOMSGAUTH)
seq_puts(m, "nocephx_require_signatures,");
+ if (opt->flags & CEPH_OPT_NOMSGSIGN)
+ seq_puts(m, "nocephx_sign_messages,");
if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
seq_puts(m, "notcp_nodelay,");
if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
- seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
+ seq_printf(m, "mount_timeout=%d,",
+ jiffies_to_msecs(opt->mount_timeout) / 1000);
if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
- seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
+ seq_printf(m, "osd_idle_ttl=%d,",
+ jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, "osdkeepalivetimeout=%d,",
- opt->osd_keepalive_timeout);
+ jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
/* drop redundant comma */
if (m->count != pos)
@@ -571,11 +608,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
if (ceph_test_opt(client, MYIP))
myaddr = &client->options->my_addr;
- ceph_messenger_init(&client->msgr, myaddr,
- client->supported_features,
- client->required_features,
- ceph_test_opt(client, NOCRC),
- ceph_test_opt(client, TCP_NODELAY));
+ ceph_messenger_init(&client->msgr, myaddr);
/* subsystems */
err = ceph_monc_init(&client->monc, client);
@@ -590,6 +623,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
fail_monc:
ceph_monc_stop(&client->monc);
fail:
+ ceph_messenger_fini(&client->msgr);
kfree(client);
return ERR_PTR(err);
}
@@ -603,8 +637,8 @@ void ceph_destroy_client(struct ceph_client *client)
/* unmount */
ceph_osdc_stop(&client->osdc);
-
ceph_monc_stop(&client->monc);
+ ceph_messenger_fini(&client->msgr);
ceph_debugfs_client_cleanup(client);
@@ -629,8 +663,8 @@ static int have_mon_and_osd_map(struct ceph_client *client)
*/
int __ceph_open_session(struct ceph_client *client, unsigned long started)
{
- int err;
- unsigned long timeout = client->options->mount_timeout * HZ;
+ unsigned long timeout = client->options->mount_timeout;
+ long err;
/* open session, and wait for mon and osd maps */
err = ceph_monc_open_session(&client->monc);
@@ -638,16 +672,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
return err;
while (!have_mon_and_osd_map(client)) {
- err = -EIO;
if (timeout && time_after_eq(jiffies, started + timeout))
- return err;
+ return -ETIMEDOUT;
/* wait */
dout("mount waiting for mon_map\n");
err = wait_event_interruptible_timeout(client->auth_wq,
have_mon_and_osd_map(client) || (client->auth_err < 0),
- timeout);
- if (err == -EINTR || err == -ERESTARTSYS)
+ ceph_timeout_jiffies(timeout));
+ if (err < 0)
return err;
if (client->auth_err < 0)
return client->auth_err;
@@ -724,5 +757,5 @@ module_exit(exit_ceph_lib);
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
-MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_DESCRIPTION("Ceph core library");
MODULE_LICENSE("GPL");
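The option-parsing changes above move mount_timeout, osd_idle_ttl and osdkeepalivetimeout from raw second counts to jiffies, rejecting values that would overflow the millisecond conversion. The shared pattern boils down to the following sketch (opt_secs_to_jiffies is a hypothetical helper, not part of the patch):

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/types.h>

/* zero_ok: mount_timeout accepts 0 ("wait forever"), the other two do not */
static int opt_secs_to_jiffies(int secs, bool zero_ok, unsigned long *dst)
{
	if (secs < (zero_ok ? 0 : 1) || secs > INT_MAX / 1000)
		return -EINVAL;

	*dst = msecs_to_jiffies(secs * 1000);
	return 0;
}

For display, ceph_print_client_options() converts back with jiffies_to_msecs(value) / 1000, as in the hunk above.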
diff --git a/kernel/net/ceph/crush/crush.c b/kernel/net/ceph/crush/crush.c
index 9d84ce4ea..80d7c3a97 100644
--- a/kernel/net/ceph/crush/crush.c
+++ b/kernel/net/ceph/crush/crush.c
@@ -1,15 +1,11 @@
-
#ifdef __KERNEL__
# include <linux/slab.h>
+# include <linux/crush/crush.h>
#else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
+# include "crush_compat.h"
+# include "crush.h"
#endif
-#include <linux/crush/crush.h>
-
const char *crush_bucket_alg_name(int alg)
{
switch (alg) {
@@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map)
kfree(map->rules);
}
+#ifndef __KERNEL__
+ kfree(map->choose_tries);
+#endif
kfree(map);
}
diff --git a/kernel/net/ceph/crush/crush_ln_table.h b/kernel/net/ceph/crush/crush_ln_table.h
index 6192c7fc9..aae534c90 100644
--- a/kernel/net/ceph/crush/crush_ln_table.h
+++ b/kernel/net/ceph/crush/crush_ln_table.h
@@ -10,20 +10,20 @@
*
*/
-#if defined(__linux__)
-#include <linux/types.h>
-#elif defined(__FreeBSD__)
-#include <sys/types.h>
-#endif
-
#ifndef CEPH_CRUSH_LN_H
#define CEPH_CRUSH_LN_H
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
-// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
-// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
-
-static int64_t __RH_LH_tbl[128*2+2] = {
+/*
+ * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+ * RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+ */
+static __s64 __RH_LH_tbl[128*2+2] = {
0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
@@ -89,11 +89,12 @@ static int64_t __RH_LH_tbl[128*2+2] = {
0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
0x0000800000000000ll, 0x0000ffff00000000ll,
- };
-
+};
- // LL_tbl[k] = 2^48*log2(1.0+k/2^15);
-static int64_t __LL_tbl[256] = {
+/*
+ * LL_tbl[k] = 2^48*log2(1.0+k/2^15)
+ */
+static __s64 __LL_tbl[256] = {
0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
@@ -160,7 +161,4 @@ static int64_t __LL_tbl[256] = {
0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
};
-
-
-
#endif
diff --git a/kernel/net/ceph/crush/hash.c b/kernel/net/ceph/crush/hash.c
index 5bb63e37a..ed123af49 100644
--- a/kernel/net/ceph/crush/hash.c
+++ b/kernel/net/ceph/crush/hash.c
@@ -1,6 +1,8 @@
-
-#include <linux/types.h>
-#include <linux/crush/hash.h>
+#ifdef __KERNEL__
+# include <linux/crush/hash.h>
+#else
+# include "hash.h"
+#endif
/*
* Robert Jenkins' function for mixing 32-bit values
diff --git a/kernel/net/ceph/crush/mapper.c b/kernel/net/ceph/crush/mapper.c
index 5b47736d2..393bfb22d 100644
--- a/kernel/net/ceph/crush/mapper.c
+++ b/kernel/net/ceph/crush/mapper.c
@@ -1,27 +1,31 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
#ifdef __KERNEL__
# include <linux/string.h>
# include <linux/slab.h>
# include <linux/bug.h>
# include <linux/kernel.h>
-# ifndef dprintk
-# define dprintk(args...)
-# endif
+# include <linux/crush/crush.h>
+# include <linux/crush/hash.h>
#else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
+# include "crush_compat.h"
+# include "crush.h"
+# include "hash.h"
#endif
-
-#include <linux/crush/crush.h>
-#include <linux/crush/hash.h>
#include "crush_ln_table.h"
+#define dprintk(args...) /* printf(args) */
+
/*
* Implement the core CRUSH mapping algorithm.
*/
@@ -139,7 +143,7 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
int i;
for (i = bucket->h.size-1; i >= 0; i--) {
- __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+ __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
r, bucket->h.id);
w &= 0xffff;
dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
@@ -238,43 +242,46 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
return bucket->h.items[high];
}
-// compute 2^44*log2(input+1)
-uint64_t crush_ln(unsigned xin)
+/* compute 2^44*log2(input+1) */
+static __u64 crush_ln(unsigned int xin)
{
- unsigned x=xin, x1;
- int iexpon, index1, index2;
- uint64_t RH, LH, LL, xl64, result;
+ unsigned int x = xin, x1;
+ int iexpon, index1, index2;
+ __u64 RH, LH, LL, xl64, result;
- x++;
+ x++;
- // normalize input
- iexpon = 15;
- while(!(x&0x18000)) { x<<=1; iexpon--; }
+ /* normalize input */
+ iexpon = 15;
+ while (!(x & 0x18000)) {
+ x <<= 1;
+ iexpon--;
+ }
- index1 = (x>>8)<<1;
- // RH ~ 2^56/index1
- RH = __RH_LH_tbl[index1 - 256];
- // LH ~ 2^48 * log2(index1/256)
- LH = __RH_LH_tbl[index1 + 1 - 256];
+ index1 = (x >> 8) << 1;
+ /* RH ~ 2^56/index1 */
+ RH = __RH_LH_tbl[index1 - 256];
+ /* LH ~ 2^48 * log2(index1/256) */
+ LH = __RH_LH_tbl[index1 + 1 - 256];
- // RH*x ~ 2^48 * (2^15 + xf), xf<2^8
- xl64 = (int64_t)x * RH;
- xl64 >>= 48;
- x1 = xl64;
+ /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
+ xl64 = (__s64)x * RH;
+ xl64 >>= 48;
+ x1 = xl64;
- result = iexpon;
- result <<= (12 + 32);
+ result = iexpon;
+ result <<= (12 + 32);
- index2 = x1 & 0xff;
- // LL ~ 2^48*log2(1.0+index2/2^15)
- LL = __LL_tbl[index2];
+ index2 = x1 & 0xff;
+ /* LL ~ 2^48*log2(1.0+index2/2^15) */
+ LL = __LL_tbl[index2];
- LH = LH + LL;
+ LH = LH + LL;
- LH >>= (48-12 - 32);
- result += LH;
+ LH >>= (48 - 12 - 32);
+ result += LH;
- return result;
+ return result;
}
@@ -290,9 +297,9 @@ uint64_t crush_ln(unsigned xin)
static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
int x, int r)
{
- unsigned i, high = 0;
- unsigned u;
- unsigned w;
+ unsigned int i, high = 0;
+ unsigned int u;
+ unsigned int w;
__s64 ln, draw, high_draw = 0;
for (i = 0; i < bucket->h.size; i++) {
@@ -567,6 +574,10 @@ reject:
out[outpos] = item;
outpos++;
count--;
+#ifndef __KERNEL__
+ if (map->choose_tries && ftotal <= map->choose_total_tries)
+ map->choose_tries[ftotal]++;
+#endif
}
dprintk("CHOOSE returns %d\n", outpos);
@@ -610,6 +621,20 @@ static void crush_choose_indep(const struct crush_map *map,
}
for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+#ifdef DEBUG_INDEP
+ if (out2 && ftotal) {
+ dprintk("%u %d a: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out[rep]);
+ }
+ dprintk("\n");
+ dprintk("%u %d b: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out2[rep]);
+ }
+ dprintk("\n");
+ }
+#endif
for (rep = outpos; rep < endpos; rep++) {
if (out[rep] != CRUSH_ITEM_UNDEF)
continue;
@@ -726,6 +751,24 @@ static void crush_choose_indep(const struct crush_map *map,
out2[rep] = CRUSH_ITEM_NONE;
}
}
+#ifndef __KERNEL__
+ if (map->choose_tries && ftotal <= map->choose_total_tries)
+ map->choose_tries[ftotal]++;
+#endif
+#ifdef DEBUG_INDEP
+ if (out2) {
+ dprintk("%u %d a: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out[rep]);
+ }
+ dprintk("\n");
+ dprintk("%u %d b: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out2[rep]);
+ }
+ dprintk("\n");
+ }
+#endif
}
/**
@@ -790,8 +833,15 @@ int crush_do_rule(const struct crush_map *map,
switch (curstep->op) {
case CRUSH_RULE_TAKE:
- w[0] = curstep->arg1;
- wsize = 1;
+ if ((curstep->arg1 >= 0 &&
+ curstep->arg1 < map->max_devices) ||
+ (-1-curstep->arg1 < map->max_buckets &&
+ map->buckets[-1-curstep->arg1])) {
+ w[0] = curstep->arg1;
+ wsize = 1;
+ } else {
+ dprintk(" bad take value %d\n", curstep->arg1);
+ }
break;
case CRUSH_RULE_SET_CHOOSE_TRIES:
@@ -877,7 +927,7 @@ int crush_do_rule(const struct crush_map *map,
0);
} else {
out_size = ((numrep < (result_max-osize)) ?
- numrep : (result_max-osize));
+ numrep : (result_max-osize));
crush_choose_indep(
map,
map->buckets[-1-w[i]],
@@ -923,5 +973,3 @@ int crush_do_rule(const struct crush_map *map,
}
return result_len;
}
-
-
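As its comment says, crush_ln() above computes a fixed-point approximation of 2^44 * log2(x + 1) from the __RH_LH_tbl and __LL_tbl lookup tables. A floating-point reference for cross-checking the table-driven result (user-space sketch, nothing assumed beyond libm):

#include <math.h>
#include <stdint.h>

/* Value that crush_ln(x) approximates in 64-bit fixed point. */
static uint64_t crush_ln_reference(unsigned int x)
{
	return (uint64_t)(log2((double)x + 1.0) * (double)(1ULL << 44));
}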
diff --git a/kernel/net/ceph/crypto.c b/kernel/net/ceph/crypto.c
index 790fe89d9..42e8649c6 100644
--- a/kernel/net/ceph/crypto.c
+++ b/kernel/net/ceph/crypto.c
@@ -79,10 +79,6 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
return 0;
}
-
-
-#define AES_KEY_SIZE 16
-
static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
{
return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
@@ -541,7 +537,7 @@ static int ceph_key_preparse(struct key_preparsed_payload *prep)
if (ret < 0)
goto err_ckey;
- prep->payload[0] = ckey;
+ prep->payload.data[0] = ckey;
prep->quotalen = datalen;
return 0;
@@ -553,14 +549,14 @@ err:
static void ceph_key_free_preparse(struct key_preparsed_payload *prep)
{
- struct ceph_crypto_key *ckey = prep->payload[0];
+ struct ceph_crypto_key *ckey = prep->payload.data[0];
ceph_crypto_key_destroy(ckey);
kfree(ckey);
}
static void ceph_key_destroy(struct key *key)
{
- struct ceph_crypto_key *ckey = key->payload.data;
+ struct ceph_crypto_key *ckey = key->payload.data[0];
ceph_crypto_key_destroy(ckey);
kfree(ckey);
diff --git a/kernel/net/ceph/crypto.h b/kernel/net/ceph/crypto.h
index d1498224c..2e9cab09f 100644
--- a/kernel/net/ceph/crypto.h
+++ b/kernel/net/ceph/crypto.h
@@ -16,8 +16,10 @@ struct ceph_crypto_key {
static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
{
- if (key)
+ if (key) {
kfree(key->key);
+ key->key = NULL;
+ }
}
int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
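Clearing key->key here makes ceph_crypto_key_destroy() safe to call repeatedly, which matters because the new ceph_x_authorizer_cleanup() in auth_x.c can now reach the same key more than once (on the ceph_x_build_authorizer() error path and again when the authorizer is eventually torn down). A small sketch of why the repeat is harmless (destroy_key_twice is an invented name):

/* Illustrative only: the second destroy degenerates to kfree(NULL). */
static void destroy_key_twice(struct ceph_crypto_key *key)
{
	ceph_crypto_key_destroy(key);	/* frees key->key and clears it */
	ceph_crypto_key_destroy(key);	/* no-op: key->key is already NULL */
}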
diff --git a/kernel/net/ceph/messenger.c b/kernel/net/ceph/messenger.c
index 967080a9f..63ae5dd24 100644
--- a/kernel/net/ceph/messenger.c
+++ b/kernel/net/ceph/messenger.c
@@ -6,6 +6,7 @@
#include <linux/inet.h>
#include <linux/kthread.h>
#include <linux/net.h>
+#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/string.h>
@@ -162,6 +163,7 @@ static struct kmem_cache *ceph_msg_data_cache;
static char tag_msg = CEPH_MSGR_TAG_MSG;
static char tag_ack = CEPH_MSGR_TAG_ACK;
static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
#ifdef CONFIG_LOCKDEP
static struct lock_class_key socket_class;
@@ -175,7 +177,7 @@ static struct lock_class_key socket_class;
static void queue_con(struct ceph_connection *con);
static void cancel_con(struct ceph_connection *con);
-static void con_work(struct work_struct *);
+static void ceph_con_workfn(struct work_struct *);
static void con_fault(struct ceph_connection *con);
/*
@@ -275,23 +277,22 @@ static void _ceph_msgr_exit(void)
ceph_msgr_wq = NULL;
}
- ceph_msgr_slab_exit();
-
BUG_ON(zero_page == NULL);
- kunmap(zero_page);
page_cache_release(zero_page);
zero_page = NULL;
+
+ ceph_msgr_slab_exit();
}
int ceph_msgr_init(void)
{
+ if (ceph_msgr_slab_init())
+ return -ENOMEM;
+
BUG_ON(zero_page != NULL);
zero_page = ZERO_PAGE(0);
page_cache_get(zero_page);
- if (ceph_msgr_slab_init())
- return -ENOMEM;
-
/*
* The number of active work items is limited by the number of
* connections, so leave @max_active at default.
@@ -480,8 +481,8 @@ static int ceph_tcp_connect(struct ceph_connection *con)
int ret;
BUG_ON(con->sock);
- ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
- IPPROTO_TCP, &sock);
+ ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret)
return ret;
sock->sk->sk_allocation = GFP_NOFS;
@@ -508,7 +509,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
return ret;
}
- if (con->msgr->tcp_nodelay) {
+ if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) {
int optval = 1;
ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
@@ -636,9 +637,6 @@ static int con_close_socket(struct ceph_connection *con)
static void ceph_msg_remove(struct ceph_msg *msg)
{
list_del_init(&msg->list_head);
- BUG_ON(msg->con == NULL);
- msg->con->ops->put(msg->con);
- msg->con = NULL;
ceph_msg_put(msg);
}
@@ -661,20 +659,21 @@ static void reset_connection(struct ceph_connection *con)
if (con->in_msg) {
BUG_ON(con->in_msg->con != con);
- con->in_msg->con = NULL;
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
- con->ops->put(con);
}
con->connect_seq = 0;
con->out_seq = 0;
if (con->out_msg) {
+ BUG_ON(con->out_msg->con != con);
ceph_msg_put(con->out_msg);
con->out_msg = NULL;
}
con->in_seq = 0;
con->in_seq_acked = 0;
+
+ con->out_skip = 0;
}
/*
@@ -749,7 +748,7 @@ void ceph_con_init(struct ceph_connection *con, void *private,
mutex_init(&con->mutex);
INIT_LIST_HEAD(&con->out_queue);
INIT_LIST_HEAD(&con->out_sent);
- INIT_DELAYED_WORK(&con->work, con_work);
+ INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
con->state = CON_STATE_CLOSED;
}
@@ -774,6 +773,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
static void con_out_kvec_reset(struct ceph_connection *con)
{
+ BUG_ON(con->out_skip);
+
con->out_kvec_left = 0;
con->out_kvec_bytes = 0;
con->out_kvec_cur = &con->out_kvec[0];
@@ -782,9 +783,9 @@ static void con_out_kvec_reset(struct ceph_connection *con)
static void con_out_kvec_add(struct ceph_connection *con,
size_t size, void *data)
{
- int index;
+ int index = con->out_kvec_left;
- index = con->out_kvec_left;
+ BUG_ON(con->out_skip);
BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
con->out_kvec[index].iov_len = size;
@@ -793,6 +794,27 @@ static void con_out_kvec_add(struct ceph_connection *con,
con->out_kvec_bytes += size;
}
+/*
+ * Chop off a kvec from the end. Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+ int off = con->out_kvec_cur - con->out_kvec;
+ int skip = 0;
+
+ if (con->out_kvec_bytes > 0) {
+ skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
+ BUG_ON(con->out_kvec_bytes < skip);
+ BUG_ON(!con->out_kvec_left);
+ con->out_kvec_bytes -= skip;
+ con->out_kvec_left--;
+ }
+
+ return skip;
+}
+
#ifdef CONFIG_BLOCK
/*
@@ -1178,6 +1200,13 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
return new_piece;
}
+static size_t sizeof_footer(struct ceph_connection *con)
+{
+ return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
+ sizeof(struct ceph_msg_footer) :
+ sizeof(struct ceph_msg_footer_old);
+}
+
static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
{
BUG_ON(!msg);
@@ -1200,11 +1229,10 @@ static void prepare_write_message_footer(struct ceph_connection *con)
m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
dout("prepare_write_message_footer %p\n", con);
- con->out_kvec_is_msg = true;
con->out_kvec[v].iov_base = &m->footer;
if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
if (con->ops->sign_message)
- con->ops->sign_message(con, m);
+ con->ops->sign_message(m);
else
m->footer.sig = 0;
con->out_kvec[v].iov_len = sizeof(m->footer);
@@ -1228,7 +1256,6 @@ static void prepare_write_message(struct ceph_connection *con)
u32 crc;
con_out_kvec_reset(con);
- con->out_kvec_is_msg = true;
con->out_msg_done = false;
/* Sneak an ack in there first? If we can get it into the same
@@ -1268,18 +1295,19 @@ static void prepare_write_message(struct ceph_connection *con)
/* tag + hdr + front + middle */
con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
- con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
+ con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
if (m->middle)
con_out_kvec_add(con, m->middle->vec.iov_len,
m->middle->vec.iov_base);
- /* fill in crc (except data pages), footer */
+ /* fill in hdr crc and finalize hdr */
crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
con->out_msg->hdr.crc = cpu_to_le32(crc);
- con->out_msg->footer.flags = 0;
+ memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
+ /* fill in front and middle crc, footer */
crc = crc32c(0, m->front.iov_base, m->front.iov_len);
con->out_msg->footer.front_crc = cpu_to_le32(crc);
if (m->middle) {
@@ -1291,6 +1319,7 @@ static void prepare_write_message(struct ceph_connection *con)
dout("%s front_crc %u middle_crc %u\n", __func__,
le32_to_cpu(con->out_msg->footer.front_crc),
le32_to_cpu(con->out_msg->footer.middle_crc));
+ con->out_msg->footer.flags = 0;
/* is there a data payload? */
con->out_msg->footer.data_crc = 0;
@@ -1351,7 +1380,16 @@ static void prepare_write_keepalive(struct ceph_connection *con)
{
dout("prepare_write_keepalive %p\n", con);
con_out_kvec_reset(con);
- con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
+ if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+ struct timespec now = CURRENT_TIME;
+
+ con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+ ceph_encode_timespec(&con->out_temp_keepalive2, &now);
+ con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
+ &con->out_temp_keepalive2);
+ } else {
+ con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+ }
con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
@@ -1422,7 +1460,8 @@ static int prepare_write_connect(struct ceph_connection *con)
dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
con->connect_seq, global_seq, proto);
- con->out_connect.features = cpu_to_le64(con->msgr->supported_features);
+ con->out_connect.features =
+ cpu_to_le64(from_msgr(con->msgr)->supported_features);
con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1485,7 +1524,6 @@ static int write_partial_kvec(struct ceph_connection *con)
}
}
con->out_kvec_left = 0;
- con->out_kvec_is_msg = false;
ret = 1;
out:
dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
@@ -1517,7 +1555,7 @@ static int write_partial_message_data(struct ceph_connection *con)
{
struct ceph_msg *msg = con->out_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor;
- bool do_datacrc = !con->msgr->nocrc;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
u32 crc;
dout("%s %p msg %p\n", __func__, con, msg);
@@ -1542,10 +1580,10 @@ static int write_partial_message_data(struct ceph_connection *con)
bool need_crc;
int ret;
- page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
- &last_piece);
+ page = ceph_msg_data_next(cursor, &page_offset, &length,
+ &last_piece);
ret = ceph_tcp_sendpage(con->sock, page, page_offset,
- length, last_piece);
+ length, !last_piece);
if (ret <= 0) {
if (do_datacrc)
msg->footer.data_crc = cpu_to_le32(crc);
@@ -1554,7 +1592,7 @@ static int write_partial_message_data(struct ceph_connection *con)
}
if (do_datacrc && cursor->need_crc)
crc = ceph_crc32c_page(crc, page, page_offset, length);
- need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret);
+ need_crc = ceph_msg_data_advance(cursor, (size_t)ret);
}
dout("%s %p msg %p done\n", __func__, con, msg);
@@ -1577,6 +1615,7 @@ static int write_partial_skip(struct ceph_connection *con)
{
int ret;
+ dout("%s %p %d left\n", __func__, con, con->out_skip);
while (con->out_skip > 0) {
size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
@@ -1625,6 +1664,12 @@ static void prepare_read_tag(struct ceph_connection *con)
con->in_tag = CEPH_MSGR_TAG_READY;
}
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_keepalive_ack %p\n", con);
+ con->in_base_pos = 0;
+}
+
/*
* Prepare to read a message.
*/
@@ -1732,17 +1777,17 @@ static int verify_hello(struct ceph_connection *con)
static bool addr_is_blank(struct sockaddr_storage *ss)
{
+ struct in_addr *addr = &((struct sockaddr_in *)ss)->sin_addr;
+ struct in6_addr *addr6 = &((struct sockaddr_in6 *)ss)->sin6_addr;
+
switch (ss->ss_family) {
case AF_INET:
- return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+ return addr->s_addr == htonl(INADDR_ANY);
case AF_INET6:
- return
- ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
- ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
- ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
- ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+ return ipv6_addr_any(addr6);
+ default:
+ return true;
}
- return false;
}
static int addr_port(struct sockaddr_storage *ss)
@@ -1989,8 +2034,8 @@ static int process_banner(struct ceph_connection *con)
static int process_connect(struct ceph_connection *con)
{
- u64 sup_feat = con->msgr->supported_features;
- u64 req_feat = con->msgr->required_features;
+ u64 sup_feat = from_msgr(con->msgr)->supported_features;
+ u64 req_feat = from_msgr(con->msgr)->required_features;
u64 server_feat = ceph_sanitize_features(
le64_to_cpu(con->in_reply.features));
int ret;
@@ -2216,7 +2261,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
{
struct ceph_msg *msg = con->in_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor;
- const bool do_datacrc = !con->msgr->nocrc;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
struct page *page;
size_t page_offset;
size_t length;
@@ -2230,8 +2275,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
if (do_datacrc)
crc = con->in_data_crc;
while (cursor->resid) {
- page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
- NULL);
+ page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
if (ret <= 0) {
if (do_datacrc)
@@ -2242,7 +2286,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
if (do_datacrc)
crc = ceph_crc32c_page(crc, page, page_offset, ret);
- (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret);
+ (void) ceph_msg_data_advance(cursor, (size_t)ret);
}
if (do_datacrc)
con->in_data_crc = crc;
@@ -2262,7 +2306,7 @@ static int read_partial_message(struct ceph_connection *con)
int end;
int ret;
unsigned int front_len, middle_len, data_len;
- bool do_datacrc = !con->msgr->nocrc;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
u64 seq;
u32 crc;
@@ -2301,9 +2345,9 @@ static int read_partial_message(struct ceph_connection *con)
ceph_pr_addr(&con->peer_addr.in_addr),
seq, con->in_seq + 1);
con->in_base_pos = -front_len - middle_len - data_len -
- sizeof(m->footer);
+ sizeof_footer(con);
con->in_tag = CEPH_MSGR_TAG_READY;
- return 0;
+ return 1;
} else if ((s64)seq - (s64)con->in_seq > 1) {
pr_err("read_partial_message bad seq %lld expected %lld\n",
seq, con->in_seq + 1);
@@ -2322,21 +2366,14 @@ static int read_partial_message(struct ceph_connection *con)
return ret;
BUG_ON(!con->in_msg ^ skip);
- if (con->in_msg && data_len > con->in_msg->data_length) {
- pr_warn("%s skipping long message (%u > %zd)\n",
- __func__, data_len, con->in_msg->data_length);
- ceph_msg_put(con->in_msg);
- con->in_msg = NULL;
- skip = 1;
- }
if (skip) {
/* skip this message */
dout("alloc_msg said skip message\n");
con->in_base_pos = -front_len - middle_len - data_len -
- sizeof(m->footer);
+ sizeof_footer(con);
con->in_tag = CEPH_MSGR_TAG_READY;
con->in_seq++;
- return 0;
+ return 1;
}
BUG_ON(!con->in_msg);
@@ -2414,7 +2451,7 @@ static int read_partial_message(struct ceph_connection *con)
}
if (need_sign && con->ops->check_message_signature &&
- con->ops->check_message_signature(con, m)) {
+ con->ops->check_message_signature(m)) {
pr_err("read_partial_message %p signature check failed\n", m);
return -EBADMSG;
}
@@ -2429,13 +2466,10 @@ static int read_partial_message(struct ceph_connection *con)
*/
static void process_message(struct ceph_connection *con)
{
- struct ceph_msg *msg;
+ struct ceph_msg *msg = con->in_msg;
BUG_ON(con->in_msg->con != con);
- con->in_msg->con = NULL;
- msg = con->in_msg;
con->in_msg = NULL;
- con->ops->put(con);
/* if first message, set peer_name */
if (con->peer_name.type == 0)
@@ -2457,6 +2491,17 @@ static void process_message(struct ceph_connection *con)
mutex_lock(&con->mutex);
}
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+ struct ceph_timespec ceph_ts;
+ size_t size = sizeof(ceph_ts);
+ int ret = read_partial(con, size, size, &ceph_ts);
+ if (ret <= 0)
+ return ret;
+ ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
+ prepare_read_tag(con);
+ return 1;
+}
/*
* Write something to the socket. Called in a worker thread when the
@@ -2493,13 +2538,13 @@ more:
more_kvec:
/* kvec data queued? */
- if (con->out_skip) {
- ret = write_partial_skip(con);
+ if (con->out_kvec_left) {
+ ret = write_partial_kvec(con);
if (ret <= 0)
goto out;
}
- if (con->out_kvec_left) {
- ret = write_partial_kvec(con);
+ if (con->out_skip) {
+ ret = write_partial_skip(con);
if (ret <= 0)
goto out;
}
@@ -2526,6 +2571,10 @@ more_kvec:
do_next:
if (con->state == CON_STATE_OPEN) {
+ if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
+ prepare_write_keepalive(con);
+ goto more;
+ }
/* is anything else pending? */
if (!list_empty(&con->out_queue)) {
prepare_write_message(con);
@@ -2535,10 +2584,6 @@ do_next:
prepare_write_ack(con);
goto more;
}
- if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
- prepare_write_keepalive(con);
- goto more;
- }
}
/* Nothing to do! */
@@ -2641,6 +2686,9 @@ more:
case CEPH_MSGR_TAG_ACK:
prepare_read_ack(con);
break;
+ case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+ prepare_read_keepalive_ack(con);
+ break;
case CEPH_MSGR_TAG_CLOSE:
con_close_socket(con);
con->state = CON_STATE_CLOSED;
@@ -2654,7 +2702,7 @@ more:
if (ret <= 0) {
switch (ret) {
case -EBADMSG:
- con->error_msg = "bad crc";
+ con->error_msg = "bad crc/signature";
/* fall through */
case -EBADE:
ret = -EIO;
@@ -2684,6 +2732,12 @@ more:
process_ack(con);
goto more;
}
+ if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+ ret = read_keepalive_ack(con);
+ if (ret <= 0)
+ goto out;
+ goto more;
+ }
out:
dout("try_read done on %p ret %d\n", con, ret);
@@ -2799,7 +2853,7 @@ static void con_fault_finish(struct ceph_connection *con)
/*
* Do some work on a connection. Drop a connection ref when we're done.
*/
-static void con_work(struct work_struct *work)
+static void ceph_con_workfn(struct work_struct *work)
{
struct ceph_connection *con = container_of(work, struct ceph_connection,
work.work);
@@ -2889,10 +2943,8 @@ static void con_fault(struct ceph_connection *con)
if (con->in_msg) {
BUG_ON(con->in_msg->con != con);
- con->in_msg->con = NULL;
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
- con->ops->put(con);
}
/* Requeue anything that hasn't been acked */
@@ -2923,15 +2975,8 @@ static void con_fault(struct ceph_connection *con)
* initialize a new messenger instance
*/
void ceph_messenger_init(struct ceph_messenger *msgr,
- struct ceph_entity_addr *myaddr,
- u64 supported_features,
- u64 required_features,
- bool nocrc,
- bool tcp_nodelay)
+ struct ceph_entity_addr *myaddr)
{
- msgr->supported_features = supported_features;
- msgr->required_features = required_features;
-
spin_lock_init(&msgr->global_seq_lock);
if (myaddr)
@@ -2941,15 +2986,29 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
msgr->inst.addr.type = 0;
get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
encode_my_addr(msgr);
- msgr->nocrc = nocrc;
- msgr->tcp_nodelay = tcp_nodelay;
atomic_set(&msgr->stopping, 0);
+ write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
dout("%s %p\n", __func__, msgr);
}
EXPORT_SYMBOL(ceph_messenger_init);
+void ceph_messenger_fini(struct ceph_messenger *msgr)
+{
+ put_net(read_pnet(&msgr->net));
+}
+EXPORT_SYMBOL(ceph_messenger_fini);
+
+static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
+{
+ if (msg->con)
+ msg->con->ops->put(msg->con);
+
+ msg->con = con ? con->ops->get(con) : NULL;
+ BUG_ON(msg->con != con);
+}
+
static void clear_standby(struct ceph_connection *con)
{
/* come back from STANDBY? */
@@ -2981,9 +3040,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
return;
}
- BUG_ON(msg->con != NULL);
- msg->con = con->ops->get(con);
- BUG_ON(msg->con == NULL);
+ msg_con_set(msg, con);
BUG_ON(!list_empty(&msg->list_head));
list_add_tail(&msg->list_head, &con->out_queue);
@@ -3011,31 +3068,45 @@ void ceph_msg_revoke(struct ceph_msg *msg)
{
struct ceph_connection *con = msg->con;
- if (!con)
+ if (!con) {
+ dout("%s msg %p null con\n", __func__, msg);
return; /* Message not in our possession */
+ }
mutex_lock(&con->mutex);
if (!list_empty(&msg->list_head)) {
dout("%s %p msg %p - was on queue\n", __func__, con, msg);
list_del_init(&msg->list_head);
- BUG_ON(msg->con == NULL);
- msg->con->ops->put(msg->con);
- msg->con = NULL;
msg->hdr.seq = 0;
ceph_msg_put(msg);
}
if (con->out_msg == msg) {
- dout("%s %p msg %p - was sending\n", __func__, con, msg);
- con->out_msg = NULL;
- if (con->out_kvec_is_msg) {
- con->out_skip = con->out_kvec_bytes;
- con->out_kvec_is_msg = false;
+ BUG_ON(con->out_skip);
+ /* footer */
+ if (con->out_msg_done) {
+ con->out_skip += con_out_kvec_skip(con);
+ } else {
+ BUG_ON(!msg->data_length);
+ if (con->peer_features & CEPH_FEATURE_MSG_AUTH)
+ con->out_skip += sizeof(msg->footer);
+ else
+ con->out_skip += sizeof(msg->old_footer);
}
+ /* data, middle, front */
+ if (msg->data_length)
+ con->out_skip += msg->cursor.total_resid;
+ if (msg->middle)
+ con->out_skip += con_out_kvec_skip(con);
+ con->out_skip += con_out_kvec_skip(con);
+
+ dout("%s %p msg %p - was sending, will write %d skip %d\n",
+ __func__, con, msg, con->out_kvec_bytes, con->out_skip);
msg->hdr.seq = 0;
-
+ con->out_msg = NULL;
ceph_msg_put(msg);
}
+
mutex_unlock(&con->mutex);
}
@@ -3044,16 +3115,13 @@ void ceph_msg_revoke(struct ceph_msg *msg)
*/
void ceph_msg_revoke_incoming(struct ceph_msg *msg)
{
- struct ceph_connection *con;
+ struct ceph_connection *con = msg->con;
- BUG_ON(msg == NULL);
- if (!msg->con) {
+ if (!con) {
dout("%s msg %p null con\n", __func__, msg);
-
return; /* Message not in our possession */
}
- con = msg->con;
mutex_lock(&con->mutex);
if (con->in_msg == msg) {
unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
@@ -3094,6 +3162,20 @@ void ceph_con_keepalive(struct ceph_connection *con)
}
EXPORT_SYMBOL(ceph_con_keepalive);
+bool ceph_con_keepalive_expired(struct ceph_connection *con,
+ unsigned long interval)
+{
+ if (interval > 0 &&
+ (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+ struct timespec now = CURRENT_TIME;
+ struct timespec ts;
+ jiffies_to_timespec(interval, &ts);
+ ts = timespec_add(con->last_keepalive_ack, ts);
+ return timespec_compare(&now, &ts) >= 0;
+ }
+ return false;
+}
+
static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
{
struct ceph_msg_data *data;
@@ -3285,9 +3367,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
}
if (msg) {
BUG_ON(*skip);
+ msg_con_set(msg, con);
con->in_msg = msg;
- con->in_msg->con = con->ops->get(con);
- BUG_ON(con->in_msg->con == NULL);
} else {
/*
* Null message pointer means either we should skip
@@ -3334,6 +3415,8 @@ static void ceph_msg_release(struct kref *kref)
dout("%s %p\n", __func__, m);
WARN_ON(!list_empty(&m->list_head));
+ msg_con_set(m, NULL);
+
/* drop middle, data, if any */
if (m->middle) {
ceph_buffer_put(m->middle);
diff --git a/kernel/net/ceph/mon_client.c b/kernel/net/ceph/mon_client.c
index 2b3cf05e8..edda01626 100644
--- a/kernel/net/ceph/mon_client.c
+++ b/kernel/net/ceph/mon_client.c
@@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc)
CEPH_ENTITY_TYPE_MON, monc->cur_mon,
&monc->monmap->mon_inst[monc->cur_mon].addr);
+ /* send an initial keepalive to ensure our timestamp is
+ * valid by the time we are in an OPENED state */
+ ceph_con_keepalive(&monc->con);
+
/* initiatiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth,
monc->m_auth->front.iov_base,
@@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc)
*/
static void __schedule_delayed(struct ceph_mon_client *monc)
{
- unsigned int delay;
+ struct ceph_options *opt = monc->client->options;
+ unsigned long delay;
- if (monc->cur_mon < 0 || __sub_expired(monc))
+ if (monc->cur_mon < 0 || __sub_expired(monc)) {
delay = 10 * HZ;
- else
+ } else {
delay = 20 * HZ;
- dout("__schedule_delayed after %u\n", delay);
- schedule_delayed_work(&monc->delayed_work, delay);
+ if (opt->monc_ping_timeout > 0)
+ delay = min(delay, opt->monc_ping_timeout / 3);
+ }
+ dout("__schedule_delayed after %lu\n", delay);
+ schedule_delayed_work(&monc->delayed_work,
+ round_jiffies_relative(delay));
}
/*
@@ -298,21 +307,28 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
}
EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
+/*
+ * Wait for an osdmap with a given epoch.
+ *
+ * @epoch: epoch to wait for
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
unsigned long timeout)
{
unsigned long started = jiffies;
- int ret;
+ long ret;
mutex_lock(&monc->mutex);
while (monc->have_osdmap < epoch) {
mutex_unlock(&monc->mutex);
- if (timeout != 0 && time_after_eq(jiffies, started + timeout))
+ if (timeout && time_after_eq(jiffies, started + timeout))
return -ETIMEDOUT;
ret = wait_event_interruptible_timeout(monc->client->auth_wq,
- monc->have_osdmap >= epoch, timeout);
+ monc->have_osdmap >= epoch,
+ ceph_timeout_jiffies(timeout));
if (ret < 0)
return ret;
@@ -736,11 +752,23 @@ static void delayed_work(struct work_struct *work)
__close_session(monc);
__open_session(monc); /* continue hunting */
} else {
- ceph_con_keepalive(&monc->con);
+ struct ceph_options *opt = monc->client->options;
+ int is_auth = ceph_auth_is_authenticated(monc->auth);
+ if (ceph_con_keepalive_expired(&monc->con,
+ opt->monc_ping_timeout)) {
+ dout("monc keepalive timeout\n");
+ is_auth = 0;
+ __close_session(monc);
+ monc->hunting = true;
+ __open_session(monc);
+ }
- __validate_auth(monc);
+ if (!monc->hunting) {
+ ceph_con_keepalive(&monc->con);
+ __validate_auth(monc);
+ }
- if (ceph_auth_is_authenticated(monc->auth))
+ if (is_auth)
__send_subscribe(monc);
}
__schedule_delayed(monc);
diff --git a/kernel/net/ceph/osd_client.c b/kernel/net/ceph/osd_client.c
index c4ec92392..a28e47ff1 100644
--- a/kernel/net/ceph/osd_client.c
+++ b/kernel/net/ceph/osd_client.c
@@ -120,11 +120,13 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
}
#endif /* CONFIG_BLOCK */
-#define osd_req_op_data(oreq, whch, typ, fld) \
- ({ \
- BUG_ON(whch >= (oreq)->r_num_ops); \
- &(oreq)->r_ops[whch].typ.fld; \
- })
+#define osd_req_op_data(oreq, whch, typ, fld) \
+({ \
+ struct ceph_osd_request *__oreq = (oreq); \
+ unsigned int __whch = (whch); \
+ BUG_ON(__whch >= __oreq->r_num_ops); \
+ &__oreq->r_ops[__whch].typ.fld; \
+})
static struct ceph_osd_data *
osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
@@ -285,6 +287,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
switch (op->op) {
case CEPH_OSD_OP_READ:
case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
ceph_osd_data_release(&op->extent.osd_data);
break;
case CEPH_OSD_OP_CALL:
@@ -296,6 +299,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_CMPXATTR:
ceph_osd_data_release(&op->xattr.osd_data);
break;
+ case CEPH_OSD_OP_STAT:
+ ceph_osd_data_release(&op->raw_data_in);
+ break;
default:
break;
}
@@ -450,7 +456,7 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
*/
static struct ceph_osd_req_op *
_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
- u16 opcode)
+ u16 opcode, u32 flags)
{
struct ceph_osd_req_op *op;
@@ -460,14 +466,15 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
op = &osd_req->r_ops[which];
memset(op, 0, sizeof (*op));
op->op = opcode;
+ op->flags = flags;
return op;
}
void osd_req_op_init(struct ceph_osd_request *osd_req,
- unsigned int which, u16 opcode)
+ unsigned int which, u16 opcode, u32 flags)
{
- (void)_osd_req_op_init(osd_req, which, opcode);
+ (void)_osd_req_op_init(osd_req, which, opcode, flags);
}
EXPORT_SYMBOL(osd_req_op_init);
@@ -476,17 +483,19 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
u64 offset, u64 length,
u64 truncate_size, u32 truncate_seq)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+ opcode, 0);
size_t payload_len = 0;
BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
- opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE);
+ opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
+ opcode != CEPH_OSD_OP_TRUNCATE);
op->extent.offset = offset;
op->extent.length = length;
op->extent.truncate_size = truncate_size;
op->extent.truncate_seq = truncate_seq;
- if (opcode == CEPH_OSD_OP_WRITE)
+ if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
payload_len += length;
op->payload_len = payload_len;
@@ -515,7 +524,8 @@ EXPORT_SYMBOL(osd_req_op_extent_update);
void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *class, const char *method)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+ opcode, 0);
struct ceph_pagelist *pagelist;
size_t payload_len = 0;
size_t size;
@@ -552,7 +562,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *name, const void *value,
size_t size, u8 cmp_op, u8 cmp_mode)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+ opcode, 0);
struct ceph_pagelist *pagelist;
size_t payload_len;
@@ -585,7 +596,8 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode,
u64 cookie, u64 version, int flag)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+ opcode, 0);
BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
@@ -602,7 +614,8 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
u64 expected_write_size)
{
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
- CEPH_OSD_OP_SETALLOCHINT);
+ CEPH_OSD_OP_SETALLOCHINT,
+ 0);
op->alloc_hint.expected_object_size = expected_object_size;
op->alloc_hint.expected_write_size = expected_write_size;
@@ -661,9 +674,11 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
break;
case CEPH_OSD_OP_READ:
case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
case CEPH_OSD_OP_ZERO:
case CEPH_OSD_OP_TRUNCATE:
- if (src->op == CEPH_OSD_OP_WRITE)
+ if (src->op == CEPH_OSD_OP_WRITE ||
+ src->op == CEPH_OSD_OP_WRITEFULL)
request_data_len = src->extent.length;
dst->extent.offset = cpu_to_le64(src->extent.offset);
dst->extent.length = cpu_to_le64(src->extent.length);
@@ -672,7 +687,8 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->extent.truncate_seq =
cpu_to_le32(src->extent.truncate_seq);
osd_data = &src->extent.osd_data;
- if (src->op == CEPH_OSD_OP_WRITE)
+ if (src->op == CEPH_OSD_OP_WRITE ||
+ src->op == CEPH_OSD_OP_WRITEFULL)
ceph_osdc_msg_data_add(req->r_request, osd_data);
else
ceph_osdc_msg_data_add(req->r_reply, osd_data);
@@ -786,7 +802,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
}
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
- osd_req_op_init(req, which, opcode);
+ osd_req_op_init(req, which, opcode, 0);
} else {
u32 object_size = le32_to_cpu(layout->fl_object_size);
u32 object_base = off - objoff;
@@ -1088,7 +1104,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
BUG_ON(!list_empty(&osd->o_osd_lru));
list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
- osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+ osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
}
static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
@@ -1199,7 +1215,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
{
schedule_delayed_work(&osdc->timeout_work,
- osdc->client->options->osd_keepalive_timeout * HZ);
+ osdc->client->options->osd_keepalive_timeout);
}
static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
@@ -1567,10 +1583,9 @@ static void handle_timeout(struct work_struct *work)
{
struct ceph_osd_client *osdc =
container_of(work, struct ceph_osd_client, timeout_work.work);
+ struct ceph_options *opts = osdc->client->options;
struct ceph_osd_request *req;
struct ceph_osd *osd;
- unsigned long keepalive =
- osdc->client->options->osd_keepalive_timeout * HZ;
struct list_head slow_osds;
dout("timeout\n");
down_read(&osdc->map_sem);
@@ -1586,7 +1601,8 @@ static void handle_timeout(struct work_struct *work)
*/
INIT_LIST_HEAD(&slow_osds);
list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
- if (time_before(jiffies, req->r_stamp + keepalive))
+ if (time_before(jiffies,
+ req->r_stamp + opts->osd_keepalive_timeout))
break;
osd = req->r_osd;
@@ -1613,8 +1629,7 @@ static void handle_osds_timeout(struct work_struct *work)
struct ceph_osd_client *osdc =
container_of(work, struct ceph_osd_client,
osds_timeout_work.work);
- unsigned long delay =
- osdc->client->options->osd_idle_ttl * HZ >> 2;
+ unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
dout("osds timeout\n");
down_read(&osdc->map_sem);
@@ -1737,8 +1752,7 @@ static void complete_request(struct ceph_osd_request *req)
* handle osd op reply. either call the callback if it is specified,
* or do the completion to wake up the waiting thread.
*/
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
- struct ceph_connection *con)
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
void *p, *end;
struct ceph_osd_request *req;
@@ -2619,7 +2633,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
osdc->event_count = 0;
schedule_delayed_work(&osdc->osds_timeout_work,
- round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+ round_jiffies_relative(osdc->client->options->osd_idle_ttl));
err = -ENOMEM;
osdc->req_mempool = mempool_create_kmalloc_pool(10,
@@ -2794,7 +2808,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
ceph_osdc_handle_map(osdc, msg);
break;
case CEPH_MSG_OSD_OPREPLY:
- handle_reply(osdc, msg, con);
+ handle_reply(osdc, msg);
break;
case CEPH_MSG_WATCH_NOTIFY:
handle_watch_notify(osdc, msg);
@@ -2809,8 +2823,9 @@ out:
}
/*
- * lookup and return message for incoming reply. set up reply message
- * pages.
+ * Lookup and return message for incoming reply. Don't try to do
+ * anything about a larger than preallocated data portion of the
+ * message at the moment - for now, just skip the message.
*/
static struct ceph_msg *get_reply(struct ceph_connection *con,
struct ceph_msg_header *hdr,
@@ -2828,23 +2843,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
mutex_lock(&osdc->request_mutex);
req = __lookup_request(osdc, tid);
if (!req) {
- *skip = 1;
+ dout("%s osd%d tid %llu unknown, skipping\n", __func__,
+ osd->o_osd, tid);
m = NULL;
- dout("get_reply unknown tid %llu from osd%d\n", tid,
- osd->o_osd);
+ *skip = 1;
goto out;
}
- if (req->r_reply->con)
- dout("%s revoking msg %p from old con %p\n", __func__,
- req->r_reply, req->r_reply->con);
ceph_msg_revoke_incoming(req->r_reply);
if (front_len > req->r_reply->front_alloc_len) {
- pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n",
- front_len, req->r_reply->front_alloc_len,
- (unsigned int)con->peer_name.type,
- le64_to_cpu(con->peer_name.num));
+ pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
+ __func__, osd->o_osd, req->r_tid, front_len,
+ req->r_reply->front_alloc_len);
m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
false);
if (!m)
@@ -2852,37 +2863,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
ceph_msg_put(req->r_reply);
req->r_reply = m;
}
- m = ceph_msg_get(req->r_reply);
-
- if (data_len > 0) {
- struct ceph_osd_data *osd_data;
- /*
- * XXX This is assuming there is only one op containing
- * XXX page data. Probably OK for reads, but this
- * XXX ought to be done more generally.
- */
- osd_data = osd_req_op_extent_osd_data(req, 0);
- if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
- if (osd_data->pages &&
- unlikely(osd_data->length < data_len)) {
-
- pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n",
- tid, data_len, osd_data->length);
- *skip = 1;
- ceph_msg_put(m);
- m = NULL;
- goto out;
- }
- }
+ if (data_len > req->r_reply->data_length) {
+ pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
+ __func__, osd->o_osd, req->r_tid, data_len,
+ req->r_reply->data_length);
+ m = NULL;
+ *skip = 1;
+ goto out;
}
- *skip = 0;
+
+ m = ceph_msg_get(req->r_reply);
dout("get_reply tid %lld %p\n", tid, m);
out:
mutex_unlock(&osdc->request_mutex);
return m;
-
}
static struct ceph_msg *alloc_msg(struct ceph_connection *con,
@@ -2980,17 +2976,19 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&osdc->client->monc);
}
-static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+static int osd_sign_message(struct ceph_msg *msg)
{
- struct ceph_osd *o = con->private;
+ struct ceph_osd *o = msg->con->private;
struct ceph_auth_handshake *auth = &o->o_auth;
+
return ceph_auth_sign_message(auth, msg);
}
-static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+static int osd_check_message_signature(struct ceph_msg *msg)
{
- struct ceph_osd *o = con->private;
+ struct ceph_osd *o = msg->con->private;
struct ceph_auth_handshake *auth = &o->o_auth;
+
return ceph_auth_check_message_signature(auth, msg);
}
@@ -3002,7 +3000,7 @@ static const struct ceph_connection_operations osd_con_ops = {
.verify_authorizer_reply = verify_authorizer_reply,
.invalidate_authorizer = invalidate_authorizer,
.alloc_msg = alloc_msg,
- .sign_message = sign_message,
- .check_message_signature = check_message_signature,
+ .sign_message = osd_sign_message,
+ .check_message_signature = osd_check_message_signature,
.fault = osd_reset,
};
diff --git a/kernel/net/ceph/osdmap.c b/kernel/net/ceph/osdmap.c
index 4a3125836..7d8f581d9 100644
--- a/kernel/net/ceph/osdmap.c
+++ b/kernel/net/ceph/osdmap.c
@@ -1300,7 +1300,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
ceph_decode_addr(&addr);
pr_info("osd%d up\n", osd);
BUG_ON(osd >= map->max_osd);
- map->osd_state[osd] |= CEPH_OSD_UP;
+ map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
map->osd_addr[osd] = addr;
}
diff --git a/kernel/net/ceph/pagevec.c b/kernel/net/ceph/pagevec.c
index 096d91447..d4f5f220a 100644
--- a/kernel/net/ceph/pagevec.c
+++ b/kernel/net/ceph/pagevec.c
@@ -51,10 +51,7 @@ void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
set_page_dirty_lock(pages[i]);
put_page(pages[i]);
}
- if (is_vmalloc_addr(pages))
- vfree(pages);
- else
- kfree(pages);
+ kvfree(pages);
}
EXPORT_SYMBOL(ceph_put_page_vector);
diff --git a/kernel/net/core/Makefile b/kernel/net/core/Makefile
index fec0856dd..086b01fbe 100644
--- a/kernel/net/core/Makefile
+++ b/kernel/net/core/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
diff --git a/kernel/net/core/datagram.c b/kernel/net/core/datagram.c
index 617088aee..d62af69ad 100644
--- a/kernel/net/core/datagram.c
+++ b/kernel/net/core/datagram.c
@@ -785,7 +785,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
if (sock_writeable(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
diff --git a/kernel/net/core/dev.c b/kernel/net/core/dev.c
index f8c23dee5..0e17592ad 100644
--- a/kernel/net/core/dev.c
+++ b/kernel/net/core/dev.c
@@ -99,6 +99,7 @@
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -135,6 +136,7 @@
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
+#include <linux/netfilter_ingress.h>
#include "net-sysfs.h"
@@ -469,10 +471,14 @@ EXPORT_SYMBOL(dev_remove_pack);
*/
void dev_add_offload(struct packet_offload *po)
{
- struct list_head *head = &offload_base;
+ struct packet_offload *elem;
spin_lock(&offload_lock);
- list_add_rcu(&po->list, head);
+ list_for_each_entry(elem, &offload_base, list) {
+ if (po->priority < elem->priority)
+ break;
+ }
+ list_add_rcu(&po->list, elem->list.prev);
spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
@@ -678,6 +684,32 @@ int dev_get_iflink(const struct net_device *dev)
EXPORT_SYMBOL(dev_get_iflink);
/**
+ * dev_fill_metadata_dst - Retrieve tunnel egress information.
+ * @dev: targeted interface
+ * @skb: The packet.
+ *
+ * For better visibility of tunnel traffic, OVS needs to retrieve the
+ * egress tunnel information for a packet. The following API allows the
+ * caller to get this info.
+ */
+int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+ struct ip_tunnel_info *info;
+
+ if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
+ return -EINVAL;
+
+ info = skb_tunnel_info_unclone(skb);
+ if (!info)
+ return -ENOMEM;
+ if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
+ return -EINVAL;
+
+ return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
+}
+EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+
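[Editor's note] A minimal sketch of how a caller such as OVS might use the helper just added; the function name and error policy below are illustrative assumptions, not part of this patch:

static int example_report_egress(struct net_device *tun_dev,
                                 struct sk_buff *skb)
{
    int err;

    /* Ask the tunnel driver to fill skb's metadata dst with TX info. */
    err = dev_fill_metadata_dst(tun_dev, skb);
    if (err)
        return err;    /* -EINVAL/-ENOMEM: no usable tunnel info */

    /* skb_tunnel_info(skb) now describes the egress tunnel key. */
    return 0;
}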
+/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
@@ -1632,7 +1664,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
}
EXPORT_SYMBOL(call_netdevice_notifiers);
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;
void net_inc_ingress_queue(void)
@@ -2347,21 +2379,52 @@ void netif_device_attach(struct net_device *dev)
}
EXPORT_SYMBOL(netif_device_attach);
+/*
+ * Returns a Tx hash based on the given packet descriptor and the number
+ * of Tx queues to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
+ unsigned int num_tx_queues)
+{
+ u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = num_tx_queues;
+
+ if (skb_rx_queue_recorded(skb)) {
+ hash = skb_get_rx_queue(skb);
+ while (unlikely(hash >= num_tx_queues))
+ hash -= num_tx_queues;
+ return hash;
+ }
+
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+ qoffset = dev->tc_to_txq[tc].offset;
+ qcount = dev->tc_to_txq[tc].count;
+ }
+
+ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
+
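[Editor's note] As an illustrative sketch only (not part of the patch), a driver without its own steering policy could spread flows over all of its real Tx queues with the helper above:

static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb)
{
    /* Hash the flow into [0, real_num_tx_queues). */
    return __skb_tx_hash(dev, skb, dev->real_num_tx_queues);
}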
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
static const netdev_features_t null_features = 0;
struct net_device *dev = skb->dev;
- const char *driver = "";
+ const char *name = "";
if (!net_ratelimit())
return;
- if (dev && dev->dev.parent)
- driver = dev_driver_string(dev->dev.parent);
-
+ if (dev) {
+ if (dev->dev.parent)
+ name = dev_driver_string(dev->dev.parent);
+ else
+ name = netdev_name(dev);
+ }
WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
"gso_type=%d ip_summed=%d\n",
- driver, dev ? &dev->features : &null_features,
+ name, dev ? &dev->features : &null_features,
skb->sk ? &skb->sk->sk_route_caps : &null_features,
skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
skb_shinfo(skb)->gso_type, skb->ip_summed);
@@ -2487,6 +2550,8 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
*
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
+ *
+ * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
*/
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
@@ -2501,6 +2566,9 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
return ERR_PTR(err);
}
+ BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
+
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
SKB_GSO_CB(skb)->encap_level = 0;
@@ -2823,7 +2891,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
* This permits __QDISC___STATE_RUNNING owner to get the lock more
* often and dequeue packets faster.
*/
+#ifdef CONFIG_PREEMPT_RT_FULL
+ contended = true;
+#else
contended = qdisc_is_running(q);
+#endif
if (unlikely(contended))
spin_lock(&q->busylock);
@@ -2883,16 +2955,53 @@ static void skb_update_prio(struct sk_buff *skb)
#define skb_update_prio(skb)
#endif
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+static inline int xmit_rec_read(void)
+{
+ return current->xmit_recursion;
+}
+
+static inline void xmit_rec_inc(void)
+{
+ current->xmit_recursion++;
+}
+
+static inline void xmit_rec_dec(void)
+{
+ current->xmit_recursion--;
+}
+
+#else
+
DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);
+static inline int xmit_rec_read(void)
+{
+ return __this_cpu_read(xmit_recursion);
+}
+
+static inline void xmit_rec_inc(void)
+{
+ __this_cpu_inc(xmit_recursion);
+}
+
+static inline void xmit_rec_dec(void)
+{
+ __this_cpu_dec(xmit_recursion);
+}
+#endif
+
#define RECURSION_LIMIT 10
/**
* dev_loopback_xmit - loop back @skb
+ * @net: network namespace this loopback is happening in
+ * @sk: sk needed to be a netfilter okfn
* @skb: buffer to transmit
*/
-int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
+int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
skb_reset_mac_header(skb);
__skb_pull(skb, skb_network_offset(skb));
@@ -2905,6 +3014,85 @@ int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(dev_loopback_xmit);
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ struct xps_dev_maps *dev_maps;
+ struct xps_map *map;
+ int queue_index = -1;
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps);
+ if (dev_maps) {
+ map = rcu_dereference(
+ dev_maps->cpu_map[skb->sender_cpu - 1]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else
+ queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
+ map->len)];
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ }
+ rcu_read_unlock();
+
+ return queue_index;
+#else
+ return -1;
+#endif
+}
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ int queue_index = sk_tx_queue_get(sk);
+
+ if (queue_index < 0 || skb->ooo_okay ||
+ queue_index >= dev->real_num_tx_queues) {
+ int new_index = get_xps_queue(dev, skb);
+ if (new_index < 0)
+ new_index = skb_tx_hash(dev, skb);
+
+ if (queue_index != new_index && sk &&
+ sk_fullsock(sk) &&
+ rcu_access_pointer(sk->sk_dst_cache))
+ sk_tx_queue_set(sk, new_index);
+
+ queue_index = new_index;
+ }
+
+ return queue_index;
+}
+
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ void *accel_priv)
+{
+ int queue_index = 0;
+
+#ifdef CONFIG_XPS
+ if (skb->sender_cpu == 0)
+ skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
+ if (dev->real_num_tx_queues != 1) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ if (ops->ndo_select_queue)
+ queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+ __netdev_pick_tx);
+ else
+ queue_index = __netdev_pick_tx(dev, skb);
+
+ if (!accel_priv)
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+ return netdev_get_tx_queue(dev, queue_index);
+}
+
/**
* __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
@@ -2958,6 +3146,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
else
skb_dst_force(skb);
+#ifdef CONFIG_NET_SWITCHDEV
+ /* Don't forward if offload device already forwarded */
+ if (skb->offload_fwd_mark &&
+ skb->offload_fwd_mark == dev->offload_fwd_mark) {
+ consume_skb(skb);
+ rc = NET_XMIT_SUCCESS;
+ goto out;
+ }
+#endif
+
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
@@ -2987,7 +3185,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
if (txq->xmit_lock_owner != cpu) {
- if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+ if (xmit_rec_read() > RECURSION_LIMIT)
goto recursion_alert;
skb = validate_xmit_skb(skb, dev);
@@ -2997,9 +3195,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
- __this_cpu_inc(xmit_recursion);
+ xmit_rec_inc();
skb = dev_hard_start_xmit(skb, dev, txq, &rc);
- __this_cpu_dec(xmit_recursion);
+ xmit_rec_dec();
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
goto out;
@@ -3030,11 +3228,11 @@ out:
return rc;
}
-int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
+int dev_queue_xmit(struct sk_buff *skb)
{
return __dev_queue_xmit(skb, NULL);
}
-EXPORT_SYMBOL(dev_queue_xmit_sk);
+EXPORT_SYMBOL(dev_queue_xmit);
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
@@ -3549,66 +3747,55 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
-#ifdef CONFIG_NET_CLS_ACT
-/* TODO: Maybe we should just force sch_ingress to be compiled in
- * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
- * a compare and 2 stores extra right now if we dont have it on
- * but have CONFIG_NET_CLS_ACT
- * NOTE: This doesn't stop any functionality; if you dont have
- * the ingress scheduler, you just can't add policies on ingress.
- *
- */
-static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
-{
- struct net_device *dev = skb->dev;
- u32 ttl = G_TC_RTTL(skb->tc_verd);
- int result = TC_ACT_OK;
- struct Qdisc *q;
-
- if (unlikely(MAX_RED_LOOP < ttl++)) {
- net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
- skb->skb_iif, dev->ifindex);
- return TC_ACT_SHOT;
- }
-
- skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
-
- q = rcu_dereference(rxq->qdisc);
- if (q != &noop_qdisc) {
- spin_lock(qdisc_lock(q));
- if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
- result = qdisc_enqueue_root(skb, q);
- spin_unlock(qdisc_lock(q));
- }
-
- return result;
-}
-
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
- struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+#ifdef CONFIG_NET_CLS_ACT
+ struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
+ struct tcf_result cl_res;
- if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
+ /* If there's at least one ingress present somewhere (so
+ * we get here via enabled static key), remaining devices
+ * that are not configured with an ingress qdisc will bail
+ * out here.
+ */
+ if (!cl)
return skb;
-
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
- switch (ing_filter(skb, rxq)) {
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+ qdisc_bstats_cpu_update(cl->q, skb);
+
+ switch (tc_classify(skb, cl, &cl_res, false)) {
+ case TC_ACT_OK:
+ case TC_ACT_RECLASSIFY:
+ skb->tc_index = TC_H_MIN(cl_res.classid);
+ break;
case TC_ACT_SHOT:
+ qdisc_qstats_cpu_drop(cl->q);
case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
kfree_skb(skb);
return NULL;
+ case TC_ACT_REDIRECT:
+ /* skb_mac_header check was done by cls/act_bpf, so
+ * we can safely push the L2 header back before
+ * redirecting to another netdev
+ */
+ __skb_push(skb, skb->mac_len);
+ skb_do_redirect(skb);
+ return NULL;
+ default:
+ break;
}
-
+#endif /* CONFIG_NET_CLS_ACT */
return skb;
}
-#endif
/**
* netdev_rx_handler_register - register receive handler
@@ -3681,6 +3868,22 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
}
}
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (nf_hook_ingress_active(skb)) {
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ return nf_hook_ingress(skb);
+ }
+#endif /* CONFIG_NETFILTER_INGRESS */
+ return 0;
+}
+
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
@@ -3738,13 +3941,17 @@ another_round:
}
skip_taps:
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
if (static_key_false(&ingress_needed)) {
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
- }
+ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+ goto out;
+ }
+#endif
+#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = 0;
ncls:
#endif
@@ -3897,13 +4104,13 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
+int netif_receive_skb(struct sk_buff *skb)
{
trace_netif_receive_skb_entry(skb);
return netif_receive_skb_internal(skb);
}
-EXPORT_SYMBOL(netif_receive_skb_sk);
+EXPORT_SYMBOL(netif_receive_skb);
/* Network device is going away, flush any packets still pending
* Called with irqs disabled.
@@ -4017,6 +4224,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= p->vlan_tci ^ skb->vlan_tci;
+ diffs |= skb_metadata_dst_cmp(p, skb);
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
skb_mac_header(skb));
@@ -4214,10 +4422,12 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
break;
case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
+ skb_dst_drop(skb);
kmem_cache_free(skbuff_head_cache, skb);
- else
+ } else {
__kfree_skb(skb);
+ }
break;
case GRO_HELD:
@@ -4634,6 +4844,8 @@ void napi_disable(struct napi_struct *n)
while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
msleep(1);
+ while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
+ msleep(1);
hrtimer_cancel(&n->timer);
@@ -4755,7 +4967,7 @@ static void net_rx_action(struct softirq_action *h)
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
}
@@ -4776,8 +4988,7 @@ struct netdev_adjacent {
struct rcu_head rcu;
};
-static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
- struct net_device *adj_dev,
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
struct list_head *adj_list)
{
struct netdev_adjacent *adj;
@@ -4803,7 +5014,7 @@ bool netdev_has_upper_dev(struct net_device *dev,
{
ASSERT_RTNL();
- return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
+ return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
@@ -4916,7 +5127,7 @@ EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
* Gets the next netdev_adjacent->private from the dev's lower neighbour
* list, starting from iter position. The caller must hold either hold the
* RTNL lock or its own locking that guarantees that the neighbour lower
- * list will remain unchainged.
+ * list will remain unchanged.
*/
void *netdev_lower_get_next_private(struct net_device *dev,
struct list_head **iter)
@@ -4971,7 +5182,7 @@ EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
* Gets the next netdev_adjacent from the dev's lower neighbour
* list, starting from iter position. The caller must hold RTNL lock or
* its own locking that guarantees that the neighbour lower
- * list will remain unchainged.
+ * list will remain unchanged.
*/
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
@@ -5065,7 +5276,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct netdev_adjacent *adj;
int ret;
- adj = __netdev_find_adj(dev, adj_dev, dev_list);
+ adj = __netdev_find_adj(adj_dev, dev_list);
if (adj) {
adj->ref_nr++;
@@ -5121,7 +5332,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
{
struct netdev_adjacent *adj;
- adj = __netdev_find_adj(dev, adj_dev, dev_list);
+ adj = __netdev_find_adj(adj_dev, dev_list);
if (!adj) {
pr_err("tried to remove device %s from %s\n",
@@ -5232,6 +5443,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *private)
{
+ struct netdev_notifier_changeupper_info changeupper_info;
struct netdev_adjacent *i, *j, *to_i, *to_j;
int ret = 0;
@@ -5241,15 +5453,25 @@ static int __netdev_upper_dev_link(struct net_device *dev,
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
+ if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
return -EBUSY;
- if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
+ if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
return -EBUSY;
+ changeupper_info.upper_dev = upper_dev;
+ changeupper_info.master = master;
+ changeupper_info.linking = true;
+
+ ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
+ &changeupper_info.info);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ return ret;
+
ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
master);
if (ret)
@@ -5288,7 +5510,8 @@ static int __netdev_upper_dev_link(struct net_device *dev,
goto rollback_lower_mesh;
}
- call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+ call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+ &changeupper_info.info);
return 0;
rollback_lower_mesh:
@@ -5383,9 +5606,17 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
+ struct netdev_notifier_changeupper_info changeupper_info;
struct netdev_adjacent *i, *j;
ASSERT_RTNL();
+ changeupper_info.upper_dev = upper_dev;
+ changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
+ changeupper_info.linking = false;
+
+ call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
+ &changeupper_info.info);
+
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
/* Here is the tricky part. We must remove all dev's lower
@@ -5405,7 +5636,8 @@ void netdev_upper_dev_unlink(struct net_device *dev,
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
__netdev_adjacent_dev_unlink(dev, i->dev);
- call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+ call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+ &changeupper_info.info);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
@@ -5511,7 +5743,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
if (!lower_dev)
return NULL;
- lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
+ lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
if (!lower)
return NULL;
@@ -6006,6 +6238,26 @@ int dev_get_phys_port_name(struct net_device *dev,
EXPORT_SYMBOL(dev_get_phys_port_name);
/**
+ * dev_change_proto_down - update protocol port state information
+ * @dev: device
+ * @proto_down: new value
+ *
+ * This info can be used by switch drivers to set the phys state of the
+ * port.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_change_proto_down)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_change_proto_down(dev, proto_down);
+}
+EXPORT_SYMBOL(dev_change_proto_down);
+
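[Editor's note] Hypothetical usage sketch (not from the patch): a control path flipping protodown and tolerating drivers that do not implement the ndo:

static int example_set_protodown(struct net_device *dev, bool down)
{
    int err;

    err = dev_change_proto_down(dev, down);    /* caller assumed to hold RTNL */
    if (err == -EOPNOTSUPP)
        err = 0;    /* driver has no ndo_change_proto_down */
    return err;
}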
+/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
*
@@ -6129,6 +6381,48 @@ static void rollback_registered(struct net_device *dev)
list_del(&single);
}
+static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
+ struct net_device *upper, netdev_features_t features)
+{
+ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+ netdev_features_t feature;
+ int feature_bit;
+
+ for_each_netdev_feature(&upper_disables, feature_bit) {
+ feature = __NETIF_F_BIT(feature_bit);
+ if (!(upper->wanted_features & feature)
+ && (features & feature)) {
+ netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
+ &feature, upper->name);
+ features &= ~feature;
+ }
+ }
+
+ return features;
+}
+
+static void netdev_sync_lower_features(struct net_device *upper,
+ struct net_device *lower, netdev_features_t features)
+{
+ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+ netdev_features_t feature;
+ int feature_bit;
+
+ for_each_netdev_feature(&upper_disables, feature_bit) {
+ feature = __NETIF_F_BIT(feature_bit);
+ if (!(features & feature) && (lower->features & feature)) {
+ netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
+ &feature, lower->name);
+ lower->wanted_features &= ~feature;
+ netdev_update_features(lower);
+
+ if (unlikely(lower->features & feature))
+ netdev_WARN(upper, "failed to disable %pNF on %s!\n",
+ &feature, lower->name);
+ }
+ }
+}
+
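[Editor's note] Illustrative check only, assuming NETIF_F_LRO is part of NETIF_F_UPPER_DISABLES in this kernel: after __netdev_update_features() a lower device should not keep such a feature on while its upper device wants it off.

static bool example_lro_consistent(const struct net_device *upper,
                                   const struct net_device *lower)
{
    /* Either the upper still wants LRO, or the lower has it off. */
    return (upper->wanted_features & NETIF_F_LRO) ||
           !(lower->features & NETIF_F_LRO);
}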
static netdev_features_t netdev_fix_features(struct net_device *dev,
netdev_features_t features)
{
@@ -6198,8 +6492,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
int __netdev_update_features(struct net_device *dev)
{
+ struct net_device *upper, *lower;
netdev_features_t features;
- int err = 0;
+ struct list_head *iter;
+ int err = -1;
ASSERT_RTNL();
@@ -6211,26 +6507,42 @@ int __netdev_update_features(struct net_device *dev)
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
+ /* some features can't be enabled if they're off on an upper device */
+ netdev_for_each_upper_dev_rcu(dev, upper, iter)
+ features = netdev_sync_upper_features(dev, upper, features);
+
if (dev->features == features)
- return 0;
+ goto sync_lower;
netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
&dev->features, &features);
if (dev->netdev_ops->ndo_set_features)
err = dev->netdev_ops->ndo_set_features(dev, features);
+ else
+ err = 0;
if (unlikely(err < 0)) {
netdev_err(dev,
"set_features() failed (%d); wanted %pNF, left %pNF\n",
err, &features, &dev->features);
+ /* return non-0 since some features might have changed and
+ * it's better to fire a spurious notification than miss it
+ */
return -1;
}
+sync_lower:
+ /* some features must be disabled on lower devices when disabled
+ * on an upper device (think: bonding master or bridge)
+ */
+ netdev_for_each_lower_dev(dev, lower, iter)
+ netdev_sync_lower_features(dev, lower, features);
+
if (!err)
dev->features = features;
- return 1;
+ return err < 0 ? 0 : 1;
}
/**
@@ -6357,6 +6669,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
return 0;
}
+void netif_tx_stop_all_queues(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+ netif_tx_stop_queue(txq);
+ }
+}
+EXPORT_SYMBOL(netif_tx_stop_all_queues);
+
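[Editor's note] A hedged sketch of typical driver usage for the newly exported helper; example_hw_reset() is a placeholder, not a real function:

static void example_reset(struct net_device *dev)
{
    netif_tx_stop_all_queues(dev);    /* quiesce all Tx queues */
    example_hw_reset(dev);
    netif_tx_wake_all_queues(dev);    /* resume once hardware is ready */
}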
/**
* register_netdevice - register a network device
* @dev: device to register
@@ -6887,6 +7210,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
+ if (!dev->tx_queue_len) {
+ dev->priv_flags |= IFF_NO_QUEUE;
+ dev->tx_queue_len = 1;
+ }
+
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev))
@@ -6904,6 +7232,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
+
+ nf_hook_ingress_init(dev);
+
return dev;
free_all:
@@ -6969,7 +7300,7 @@ EXPORT_SYMBOL(free_netdev);
void synchronize_net(void)
{
might_sleep();
- if (rtnl_is_locked())
+ if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
synchronize_rcu_expedited();
else
synchronize_rcu();
@@ -7561,7 +7892,7 @@ static int __init net_dev_init(void)
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
hotcpu_notifier(dev_cpu_callback, 0);
- dst_init();
+ dst_subsys_init();
rc = 0;
out:
return rc;
diff --git a/kernel/net/core/dst.c b/kernel/net/core/dst.c
index e956ce6d1..a1656e3b8 100644
--- a/kernel/net/core/dst.c
+++ b/kernel/net/core/dst.c
@@ -20,8 +20,10 @@
#include <net/net_namespace.h>
#include <linux/sched.h>
#include <linux/prefetch.h>
+#include <net/lwtunnel.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
/*
* Theory of operations:
@@ -142,12 +144,12 @@ loop:
mutex_unlock(&dst_gc_mutex);
}
-int dst_discard_sk(struct sock *sk, struct sk_buff *skb)
+int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
return 0;
}
-EXPORT_SYMBOL(dst_discard_sk);
+EXPORT_SYMBOL(dst_discard_out);
const u32 dst_default_metrics[RTAX_MAX + 1] = {
/* This initializer is needed to force linker to place this variable
@@ -158,19 +160,10 @@ const u32 dst_default_metrics[RTAX_MAX + 1] = {
[RTAX_MAX] = 0xdeadbeef,
};
-
-void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
- int initial_ref, int initial_obsolete, unsigned short flags)
+void dst_init(struct dst_entry *dst, struct dst_ops *ops,
+ struct net_device *dev, int initial_ref, int initial_obsolete,
+ unsigned short flags)
{
- struct dst_entry *dst;
-
- if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
- if (ops->gc(ops))
- return NULL;
- }
- dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
- if (!dst)
- return NULL;
dst->child = NULL;
dst->dev = dev;
if (dev)
@@ -184,7 +177,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
dst->xfrm = NULL;
#endif
dst->input = dst_discard;
- dst->output = dst_discard_sk;
+ dst->output = dst_discard_out;
dst->error = 0;
dst->obsolete = initial_obsolete;
dst->header_len = 0;
@@ -192,6 +185,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
#ifdef CONFIG_IP_ROUTE_CLASSID
dst->tclassid = 0;
#endif
+ dst->lwtstate = NULL;
atomic_set(&dst->__refcnt, initial_ref);
dst->__use = 0;
dst->lastuse = jiffies;
@@ -200,6 +194,25 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
dst->next = NULL;
if (!(flags & DST_NOCOUNT))
dst_entries_add(ops, 1);
+}
+EXPORT_SYMBOL(dst_init);
+
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+ int initial_ref, int initial_obsolete, unsigned short flags)
+{
+ struct dst_entry *dst;
+
+ if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
+ if (ops->gc(ops))
+ return NULL;
+ }
+
+ dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+ if (!dst)
+ return NULL;
+
+ dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+
return dst;
}
EXPORT_SYMBOL(dst_alloc);
@@ -211,7 +224,7 @@ static void ___dst_free(struct dst_entry *dst)
*/
if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
dst->input = dst_discard;
- dst->output = dst_discard_sk;
+ dst->output = dst_discard_out;
}
dst->obsolete = DST_OBSOLETE_DEAD;
}
@@ -248,7 +261,13 @@ again:
dst->ops->destroy(dst);
if (dst->dev)
dev_put(dst->dev);
- kmem_cache_free(dst->ops->kmem_cachep, dst);
+
+ lwtstate_put(dst->lwtstate);
+
+ if (dst->flags & DST_METADATA)
+ kfree(dst);
+ else
+ kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
if (dst) {
@@ -282,10 +301,13 @@ void dst_release(struct dst_entry *dst)
{
if (dst) {
int newrefcnt;
+ unsigned short nocache = dst->flags & DST_NOCACHE;
newrefcnt = atomic_dec_return(&dst->__refcnt);
- WARN_ON(newrefcnt < 0);
- if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
+ if (unlikely(newrefcnt < 0))
+ net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
+ __func__, dst, newrefcnt);
+ if (!newrefcnt && unlikely(nocache))
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
@@ -327,6 +349,69 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+static struct dst_ops md_dst_ops = {
+ .family = AF_UNSPEC,
+};
+
+static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ WARN_ONCE(1, "Attempting to call output on metadata dst\n");
+ kfree_skb(skb);
+ return 0;
+}
+
+static int dst_md_discard(struct sk_buff *skb)
+{
+ WARN_ONCE(1, "Attempting to call input on metadata dst\n");
+ kfree_skb(skb);
+ return 0;
+}
+
+static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
+{
+ struct dst_entry *dst;
+
+ dst = &md_dst->dst;
+ dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
+ DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
+
+ dst->input = dst_md_discard;
+ dst->output = dst_md_discard_out;
+
+ memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
+}
+
+struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+{
+ struct metadata_dst *md_dst;
+
+ md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+ if (!md_dst)
+ return NULL;
+
+ __metadata_dst_init(md_dst, optslen);
+
+ return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc);
+
+struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
+{
+ int cpu;
+ struct metadata_dst __percpu *md_dst;
+
+ md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
+ __alignof__(struct metadata_dst), flags);
+ if (!md_dst)
+ return NULL;
+
+ for_each_possible_cpu(cpu)
+ __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen);
+
+ return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
+
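[Editor's note] Sketch of a caller attaching tunnel TX metadata via the new allocator; the field usage mirrors the bpf_skb_set_tunnel_key() helper later in this patch and is shown here only as an assumption-laden example:

static int example_attach_tun_info(struct sk_buff *skb, __be64 tun_id)
{
    struct metadata_dst *md;

    md = metadata_dst_alloc(0, GFP_ATOMIC);    /* no option bytes */
    if (!md)
        return -ENOMEM;

    md->u.tun_info.mode = IP_TUNNEL_INFO_TX;
    md->u.tun_info.key.tun_id = tun_id;

    skb_dst_drop(skb);
    skb_dst_set(skb, &md->dst);    /* skb takes the initial reference */
    return 0;
}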
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
* this mistake in 2.3, but we have no choice
@@ -346,7 +431,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
if (!unregister) {
dst->input = dst_discard;
- dst->output = dst_discard_sk;
+ dst->output = dst_discard_out;
} else {
dst->dev = dev_net(dst->dev)->loopback_dev;
dev_hold(dst->dev);
@@ -391,7 +476,7 @@ static struct notifier_block dst_dev_notifier = {
.priority = -10, /* must be called after other network notifiers */
};
-void __init dst_init(void)
+void __init dst_subsys_init(void)
{
register_netdevice_notifier(&dst_dev_notifier);
}
diff --git a/kernel/net/core/ethtool.c b/kernel/net/core/ethtool.c
index 1d00b8922..29edf7484 100644
--- a/kernel/net/core/ethtool.c
+++ b/kernel/net/core/ethtool.c
@@ -98,7 +98,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
[NETIF_F_BUSY_POLL_BIT] = "busy-poll",
- [NETIF_F_HW_SWITCH_OFFLOAD_BIT] = "hw-switch-offload",
};
static const char
@@ -107,6 +106,13 @@ rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
[ETH_RSS_HASH_XOR_BIT] = "xor",
};
+static const char
+tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_RX_COPYBREAK] = "rx-copybreak",
+ [ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
+};
+
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -195,6 +201,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_RSS_HASH_FUNCS)
return ARRAY_SIZE(rss_hash_func_strings);
+ if (sset == ETH_SS_TUNABLES)
+ return ARRAY_SIZE(tunable_strings);
+
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
else
@@ -212,6 +221,8 @@ static void __ethtool_get_strings(struct net_device *dev,
else if (stringset == ETH_SS_RSS_HASH_FUNCS)
memcpy(data, rss_hash_func_strings,
sizeof(rss_hash_func_strings));
+ else if (stringset == ETH_SS_TUNABLES)
+ memcpy(data, tunable_strings, sizeof(tunable_strings));
else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
@@ -1273,7 +1284,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
gstrings.len = ret;
- data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+ data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER);
if (!data)
return -ENOMEM;
diff --git a/kernel/net/core/fib_rules.c b/kernel/net/core/fib_rules.c
index 0ad144fb0..365de6643 100644
--- a/kernel/net/core/fib_rules.c
+++ b/kernel/net/core/fib_rules.c
@@ -16,6 +16,7 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/fib_rules.h>
+#include <net/ip_tunnels.h>
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
@@ -43,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
}
EXPORT_SYMBOL(fib_default_rule_add);
-u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+static u32 fib_default_rule_pref(struct fib_rules_ops *ops)
{
struct list_head *pos;
struct fib_rule *rule;
@@ -59,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops)
return 0;
}
-EXPORT_SYMBOL(fib_default_rule_pref);
static void notify_rule_change(int event, struct fib_rule *rule,
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
@@ -186,6 +186,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
goto out;
+ if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -295,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
}
rule->fr_net = net;
- if (tb[FRA_PRIORITY])
- rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+ rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
+ : fib_default_rule_pref(ops);
if (tb[FRA_IIFNAME]) {
struct net_device *dev;
@@ -330,6 +333,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
if (tb[FRA_FWMASK])
rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+ if (tb[FRA_TUN_ID])
+ rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+
rule->action = frh->action;
rule->flags = frh->flags;
rule->table = frh_get_table(frh, tb);
@@ -343,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
else
rule->suppress_ifgroup = -1;
- if (!tb[FRA_PRIORITY] && ops->default_pref)
- rule->pref = ops->default_pref(ops);
-
err = -EINVAL;
if (tb[FRA_GOTO]) {
if (rule->action != FR_ACT_GOTO)
@@ -407,6 +410,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
if (unresolved)
ops->unresolved_rules++;
+ if (rule->tun_id)
+ ip_tunnel_need_metadata();
+
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
flush_route_cache(ops);
rules_ops_put(ops);
@@ -473,6 +479,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
(rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
continue;
+ if (tb[FRA_TUN_ID] &&
+ (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -487,6 +497,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
goto errout;
}
+ if (rule->tun_id)
+ ip_tunnel_unneed_metadata();
+
list_del_rcu(&rule->list);
if (rule->action == FR_ACT_GOTO) {
@@ -535,7 +548,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
- + nla_total_size(4); /* FRA_FWMASK */
+ + nla_total_size(4) /* FRA_FWMASK */
+ + nla_total_size(8); /* FRA_TUN_ID */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -591,7 +605,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
((rule->mark_mask || rule->mark) &&
nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
(rule->target &&
- nla_put_u32(skb, FRA_GOTO, rule->target)))
+ nla_put_u32(skb, FRA_GOTO, rule->target)) ||
+ (rule->tun_id &&
+ nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/kernel/net/core/filter.c b/kernel/net/core/filter.c
index bf831a85c..37157c4c1 100644
--- a/kernel/net/core/filter.c
+++ b/kernel/net/core/filter.c
@@ -36,6 +36,7 @@
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/sock.h>
+#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -45,16 +46,20 @@
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
+#include <net/sch_generic.h>
+#include <net/cls_cgroup.h>
+#include <net/dst_metadata.h>
+#include <net/dst.h>
/**
* sk_filter - run a packet through a socket filter
* @sk: sock associated with &sk_buff
* @skb: buffer to filter
*
- * Run the filter code and then cut skb->data to correct size returned by
- * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * Run the eBPF program and then cut skb->data to correct size returned by
+ * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
* than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to SK_RUN_FILTER. It returns 0 if the packet should
+ * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
* be accepted or -EPERM if the packet should be tossed.
*
*/
@@ -78,7 +83,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter) {
- unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
+ unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
}
@@ -144,12 +149,6 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return raw_smp_processor_id();
}
-/* note that this only generates 32-bit random numbers */
-static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
-{
- return prandom_u32();
-}
-
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
struct bpf_insn *insn_buf)
{
@@ -308,7 +307,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
break;
case SKF_AD_OFF + SKF_AD_RANDOM:
- *insn = BPF_EMIT_CALL(__get_random_u32);
+ *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
+ bpf_user_rnd_init_once();
break;
}
break;
@@ -355,8 +355,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
* for socket filters: ctx == 'struct sk_buff *', for seccomp:
* ctx == 'struct seccomp_data *'.
*/
-int bpf_convert_filter(struct sock_filter *prog, int len,
- struct bpf_insn *new_prog, int *new_len)
+static int bpf_convert_filter(struct sock_filter *prog, int len,
+ struct bpf_insn *new_prog, int *new_len)
{
int new_flen = 0, pass = 0, target, i;
struct bpf_insn *new_insn;
@@ -371,7 +371,8 @@ int bpf_convert_filter(struct sock_filter *prog, int len,
return -EINVAL;
if (new_prog) {
- addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL);
+ addrs = kcalloc(len, sizeof(*addrs),
+ GFP_KERNEL | __GFP_NOWARN);
if (!addrs)
return -ENOMEM;
}
@@ -473,9 +474,9 @@ do_pass:
bpf_src = BPF_X;
} else {
insn->dst_reg = BPF_REG_A;
- insn->src_reg = BPF_REG_X;
insn->imm = fp->k;
bpf_src = BPF_SRC(fp->code);
+ insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
}
/* Common case where 'jump_false' is next insn. */
@@ -751,7 +752,8 @@ static bool chk_code_allowed(u16 code_to_probe)
*
* Returns 0 if the rule set is legal or -EINVAL if not.
*/
-int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
+static int bpf_check_classic(const struct sock_filter *filter,
+ unsigned int flen)
{
bool anc_found;
int pc;
@@ -775,6 +777,11 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
if (ftest->k == 0)
return -EINVAL;
break;
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ if (ftest->k >= 32)
+ return -EINVAL;
+ break;
case BPF_LD | BPF_MEM:
case BPF_LDX | BPF_MEM:
case BPF_ST:
@@ -825,7 +832,6 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
return -EINVAL;
}
-EXPORT_SYMBOL(bpf_check_classic);
static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
const struct sock_fprog *fprog)
@@ -839,7 +845,9 @@ static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
fkprog = fp->orig_prog;
fkprog->len = fprog->len;
- fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
+
+ fkprog->filter = kmemdup(fp->insns, fsize,
+ GFP_KERNEL | __GFP_NOWARN);
if (!fkprog->filter) {
kfree(fp->orig_prog);
return -ENOMEM;
@@ -941,7 +949,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
* pass. At this time, the user BPF is stored in fp->insns.
*/
old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
- GFP_KERNEL);
+ GFP_KERNEL | __GFP_NOWARN);
if (!old_prog) {
err = -ENOMEM;
goto out_err;
@@ -988,12 +996,13 @@ out_err:
return ERR_PTR(err);
}
-static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
+static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
+ bpf_aux_classic_check_t trans)
{
int err;
fp->bpf_func = NULL;
- fp->jited = false;
+ fp->jited = 0;
err = bpf_check_classic(fp->insns, fp->len);
if (err) {
@@ -1001,6 +1010,17 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
return ERR_PTR(err);
}
+ /* There might be additional checks and transformations
+ * needed on classic filters, f.e. in case of seccomp.
+ */
+ if (trans) {
+ err = trans(fp->insns, fp->len);
+ if (err) {
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+ }
+ }
+
/* Probe if we can JIT compile the filter and if so, do
* the compilation of the filter.
*/
@@ -1050,7 +1070,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
/* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
- fp = bpf_prepare_filter(fp);
+ fp = bpf_prepare_filter(fp, NULL);
if (IS_ERR(fp))
return PTR_ERR(fp);
@@ -1059,6 +1079,60 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
}
EXPORT_SYMBOL_GPL(bpf_prog_create);
+/**
+ * bpf_prog_create_from_user - create an unattached filter from user buffer
+ * @pfp: the unattached filter that is created
+ * @fprog: the filter program
+ * @trans: post-classic verifier transformation handler
+ * @save_orig: save classic BPF program
+ *
+ * This function effectively does the same as bpf_prog_create(), only
+ * that it builds up its insns buffer from user space provided buffer.
+ * It also allows for passing a bpf_aux_classic_check_t handler.
+ */
+int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
+ bpf_aux_classic_check_t trans, bool save_orig)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *fp;
+ int err;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (fprog->filter == NULL)
+ return -EINVAL;
+
+ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+ if (!fp)
+ return -ENOMEM;
+
+ if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+ __bpf_prog_free(fp);
+ return -EFAULT;
+ }
+
+ fp->len = fprog->len;
+ fp->orig_prog = NULL;
+
+ if (save_orig) {
+ err = bpf_prog_store_orig_filter(fp, fprog);
+ if (err) {
+ __bpf_prog_free(fp);
+ return -ENOMEM;
+ }
+ }
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ fp = bpf_prepare_filter(fp, trans);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ *pfp = fp;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
+
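[Editor's note] Hypothetical caller sketch for the new export, passing no extra classic-BPF transform and not saving the original filter:

static struct bpf_prog *example_load_user_filter(struct sock_fprog *fprog)
{
    struct bpf_prog *prog;
    int err;

    err = bpf_prog_create_from_user(&prog, fprog, NULL, false);
    if (err)
        return ERR_PTR(err);

    /* Run via BPF_PROG_RUN(prog, ctx); release with bpf_prog_destroy(). */
    return prog;
}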
void bpf_prog_destroy(struct bpf_prog *fp)
{
__bpf_prog_release(fp);
@@ -1135,7 +1209,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
/* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
- prog = bpf_prepare_filter(prog);
+ prog = bpf_prepare_filter(prog, NULL);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -1175,21 +1249,6 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
return 0;
}
-/**
- * bpf_skb_clone_not_writable - is the header of a clone not writable
- * @skb: buffer to check
- * @len: length up to which to write, can be negative
- *
- * Returns true if modifying the header part of the cloned buffer
- * does require the data to be copied. I.e. this version works with
- * negative lengths needed for eBPF case!
- */
-static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len)
-{
- return skb_header_cloned(skb) ||
- (int) skb_headroom(skb) + len > skb->hdr_len;
-}
-
#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1)
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
@@ -1212,9 +1271,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
return -EFAULT;
- offset -= skb->data - skb_mac_header(skb);
if (unlikely(skb_cloned(skb) &&
- bpf_skb_clone_unwritable(skb, offset + len)))
+ !skb_clone_writable(skb, offset + len)))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, len, buf);
@@ -1258,9 +1316,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
- offset -= skb->data - skb_mac_header(skb);
if (unlikely(skb_cloned(skb) &&
- bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
+ !skb_clone_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1299,16 +1356,15 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = {
static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags);
+ bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags);
int offset = (int) r2;
__sum16 sum, *ptr;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
- offset -= skb->data - skb_mac_header(skb);
if (unlikely(skb_cloned(skb) &&
- bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
+ !skb_clone_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1344,6 +1400,233 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
+#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1)
+
+static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
+ struct net_device *dev;
+
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
+ if (unlikely(!dev))
+ return -EINVAL;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!skb2))
+ return -ENOMEM;
+
+ if (BPF_IS_REDIRECT_INGRESS(flags))
+ return dev_forward_skb(dev, skb2);
+
+ skb2->dev = dev;
+ skb_sender_cpu_clear(skb2);
+ return dev_queue_xmit(skb2);
+}
+
+const struct bpf_func_proto bpf_clone_redirect_proto = {
+ .func = bpf_clone_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+struct redirect_info {
+ u32 ifindex;
+ u32 flags;
+};
+
+static DEFINE_PER_CPU(struct redirect_info, redirect_info);
+static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+ ri->ifindex = ifindex;
+ ri->flags = flags;
+ return TC_ACT_REDIRECT;
+}
+
+int skb_do_redirect(struct sk_buff *skb)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct net_device *dev;
+
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
+ ri->ifindex = 0;
+ if (unlikely(!dev)) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (BPF_IS_REDIRECT_INGRESS(ri->flags))
+ return dev_forward_skb(dev, skb);
+
+ skb->dev = dev;
+ skb_sender_cpu_clear(skb);
+ return dev_queue_xmit(skb);
+}
+
+const struct bpf_func_proto bpf_redirect_proto = {
+ .func = bpf_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return task_get_classid((struct sk_buff *) (unsigned long) r1);
+}
+
+static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
+ .func = bpf_get_cgroup_classid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ const struct dst_entry *dst;
+
+ dst = skb_dst((struct sk_buff *) (unsigned long) r1);
+ if (dst)
+ return dst->tclassid;
+#endif
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_get_route_realm_proto = {
+ .func = bpf_get_route_realm,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ __be16 vlan_proto = (__force __be16) r2;
+
+ if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
+ vlan_proto != htons(ETH_P_8021AD)))
+ vlan_proto = htons(ETH_P_8021Q);
+
+ return skb_vlan_push(skb, vlan_proto, vlan_tci);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+ .func = bpf_skb_vlan_push,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
+
+static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+
+ return skb_vlan_pop(skb);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+ .func = bpf_skb_vlan_pop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
+
+bool bpf_helper_changes_skb_data(void *func)
+{
+ if (func == bpf_skb_vlan_push)
+ return true;
+ if (func == bpf_skb_vlan_pop)
+ return true;
+ return false;
+}
+
+static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+ if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info))
+ return -EINVAL;
+ if (ip_tunnel_info_af(info) != AF_INET)
+ return -EINVAL;
+
+ to->tunnel_id = be64_to_cpu(info->key.tun_id);
+ to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+ .func = bpf_skb_get_tunnel_key,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static struct metadata_dst __percpu *md_dst;
+
+static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
+ struct metadata_dst *md = this_cpu_ptr(md_dst);
+ struct ip_tunnel_info *info;
+
+ if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags))
+ return -EINVAL;
+
+ skb_dst_drop(skb);
+ dst_hold((struct dst_entry *) md);
+ skb_dst_set(skb, (struct dst_entry *) md);
+
+ info = &md->u.tun_info;
+ info->mode = IP_TUNNEL_INFO_TX;
+ info->key.tun_flags = TUNNEL_KEY;
+ info->key.tun_id = cpu_to_be64(from->tunnel_id);
+ info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+ .func = bpf_skb_set_tunnel_key,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+{
+ if (!md_dst) {
+ /* race is not possible, since it's called from
+ * verifier that is holding verifier mutex
+ */
+ md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+ if (!md_dst)
+ return NULL;
+ }
+ return &bpf_skb_set_tunnel_key_proto;
+}
+
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
@@ -1358,6 +1641,13 @@ sk_filter_func_proto(enum bpf_func_id func_id)
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_trace_printk:
+ if (capable(CAP_SYS_ADMIN))
+ return bpf_get_trace_printk_proto();
default:
return NULL;
}
@@ -1373,18 +1663,29 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_skb_vlan_push:
+ return &bpf_skb_vlan_push_proto;
+ case BPF_FUNC_skb_vlan_pop:
+ return &bpf_skb_vlan_pop_proto;
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_key_proto();
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
default:
return sk_filter_func_proto(func_id);
}
}
-static bool sk_filter_is_valid_access(int off, int size,
- enum bpf_access_type type)
+static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
- /* only read is allowed */
- if (type != BPF_READ)
- return false;
-
/* check bounds */
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
@@ -1400,8 +1701,50 @@ static bool sk_filter_is_valid_access(int off, int size,
return true;
}
-static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
- struct bpf_insn *insn_buf)
+static bool sk_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type)
+{
+ if (off == offsetof(struct __sk_buff, tc_classid))
+ return false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ return __is_valid_access(off, size, type);
+}
+
+static bool tc_cls_act_is_valid_access(int off, int size,
+ enum bpf_access_type type)
+{
+ if (off == offsetof(struct __sk_buff, tc_classid))
+ return type == BPF_WRITE ? true : false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, mark):
+ case offsetof(struct __sk_buff, tc_index):
+ case offsetof(struct __sk_buff, priority):
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+ return __is_valid_access(off, size, type);
+}
+
+static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
@@ -1430,12 +1773,49 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
case offsetof(struct __sk_buff, priority):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, priority));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, priority));
+ break;
+
+ case offsetof(struct __sk_buff, ingress_ifindex):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
+
*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
- offsetof(struct sk_buff, priority));
+ offsetof(struct sk_buff, skb_iif));
+ break;
+
+ case offsetof(struct __sk_buff, ifindex):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ dst_reg, src_reg,
+ offsetof(struct sk_buff, dev));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
+
+ case offsetof(struct __sk_buff, hash):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, hash));
break;
case offsetof(struct __sk_buff, mark):
- return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ break;
case offsetof(struct __sk_buff, pkt_type):
return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
@@ -1450,6 +1830,47 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
case offsetof(struct __sk_buff, vlan_tci):
return convert_skb_access(SKF_AD_VLAN_TAG,
dst_reg, src_reg, insn);
+
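+ /* cb[0]..cb[4] are backed by qdisc_skb_cb::data inside skb->cb,
+ * giving programs 20 bytes of per-packet scratch space.
+ */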
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+
+ prog->cb_access = 1;
+ ctx_off -= offsetof(struct __sk_buff, cb[0]);
+ ctx_off += offsetof(struct sk_buff, cb);
+ ctx_off += offsetof(struct qdisc_skb_cb, data);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+ break;
+
+ case offsetof(struct __sk_buff, tc_classid):
+ ctx_off -= offsetof(struct __sk_buff, tc_classid);
+ ctx_off += offsetof(struct sk_buff, cb);
+ ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
+ WARN_ON(type != BPF_WRITE);
+ *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ break;
+
+ case offsetof(struct __sk_buff, tc_index):
+#ifdef CONFIG_NET_SCHED
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, tc_index));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, tc_index));
+ break;
+#else
+ if (type == BPF_WRITE)
+ *insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
+ else
+ *insn++ = BPF_MOV64_IMM(dst_reg, 0);
+ break;
+#endif
}
return insn - insn_buf;
@@ -1458,13 +1879,13 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
- .convert_ctx_access = sk_filter_convert_ctx_access,
+ .convert_ctx_access = bpf_net_convert_ctx_access,
};
static const struct bpf_verifier_ops tc_cls_act_ops = {
.get_func_proto = tc_cls_act_func_proto,
- .is_valid_access = sk_filter_is_valid_access,
- .convert_ctx_access = sk_filter_convert_ctx_access,
+ .is_valid_access = tc_cls_act_is_valid_access,
+ .convert_ctx_access = bpf_net_convert_ctx_access,
};
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
@@ -1526,9 +1947,13 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
goto out;
/* We're copying the filter that has been originally attached,
- * so no conversion/decode needed anymore.
+ * so no conversion/decode needed anymore. eBPF programs that
+ * have no original program cannot be dumped through this.
*/
+ ret = -EACCES;
fprog = filter->prog->orig_prog;
+ if (!fprog)
+ goto out;
ret = fprog->len;
if (!len)
diff --git a/kernel/net/core/flow_dissector.c b/kernel/net/core/flow_dissector.c
index 2c35c02a9..12e700332 100644
--- a/kernel/net/core/flow_dissector.c
+++ b/kernel/net/core/flow_dissector.c
@@ -1,3 +1,4 @@
+#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
@@ -12,19 +13,60 @@
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
-#include <net/flow_keys.h>
+#include <linux/stddef.h>
+#include <linux/if_ether.h>
+#include <linux/mpls.h>
+#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
-/* copy saddr & daddr, possibly using 64bit load/store
- * Equivalent to : flow->src = iph->saddr;
- * flow->dst = iph->daddr;
- */
-static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
+static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
+ enum flow_dissector_key_id key_id)
+{
+ return flow_dissector->used_keys & (1 << key_id);
+}
+
+static void dissector_set_key(struct flow_dissector *flow_dissector,
+ enum flow_dissector_key_id key_id)
+{
+ flow_dissector->used_keys |= (1 << key_id);
+}
+
+static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
+ enum flow_dissector_key_id key_id,
+ void *target_container)
+{
+ return ((char *) target_container) + flow_dissector->offset[key_id];
+}
+
+void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
+ const struct flow_dissector_key *key,
+ unsigned int key_count)
{
- BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
- offsetof(typeof(*flow), src) + sizeof(flow->src));
- memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
+ unsigned int i;
+
+ memset(flow_dissector, 0, sizeof(*flow_dissector));
+
+ for (i = 0; i < key_count; i++, key++) {
+ /* User should make sure that every key target offset is within
+ * boundaries of unsigned short.
+ */
+ BUG_ON(key->offset > USHRT_MAX);
+ BUG_ON(dissector_uses_key(flow_dissector,
+ key->key_id));
+
+ dissector_set_key(flow_dissector, key->key_id);
+ flow_dissector->offset[key->key_id] = key->offset;
+ }
+
+ /* Ensure that the dissector always includes the control and basic keys.
+ * That way we can avoid handling their absence in the fast path.
+ */
+ BUG_ON(!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL));
+ BUG_ON(!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC));
}
+EXPORT_SYMBOL(skb_flow_dissector_init);
/**
* __skb_flow_get_ports - extract the upper layer ports and return them
@@ -63,18 +105,33 @@ EXPORT_SYMBOL(__skb_flow_get_ports);
/**
* __skb_flow_dissect - extract the flow_keys struct and return it
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
+ * @flow_dissector: list of keys to dissect
+ * @target_container: target structure to put dissected values into
* @data: raw buffer pointer to the packet, if NULL use skb->data
* @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
* @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
*
- * The function will try to retrieve the struct flow_keys from either the skbuff
- * or a raw buffer specified by the rest parameters
+ * The function will try to retrieve individual keys into the target specified
+ * by flow_dissector from either the skbuff or a raw buffer specified by the
+ * remaining parameters.
+ *
+ * Caller must take care of zeroing target container memory.
*/
-bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
- void *data, __be16 proto, int nhoff, int hlen)
+bool __skb_flow_dissect(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container,
+ void *data, __be16 proto, int nhoff, int hlen,
+ unsigned int flags)
{
- u8 ip_proto;
+ struct flow_dissector_key_control *key_control;
+ struct flow_dissector_key_basic *key_basic;
+ struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_tags *key_tags;
+ struct flow_dissector_key_keyid *key_keyid;
+ u8 ip_proto = 0;
+ bool ret = false;
if (!data) {
data = skb->data;
@@ -83,7 +140,30 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
hlen = skb_headlen(skb);
}
- memset(flow, 0, sizeof(*flow));
+ /* It is ensured by skb_flow_dissector_init() that the control key will
+ * always be present.
+ */
+ key_control = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL,
+ target_container);
+
+ /* It is ensured by skb_flow_dissector_init() that the basic key will
+ * always be present.
+ */
+ key_basic = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ target_container);
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+ struct ethhdr *eth = eth_hdr(skb);
+ struct flow_dissector_key_eth_addrs *key_eth_addrs;
+
+ key_eth_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS,
+ target_container);
+ memcpy(key_eth_addrs, &eth->h_dest, sizeof(*key_eth_addrs));
+ }
again:
switch (proto) {
@@ -93,57 +173,82 @@ again:
ip:
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
if (!iph || iph->ihl < 5)
- return false;
+ goto out_bad;
nhoff += iph->ihl * 4;
ip_proto = iph->protocol;
- if (ip_is_fragment(iph))
- ip_proto = 0;
- /* skip the address processing if skb is NULL. The assumption
- * here is that if there is no skb we are not looking for flow
- * info but lengths and protocols.
- */
- if (!skb)
+ if (!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS))
break;
- iph_to_flow_copy_addrs(flow, iph);
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container);
+ memcpy(&key_addrs->v4addrs, &iph->saddr,
+ sizeof(key_addrs->v4addrs));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+
+ if (ip_is_fragment(iph)) {
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+
+ if (iph->frag_off & htons(IP_OFFSET)) {
+ goto out_good;
+ } else {
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+ goto out_good;
+ }
+ }
+
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
+ goto out_good;
+
break;
}
case htons(ETH_P_IPV6): {
const struct ipv6hdr *iph;
struct ipv6hdr _iph;
- __be32 flow_label;
ipv6:
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
if (!iph)
- return false;
+ goto out_bad;
ip_proto = iph->nexthdr;
nhoff += sizeof(struct ipv6hdr);
- /* see comment above in IPv4 section */
- if (!skb)
- break;
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+ struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
- flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
- flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+ key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
- flow_label = ip6_flowlabel(iph);
- if (flow_label) {
- /* Awesome, IPv6 packet has a flow label so we can
- * use that to represent the ports without any
- * further dissection.
- */
- flow->n_proto = proto;
- flow->ip_proto = ip_proto;
- flow->ports = flow_label;
- flow->thoff = (u16)nhoff;
+ memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
- return true;
+ if ((dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
+ (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
+ ip6_flowlabel(iph)) {
+ __be32 flow_label = ip6_flowlabel(iph);
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+ key_tags = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ target_container);
+ key_tags->flow_label = ntohl(flow_label);
+ }
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)
+ goto out_good;
}
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
+ goto out_good;
+
break;
}
case htons(ETH_P_8021AD):
@@ -153,7 +258,16 @@ ipv6:
vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
if (!vlan)
- return false;
+ goto out_bad;
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_VLANID)) {
+ key_tags = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_VLANID,
+ target_container);
+
+ key_tags->vlan_id = skb_vlan_tag_get_id(skb);
+ }
proto = vlan->h_vlan_encapsulated_proto;
nhoff += sizeof(*vlan);
@@ -166,7 +280,7 @@ ipv6:
} *hdr, _hdr;
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
- return false;
+ goto out_bad;
proto = hdr->proto;
nhoff += PPPOE_SES_HLEN;
switch (proto) {
@@ -175,7 +289,7 @@ ipv6:
case htons(PPP_IPV6):
goto ipv6;
default:
- return false;
+ goto out_bad;
}
}
case htons(ETH_P_TIPC): {
@@ -185,20 +299,53 @@ ipv6:
} *hdr, _hdr;
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
- return false;
- flow->src = hdr->srcnode;
- flow->dst = 0;
- flow->n_proto = proto;
- flow->thoff = (u16)nhoff;
- return true;
+ goto out_bad;
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_TIPC_ADDRS,
+ target_container);
+ key_addrs->tipcaddrs.srcnode = hdr->srcnode;
+ key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS;
+ }
+ goto out_good;
}
+
+ case htons(ETH_P_MPLS_UC):
+ case htons(ETH_P_MPLS_MC): {
+ struct mpls_label *hdr, _hdr[2];
+mpls:
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+ hlen, &_hdr);
+ if (!hdr)
+ goto out_bad;
+
+ if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >>
+ MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) {
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+ key_keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
+ target_container);
+ key_keyid->keyid = hdr[1].entry &
+ htonl(MPLS_LS_LABEL_MASK);
+ }
+
+ goto out_good;
+ }
+
+ goto out_good;
+ }
+
case htons(ETH_P_FCOE):
- flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
+ key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
/* fall through */
default:
- return false;
+ goto out_bad;
}
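+/* IPv6 extension and fragment headers re-enter this switch with an
+ * updated ip_proto.
+ */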
+ip_proto_again:
switch (ip_proto) {
case IPPROTO_GRE: {
struct gre_hdr {
@@ -208,56 +355,149 @@ ipv6:
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
- return false;
+ goto out_bad;
/*
* Only look inside GRE if version zero and no
* routing
*/
- if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
- proto = hdr->proto;
+ if (hdr->flags & (GRE_VERSION | GRE_ROUTING))
+ break;
+
+ proto = hdr->proto;
+ nhoff += 4;
+ if (hdr->flags & GRE_CSUM)
nhoff += 4;
- if (hdr->flags & GRE_CSUM)
- nhoff += 4;
- if (hdr->flags & GRE_KEY)
- nhoff += 4;
- if (hdr->flags & GRE_SEQ)
- nhoff += 4;
- if (proto == htons(ETH_P_TEB)) {
- const struct ethhdr *eth;
- struct ethhdr _eth;
-
- eth = __skb_header_pointer(skb, nhoff,
- sizeof(_eth),
- data, hlen, &_eth);
- if (!eth)
- return false;
- proto = eth->h_proto;
- nhoff += sizeof(*eth);
+ if (hdr->flags & GRE_KEY) {
+ const __be32 *keyid;
+ __be32 _keyid;
+
+ keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid),
+ data, hlen, &_keyid);
+
+ if (!keyid)
+ goto out_bad;
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_GRE_KEYID)) {
+ key_keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_GRE_KEYID,
+ target_container);
+ key_keyid->keyid = *keyid;
}
- goto again;
+ nhoff += 4;
}
- break;
+ if (hdr->flags & GRE_SEQ)
+ nhoff += 4;
+ if (proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = __skb_header_pointer(skb, nhoff,
+ sizeof(_eth),
+ data, hlen, &_eth);
+ if (!eth)
+ goto out_bad;
+ proto = eth->h_proto;
+ nhoff += sizeof(*eth);
+
+ /* Cap headers that we access via pointers at the
+ * end of the Ethernet header as our maximum alignment
+ * at that point is only 2 bytes.
+ */
+ if (NET_IP_ALIGN)
+ hlen = nhoff;
+ }
+
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ goto out_good;
+
+ goto again;
+ }
+ case NEXTHDR_HOP:
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_DEST: {
+ u8 _opthdr[2], *opthdr;
+
+ if (proto != htons(ETH_P_IPV6))
+ break;
+
+ opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr),
+ data, hlen, &_opthdr);
+ if (!opthdr)
+ goto out_bad;
+
+ ip_proto = opthdr[0];
+ nhoff += (opthdr[1] + 1) << 3;
+
+ goto ip_proto_again;
+ }
+ case NEXTHDR_FRAGMENT: {
+ struct frag_hdr _fh, *fh;
+
+ if (proto != htons(ETH_P_IPV6))
+ break;
+
+ fh = __skb_header_pointer(skb, nhoff, sizeof(_fh),
+ data, hlen, &_fh);
+
+ if (!fh)
+ goto out_bad;
+
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+
+ nhoff += sizeof(_fh);
+
+ if (!(fh->frag_off & htons(IP6_OFFSET))) {
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
+ ip_proto = fh->nexthdr;
+ goto ip_proto_again;
+ }
+ }
+ goto out_good;
}
case IPPROTO_IPIP:
proto = htons(ETH_P_IP);
+
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ goto out_good;
+
goto ip;
case IPPROTO_IPV6:
proto = htons(ETH_P_IPV6);
+
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ goto out_good;
+
goto ipv6;
+ case IPPROTO_MPLS:
+ proto = htons(ETH_P_MPLS_UC);
+ goto mpls;
default:
break;
}
- flow->n_proto = proto;
- flow->ip_proto = ip_proto;
- flow->thoff = (u16) nhoff;
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS)) {
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+ key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
+ data, hlen);
+ }
+
+out_good:
+ ret = true;
- /* unless skb is set we don't need to record port info */
- if (skb)
- flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
- data, hlen);
+out_bad:
+ key_basic->n_proto = proto;
+ key_basic->ip_proto = ip_proto;
+ key_control->thoff = (u16)nhoff;
- return true;
+ return ret;
}
EXPORT_SYMBOL(__skb_flow_dissect);
@@ -267,27 +507,112 @@ static __always_inline void __flow_hash_secret_init(void)
net_get_random_once(&hashrnd, sizeof(hashrnd));
}
-static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
+static __always_inline u32 __flow_hash_words(const u32 *words, u32 length,
+ u32 keyval)
{
- __flow_hash_secret_init();
- return jhash_3words(a, b, c, hashrnd);
+ return jhash2(words, length, keyval);
}
-static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
+static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow)
{
- u32 hash;
+ const void *p = flow;
- /* get a consistent hash (same value on both flow directions) */
- if (((__force u32)keys->dst < (__force u32)keys->src) ||
- (((__force u32)keys->dst == (__force u32)keys->src) &&
- ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
- swap(keys->dst, keys->src);
- swap(keys->port16[0], keys->port16[1]);
+ BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
+ return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET);
+}
+
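+/* Number of u32 words to feed into the hash: everything from
+ * FLOW_KEYS_HASH_OFFSET to the end of the structure, minus the unused
+ * tail of the address union for the address type actually in use.
+ */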
+static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
+{
+ size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
+ BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
+ BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
+ sizeof(*flow) - sizeof(flow->addrs));
+
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ diff -= sizeof(flow->addrs.v4addrs);
+ break;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ diff -= sizeof(flow->addrs.v6addrs);
+ break;
+ case FLOW_DISSECTOR_KEY_TIPC_ADDRS:
+ diff -= sizeof(flow->addrs.tipcaddrs);
+ break;
+ }
+ return (sizeof(*flow) - diff) / sizeof(u32);
+}
+
+__be32 flow_get_u32_src(const struct flow_keys *flow)
+{
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ return flow->addrs.v4addrs.src;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ return (__force __be32)ipv6_addr_hash(
+ &flow->addrs.v6addrs.src);
+ case FLOW_DISSECTOR_KEY_TIPC_ADDRS:
+ return flow->addrs.tipcaddrs.srcnode;
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(flow_get_u32_src);
+
+__be32 flow_get_u32_dst(const struct flow_keys *flow)
+{
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ return flow->addrs.v4addrs.dst;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ return (__force __be32)ipv6_addr_hash(
+ &flow->addrs.v6addrs.dst);
+ default:
+ return 0;
}
+}
+EXPORT_SYMBOL(flow_get_u32_dst);
- hash = __flow_hash_3words((__force u32)keys->dst,
- (__force u32)keys->src,
- (__force u32)keys->ports);
+static inline void __flow_hash_consistentify(struct flow_keys *keys)
+{
+ int addr_diff, i;
+
+ switch (keys->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ addr_diff = (__force u32)keys->addrs.v4addrs.dst -
+ (__force u32)keys->addrs.v4addrs.src;
+ if ((addr_diff < 0) ||
+ (addr_diff == 0 &&
+ ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src))) {
+ swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
+ swap(keys->ports.src, keys->ports.dst);
+ }
+ break;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ addr_diff = memcmp(&keys->addrs.v6addrs.dst,
+ &keys->addrs.v6addrs.src,
+ sizeof(keys->addrs.v6addrs.dst));
+ if ((addr_diff < 0) ||
+ (addr_diff == 0 &&
+ ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src))) {
+ for (i = 0; i < 4; i++)
+ swap(keys->addrs.v6addrs.src.s6_addr32[i],
+ keys->addrs.v6addrs.dst.s6_addr32[i]);
+ swap(keys->ports.src, keys->ports.dst);
+ }
+ break;
+ }
+}
+
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
+{
+ u32 hash;
+
+ __flow_hash_consistentify(keys);
+
+ hash = __flow_hash_words(flow_keys_hash_start(keys),
+ flow_keys_hash_length(keys), keyval);
if (!hash)
hash = 1;
@@ -296,12 +621,52 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
u32 flow_hash_from_keys(struct flow_keys *keys)
{
- return __flow_hash_from_keys(keys);
+ __flow_hash_secret_init();
+ return __flow_hash_from_keys(keys, hashrnd);
}
EXPORT_SYMBOL(flow_hash_from_keys);
-/*
- * __skb_get_hash: calculate a flow hash based on src/dst addresses
+static inline u32 ___skb_get_hash(const struct sk_buff *skb,
+ struct flow_keys *keys, u32 keyval)
+{
+ skb_flow_dissect_flow_keys(skb, keys,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+ return __flow_hash_from_keys(keys, keyval);
+}
+
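+/* Fixed-layout summary of a flow_keys, packed into the opaque
+ * struct flow_keys_digest by make_flow_keys_digest() below.
+ */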
+struct _flow_keys_digest_data {
+ __be16 n_proto;
+ u8 ip_proto;
+ u8 padding;
+ __be32 ports;
+ __be32 src;
+ __be32 dst;
+};
+
+void make_flow_keys_digest(struct flow_keys_digest *digest,
+ const struct flow_keys *flow)
+{
+ struct _flow_keys_digest_data *data =
+ (struct _flow_keys_digest_data *)digest;
+
+ BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
+
+ memset(digest, 0, sizeof(*digest));
+
+ data->n_proto = flow->basic.n_proto;
+ data->ip_proto = flow->basic.ip_proto;
+ data->ports = flow->ports.ports;
+ data->src = flow->addrs.v4addrs.src;
+ data->dst = flow->addrs.v4addrs.dst;
+}
+EXPORT_SYMBOL(make_flow_keys_digest);
+
+/**
+ * __skb_get_hash: calculate a flow hash
+ * @skb: sk_buff to calculate flow hash from
+ *
+ * This function calculates a flow hash based on src/dst addresses
* and src/dst port numbers. Sets hash in skb to non-zero hash value
* on success, zero indicates no valid hash. Also, sets l4_hash in skb
* if hash is a canonical 4-tuple hash over transport ports.
@@ -310,52 +675,72 @@ void __skb_get_hash(struct sk_buff *skb)
{
struct flow_keys keys;
- if (!skb_flow_dissect(skb, &keys))
- return;
+ __flow_hash_secret_init();
- if (keys.ports)
- skb->l4_hash = 1;
+ __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd),
+ flow_keys_have_l4(&keys));
+}
+EXPORT_SYMBOL(__skb_get_hash);
- skb->sw_hash = 1;
+__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
+{
+ struct flow_keys keys;
- skb->hash = __flow_hash_from_keys(&keys);
+ return ___skb_get_hash(skb, &keys, perturb);
}
-EXPORT_SYMBOL(__skb_get_hash);
+EXPORT_SYMBOL(skb_get_hash_perturb);
-/*
- * Returns a Tx hash based on the given packet descriptor a Tx queues' number
- * to be used as a distribution range.
- */
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
- unsigned int num_tx_queues)
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
{
- u32 hash;
- u16 qoffset = 0;
- u16 qcount = num_tx_queues;
-
- if (skb_rx_queue_recorded(skb)) {
- hash = skb_get_rx_queue(skb);
- while (unlikely(hash >= num_tx_queues))
- hash -= num_tx_queues;
- return hash;
- }
+ struct flow_keys keys;
- if (dev->num_tc) {
- u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
- qoffset = dev->tc_to_txq[tc].offset;
- qcount = dev->tc_to_txq[tc].count;
- }
+ memset(&keys, 0, sizeof(keys));
+
+ memcpy(&keys.addrs.v6addrs.src, &fl6->saddr,
+ sizeof(keys.addrs.v6addrs.src));
+ memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr,
+ sizeof(keys.addrs.v6addrs.dst));
+ keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ keys.ports.src = fl6->fl6_sport;
+ keys.ports.dst = fl6->fl6_dport;
+ keys.keyid.keyid = fl6->fl6_gre_key;
+ keys.tags.flow_label = (__force u32)fl6->flowlabel;
+ keys.basic.ip_proto = fl6->flowi6_proto;
+
+ __skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
+ flow_keys_have_l4(&keys));
- return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+ return skb->hash;
}
-EXPORT_SYMBOL(__skb_tx_hash);
+EXPORT_SYMBOL(__skb_get_hash_flowi6);
+
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
+{
+ struct flow_keys keys;
+
+ memset(&keys, 0, sizeof(keys));
+
+ keys.addrs.v4addrs.src = fl4->saddr;
+ keys.addrs.v4addrs.dst = fl4->daddr;
+ keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ keys.ports.src = fl4->fl4_sport;
+ keys.ports.dst = fl4->fl4_dport;
+ keys.keyid.keyid = fl4->fl4_gre_key;
+ keys.basic.ip_proto = fl4->flowi4_proto;
+
+ __skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
+ flow_keys_have_l4(&keys));
+
+ return skb->hash;
+}
+EXPORT_SYMBOL(__skb_get_hash_flowi4);
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
const struct flow_keys *keys, int hlen)
{
- u32 poff = keys->thoff;
+ u32 poff = keys->control.thoff;
- switch (keys->ip_proto) {
+ switch (keys->basic.ip_proto) {
case IPPROTO_TCP: {
/* access doff as u8 to avoid unaligned access */
const u8 *doff;
@@ -396,8 +781,12 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
return poff;
}
-/* skb_get_poff() returns the offset to the payload as far as it could
- * be dissected. The main user is currently BPF, so that we can dynamically
+/**
+ * skb_get_poff - get the offset to the payload
+ * @skb: sk_buff to get the payload offset from
+ *
+ * The function will get the offset to the payload as far as it could
+ * be dissected. The main user is currently BPF, so that we can dynamically
* truncate packets without needing to push actual payload to the user
* space and can analyze headers only, instead.
*/
@@ -405,86 +794,111 @@ u32 skb_get_poff(const struct sk_buff *skb)
{
struct flow_keys keys;
- if (!skb_flow_dissect(skb, &keys))
+ if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
return 0;
return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
}
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
{
-#ifdef CONFIG_XPS
- struct xps_dev_maps *dev_maps;
- struct xps_map *map;
- int queue_index = -1;
-
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
- if (dev_maps) {
- map = rcu_dereference(
- dev_maps->cpu_map[skb->sender_cpu - 1]);
- if (map) {
- if (map->len == 1)
- queue_index = map->queues[0];
- else
- queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
- map->len)];
- if (unlikely(queue_index >= dev->real_num_tx_queues))
- queue_index = -1;
- }
- }
- rcu_read_unlock();
-
- return queue_index;
-#else
- return -1;
-#endif
+ memset(keys, 0, sizeof(*keys));
+
+ memcpy(&keys->addrs.v6addrs.src, &fl6->saddr,
+ sizeof(keys->addrs.v6addrs.src));
+ memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr,
+ sizeof(keys->addrs.v6addrs.dst));
+ keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ keys->ports.src = fl6->fl6_sport;
+ keys->ports.dst = fl6->fl6_dport;
+ keys->keyid.keyid = fl6->fl6_gre_key;
+ keys->tags.flow_label = (__force u32)fl6->flowlabel;
+ keys->basic.ip_proto = fl6->flowi6_proto;
+
+ return flow_hash_from_keys(keys);
}
+EXPORT_SYMBOL(__get_hash_from_flowi6);
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys)
{
- struct sock *sk = skb->sk;
- int queue_index = sk_tx_queue_get(sk);
-
- if (queue_index < 0 || skb->ooo_okay ||
- queue_index >= dev->real_num_tx_queues) {
- int new_index = get_xps_queue(dev, skb);
- if (new_index < 0)
- new_index = skb_tx_hash(dev, skb);
+ memset(keys, 0, sizeof(*keys));
- if (queue_index != new_index && sk &&
- rcu_access_pointer(sk->sk_dst_cache))
- sk_tx_queue_set(sk, new_index);
+ keys->addrs.v4addrs.src = fl4->saddr;
+ keys->addrs.v4addrs.dst = fl4->daddr;
+ keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ keys->ports.src = fl4->fl4_sport;
+ keys->ports.dst = fl4->fl4_dport;
+ keys->keyid.keyid = fl4->fl4_gre_key;
+ keys->basic.ip_proto = fl4->flowi4_proto;
- queue_index = new_index;
- }
-
- return queue_index;
+ return flow_hash_from_keys(keys);
}
-
-struct netdev_queue *netdev_pick_tx(struct net_device *dev,
- struct sk_buff *skb,
- void *accel_priv)
+EXPORT_SYMBOL(__get_hash_from_flowi4);
+
+static const struct flow_dissector_key flow_keys_dissector_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v4addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v6addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.tipcaddrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_PORTS,
+ .offset = offsetof(struct flow_keys, ports),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_VLANID,
+ .offset = offsetof(struct flow_keys, tags),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ .offset = offsetof(struct flow_keys, tags),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
+ .offset = offsetof(struct flow_keys, keyid),
+ },
+};
+
+static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+};
+
+struct flow_dissector flow_keys_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_dissector);
+
+struct flow_dissector flow_keys_buf_dissector __read_mostly;
+
+static int __init init_default_flow_dissectors(void)
{
- int queue_index = 0;
-
-#ifdef CONFIG_XPS
- if (skb->sender_cpu == 0)
- skb->sender_cpu = raw_smp_processor_id() + 1;
-#endif
-
- if (dev->real_num_tx_queues != 1) {
- const struct net_device_ops *ops = dev->netdev_ops;
- if (ops->ndo_select_queue)
- queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
- __netdev_pick_tx);
- else
- queue_index = __netdev_pick_tx(dev, skb);
-
- if (!accel_priv)
- queue_index = netdev_cap_txqueue(dev, queue_index);
- }
-
- skb_set_queue_mapping(skb, queue_index);
- return netdev_get_tx_queue(dev, queue_index);
+ skb_flow_dissector_init(&flow_keys_dissector,
+ flow_keys_dissector_keys,
+ ARRAY_SIZE(flow_keys_dissector_keys));
+ skb_flow_dissector_init(&flow_keys_buf_dissector,
+ flow_keys_buf_dissector_keys,
+ ARRAY_SIZE(flow_keys_buf_dissector_keys));
+ return 0;
}
+
+late_initcall_sync(init_default_flow_dissectors);
diff --git a/kernel/net/core/gen_estimator.c b/kernel/net/core/gen_estimator.c
index 9dfb88a93..92d886f4a 100644
--- a/kernel/net/core/gen_estimator.c
+++ b/kernel/net/core/gen_estimator.c
@@ -66,7 +66,7 @@
NOTES.
- * avbps is scaled by 2^5, avpps is scaled by 2^10.
+ * avbps and avpps are scaled by 2^5.
* both values are reported as 32 bit unsigned values. bps can
overflow for fast links : max speed being 34360Mbit/sec
* Minimal interval is HZ/4=250msec (it is the greatest common divisor
@@ -85,10 +85,10 @@ struct gen_estimator
struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
int ewma_log;
+ u32 last_packets;
+ unsigned long avpps;
u64 last_bytes;
u64 avbps;
- u32 last_packets;
- u32 avpps;
struct rcu_head e_rcu;
struct rb_node node;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
@@ -118,8 +118,8 @@ static void est_timer(unsigned long arg)
rcu_read_lock();
list_for_each_entry_rcu(e, &elist[idx].list, list) {
struct gnet_stats_basic_packed b = {0};
+ unsigned long rate;
u64 brate;
- u32 rate;
spin_lock(e->stats_lock);
read_lock(&est_lock);
@@ -133,10 +133,11 @@ static void est_timer(unsigned long arg)
e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
e->rate_est->bps = (e->avbps+0xF)>>5;
- rate = (b.packets - e->last_packets)<<(12 - idx);
+ rate = b.packets - e->last_packets;
+ rate <<= (7 - idx);
e->last_packets = b.packets;
e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- e->rate_est->pps = (e->avpps+0x1FF)>>10;
+ e->rate_est->pps = (e->avpps + 0xF) >> 5;
skip:
read_unlock(&est_lock);
spin_unlock(e->stats_lock);
diff --git a/kernel/net/core/lwtunnel.c b/kernel/net/core/lwtunnel.c
new file mode 100644
index 000000000..299cfc24d
--- /dev/null
+++ b/kernel/net/core/lwtunnel.c
@@ -0,0 +1,249 @@
+/*
+ * lwtunnel	Infrastructure for lightweight tunnels like MPLS
+ *
+ * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/lwtunnel.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <net/lwtunnel.h>
+#include <net/rtnetlink.h>
+#include <net/ip6_fib.h>
+
+struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
+{
+ struct lwtunnel_state *lws;
+
+ lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC);
+
+ return lws;
+}
+EXPORT_SYMBOL(lwtunnel_state_alloc);
+
+static const struct lwtunnel_encap_ops __rcu *
+ lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
+
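+/* Encap ops are registered per LWTUNNEL_ENCAP_* type; cmpxchg() makes
+ * registration and removal race-free without taking a lock.
+ */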
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int num)
+{
+ if (num > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ return !cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(lwtunnel_encap_add_ops);
+
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int encap_type)
+{
+ int ret;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[encap_type],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_encap_del_ops);
+
+int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+ struct nlattr *encap, unsigned int family,
+ const void *cfg, struct lwtunnel_state **lws)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return ret;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ if (likely(ops && ops->build_state))
+ ret = ops->build_state(dev, encap, family, cfg, lws);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ struct nlattr *nest;
+ int ret = -EINVAL;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ nest = nla_nest_start(skb, RTA_ENCAP);
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->fill_encap))
+ ret = ops->fill_encap(skb, lwtstate);
+ rcu_read_unlock();
+
+ if (ret)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type);
+ if (ret)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+
+ return (ret == -EOPNOTSUPP ? 0 : ret);
+}
+EXPORT_SYMBOL(lwtunnel_fill_encap);
+
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->get_encap_size))
+ ret = nla_total_size(ops->get_encap_size(lwtstate));
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_get_encap_size);
+
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!a && !b)
+ return 0;
+
+ if (!a || !b)
+ return 1;
+
+ if (a->type != b->type)
+ return 1;
+
+ if (a->type == LWTUNNEL_ENCAP_NONE ||
+ a->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[a->type]);
+ if (likely(ops && ops->cmp_encap))
+ ret = ops->cmp_encap(a, b);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_cmp_encap);
+
+int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ int ret = -EINVAL;
+
+ if (!dst)
+ goto drop;
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->output))
+ ret = ops->output(net, sk, skb);
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree_skb(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_output);
+
+int lwtunnel_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ int ret = -EINVAL;
+
+ if (!dst)
+ goto drop;
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->input))
+ ret = ops->input(skb);
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree_skb(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_input);
diff --git a/kernel/net/core/neighbour.c b/kernel/net/core/neighbour.c
index 2237c1b3c..f18ae91b6 100644
--- a/kernel/net/core/neighbour.c
+++ b/kernel/net/core/neighbour.c
@@ -274,8 +274,12 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
(entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) {
if (!neigh_forced_gc(tbl) &&
- entries >= tbl->gc_thresh3)
+ entries >= tbl->gc_thresh3) {
+ net_info_ratelimited("%s: neighbor table overflow!\n",
+ tbl->id);
+ NEIGH_CACHE_STAT_INC(tbl, table_fulls);
goto out_entries;
+ }
}
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
@@ -853,7 +857,7 @@ static void neigh_probe(struct neighbour *neigh)
struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
/* keep skb alive even if arp_queue overflows */
if (skb)
- skb = skb_copy(skb, GFP_ATOMIC);
+ skb = skb_clone(skb, GFP_ATOMIC);
write_unlock(&neigh->lock);
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
@@ -913,6 +917,7 @@ static void neigh_timer_handler(unsigned long arg)
neigh->nud_state = NUD_PROBE;
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
+ notify = 1;
next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
}
} else {
@@ -1155,6 +1160,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (new != old) {
neigh_del_timer(neigh);
+ if (new & NUD_PROBE)
+ atomic_set(&neigh->probes, 0);
if (new & NUD_IN_TIMER)
neigh_add_timer(neigh, (jiffies +
((new & NUD_REACHABLE) ?
@@ -1846,6 +1853,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
ndst.ndts_forced_gc_runs += st->forced_gc_runs;
+ ndst.ndts_table_fulls += st->table_fulls;
}
if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst))
@@ -2207,7 +2215,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
ndm->ndm_pad2 = 0;
ndm->ndm_flags = pn->flags | NTF_PROXY;
ndm->ndm_type = RTN_UNICAST;
- ndm->ndm_ifindex = pn->dev->ifindex;
+ ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0;
ndm->ndm_state = NUD_NONE;
if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
@@ -2227,14 +2235,53 @@ static void neigh_update_notify(struct neighbour *neigh)
__neigh_notify(neigh, RTM_NEWNEIGH, 0);
}
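+/* Dump-time filters: skip neighbours whose device does not match the
+ * requested ifindex or is not enslaved to the requested master device.
+ */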
+static bool neigh_master_filtered(struct net_device *dev, int master_idx)
+{
+ struct net_device *master;
+
+ if (!master_idx)
+ return false;
+
+ master = netdev_master_upper_dev_get(dev);
+ if (!master || master->ifindex != master_idx)
+ return true;
+
+ return false;
+}
+
+static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx)
+{
+ if (filter_idx && dev->ifindex != filter_idx)
+ return true;
+
+ return false;
+}
+
static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct nlattr *tb[NDA_MAX + 1];
struct neighbour *n;
int rc, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
struct neigh_hash_table *nht;
+ int filter_master_idx = 0, filter_idx = 0;
+ unsigned int flags = NLM_F_MULTI;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL);
+ if (!err) {
+ if (tb[NDA_IFINDEX])
+ filter_idx = nla_get_u32(tb[NDA_IFINDEX]);
+
+ if (tb[NDA_MASTER])
+ filter_master_idx = nla_get_u32(tb[NDA_MASTER]);
+
+ if (filter_idx || filter_master_idx)
+ flags |= NLM_F_DUMP_FILTERED;
+ }
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
@@ -2247,12 +2294,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
n = rcu_dereference_bh(n->next)) {
if (!net_eq(dev_net(n->dev), net))
continue;
+ if (neigh_ifindex_filtered(n->dev, filter_idx))
+ continue;
+ if (neigh_master_filtered(n->dev, filter_master_idx))
+ continue;
if (idx < s_idx)
goto next;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
- NLM_F_MULTI) < 0) {
+ flags) < 0) {
rc = -1;
goto out;
}
@@ -2282,7 +2333,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (h > s_h)
s_idx = 0;
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
- if (dev_net(n->dev) != net)
+ if (pneigh_net(n) != net)
continue;
if (idx < s_idx)
goto next;
@@ -2714,12 +2765,12 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v)
struct neigh_statistics *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n");
+ seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
return 0;
}
seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
- "%08lx %08lx %08lx %08lx %08lx\n",
+ "%08lx %08lx %08lx %08lx %08lx %08lx\n",
atomic_read(&tbl->entries),
st->allocs,
@@ -2736,7 +2787,8 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v)
st->periodic_gc_runs,
st->forced_gc_runs,
- st->unres_discards
+ st->unres_discards,
+ st->table_fulls
);
return 0;
diff --git a/kernel/net/core/net-sysfs.c b/kernel/net/core/net-sysfs.c
index 4238d6da5..f88a62ab0 100644
--- a/kernel/net/core/net-sysfs.c
+++ b/kernel/net/core/net-sysfs.c
@@ -31,7 +31,6 @@
static const char fmt_hex[] = "%#x\n";
static const char fmt_long_hex[] = "%#lx\n";
static const char fmt_dec[] = "%d\n";
-static const char fmt_udec[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
@@ -202,7 +201,7 @@ static ssize_t speed_show(struct device *dev,
if (netif_running(netdev)) {
struct ethtool_cmd cmd;
if (!__ethtool_get_settings(netdev, &cmd))
- ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
+ ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd));
}
rtnl_unlock();
return ret;
@@ -404,6 +403,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr,
NETDEVICE_SHOW(group, fmt_dec);
static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
+static int change_proto_down(struct net_device *dev, unsigned long proto_down)
+{
+ return dev_change_proto_down(dev, (bool) proto_down);
+}
+
+static ssize_t proto_down_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_proto_down);
+}
+NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+
static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -458,11 +470,15 @@ static ssize_t phys_switch_id_show(struct device *dev,
return restart_syscall();
if (dev_isalive(netdev)) {
- struct netdev_phys_item_id ppid;
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
- ret = netdev_switch_parent_id_get(netdev, &ppid);
+ ret = switchdev_port_attr_get(netdev, &attr);
if (!ret)
- ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len,
+ attr.u.ppid.id);
}
rtnl_unlock();
@@ -497,6 +513,7 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,
+ &dev_attr_proto_down.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
@@ -671,7 +688,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
struct rps_map *old_map, *map;
cpumask_var_t mask;
int err, cpu, i;
- static DEFINE_SPINLOCK(rps_map_lock);
+ static DEFINE_MUTEX(rps_map_mutex);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -704,18 +721,21 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
map = NULL;
}
- spin_lock(&rps_map_lock);
+ mutex_lock(&rps_map_mutex);
old_map = rcu_dereference_protected(queue->rps_map,
- lockdep_is_held(&rps_map_lock));
+ mutex_is_locked(&rps_map_mutex));
rcu_assign_pointer(queue->rps_map, map);
- spin_unlock(&rps_map_lock);
if (map)
static_key_slow_inc(&rps_needed);
- if (old_map) {
- kfree_rcu(old_map, rcu);
+ if (old_map)
static_key_slow_dec(&rps_needed);
- }
+
+ mutex_unlock(&rps_map_mutex);
+
+ if (old_map)
+ kfree_rcu(old_map, rcu);
+
free_cpumask_var(mask);
return len;
}
@@ -983,15 +1003,12 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue,
}
#ifdef CONFIG_XPS
-static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
- int i;
-
- for (i = 0; i < dev->num_tx_queues; i++)
- if (queue == &dev->_tx[i])
- break;
+ unsigned int i;
+ i = queue - dev->_tx;
BUG_ON(i >= dev->num_tx_queues);
return i;
@@ -1460,6 +1477,15 @@ static int of_dev_node_match(struct device *dev, const void *data)
return ret == 0 ? dev->of_node == data : ret;
}
+/*
+ * of_find_net_device_by_node - lookup the net device for the device node
+ * @np: OF device node
+ *
+ * Looks up the net_device structure corresponding to the device node.
+ * If successful, returns a pointer to the net_device with the embedded
+ * struct device refcount incremented by one, or NULL on failure. The
+ * refcount must be dropped when done with the net_device.
+ */
struct net_device *of_find_net_device_by_node(struct device_node *np)
{
struct device *dev;
diff --git a/kernel/net/core/net-traces.c b/kernel/net/core/net-traces.c
index ba3c01207..adef015b2 100644
--- a/kernel/net/core/net-traces.c
+++ b/kernel/net/core/net-traces.c
@@ -31,6 +31,7 @@
#include <trace/events/napi.h>
#include <trace/events/sock.h>
#include <trace/events/udp.h>
+#include <trace/events/fib.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/kernel/net/core/net_namespace.c b/kernel/net/core/net_namespace.c
index 572af0011..2c2eb1b62 100644
--- a/kernel/net/core/net_namespace.c
+++ b/kernel/net/core/net_namespace.c
@@ -147,24 +147,17 @@ static void ops_free_list(const struct pernet_operations *ops,
}
}
-static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
- int id);
+/* should be called with nsid_lock held */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
- int min = 0, max = 0, id;
-
- ASSERT_RTNL();
+ int min = 0, max = 0;
if (reqid >= 0) {
min = reqid;
max = reqid + 1;
}
- id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL);
- if (id >= 0)
- rtnl_net_notifyid(net, peer, RTM_NEWNSID, id);
-
- return id;
+ return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
}
/* This function is used by idr_for_each(). If net is equal to peer, the
@@ -180,11 +173,16 @@ static int net_eq_idr(int id, void *net, void *peer)
return 0;
}
-static int __peernet2id(struct net *net, struct net *peer, bool alloc)
+/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
+ * is set to true, so the caller knows that the new id must be notified via
+ * rtnl.
+ */
+static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
{
int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
+ bool alloc_it = *alloc;
- ASSERT_RTNL();
+ *alloc = false;
/* Magic value for id 0. */
if (id == NET_ID_ZERO)
@@ -192,36 +190,77 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc)
if (id > 0)
return id;
- if (alloc)
- return alloc_netid(net, peer, -1);
+ if (alloc_it) {
+ id = alloc_netid(net, peer, -1);
+ *alloc = true;
+ return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
+ }
+
+ return NETNSA_NSID_NOT_ASSIGNED;
+}
+
+/* should be called with nsid_lock held */
+static int __peernet2id(struct net *net, struct net *peer)
+{
+ bool no = false;
- return -ENOENT;
+ return __peernet2id_alloc(net, peer, &no);
}
+static void rtnl_net_notifyid(struct net *net, int cmd, int id);
/* This function returns the id of a peer netns. If no id is assigned, one will
* be allocated and returned.
*/
+int peernet2id_alloc(struct net *net, struct net *peer)
+{
+ unsigned long flags;
+ bool alloc;
+ int id;
+
+ spin_lock_irqsave(&net->nsid_lock, flags);
+ alloc = atomic_read(&peer->count) == 0 ? false : true;
+ id = __peernet2id_alloc(net, peer, &alloc);
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
+ if (alloc && id >= 0)
+ rtnl_net_notifyid(net, RTM_NEWNSID, id);
+ return id;
+}
+EXPORT_SYMBOL(peernet2id_alloc);
+
+/* This function returns, if assigned, the id of a peer netns. */
int peernet2id(struct net *net, struct net *peer)
{
- bool alloc = atomic_read(&peer->count) == 0 ? false : true;
+ unsigned long flags;
int id;
- id = __peernet2id(net, peer, alloc);
- return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
+ spin_lock_irqsave(&net->nsid_lock, flags);
+ id = __peernet2id(net, peer);
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
+ return id;
+}
+
+/* This function returns true if the peer netns has an id assigned in the
+ * current netns.
+ */
+bool peernet_has_id(struct net *net, struct net *peer)
+{
+ return peernet2id(net, peer) >= 0;
}
-EXPORT_SYMBOL(peernet2id);
struct net *get_net_ns_by_id(struct net *net, int id)
{
+ unsigned long flags;
struct net *peer;
if (id < 0)
return NULL;
rcu_read_lock();
+ spin_lock_irqsave(&net->nsid_lock, flags);
peer = idr_find(&net->netns_ids, id);
if (peer)
get_net(peer);
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
rcu_read_unlock();
return peer;
@@ -242,6 +281,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
net->dev_base_seq = 1;
net->user_ns = user_ns;
idr_init(&net->netns_ids);
+ spin_lock_init(&net->nsid_lock);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
@@ -362,14 +402,19 @@ static void cleanup_net(struct work_struct *work)
list_del_rcu(&net->list);
list_add_tail(&net->exit_list, &net_exit_list);
for_each_net(tmp) {
- int id = __peernet2id(tmp, net, false);
+ int id;
- if (id >= 0) {
- rtnl_net_notifyid(tmp, net, RTM_DELNSID, id);
+ spin_lock_irq(&tmp->nsid_lock);
+ id = __peernet2id(tmp, net);
+ if (id >= 0)
idr_remove(&tmp->netns_ids, id);
- }
+ spin_unlock_irq(&tmp->nsid_lock);
+ if (id >= 0)
+ rtnl_net_notifyid(tmp, RTM_DELNSID, id);
}
+ spin_lock_irq(&net->nsid_lock);
idr_destroy(&net->netns_ids);
+ spin_unlock_irq(&net->nsid_lock);
}
rtnl_unlock();
@@ -497,6 +542,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
+ unsigned long flags;
struct net *peer;
int nsid, err;
@@ -517,14 +563,19 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
if (IS_ERR(peer))
return PTR_ERR(peer);
- if (__peernet2id(net, peer, false) >= 0) {
+ spin_lock_irqsave(&net->nsid_lock, flags);
+ if (__peernet2id(net, peer) >= 0) {
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
err = -EEXIST;
goto out;
}
err = alloc_netid(net, peer, nsid);
- if (err > 0)
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
+ if (err >= 0) {
+ rtnl_net_notifyid(net, RTM_NEWNSID, err);
err = 0;
+ }
out:
put_net(peer);
return err;
@@ -538,14 +589,10 @@ static int rtnl_net_get_size(void)
}
static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
- int cmd, struct net *net, struct net *peer,
- int nsid)
+ int cmd, struct net *net, int nsid)
{
struct nlmsghdr *nlh;
struct rtgenmsg *rth;
- int id;
-
- ASSERT_RTNL();
nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
if (!nlh)
@@ -554,14 +601,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
rth = nlmsg_data(nlh);
rth->rtgen_family = AF_UNSPEC;
- if (nsid >= 0) {
- id = nsid;
- } else {
- id = __peernet2id(net, peer, false);
- if (id < 0)
- id = NETNSA_NSID_NOT_ASSIGNED;
- }
- if (nla_put_s32(skb, NETNSA_NSID, id))
+ if (nla_put_s32(skb, NETNSA_NSID, nsid))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -578,7 +618,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
struct nlattr *tb[NETNSA_MAX + 1];
struct sk_buff *msg;
struct net *peer;
- int err;
+ int err, id;
err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
rtnl_net_policy);
@@ -600,8 +640,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
goto out;
}
+ id = peernet2id(net, peer);
err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
- RTM_NEWNSID, net, peer, -1);
+ RTM_NEWNSID, net, id);
if (err < 0)
goto err_out;
@@ -633,7 +674,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data)
ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWNSID, net_cb->net, peer, id);
+ RTM_NEWNSID, net_cb->net, id);
if (ret < 0)
return ret;
@@ -652,17 +693,17 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
.idx = 0,
.s_idx = cb->args[0],
};
+ unsigned long flags;
- ASSERT_RTNL();
-
+ spin_lock_irqsave(&net->nsid_lock, flags);
idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
cb->args[0] = net_cb.idx;
return skb->len;
}
-static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
- int id)
+static void rtnl_net_notifyid(struct net *net, int cmd, int id)
{
struct sk_buff *msg;
int err = -ENOMEM;
@@ -671,7 +712,7 @@ static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
if (!msg)
goto out;
- err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id);
+ err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id);
if (err < 0)
goto err_out;
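
A minimal sketch of how the reworked nsid helpers above can be consumed; the caller name example_put_link_netnsid() is hypothetical, and the pattern simply mirrors the rtnl_fill_ifinfo() change further down where peernet2id_alloc() replaces peernet2id().

static int example_put_link_netnsid(struct sk_buff *skb, struct net *net,
				    struct net *link_net)
{
	/* Allocates an id under net->nsid_lock if none exists yet and
	 * sends an RTM_NEWNSID notification for a freshly assigned id.
	 */
	int id = peernet2id_alloc(net, link_net);

	if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
		return -EMSGSIZE;
	return 0;
}
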
diff --git a/kernel/net/core/netclassid_cgroup.c b/kernel/net/core/netclassid_cgroup.c
index 1f2a126f4..d9ee8d08a 100644
--- a/kernel/net/core/netclassid_cgroup.c
+++ b/kernel/net/core/netclassid_cgroup.c
@@ -23,7 +23,8 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
- return css_cls_state(task_css(p, net_cls_cgrp_id));
+ return css_cls_state(task_css_check(p, net_cls_cgrp_id,
+ rcu_read_lock_bh_held()));
}
EXPORT_SYMBOL_GPL(task_cls_state);
@@ -55,7 +56,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css)
kfree(css_cls_state(css));
}
-static int update_classid(const void *v, struct file *file, unsigned n)
+static int update_classid_sock(const void *v, struct file *file, unsigned n)
{
int err;
struct socket *sock = sock_from_file(file, &err);
@@ -66,18 +67,27 @@ static int update_classid(const void *v, struct file *file, unsigned n)
return 0;
}
-static void cgrp_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static void update_classid(struct cgroup_subsys_state *css, void *v)
{
- struct cgroup_cls_state *cs = css_cls_state(css);
- void *v = (void *)(unsigned long)cs->classid;
+ struct css_task_iter it;
struct task_struct *p;
- cgroup_taskset_for_each(p, tset) {
+ css_task_iter_start(css, &it);
+ while ((p = css_task_iter_next(&it))) {
task_lock(p);
- iterate_fd(p->files, 0, update_classid, v);
+ iterate_fd(p->files, 0, update_classid_sock, v);
task_unlock(p);
}
+ css_task_iter_end(&it);
+}
+
+static void cgrp_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+
+ cgroup_taskset_first(tset, &css);
+ update_classid(css,
+ (void *)(unsigned long)css_cls_state(css)->classid);
}
static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -88,8 +98,11 @@ static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
u64 value)
{
- css_cls_state(css)->classid = (u32) value;
+ struct cgroup_cls_state *cs = css_cls_state(css);
+
+ cs->classid = (u32)value;
+ update_classid(css, (void *)(unsigned long)cs->classid);
return 0;
}
diff --git a/kernel/net/core/netevent.c b/kernel/net/core/netevent.c
index f17ccd291..8b3bc4fac 100644
--- a/kernel/net/core/netevent.c
+++ b/kernel/net/core/netevent.c
@@ -31,10 +31,7 @@ static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
*/
int register_netevent_notifier(struct notifier_block *nb)
{
- int err;
-
- err = atomic_notifier_chain_register(&netevent_notif_chain, nb);
- return err;
+ return atomic_notifier_chain_register(&netevent_notif_chain, nb);
}
EXPORT_SYMBOL_GPL(register_netevent_notifier);
diff --git a/kernel/net/core/netpoll.c b/kernel/net/core/netpoll.c
index c126a878c..94acfc89a 100644
--- a/kernel/net/core/netpoll.c
+++ b/kernel/net/core/netpoll.c
@@ -140,36 +140,42 @@ static void queue_process(struct work_struct *work)
* case. Further, we test the poll_owner to avoid recursion on UP
* systems where the lock doesn't exist.
*/
-static int poll_one_napi(struct napi_struct *napi, int budget)
+static void poll_one_napi(struct napi_struct *napi)
{
- int work;
+ int work = 0;
/* net_rx_action's ->poll() invocations and ours are
* synchronized by this test which is only made while
* holding the napi->poll_lock.
*/
if (!test_bit(NAPI_STATE_SCHED, &napi->state))
- return budget;
+ return;
- set_bit(NAPI_STATE_NPSVC, &napi->state);
+ /* If we set this bit but see that it has already been set,
+ * that indicates that napi has been disabled and we need
+ * to abort this operation
+ */
+ if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state))
+ return;
- work = napi->poll(napi, budget);
- WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
+ /* We explicitly pass the polling call a budget of 0 to
+ * indicate that we are clearing the Tx path only.
+ */
+ work = napi->poll(napi, 0);
+ WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll);
trace_napi_poll(napi);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
-
- return budget - work;
}
-static void poll_napi(struct net_device *dev, int budget)
+static void poll_napi(struct net_device *dev)
{
struct napi_struct *napi;
list_for_each_entry(napi, &dev->napi_list, dev_list) {
if (napi->poll_owner != smp_processor_id() &&
spin_trylock(&napi->poll_lock)) {
- budget = poll_one_napi(napi, budget);
+ poll_one_napi(napi);
spin_unlock(&napi->poll_lock);
}
}
@@ -179,7 +185,6 @@ static void netpoll_poll_dev(struct net_device *dev)
{
const struct net_device_ops *ops;
struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
- int budget = 0;
/* Don't do any rx activity if the dev_lock mutex is held
* the dev_open/close paths use this to block netpoll activity
@@ -202,7 +207,7 @@ static void netpoll_poll_dev(struct net_device *dev)
/* Process pending work on NIC */
ops->ndo_poll_controller(dev);
- poll_napi(dev, budget);
+ poll_napi(dev);
up(&ni->dev_lock);
@@ -380,6 +385,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
static atomic_t ip_ident;
struct ipv6hdr *ip6h;
+ WARN_ON_ONCE(!irqs_disabled());
+
udp_len = len + sizeof(*udph);
if (np->ipv6)
ip_len = udp_len + sizeof(*ip6h);
diff --git a/kernel/net/core/netprio_cgroup.c b/kernel/net/core/netprio_cgroup.c
index cbd0a199b..40fd09fe0 100644
--- a/kernel/net/core/netprio_cgroup.c
+++ b/kernel/net/core/netprio_cgroup.c
@@ -218,13 +218,14 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
return 0;
}
-static void net_prio_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static void net_prio_attach(struct cgroup_taskset *tset)
{
struct task_struct *p;
- void *v = (void *)(unsigned long)css->cgroup->id;
+ struct cgroup_subsys_state *css;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ void *v = (void *)(unsigned long)css->cgroup->id;
- cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_netprio, v);
task_unlock(p);
diff --git a/kernel/net/core/pktgen.c b/kernel/net/core/pktgen.c
index 043ea1867..4da4d51a2 100644
--- a/kernel/net/core/pktgen.c
+++ b/kernel/net/core/pktgen.c
@@ -177,7 +177,7 @@
#include <asm/dma.h>
#include <asm/div64.h> /* do_div */
-#define VERSION "2.74"
+#define VERSION "2.75"
#define IP_NAME_SZ 32
#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
#define MPLS_STACK_BOTTOM htonl(0x00000100)
@@ -210,6 +210,10 @@
#define T_REMDEVALL (1<<2) /* Remove all devs */
#define T_REMDEV (1<<3) /* Remove one dev */
+/* Xmit modes */
+#define M_START_XMIT 0 /* Default normal TX */
+#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+
/* If lock -- protects updating of if_list */
#define if_lock(t) spin_lock(&(t->if_lock));
#define if_unlock(t) spin_unlock(&(t->if_lock));
@@ -251,13 +255,14 @@ struct pktgen_dev {
* we will do a random selection from within the range.
*/
__u32 flags;
- int removal_mark; /* non-zero => the device is marked for
- * removal by worker thread */
-
+ int xmit_mode;
int min_pkt_size;
int max_pkt_size;
int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
int nfrags;
+ int removal_mark; /* non-zero => the device is marked for
+ * removal by worker thread */
+
struct page *page;
u64 delay; /* nano-seconds */
@@ -268,7 +273,6 @@ struct pktgen_dev {
/* runtime counters relating to clone_skb */
- __u64 allocated_skbs;
__u32 clone_count;
int last_ok; /* Was last skb sent?
* Or a failed transmit of some sort?
@@ -507,7 +511,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
pktgen_reset_all_threads(pn);
else
- pr_warn("Unknown command: %s\n", data);
+ return -EINVAL;
return count;
}
@@ -567,7 +571,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
" dst_min: %s dst_max: %s\n",
pkt_dev->dst_min, pkt_dev->dst_max);
seq_printf(seq,
- " src_min: %s src_max: %s\n",
+ " src_min: %s src_max: %s\n",
pkt_dev->src_min, pkt_dev->src_max);
}
@@ -620,6 +624,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->node >= 0)
seq_printf(seq, " node: %d\n", pkt_dev->node);
+ if (pkt_dev->xmit_mode == M_NETIF_RECEIVE)
+ seq_puts(seq, " xmit_mode: netif_receive\n");
+
seq_puts(seq, " Flags: ");
if (pkt_dev->flags & F_IPV6)
@@ -1081,7 +1088,8 @@ static ssize_t pktgen_if_write(struct file *file,
if (len < 0)
return len;
if ((value > 0) &&
- (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
+ !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
i += len;
pkt_dev->clone_skb = value;
@@ -1134,7 +1142,7 @@ static ssize_t pktgen_if_write(struct file *file,
return len;
i += len;
- if ((value > 1) &&
+ if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) &&
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
pkt_dev->burst = value < 1 ? 1 : value;
@@ -1160,6 +1168,45 @@ static ssize_t pktgen_if_write(struct file *file,
sprintf(pg_result, "ERROR: node not possible");
return count;
}
+ if (!strcmp(name, "xmit_mode")) {
+ char f[32];
+
+ memset(f, 0, 32);
+ len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ if (strcmp(f, "start_xmit") == 0) {
+ pkt_dev->xmit_mode = M_START_XMIT;
+ } else if (strcmp(f, "netif_receive") == 0) {
+ /* clone_skb set earlier, not supported in this mode */
+ if (pkt_dev->clone_skb > 0)
+ return -ENOTSUPP;
+
+ pkt_dev->xmit_mode = M_NETIF_RECEIVE;
+
+ /* make sure a new packet is allocated every time
+ * pktgen_xmit() is called
+ */
+ pkt_dev->last_ok = 1;
+
+ /* override clone_skb if user passed default value
+ * at module loading time
+ */
+ pkt_dev->clone_skb = 0;
+ } else {
+ sprintf(pg_result,
+ "xmit_mode -:%s:- unknown\nAvailable modes: %s",
+ f, "start_xmit, netif_receive\n");
+ return count;
+ }
+ sprintf(pg_result, "OK: xmit_mode=%s", f);
+ return count;
+ }
if (!strcmp(name, "flag")) {
char f[32];
memset(f, 0, 32);
@@ -1267,6 +1314,9 @@ static ssize_t pktgen_if_write(struct file *file,
else if (strcmp(f, "NO_TIMESTAMP") == 0)
pkt_dev->flags |= F_NO_TIMESTAMP;
+ else if (strcmp(f, "!NO_TIMESTAMP") == 0)
+ pkt_dev->flags &= ~F_NO_TIMESTAMP;
+
else {
sprintf(pg_result,
"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
@@ -2212,8 +2262,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&t.timer))
- t.task = NULL;
if (likely(t.task))
schedule();
@@ -2230,7 +2278,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
{
- pkt_dev->pkt_overhead = 0;
+ pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev);
pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
@@ -2594,9 +2642,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
int nhead = 0;
if (x) {
- int ret;
- __u8 *eth;
+ struct ethhdr *eth;
struct iphdr *iph;
+ int ret;
nhead = x->props.header_len - skb_headroom(skb);
if (nhead > 0) {
@@ -2616,9 +2664,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
goto err;
}
/* restore ll */
- eth = (__u8 *) skb_push(skb, ETH_HLEN);
- memcpy(eth, pkt_dev->hh, 12);
- *(u16 *) &eth[12] = protocol;
+ eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
+ memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN);
+ eth->h_proto = protocol;
/* Update IPv4 header len as well as checksum value */
iph = ip_hdr(skb);
@@ -2740,6 +2788,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
+ if (likely(skb))
+ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
return skb;
}
@@ -3317,6 +3368,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
unsigned int burst = ACCESS_ONCE(pkt_dev->burst);
struct net_device *odev = pkt_dev->odev;
struct netdev_queue *txq;
+ struct sk_buff *skb;
int ret;
/* If device is offline, then don't send */
@@ -3347,13 +3399,43 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
return;
}
pkt_dev->last_pkt_size = pkt_dev->skb->len;
- pkt_dev->allocated_skbs++;
pkt_dev->clone_count = 0; /* reset counter */
}
if (pkt_dev->delay && pkt_dev->last_ok)
spin(pkt_dev, pkt_dev->next_tx);
+ if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
+ skb = pkt_dev->skb;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ atomic_add(burst, &skb->users);
+ local_bh_disable();
+ do {
+ ret = netif_receive_skb(skb);
+ if (ret == NET_RX_DROP)
+ pkt_dev->errors++;
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ if (atomic_read(&skb->users) != burst) {
+ /* skb was queued by rps/rfs or taps,
+ * so cannot reuse this skb
+ */
+ atomic_sub(burst - 1, &skb->users);
+ /* get out of the loop and wait
+ * until skb is consumed
+ */
+ break;
+ }
+ /* skb was 'freed' by stack, so clean a few
+ * bits and reuse it
+ */
+#ifdef CONFIG_NET_CLS_ACT
+ skb->tc_verd = 0; /* reset reclass/redir ttl */
+#endif
+ } while (--burst > 0);
+ goto out; /* Skips xmit_mode M_START_XMIT */
+ }
+
txq = skb_get_tx_queue(odev, pkt_dev->skb);
local_bh_disable();
@@ -3401,6 +3483,7 @@ xmit_more:
unlock:
HARD_TX_UNLOCK(odev, txq);
+out:
local_bh_enable();
/* If pkt_dev->count is zero, then run forever */
@@ -3432,8 +3515,6 @@ static int pktgen_thread_worker(void *arg)
set_freezable();
- __set_current_state(TASK_RUNNING);
-
while (!kthread_should_stop()) {
pkt_dev = next_to_run(t);
@@ -3478,7 +3559,6 @@ static int pktgen_thread_worker(void *arg)
try_to_freeze();
}
- set_current_state(TASK_INTERRUPTIBLE);
pr_debug("%s stopping all device\n", t->tsk->comm);
pktgen_stop(t);
@@ -3489,15 +3569,6 @@ static int pktgen_thread_worker(void *arg)
pr_debug("%s removing thread\n", t->tsk->comm);
pktgen_rem_thread(t);
- /* Wait for kthread_stop */
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
- break;
- schedule();
- }
- __set_current_state(TASK_RUNNING);
-
return 0;
}
@@ -3689,6 +3760,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
}
t->net = pn;
+ get_task_struct(p);
wake_up_process(p);
wait_for_completion(&t->start_done);
@@ -3811,6 +3883,7 @@ static void __net_exit pg_net_exit(struct net *net)
t = list_entry(q, struct pktgen_thread, th_list);
list_del(&t->th_list);
kthread_stop(t->tsk);
+ put_task_struct(t->tsk);
kfree(t);
}
diff --git a/kernel/net/core/ptp_classifier.c b/kernel/net/core/ptp_classifier.c
index 4eab4a94a..703cf76aa 100644
--- a/kernel/net/core/ptp_classifier.c
+++ b/kernel/net/core/ptp_classifier.c
@@ -58,7 +58,7 @@
* jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
* ldh [18] ; reload payload
* and #0xf ; mask PTP_CLASS_VMASK
- * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2
+ * or #0xc0 ; PTP_CLASS_VLAN|PTP_CLASS_L2
* ret a ; return PTP class
*
* ; PTP over UDP over IPv4 over 802.1Q over Ethernet
@@ -73,7 +73,7 @@
* jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ?
* ldh [x + 26] ; load payload
* and #0xf ; mask PTP_CLASS_VMASK
- * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4
+ * or #0x90 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4
* ret a ; return PTP class
* drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE
*
@@ -86,7 +86,7 @@
* jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ?
* ldh [66] ; load payload
* and #0xf ; mask PTP_CLASS_VMASK
- * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6
+ * or #0xa0 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6
* ret a ; return PTP class
* drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE
*
@@ -98,7 +98,7 @@
* jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
* ldh [14] ; reload payload
* and #0xf ; mask PTP_CLASS_VMASK
- * or #0x30 ; PTP_CLASS_L2
+ * or #0x40 ; PTP_CLASS_L2
* ret a ; return PTP class
* drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
*/
@@ -150,7 +150,7 @@ void __init ptp_classifier_init(void)
{ 0x15, 0, 35, 0x00000000 },
{ 0x28, 0, 0, 0x00000012 },
{ 0x54, 0, 0, 0x0000000f },
- { 0x44, 0, 0, 0x00000070 },
+ { 0x44, 0, 0, 0x000000c0 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x15, 0, 12, 0x00000800 },
{ 0x30, 0, 0, 0x0000001b },
@@ -162,7 +162,7 @@ void __init ptp_classifier_init(void)
{ 0x15, 0, 4, 0x0000013f },
{ 0x48, 0, 0, 0x0000001a },
{ 0x54, 0, 0, 0x0000000f },
- { 0x44, 0, 0, 0x00000050 },
+ { 0x44, 0, 0, 0x00000090 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
{ 0x15, 0, 8, 0x000086dd },
@@ -172,7 +172,7 @@ void __init ptp_classifier_init(void)
{ 0x15, 0, 4, 0x0000013f },
{ 0x28, 0, 0, 0x00000042 },
{ 0x54, 0, 0, 0x0000000f },
- { 0x44, 0, 0, 0x00000060 },
+ { 0x44, 0, 0, 0x000000a0 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
{ 0x15, 0, 7, 0x000088f7 },
@@ -181,7 +181,7 @@ void __init ptp_classifier_init(void)
{ 0x15, 0, 4, 0x00000000 },
{ 0x28, 0, 0, 0x0000000e },
{ 0x54, 0, 0, 0x0000000f },
- { 0x44, 0, 0, 0x00000030 },
+ { 0x44, 0, 0, 0x00000040 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
};
diff --git a/kernel/net/core/request_sock.c b/kernel/net/core/request_sock.c
index b42f0e26f..5d26056b6 100644
--- a/kernel/net/core/request_sock.c
+++ b/kernel/net/core/request_sock.c
@@ -37,90 +37,16 @@
int sysctl_max_syn_backlog = 256;
EXPORT_SYMBOL(sysctl_max_syn_backlog);
-int reqsk_queue_alloc(struct request_sock_queue *queue,
- unsigned int nr_table_entries)
+void reqsk_queue_alloc(struct request_sock_queue *queue)
{
- size_t lopt_size = sizeof(struct listen_sock);
- struct listen_sock *lopt = NULL;
+ spin_lock_init(&queue->rskq_lock);
- nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
- nr_table_entries = max_t(u32, nr_table_entries, 8);
- nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
- lopt_size += nr_table_entries * sizeof(struct request_sock *);
+ spin_lock_init(&queue->fastopenq.lock);
+ queue->fastopenq.rskq_rst_head = NULL;
+ queue->fastopenq.rskq_rst_tail = NULL;
+ queue->fastopenq.qlen = 0;
- if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
- lopt = kzalloc(lopt_size, GFP_KERNEL |
- __GFP_NOWARN |
- __GFP_NORETRY);
- if (!lopt)
- lopt = vzalloc(lopt_size);
- if (!lopt)
- return -ENOMEM;
-
- get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
- spin_lock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = NULL;
- lopt->nr_table_entries = nr_table_entries;
- lopt->max_qlen_log = ilog2(nr_table_entries);
-
- spin_lock_bh(&queue->syn_wait_lock);
- queue->listen_opt = lopt;
- spin_unlock_bh(&queue->syn_wait_lock);
-
- return 0;
-}
-
-void __reqsk_queue_destroy(struct request_sock_queue *queue)
-{
- /* This is an error recovery path only, no locking needed */
- kvfree(queue->listen_opt);
-}
-
-static inline struct listen_sock *reqsk_queue_yank_listen_sk(
- struct request_sock_queue *queue)
-{
- struct listen_sock *lopt;
-
- spin_lock_bh(&queue->syn_wait_lock);
- lopt = queue->listen_opt;
- queue->listen_opt = NULL;
- spin_unlock_bh(&queue->syn_wait_lock);
-
- return lopt;
-}
-
-void reqsk_queue_destroy(struct request_sock_queue *queue)
-{
- /* make all the listen_opt local to us */
- struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
-
- if (listen_sock_qlen(lopt) != 0) {
- unsigned int i;
-
- for (i = 0; i < lopt->nr_table_entries; i++) {
- struct request_sock *req;
-
- spin_lock_bh(&queue->syn_wait_lock);
- while ((req = lopt->syn_table[i]) != NULL) {
- lopt->syn_table[i] = req->dl_next;
- /* Because of following del_timer_sync(),
- * we must release the spinlock here
- * or risk a dead lock.
- */
- spin_unlock_bh(&queue->syn_wait_lock);
- atomic_inc(&lopt->qlen_dec);
- if (del_timer_sync(&req->rsk_timer))
- reqsk_put(req);
- reqsk_put(req);
- spin_lock_bh(&queue->syn_wait_lock);
- }
- spin_unlock_bh(&queue->syn_wait_lock);
- }
- }
-
- if (WARN_ON(listen_sock_qlen(lopt) != 0))
- pr_err("qlen %u\n", listen_sock_qlen(lopt));
- kvfree(lopt);
}
/*
@@ -174,7 +100,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
struct sock *lsk = req->rsk_listener;
struct fastopen_queue *fastopenq;
- fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq;
+ fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
tcp_sk(sk)->fastopen_rsk = NULL;
spin_lock_bh(&fastopenq->lock);
diff --git a/kernel/net/core/rtnetlink.c b/kernel/net/core/rtnetlink.c
index fe95cb704..34ba7a088 100644
--- a/kernel/net/core/rtnetlink.c
+++ b/kernel/net/core/rtnetlink.c
@@ -96,7 +96,7 @@ int rtnl_is_locked(void)
EXPORT_SYMBOL(rtnl_is_locked);
#ifdef CONFIG_PROVE_LOCKING
-int lockdep_rtnl_is_held(void)
+bool lockdep_rtnl_is_held(void)
{
return lockdep_is_held(&rtnl_mutex);
}
@@ -497,7 +497,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops)
}
EXPORT_SYMBOL_GPL(rtnl_af_unregister);
-static size_t rtnl_link_get_af_size(const struct net_device *dev)
+static size_t rtnl_link_get_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
struct rtnl_af_ops *af_ops;
size_t size;
@@ -509,7 +510,7 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev)
if (af_ops->get_link_af_size) {
/* AF_* + nested data */
size += nla_total_size(sizeof(struct nlattr)) +
- af_ops->get_link_af_size(dev);
+ af_ops->get_link_af_size(dev, ext_filter_mask);
}
}
@@ -678,6 +679,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
continue;
if (nla_put_string(skb, i + 1, name))
goto nla_put_failure;
+ } else if (i == RTAX_FEATURES - 1) {
+ u32 user_features = metrics[i] & RTAX_FEATURE_MASK;
+
+ BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
+ if (nla_put_u32(skb, i + 1, user_features))
+ goto nla_put_failure;
} else {
if (nla_put_u32(skb, i + 1, metrics[i]))
goto nla_put_failure;
@@ -819,7 +826,20 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
nla_total_size(sizeof(struct ifla_vf_rate)) +
nla_total_size(sizeof(struct ifla_vf_link_state)) +
- nla_total_size(sizeof(struct ifla_vf_rss_query_en)));
+ nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
+ /* IFLA_VF_STATS_RX_PACKETS */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_PACKETS */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_BYTES */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_BYTES */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_BROADCAST */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_MULTICAST */
+ nla_total_size(sizeof(__u64)) +
+ nla_total_size(sizeof(struct ifla_vf_trust)));
return size;
} else
return 0;
@@ -882,9 +902,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
+ rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ rtnl_link_get_size(dev) /* IFLA_LINKINFO */
- + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
- + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(1); /* IFLA_PROTO_DOWN */
+
}
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1004,16 +1026,163 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
{
int err;
- struct netdev_phys_item_id psid;
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
- err = netdev_switch_parent_id_get(dev, &psid);
+ err = switchdev_port_attr_get(dev, &attr);
if (err) {
if (err == -EOPNOTSUPP)
return 0;
return err;
}
- if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id))
+ if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len,
+ attr.u.ppid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ const struct rtnl_link_stats64 *stats;
+ struct rtnl_link_stats64 temp;
+ struct nlattr *attr;
+
+ stats = dev_get_stats(dev, &temp);
+
+ attr = nla_reserve(skb, IFLA_STATS,
+ sizeof(struct rtnl_link_stats));
+ if (!attr)
+ return -EMSGSIZE;
+
+ copy_rtnl_link_stats(nla_data(attr), stats);
+
+ attr = nla_reserve(skb, IFLA_STATS64,
+ sizeof(struct rtnl_link_stats64));
+ if (!attr)
+ return -EMSGSIZE;
+
+ copy_rtnl_link_stats64(nla_data(attr), stats);
+
+ return 0;
+}
+
+static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
+ struct net_device *dev,
+ int vfs_num,
+ struct nlattr *vfinfo)
+{
+ struct ifla_vf_rss_query_en vf_rss_query_en;
+ struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_spoofchk vf_spoofchk;
+ struct ifla_vf_tx_rate vf_tx_rate;
+ struct ifla_vf_stats vf_stats;
+ struct ifla_vf_trust vf_trust;
+ struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_rate vf_rate;
+ struct nlattr *vf, *vfstats;
+ struct ifla_vf_mac vf_mac;
+ struct ifla_vf_info ivi;
+
+ /* Not all SR-IOV capable drivers support the
+ * spoofcheck and "RSS query enable" query. Preset to
+ * -1 so the user space tool can detect that the driver
+ * didn't report anything.
+ */
+ ivi.spoofchk = -1;
+ ivi.rss_query_en = -1;
+ ivi.trusted = -1;
+ memset(ivi.mac, 0, sizeof(ivi.mac));
+ /* The default value for VF link state is "auto"
+ * IFLA_VF_LINK_STATE_AUTO which equals zero
+ */
+ ivi.linkstate = 0;
+ if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
+ return 0;
+
+ vf_mac.vf =
+ vf_vlan.vf =
+ vf_rate.vf =
+ vf_tx_rate.vf =
+ vf_spoofchk.vf =
+ vf_linkstate.vf =
+ vf_rss_query_en.vf =
+ vf_trust.vf = ivi.vf;
+
+ memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ vf_vlan.vlan = ivi.vlan;
+ vf_vlan.qos = ivi.qos;
+ vf_tx_rate.rate = ivi.max_tx_rate;
+ vf_rate.min_tx_rate = ivi.min_tx_rate;
+ vf_rate.max_tx_rate = ivi.max_tx_rate;
+ vf_spoofchk.setting = ivi.spoofchk;
+ vf_linkstate.link_state = ivi.linkstate;
+ vf_rss_query_en.setting = ivi.rss_query_en;
+ vf_trust.setting = ivi.trusted;
+ vf = nla_nest_start(skb, IFLA_VF_INFO);
+ if (!vf) {
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
+ }
+ if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
+ nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
+ &vf_rate) ||
+ nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
+ &vf_tx_rate) ||
+ nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
+ &vf_spoofchk) ||
+ nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
+ &vf_linkstate) ||
+ nla_put(skb, IFLA_VF_RSS_QUERY_EN,
+ sizeof(vf_rss_query_en),
+ &vf_rss_query_en) ||
+ nla_put(skb, IFLA_VF_TRUST,
+ sizeof(vf_trust), &vf_trust))
+ return -EMSGSIZE;
+ memset(&vf_stats, 0, sizeof(vf_stats));
+ if (dev->netdev_ops->ndo_get_vf_stats)
+ dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
+ &vf_stats);
+ vfstats = nla_nest_start(skb, IFLA_VF_STATS);
+ if (!vfstats) {
+ nla_nest_cancel(skb, vf);
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
+ }
+ if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS,
+ vf_stats.rx_packets) ||
+ nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS,
+ vf_stats.tx_packets) ||
+ nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES,
+ vf_stats.rx_bytes) ||
+ nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES,
+ vf_stats.tx_bytes) ||
+ nla_put_u64(skb, IFLA_VF_STATS_BROADCAST,
+ vf_stats.broadcast) ||
+ nla_put_u64(skb, IFLA_VF_STATS_MULTICAST,
+ vf_stats.multicast))
+ return -EMSGSIZE;
+ nla_nest_end(skb, vfstats);
+ nla_nest_end(skb, vf);
+ return 0;
+}
+
+static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
+{
+ struct rtnl_link_ifmap map = {
+ .mem_start = dev->mem_start,
+ .mem_end = dev->mem_end,
+ .base_addr = dev->base_addr,
+ .irq = dev->irq,
+ .dma = dev->dma,
+ .port = dev->if_port,
+ };
+ if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
return -EMSGSIZE;
return 0;
@@ -1025,9 +1194,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
{
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
- struct rtnl_link_stats64 temp;
- const struct rtnl_link_stats64 *stats;
- struct nlattr *attr, *af_spec;
+ struct nlattr *af_spec;
struct rtnl_af_ops *af_ops;
struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
@@ -1066,21 +1233,12 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
(dev->ifalias &&
nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
- atomic_read(&dev->carrier_changes)))
+ atomic_read(&dev->carrier_changes)) ||
+ nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
goto nla_put_failure;
- if (1) {
- struct rtnl_link_ifmap map = {
- .mem_start = dev->mem_start,
- .mem_end = dev->mem_end,
- .base_addr = dev->base_addr,
- .irq = dev->irq,
- .dma = dev->dma,
- .port = dev->if_port,
- };
- if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
- goto nla_put_failure;
- }
+ if (rtnl_fill_link_ifmap(skb, dev))
+ goto nla_put_failure;
if (dev->addr_len) {
if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
@@ -1097,97 +1255,27 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
if (rtnl_phys_switch_id_fill(skb, dev))
goto nla_put_failure;
- attr = nla_reserve(skb, IFLA_STATS,
- sizeof(struct rtnl_link_stats));
- if (attr == NULL)
+ if (rtnl_fill_stats(skb, dev))
goto nla_put_failure;
- stats = dev_get_stats(dev, &temp);
- copy_rtnl_link_stats(nla_data(attr), stats);
-
- attr = nla_reserve(skb, IFLA_STATS64,
- sizeof(struct rtnl_link_stats64));
- if (attr == NULL)
- goto nla_put_failure;
- copy_rtnl_link_stats64(nla_data(attr), stats);
-
if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) &&
nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)))
goto nla_put_failure;
- if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent
- && (ext_filter_mask & RTEXT_FILTER_VF)) {
+ if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent &&
+ ext_filter_mask & RTEXT_FILTER_VF) {
int i;
-
- struct nlattr *vfinfo, *vf;
+ struct nlattr *vfinfo;
int num_vfs = dev_num_vf(dev->dev.parent);
vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
if (!vfinfo)
goto nla_put_failure;
for (i = 0; i < num_vfs; i++) {
- struct ifla_vf_info ivi;
- struct ifla_vf_mac vf_mac;
- struct ifla_vf_vlan vf_vlan;
- struct ifla_vf_rate vf_rate;
- struct ifla_vf_tx_rate vf_tx_rate;
- struct ifla_vf_spoofchk vf_spoofchk;
- struct ifla_vf_link_state vf_linkstate;
- struct ifla_vf_rss_query_en vf_rss_query_en;
-
- /*
- * Not all SR-IOV capable drivers support the
- * spoofcheck and "RSS query enable" query. Preset to
- * -1 so the user space tool can detect that the driver
- * didn't report anything.
- */
- ivi.spoofchk = -1;
- ivi.rss_query_en = -1;
- memset(ivi.mac, 0, sizeof(ivi.mac));
- /* The default value for VF link state is "auto"
- * IFLA_VF_LINK_STATE_AUTO which equals zero
- */
- ivi.linkstate = 0;
- if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
- break;
- vf_mac.vf =
- vf_vlan.vf =
- vf_rate.vf =
- vf_tx_rate.vf =
- vf_spoofchk.vf =
- vf_linkstate.vf =
- vf_rss_query_en.vf = ivi.vf;
-
- memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
- vf_vlan.vlan = ivi.vlan;
- vf_vlan.qos = ivi.qos;
- vf_tx_rate.rate = ivi.max_tx_rate;
- vf_rate.min_tx_rate = ivi.min_tx_rate;
- vf_rate.max_tx_rate = ivi.max_tx_rate;
- vf_spoofchk.setting = ivi.spoofchk;
- vf_linkstate.link_state = ivi.linkstate;
- vf_rss_query_en.setting = ivi.rss_query_en;
- vf = nla_nest_start(skb, IFLA_VF_INFO);
- if (!vf) {
- nla_nest_cancel(skb, vfinfo);
- goto nla_put_failure;
- }
- if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
- nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
- nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
- &vf_rate) ||
- nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
- &vf_tx_rate) ||
- nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
- &vf_spoofchk) ||
- nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
- &vf_linkstate) ||
- nla_put(skb, IFLA_VF_RSS_QUERY_EN,
- sizeof(vf_rss_query_en),
- &vf_rss_query_en))
+ if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
goto nla_put_failure;
- nla_nest_end(skb, vf);
}
+
nla_nest_end(skb, vfinfo);
}
@@ -1204,7 +1292,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
if (!net_eq(dev_net(dev), link_net)) {
- int id = peernet2id(dev_net(dev), link_net);
+ int id = peernet2id_alloc(dev_net(dev), link_net);
if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
goto nla_put_failure;
@@ -1222,7 +1310,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
if (!(af = nla_nest_start(skb, af_ops->family)))
goto nla_put_failure;
- err = af_ops->fill_link_af(skb, dev);
+ err = af_ops->fill_link_af(skb, dev, ext_filter_mask);
/*
* Caller may return ENODATA to indicate that there
@@ -1278,6 +1366,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
+ [IFLA_PROTO_DOWN] = { .type = NLA_U8 },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1295,6 +1384,17 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
[IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) },
[IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) },
+ [IFLA_VF_STATS] = { .type = NLA_NESTED },
+ [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) },
+};
+
+static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = {
+ [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 },
};
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
@@ -1525,6 +1625,16 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
return err;
}
+ if (tb[IFLA_VF_TRUST]) {
+ struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]);
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_trust)
+ err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting);
+ if (err < 0)
+ return err;
+ }
+
return err;
}
@@ -1753,10 +1863,13 @@ static int do_setlink(const struct sk_buff *skb,
goto errout;
nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
- if (nla_type(attr) != IFLA_VF_PORT)
- continue;
- err = nla_parse_nested(port, IFLA_PORT_MAX,
- attr, ifla_port_policy);
+ if (nla_type(attr) != IFLA_VF_PORT ||
+ nla_len(attr) < NLA_HDRLEN) {
+ err = -EINVAL;
+ goto errout;
+ }
+ err = nla_parse_nested(port, IFLA_PORT_MAX, attr,
+ ifla_port_policy);
if (err < 0)
goto errout;
if (!port[IFLA_PORT_VF]) {
@@ -1807,6 +1920,14 @@ static int do_setlink(const struct sk_buff *skb,
}
err = 0;
+ if (tb[IFLA_PROTO_DOWN]) {
+ err = dev_change_proto_down(dev,
+ nla_get_u8(tb[IFLA_PROTO_DOWN]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
errout:
if (status & DO_SETLINK_MODIFIED) {
if (status & DO_SETLINK_NOTIFY)
@@ -1897,16 +2018,30 @@ static int rtnl_group_dellink(const struct net *net, int group)
return 0;
}
+int rtnl_delete_link(struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops;
+ LIST_HEAD(list_kill);
+
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_delete_link);
+
static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
- const struct rtnl_link_ops *ops;
struct net_device *dev;
struct ifinfomsg *ifm;
char ifname[IFNAMSIZ];
struct nlattr *tb[IFLA_MAX+1];
int err;
- LIST_HEAD(list_kill);
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
if (err < 0)
@@ -1928,13 +2063,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!dev)
return -ENODEV;
- ops = dev->rtnl_link_ops;
- if (!ops || !ops->dellink)
- return -EOPNOTSUPP;
-
- ops->dellink(dev, &list_kill);
- unregister_netdevice_many(&list_kill);
- return 0;
+ return rtnl_delete_link(dev);
}
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
@@ -2862,7 +2991,11 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
struct net_device *dev, u16 mode,
- u32 flags, u32 mask, int nlflags)
+ u32 flags, u32 mask, int nlflags,
+ u32 filter_mask,
+ int (*vlan_fill)(struct sk_buff *skb,
+ struct net_device *dev,
+ u32 filter_mask))
{
struct nlmsghdr *nlh;
struct ifinfomsg *ifm;
@@ -2870,6 +3003,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
struct nlattr *protinfo;
u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ int err = 0;
nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
if (nlh == NULL)
@@ -2910,6 +3044,13 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
goto nla_put_failure;
}
}
+ if (vlan_fill) {
+ err = vlan_fill(skb, dev, filter_mask);
+ if (err) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ }
nla_nest_end(skb, br_afspec);
protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
@@ -2943,9 +3084,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
return 0;
nla_put_failure:
nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
+ return err ? err : -EMSGSIZE;
}
-EXPORT_SYMBOL(ndo_dflt_bridge_getlink);
+EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);
static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -2955,6 +3096,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
u32 portid = NETLINK_CB(cb->skb).portid;
u32 seq = cb->nlh->nlmsg_seq;
u32 filter_mask = 0;
+ int err;
if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) {
struct nlattr *extfilt;
@@ -2975,20 +3117,25 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
- if (idx >= cb->args[0] &&
- br_dev->netdev_ops->ndo_bridge_getlink(
- skb, portid, seq, dev, filter_mask,
- NLM_F_MULTI) < 0)
- break;
+ if (idx >= cb->args[0]) {
+ err = br_dev->netdev_ops->ndo_bridge_getlink(
+ skb, portid, seq, dev,
+ filter_mask, NLM_F_MULTI);
+ if (err < 0 && err != -EOPNOTSUPP)
+ break;
+ }
idx++;
}
if (ops->ndo_bridge_getlink) {
- if (idx >= cb->args[0] &&
- ops->ndo_bridge_getlink(skb, portid, seq, dev,
- filter_mask,
- NLM_F_MULTI) < 0)
- break;
+ if (idx >= cb->args[0]) {
+ err = ops->ndo_bridge_getlink(skb, portid,
+ seq, dev,
+ filter_mask,
+ NLM_F_MULTI);
+ if (err < 0 && err != -EOPNOTSUPP)
+ break;
+ }
idx++;
}
}
@@ -3345,4 +3492,3 @@ void __init rtnetlink_init(void)
rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
}
-
diff --git a/kernel/net/core/scm.c b/kernel/net/core/scm.c
index 3b6899b7d..dce0acb92 100644
--- a/kernel/net/core/scm.c
+++ b/kernel/net/core/scm.c
@@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
*fplp = fpl;
fpl->count = 0;
fpl->max = SCM_MAX_FD;
+ fpl->user = NULL;
}
fpp = &fpl->fp[fpl->count];
@@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
*fpp++ = file;
fpl->count++;
}
+
+ if (!fpl->user)
+ fpl->user = get_uid(current_user());
+
return num;
}
@@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm)
scm->fp = NULL;
for (i=fpl->count-1; i>=0; i--)
fput(fpl->fp[i]);
+ free_uid(fpl->user);
kfree(fpl);
}
}
@@ -305,6 +311,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
cmlen = CMSG_SPACE(i*sizeof(int));
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen;
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
}
@@ -334,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
+ new_fpl->user = get_uid(fpl->user);
}
return new_fpl;
}
diff --git a/kernel/net/core/secure_seq.c b/kernel/net/core/secure_seq.c
index 51dd3193a..fd3ce461f 100644
--- a/kernel/net/core/secure_seq.c
+++ b/kernel/net/core/secure_seq.c
@@ -154,7 +154,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + daddr[i];
+ secret[i] = net_secret[i] + (__force u32)daddr[i];
secret[4] = net_secret[4] +
(((__force u16)sport << 16) + (__force u16)dport);
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
diff --git a/kernel/net/core/skbuff.c b/kernel/net/core/skbuff.c
index fc09e8f3d..12780dc62 100644
--- a/kernel/net/core/skbuff.c
+++ b/kernel/net/core/skbuff.c
@@ -80,6 +80,8 @@
struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
+EXPORT_SYMBOL(sysctl_max_skb_frags);
/**
* skb_panic - private function for out-of-line support
@@ -348,95 +350,20 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
}
EXPORT_SYMBOL(build_skb);
-struct netdev_alloc_cache {
- struct page_frag frag;
- /* we maintain a pagecount bias, so that we dont dirty cache line
- * containing page->_count every time we allocate a fragment.
- */
- unsigned int pagecnt_bias;
-};
-static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
-
-static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
- gfp_t gfp_mask)
-{
- const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
- struct page *page = NULL;
- gfp_t gfp = gfp_mask;
-
- if (order) {
- gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
- __GFP_NOMEMALLOC;
- page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
- nc->frag.size = PAGE_SIZE << (page ? order : 0);
- }
-
- if (unlikely(!page))
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
- nc->frag.page = page;
-
- return page;
-}
-
-static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
- unsigned int fragsz, gfp_t gfp_mask)
-{
- struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
- struct page *page = nc->frag.page;
- unsigned int size;
- int offset;
-
- if (unlikely(!page)) {
-refill:
- page = __page_frag_refill(nc, gfp_mask);
- if (!page)
- return NULL;
-
- /* if size can vary use frag.size else just use PAGE_SIZE */
- size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
- /* Even if we own the page, we do not use atomic_set().
- * This would break get_page_unless_zero() users.
- */
- atomic_add(size - 1, &page->_count);
-
- /* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
- nc->frag.offset = size;
- }
-
- offset = nc->frag.offset - fragsz;
- if (unlikely(offset < 0)) {
- if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
- goto refill;
-
- /* if size can vary use frag.size else just use PAGE_SIZE */
- size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
- /* OK, page count is 0, we can safely set it */
- atomic_set(&page->_count, size);
-
- /* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
- offset = size - fragsz;
- }
-
- nc->pagecnt_bias--;
- nc->frag.offset = offset;
-
- return page_address(page) + offset;
-}
+static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
+ struct page_frag_cache *nc;
unsigned long flags;
void *data;
local_lock_irqsave(netdev_alloc_lock, flags);
- data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = __alloc_page_frag(nc, fragsz, gfp_mask);
local_unlock_irqrestore(netdev_alloc_lock, flags);
return data;
}
@@ -456,7 +383,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+ struct page_frag_cache *nc;
+ void *data;
+
+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+ data = __alloc_page_frag(nc, fragsz, gfp_mask);
+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+ return data;
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -466,76 +399,70 @@ void *napi_alloc_frag(unsigned int fragsz)
EXPORT_SYMBOL(napi_alloc_frag);
/**
- * __alloc_rx_skb - allocate an skbuff for rx
- * @length: length to allocate
+ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
+ * @len: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb
- * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
- * allocations in case we have to fallback to __alloc_skb()
- * If SKB_ALLOC_NAPI is set, page fragment will be allocated
- * from napi_cache instead of netdev_cache.
*
* Allocate a new &sk_buff and assign it a usage count of one. The
- * buffer has unspecified headroom built in. Users should allocate
+ * buffer has NET_SKB_PAD headroom built in. Users should allocate
* the headroom they think they need without accounting for the
* built in space. The built in space is used for optimisations.
*
* %NULL is returned if there is no free memory.
*/
-static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
- int flags)
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
+ gfp_t gfp_mask)
{
- struct sk_buff *skb = NULL;
- unsigned int fragsz = SKB_DATA_ALIGN(length) +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ struct page_frag_cache *nc;
+ unsigned long flags;
+ struct sk_buff *skb;
+ bool pfmemalloc;
+ void *data;
- if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
- void *data;
+ len += NET_SKB_PAD;
- if (sk_memalloc_socks())
- gfp_mask |= __GFP_MEMALLOC;
+ if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ if (!skb)
+ goto skb_fail;
+ goto skb_success;
+ }
- data = (flags & SKB_ALLOC_NAPI) ?
- __napi_alloc_frag(fragsz, gfp_mask) :
- __netdev_alloc_frag(fragsz, gfp_mask);
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ len = SKB_DATA_ALIGN(len);
- if (likely(data)) {
- skb = build_skb(data, fragsz);
- if (unlikely(!skb))
- put_page(virt_to_head_page(data));
- }
- } else {
- skb = __alloc_skb(length, gfp_mask,
- SKB_ALLOC_RX, NUMA_NO_NODE);
- }
- return skb;
-}
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
-/**
- * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
- * @dev: network device to receive on
- * @length: length to allocate
- * @gfp_mask: get_free_pages mask, passed to alloc_skb
- *
- * Allocate a new &sk_buff and assign it a usage count of one. The
- * buffer has NET_SKB_PAD headroom built in. Users should allocate
- * the headroom they think they need without accounting for the
- * built in space. The built in space is used for optimisations.
- *
- * %NULL is returned if there is no free memory.
- */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
- unsigned int length, gfp_t gfp_mask)
-{
- struct sk_buff *skb;
+ local_lock_irqsave(netdev_alloc_lock, flags);
+
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = __alloc_page_frag(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
- length += NET_SKB_PAD;
- skb = __alloc_rx_skb(length, gfp_mask, 0);
+ local_unlock_irqrestore(netdev_alloc_lock, flags);
- if (likely(skb)) {
- skb_reserve(skb, NET_SKB_PAD);
- skb->dev = dev;
+ if (unlikely(!data))
+ return NULL;
+
+ skb = __build_skb(data, len);
+ if (unlikely(!skb)) {
+ skb_free_frag(data);
+ return NULL;
}
+ /* use OR instead of assignment to avoid clearing of bits in mask */
+ if (pfmemalloc)
+ skb->pfmemalloc = 1;
+ skb->head_frag = 1;
+
+skb_success:
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+
+skb_fail:
return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
@@ -543,7 +470,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
/**
* __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
* @napi: napi instance this buffer was allocated for
- * @length: length to allocate
+ * @len: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
*
* Allocate a new sk_buff for use in NAPI receive. This buffer will
@@ -553,19 +480,54 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
- unsigned int length, gfp_t gfp_mask)
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
+ gfp_t gfp_mask)
{
+ struct page_frag_cache *nc;
struct sk_buff *skb;
+ void *data;
+ bool pfmemalloc;
- length += NET_SKB_PAD + NET_IP_ALIGN;
- skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
+ len += NET_SKB_PAD + NET_IP_ALIGN;
- if (likely(skb)) {
- skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
- skb->dev = napi->dev;
+ if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ if (!skb)
+ goto skb_fail;
+ goto skb_success;
}
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ len = SKB_DATA_ALIGN(len);
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+ data = __alloc_page_frag(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+
+ if (unlikely(!data))
+ return NULL;
+
+ skb = __build_skb(data, len);
+ if (unlikely(!skb)) {
+ skb_free_frag(data);
+ return NULL;
+ }
+
+ /* use OR instead of assignment to avoid clearing of bits in mask */
+ if (pfmemalloc)
+ skb->pfmemalloc = 1;
+ skb->head_frag = 1;
+
+skb_success:
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ skb->dev = napi->dev;
+
+skb_fail:
return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);
@@ -613,10 +575,12 @@ static void skb_clone_fraglist(struct sk_buff *skb)
static void skb_free_head(struct sk_buff *skb)
{
+ unsigned char *head = skb->head;
+
if (skb->head_frag)
- put_page(virt_to_head_page(skb->head));
+ skb_free_frag(head);
else
- kfree(skb->head);
+ kfree(head);
}
static void skb_release_data(struct sk_buff *skb)
@@ -1920,15 +1884,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
return false;
}
+ssize_t skb_socket_splice(struct sock *sk,
+ struct pipe_inode_info *pipe,
+ struct splice_pipe_desc *spd)
+{
+ int ret;
+
+ /* Drop the socket lock, otherwise we have reverse
+ * locking dependencies between sk_lock and i_mutex
+ * here as compared to sendfile(). We enter here
+ * with the socket lock held, and splice_to_pipe() will
+ * grab the pipe inode lock. For sendfile() emulation,
+ * we call into ->sendpage() with the i_mutex lock held
+ * and networking will grab the socket lock.
+ */
+ release_sock(sk);
+ ret = splice_to_pipe(pipe, spd);
+ lock_sock(sk);
+
+ return ret;
+}
+
/*
* Map data from the skb to a pipe. Should handle both the linear part,
* the fragments, and the frag list. It does NOT handle frag lists within
* the frag list, if such a thing exists. We'd probably need to recurse to
* handle that cleanly.
*/
-int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
- unsigned int flags)
+ unsigned int flags,
+ ssize_t (*splice_cb)(struct sock *,
+ struct pipe_inode_info *,
+ struct splice_pipe_desc *))
{
struct partial_page partial[MAX_SKB_FRAGS];
struct page *pages[MAX_SKB_FRAGS];
@@ -1941,7 +1929,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
.spd_release = sock_spd_release,
};
struct sk_buff *frag_iter;
- struct sock *sk = skb->sk;
int ret = 0;
/*
@@ -1964,23 +1951,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
}
done:
- if (spd.nr_pages) {
- /*
- * Drop the socket lock, otherwise we have reverse
- * locking dependencies between sk_lock and i_mutex
- * here as compared to sendfile(). We enter here
- * with the socket lock held, and splice_to_pipe() will
- * grab the pipe inode lock. For sendfile() emulation,
- * we call into ->sendpage() with the i_mutex lock held
- * and networking will grab the socket lock.
- */
- release_sock(sk);
- ret = splice_to_pipe(pipe, &spd);
- lock_sock(sk);
- }
+ if (spd.nr_pages)
+ ret = splice_cb(sk, pipe, &spd);
return ret;
}
+EXPORT_SYMBOL_GPL(skb_splice_bits);
/**
* skb_store_bits - store bits from kernel buffer to skb
@@ -2965,6 +2941,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
}
EXPORT_SYMBOL(skb_append_datato_frags);
+int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
+ int offset, size_t size)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+
+ if (skb_can_coalesce(skb, i, page, offset)) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+ } else if (i < MAX_SKB_FRAGS) {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, size);
+ } else {
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_append_pagefrags);
+
/**
* skb_pull_rcsum - pull skb and update receive checksum
* @skb: buffer to update
@@ -2978,11 +2972,12 @@ EXPORT_SYMBOL(skb_append_datato_frags);
*/
unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
{
+ unsigned char *data = skb->data;
+
BUG_ON(len > skb->len);
- skb->len -= len;
- BUG_ON(skb->len < skb->data_len);
- skb_postpull_rcsum(skb, skb->data, len);
- return skb->data += len;
+ __skb_pull(skb, len);
+ skb_postpull_rcsum(skb, data, len);
+ return skb->data;
}
EXPORT_SYMBOL_GPL(skb_pull_rcsum);
@@ -3662,7 +3657,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
serr->ee.ee_info = tstype;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
- if (sk->sk_protocol == IPPROTO_TCP)
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
serr->ee.ee_data -= sk->sk_tskey;
}
@@ -4032,6 +4028,92 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
}
EXPORT_SYMBOL(skb_checksum_setup);
+/**
+ * skb_checksum_maybe_trim - maybe trims the given skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ *
+ * Checks whether the given skb has data beyond the given transport length.
+ * If so, returns a cloned skb trimmed to this transport length.
+ * Otherwise returns the provided skb. Returns NULL in error cases
+ * (e.g. transport_len exceeds skb length or out-of-memory).
+ *
+ * Caller needs to set the skb transport header and free any returned skb if it
+ * differs from the provided skb.
+ */
+static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
+ unsigned int transport_len)
+{
+ struct sk_buff *skb_chk;
+ unsigned int len = skb_transport_offset(skb) + transport_len;
+ int ret;
+
+ if (skb->len < len)
+ return NULL;
+ else if (skb->len == len)
+ return skb;
+
+ skb_chk = skb_clone(skb, GFP_ATOMIC);
+ if (!skb_chk)
+ return NULL;
+
+ ret = pskb_trim_rcsum(skb_chk, len);
+ if (ret) {
+ kfree_skb(skb_chk);
+ return NULL;
+ }
+
+ return skb_chk;
+}
+
+/**
+ * skb_checksum_trimmed - validate checksum of an skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ * @skb_chkf: checksum function to use
+ *
+ * Applies the given checksum function skb_chkf to the provided skb.
+ * Returns a checked and maybe trimmed skb. Returns NULL on error.
+ *
+ * If the skb has data beyond the given transport length, then a
+ * trimmed & cloned skb is checked and returned.
+ *
+ * Caller needs to set the skb transport header and free any returned skb if it
+ * differs from the provided skb.
+ */
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+ unsigned int transport_len,
+ __sum16(*skb_chkf)(struct sk_buff *skb))
+{
+ struct sk_buff *skb_chk;
+ unsigned int offset = skb_transport_offset(skb);
+ __sum16 ret;
+
+ skb_chk = skb_checksum_maybe_trim(skb, transport_len);
+ if (!skb_chk)
+ goto err;
+
+ if (!pskb_may_pull(skb_chk, offset))
+ goto err;
+
+ __skb_pull(skb_chk, offset);
+ ret = skb_chkf(skb_chk);
+ __skb_push(skb_chk, offset);
+
+ if (ret)
+ goto err;
+
+ return skb_chk;
+
+err:
+ if (skb_chk && skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ return NULL;
+
+}
+EXPORT_SYMBOL(skb_checksum_trimmed);
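
As the kernel-doc above notes, callers of skb_checksum_trimmed() set the transport header first and must free the returned skb when it is a trimmed clone rather than the original. A condensed sketch of that calling convention (hypothetical validation helper; the checksum function itself is passed in):

static int my_validate_csum(struct sk_buff *skb, unsigned int transport_len,
			    __sum16 (*chkf)(struct sk_buff *skb))
{
	struct sk_buff *skb_chk;

	/* The transport header must already point at the header being checked. */
	skb_chk = skb_checksum_trimmed(skb, transport_len, chkf);
	if (!skb_chk)
		return -EINVAL;

	if (skb_chk != skb)
		kfree_skb(skb_chk);	/* only the validation result was needed */

	return 0;
}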
+
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
@@ -4201,7 +4283,8 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
return NULL;
}
- memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
+ memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
+ 2 * ETH_ALEN);
skb->mac_header += VLAN_HLEN;
return skb;
}
@@ -4385,7 +4468,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
return NULL;
gfp_head = gfp_mask;
- if (gfp_head & __GFP_WAIT)
+ if (gfp_head & __GFP_DIRECT_RECLAIM)
gfp_head |= __GFP_REPEAT;
*errcode = -ENOBUFS;
@@ -4400,7 +4483,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
while (order) {
if (npages >= 1 << order) {
- page = alloc_pages((gfp_mask & ~__GFP_WAIT) |
+ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
__GFP_COMP |
__GFP_NOWARN |
__GFP_NORETRY,
diff --git a/kernel/net/core/sock.c b/kernel/net/core/sock.c
index 6317c7149..9c3234299 100644
--- a/kernel/net/core/sock.c
+++ b/kernel/net/core/sock.c
@@ -131,6 +131,7 @@
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
+#include <linux/sock_diag.h>
#include <linux/filter.h>
@@ -421,13 +422,23 @@ static void sock_warn_obsolete_bsdism(const char *name)
}
}
-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+static bool sock_needs_netstamp(const struct sock *sk)
+{
+ switch (sk->sk_family) {
+ case AF_UNSPEC:
+ case AF_UNIX:
+ return false;
+ default:
+ return true;
+ }
+}
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
if (sk->sk_flags & flags) {
sk->sk_flags &= ~flags;
- if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
+ if (sock_needs_netstamp(sk) &&
+ !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
net_disable_timestamp();
}
}
@@ -861,7 +872,8 @@ set_rcvbuf:
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM) {
if (sk->sk_state != TCP_ESTABLISHED) {
ret = -EINVAL;
break;
@@ -987,6 +999,10 @@ set_rcvbuf:
sk->sk_max_pacing_rate);
break;
+ case SO_INCOMING_CPU:
+ sk->sk_incoming_cpu = val;
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1393,9 +1409,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx);
* @family: protocol family
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
* @prot: struct proto associated with this new sock instance
+ * @kern: is this to be a kernel socket?
*/
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
- struct proto *prot)
+ struct proto *prot, int kern)
{
struct sock *sk;
@@ -1408,7 +1425,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
- sock_net_set(sk, get_net(net));
+ sk->sk_net_refcnt = kern ? 0 : 1;
+ if (likely(sk->sk_net_refcnt))
+ get_net(net);
+ sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
@@ -1419,7 +1439,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
}
EXPORT_SYMBOL(sk_alloc);
-static void __sk_free(struct sock *sk)
+void sk_destruct(struct sock *sk)
{
struct sk_filter *filter;
@@ -1442,10 +1462,19 @@ static void __sk_free(struct sock *sk)
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
- put_net(sock_net(sk));
+ if (likely(sk->sk_net_refcnt))
+ put_net(sock_net(sk));
sk_prot_free(sk->sk_prot_creator, sk);
}
+static void __sk_free(struct sock *sk)
+{
+ if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
+ sock_diag_broadcast_destroy(sk);
+ else
+ sk_destruct(sk);
+}
+
void sk_free(struct sock *sk)
{
/*
@@ -1458,25 +1487,6 @@ void sk_free(struct sock *sk)
}
EXPORT_SYMBOL(sk_free);
-/*
- * Last sock_put should drop reference to sk->sk_net. It has already
- * been dropped in sk_change_net. Taking reference to stopping namespace
- * is not an option.
- * Take reference to a socket to remove it from hash _alive_ and after that
- * destroy it in the context of init_net.
- */
-void sk_release_kernel(struct sock *sk)
-{
- if (sk == NULL || sk->sk_socket == NULL)
- return;
-
- sock_hold(sk);
- sock_release(sk->sk_socket);
- sock_net_set(sk, get_net(&init_net));
- sock_put(sk);
-}
-EXPORT_SYMBOL(sk_release_kernel);
-
static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
@@ -1502,7 +1512,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
sock_copy(newsk, sk);
/* SANITY */
- get_net(sock_net(newsk));
+ if (likely(newsk->sk_net_refcnt))
+ get_net(sock_net(newsk));
sk_node_init(&newsk->sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
@@ -1518,7 +1529,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
skb_queue_head_init(&newsk->sk_receive_queue);
skb_queue_head_init(&newsk->sk_write_queue);
- spin_lock_init(&newsk->sk_dst_lock);
rwlock_init(&newsk->sk_callback_lock);
lockdep_set_class_and_name(&newsk->sk_callback_lock,
af_callback_keys + newsk->sk_family,
@@ -1541,7 +1551,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
*/
is_charged = sk_filter_charge(newsk, filter);
- if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
@@ -1582,7 +1592,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
if (newsk->sk_prot->sockets_allocated)
sk_sockets_allocated_inc(newsk);
- if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ if (sock_needs_netstamp(sk) &&
+ newsk->sk_flags & SK_FLAGS_TIMESTAMP)
net_enable_timestamp();
}
out:
@@ -1592,7 +1603,9 @@ EXPORT_SYMBOL_GPL(sk_clone_lock);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
- __sk_dst_set(sk, dst);
+ u32 max_segs = 1;
+
+ sk_dst_set(sk, dst);
sk->sk_route_caps = dst->dev->features;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
@@ -1603,9 +1616,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
sk->sk_gso_max_size = dst->dev->gso_max_size;
- sk->sk_gso_max_segs = dst->dev->gso_max_segs;
+ max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
}
}
+ sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -1640,6 +1654,28 @@ void sock_wfree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_wfree);
+void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ skb->sk = sk;
+#ifdef CONFIG_INET
+ if (unlikely(!sk_fullsock(sk))) {
+ skb->destructor = sock_edemux;
+ sock_hold(sk);
+ return;
+ }
+#endif
+ skb->destructor = sock_wfree;
+ skb_set_hash_from_sk(skb, sk);
+ /*
+	 * We used to take a refcount on sk, but the following operation
+	 * is enough to guarantee sk_free() won't free this sock until
+	 * all in-flight packets are completed
+ */
+ atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+}
+EXPORT_SYMBOL(skb_set_owner_w);
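
skb_set_owner_w() now also accepts request/timewait sockets: for a non-full socket it takes a reference and installs sock_edemux as the destructor, otherwise it charges sk_wmem_alloc and uses sock_wfree. Typical use is to claim a freshly built reply skb for a socket before transmission; a sketch with a hypothetical headroom choice:

static struct sk_buff *my_build_reply(struct sock *sk, unsigned int len)
{
	struct sk_buff *skb = alloc_skb(MAX_HEADER + len, GFP_ATOMIC);

	if (!skb)
		return NULL;

	skb_reserve(skb, MAX_HEADER);
	skb_set_owner_w(skb, sk);	/* charged against sk->sk_wmem_alloc */
	return skb;
}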
+
void skb_orphan_partial(struct sk_buff *skb)
{
/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
@@ -1777,7 +1813,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
DEFINE_WAIT(wait);
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
for (;;) {
if (!timeo)
break;
@@ -1823,7 +1859,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
break;
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
if (!timeo)
@@ -1853,6 +1889,32 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
}
EXPORT_SYMBOL(sock_alloc_send_skb);
+int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
+ struct sockcm_cookie *sockc)
+{
+ struct cmsghdr *cmsg;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ continue;
+ switch (cmsg->cmsg_type) {
+ case SO_MARK:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL(sock_cmsg_send);
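
sock_cmsg_send() walks the SOL_SOCKET control messages of a sendmsg() call and, for now, only accepts SO_MARK (which requires CAP_NET_ADMIN), filling the cookie it is given. A sendmsg-path caller sketch (hypothetical function; protocol-level cmsgs are handled elsewhere):

static int my_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sockcm_cookie sockc = { .mark = sk->sk_mark };
	int err;

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}

	/* ... build the packet, using sockc.mark for skb->mark / route lookup ... */
	return 0;
}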
+
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER get_order(32768)
@@ -1880,8 +1942,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
pfrag->offset = 0;
if (SKB_FRAG_PAGE_ORDER) {
- pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
- __GFP_NOWARN | __GFP_NORETRY,
+ /* Avoid direct reclaim but allow kswapd to wake */
+ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP | __GFP_NOWARN |
+ __GFP_NORETRY,
SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
@@ -1969,21 +2033,22 @@ static void __release_sock(struct sock *sk)
* sk_wait_data - wait for data to arrive at sk_receive_queue
* @sk: sock to wait on
* @timeo: for how long
+ * @skb: last skb seen on sk_receive_queue
*
* Now socket state including sk->sk_err is changed only under lock,
* hence we may omit checks after joining wait queue.
* We check receive queue before schedule() only as optimization;
* it is very likely that release_sock() added new data.
*/
-int sk_wait_data(struct sock *sk, long *timeo)
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
int rc;
DEFINE_WAIT(wait);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
finish_wait(sk_sleep(sk), &wait);
return rc;
}
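
sk_wait_data() now takes the last skb the caller has already seen on sk_receive_queue and returns once the queue tail differs from it, instead of merely waiting for the queue to become non-empty; this matters when a partially consumed skb is still sitting at the tail. A receive-loop sketch (hypothetical wrapper):

static int my_wait_for_more_data(struct sock *sk, long *timeo)
{
	/* Remember what has already been seen, then sleep until the tail changes. */
	const struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);

	return sk_wait_data(sk, timeo, tail);
}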
@@ -2078,14 +2143,15 @@ suppress_allocation:
EXPORT_SYMBOL(__sk_mem_schedule);
/**
- * __sk_reclaim - reclaim memory_allocated
+ * __sk_mem_reclaim - reclaim memory_allocated
* @sk: socket
+ * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
*/
-void __sk_mem_reclaim(struct sock *sk)
+void __sk_mem_reclaim(struct sock *sk, int amount)
{
- sk_memory_allocated_sub(sk,
- sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
- sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ sk_memory_allocated_sub(sk, amount);
+ sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
if (sk_under_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
@@ -2270,7 +2336,6 @@ static void sock_def_write_space(struct sock *sk)
static void sock_def_destruct(struct sock *sk)
{
- kfree(sk->sk_protinfo);
}
void sk_send_sigurg(struct sock *sk)
@@ -2321,7 +2386,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
} else
sk->sk_wq = NULL;
- spin_lock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
@@ -2353,6 +2417,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_max_pacing_rate = ~0U;
sk->sk_pacing_rate = ~0U;
+ sk->sk_incoming_cpu = -1;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
@@ -2478,7 +2543,8 @@ void sock_enable_timestamp(struct sock *sk, int flag)
* time stamping, but time stamping might have been on
* already because of the other one
*/
- if (!(previous_flags & SK_FLAGS_TIMESTAMP))
+ if (sock_needs_netstamp(sk) &&
+ !(previous_flags & SK_FLAGS_TIMESTAMP))
net_enable_timestamp();
}
}
@@ -2739,10 +2805,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
return;
kfree(rsk_prot->slab_name);
rsk_prot->slab_name = NULL;
- if (rsk_prot->slab) {
- kmem_cache_destroy(rsk_prot->slab);
- rsk_prot->slab = NULL;
- }
+ kmem_cache_destroy(rsk_prot->slab);
+ rsk_prot->slab = NULL;
}
static int req_prot_init(const struct proto *prot)
@@ -2759,7 +2823,7 @@ static int req_prot_init(const struct proto *prot)
rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
rsk_prot->obj_size, 0,
- 0, NULL);
+ prot->slab_flags, NULL);
if (!rsk_prot->slab) {
pr_crit("%s: Can't create request sock SLAB cache!\n",
@@ -2827,10 +2891,8 @@ void proto_unregister(struct proto *prot)
list_del(&prot->node);
mutex_unlock(&proto_list_mutex);
- if (prot->slab != NULL) {
- kmem_cache_destroy(prot->slab);
- prot->slab = NULL;
- }
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
req_prot_cleanup(prot->rsk_prot);
diff --git a/kernel/net/core/sock_diag.c b/kernel/net/core/sock_diag.c
index 556ecf96a..0c1d58d43 100644
--- a/kernel/net/core/sock_diag.c
+++ b/kernel/net/core/sock_diag.c
@@ -1,3 +1,5 @@
+/* License: GPL */
+
#include <linux/mutex.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
@@ -5,6 +7,9 @@
#include <net/net_namespace.h>
#include <linux/module.h>
#include <net/sock.h>
+#include <linux/kernel.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
@@ -12,6 +17,7 @@
static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
+static struct workqueue_struct *broadcast_wq;
static u64 sock_gen_cookie(struct sock *sk)
{
@@ -104,6 +110,62 @@ out:
}
EXPORT_SYMBOL(sock_diag_put_filterinfo);
+struct broadcast_sk {
+ struct sock *sk;
+ struct work_struct work;
+};
+
+static size_t sock_diag_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct inet_diag_msg)
+ + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */
+ + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
+}
+
+static void sock_diag_broadcast_destroy_work(struct work_struct *work)
+{
+ struct broadcast_sk *bsk =
+ container_of(work, struct broadcast_sk, work);
+ struct sock *sk = bsk->sk;
+ const struct sock_diag_handler *hndl;
+ struct sk_buff *skb;
+ const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+ int err = -1;
+
+ WARN_ON(group == SKNLGRP_NONE);
+
+ skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ mutex_lock(&sock_diag_table_mutex);
+ hndl = sock_diag_handlers[sk->sk_family];
+ if (hndl && hndl->get_info)
+ err = hndl->get_info(skb, sk);
+ mutex_unlock(&sock_diag_table_mutex);
+
+ if (!err)
+ nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
+ GFP_KERNEL);
+ else
+ kfree_skb(skb);
+out:
+ sk_destruct(sk);
+ kfree(bsk);
+}
+
+void sock_diag_broadcast_destroy(struct sock *sk)
+{
+ /* Note, this function is often called from an interrupt context. */
+ struct broadcast_sk *bsk =
+ kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC);
+ if (!bsk)
+ return sk_destruct(sk);
+ bsk->sk = sk;
+ INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work);
+ queue_work(broadcast_wq, &bsk->work);
+}
+
void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
{
mutex_lock(&sock_diag_table_mutex);
@@ -214,10 +276,32 @@ static void sock_diag_rcv(struct sk_buff *skb)
mutex_unlock(&sock_diag_mutex);
}
+static int sock_diag_bind(struct net *net, int group)
+{
+ switch (group) {
+ case SKNLGRP_INET_TCP_DESTROY:
+ case SKNLGRP_INET_UDP_DESTROY:
+ if (!sock_diag_handlers[AF_INET])
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET);
+ break;
+ case SKNLGRP_INET6_TCP_DESTROY:
+ case SKNLGRP_INET6_UDP_DESTROY:
+ if (!sock_diag_handlers[AF_INET6])
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+				       NETLINK_SOCK_DIAG, AF_INET6);
+ break;
+ }
+ return 0;
+}
+
static int __net_init diag_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
+ .groups = SKNLGRP_MAX,
.input = sock_diag_rcv,
+ .bind = sock_diag_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
};
net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
@@ -237,15 +321,8 @@ static struct pernet_operations diag_net_ops = {
static int __init sock_diag_init(void)
{
+ broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0);
+ BUG_ON(!broadcast_wq);
return register_pernet_subsys(&diag_net_ops);
}
-
-static void __exit sock_diag_exit(void)
-{
- unregister_pernet_subsys(&diag_net_ops);
-}
-
-module_init(sock_diag_init);
-module_exit(sock_diag_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);
+device_initcall(sock_diag_init);
diff --git a/kernel/net/core/stream.c b/kernel/net/core/stream.c
index 301c05f26..b96f7a79e 100644
--- a/kernel/net/core/stream.c
+++ b/kernel/net/core/stream.c
@@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk)
wake_up_interruptible_poll(&wq->wait, POLLOUT |
POLLWRNORM | POLLWRBAND);
if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
- sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock();
}
}
@@ -119,23 +119,27 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
int err = 0;
long vm_wait = 0;
long current_timeo = *timeo_p;
+ bool noblock = (*timeo_p ? false : true);
DEFINE_WAIT(wait);
if (sk_stream_memory_free(sk))
current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
while (1) {
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
- if (!*timeo_p)
+ if (!*timeo_p) {
+ if (noblock)
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
goto do_nonblock;
+ }
if (signal_pending(current))
goto do_interrupted;
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
if (sk_stream_memory_free(sk) && !vm_wait)
break;
diff --git a/kernel/net/core/sysctl_net_core.c b/kernel/net/core/sysctl_net_core.c
index 95b6139d7..a6beb7b6a 100644
--- a/kernel/net/core/sysctl_net_core.c
+++ b/kernel/net/core/sysctl_net_core.c
@@ -26,6 +26,7 @@ static int zero = 0;
static int one = 1;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
+static int max_skb_frags = MAX_SKB_FRAGS;
static int net_msg_warn; /* Unused, but still a sysctl */
@@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "max_skb_frags",
+ .data = &sysctl_max_skb_frags,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &max_skb_frags,
+ },
{ }
};
diff --git a/kernel/net/core/timestamping.c b/kernel/net/core/timestamping.c
index 43d3dd62f..42689d5c4 100644
--- a/kernel/net/core/timestamping.c
+++ b/kernel/net/core/timestamping.c
@@ -60,11 +60,15 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
struct phy_device *phydev;
unsigned int type;
+ if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv)
+ return false;
+
if (skb_headroom(skb) < ETH_HLEN)
return false;
+
__skb_push(skb, ETH_HLEN);
- type = classify(skb);
+ type = ptp_classify_raw(skb);
__skb_pull(skb, ETH_HLEN);
diff --git a/kernel/net/core/tso.c b/kernel/net/core/tso.c
index 630b30b4f..5dca7ce8e 100644
--- a/kernel/net/core/tso.c
+++ b/kernel/net/core/tso.c
@@ -1,4 +1,5 @@
#include <linux/export.h>
+#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/tso.h>
#include <asm/unaligned.h>
@@ -14,18 +15,24 @@ EXPORT_SYMBOL(tso_count_descs);
void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
int size, bool is_last)
{
- struct iphdr *iph;
struct tcphdr *tcph;
int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
int mac_hdr_len = skb_network_offset(skb);
memcpy(hdr, skb->data, hdr_len);
- iph = (struct iphdr *)(hdr + mac_hdr_len);
- iph->id = htons(tso->ip_id);
- iph->tot_len = htons(size + hdr_len - mac_hdr_len);
+ if (!tso->ipv6) {
+ struct iphdr *iph = (void *)(hdr + mac_hdr_len);
+
+ iph->id = htons(tso->ip_id);
+ iph->tot_len = htons(size + hdr_len - mac_hdr_len);
+ tso->ip_id++;
+ } else {
+ struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len);
+
+ iph->payload_len = htons(size + tcp_hdrlen(skb));
+ }
tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
put_unaligned_be32(tso->tcp_seq, &tcph->seq);
- tso->ip_id++;
if (!is_last) {
/* Clear all special flags for not last packet */
@@ -61,6 +68,7 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso)
tso->ip_id = ntohs(ip_hdr(skb)->id);
tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
tso->next_frag_idx = 0;
+ tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6);
/* Build first data */
tso->size = skb_headlen(skb) - hdr_len;
diff --git a/kernel/net/core/utils.c b/kernel/net/core/utils.c
index 7b803884c..3d17ca8b4 100644
--- a/kernel/net/core/utils.c
+++ b/kernel/net/core/utils.c
@@ -301,22 +301,24 @@ out:
EXPORT_SYMBOL(in6_pton);
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
- __be32 from, __be32 to, int pseudohdr)
+ __be32 from, __be32 to, bool pseudohdr)
{
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from),
- to));
+ csum_replace4(sum, from, to);
if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to);
+ skb->csum = ~csum_add(csum_sub(~(skb->csum),
+ (__force __wsum)from),
+ (__force __wsum)to);
} else if (pseudohdr)
- *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from),
- to));
+ *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum),
+ (__force __wsum)from),
+ (__force __wsum)to));
}
EXPORT_SYMBOL(inet_proto_csum_replace4);
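
inet_proto_csum_replace4() incrementally patches a 16-bit transport checksum when one 32-bit word changes, and with pseudohdr set it also keeps skb->csum coherent for CHECKSUM_COMPLETE packets. A NAT-style sketch (hypothetical helper) that rewrites an IPv4 destination and fixes both checksums in place:

static void my_rewrite_daddr(struct sk_buff *skb, struct iphdr *iph,
			     struct tcphdr *tcph, __be32 new_daddr)
{
	/* daddr is part of the TCP pseudo-header, so pseudohdr = true. */
	inet_proto_csum_replace4(&tcph->check, skb, iph->daddr, new_daddr, true);
	csum_replace4(&iph->check, iph->daddr, new_daddr);
	iph->daddr = new_daddr;
}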
void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
const __be32 *from, const __be32 *to,
- int pseudohdr)
+ bool pseudohdr)
{
__be32 diff[] = {
~from[0], ~from[1], ~from[2], ~from[3],
@@ -334,51 +336,15 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
}
EXPORT_SYMBOL(inet_proto_csum_replace16);
-struct __net_random_once_work {
- struct work_struct work;
- struct static_key *key;
-};
-
-static void __net_random_once_deferred(struct work_struct *w)
-{
- struct __net_random_once_work *work =
- container_of(w, struct __net_random_once_work, work);
- BUG_ON(!static_key_enabled(work->key));
- static_key_slow_dec(work->key);
- kfree(work);
-}
-
-static void __net_random_once_disable_jump(struct static_key *key)
+void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
+ __wsum diff, bool pseudohdr)
{
- struct __net_random_once_work *w;
-
- w = kmalloc(sizeof(*w), GFP_ATOMIC);
- if (!w)
- return;
-
- INIT_WORK(&w->work, __net_random_once_deferred);
- w->key = key;
- schedule_work(&w->work);
-}
-
-bool __net_get_random_once(void *buf, int nbytes, bool *done,
- struct static_key *once_key)
-{
- static DEFINE_SPINLOCK(lock);
- unsigned long flags;
-
- spin_lock_irqsave(&lock, flags);
- if (*done) {
- spin_unlock_irqrestore(&lock, flags);
- return false;
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum)));
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_add(diff, ~skb->csum);
+ } else if (pseudohdr) {
+ *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
}
-
- get_random_bytes(buf, nbytes);
- *done = true;
- spin_unlock_irqrestore(&lock, flags);
-
- __net_random_once_disable_jump(once_key);
-
- return true;
}
-EXPORT_SYMBOL(__net_get_random_once);
+EXPORT_SYMBOL(inet_proto_csum_replace_by_diff);
diff --git a/kernel/net/dcb/dcbnl.c b/kernel/net/dcb/dcbnl.c
index 5b21f6f88..4f6c1862d 100644
--- a/kernel/net/dcb/dcbnl.c
+++ b/kernel/net/dcb/dcbnl.c
@@ -13,6 +13,7 @@
* You should have received a copy of the GNU General Public License along with
* this program; if not, see <http://www.gnu.org/licenses/>.
*
+ * Description: Data Center Bridging netlink interface
* Author: Lucy Liu <lucy.liu@intel.com>
*/
@@ -24,7 +25,7 @@
#include <linux/dcbnl.h>
#include <net/dcbevent.h>
#include <linux/rtnetlink.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <net/sock.h>
/* Data Center Bridging (DCB) is a collection of Ethernet enhancements
@@ -48,10 +49,6 @@
* features for capable devices.
*/
-MODULE_AUTHOR("Lucy Liu, <lucy.liu@intel.com>");
-MODULE_DESCRIPTION("Data Center Bridging netlink interface");
-MODULE_LICENSE("GPL");
-
/**************** DCB attribute policies *************************************/
/* DCB netlink attributes policy */
@@ -1935,19 +1932,6 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
}
EXPORT_SYMBOL(dcb_ieee_delapp);
-static void dcb_flushapp(void)
-{
- struct dcb_app_type *app;
- struct dcb_app_type *tmp;
-
- spin_lock_bh(&dcb_lock);
- list_for_each_entry_safe(app, tmp, &dcb_app_list, list) {
- list_del(&app->list);
- kfree(app);
- }
- spin_unlock_bh(&dcb_lock);
-}
-
static int __init dcbnl_init(void)
{
INIT_LIST_HEAD(&dcb_app_list);
@@ -1957,12 +1941,4 @@ static int __init dcbnl_init(void)
return 0;
}
-module_init(dcbnl_init);
-
-static void __exit dcbnl_exit(void)
-{
- rtnl_unregister(PF_UNSPEC, RTM_GETDCB);
- rtnl_unregister(PF_UNSPEC, RTM_SETDCB);
- dcb_flushapp();
-}
-module_exit(dcbnl_exit);
+device_initcall(dcbnl_init);
diff --git a/kernel/net/dccp/ackvec.c b/kernel/net/dccp/ackvec.c
index bd9e718c2..3de0d0362 100644
--- a/kernel/net/dccp/ackvec.c
+++ b/kernel/net/dccp/ackvec.c
@@ -398,12 +398,8 @@ out_err:
void dccp_ackvec_exit(void)
{
- if (dccp_ackvec_slab != NULL) {
- kmem_cache_destroy(dccp_ackvec_slab);
- dccp_ackvec_slab = NULL;
- }
- if (dccp_ackvec_record_slab != NULL) {
- kmem_cache_destroy(dccp_ackvec_record_slab);
- dccp_ackvec_record_slab = NULL;
- }
+ kmem_cache_destroy(dccp_ackvec_slab);
+ dccp_ackvec_slab = NULL;
+ kmem_cache_destroy(dccp_ackvec_record_slab);
+ dccp_ackvec_record_slab = NULL;
}
diff --git a/kernel/net/dccp/ccid.c b/kernel/net/dccp/ccid.c
index 834989751..90f77d08c 100644
--- a/kernel/net/dccp/ccid.c
+++ b/kernel/net/dccp/ccid.c
@@ -95,8 +95,7 @@ static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_f
static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
{
- if (slab != NULL)
- kmem_cache_destroy(slab);
+ kmem_cache_destroy(slab);
}
static int __init ccid_activate(struct ccid_operations *ccid_ops)
diff --git a/kernel/net/dccp/dccp.h b/kernel/net/dccp/dccp.h
index bebc735f5..b0e28d24e 100644
--- a/kernel/net/dccp/dccp.h
+++ b/kernel/net/dccp/dccp.h
@@ -229,7 +229,7 @@ void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
int dccp_retransmit_skb(struct sock *sk);
void dccp_send_ack(struct sock *sk);
-void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *rsk);
void dccp_send_sync(struct sock *sk, const u64 seq,
@@ -270,15 +270,17 @@ int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
-struct sock *dccp_create_openreq_child(struct sock *sk,
+struct sock *dccp_create_openreq_child(const struct sock *sk,
const struct request_sock *req,
const struct sk_buff *skb);
int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst);
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req);
struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
@@ -293,7 +295,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
void dccp_destroy_sock(struct sock *sk);
void dccp_close(struct sock *sk, long timeout);
-struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req);
int dccp_connect(struct sock *sk);
@@ -325,13 +327,13 @@ void dccp_send_close(struct sock *sk, const int active);
int dccp_invalid_packet(struct sk_buff *skb);
u32 dccp_sample_rtt(struct sock *sk, long delta);
-static inline int dccp_bad_service_code(const struct sock *sk,
+static inline bool dccp_bad_service_code(const struct sock *sk,
const __be32 service)
{
const struct dccp_sock *dp = dccp_sk(sk);
if (dp->dccps_service == service)
- return 0;
+ return false;
return !dccp_list_has_service(dp->dccps_service_list, service);
}
diff --git a/kernel/net/dccp/diag.c b/kernel/net/dccp/diag.c
index 5a45f8de5..2d84303ea 100644
--- a/kernel/net/dccp/diag.c
+++ b/kernel/net/dccp/diag.c
@@ -66,6 +66,7 @@ static const struct inet_diag_handler dccp_diag_handler = {
.dump_one = dccp_diag_dump_one,
.idiag_get_info = dccp_diag_get_info,
.idiag_type = IPPROTO_DCCP,
+ .idiag_info_size = sizeof(struct tcp_info),
};
static int __init dccp_diag_init(void)
diff --git a/kernel/net/dccp/ipv4.c b/kernel/net/dccp/ipv4.c
index ccf4c5629..902d60632 100644
--- a/kernel/net/dccp/ipv4.c
+++ b/kernel/net/dccp/ipv4.c
@@ -208,7 +208,6 @@ void dccp_req_err(struct sock *sk, u64 seq)
if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
- reqsk_put(req);
} else {
/*
* Still in RESPOND, just remove it silently.
@@ -218,6 +217,7 @@ void dccp_req_err(struct sock *sk, u64 seq)
*/
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
}
+ reqsk_put(req);
}
EXPORT_SYMBOL(dccp_req_err);
@@ -390,9 +390,12 @@ static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb)
*
* This is the equivalent of TCP's tcp_v4_syn_recv_sock
*/
-struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
+ struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst)
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
@@ -425,7 +428,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
- __inet_hash_nolisten(newsk, NULL);
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
return newsk;
@@ -443,36 +446,6 @@ put_and_exit:
}
EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
-static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- const struct iphdr *iph = ip_hdr(skb);
- struct sock *nsk;
- /* Find possible connection requests. */
- struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport,
- iph->saddr, iph->daddr);
- if (req) {
- nsk = dccp_check_req(sk, skb, req);
- if (!nsk)
- reqsk_put(req);
- return nsk;
- }
- nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
- iph->saddr, dh->dccph_sport,
- iph->daddr, dh->dccph_dport,
- inet_iif(skb));
- if (nsk != NULL) {
- if (nsk->sk_state != DCCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-
- return sk;
-}
-
static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
@@ -498,7 +471,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
return &rt->dst;
}
-static int dccp_v4_send_response(struct sock *sk, struct request_sock *req)
+static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req)
{
int err = -1;
struct sk_buff *skb;
@@ -527,7 +500,7 @@ out:
return err;
}
-static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
{
int err;
const struct iphdr *rxiph;
@@ -624,7 +597,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = inet_reqsk_alloc(&dccp_request_sock_ops, sk);
+ req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true);
if (req == NULL)
goto drop;
@@ -704,18 +677,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
* NOTE: the check for the packet types is done in
* dccp_rcv_state_process
*/
- if (sk->sk_state == DCCP_LISTEN) {
- struct sock *nsk = dccp_v4_hnd_req(sk, skb);
-
- if (nsk == NULL)
- goto discard;
-
- if (nsk != sk) {
- if (dccp_child_process(sk, nsk, skb))
- goto reset;
- return 0;
- }
- }
if (dccp_rcv_state_process(sk, skb, dh, skb->len))
goto reset;
@@ -723,7 +684,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
reset:
dccp_v4_ctl_send_reset(sk, skb);
-discard:
kfree_skb(skb);
return 0;
}
@@ -841,15 +801,10 @@ static int dccp_v4_rcv(struct sk_buff *skb)
DCCP_SKB_CB(skb)->dccpd_ack_seq);
}
- /* Step 2:
- * Look up flow ID in table and get corresponding socket */
+lookup:
sk = __inet_lookup_skb(&dccp_hashinfo, skb,
dh->dccph_sport, dh->dccph_dport);
- /*
- * Step 2:
- * If no socket ...
- */
- if (sk == NULL) {
+ if (!sk) {
dccp_pr_debug("failed to look up flow ID in table and "
"get corresponding socket\n");
goto no_dccp_socket;
@@ -867,6 +822,31 @@ static int dccp_v4_rcv(struct sk_buff *skb)
goto no_dccp_socket;
}
+ if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ struct sock *nsk;
+
+ sk = req->rsk_listener;
+ if (unlikely(sk->sk_state != DCCP_LISTEN)) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sock_hold(sk);
+ nsk = dccp_check_req(sk, skb, req);
+ if (!nsk) {
+ reqsk_put(req);
+ goto discard_and_relse;
+ }
+ if (nsk == sk) {
+ reqsk_put(req);
+ } else if (dccp_child_process(sk, nsk, skb)) {
+ dccp_v4_ctl_send_reset(sk, skb);
+ goto discard_and_relse;
+ } else {
+ sock_put(sk);
+ return 0;
+ }
+ }
/*
* RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
* o if MinCsCov = 0, only packets with CsCov = 0 are accepted
diff --git a/kernel/net/dccp/ipv6.c b/kernel/net/dccp/ipv6.c
index 5165571f3..b8608b71a 100644
--- a/kernel/net/dccp/ipv6.c
+++ b/kernel/net/dccp/ipv6.c
@@ -181,7 +181,7 @@ out:
}
-static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
+static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -202,7 +202,9 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
security_req_classify_flow(req, flowi6_to_flowi(&fl6));
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
@@ -219,7 +221,10 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
&ireq->ir_v6_loc_addr,
&ireq->ir_v6_rmt_addr);
fl6.daddr = ireq->ir_v6_rmt_addr;
- err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
+ rcu_read_lock();
+ err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
+ np->tclass);
+ rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -234,7 +239,7 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req)
kfree_skb(inet_rsk(req)->pktopts);
}
-static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
{
const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
@@ -290,37 +295,6 @@ static struct request_sock_ops dccp6_request_sock_ops = {
.syn_ack_timeout = dccp_syn_ack_timeout,
};
-static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct request_sock *req;
- struct sock *nsk;
-
- req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr,
- &iph->daddr, inet6_iif(skb));
- if (req) {
- nsk = dccp_check_req(sk, skb, req);
- if (!nsk)
- reqsk_put(req);
- return nsk;
- }
- nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
- &iph->saddr, dh->dccph_sport,
- &iph->daddr, ntohs(dh->dccph_dport),
- inet6_iif(skb));
- if (nsk != NULL) {
- if (nsk->sk_state != DCCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-
- return sk;
-}
-
static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct request_sock *req;
@@ -350,7 +324,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk);
+ req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true);
if (req == NULL)
goto drop;
@@ -398,7 +372,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (dccp_v6_send_response(sk, req))
goto drop_and_free;
- inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+ inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
return 0;
drop_and_free:
@@ -408,13 +382,17 @@ drop:
return -1;
}
-static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
+static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst)
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
{
struct inet_request_sock *ireq = inet_rsk(req);
- struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+ struct ipv6_pinfo *newnp;
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt;
struct inet_sock *newinet;
struct dccp6_sock *newdp6;
struct sock *newsk;
@@ -423,7 +401,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
/*
* v6 mapped
*/
- newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
+ newsk = dccp_v4_request_recv_sock(sk, skb, req, dst,
+ req_unhash, own_req);
if (newsk == NULL)
return NULL;
@@ -462,22 +441,11 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
if (sk_acceptq_is_full(sk))
goto out_overflow;
- if (dst == NULL) {
- struct in6_addr *final_p, final;
+ if (!dst) {
struct flowi6 fl6;
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_DCCP;
- fl6.daddr = ireq->ir_v6_rmt_addr;
- final_p = fl6_update_dst(&fl6, np->opt, &final);
- fl6.saddr = ireq->ir_v6_loc_addr;
- fl6.flowi6_oif = sk->sk_bound_dev_if;
- fl6.fl6_dport = ireq->ir_rmt_port;
- fl6.fl6_sport = htons(ireq->ir_num);
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
-
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
- if (IS_ERR(dst))
+ dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP);
+ if (!dst)
goto out;
}
@@ -491,7 +459,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
* comment in that function for the gory details. -acme
*/
- __ip6_dst_store(newsk, dst, NULL, NULL);
+ ip6_dst_store(newsk, dst, NULL, NULL);
newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
NETIF_F_TSO);
newdp6 = (struct dccp6_sock *)newsk;
@@ -515,15 +483,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
- /* Clone pktoptions received with SYN */
newnp->pktoptions = NULL;
- if (ireq->pktopts != NULL) {
- newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
- consume_skb(ireq->pktopts);
- ireq->pktopts = NULL;
- if (newnp->pktoptions)
- skb_set_owner_r(newnp->pktoptions, newsk);
- }
newnp->opt = NULL;
newnp->mcast_oif = inet6_iif(skb);
newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
@@ -534,13 +494,15 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
* Yes, keeping reference count would be much more clever, but we make
* one more one thing there: reattach optmem to newsk.
*/
- if (np->opt != NULL)
- newnp->opt = ipv6_dup_options(newsk, np->opt);
-
+ opt = rcu_dereference(np->opt);
+ if (opt) {
+ opt = ipv6_dup_options(newsk, opt);
+ RCU_INIT_POINTER(newnp->opt, opt);
+ }
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (newnp->opt != NULL)
- inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
- newnp->opt->opt_flen);
+ if (opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
+ opt->opt_flen;
dccp_sync_mss(newsk, dst_mtu(dst));
@@ -552,7 +514,15 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
dccp_done(newsk);
goto out;
}
- __inet_hash(newsk, NULL);
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ /* Clone pktoptions received with SYN, if we own the req */
+ if (*own_req && ireq->pktopts) {
+ newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
+ consume_skb(ireq->pktopts);
+ ireq->pktopts = NULL;
+ if (newnp->pktoptions)
+ skb_set_owner_r(newnp->pktoptions, newsk);
+ }
return newsk;
@@ -651,24 +621,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
* NOTE: the check for the packet types is done in
* dccp_rcv_state_process
*/
- if (sk->sk_state == DCCP_LISTEN) {
- struct sock *nsk = dccp_v6_hnd_req(sk, skb);
-
- if (nsk == NULL)
- goto discard;
- /*
- * Queue it on the new socket if the new socket is active,
- * otherwise we just shortcircuit this and continue with
- * the new socket..
- */
- if (nsk != sk) {
- if (dccp_child_process(sk, nsk, skb))
- goto reset;
- if (opt_skb != NULL)
- __kfree_skb(opt_skb);
- return 0;
- }
- }
if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
goto reset;
@@ -715,16 +667,11 @@ static int dccp_v6_rcv(struct sk_buff *skb)
else
DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
- /* Step 2:
- * Look up flow ID in table and get corresponding socket */
+lookup:
sk = __inet6_lookup_skb(&dccp_hashinfo, skb,
dh->dccph_sport, dh->dccph_dport,
inet6_iif(skb));
- /*
- * Step 2:
- * If no socket ...
- */
- if (sk == NULL) {
+ if (!sk) {
dccp_pr_debug("failed to look up flow ID in table and "
"get corresponding socket\n");
goto no_dccp_socket;
@@ -742,6 +689,31 @@ static int dccp_v6_rcv(struct sk_buff *skb)
goto no_dccp_socket;
}
+ if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ struct sock *nsk;
+
+ sk = req->rsk_listener;
+ if (unlikely(sk->sk_state != DCCP_LISTEN)) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sock_hold(sk);
+ nsk = dccp_check_req(sk, skb, req);
+ if (!nsk) {
+ reqsk_put(req);
+ goto discard_and_relse;
+ }
+ if (nsk == sk) {
+ reqsk_put(req);
+ } else if (dccp_child_process(sk, nsk, skb)) {
+ dccp_v6_ctl_send_reset(sk, skb);
+ goto discard_and_relse;
+ } else {
+ sock_put(sk);
+ return 0;
+ }
+ }
/*
* RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
* o if MinCsCov = 0, only packets with CsCov = 0 are accepted
@@ -793,6 +765,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct ipv6_pinfo *np = inet6_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
struct in6_addr *saddr = NULL, *final_p, final;
+ struct ipv6_txoptions *opt;
struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
@@ -892,7 +865,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.fl6_sport = inet->inet_sport;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ final_p = fl6_update_dst(&fl6, opt, &final);
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
@@ -909,12 +883,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
np->saddr = *saddr;
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
- __ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, NULL, NULL);
icsk->icsk_ext_hdr_len = 0;
- if (np->opt != NULL)
- icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
- np->opt->opt_nflen);
+ if (opt)
+ icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
inet->inet_dport = usin->sin6_port;
diff --git a/kernel/net/dccp/minisocks.c b/kernel/net/dccp/minisocks.c
index 30addee2d..1994f8af6 100644
--- a/kernel/net/dccp/minisocks.c
+++ b/kernel/net/dccp/minisocks.c
@@ -48,8 +48,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
- /* Linkage updates. */
- __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
@@ -60,6 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
timeo = DCCP_TIMEWAIT_LEN;
inet_twsk_schedule(tw, timeo);
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
@@ -72,7 +72,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
dccp_done(sk);
}
-struct sock *dccp_create_openreq_child(struct sock *sk,
+struct sock *dccp_create_openreq_child(const struct sock *sk,
const struct request_sock *req,
const struct sk_buff *skb)
{
@@ -143,6 +143,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
{
struct sock *child = NULL;
struct dccp_request_sock *dreq = dccp_rsk(req);
+ bool own_req;
/* Check for retransmitted REQUEST */
if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
@@ -182,14 +183,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
if (dccp_parse_options(sk, dreq, skb))
goto drop;
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
- if (child == NULL)
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ req, &own_req);
+ if (!child)
goto listen_overflow;
- inet_csk_reqsk_queue_drop(sk, req);
- inet_csk_reqsk_queue_add(sk, req, child);
-out:
- return child;
+ return inet_csk_complete_hashdance(sk, child, req, own_req);
+
listen_overflow:
dccp_pr_debug("listen_overflow!\n");
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
@@ -198,7 +198,7 @@ drop:
req->rsk_ops->send_reset(sk, skb);
inet_csk_reqsk_queue_drop(sk, req);
- goto out;
+ return NULL;
}
EXPORT_SYMBOL_GPL(dccp_check_req);
@@ -236,7 +236,7 @@ int dccp_child_process(struct sock *parent, struct sock *child,
EXPORT_SYMBOL_GPL(dccp_child_process);
-void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *rsk)
{
DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
diff --git a/kernel/net/dccp/output.c b/kernel/net/dccp/output.c
index 0248e8a34..4ce912e69 100644
--- a/kernel/net/dccp/output.c
+++ b/kernel/net/dccp/output.c
@@ -390,7 +390,7 @@ int dccp_retransmit_skb(struct sock *sk)
return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC));
}
-struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req)
{
struct dccp_hdr *dh;
@@ -398,13 +398,18 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
const u32 dccp_header_size = sizeof(struct dccp_hdr) +
sizeof(struct dccp_hdr_ext) +
sizeof(struct dccp_hdr_response);
- struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1,
- GFP_ATOMIC);
- if (skb == NULL)
+ struct sk_buff *skb;
+
+	/* sk is marked const to clearly express that we don't hold the socket lock.
+	 * sock_wmalloc() will atomically change sk->sk_wmem_alloc,
+	 * so it is safe to promote sk to non-const.
+ */
+ skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1,
+ GFP_ATOMIC);
+ if (!skb)
return NULL;
- /* Reserve space for headers. */
- skb_reserve(skb, sk->sk_prot->max_header);
+ skb_reserve(skb, MAX_DCCP_HEADER);
skb_dst_set(skb, dst_clone(dst));
diff --git a/kernel/net/dccp/probe.c b/kernel/net/dccp/probe.c
index d8346d0ea..3d3fda05b 100644
--- a/kernel/net/dccp/probe.c
+++ b/kernel/net/dccp/probe.c
@@ -30,6 +30,7 @@
#include <linux/module.h>
#include <linux/kfifo.h>
#include <linux/vmalloc.h>
+#include <linux/time64.h>
#include <linux/gfp.h>
#include <net/net_namespace.h>
@@ -47,20 +48,20 @@ static struct {
struct kfifo fifo;
spinlock_t lock;
wait_queue_head_t wait;
- struct timespec tstart;
+ struct timespec64 tstart;
} dccpw;
static void printl(const char *fmt, ...)
{
va_list args;
int len;
- struct timespec now;
+ struct timespec64 now;
char tbuf[256];
va_start(args, fmt);
- getnstimeofday(&now);
+ getnstimeofday64(&now);
- now = timespec_sub(now, dccpw.tstart);
+ now = timespec64_sub(now, dccpw.tstart);
len = sprintf(tbuf, "%lu.%06lu ",
(unsigned long) now.tv_sec,
@@ -110,7 +111,7 @@ static struct jprobe dccp_send_probe = {
static int dccpprobe_open(struct inode *inode, struct file *file)
{
kfifo_reset(&dccpw.fifo);
- getnstimeofday(&dccpw.tstart);
+ getnstimeofday64(&dccpw.tstart);
return 0;
}
diff --git a/kernel/net/dccp/proto.c b/kernel/net/dccp/proto.c
index 52a940165..41e65804d 100644
--- a/kernel/net/dccp/proto.c
+++ b/kernel/net/dccp/proto.c
@@ -339,8 +339,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
- set_bit(SOCK_ASYNC_NOSPACE,
- &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
/* Race breaker. If space is freed after
@@ -886,7 +885,7 @@ verify_sock_status:
break;
}
- sk_wait_data(sk, &timeo);
+ sk_wait_data(sk, &timeo, NULL);
continue;
found_ok_skb:
if (len > skb->len)
diff --git a/kernel/net/decnet/af_decnet.c b/kernel/net/decnet/af_decnet.c
index 754484b3c..13d6b1a6e 100644
--- a/kernel/net/decnet/af_decnet.c
+++ b/kernel/net/decnet/af_decnet.c
@@ -468,10 +468,10 @@ static struct proto dn_proto = {
.obj_size = sizeof(struct dn_sock),
};
-static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp)
+static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern)
{
struct dn_scp *scp;
- struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto);
+ struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern);
if (!sk)
goto out;
@@ -678,6 +678,9 @@ static int dn_create(struct net *net, struct socket *sock, int protocol,
{
struct sock *sk;
+ if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ return -EINVAL;
+
if (!net_eq(net, &init_net))
return -EAFNOSUPPORT;
@@ -693,7 +696,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol,
}
- if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL)
+ if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL)
return -ENOBUFS;
sk->sk_protocol = protocol;
@@ -1096,7 +1099,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
cb = DN_SKB_CB(skb);
sk->sk_ack_backlog--;
- newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation);
+ newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0);
if (newsk == NULL) {
release_sock(sk);
kfree_skb(skb);
@@ -1747,9 +1750,9 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
}
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target));
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
finish_wait(sk_sleep(sk), &wait);
}
@@ -2004,10 +2007,10 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
}
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
sk_wait_event(sk, &timeo,
!dn_queue_too_long(scp, queue, flags));
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
finish_wait(sk_sleep(sk), &wait);
continue;
}
diff --git a/kernel/net/decnet/dn_neigh.c b/kernel/net/decnet/dn_neigh.c
index 4507b188f..482730cd8 100644
--- a/kernel/net/decnet/dn_neigh.c
+++ b/kernel/net/decnet/dn_neigh.c
@@ -194,7 +194,7 @@ static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb)
return err;
}
-static int dn_neigh_output_packet(struct sock *sk, struct sk_buff *skb)
+static int dn_neigh_output_packet(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct dn_route *rt = (struct dn_route *)dst;
@@ -246,8 +246,9 @@ static int dn_long_output(struct neighbour *neigh, struct sock *sk,
skb_reset_network_header(skb);
- return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb,
- NULL, neigh->dev, dn_neigh_output_packet);
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING,
+ &init_net, sk, skb, NULL, neigh->dev,
+ dn_neigh_output_packet);
}
/*
@@ -286,8 +287,9 @@ static int dn_short_output(struct neighbour *neigh, struct sock *sk,
skb_reset_network_header(skb);
- return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb,
- NULL, neigh->dev, dn_neigh_output_packet);
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING,
+ &init_net, sk, skb, NULL, neigh->dev,
+ dn_neigh_output_packet);
}
/*
@@ -327,11 +329,12 @@ static int dn_phase3_output(struct neighbour *neigh, struct sock *sk,
skb_reset_network_header(skb);
- return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb,
- NULL, neigh->dev, dn_neigh_output_packet);
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING,
+ &init_net, sk, skb, NULL, neigh->dev,
+ dn_neigh_output_packet);
}
-int dn_to_neigh_output(struct sock *sk, struct sk_buff *skb)
+int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct dn_route *rt = (struct dn_route *) dst;
@@ -375,7 +378,7 @@ void dn_neigh_pointopoint_hello(struct sk_buff *skb)
/*
* Ethernet router hello message received
*/
-int dn_neigh_router_hello(struct sock *sk, struct sk_buff *skb)
+int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data;
@@ -437,7 +440,7 @@ int dn_neigh_router_hello(struct sock *sk, struct sk_buff *skb)
/*
* Endnode hello message received
*/
-int dn_neigh_endnode_hello(struct sock *sk, struct sk_buff *skb)
+int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data;
struct neighbour *neigh;
diff --git a/kernel/net/decnet/dn_nsp_in.c b/kernel/net/decnet/dn_nsp_in.c
index a321eac9f..7ac086d5c 100644
--- a/kernel/net/decnet/dn_nsp_in.c
+++ b/kernel/net/decnet/dn_nsp_in.c
@@ -714,7 +714,8 @@ out:
return ret;
}
-static int dn_nsp_rx_packet(struct sock *sk2, struct sk_buff *skb)
+static int dn_nsp_rx_packet(struct net *net, struct sock *sk2,
+ struct sk_buff *skb)
{
struct dn_skb_cb *cb = DN_SKB_CB(skb);
struct sock *sk = NULL;
@@ -814,8 +815,8 @@ free_out:
int dn_nsp_rx(struct sk_buff *skb)
{
- return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, NULL, skb,
- skb->dev, NULL,
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN,
+ &init_net, NULL, skb, skb->dev, NULL,
dn_nsp_rx_packet);
}
diff --git a/kernel/net/decnet/dn_nsp_out.c b/kernel/net/decnet/dn_nsp_out.c
index 1aaa51ebb..849805e7a 100644
--- a/kernel/net/decnet/dn_nsp_out.c
+++ b/kernel/net/decnet/dn_nsp_out.c
@@ -85,7 +85,7 @@ static void dn_nsp_send(struct sk_buff *skb)
if (dst) {
try_again:
skb_dst_set(skb, dst);
- dst_output(skb);
+ dst_output(&init_net, skb->sk, skb);
return;
}
@@ -582,7 +582,7 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
* associations.
*/
skb_dst_set(skb, dst_clone(dst));
- dst_output(skb);
+ dst_output(&init_net, skb->sk, skb);
}
diff --git a/kernel/net/decnet/dn_route.c b/kernel/net/decnet/dn_route.c
index 03227ffd1..607a14f20 100644
--- a/kernel/net/decnet/dn_route.c
+++ b/kernel/net/decnet/dn_route.c
@@ -512,7 +512,7 @@ static int dn_return_long(struct sk_buff *skb)
*
* Returns: result of input function if route is found, error code otherwise
*/
-static int dn_route_rx_packet(struct sock *sk, struct sk_buff *skb)
+static int dn_route_rx_packet(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dn_skb_cb *cb;
int err;
@@ -573,8 +573,8 @@ static int dn_route_rx_long(struct sk_buff *skb)
ptr++;
cb->hops = *ptr++; /* Visit Count */
- return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb,
- skb->dev, NULL,
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING,
+ &init_net, NULL, skb, skb->dev, NULL,
dn_route_rx_packet);
drop_it:
@@ -601,8 +601,8 @@ static int dn_route_rx_short(struct sk_buff *skb)
ptr += 2;
cb->hops = *ptr & 0x3f;
- return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb,
- skb->dev, NULL,
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING,
+ &init_net, NULL, skb, skb->dev, NULL,
dn_route_rx_packet);
drop_it:
@@ -610,7 +610,7 @@ drop_it:
return NET_RX_DROP;
}
-static int dn_route_discard(struct sock *sk, struct sk_buff *skb)
+static int dn_route_discard(struct net *net, struct sock *sk, struct sk_buff *skb)
{
/*
* I know we drop the packet here, but thats considered success in
@@ -620,7 +620,7 @@ static int dn_route_discard(struct sock *sk, struct sk_buff *skb)
return NET_RX_SUCCESS;
}
-static int dn_route_ptp_hello(struct sock *sk, struct sk_buff *skb)
+static int dn_route_ptp_hello(struct net *net, struct sock *sk, struct sk_buff *skb)
{
dn_dev_hello(skb);
dn_neigh_pointopoint_hello(skb);
@@ -706,22 +706,22 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type
switch (flags & DN_RT_CNTL_MSK) {
case DN_RT_PKT_HELO:
return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
- NULL, skb, skb->dev, NULL,
+ &init_net, NULL, skb, skb->dev, NULL,
dn_route_ptp_hello);
case DN_RT_PKT_L1RT:
case DN_RT_PKT_L2RT:
return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE,
- NULL, skb, skb->dev, NULL,
+ &init_net, NULL, skb, skb->dev, NULL,
dn_route_discard);
case DN_RT_PKT_ERTH:
return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
- NULL, skb, skb->dev, NULL,
+ &init_net, NULL, skb, skb->dev, NULL,
dn_neigh_router_hello);
case DN_RT_PKT_EEDH:
return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
- NULL, skb, skb->dev, NULL,
+ &init_net, NULL, skb, skb->dev, NULL,
dn_neigh_endnode_hello);
}
} else {
@@ -744,7 +744,7 @@ out:
return NET_RX_DROP;
}
-static int dn_output(struct sock *sk, struct sk_buff *skb)
+static int dn_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct dn_route *rt = (struct dn_route *)dst;
@@ -770,8 +770,8 @@ static int dn_output(struct sock *sk, struct sk_buff *skb)
cb->rt_flags |= DN_RT_F_IE;
cb->hops = 0;
- return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, sk, skb,
- NULL, dev,
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT,
+ &init_net, sk, skb, NULL, dev,
dn_to_neigh_output);
error:
@@ -789,9 +789,7 @@ static int dn_forward(struct sk_buff *skb)
struct dn_dev *dn_db = rcu_dereference(dst->dev->dn_ptr);
struct dn_route *rt;
int header_len;
-#ifdef CONFIG_NETFILTER
struct net_device *dev = skb->dev;
-#endif
if (skb->pkt_type != PACKET_HOST)
goto drop;
@@ -819,8 +817,8 @@ static int dn_forward(struct sk_buff *skb)
if (rt->rt_flags & RTCF_DOREDIRECT)
cb->rt_flags |= DN_RT_F_IE;
- return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, NULL, skb,
- dev, skb->dev,
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD,
+ &init_net, NULL, skb, dev, skb->dev,
dn_to_neigh_output);
drop:
@@ -832,7 +830,7 @@ drop:
* Used to catch bugs. This should never normally get
* called.
*/
-static int dn_rt_bug_sk(struct sock *sk, struct sk_buff *skb)
+static int dn_rt_bug_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -1469,7 +1467,7 @@ make_route:
rt->n = neigh;
rt->dst.lastuse = jiffies;
- rt->dst.output = dn_rt_bug_sk;
+ rt->dst.output = dn_rt_bug_out;
switch (res.type) {
case RTN_UNICAST:
rt->dst.input = dn_forward;
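The DECnet hunks above all apply the same mechanical 4.4 interface change: NF_HOOK() now takes an explicit struct net * argument, and the okfn callback it invokes gains a matching struct net * as its first parameter. DECnet is not namespace-aware, so every call site simply passes &init_net. A minimal sketch of the before/after callback shape (illustrative only; the dn_okfn_* names are hypothetical, not part of the patch):

    /* pre-4.4: the callback saw only the socket and the skb */
    static int dn_okfn_old(struct sock *sk, struct sk_buff *skb);

    /* 4.4: the owning network namespace is passed explicitly */
    static int dn_okfn_new(struct net *net, struct sock *sk, struct sk_buff *skb);

    /* call sites add &init_net between the hook number and the socket:
     *   NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT,
     *           &init_net, sk, skb, NULL, dev, dn_okfn_new);
     */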
diff --git a/kernel/net/decnet/dn_rules.c b/kernel/net/decnet/dn_rules.c
index 9d66a0f72..295bbd6a5 100644
--- a/kernel/net/decnet/dn_rules.c
+++ b/kernel/net/decnet/dn_rules.c
@@ -229,7 +229,6 @@ static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = {
.configure = dn_fib_rule_configure,
.compare = dn_fib_rule_compare,
.fill = dn_fib_rule_fill,
- .default_pref = fib_default_rule_pref,
.flush_cache = dn_fib_rule_flush_cache,
.nlgroup = RTNLGRP_DECnet_RULE,
.policy = dn_fib_rule_policy,
diff --git a/kernel/net/decnet/netfilter/dn_rtmsg.c b/kernel/net/decnet/netfilter/dn_rtmsg.c
index af34fc9bd..85f2fdc36 100644
--- a/kernel/net/decnet/netfilter/dn_rtmsg.c
+++ b/kernel/net/decnet/netfilter/dn_rtmsg.c
@@ -87,7 +87,7 @@ static void dnrmg_send_peer(struct sk_buff *skb)
}
-static unsigned int dnrmg_hook(const struct nf_hook_ops *ops,
+static unsigned int dnrmg_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
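dnrmg_hook() is adjusted to the 4.4 nf_hookfn prototype, whose first argument is now the hook's private data (nf_hook_ops.priv) rather than a pointer to the nf_hook_ops itself; device pointers are still reached through the nf_hook_state. A hedged sketch of a hook under the new prototype (example_hook is a hypothetical name, shown only to illustrate the signature):

    #include <linux/netfilter.h>
    #include <linux/skbuff.h>

    static unsigned int example_hook(void *priv,
                                     struct sk_buff *skb,
                                     const struct nf_hook_state *state)
    {
            /* priv is whatever was stored in nf_hook_ops.priv at
             * registration time; accept the packet unmodified.
             */
            return NF_ACCEPT;
    }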
diff --git a/kernel/net/dns_resolver/dns_key.c b/kernel/net/dns_resolver/dns_key.c
index 31cd4fd75..c79b85eb4 100644
--- a/kernel/net/dns_resolver/dns_key.c
+++ b/kernel/net/dns_resolver/dns_key.c
@@ -122,7 +122,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
goto bad_option_value;
kdebug("dns error no. = %lu", derrno);
- prep->type_data[0] = ERR_PTR(-derrno);
+ prep->payload.data[dns_key_error] = ERR_PTR(-derrno);
continue;
}
@@ -137,8 +137,8 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
/* don't cache the result if we're caching an error saying there's no
* result */
- if (prep->type_data[0]) {
- kleave(" = 0 [h_error %ld]", PTR_ERR(prep->type_data[0]));
+ if (prep->payload.data[dns_key_error]) {
+ kleave(" = 0 [h_error %ld]", PTR_ERR(prep->payload.data[dns_key_error]));
return 0;
}
@@ -155,7 +155,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
memcpy(upayload->data, data, result_len);
upayload->data[result_len] = '\0';
- prep->payload[0] = upayload;
+ prep->payload.data[dns_key_data] = upayload;
kleave(" = 0");
return 0;
}
@@ -167,7 +167,7 @@ static void dns_resolver_free_preparse(struct key_preparsed_payload *prep)
{
pr_devel("==>%s()\n", __func__);
- kfree(prep->payload[0]);
+ kfree(prep->payload.data[dns_key_data]);
}
/*
@@ -223,10 +223,10 @@ static int dns_resolver_match_preparse(struct key_match_data *match_data)
*/
static void dns_resolver_describe(const struct key *key, struct seq_file *m)
{
- int err = key->type_data.x[0];
-
seq_puts(m, key->description);
if (key_is_instantiated(key)) {
+ int err = PTR_ERR(key->payload.data[dns_key_error]);
+
if (err)
seq_printf(m, ": %d", err);
else
@@ -241,8 +241,10 @@ static void dns_resolver_describe(const struct key *key, struct seq_file *m)
static long dns_resolver_read(const struct key *key,
char __user *buffer, size_t buflen)
{
- if (key->type_data.x[0])
- return key->type_data.x[0];
+ int err = PTR_ERR(key->payload.data[dns_key_error]);
+
+ if (err)
+ return err;
return user_read(key, buffer, buflen);
}
diff --git a/kernel/net/dns_resolver/dns_query.c b/kernel/net/dns_resolver/dns_query.c
index 39d2c39bd..ecc28cff0 100644
--- a/kernel/net/dns_resolver/dns_query.c
+++ b/kernel/net/dns_resolver/dns_query.c
@@ -67,10 +67,10 @@
* Returns the size of the result on success, -ve error code otherwise.
*/
int dns_query(const char *type, const char *name, size_t namelen,
- const char *options, char **_result, time_t *_expiry)
+ const char *options, char **_result, time64_t *_expiry)
{
struct key *rkey;
- struct user_key_payload *upayload;
+ const struct user_key_payload *upayload;
const struct cred *saved_cred;
size_t typelen, desclen;
char *desc, *cp;
@@ -137,12 +137,11 @@ int dns_query(const char *type, const char *name, size_t namelen,
goto put;
/* If the DNS server gave an error, return that to the caller */
- ret = rkey->type_data.x[0];
+ ret = PTR_ERR(rkey->payload.data[dns_key_error]);
if (ret)
goto put;
- upayload = rcu_dereference_protected(rkey->payload.data,
- lockdep_is_held(&rkey->sem));
+ upayload = user_key_payload(rkey);
len = upayload->datalen;
ret = -ENOMEM;
diff --git a/kernel/net/dns_resolver/internal.h b/kernel/net/dns_resolver/internal.h
index 7af1ed39c..0c570d40e 100644
--- a/kernel/net/dns_resolver/internal.h
+++ b/kernel/net/dns_resolver/internal.h
@@ -23,6 +23,14 @@
#include <linux/sched.h>
/*
+ * Layout of key payload words.
+ */
+enum {
+ dns_key_data,
+ dns_key_error,
+};
+
+/*
* dns_key.c
*/
extern const struct cred *dns_resolver_cache;
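The dns_resolver changes replace the removed key type_data/payload arrays with named slots in the new payload.data[] array, indexed by the dns_key_data/dns_key_error enum added above. The cached lookup error continues to be stored as an ERR_PTR(), so an empty slot reads back as 0 through PTR_ERR(). A hedged sketch of the read side (example_key_error is a hypothetical helper, not part of the patch):

    static long example_key_error(const struct key *key)
    {
            /* NULL slot => PTR_ERR() yields 0, i.e. no cached error */
            return PTR_ERR(key->payload.data[dns_key_error]);
    }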
diff --git a/kernel/net/dsa/dsa.c b/kernel/net/dsa/dsa.c
index 392e29a02..1eba07feb 100644
--- a/kernel/net/dsa/dsa.c
+++ b/kernel/net/dsa/dsa.c
@@ -22,6 +22,7 @@
#include <linux/of_platform.h>
#include <linux/of_net.h>
#include <linux/sysfs.h>
+#include <linux/phy_fixed.h>
#include "dsa_priv.h"
char dsa_driver_version[] = "0.1";
@@ -176,6 +177,41 @@ __ATTRIBUTE_GROUPS(dsa_hwmon);
#endif /* CONFIG_NET_DSA_HWMON */
/* basic switch operations **************************************************/
+static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master)
+{
+ struct dsa_chip_data *cd = ds->pd;
+ struct device_node *port_dn;
+ struct phy_device *phydev;
+ int ret, port, mode;
+
+ for (port = 0; port < DSA_MAX_PORTS; port++) {
+ if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
+ continue;
+
+ port_dn = cd->port_dn[port];
+ if (of_phy_is_fixed_link(port_dn)) {
+ ret = of_phy_register_fixed_link(port_dn);
+ if (ret) {
+ netdev_err(master,
+ "failed to register fixed PHY\n");
+ return ret;
+ }
+ phydev = of_phy_find_device(port_dn);
+
+ mode = of_get_phy_mode(port_dn);
+ if (mode < 0)
+ mode = PHY_INTERFACE_MODE_NA;
+ phydev->interface = mode;
+
+ genphy_config_init(phydev);
+ genphy_read_status(phydev);
+ if (ds->drv->adjust_link)
+ ds->drv->adjust_link(ds, port, phydev);
+ }
+ }
+ return 0;
+}
+
static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
{
struct dsa_switch_driver *drv = ds->drv;
@@ -270,7 +306,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
if (ret < 0)
goto out;
- ds->slave_mii_bus = mdiobus_alloc();
+ ds->slave_mii_bus = devm_mdiobus_alloc(parent);
if (ds->slave_mii_bus == NULL) {
ret = -ENOMEM;
goto out;
@@ -279,7 +315,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
ret = mdiobus_register(ds->slave_mii_bus);
if (ret < 0)
- goto out_free;
+ goto out;
/*
@@ -291,12 +327,20 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
ret = dsa_slave_create(ds, parent, i, pd->port_names[i]);
if (ret < 0) {
- netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s)\n",
- index, i, pd->port_names[i]);
+ netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n",
+ index, i, pd->port_names[i], ret);
ret = 0;
}
}
+ /* Perform configuration of the CPU and DSA ports */
+ ret = dsa_cpu_dsa_setup(ds, dst->master_netdev);
+ if (ret < 0) {
+ netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n",
+ index);
+ ret = 0;
+ }
+
#ifdef CONFIG_NET_DSA_HWMON
/* If the switch provides a temperature sensor,
* register with hardware monitoring subsystem.
@@ -324,10 +368,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
return ret;
-out_free:
- mdiobus_free(ds->slave_mii_bus);
out:
- kfree(ds);
return ret;
}
@@ -357,7 +398,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
/*
* Allocate and initialise switch state.
*/
- ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL);
+ ds = devm_kzalloc(parent, sizeof(*ds) + drv->priv_size, GFP_KERNEL);
if (ds == NULL)
return ERR_PTR(-ENOMEM);
@@ -377,10 +418,47 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
static void dsa_switch_destroy(struct dsa_switch *ds)
{
+ struct device_node *port_dn;
+ struct phy_device *phydev;
+ struct dsa_chip_data *cd = ds->pd;
+ int port;
+
#ifdef CONFIG_NET_DSA_HWMON
if (ds->hwmon_dev)
hwmon_device_unregister(ds->hwmon_dev);
#endif
+
+ /* Disable configuration of the CPU and DSA ports */
+ for (port = 0; port < DSA_MAX_PORTS; port++) {
+ if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
+ continue;
+
+ port_dn = cd->port_dn[port];
+ if (of_phy_is_fixed_link(port_dn)) {
+ phydev = of_phy_find_device(port_dn);
+ if (phydev) {
+ int addr = phydev->addr;
+
+ phy_device_free(phydev);
+ of_node_put(port_dn);
+ fixed_phy_del(addr);
+ }
+ }
+ }
+
+ /* Destroy network devices for physical switch ports. */
+ for (port = 0; port < DSA_MAX_PORTS; port++) {
+ if (!(ds->phys_port_mask & (1 << port)))
+ continue;
+
+ if (!ds->ports[port])
+ continue;
+
+ unregister_netdev(ds->ports[port]);
+ free_netdev(ds->ports[port]);
+ }
+
+ mdiobus_unregister(ds->slave_mii_bus);
}
#ifdef CONFIG_PM_SLEEP
@@ -554,6 +632,31 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
return 0;
}
+static int dsa_of_probe_links(struct dsa_platform_data *pd,
+ struct dsa_chip_data *cd,
+ int chip_index, int port_index,
+ struct device_node *port,
+ const char *port_name)
+{
+ struct device_node *link;
+ int link_index;
+ int ret;
+
+ for (link_index = 0;; link_index++) {
+ link = of_parse_phandle(port, "link", link_index);
+ if (!link)
+ break;
+
+ if (!strcmp(port_name, "dsa") && pd->nr_chips > 1) {
+ ret = dsa_of_setup_routing_table(pd, cd, chip_index,
+ port_index, link);
+ if (ret)
+ return ret;
+ }
+ }
+ return 0;
+}
+
static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
{
int i;
@@ -566,6 +669,10 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
port_index++;
}
kfree(pd->chip[i].rtable);
+
+ /* Drop our reference to the MDIO bus device */
+ if (pd->chip[i].host_dev)
+ put_device(pd->chip[i].host_dev);
}
kfree(pd->chip);
}
@@ -573,8 +680,8 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
static int dsa_of_probe(struct device *dev)
{
struct device_node *np = dev->of_node;
- struct device_node *child, *mdio, *ethernet, *port, *link;
- struct mii_bus *mdio_bus;
+ struct device_node *child, *mdio, *ethernet, *port;
+ struct mii_bus *mdio_bus, *mdio_bus_switch;
struct net_device *ethernet_dev;
struct dsa_platform_data *pd;
struct dsa_chip_data *cd;
@@ -593,16 +700,22 @@ static int dsa_of_probe(struct device *dev)
return -EPROBE_DEFER;
ethernet = of_parse_phandle(np, "dsa,ethernet", 0);
- if (!ethernet)
- return -EINVAL;
+ if (!ethernet) {
+ ret = -EINVAL;
+ goto out_put_mdio;
+ }
ethernet_dev = of_find_net_device_by_node(ethernet);
- if (!ethernet_dev)
- return -EPROBE_DEFER;
+ if (!ethernet_dev) {
+ ret = -EPROBE_DEFER;
+ goto out_put_mdio;
+ }
pd = kzalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd)
- return -ENOMEM;
+ if (!pd) {
+ ret = -ENOMEM;
+ goto out_put_ethernet;
+ }
dev->platform_data = pd;
pd->of_netdev = ethernet_dev;
@@ -623,25 +736,45 @@ static int dsa_of_probe(struct device *dev)
cd = &pd->chip[chip_index];
cd->of_node = child;
- cd->host_dev = &mdio_bus->dev;
+
+ /* When assigning the host device, increment its refcount */
+ cd->host_dev = get_device(&mdio_bus->dev);
sw_addr = of_get_property(child, "reg", NULL);
if (!sw_addr)
continue;
cd->sw_addr = be32_to_cpup(sw_addr);
- if (cd->sw_addr > PHY_MAX_ADDR)
+ if (cd->sw_addr >= PHY_MAX_ADDR)
continue;
if (!of_property_read_u32(child, "eeprom-length", &eeprom_len))
cd->eeprom_len = eeprom_len;
+ mdio = of_parse_phandle(child, "mii-bus", 0);
+ if (mdio) {
+ mdio_bus_switch = of_mdio_find_bus(mdio);
+ if (!mdio_bus_switch) {
+ ret = -EPROBE_DEFER;
+ goto out_free_chip;
+ }
+
+ /* Drop the mdio_bus device ref, replacing the host
+ * device with the mdio_bus_switch device, keeping
+ * the refcount from of_mdio_find_bus() above.
+ */
+ put_device(cd->host_dev);
+ cd->host_dev = &mdio_bus_switch->dev;
+ }
+
for_each_available_child_of_node(child, port) {
port_reg = of_get_property(port, "reg", NULL);
if (!port_reg)
continue;
port_index = be32_to_cpup(port_reg);
+ if (port_index >= DSA_MAX_PORTS)
+ break;
port_name = of_get_property(port, "label", NULL);
if (!port_name)
@@ -656,21 +789,18 @@ static int dsa_of_probe(struct device *dev)
goto out_free_chip;
}
- link = of_parse_phandle(port, "link", 0);
-
- if (!strcmp(port_name, "dsa") && link &&
- pd->nr_chips > 1) {
- ret = dsa_of_setup_routing_table(pd, cd,
- chip_index, port_index, link);
- if (ret)
- goto out_free_chip;
- }
+ ret = dsa_of_probe_links(pd, cd, chip_index,
+ port_index, port, port_name);
+ if (ret)
+ goto out_free_chip;
- if (port_index == DSA_MAX_PORTS)
- break;
}
}
+ /* The individual chips hold their own refcount on the mdio bus,
+ * so drop ours */
+ put_device(&mdio_bus->dev);
+
return 0;
out_free_chip:
@@ -678,6 +808,10 @@ out_free_chip:
out_free:
kfree(pd);
dev->platform_data = NULL;
+out_put_ethernet:
+ put_device(&ethernet_dev->dev);
+out_put_mdio:
+ put_device(&mdio_bus->dev);
return ret;
}
@@ -689,6 +823,7 @@ static void dsa_of_remove(struct device *dev)
return;
dsa_of_free_platform_data(pd);
+ put_device(&pd->of_netdev->dev);
kfree(pd);
}
#else
@@ -702,10 +837,11 @@ static inline void dsa_of_remove(struct device *dev)
}
#endif
-static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
- struct device *parent, struct dsa_platform_data *pd)
+static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
+ struct device *parent, struct dsa_platform_data *pd)
{
int i;
+ unsigned configured = 0;
dst->pd = pd;
dst->master_netdev = dev;
@@ -725,9 +861,17 @@ static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
dst->ds[i] = ds;
if (ds->drv->poll_link != NULL)
dst->link_poll_needed = 1;
+
+ ++configured;
}
/*
+ * If no switch was found, exit cleanly
+ */
+ if (!configured)
+ return -EPROBE_DEFER;
+
+ /*
* If we use a tagging format that doesn't have an ethertype
* field, make sure that all packets from this point on get
* sent to the tag format's receive function.
@@ -743,6 +887,8 @@ static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
dst->link_poll_timer.expires = round_jiffies(jiffies + HZ);
add_timer(&dst->link_poll_timer);
}
+
+ return 0;
}
static int dsa_probe(struct platform_device *pdev)
@@ -783,7 +929,7 @@ static int dsa_probe(struct platform_device *pdev)
goto out;
}
- dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ dst = devm_kzalloc(&pdev->dev, sizeof(*dst), GFP_KERNEL);
if (dst == NULL) {
dev_put(dev);
ret = -ENOMEM;
@@ -792,7 +938,9 @@ static int dsa_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, dst);
- dsa_setup_dst(dst, dev, &pdev->dev, pd);
+ ret = dsa_setup_dst(dst, dev, &pdev->dev, pd);
+ if (ret)
+ goto out;
return 0;
@@ -814,7 +962,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
for (i = 0; i < dst->pd->nr_chips; i++) {
struct dsa_switch *ds = dst->ds[i];
- if (ds != NULL)
+ if (ds)
dsa_switch_destroy(ds);
}
}
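Several of the dsa.c error-path deletions above (the out_free label, kfree(ds) and mdiobus_free()) are enabled by switching to device-managed allocations: devm_kzalloc() and devm_mdiobus_alloc() tie the switch state and the slave MII bus to the parent device's lifetime, so they are released automatically on unbind. A hedged sketch of the pattern (example_alloc is a hypothetical function):

    static struct dsa_switch *example_alloc(struct device *parent, size_t priv)
    {
            /* freed automatically when 'parent' is unbound; callers need
             * no kfree() in their error paths
             */
            return devm_kzalloc(parent, sizeof(struct dsa_switch) + priv,
                                GFP_KERNEL);
    }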
diff --git a/kernel/net/dsa/dsa_priv.h b/kernel/net/dsa/dsa_priv.h
index d5f1f9b86..311796c80 100644
--- a/kernel/net/dsa/dsa_priv.h
+++ b/kernel/net/dsa/dsa_priv.h
@@ -13,9 +13,10 @@
#include <linux/phy.h>
#include <linux/netdevice.h>
+#include <linux/netpoll.h>
struct dsa_device_ops {
- netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev);
+ struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
int (*rcv)(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev);
};
@@ -26,7 +27,7 @@ struct dsa_slave_priv {
* switch port.
*/
struct net_device *dev;
- netdev_tx_t (*xmit)(struct sk_buff *skb,
+ struct sk_buff * (*xmit)(struct sk_buff *skb,
struct net_device *dev);
/*
@@ -47,6 +48,9 @@ struct dsa_slave_priv {
int old_duplex;
struct net_device *bridge_dev;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ struct netpoll *netpoll;
+#endif
};
/* dsa.c */
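The dsa_device_ops change in dsa_priv.h switches the per-tagger xmit hook from returning netdev_tx_t to returning the (possibly reallocated) sk_buff, with NULL meaning the skb was consumed on error. Queueing on the master interface and TX statistics move into dsa_slave_xmit() in slave.c below, so each tagger only mangles the frame. A minimal sketch of a tagger under the new contract (example_tag_xmit is a hypothetical name mirroring dsa_slave_notag_xmit):

    static struct sk_buff *example_tag_xmit(struct sk_buff *skb,
                                            struct net_device *dev)
    {
            /* nothing to insert: hand the skb back for the slave
             * to queue on the master interface
             */
            return skb;
    }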
diff --git a/kernel/net/dsa/slave.c b/kernel/net/dsa/slave.c
index 57978c5b2..7bc787b09 100644
--- a/kernel/net/dsa/slave.c
+++ b/kernel/net/dsa/slave.c
@@ -18,6 +18,7 @@
#include <net/rtnetlink.h>
#include <net/switchdev.h>
#include <linux/if_bridge.h>
+#include <linux/netpoll.h>
#include "dsa_priv.h"
/* slave mii_bus handling ***************************************************/
@@ -112,7 +113,7 @@ static int dsa_slave_open(struct net_device *dev)
clear_promisc:
if (dev->flags & IFF_PROMISC)
- dev_set_promiscuity(master, 0);
+ dev_set_promiscuity(master, -1);
clear_allmulti:
if (dev->flags & IFF_ALLMULTI)
dev_set_allmulti(master, -1);
@@ -199,103 +200,178 @@ out:
return 0;
}
-static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid, u16 nlm_flags)
+static int dsa_bridge_check_vlan_range(struct dsa_switch *ds,
+ const struct net_device *bridge,
+ u16 vid_begin, u16 vid_end)
+{
+ struct dsa_slave_priv *p;
+ struct net_device *dev, *vlan_br;
+ DECLARE_BITMAP(members, DSA_MAX_PORTS);
+ DECLARE_BITMAP(untagged, DSA_MAX_PORTS);
+ u16 vid;
+ int member, err;
+
+ if (!ds->drv->vlan_getnext || !vid_begin)
+ return -EOPNOTSUPP;
+
+ vid = vid_begin - 1;
+
+ do {
+ err = ds->drv->vlan_getnext(ds, &vid, members, untagged);
+ if (err)
+ break;
+
+ if (vid > vid_end)
+ break;
+
+ member = find_first_bit(members, DSA_MAX_PORTS);
+ if (member == DSA_MAX_PORTS)
+ continue;
+
+ dev = ds->ports[member];
+ p = netdev_priv(dev);
+ vlan_br = p->bridge_dev;
+ if (vlan_br == bridge)
+ continue;
+
+ netdev_dbg(vlan_br, "hardware VLAN %d already in use\n", vid);
+ return -EOPNOTSUPP;
+ } while (vid < vid_end);
+
+ return err == -ENOENT ? 0 : err;
+}
+
+static int dsa_slave_port_vlan_add(struct net_device *dev,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct switchdev_trans *trans)
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- int ret = -EOPNOTSUPP;
+ int err;
- if (ds->drv->fdb_add)
- ret = ds->drv->fdb_add(ds, p->port, addr, vid);
+ if (switchdev_trans_ph_prepare(trans)) {
+ if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add)
+ return -EOPNOTSUPP;
- return ret;
+ /* If the requested port doesn't belong to the same bridge as
+ * the VLAN members, fallback to software VLAN (hopefully).
+ */
+ err = dsa_bridge_check_vlan_range(ds, p->bridge_dev,
+ vlan->vid_begin,
+ vlan->vid_end);
+ if (err)
+ return err;
+
+ err = ds->drv->port_vlan_prepare(ds, p->port, vlan, trans);
+ if (err)
+ return err;
+ } else {
+ err = ds->drv->port_vlan_add(ds, p->port, vlan, trans);
+ if (err)
+ return err;
+ }
+
+ return 0;
}
-static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid)
+static int dsa_slave_port_vlan_del(struct net_device *dev,
+ const struct switchdev_obj_port_vlan *vlan)
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- int ret = -EOPNOTSUPP;
- if (ds->drv->fdb_del)
- ret = ds->drv->fdb_del(ds, p->port, addr, vid);
+ if (!ds->drv->port_vlan_del)
+ return -EOPNOTSUPP;
- return ret;
+ return ds->drv->port_vlan_del(ds, p->port, vlan);
}
-static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb,
- const unsigned char *addr, u16 vid,
- bool is_static,
- u32 portid, u32 seq, int type,
- unsigned int flags)
+static int dsa_slave_port_vlan_dump(struct net_device *dev,
+ struct switchdev_obj_port_vlan *vlan,
+ switchdev_obj_dump_cb_t *cb)
{
- struct nlmsghdr *nlh;
- struct ndmsg *ndm;
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ DECLARE_BITMAP(members, DSA_MAX_PORTS);
+ DECLARE_BITMAP(untagged, DSA_MAX_PORTS);
+ u16 pvid, vid = 0;
+ int err;
+
+ if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get)
+ return -EOPNOTSUPP;
+
+ err = ds->drv->port_pvid_get(ds, p->port, &pvid);
+ if (err)
+ return err;
+
+ for (;;) {
+ err = ds->drv->vlan_getnext(ds, &vid, members, untagged);
+ if (err)
+ break;
- nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
- if (!nlh)
- return -EMSGSIZE;
+ if (!test_bit(p->port, members))
+ continue;
- ndm = nlmsg_data(nlh);
- ndm->ndm_family = AF_BRIDGE;
- ndm->ndm_pad1 = 0;
- ndm->ndm_pad2 = 0;
- ndm->ndm_flags = NTF_EXT_LEARNED;
- ndm->ndm_type = 0;
- ndm->ndm_ifindex = dev->ifindex;
- ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE;
+ memset(vlan, 0, sizeof(*vlan));
+ vlan->vid_begin = vlan->vid_end = vid;
- if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
- goto nla_put_failure;
+ if (vid == pvid)
+ vlan->flags |= BRIDGE_VLAN_INFO_PVID;
- if (vid && nla_put_u16(skb, NDA_VLAN, vid))
- goto nla_put_failure;
+ if (test_bit(p->port, untagged))
+ vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
- nlmsg_end(skb, nlh);
- return 0;
+ err = cb(&vlan->obj);
+ if (err)
+ break;
+ }
-nla_put_failure:
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
+ return err == -ENOENT ? 0 : err;
}
-/* Dump information about entries, in response to GETNEIGH */
-static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct net_device *dev,
- struct net_device *filter_dev, int idx)
+static int dsa_slave_port_fdb_add(struct net_device *dev,
+ const struct switchdev_obj_port_fdb *fdb,
+ struct switchdev_trans *trans)
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- unsigned char addr[ETH_ALEN] = { 0 };
int ret;
- if (!ds->drv->fdb_getnext)
+ if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add)
return -EOPNOTSUPP;
- for (; ; idx++) {
- bool is_static;
+ if (switchdev_trans_ph_prepare(trans))
+ ret = ds->drv->port_fdb_prepare(ds, p->port, fdb, trans);
+ else
+ ret = ds->drv->port_fdb_add(ds, p->port, fdb, trans);
- ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static);
- if (ret < 0)
- break;
+ return ret;
+}
- if (idx < cb->args[0])
- continue;
+static int dsa_slave_port_fdb_del(struct net_device *dev,
+ const struct switchdev_obj_port_fdb *fdb)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret = -EOPNOTSUPP;
- ret = dsa_slave_fill_info(dev, skb, addr, 0,
- is_static,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH, NLM_F_MULTI);
- if (ret < 0)
- break;
- }
+ if (ds->drv->port_fdb_del)
+ ret = ds->drv->port_fdb_del(ds, p->port, fdb);
- return idx;
+ return ret;
+}
+
+static int dsa_slave_port_fdb_dump(struct net_device *dev,
+ struct switchdev_obj_port_fdb *fdb,
+ switchdev_obj_dump_cb_t *cb)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->drv->port_fdb_dump)
+ return ds->drv->port_fdb_dump(ds, p->port, fdb, cb);
+
+ return -EOPNOTSUPP;
}
static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
@@ -345,6 +421,107 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state)
return ret;
}
+static int dsa_slave_port_attr_set(struct net_device *dev,
+ const struct switchdev_attr *attr,
+ struct switchdev_trans *trans)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret;
+
+ switch (attr->id) {
+ case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
+ if (switchdev_trans_ph_prepare(trans))
+ ret = ds->drv->port_stp_update ? 0 : -EOPNOTSUPP;
+ else
+ ret = ds->drv->port_stp_update(ds, p->port,
+ attr->u.stp_state);
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ return ret;
+}
+
+static int dsa_slave_port_obj_add(struct net_device *dev,
+ const struct switchdev_obj *obj,
+ struct switchdev_trans *trans)
+{
+ int err;
+
+ /* For the prepare phase, ensure the full set of changes is feasible in
+ * one go in order to signal a failure properly. If an operation is not
+ * supported, return -EOPNOTSUPP.
+ */
+
+ switch (obj->id) {
+ case SWITCHDEV_OBJ_ID_PORT_FDB:
+ err = dsa_slave_port_fdb_add(dev,
+ SWITCHDEV_OBJ_PORT_FDB(obj),
+ trans);
+ break;
+ case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ err = dsa_slave_port_vlan_add(dev,
+ SWITCHDEV_OBJ_PORT_VLAN(obj),
+ trans);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
+static int dsa_slave_port_obj_del(struct net_device *dev,
+ const struct switchdev_obj *obj)
+{
+ int err;
+
+ switch (obj->id) {
+ case SWITCHDEV_OBJ_ID_PORT_FDB:
+ err = dsa_slave_port_fdb_del(dev,
+ SWITCHDEV_OBJ_PORT_FDB(obj));
+ break;
+ case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ err = dsa_slave_port_vlan_del(dev,
+ SWITCHDEV_OBJ_PORT_VLAN(obj));
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
+static int dsa_slave_port_obj_dump(struct net_device *dev,
+ struct switchdev_obj *obj,
+ switchdev_obj_dump_cb_t *cb)
+{
+ int err;
+
+ switch (obj->id) {
+ case SWITCHDEV_OBJ_ID_PORT_FDB:
+ err = dsa_slave_port_fdb_dump(dev,
+ SWITCHDEV_OBJ_PORT_FDB(obj),
+ cb);
+ break;
+ case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ err = dsa_slave_port_vlan_dump(dev,
+ SWITCHDEV_OBJ_PORT_VLAN(obj),
+ cb);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
static int dsa_slave_bridge_port_join(struct net_device *dev,
struct net_device *br)
{
@@ -382,36 +559,71 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev)
return ret;
}
-static int dsa_slave_parent_id_get(struct net_device *dev,
- struct netdev_phys_item_id *psid)
+static int dsa_slave_port_attr_get(struct net_device *dev,
+ struct switchdev_attr *attr)
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- psid->id_len = sizeof(ds->index);
- memcpy(&psid->id, &ds->index, psid->id_len);
+ switch (attr->id) {
+ case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
+ attr->u.ppid.id_len = sizeof(ds->index);
+ memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
return 0;
}
-static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
+static inline netdev_tx_t dsa_netpoll_send_skb(struct dsa_slave_priv *p,
+ struct sk_buff *skb)
{
- struct dsa_slave_priv *p = netdev_priv(dev);
-
- return p->xmit(skb, dev);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ if (p->netpoll)
+ netpoll_send_skb(p->netpoll, skb);
+#else
+ BUG();
+#endif
+ return NETDEV_TX_OK;
}
-static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb,
- struct net_device *dev)
+static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
+ struct sk_buff *nskb;
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ /* Transmit function may have to reallocate the original SKB */
+ nskb = p->xmit(skb, dev);
+ if (!nskb)
+ return NETDEV_TX_OK;
+
+ /* SKB for netpoll still need to be mangled with the protocol-specific
+ * tag to be successfully transmitted
+ */
+ if (unlikely(netpoll_tx_running(dev)))
+ return dsa_netpoll_send_skb(p, nskb);
- skb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(skb);
+ /* Queue the SKB for transmission on the parent interface, but
+ * do not modify its EtherType
+ */
+ nskb->dev = p->parent->dst->master_netdev;
+ dev_queue_xmit(nskb);
return NETDEV_TX_OK;
}
+static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* Just return the original SKB */
+ return skb;
+}
+
/* ethtool operations *******************************************************/
static int
@@ -641,6 +853,49 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
return ret;
}
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static int dsa_slave_netpoll_setup(struct net_device *dev,
+ struct netpoll_info *ni)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ struct net_device *master = ds->dst->master_netdev;
+ struct netpoll *netpoll;
+ int err = 0;
+
+ netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
+ if (!netpoll)
+ return -ENOMEM;
+
+ err = __netpoll_setup(netpoll, master);
+ if (err) {
+ kfree(netpoll);
+ goto out;
+ }
+
+ p->netpoll = netpoll;
+out:
+ return err;
+}
+
+static void dsa_slave_netpoll_cleanup(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct netpoll *netpoll = p->netpoll;
+
+ if (!netpoll)
+ return;
+
+ p->netpoll = NULL;
+
+ __netpoll_free_async(netpoll);
+}
+
+static void dsa_slave_poll_controller(struct net_device *dev)
+{
+}
+#endif
+
static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_settings = dsa_slave_get_settings,
.set_settings = dsa_slave_set_settings,
@@ -668,16 +923,31 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_change_rx_flags = dsa_slave_change_rx_flags,
.ndo_set_rx_mode = dsa_slave_set_rx_mode,
.ndo_set_mac_address = dsa_slave_set_mac_address,
- .ndo_fdb_add = dsa_slave_fdb_add,
- .ndo_fdb_del = dsa_slave_fdb_del,
- .ndo_fdb_dump = dsa_slave_fdb_dump,
+ .ndo_fdb_add = switchdev_port_fdb_add,
+ .ndo_fdb_del = switchdev_port_fdb_del,
+ .ndo_fdb_dump = switchdev_port_fdb_dump,
.ndo_do_ioctl = dsa_slave_ioctl,
.ndo_get_iflink = dsa_slave_get_iflink,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_netpoll_setup = dsa_slave_netpoll_setup,
+ .ndo_netpoll_cleanup = dsa_slave_netpoll_cleanup,
+ .ndo_poll_controller = dsa_slave_poll_controller,
+#endif
+ .ndo_bridge_getlink = switchdev_port_bridge_getlink,
+ .ndo_bridge_setlink = switchdev_port_bridge_setlink,
+ .ndo_bridge_dellink = switchdev_port_bridge_dellink,
+};
+
+static const struct switchdev_ops dsa_slave_switchdev_ops = {
+ .switchdev_port_attr_get = dsa_slave_port_attr_get,
+ .switchdev_port_attr_set = dsa_slave_port_attr_set,
+ .switchdev_port_obj_add = dsa_slave_port_obj_add,
+ .switchdev_port_obj_del = dsa_slave_port_obj_del,
+ .switchdev_port_obj_dump = dsa_slave_port_obj_dump,
};
-static const struct swdev_ops dsa_slave_swdev_ops = {
- .swdev_parent_id_get = dsa_slave_parent_id_get,
- .swdev_port_stp_update = dsa_slave_stp_update,
+static struct device_type dsa_type = {
+ .name = "dsa",
};
static void dsa_slave_adjust_link(struct net_device *dev)
@@ -728,8 +998,10 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
struct dsa_switch *ds = p->parent;
p->phy = ds->slave_mii_bus->phy_map[addr];
- if (!p->phy)
+ if (!p->phy) {
+ netdev_err(slave_dev, "no phy at %d\n", addr);
return -ENODEV;
+ }
/* Use already configured phy mode */
if (p->phy_interface == PHY_INTERFACE_MODE_NA)
@@ -763,7 +1035,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
*/
ret = of_phy_register_fixed_link(port_dn);
if (ret) {
- netdev_err(slave_dev, "failed to register fixed PHY\n");
+ netdev_err(slave_dev, "failed to register fixed PHY: %d\n", ret);
return ret;
}
phy_is_fixed = true;
@@ -774,17 +1046,20 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
phy_flags = ds->drv->get_phy_flags(ds, p->port);
if (phy_dn) {
- ret = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
+ int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
+
/* If this PHY address is part of phys_mii_mask, which means
* that we need to divert reads and writes to/from it, then we
* want to bind this device using the slave MII bus created by
* DSA to make that happen.
*/
- if (!phy_is_fixed && ret >= 0 &&
- (ds->phys_mii_mask & (1 << ret))) {
- ret = dsa_slave_phy_connect(p, slave_dev, ret);
- if (ret)
+ if (!phy_is_fixed && phy_id >= 0 &&
+ (ds->phys_mii_mask & (1 << phy_id))) {
+ ret = dsa_slave_phy_connect(p, slave_dev, phy_id);
+ if (ret) {
+ netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret);
return ret;
+ }
} else {
p->phy = of_phy_connect(slave_dev, phy_dn,
dsa_slave_adjust_link,
@@ -801,8 +1076,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
*/
if (!p->phy) {
ret = dsa_slave_phy_connect(p, slave_dev, p->port);
- if (ret)
+ if (ret) {
+ netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret);
return ret;
+ }
} else {
netdev_info(slave_dev, "attached PHY at address %d [%s]\n",
p->phy->addr, p->phy->drv->name);
@@ -811,12 +1088,19 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
return 0;
}
+static struct lock_class_key dsa_slave_netdev_xmit_lock_key;
+static void dsa_slave_set_lockdep_class_one(struct net_device *dev,
+ struct netdev_queue *txq,
+ void *_unused)
+{
+ lockdep_set_class(&txq->_xmit_lock,
+ &dsa_slave_netdev_xmit_lock_key);
+}
+
int dsa_slave_suspend(struct net_device *slave_dev)
{
struct dsa_slave_priv *p = netdev_priv(slave_dev);
- netif_device_detach(slave_dev);
-
if (p->phy) {
phy_stop(p->phy);
p->old_pause = -1;
@@ -858,9 +1142,13 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
slave_dev->features = master->vlan_features;
slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
eth_hw_addr_inherit(slave_dev, master);
- slave_dev->tx_queue_len = 0;
+ slave_dev->priv_flags |= IFF_NO_QUEUE;
slave_dev->netdev_ops = &dsa_slave_netdev_ops;
- slave_dev->swdev_ops = &dsa_slave_swdev_ops;
+ slave_dev->switchdev_ops = &dsa_slave_switchdev_ops;
+ SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
+
+ netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
+ NULL);
SET_NETDEV_DEV(slave_dev, parent);
slave_dev->dev.of_node = ds->pd->port_dn[port];
@@ -903,6 +1191,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
ret = dsa_slave_phy_setup(p, slave_dev);
if (ret) {
+ netdev_err(master, "error %d setting up slave phy\n", ret);
free_netdev(slave_dev);
return ret;
}
@@ -956,7 +1245,7 @@ int dsa_slave_netdevice_event(struct notifier_block *unused,
goto out;
err = dsa_slave_master_changed(dev);
- if (err)
+ if (err && err != -EOPNOTSUPP)
netdev_warn(dev, "failed to reflect master change\n");
break;
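The switchdev conversion above follows the two-phase prepare/commit transaction model: port object add handlers are called once with switchdev_trans_ph_prepare(trans) true, where they must validate the request and report -EOPNOTSUPP or resource errors without touching hardware, and then a second time to actually program the switch. A hedged sketch of that split (example_port_obj_add is a hypothetical handler, not part of the patch):

    static int example_port_obj_add(struct dsa_switch *ds, int port,
                                    struct switchdev_trans *trans)
    {
            if (switchdev_trans_ph_prepare(trans)) {
                    /* validate capabilities / reserve resources only */
                    return 0;
            }

            /* commit phase: program the hardware here */
            return 0;
    }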
diff --git a/kernel/net/dsa/tag_brcm.c b/kernel/net/dsa/tag_brcm.c
index 83d3572cd..e2aadb731 100644
--- a/kernel/net/dsa/tag_brcm.c
+++ b/kernel/net/dsa/tag_brcm.c
@@ -58,14 +58,11 @@
#define BRCM_EG_TC_MASK 0x7
#define BRCM_EG_PID_MASK 0x1f
-static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
u8 *brcm_tag;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
if (skb_cow_head(skb, BRCM_TAG_LEN) < 0)
goto out_free;
@@ -87,17 +84,11 @@ static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev)
brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK;
- /* Queue the SKB for transmission on the parent interface, but
- * do not modify its EtherType
- */
- skb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(skb);
-
- return NETDEV_TX_OK;
+ return skb;
out_free:
kfree_skb(skb);
- return NETDEV_TX_OK;
+ return NULL;
}
static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
diff --git a/kernel/net/dsa/tag_dsa.c b/kernel/net/dsa/tag_dsa.c
index 2dab27063..aa780e4ac 100644
--- a/kernel/net/dsa/tag_dsa.c
+++ b/kernel/net/dsa/tag_dsa.c
@@ -15,14 +15,11 @@
#define DSA_HLEN 4
-static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
u8 *dsa_header;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
/*
* Convert the outermost 802.1q tag to a DSA tag for tagged
* packets, or insert a DSA tag between the addresses and
@@ -63,14 +60,11 @@ static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev)
dsa_header[3] = 0x00;
}
- skb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(skb);
-
- return NETDEV_TX_OK;
+ return skb;
out_free:
kfree_skb(skb);
- return NETDEV_TX_OK;
+ return NULL;
}
static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
diff --git a/kernel/net/dsa/tag_edsa.c b/kernel/net/dsa/tag_edsa.c
index 9aeda596f..2288c8098 100644
--- a/kernel/net/dsa/tag_edsa.c
+++ b/kernel/net/dsa/tag_edsa.c
@@ -16,14 +16,11 @@
#define DSA_HLEN 4
#define EDSA_HLEN 8
-static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
u8 *edsa_header;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
/*
* Convert the outermost 802.1q tag to a DSA tag and prepend
* a DSA ethertype field is the packet is tagged, or insert
@@ -76,14 +73,11 @@ static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev)
edsa_header[7] = 0x00;
}
- skb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(skb);
-
- return NETDEV_TX_OK;
+ return skb;
out_free:
kfree_skb(skb);
- return NETDEV_TX_OK;
+ return NULL;
}
static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
diff --git a/kernel/net/dsa/tag_trailer.c b/kernel/net/dsa/tag_trailer.c
index e268f9db8..b6ca0890d 100644
--- a/kernel/net/dsa/tag_trailer.c
+++ b/kernel/net/dsa/tag_trailer.c
@@ -13,16 +13,13 @@
#include <linux/slab.h>
#include "dsa_priv.h"
-static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct sk_buff *nskb;
int padlen;
u8 *trailer;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
/*
* We have to make sure that the trailer ends up as the very
* last 4 bytes of the packet. This means that we have to pad
@@ -36,7 +33,7 @@ static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev)
nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC);
if (nskb == NULL) {
kfree_skb(skb);
- return NETDEV_TX_OK;
+ return NULL;
}
skb_reserve(nskb, NET_IP_ALIGN);
@@ -57,10 +54,7 @@ static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev)
trailer[2] = 0x10;
trailer[3] = 0x00;
- nskb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(nskb);
-
- return NETDEV_TX_OK;
+ return nskb;
}
static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -84,7 +78,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
trailer = skb_tail_pointer(skb) - 4;
if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 ||
- (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00)
+ (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00)
goto out_drop;
source_port = trailer[1] & 7;
diff --git a/kernel/net/ethernet/eth.c b/kernel/net/ethernet/eth.c
index f3bad41d7..9e63f252a 100644
--- a/kernel/net/ethernet/eth.c
+++ b/kernel/net/ethernet/eth.c
@@ -58,6 +58,7 @@
#include <net/ipv6.h>
#include <net/ip.h>
#include <net/dsa.h>
+#include <net/flow_dissector.h>
#include <linux/uaccess.h>
__setup("ether=", netdev_boot_setup);
@@ -113,7 +114,7 @@ int eth_header(struct sk_buff *skb, struct net_device *dev,
EXPORT_SYMBOL(eth_header);
/**
- * eth_get_headlen - determine the the length of header for an ethernet frame
+ * eth_get_headlen - determine the length of header for an ethernet frame
* @data: pointer to start of frame
* @len: total length of frame
*
@@ -126,13 +127,13 @@ u32 eth_get_headlen(void *data, unsigned int len)
struct flow_keys keys;
/* this should never happen, but better safe than sorry */
- if (len < sizeof(*eth))
+ if (unlikely(len < sizeof(*eth)))
return len;
/* parse any remaining L2/L3 headers, check for L4 */
- if (!__skb_flow_dissect(NULL, &keys, data,
- eth->h_proto, sizeof(*eth), len))
- return max_t(u32, keys.thoff, sizeof(*eth));
+ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
+ sizeof(*eth), len, 0))
+ return max_t(u32, keys.control.thoff, sizeof(*eth));
/* parse for any L4 headers */
return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len);
@@ -156,10 +157,11 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
skb->dev = dev;
skb_reset_mac_header(skb);
+
+ eth = (struct ethhdr *)skb->data;
skb_pull_inline(skb, ETH_HLEN);
- eth = eth_hdr(skb);
- if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
+ if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
skb->pkt_type = PACKET_BROADCAST;
else
@@ -178,7 +180,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
if (unlikely(netdev_uses_dsa(dev)))
return htons(ETH_P_XDSA);
- if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN))
+ if (likely(eth_proto_is_802_3(eth->h_proto)))
return eth->h_proto;
/*
@@ -468,6 +470,7 @@ EXPORT_SYMBOL(eth_gro_complete);
static struct packet_offload eth_packet_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_TEB),
+ .priority = 10,
.callbacks = {
.gro_receive = eth_gro_receive,
.gro_complete = eth_gro_complete,
diff --git a/kernel/net/hsr/hsr_device.c b/kernel/net/hsr/hsr_device.c
index 44d27469a..c7d1adca3 100644
--- a/kernel/net/hsr/hsr_device.c
+++ b/kernel/net/hsr/hsr_device.c
@@ -312,7 +312,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, u8 type)
return;
out:
- WARN_ON_ONCE("HSR: Could not send supervision frame\n");
+ WARN_ONCE(1, "HSR: Could not send supervision frame\n");
kfree_skb(skb);
}
@@ -392,7 +392,7 @@ void hsr_dev_setup(struct net_device *dev)
dev->header_ops = &hsr_header_ops;
dev->netdev_ops = &hsr_device_ops;
SET_NETDEV_DEVTYPE(dev, &hsr_type);
- dev->tx_queue_len = 0;
+ dev->priv_flags |= IFF_NO_QUEUE;
dev->destructor = hsr_dev_destroy;
diff --git a/kernel/net/ieee802154/6lowpan/6lowpan_i.h b/kernel/net/ieee802154/6lowpan/6lowpan_i.h
index e50f69da7..b4e17a7c0 100644
--- a/kernel/net/ieee802154/6lowpan/6lowpan_i.h
+++ b/kernel/net/ieee802154/6lowpan/6lowpan_i.h
@@ -5,6 +5,16 @@
#include <net/ieee802154_netdev.h>
#include <net/inet_frag.h>
+#include <net/6lowpan.h>
+
+typedef unsigned __bitwise__ lowpan_rx_result;
+#define RX_CONTINUE ((__force lowpan_rx_result) 0u)
+#define RX_DROP_UNUSABLE ((__force lowpan_rx_result) 1u)
+#define RX_DROP ((__force lowpan_rx_result) 2u)
+#define RX_QUEUED ((__force lowpan_rx_result) 3u)
+
+#define LOWPAN_DISPATCH_FRAG1 0xc0
+#define LOWPAN_DISPATCH_FRAGN 0xe0
struct lowpan_create_arg {
u16 tag;
@@ -37,26 +47,18 @@ static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
}
}
-struct lowpan_dev_record {
- struct net_device *ldev;
- struct list_head list;
-};
-
/* private device info */
struct lowpan_dev_info {
- struct net_device *real_dev; /* real WPAN device ptr */
- struct mutex dev_list_mtx; /* mutex for list ops */
+ struct net_device *wdev; /* wpan device ptr */
u16 fragment_tag;
};
static inline struct
lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
{
- return netdev_priv(dev);
+ return (struct lowpan_dev_info *)lowpan_priv(dev)->priv;
}
-extern struct list_head lowpan_devices;
-
int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
void lowpan_net_frag_exit(void);
int lowpan_net_frag_init(void);
@@ -69,4 +71,7 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *dev,
const void *_saddr, unsigned int len);
netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev);
+int lowpan_iphc_decompress(struct sk_buff *skb);
+lowpan_rx_result lowpan_rx_h_ipv6(struct sk_buff *skb);
+
#endif /* __IEEE802154_6LOWPAN_I_H__ */
diff --git a/kernel/net/ieee802154/6lowpan/core.c b/kernel/net/ieee802154/6lowpan/core.c
index 0ae5822ef..20c49c724 100644
--- a/kernel/net/ieee802154/6lowpan/core.c
+++ b/kernel/net/ieee802154/6lowpan/core.c
@@ -52,29 +52,7 @@
#include "6lowpan_i.h"
-LIST_HEAD(lowpan_devices);
-static int lowpan_open_count;
-
-static __le16 lowpan_get_pan_id(const struct net_device *dev)
-{
- struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
-
- return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev);
-}
-
-static __le16 lowpan_get_short_addr(const struct net_device *dev)
-{
- struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
-
- return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev);
-}
-
-static u8 lowpan_get_dsn(const struct net_device *dev)
-{
- struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
-
- return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev);
-}
+static int open_count;
static struct header_ops lowpan_header_ops = {
.create = lowpan_header_create,
@@ -83,7 +61,7 @@ static struct header_ops lowpan_header_ops = {
static struct lock_class_key lowpan_tx_busylock;
static struct lock_class_key lowpan_netdev_xmit_lock_key;
-static void lowpan_set_lockdep_class_one(struct net_device *dev,
+static void lowpan_set_lockdep_class_one(struct net_device *ldev,
struct netdev_queue *txq,
void *_unused)
{
@@ -91,42 +69,47 @@ static void lowpan_set_lockdep_class_one(struct net_device *dev,
&lowpan_netdev_xmit_lock_key);
}
-static int lowpan_dev_init(struct net_device *dev)
+static int lowpan_dev_init(struct net_device *ldev)
+{
+ netdev_for_each_tx_queue(ldev, lowpan_set_lockdep_class_one, NULL);
+ ldev->qdisc_tx_busylock = &lowpan_tx_busylock;
+ return 0;
+}
+
+static int lowpan_open(struct net_device *dev)
{
- netdev_for_each_tx_queue(dev, lowpan_set_lockdep_class_one, NULL);
- dev->qdisc_tx_busylock = &lowpan_tx_busylock;
+ if (!open_count)
+ lowpan_rx_init();
+ open_count++;
+ return 0;
+}
+
+static int lowpan_stop(struct net_device *dev)
+{
+ open_count--;
+ if (!open_count)
+ lowpan_rx_exit();
return 0;
}
static const struct net_device_ops lowpan_netdev_ops = {
.ndo_init = lowpan_dev_init,
.ndo_start_xmit = lowpan_xmit,
+ .ndo_open = lowpan_open,
+ .ndo_stop = lowpan_stop,
};
-static struct ieee802154_mlme_ops lowpan_mlme = {
- .get_pan_id = lowpan_get_pan_id,
- .get_short_addr = lowpan_get_short_addr,
- .get_dsn = lowpan_get_dsn,
-};
-
-static void lowpan_setup(struct net_device *dev)
+static void lowpan_setup(struct net_device *ldev)
{
- dev->addr_len = IEEE802154_ADDR_LEN;
- memset(dev->broadcast, 0xff, IEEE802154_ADDR_LEN);
- dev->type = ARPHRD_6LOWPAN;
- /* Frame Control + Sequence Number + Address fields + Security Header */
- dev->hard_header_len = 2 + 1 + 20 + 14;
- dev->needed_tailroom = 2; /* FCS */
- dev->mtu = IPV6_MIN_MTU;
- dev->tx_queue_len = 0;
- dev->flags = IFF_BROADCAST | IFF_MULTICAST;
- dev->watchdog_timeo = 0;
-
- dev->netdev_ops = &lowpan_netdev_ops;
- dev->header_ops = &lowpan_header_ops;
- dev->ml_priv = &lowpan_mlme;
- dev->destructor = free_netdev;
- dev->features |= NETIF_F_NETNS_LOCAL;
+ memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN);
+ /* We need an ipv6hdr as minimum len when calling xmit */
+ ldev->hard_header_len = sizeof(struct ipv6hdr);
+ ldev->flags = IFF_BROADCAST | IFF_MULTICAST;
+
+ ldev->netdev_ops = &lowpan_netdev_ops;
+ ldev->header_ops = &lowpan_header_ops;
+ ldev->destructor = free_netdev;
+ ldev->features |= NETIF_F_NETNS_LOCAL;
}
static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -138,11 +121,10 @@ static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[])
return 0;
}
-static int lowpan_newlink(struct net *src_net, struct net_device *dev,
+static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
struct nlattr *tb[], struct nlattr *data[])
{
- struct net_device *real_dev;
- struct lowpan_dev_record *entry;
+ struct net_device *wdev;
int ret;
ASSERT_RTNL();
@@ -150,78 +132,61 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev,
pr_debug("adding new link\n");
if (!tb[IFLA_LINK] ||
- !net_eq(dev_net(dev), &init_net))
+ !net_eq(dev_net(ldev), &init_net))
return -EINVAL;
- /* find and hold real wpan device */
- real_dev = dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK]));
- if (!real_dev)
+ /* find and hold wpan device */
+ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK]));
+ if (!wdev)
return -ENODEV;
- if (real_dev->type != ARPHRD_IEEE802154) {
- dev_put(real_dev);
+ if (wdev->type != ARPHRD_IEEE802154) {
+ dev_put(wdev);
return -EINVAL;
}
- lowpan_dev_info(dev)->real_dev = real_dev;
- mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
-
- entry = kzalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry) {
- dev_put(real_dev);
- lowpan_dev_info(dev)->real_dev = NULL;
- return -ENOMEM;
+ if (wdev->ieee802154_ptr->lowpan_dev) {
+ dev_put(wdev);
+ return -EBUSY;
}
- entry->ldev = dev;
-
+ lowpan_dev_info(ldev)->wdev = wdev;
/* Set the lowpan hardware address to the wpan hardware address. */
- memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN);
-
- mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx);
- INIT_LIST_HEAD(&entry->list);
- list_add_tail(&entry->list, &lowpan_devices);
- mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx);
-
- ret = register_netdevice(dev);
- if (ret >= 0) {
- if (!lowpan_open_count)
- lowpan_rx_init();
- lowpan_open_count++;
+ memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN);
+ /* We need headroom for possible wpan_dev_hard_header call and tailroom
+ * for encryption/fcs handling. The lowpan interface will replace
+ * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN
+ * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6
+ * header.
+ */
+ ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN +
+ wdev->needed_headroom;
+ ldev->needed_tailroom = wdev->needed_tailroom;
+
+ lowpan_netdev_setup(ldev, LOWPAN_LLTYPE_IEEE802154);
+
+ ret = register_netdevice(ldev);
+ if (ret < 0) {
+ dev_put(wdev);
+ return ret;
}
- return ret;
+ wdev->ieee802154_ptr->lowpan_dev = ldev;
+ return 0;
}
-static void lowpan_dellink(struct net_device *dev, struct list_head *head)
+static void lowpan_dellink(struct net_device *ldev, struct list_head *head)
{
- struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev);
- struct net_device *real_dev = lowpan_dev->real_dev;
- struct lowpan_dev_record *entry, *tmp;
+ struct net_device *wdev = lowpan_dev_info(ldev)->wdev;
ASSERT_RTNL();
- lowpan_open_count--;
- if (!lowpan_open_count)
- lowpan_rx_exit();
-
- mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx);
- list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) {
- if (entry->ldev == dev) {
- list_del(&entry->list);
- kfree(entry);
- }
- }
- mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx);
-
- mutex_destroy(&lowpan_dev_info(dev)->dev_list_mtx);
-
- unregister_netdevice_queue(dev, head);
-
- dev_put(real_dev);
+ wdev->ieee802154_ptr->lowpan_dev = NULL;
+ unregister_netdevice(ldev);
+ dev_put(wdev);
}
static struct rtnl_link_ops lowpan_link_ops __read_mostly = {
.kind = "lowpan",
- .priv_size = sizeof(struct lowpan_dev_info),
+ .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev_info)),
.setup = lowpan_setup,
.newlink = lowpan_newlink,
.dellink = lowpan_dellink,
@@ -241,20 +206,22 @@ static inline void lowpan_netlink_fini(void)
static int lowpan_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- LIST_HEAD(del_list);
- struct lowpan_dev_record *entry, *tmp;
+ struct net_device *wdev = netdev_notifier_info_to_dev(ptr);
- if (dev->type != ARPHRD_IEEE802154)
+ if (wdev->type != ARPHRD_IEEE802154)
goto out;
- if (event == NETDEV_UNREGISTER) {
- list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) {
- if (lowpan_dev_info(entry->ldev)->real_dev == dev)
- lowpan_dellink(entry->ldev, &del_list);
- }
-
- unregister_netdevice_many(&del_list);
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* Check if wpan interface is unregistered that we
+ * also delete possible lowpan interfaces which belongs
+ * to the wpan interface.
+ */
+ if (wdev->ieee802154_ptr->lowpan_dev)
+ lowpan_dellink(wdev->ieee802154_ptr->lowpan_dev, NULL);
+ break;
+ default:
+ break;
}
out:
diff --git a/kernel/net/ieee802154/6lowpan/reassembly.c b/kernel/net/ieee802154/6lowpan/reassembly.c
index f46e4d130..6b437e876 100644
--- a/kernel/net/ieee802154/6lowpan/reassembly.c
+++ b/kernel/net/ieee802154/6lowpan/reassembly.c
@@ -32,21 +32,10 @@
static const char lowpan_frags_cache_name[] = "lowpan-frags";
-struct lowpan_frag_info {
- u16 d_tag;
- u16 d_size;
- u8 d_offset;
-};
-
-static struct lowpan_frag_info *lowpan_cb(struct sk_buff *skb)
-{
- return (struct lowpan_frag_info *)skb->cb;
-}
-
static struct inet_frags lowpan_frags;
static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
- struct sk_buff *prev, struct net_device *dev);
+ struct sk_buff *prev, struct net_device *ldev);
static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
const struct ieee802154_addr *saddr,
@@ -111,7 +100,7 @@ out:
}
static inline struct lowpan_frag_queue *
-fq_find(struct net *net, const struct lowpan_frag_info *frag_info,
+fq_find(struct net *net, const struct lowpan_802154_cb *cb,
const struct ieee802154_addr *src,
const struct ieee802154_addr *dst)
{
@@ -121,12 +110,12 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info,
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
- arg.tag = frag_info->d_tag;
- arg.d_size = frag_info->d_size;
+ arg.tag = cb->d_tag;
+ arg.d_size = cb->d_size;
arg.src = src;
arg.dst = dst;
- hash = lowpan_hash_frag(frag_info->d_tag, frag_info->d_size, src, dst);
+ hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
q = inet_frag_find(&ieee802154_lowpan->frags,
&lowpan_frags, &arg, hash);
@@ -138,17 +127,17 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info,
}
static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
- struct sk_buff *skb, const u8 frag_type)
+ struct sk_buff *skb, u8 frag_type)
{
struct sk_buff *prev, *next;
- struct net_device *dev;
+ struct net_device *ldev;
int end, offset;
if (fq->q.flags & INET_FRAG_COMPLETE)
goto err;
- offset = lowpan_cb(skb)->d_offset << 3;
- end = lowpan_cb(skb)->d_size;
+ offset = lowpan_802154_cb(skb)->d_offset << 3;
+ end = lowpan_802154_cb(skb)->d_size;
/* Is this the final fragment? */
if (offset + skb->len == end) {
@@ -174,13 +163,16 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
* this fragment, right?
*/
prev = fq->q.fragments_tail;
- if (!prev || lowpan_cb(prev)->d_offset < lowpan_cb(skb)->d_offset) {
+ if (!prev ||
+ lowpan_802154_cb(prev)->d_offset <
+ lowpan_802154_cb(skb)->d_offset) {
next = NULL;
goto found;
}
prev = NULL;
for (next = fq->q.fragments; next != NULL; next = next->next) {
- if (lowpan_cb(next)->d_offset >= lowpan_cb(skb)->d_offset)
+ if (lowpan_802154_cb(next)->d_offset >=
+ lowpan_802154_cb(skb)->d_offset)
break; /* bingo! */
prev = next;
}
@@ -195,19 +187,16 @@ found:
else
fq->q.fragments = skb;
- dev = skb->dev;
- if (dev)
+ ldev = skb->dev;
+ if (ldev)
skb->dev = NULL;
fq->q.stamp = skb->tstamp;
- if (frag_type == LOWPAN_DISPATCH_FRAG1) {
- /* Calculate uncomp. 6lowpan header to estimate full size */
- fq->q.meat += lowpan_uncompress_size(skb, NULL);
+ if (frag_type == LOWPAN_DISPATCH_FRAG1)
fq->q.flags |= INET_FRAG_FIRST_IN;
- } else {
- fq->q.meat += skb->len;
- }
- add_frag_mem_limit(&fq->q, skb->truesize);
+
+ fq->q.meat += skb->len;
+ add_frag_mem_limit(fq->q.net, skb->truesize);
if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
fq->q.meat == fq->q.len) {
@@ -215,7 +204,7 @@ found:
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- res = lowpan_frag_reasm(fq, prev, dev);
+ res = lowpan_frag_reasm(fq, prev, ldev);
skb->_skb_refdst = orefdst;
return res;
}
@@ -235,7 +224,7 @@ err:
* the last and the first frames arrived and all the bits are here.
*/
static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
- struct net_device *dev)
+ struct net_device *ldev)
{
struct sk_buff *fp, *head = fq->q.fragments;
int sum_truesize;
@@ -287,7 +276,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
clone->data_len = clone->len;
head->data_len -= clone->len;
head->len -= clone->len;
- add_frag_mem_limit(&fq->q, clone->truesize);
+ add_frag_mem_limit(fq->q.net, clone->truesize);
}
WARN_ON(head == NULL);
@@ -310,10 +299,10 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
}
fp = next;
}
- sub_frag_mem_limit(&fq->q, sum_truesize);
+ sub_frag_mem_limit(fq->q.net, sum_truesize);
head->next = NULL;
- head->dev = dev;
+ head->dev = ldev;
head->tstamp = fq->q.stamp;
fq->q.fragments = NULL;
@@ -325,24 +314,87 @@ out_oom:
return -1;
}
-static int lowpan_get_frag_info(struct sk_buff *skb, const u8 frag_type,
- struct lowpan_frag_info *frag_info)
+static int lowpan_frag_rx_handlers_result(struct sk_buff *skb,
+ lowpan_rx_result res)
+{
+ switch (res) {
+ case RX_QUEUED:
+ return NET_RX_SUCCESS;
+ case RX_CONTINUE:
+ /* nobody cared about this packet */
+ net_warn_ratelimited("%s: received unknown dispatch\n",
+ __func__);
+
+ /* fall-through */
+ default:
+ /* all others failure */
+ return NET_RX_DROP;
+ }
+}
+
+static lowpan_rx_result lowpan_frag_rx_h_iphc(struct sk_buff *skb)
+{
+ int ret;
+
+ if (!lowpan_is_iphc(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ ret = lowpan_iphc_decompress(skb);
+ if (ret < 0)
+ return RX_DROP;
+
+ return RX_QUEUED;
+}
+
+static int lowpan_invoke_frag_rx_handlers(struct sk_buff *skb)
+{
+ lowpan_rx_result res;
+
+#define CALL_RXH(rxh) \
+ do { \
+ res = rxh(skb); \
+ if (res != RX_CONTINUE) \
+ goto rxh_next; \
+ } while (0)
+
+ /* likely at first */
+ CALL_RXH(lowpan_frag_rx_h_iphc);
+ CALL_RXH(lowpan_rx_h_ipv6);
+
+rxh_next:
+ return lowpan_frag_rx_handlers_result(skb, res);
+#undef CALL_RXH
+}
+
+#define LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK 0x07
+#define LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT 8
+
+static int lowpan_get_cb(struct sk_buff *skb, u8 frag_type,
+ struct lowpan_802154_cb *cb)
{
bool fail;
- u8 pattern = 0, low = 0;
+ u8 high = 0, low = 0;
__be16 d_tag = 0;
- fail = lowpan_fetch_skb(skb, &pattern, 1);
+ fail = lowpan_fetch_skb(skb, &high, 1);
fail |= lowpan_fetch_skb(skb, &low, 1);
- frag_info->d_size = (pattern & 7) << 8 | low;
+ /* strip the dispatch value and use the remaining three bits as the
+ * high part of the datagram size
+ */
+ cb->d_size = (high & LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK) <<
+ LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT | low;
fail |= lowpan_fetch_skb(skb, &d_tag, 2);
- frag_info->d_tag = ntohs(d_tag);
+ cb->d_tag = ntohs(d_tag);
if (frag_type == LOWPAN_DISPATCH_FRAGN) {
- fail |= lowpan_fetch_skb(skb, &frag_info->d_offset, 1);
+ fail |= lowpan_fetch_skb(skb, &cb->d_offset, 1);
} else {
skb_reset_network_header(skb);
- frag_info->d_offset = 0;
+ cb->d_offset = 0;
+ /* check that the datagram size covers at least an ipv6hdr on FRAG1 */
+ fail |= cb->d_size < sizeof(struct ipv6hdr);
+ /* check if we can dereference the dispatch value */
+ fail |= !skb->len;
}
if (unlikely(fail))
@@ -351,27 +403,33 @@ static int lowpan_get_frag_info(struct sk_buff *skb, const u8 frag_type,
return 0;
}
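
For reference, the first two bytes of a FRAG1/FRAGN header parsed above carry the 11-bit datagram size (low three bits of the dispatch byte plus the second byte), followed by a 16-bit big-endian tag and, for FRAGN, an offset byte in 8-octet units. A minimal userspace sketch of that extraction, not kernel code (note the kernel keeps the raw offset field and shifts it by 3 where used; this sketch converts to bytes directly):

#include <stdio.h>

struct frag_fields {
	unsigned int d_size;   /* full datagram size in bytes */
	unsigned int d_tag;    /* datagram tag */
	unsigned int d_offset; /* offset of this fragment, in bytes */
};

static void parse_frag_hdr(const unsigned char *hdr, int fragn,
			   struct frag_fields *f)
{
	/* low 3 bits of the dispatch byte are the size high bits */
	f->d_size = ((hdr[0] & 0x07) << 8) | hdr[1];
	/* tag is transmitted big endian */
	f->d_tag = (hdr[2] << 8) | hdr[3];
	/* FRAGN carries the offset in units of 8 octets; FRAG1 starts at 0 */
	f->d_offset = fragn ? hdr[4] * 8 : 0;
}

int main(void)
{
	const unsigned char fragn_hdr[5] = { 0xe4, 0x00, 0x12, 0x34, 0x0c };
	struct frag_fields f;

	parse_frag_hdr(fragn_hdr, 1, &f);
	printf("size=%u tag=0x%04x offset=%u\n", f.d_size, f.d_tag, f.d_offset);
	return 0; /* prints: size=1024 tag=0x1234 offset=96 */
}
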
-int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type)
+int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
{
struct lowpan_frag_queue *fq;
struct net *net = dev_net(skb->dev);
- struct lowpan_frag_info *frag_info = lowpan_cb(skb);
- struct ieee802154_addr source, dest;
+ struct lowpan_802154_cb *cb = lowpan_802154_cb(skb);
+ struct ieee802154_hdr hdr;
int err;
- source = mac_cb(skb)->source;
- dest = mac_cb(skb)->dest;
+ if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
+ goto err;
- err = lowpan_get_frag_info(skb, frag_type, frag_info);
+ err = lowpan_get_cb(skb, frag_type, cb);
if (err < 0)
goto err;
- if (frag_info->d_size > IPV6_MIN_MTU) {
+ if (frag_type == LOWPAN_DISPATCH_FRAG1) {
+ err = lowpan_invoke_frag_rx_handlers(skb);
+ if (err == NET_RX_DROP)
+ goto err;
+ }
+
+ if (cb->d_size > IPV6_MIN_MTU) {
net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n");
goto err;
}
- fq = fq_find(net, frag_info, &source, &dest);
+ fq = fq_find(net, cb, &hdr.source, &hdr.dest);
if (fq != NULL) {
int ret;
@@ -387,7 +445,6 @@ err:
kfree_skb(skb);
return -1;
}
-EXPORT_SYMBOL(lowpan_frag_rcv);
#ifdef CONFIG_SYSCTL
static int zero;
@@ -523,14 +580,19 @@ static int __net_init lowpan_frags_init_net(struct net *net)
{
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
+ int res;
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
- inet_frags_init_net(&ieee802154_lowpan->frags);
-
- return lowpan_frags_ns_sysctl_register(net);
+ res = inet_frags_init_net(&ieee802154_lowpan->frags);
+ if (res)
+ return res;
+ res = lowpan_frags_ns_sysctl_register(net);
+ if (res)
+ inet_frags_uninit_net(&ieee802154_lowpan->frags);
+ return res;
}
static void __net_exit lowpan_frags_exit_net(struct net *net)
diff --git a/kernel/net/ieee802154/6lowpan/rx.c b/kernel/net/ieee802154/6lowpan/rx.c
index 4be1d289a..ef185dd41 100644
--- a/kernel/net/ieee802154/6lowpan/rx.c
+++ b/kernel/net/ieee802154/6lowpan/rx.c
@@ -11,147 +11,307 @@
#include <linux/if_arp.h>
#include <net/6lowpan.h>
+#include <net/mac802154.h>
#include <net/ieee802154_netdev.h>
#include "6lowpan_i.h"
-static int lowpan_give_skb_to_devices(struct sk_buff *skb,
- struct net_device *dev)
-{
- struct lowpan_dev_record *entry;
- struct sk_buff *skb_cp;
- int stat = NET_RX_SUCCESS;
+#define LOWPAN_DISPATCH_FIRST 0xc0
+#define LOWPAN_DISPATCH_FRAG_MASK 0xf8
+
+#define LOWPAN_DISPATCH_NALP 0x00
+#define LOWPAN_DISPATCH_ESC 0x40
+#define LOWPAN_DISPATCH_HC1 0x42
+#define LOWPAN_DISPATCH_DFF 0x43
+#define LOWPAN_DISPATCH_BC0 0x50
+#define LOWPAN_DISPATCH_MESH 0x80
+static int lowpan_give_skb_to_device(struct sk_buff *skb)
+{
skb->protocol = htons(ETH_P_IPV6);
- skb->pkt_type = PACKET_HOST;
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
- rcu_read_lock();
- list_for_each_entry_rcu(entry, &lowpan_devices, list)
- if (lowpan_dev_info(entry->ldev)->real_dev == skb->dev) {
- skb_cp = skb_copy(skb, GFP_ATOMIC);
- if (!skb_cp) {
- kfree_skb(skb);
- rcu_read_unlock();
- return NET_RX_DROP;
- }
+ return netif_rx(skb);
+}
- skb_cp->dev = entry->ldev;
- stat = netif_rx(skb_cp);
- if (stat == NET_RX_DROP)
- break;
- }
- rcu_read_unlock();
+static int lowpan_rx_handlers_result(struct sk_buff *skb, lowpan_rx_result res)
+{
+ switch (res) {
+ case RX_CONTINUE:
+ /* nobody cared about this packet */
+ net_warn_ratelimited("%s: received unknown dispatch\n",
+ __func__);
+
+ /* fall-through */
+ case RX_DROP_UNUSABLE:
+ kfree_skb(skb);
- consume_skb(skb);
+ /* fall-through */
+ case RX_DROP:
+ return NET_RX_DROP;
+ case RX_QUEUED:
+ return lowpan_give_skb_to_device(skb);
+ default:
+ break;
+ }
- return stat;
+ return NET_RX_DROP;
}
-static int
-iphc_decompress(struct sk_buff *skb, const struct ieee802154_hdr *hdr)
+static inline bool lowpan_is_frag1(u8 dispatch)
{
- u8 iphc0, iphc1;
- struct ieee802154_addr_sa sa, da;
- void *sap, *dap;
+ return (dispatch & LOWPAN_DISPATCH_FRAG_MASK) == LOWPAN_DISPATCH_FRAG1;
+}
- raw_dump_table(__func__, "raw skb data dump", skb->data, skb->len);
- /* at least two bytes will be used for the encoding */
- if (skb->len < 2)
- return -EINVAL;
+static inline bool lowpan_is_fragn(u8 dispatch)
+{
+ return (dispatch & LOWPAN_DISPATCH_FRAG_MASK) == LOWPAN_DISPATCH_FRAGN;
+}
- if (lowpan_fetch_skb_u8(skb, &iphc0))
- return -EINVAL;
+static lowpan_rx_result lowpan_rx_h_frag(struct sk_buff *skb)
+{
+ int ret;
- if (lowpan_fetch_skb_u8(skb, &iphc1))
- return -EINVAL;
+ if (!(lowpan_is_frag1(*skb_network_header(skb)) ||
+ lowpan_is_fragn(*skb_network_header(skb))))
+ return RX_CONTINUE;
- ieee802154_addr_to_sa(&sa, &hdr->source);
- ieee802154_addr_to_sa(&da, &hdr->dest);
+ ret = lowpan_frag_rcv(skb, *skb_network_header(skb) &
+ LOWPAN_DISPATCH_FRAG_MASK);
+ if (ret == 1)
+ return RX_QUEUED;
- if (sa.addr_type == IEEE802154_ADDR_SHORT)
- sap = &sa.short_addr;
- else
- sap = &sa.hwaddr;
+ /* Packet is freed by lowpan_frag_rcv on error or put into the frag
+ * bucket.
+ */
+ return RX_DROP;
+}
- if (da.addr_type == IEEE802154_ADDR_SHORT)
- dap = &da.short_addr;
- else
- dap = &da.hwaddr;
+int lowpan_iphc_decompress(struct sk_buff *skb)
+{
+ struct ieee802154_hdr hdr;
- return lowpan_header_decompress(skb, skb->dev, sap, sa.addr_type,
- IEEE802154_ADDR_LEN, dap, da.addr_type,
- IEEE802154_ADDR_LEN, iphc0, iphc1);
+ if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
+ return -EINVAL;
+
+ return lowpan_header_decompress(skb, skb->dev, &hdr.dest, &hdr.source);
}
-static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev)
+static lowpan_rx_result lowpan_rx_h_iphc(struct sk_buff *skb)
{
- struct ieee802154_hdr hdr;
int ret;
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (!skb)
- goto drop;
+ if (!lowpan_is_iphc(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ /* Setting datagram_size to zero indicates non-fragment handling
+ * while doing lowpan_header_decompress.
+ */
+ lowpan_802154_cb(skb)->d_size = 0;
- if (!netif_running(dev))
- goto drop_skb;
+ ret = lowpan_iphc_decompress(skb);
+ if (ret < 0)
+ return RX_DROP_UNUSABLE;
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto drop_skb;
+ return RX_QUEUED;
+}
- if (dev->type != ARPHRD_IEEE802154)
- goto drop_skb;
+lowpan_rx_result lowpan_rx_h_ipv6(struct sk_buff *skb)
+{
+ if (!lowpan_is_ipv6(*skb_network_header(skb)))
+ return RX_CONTINUE;
- if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
- goto drop_skb;
-
- /* check that it's our buffer */
- if (skb->data[0] == LOWPAN_DISPATCH_IPV6) {
- /* Pull off the 1-byte of 6lowpan header. */
- skb_pull(skb, 1);
- return lowpan_give_skb_to_devices(skb, NULL);
- } else {
- switch (skb->data[0] & 0xe0) {
- case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */
- ret = iphc_decompress(skb, &hdr);
- if (ret < 0)
- goto drop_skb;
-
- return lowpan_give_skb_to_devices(skb, NULL);
- case LOWPAN_DISPATCH_FRAG1: /* first fragment header */
- ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAG1);
- if (ret == 1) {
- ret = iphc_decompress(skb, &hdr);
- if (ret < 0)
- goto drop_skb;
-
- return lowpan_give_skb_to_devices(skb, NULL);
- } else if (ret == -1) {
- return NET_RX_DROP;
- } else {
- return NET_RX_SUCCESS;
- }
- case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */
- ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAGN);
- if (ret == 1) {
- ret = iphc_decompress(skb, &hdr);
- if (ret < 0)
- goto drop_skb;
-
- return lowpan_give_skb_to_devices(skb, NULL);
- } else if (ret == -1) {
- return NET_RX_DROP;
- } else {
- return NET_RX_SUCCESS;
- }
- default:
- break;
- }
+ /* Pull off the 1-byte of 6lowpan header. */
+ skb_pull(skb, 1);
+ return RX_QUEUED;
+}
+
+static inline bool lowpan_is_esc(u8 dispatch)
+{
+ return dispatch == LOWPAN_DISPATCH_ESC;
+}
+
+static lowpan_rx_result lowpan_rx_h_esc(struct sk_buff *skb)
+{
+ if (!lowpan_is_esc(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN ESC not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static inline bool lowpan_is_hc1(u8 dispatch)
+{
+ return dispatch == LOWPAN_DISPATCH_HC1;
+}
+
+static lowpan_rx_result lowpan_rx_h_hc1(struct sk_buff *skb)
+{
+ if (!lowpan_is_hc1(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN HC1 not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static inline bool lowpan_is_dff(u8 dispatch)
+{
+ return dispatch == LOWPAN_DISPATCH_DFF;
+}
+
+static lowpan_rx_result lowpan_rx_h_dff(struct sk_buff *skb)
+{
+ if (!lowpan_is_dff(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN DFF not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static inline bool lowpan_is_bc0(u8 dispatch)
+{
+ return dispatch == LOWPAN_DISPATCH_BC0;
+}
+
+static lowpan_rx_result lowpan_rx_h_bc0(struct sk_buff *skb)
+{
+ if (!lowpan_is_bc0(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN BC0 not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static inline bool lowpan_is_mesh(u8 dispatch)
+{
+ return (dispatch & LOWPAN_DISPATCH_FIRST) == LOWPAN_DISPATCH_MESH;
+}
+
+static lowpan_rx_result lowpan_rx_h_mesh(struct sk_buff *skb)
+{
+ if (!lowpan_is_mesh(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN MESH not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static int lowpan_invoke_rx_handlers(struct sk_buff *skb)
+{
+ lowpan_rx_result res;
+
+#define CALL_RXH(rxh) \
+ do { \
+ res = rxh(skb); \
+ if (res != RX_CONTINUE) \
+ goto rxh_next; \
+ } while (0)
+
+ /* likely at first */
+ CALL_RXH(lowpan_rx_h_iphc);
+ CALL_RXH(lowpan_rx_h_frag);
+ CALL_RXH(lowpan_rx_h_ipv6);
+ CALL_RXH(lowpan_rx_h_esc);
+ CALL_RXH(lowpan_rx_h_hc1);
+ CALL_RXH(lowpan_rx_h_dff);
+ CALL_RXH(lowpan_rx_h_bc0);
+ CALL_RXH(lowpan_rx_h_mesh);
+
+rxh_next:
+ return lowpan_rx_handlers_result(skb, res);
+#undef CALL_RXH
+}
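
The receive path above is organised as a chain of handlers, each of which either claims the frame (RX_QUEUED), drops it, or declines and lets the next handler look at the dispatch byte (RX_CONTINUE). A hedged userspace sketch of the same control flow; the handler names are illustrative and the dispatch values follow RFC 4944/6282 (0x41 uncompressed IPv6, 011xxxxx IPHC, 11000xxx FRAG1, 11100xxx FRAGN):

#include <stdio.h>

enum rx_result { RX_CONTINUE, RX_QUEUED, RX_DROP };

typedef enum rx_result (*rx_handler)(unsigned char dispatch);

static enum rx_result h_ipv6(unsigned char d)
{
	return d == 0x41 ? RX_QUEUED : RX_CONTINUE; /* uncompressed IPv6 */
}

static enum rx_result h_iphc(unsigned char d)
{
	return (d & 0xe0) == 0x60 ? RX_QUEUED : RX_CONTINUE; /* 011xxxxx */
}

static enum rx_result h_frag(unsigned char d)
{
	/* FRAG1: 11000xxx, FRAGN: 11100xxx */
	return ((d & 0xf8) == 0xc0 || (d & 0xf8) == 0xe0) ?
	       RX_QUEUED : RX_CONTINUE;
}

static int run_chain(unsigned char dispatch)
{
	rx_handler chain[] = { h_iphc, h_frag, h_ipv6 };
	size_t i;

	for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		enum rx_result res = chain[i](dispatch);

		if (res != RX_CONTINUE)
			return res == RX_QUEUED ? 0 : -1;
	}
	/* nobody cared about this dispatch value */
	return -1;
}

int main(void)
{
	printf("0x41 -> %d\n", run_chain(0x41)); /* IPv6, accepted */
	printf("0x00 -> %d\n", run_chain(0x00)); /* NALP, rejected */
	return 0;
}
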
+
+static inline bool lowpan_is_nalp(u8 dispatch)
+{
+ return (dispatch & LOWPAN_DISPATCH_FIRST) == LOWPAN_DISPATCH_NALP;
+}
+
+/* Look up reserved dispatch values at:
+ * https://www.iana.org/assignments/_6lowpan-parameters/_6lowpan-parameters.xhtml#_6lowpan-parameters-1
+ *
+ * Last Updated: 2015-01-22
+ */
+static inline bool lowpan_is_reserved(u8 dispatch)
+{
+ return ((dispatch >= 0x44 && dispatch <= 0x4F) ||
+ (dispatch >= 0x51 && dispatch <= 0x5F) ||
+ (dispatch >= 0xc8 && dispatch <= 0xdf) ||
+ (dispatch >= 0xe8 && dispatch <= 0xff));
+}
+
+/* lowpan_rx_h_check checks generic 6LoWPAN requirements
+ * in the MAC and 6LoWPAN headers.
+ *
+ * Don't manipulate the skb here, it could be a shared buffer.
+ */
+static inline bool lowpan_rx_h_check(struct sk_buff *skb)
+{
+ __le16 fc = ieee802154_get_fc_from_skb(skb);
+
+ /* check for an ieee802154-conformant 6LoWPAN header */
+ if (!ieee802154_is_data(fc) ||
+ !ieee802154_is_intra_pan(fc))
+ return false;
+
+ /* check if we can dereference the dispatch */
+ if (unlikely(!skb->len))
+ return false;
+
+ if (lowpan_is_nalp(*skb_network_header(skb)) ||
+ lowpan_is_reserved(*skb_network_header(skb)))
+ return false;
+
+ return true;
+}
+
+static int lowpan_rcv(struct sk_buff *skb, struct net_device *wdev,
+ struct packet_type *pt, struct net_device *orig_wdev)
+{
+ struct net_device *ldev;
+
+ if (wdev->type != ARPHRD_IEEE802154 ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ !lowpan_rx_h_check(skb))
+ goto drop;
+
+ ldev = wdev->ieee802154_ptr->lowpan_dev;
+ if (!ldev || !netif_running(ldev))
+ goto drop;
+
+ /* Replacing skb->dev and the rx handlers that follow will manipulate skb. */
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+ skb->dev = ldev;
+
+ /* When receiving frag1 it's likely that we manipulate the buffer.
+ * When receiving iphc we manipulate the data buffer. So we need
+ * to unshare the buffer.
+ */
+ if (lowpan_is_frag1(*skb_network_header(skb)) ||
+ lowpan_is_iphc(*skb_network_header(skb))) {
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
}
-drop_skb:
- kfree_skb(skb);
+ return lowpan_invoke_rx_handlers(skb);
+
drop:
+ kfree_skb(skb);
+out:
return NET_RX_DROP;
}
diff --git a/kernel/net/ieee802154/6lowpan/tx.c b/kernel/net/ieee802154/6lowpan/tx.c
index 2349070bd..d4353face 100644
--- a/kernel/net/ieee802154/6lowpan/tx.c
+++ b/kernel/net/ieee802154/6lowpan/tx.c
@@ -10,9 +10,13 @@
#include <net/6lowpan.h>
#include <net/ieee802154_netdev.h>
+#include <net/mac802154.h>
#include "6lowpan_i.h"
+#define LOWPAN_FRAG1_HEAD_SIZE 0x4
+#define LOWPAN_FRAGN_HEAD_SIZE 0x5
+
/* don't save pan id, it's intra pan */
struct lowpan_addr {
u8 mode;
@@ -36,7 +40,14 @@ lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb)
sizeof(struct lowpan_addr_info));
}
-int lowpan_header_create(struct sk_buff *skb, struct net_device *dev,
+/* This callback is called from the AF_PACKET and IPv6 stacks; note that
+ * AF_PACKET sockets provide only an 8 byte array for addresses!
+ *
+ * TODO I think AF_PACKET DGRAM (sending/receiving) and RAW (sending) make no
+ * sense here. We should disable them; the right use case would be AF_INET6
+ * RAW/DGRAM sockets.
+ */
+int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev,
unsigned short type, const void *_daddr,
const void *_saddr, unsigned int len)
{
@@ -51,7 +62,7 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *dev,
return 0;
if (!saddr)
- saddr = dev->dev_addr;
+ saddr = ldev->dev_addr;
raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8);
raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8);
@@ -71,28 +82,33 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *dev,
static struct sk_buff*
lowpan_alloc_frag(struct sk_buff *skb, int size,
- const struct ieee802154_hdr *master_hdr)
+ const struct ieee802154_hdr *master_hdr, bool frag1)
{
- struct net_device *real_dev = lowpan_dev_info(skb->dev)->real_dev;
+ struct net_device *wdev = lowpan_dev_info(skb->dev)->wdev;
struct sk_buff *frag;
int rc;
- frag = alloc_skb(real_dev->hard_header_len +
- real_dev->needed_tailroom + size,
+ frag = alloc_skb(wdev->needed_headroom + wdev->needed_tailroom + size,
GFP_ATOMIC);
if (likely(frag)) {
- frag->dev = real_dev;
+ frag->dev = wdev;
frag->priority = skb->priority;
- skb_reserve(frag, real_dev->hard_header_len);
+ skb_reserve(frag, wdev->needed_headroom);
skb_reset_network_header(frag);
*mac_cb(frag) = *mac_cb(skb);
- rc = dev_hard_header(frag, real_dev, 0, &master_hdr->dest,
- &master_hdr->source, size);
- if (rc < 0) {
- kfree_skb(frag);
- return ERR_PTR(rc);
+ if (frag1) {
+ memcpy(skb_put(frag, skb->mac_len),
+ skb_mac_header(skb), skb->mac_len);
+ } else {
+ rc = wpan_dev_hard_header(frag, wdev,
+ &master_hdr->dest,
+ &master_hdr->source, size);
+ if (rc < 0) {
+ kfree_skb(frag);
+ return ERR_PTR(rc);
+ }
}
} else {
frag = ERR_PTR(-ENOMEM);
@@ -104,15 +120,15 @@ lowpan_alloc_frag(struct sk_buff *skb, int size,
static int
lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr,
u8 *frag_hdr, int frag_hdrlen,
- int offset, int len)
+ int offset, int len, bool frag1)
{
struct sk_buff *frag;
raw_dump_inline(__func__, " fragment header", frag_hdr, frag_hdrlen);
- frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr);
+ frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr, frag1);
if (IS_ERR(frag))
- return -PTR_ERR(frag);
+ return PTR_ERR(frag);
memcpy(skb_put(frag, frag_hdrlen), frag_hdr, frag_hdrlen);
memcpy(skb_put(frag, len), skb_network_header(skb) + offset, len);
@@ -123,19 +139,17 @@ lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr,
}
static int
-lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev,
- const struct ieee802154_hdr *wpan_hdr)
+lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev,
+ const struct ieee802154_hdr *wpan_hdr, u16 dgram_size,
+ u16 dgram_offset)
{
- u16 dgram_size, dgram_offset;
__be16 frag_tag;
u8 frag_hdr[5];
int frag_cap, frag_len, payload_cap, rc;
int skb_unprocessed, skb_offset;
- dgram_size = lowpan_uncompress_size(skb, &dgram_offset) -
- skb->mac_len;
- frag_tag = htons(lowpan_dev_info(dev)->fragment_tag);
- lowpan_dev_info(dev)->fragment_tag++;
+ frag_tag = htons(lowpan_dev_info(ldev)->fragment_tag);
+ lowpan_dev_info(ldev)->fragment_tag++;
frag_hdr[0] = LOWPAN_DISPATCH_FRAG1 | ((dgram_size >> 8) & 0x07);
frag_hdr[1] = dgram_size & 0xff;
@@ -151,7 +165,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev,
rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr,
LOWPAN_FRAG1_HEAD_SIZE, 0,
- frag_len + skb_network_header_len(skb));
+ frag_len + skb_network_header_len(skb),
+ true);
if (rc) {
pr_debug("%s unable to send FRAG1 packet (tag: %d)",
__func__, ntohs(frag_tag));
@@ -172,7 +187,7 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev,
rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr,
LOWPAN_FRAGN_HEAD_SIZE, skb_offset,
- frag_len);
+ frag_len, false);
if (rc) {
pr_debug("%s unable to send a FRAGN packet. (tag: %d, offset: %d)\n",
__func__, ntohs(frag_tag), skb_offset);
@@ -180,6 +195,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev,
}
} while (skb_unprocessed > frag_cap);
+ ldev->stats.tx_packets++;
+ ldev->stats.tx_bytes += dgram_size;
consume_skb(skb);
return NET_XMIT_SUCCESS;
@@ -188,8 +205,10 @@ err:
return rc;
}
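
For reference, the fragment headers built above pack the dispatch pattern together with the top three bits of the datagram size into the first byte, followed by the low size byte, the big-endian tag and, for FRAGN, the offset in 8-octet units. A small userspace sketch of that packing (illustrative only; the dispatch constants mirror the hunk above):

#include <stdio.h>

#define DISPATCH_FRAG1 0xc0
#define DISPATCH_FRAGN 0xe0

/* Build a FRAG1 (4 byte) or FRAGN (5 byte) header; returns its length. */
static int build_frag_hdr(unsigned char *hdr, unsigned int dgram_size,
			  unsigned int tag, unsigned int dgram_offset,
			  int fragn)
{
	hdr[0] = (fragn ? DISPATCH_FRAGN : DISPATCH_FRAG1) |
		 ((dgram_size >> 8) & 0x07);
	hdr[1] = dgram_size & 0xff;
	hdr[2] = tag >> 8;          /* tag is sent big endian */
	hdr[3] = tag & 0xff;
	if (!fragn)
		return 4;
	hdr[4] = dgram_offset >> 3; /* offset in units of 8 octets */
	return 5;
}

int main(void)
{
	unsigned char hdr[5];
	int i, len = build_frag_hdr(hdr, 1024, 0x1234, 96, 1);

	for (i = 0; i < len; i++)
		printf("%02x ", hdr[i]);
	printf("\n"); /* prints: e4 00 12 34 0c */
	return 0;
}
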
-static int lowpan_header(struct sk_buff *skb, struct net_device *dev)
+static int lowpan_header(struct sk_buff *skb, struct net_device *ldev,
+ u16 *dgram_size, u16 *dgram_offset)
{
+ struct wpan_dev *wpan_dev = lowpan_dev_info(ldev)->wdev->ieee802154_ptr;
struct ieee802154_addr sa, da;
struct ieee802154_mac_cb *cb = mac_cb_init(skb);
struct lowpan_addr_info info;
@@ -201,13 +220,16 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev)
daddr = &info.daddr.u.extended_addr;
saddr = &info.saddr.u.extended_addr;
- lowpan_header_compress(skb, dev, ETH_P_IPV6, daddr, saddr, skb->len);
+ *dgram_size = skb->len;
+ lowpan_header_compress(skb, ldev, daddr, saddr);
+ /* dgram_offset = (saved bytes after compression) + lowpan header len */
+ *dgram_offset = (*dgram_size - skb->len) + skb_network_header_len(skb);
cb->type = IEEE802154_FC_TYPE_DATA;
/* prepare wpan address data */
sa.mode = IEEE802154_ADDR_LONG;
- sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+ sa.pan_id = wpan_dev->pan_id;
sa.extended_addr = ieee802154_devaddr_from_raw(saddr);
/* intra-PAN communications */
@@ -216,27 +238,30 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev)
/* if the destination address is the broadcast address, use the
* corresponding short address
*/
- if (lowpan_is_addr_broadcast((const u8 *)daddr)) {
+ if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) {
da.mode = IEEE802154_ADDR_SHORT;
da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
cb->ackreq = false;
} else {
da.mode = IEEE802154_ADDR_LONG;
da.extended_addr = ieee802154_devaddr_from_raw(daddr);
- cb->ackreq = true;
+ cb->ackreq = wpan_dev->ackreq;
}
- return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev,
- ETH_P_IPV6, (void *)&da, (void *)&sa, 0);
+ return wpan_dev_hard_header(skb, lowpan_dev_info(ldev)->wdev, &da, &sa,
+ 0);
}
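
The bookkeeping above can be read as simple arithmetic: dgram_size is the uncompressed IPv6 datagram length (skb->len before compression), and dgram_offset is the number of uncompressed bytes already covered by the first fragment's compressed header, i.e. the bytes saved by IPHC plus the length of the compressed 6LoWPAN header that replaces them. A worked sketch with hypothetical numbers (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int dgram_size = 1280;          /* skb->len before compression */
	unsigned int len_after_compress = 1245;  /* skb->len after IPHC         */
	unsigned int lowpan_hdr_len = 5;         /* compressed 6LoWPAN header   */

	unsigned int saved = dgram_size - len_after_compress;   /* 35 */
	unsigned int dgram_offset = saved + lowpan_hdr_len;     /* 40 */

	printf("dgram_size=%u dgram_offset=%u\n", dgram_size, dgram_offset);
	return 0; /* 40 == the uncompressed IPv6 header this example assumes */
}
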
-netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev)
+netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev)
{
struct ieee802154_hdr wpan_hdr;
int max_single, ret;
+ u16 dgram_size, dgram_offset;
pr_debug("package xmit\n");
+ WARN_ON_ONCE(skb->len > IPV6_MIN_MTU);
+
/* We must take a copy of the skb before we modify/replace the ipv6
* header as the header could be used elsewhere
*/
@@ -244,7 +269,7 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev)
if (!skb)
return NET_XMIT_DROP;
- ret = lowpan_header(skb, dev);
+ ret = lowpan_header(skb, ldev, &dgram_size, &dgram_offset);
if (ret < 0) {
kfree_skb(skb);
return NET_XMIT_DROP;
@@ -258,13 +283,16 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev)
max_single = ieee802154_max_payload(&wpan_hdr);
if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) {
- skb->dev = lowpan_dev_info(dev)->real_dev;
+ skb->dev = lowpan_dev_info(ldev)->wdev;
+ ldev->stats.tx_packets++;
+ ldev->stats.tx_bytes += dgram_size;
return dev_queue_xmit(skb);
} else {
netdev_tx_t rc;
pr_debug("frame is too big, fragmentation is needed\n");
- rc = lowpan_xmit_fragmented(skb, dev, &wpan_hdr);
+ rc = lowpan_xmit_fragmented(skb, ldev, &wpan_hdr, dgram_size,
+ dgram_offset);
return rc < 0 ? NET_XMIT_DROP : rc;
}
diff --git a/kernel/net/ieee802154/Kconfig b/kernel/net/ieee802154/Kconfig
index 1370d5b00..188135bcb 100644
--- a/kernel/net/ieee802154/Kconfig
+++ b/kernel/net/ieee802154/Kconfig
@@ -12,6 +12,11 @@ menuconfig IEEE802154
if IEEE802154
+config IEEE802154_NL802154_EXPERIMENTAL
+ bool "IEEE 802.15.4 experimental netlink support"
+ ---help---
+ Adds experimental netlink support for nl802154.
+
config IEEE802154_SOCKET
tristate "IEEE 802.15.4 socket interface"
default y
diff --git a/kernel/net/ieee802154/core.c b/kernel/net/ieee802154/core.c
index 2ee00e8a0..c35fdfa6d 100644
--- a/kernel/net/ieee802154/core.c
+++ b/kernel/net/ieee802154/core.c
@@ -95,6 +95,18 @@ cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx)
return result;
}
+struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx)
+{
+ struct cfg802154_registered_device *rdev;
+
+ ASSERT_RTNL();
+
+ rdev = cfg802154_rdev_by_wpan_phy_idx(wpan_phy_idx);
+ if (!rdev)
+ return NULL;
+ return &rdev->wpan_phy;
+}
+
struct wpan_phy *
wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
{
@@ -121,8 +133,6 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
/* atomic_inc_return makes it start at 1, make it start at 0 */
rdev->wpan_phy_idx--;
- mutex_init(&rdev->wpan_phy.pib_lock);
-
INIT_LIST_HEAD(&rdev->wpan_dev_list);
device_initialize(&rdev->wpan_phy.dev);
dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx);
diff --git a/kernel/net/ieee802154/core.h b/kernel/net/ieee802154/core.h
index f3e95580c..231fade95 100644
--- a/kernel/net/ieee802154/core.h
+++ b/kernel/net/ieee802154/core.h
@@ -42,5 +42,6 @@ extern int cfg802154_rdev_list_generation;
void cfg802154_dev_free(struct cfg802154_registered_device *rdev);
struct cfg802154_registered_device *
cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx);
+struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx);
#endif /* __IEEE802154_CORE_H */
diff --git a/kernel/net/ieee802154/header_ops.c b/kernel/net/ieee802154/header_ops.c
index a051b6993..c7439f0fb 100644
--- a/kernel/net/ieee802154/header_ops.c
+++ b/kernel/net/ieee802154/header_ops.c
@@ -83,35 +83,35 @@ ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr)
}
int
-ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr)
+ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr)
{
- u8 buf[MAC802154_FRAME_HARD_HEADER_LEN];
+ u8 buf[IEEE802154_MAX_HEADER_LEN];
int pos = 2;
int rc;
- struct ieee802154_hdr_fc fc = hdr->fc;
+ struct ieee802154_hdr_fc *fc = &hdr->fc;
buf[pos++] = hdr->seq;
- fc.dest_addr_mode = hdr->dest.mode;
+ fc->dest_addr_mode = hdr->dest.mode;
rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false);
if (rc < 0)
return -EINVAL;
pos += rc;
- fc.source_addr_mode = hdr->source.mode;
+ fc->source_addr_mode = hdr->source.mode;
if (hdr->source.pan_id == hdr->dest.pan_id &&
hdr->dest.mode != IEEE802154_ADDR_NONE)
- fc.intra_pan = true;
+ fc->intra_pan = true;
- rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc.intra_pan);
+ rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc->intra_pan);
if (rc < 0)
return -EINVAL;
pos += rc;
- if (fc.security_enabled) {
- fc.version = 1;
+ if (fc->security_enabled) {
+ fc->version = 1;
rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec);
if (rc < 0)
@@ -120,7 +120,7 @@ ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr)
pos += rc;
}
- memcpy(buf, &fc, 2);
+ memcpy(buf, fc, 2);
memcpy(skb_push(skb, pos), buf, pos);
diff --git a/kernel/net/ieee802154/nl-mac.c b/kernel/net/ieee802154/nl-mac.c
index 2b4955d7a..3503c3895 100644
--- a/kernel/net/ieee802154/nl-mac.c
+++ b/kernel/net/ieee802154/nl-mac.c
@@ -97,8 +97,10 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid,
BUG_ON(!phy);
get_device(&phy->dev);
- short_addr = ops->get_short_addr(dev);
- pan_id = ops->get_pan_id(dev);
+ rtnl_lock();
+ short_addr = dev->ieee802154_ptr->short_addr;
+ pan_id = dev->ieee802154_ptr->pan_id;
+ rtnl_unlock();
if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
@@ -117,12 +119,12 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid,
rtnl_unlock();
if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER,
- params.transmit_power) ||
+ params.transmit_power / 100) ||
nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) ||
nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE,
params.cca.mode) ||
nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL,
- params.cca_ed_level) ||
+ params.cca_ed_level / 100) ||
nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES,
params.csma_retries) ||
nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE,
@@ -166,10 +168,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info)
if (!dev)
return NULL;
- /* Check on mtu is currently a hacked solution because lowpan
- * and wpan have the same ARPHRD type.
- */
- if (dev->type != ARPHRD_IEEE802154 || dev->mtu != IEEE802154_MTU) {
+ if (dev->type != ARPHRD_IEEE802154) {
dev_put(dev);
return NULL;
}
@@ -244,7 +243,9 @@ int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info)
addr.mode = IEEE802154_ADDR_LONG;
addr.extended_addr = nla_get_hwaddr(
info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]);
- addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+ rtnl_lock();
+ addr.pan_id = dev->ieee802154_ptr->pan_id;
+ rtnl_unlock();
ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr,
nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]),
@@ -281,7 +282,9 @@ int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info)
addr.short_addr = nla_get_shortaddr(
info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]);
}
- addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+ rtnl_lock();
+ addr.pan_id = dev->ieee802154_ptr->pan_id;
+ rtnl_unlock();
ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr,
nla_get_u8(info->attrs[IEEE802154_ATTR_REASON]));
@@ -449,11 +452,7 @@ int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb)
idx = 0;
for_each_netdev(net, dev) {
- /* Check on mtu is currently a hacked solution because lowpan
- * and wpan have the same ARPHRD type.
- */
- if (idx < s_idx || dev->type != ARPHRD_IEEE802154 ||
- dev->mtu != IEEE802154_MTU)
+ if (idx < s_idx || dev->type != ARPHRD_IEEE802154)
goto cont;
if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid,
@@ -510,7 +509,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info)
ops->get_mac_params(dev, &params);
if (info->attrs[IEEE802154_ATTR_TXPOWER])
- params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]);
+ params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]) * 100;
if (info->attrs[IEEE802154_ATTR_LBT_ENABLED])
params.lbt = nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]);
@@ -519,7 +518,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info)
params.cca.mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]);
if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL])
- params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]);
+ params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) * 100;
if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES])
params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]);
@@ -783,11 +782,7 @@ ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb,
int rc;
for_each_netdev(net, dev) {
- /* Check on mtu is currently a hacked solution because lowpan
- * and wpan have the same ARPHRD type.
- */
- if (idx < first_dev || dev->type != ARPHRD_IEEE802154 ||
- dev->mtu != IEEE802154_MTU)
+ if (idx < first_dev || dev->type != ARPHRD_IEEE802154)
goto skip;
data.ops = ieee802154_mlme_ops(dev);
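
The ×100 and ÷100 conversions in the nl-mac hunks above exist because the legacy ieee802154 netlink attributes carry whole dBm values, while the internal cfg802154 representation is, as this patch assumes, 100 times finer (0.01 dBm steps, commonly written mBm). A trivial illustration of the conversion under that unit assumption:

#include <stdio.h>

int main(void)
{
	int dbm = -7;          /* value as seen on the legacy interface   */
	int mbm = dbm * 100;   /* internal representation: -700           */

	printf("%d dBm <-> %d (0.01 dBm units)\n", mbm / 100, mbm);
	return 0;
}
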
diff --git a/kernel/net/ieee802154/nl-phy.c b/kernel/net/ieee802154/nl-phy.c
index 346c6665d..77d73014b 100644
--- a/kernel/net/ieee802154/nl-phy.c
+++ b/kernel/net/ieee802154/nl-phy.c
@@ -50,26 +50,26 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
if (!hdr)
goto out;
- mutex_lock(&phy->pib_lock);
+ rtnl_lock();
if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) ||
nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel))
goto nla_put_failure;
for (i = 0; i < 32; i++) {
- if (phy->channels_supported[i])
- buf[pages++] = phy->channels_supported[i] | (i << 27);
+ if (phy->supported.channels[i])
+ buf[pages++] = phy->supported.channels[i] | (i << 27);
}
if (pages &&
nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST,
pages * sizeof(uint32_t), buf))
goto nla_put_failure;
- mutex_unlock(&phy->pib_lock);
+ rtnl_unlock();
kfree(buf);
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
- mutex_unlock(&phy->pib_lock);
+ rtnl_unlock();
genlmsg_cancel(msg, hdr);
out:
kfree(buf);
diff --git a/kernel/net/ieee802154/nl802154.c b/kernel/net/ieee802154/nl802154.c
index f3c12f6a4..16ef0d9f5 100644
--- a/kernel/net/ieee802154/nl802154.c
+++ b/kernel/net/ieee802154/nl802154.c
@@ -207,10 +207,11 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
[NL802154_ATTR_PAGE] = { .type = NLA_U8, },
[NL802154_ATTR_CHANNEL] = { .type = NLA_U8, },
- [NL802154_ATTR_TX_POWER] = { .type = NLA_S8, },
+ [NL802154_ATTR_TX_POWER] = { .type = NLA_S32, },
[NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, },
[NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, },
+ [NL802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, },
[NL802154_ATTR_SUPPORTED_CHANNEL] = { .type = NLA_U32, },
@@ -225,8 +226,92 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
[NL802154_ATTR_MAX_FRAME_RETRIES] = { .type = NLA_S8, },
[NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, },
+
+ [NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED },
+
+ [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED },
+
+ [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 },
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+ [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, },
+ [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, },
+ [NL802154_ATTR_SEC_OUT_KEY_ID] = { .type = NLA_NESTED, },
+ [NL802154_ATTR_SEC_FRAME_COUNTER] = { .type = NLA_U32 },
+
+ [NL802154_ATTR_SEC_LEVEL] = { .type = NLA_NESTED },
+ [NL802154_ATTR_SEC_DEVICE] = { .type = NLA_NESTED },
+ [NL802154_ATTR_SEC_DEVKEY] = { .type = NLA_NESTED },
+ [NL802154_ATTR_SEC_KEY] = { .type = NLA_NESTED },
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
};
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+static int
+nl802154_prepare_wpan_dev_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct cfg802154_registered_device **rdev,
+ struct wpan_dev **wpan_dev)
+{
+ int err;
+
+ rtnl_lock();
+
+ if (!cb->args[0]) {
+ err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize,
+ nl802154_fam.attrbuf, nl802154_fam.maxattr,
+ nl802154_policy);
+ if (err)
+ goto out_unlock;
+
+ *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk),
+ nl802154_fam.attrbuf);
+ if (IS_ERR(*wpan_dev)) {
+ err = PTR_ERR(*wpan_dev);
+ goto out_unlock;
+ }
+ *rdev = wpan_phy_to_rdev((*wpan_dev)->wpan_phy);
+ /* 0 is the first index - add 1 to parse only once */
+ cb->args[0] = (*rdev)->wpan_phy_idx + 1;
+ cb->args[1] = (*wpan_dev)->identifier;
+ } else {
+ /* subtract the 1 again here */
+ struct wpan_phy *wpan_phy = wpan_phy_idx_to_wpan_phy(cb->args[0] - 1);
+ struct wpan_dev *tmp;
+
+ if (!wpan_phy) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ *rdev = wpan_phy_to_rdev(wpan_phy);
+ *wpan_dev = NULL;
+
+ list_for_each_entry(tmp, &(*rdev)->wpan_dev_list, list) {
+ if (tmp->identifier == cb->args[1]) {
+ *wpan_dev = tmp;
+ break;
+ }
+ }
+
+ if (!*wpan_dev) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ return 0;
+ out_unlock:
+ rtnl_unlock();
+ return err;
+}
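
The dump helper above uses the usual netlink-dump trick of stashing state in cb->args across invocations: args[0] stores the phy index plus one (so a stored zero still means "nothing parsed yet") and args[1] stores the wpan_dev identifier. A hedged userspace sketch of the same idiom; the struct and function names are illustrative, not the kernel API:

#include <stdio.h>

/* Illustrative stand-in for netlink_callback: args persist across calls. */
struct dump_cb {
	long args[2];
};

static void prepare_dump(struct dump_cb *cb, int phy_idx, int dev_id)
{
	if (!cb->args[0]) {
		/* first invocation: remember where we are; +1 so that
		 * index 0 is distinguishable from "nothing stored yet"
		 */
		cb->args[0] = phy_idx + 1;
		cb->args[1] = dev_id;
	} else {
		/* later invocations: subtract the 1 again */
		phy_idx = cb->args[0] - 1;
		dev_id = cb->args[1];
	}
	printf("dumping phy %d, dev %d\n", phy_idx, dev_id);
}

int main(void)
{
	struct dump_cb cb = { { 0, 0 } };

	prepare_dump(&cb, 0, 7);   /* first pass stores 0 + 1 and 7 */
	prepare_dump(&cb, -1, -1); /* second pass recovers phy 0, dev 7 */
	return 0;
}
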
+
+static void
+nl802154_finish_wpan_dev_dump(struct cfg802154_registered_device *rdev)
+{
+ rtnl_unlock();
+}
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
/* message building helper */
static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
int flags, u8 cmd)
@@ -236,6 +321,28 @@ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
}
static int
+nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask)
+{
+ struct nlattr *nl_flags = nla_nest_start(msg, attr);
+ int i;
+
+ if (!nl_flags)
+ return -ENOBUFS;
+
+ i = 0;
+ while (mask) {
+ if ((mask & 1) && nla_put_flag(msg, i))
+ return -ENOBUFS;
+
+ mask >>= 1;
+ i++;
+ }
+
+ nla_nest_end(msg, nl_flags);
+ return 0;
+}
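
nl802154_put_flags() above walks the mask bit by bit and emits one flag attribute per set bit, using the bit position as the attribute type. A minimal userspace sketch of the same bit walk, printing instead of calling nla_put_flag() (purely illustrative):

#include <stdio.h>

/* Emit one "flag" per set bit, keyed by its bit position. */
static void put_flags(unsigned int mask)
{
	int i = 0;

	while (mask) {
		if (mask & 1)
			printf("flag attribute %d\n", i);
		mask >>= 1;
		i++;
	}
}

int main(void)
{
	put_flags(0x15); /* bits 0, 2 and 4 -> flag attributes 0, 2, 4 */
	return 0;
}
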
+
+static int
nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
struct sk_buff *msg)
{
@@ -248,7 +355,7 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
for (page = 0; page <= IEEE802154_MAX_PAGE; page++) {
if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL,
- rdev->wpan_phy.channels_supported[page]))
+ rdev->wpan_phy.supported.channels[page]))
return -ENOBUFS;
}
nla_nest_end(msg, nl_page);
@@ -256,12 +363,100 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
return 0;
}
+static int
+nl802154_put_capabilities(struct sk_buff *msg,
+ struct cfg802154_registered_device *rdev)
+{
+ const struct wpan_phy_supported *caps = &rdev->wpan_phy.supported;
+ struct nlattr *nl_caps, *nl_channels;
+ int i;
+
+ nl_caps = nla_nest_start(msg, NL802154_ATTR_WPAN_PHY_CAPS);
+ if (!nl_caps)
+ return -ENOBUFS;
+
+ nl_channels = nla_nest_start(msg, NL802154_CAP_ATTR_CHANNELS);
+ if (!nl_channels)
+ return -ENOBUFS;
+
+ for (i = 0; i <= IEEE802154_MAX_PAGE; i++) {
+ if (caps->channels[i]) {
+ if (nl802154_put_flags(msg, i, caps->channels[i]))
+ return -ENOBUFS;
+ }
+ }
+
+ nla_nest_end(msg, nl_channels);
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) {
+ struct nlattr *nl_ed_lvls;
+
+ nl_ed_lvls = nla_nest_start(msg,
+ NL802154_CAP_ATTR_CCA_ED_LEVELS);
+ if (!nl_ed_lvls)
+ return -ENOBUFS;
+
+ for (i = 0; i < caps->cca_ed_levels_size; i++) {
+ if (nla_put_s32(msg, i, caps->cca_ed_levels[i]))
+ return -ENOBUFS;
+ }
+
+ nla_nest_end(msg, nl_ed_lvls);
+ }
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) {
+ struct nlattr *nl_tx_pwrs;
+
+ nl_tx_pwrs = nla_nest_start(msg, NL802154_CAP_ATTR_TX_POWERS);
+ if (!nl_tx_pwrs)
+ return -ENOBUFS;
+
+ for (i = 0; i < caps->tx_powers_size; i++) {
+ if (nla_put_s32(msg, i, caps->tx_powers[i]))
+ return -ENOBUFS;
+ }
+
+ nla_nest_end(msg, nl_tx_pwrs);
+ }
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) {
+ if (nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_MODES,
+ caps->cca_modes) ||
+ nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_OPTS,
+ caps->cca_opts))
+ return -ENOBUFS;
+ }
+
+ if (nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MINBE, caps->min_minbe) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MINBE, caps->max_minbe) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MAXBE, caps->min_maxbe) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MAXBE, caps->max_maxbe) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS,
+ caps->min_csma_backoffs) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS,
+ caps->max_csma_backoffs) ||
+ nla_put_s8(msg, NL802154_CAP_ATTR_MIN_FRAME_RETRIES,
+ caps->min_frame_retries) ||
+ nla_put_s8(msg, NL802154_CAP_ATTR_MAX_FRAME_RETRIES,
+ caps->max_frame_retries) ||
+ nl802154_put_flags(msg, NL802154_CAP_ATTR_IFTYPES,
+ caps->iftypes) ||
+ nla_put_u32(msg, NL802154_CAP_ATTR_LBT, caps->lbt))
+ return -ENOBUFS;
+
+ nla_nest_end(msg, nl_caps);
+
+ return 0;
+}
+
static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
enum nl802154_commands cmd,
struct sk_buff *msg, u32 portid, u32 seq,
int flags)
{
+ struct nlattr *nl_cmds;
void *hdr;
+ int i;
hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
if (!hdr)
@@ -286,25 +481,77 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
rdev->wpan_phy.current_channel))
goto nla_put_failure;
- /* supported channels array */
+ /* TODO remove this behaviour; keep supporting it for a while so
+ * users can migrate to the new one.
+ */
if (nl802154_send_wpan_phy_channels(rdev, msg))
goto nla_put_failure;
/* cca mode */
- if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE,
- rdev->wpan_phy.cca.mode))
- goto nla_put_failure;
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) {
+ if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE,
+ rdev->wpan_phy.cca.mode))
+ goto nla_put_failure;
+
+ if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) {
+ if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT,
+ rdev->wpan_phy.cca.opt))
+ goto nla_put_failure;
+ }
+ }
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) {
+ if (nla_put_s32(msg, NL802154_ATTR_TX_POWER,
+ rdev->wpan_phy.transmit_power))
+ goto nla_put_failure;
+ }
- if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) {
- if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT,
- rdev->wpan_phy.cca.opt))
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) {
+ if (nla_put_s32(msg, NL802154_ATTR_CCA_ED_LEVEL,
+ rdev->wpan_phy.cca_ed_level))
goto nla_put_failure;
}
- if (nla_put_s8(msg, NL802154_ATTR_TX_POWER,
- rdev->wpan_phy.transmit_power))
+ if (nl802154_put_capabilities(msg, rdev))
+ goto nla_put_failure;
+
+ nl_cmds = nla_nest_start(msg, NL802154_ATTR_SUPPORTED_COMMANDS);
+ if (!nl_cmds)
goto nla_put_failure;
+ i = 0;
+#define CMD(op, n) \
+ do { \
+ if (rdev->ops->op) { \
+ i++; \
+ if (nla_put_u32(msg, i, NL802154_CMD_ ## n)) \
+ goto nla_put_failure; \
+ } \
+ } while (0)
+
+ CMD(add_virtual_intf, NEW_INTERFACE);
+ CMD(del_virtual_intf, DEL_INTERFACE);
+ CMD(set_channel, SET_CHANNEL);
+ CMD(set_pan_id, SET_PAN_ID);
+ CMD(set_short_addr, SET_SHORT_ADDR);
+ CMD(set_backoff_exponent, SET_BACKOFF_EXPONENT);
+ CMD(set_max_csma_backoffs, SET_MAX_CSMA_BACKOFFS);
+ CMD(set_max_frame_retries, SET_MAX_FRAME_RETRIES);
+ CMD(set_lbt_mode, SET_LBT_MODE);
+ CMD(set_ackreq_default, SET_ACKREQ_DEFAULT);
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER)
+ CMD(set_tx_power, SET_TX_POWER);
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL)
+ CMD(set_cca_ed_level, SET_CCA_ED_LEVEL);
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE)
+ CMD(set_cca_mode, SET_CCA_MODE);
+
+#undef CMD
+ nla_nest_end(msg, nl_cmds);
+
finish:
genlmsg_end(msg, hdr);
return 0;
@@ -443,6 +690,107 @@ static inline u64 wpan_dev_id(struct wpan_dev *wpan_dev)
((u64)wpan_phy_to_rdev(wpan_dev->wpan_phy)->wpan_phy_idx << 32);
}
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+#include <net/ieee802154_netdev.h>
+
+static int
+ieee802154_llsec_send_key_id(struct sk_buff *msg,
+ const struct ieee802154_llsec_key_id *desc)
+{
+ struct nlattr *nl_dev_addr;
+
+ if (nla_put_u32(msg, NL802154_KEY_ID_ATTR_MODE, desc->mode))
+ return -ENOBUFS;
+
+ switch (desc->mode) {
+ case NL802154_KEY_ID_MODE_IMPLICIT:
+ nl_dev_addr = nla_nest_start(msg, NL802154_KEY_ID_ATTR_IMPLICIT);
+ if (!nl_dev_addr)
+ return -ENOBUFS;
+
+ if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_PAN_ID,
+ desc->device_addr.pan_id) ||
+ nla_put_u32(msg, NL802154_DEV_ADDR_ATTR_MODE,
+ desc->device_addr.mode))
+ return -ENOBUFS;
+
+ switch (desc->device_addr.mode) {
+ case NL802154_DEV_ADDR_SHORT:
+ if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_SHORT,
+ desc->device_addr.short_addr))
+ return -ENOBUFS;
+ break;
+ case NL802154_DEV_ADDR_EXTENDED:
+ if (nla_put_le64(msg, NL802154_DEV_ADDR_ATTR_EXTENDED,
+ desc->device_addr.extended_addr))
+ return -ENOBUFS;
+ break;
+ default:
+ /* userspace should handle unknown */
+ break;
+ }
+
+ nla_nest_end(msg, nl_dev_addr);
+ break;
+ case NL802154_KEY_ID_MODE_INDEX:
+ break;
+ case NL802154_KEY_ID_MODE_INDEX_SHORT:
+ /* TODO rename short_source? */
+ if (nla_put_le32(msg, NL802154_KEY_ID_ATTR_SOURCE_SHORT,
+ desc->short_source))
+ return -ENOBUFS;
+ break;
+ case NL802154_KEY_ID_MODE_INDEX_EXTENDED:
+ if (nla_put_le64(msg, NL802154_KEY_ID_ATTR_SOURCE_EXTENDED,
+ desc->extended_source))
+ return -ENOBUFS;
+ break;
+ default:
+ /* userspace should handle unknown */
+ break;
+ }
+
+ /* TODO key_id to key_idx ? Check naming */
+ if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) {
+ if (nla_put_u8(msg, NL802154_KEY_ID_ATTR_INDEX, desc->id))
+ return -ENOBUFS;
+ }
+
+ return 0;
+}
+
+static int nl802154_get_llsec_params(struct sk_buff *msg,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ struct nlattr *nl_key_id;
+ struct ieee802154_llsec_params params;
+ int ret;
+
+ ret = rdev_get_llsec_params(rdev, wpan_dev, &params);
+ if (ret < 0)
+ return ret;
+
+ if (nla_put_u8(msg, NL802154_ATTR_SEC_ENABLED, params.enabled) ||
+ nla_put_u32(msg, NL802154_ATTR_SEC_OUT_LEVEL, params.out_level) ||
+ nla_put_be32(msg, NL802154_ATTR_SEC_FRAME_COUNTER,
+ params.frame_counter))
+ return -ENOBUFS;
+
+ nl_key_id = nla_nest_start(msg, NL802154_ATTR_SEC_OUT_KEY_ID);
+ if (!nl_key_id)
+ return -ENOBUFS;
+
+ ret = ieee802154_llsec_send_key_id(msg, &params.out_key);
+ if (ret < 0)
+ return ret;
+
+ nla_nest_end(msg, nl_key_id);
+
+ return 0;
+}
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
static int
nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
struct cfg802154_registered_device *rdev,
@@ -490,6 +838,15 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt))
goto nla_put_failure;
+ /* ackreq default behaviour */
+ if (nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, wpan_dev->ackreq))
+ goto nla_put_failure;
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+ if (nl802154_get_llsec_params(msg, rdev, wpan_dev) < 0)
+ goto nla_put_failure;
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
genlmsg_end(msg, hdr);
return 0;
@@ -575,14 +932,13 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL802154_ATTR_IFTYPE]) {
type = nla_get_u32(info->attrs[NL802154_ATTR_IFTYPE]);
- if (type > NL802154_IFTYPE_MAX)
+ if (type > NL802154_IFTYPE_MAX ||
+ !(rdev->wpan_phy.supported.iftypes & BIT(type)))
return -EINVAL;
}
- /* TODO add nla_get_le64 to netlink */
if (info->attrs[NL802154_ATTR_EXTENDED_ADDR])
- extended_addr = (__force __le64)nla_get_u64(
- info->attrs[NL802154_ATTR_EXTENDED_ADDR]);
+ extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]);
if (!rdev->ops->add_virtual_intf)
return -EOPNOTSUPP;
@@ -625,7 +981,8 @@ static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info)
channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]);
/* check 802.15.4 constraints */
- if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL)
+ if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL ||
+ !(rdev->wpan_phy.supported.channels[page] & BIT(channel)))
return -EINVAL;
return rdev_set_channel(rdev, page, channel);
@@ -636,12 +993,17 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info)
struct cfg802154_registered_device *rdev = info->user_ptr[0];
struct wpan_phy_cca cca;
+ if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE))
+ return -EOPNOTSUPP;
+
if (!info->attrs[NL802154_ATTR_CCA_MODE])
return -EINVAL;
cca.mode = nla_get_u32(info->attrs[NL802154_ATTR_CCA_MODE]);
/* checking 802.15.4 constraints */
- if (cca.mode < NL802154_CCA_ENERGY || cca.mode > NL802154_CCA_ATTR_MAX)
+ if (cca.mode < NL802154_CCA_ENERGY ||
+ cca.mode > NL802154_CCA_ATTR_MAX ||
+ !(rdev->wpan_phy.supported.cca_modes & BIT(cca.mode)))
return -EINVAL;
if (cca.mode == NL802154_CCA_ENERGY_CARRIER) {
@@ -649,13 +1011,58 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
cca.opt = nla_get_u32(info->attrs[NL802154_ATTR_CCA_OPT]);
- if (cca.opt > NL802154_CCA_OPT_ATTR_MAX)
+ if (cca.opt > NL802154_CCA_OPT_ATTR_MAX ||
+ !(rdev->wpan_phy.supported.cca_opts & BIT(cca.opt)))
return -EINVAL;
}
return rdev_set_cca_mode(rdev, &cca);
}
+static int nl802154_set_cca_ed_level(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ s32 ed_level;
+ int i;
+
+ if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL))
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_CCA_ED_LEVEL])
+ return -EINVAL;
+
+ ed_level = nla_get_s32(info->attrs[NL802154_ATTR_CCA_ED_LEVEL]);
+
+ for (i = 0; i < rdev->wpan_phy.supported.cca_ed_levels_size; i++) {
+ if (ed_level == rdev->wpan_phy.supported.cca_ed_levels[i])
+ return rdev_set_cca_ed_level(rdev, ed_level);
+ }
+
+ return -EINVAL;
+}
+
+static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ s32 power;
+ int i;
+
+ if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER))
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_TX_POWER])
+ return -EINVAL;
+
+ power = nla_get_s32(info->attrs[NL802154_ATTR_TX_POWER]);
+
+ for (i = 0; i < rdev->wpan_phy.supported.tx_powers_size; i++) {
+ if (power == rdev->wpan_phy.supported.tx_powers[i])
+ return rdev_set_tx_power(rdev, power);
+ }
+
+ return -EINVAL;
+}
+
static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info)
{
struct cfg802154_registered_device *rdev = info->user_ptr[0];
@@ -668,14 +1075,22 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info)
return -EBUSY;
/* don't change address fields on monitor */
- if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
- return -EINVAL;
-
- if (!info->attrs[NL802154_ATTR_PAN_ID])
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR ||
+ !info->attrs[NL802154_ATTR_PAN_ID])
return -EINVAL;
pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]);
+ /* TODO
+ * Not sure whether we should reject the broadcast pan_id here.
+ * Broadcast is a valid setting; comment from 802.15.4:
+ * "If this value is 0xffff, the device is not associated."
+ *
+ * Allowing it could be a simple way to disassociate a device.
+ */
+ if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST))
+ return -EINVAL;
+
return rdev_set_pan_id(rdev, wpan_dev, pan_id);
}
@@ -691,14 +1106,27 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info)
return -EBUSY;
/* don't change address fields on monitor */
- if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
- return -EINVAL;
-
- if (!info->attrs[NL802154_ATTR_SHORT_ADDR])
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR ||
+ !info->attrs[NL802154_ATTR_SHORT_ADDR])
return -EINVAL;
short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]);
+ /* TODO
+ * Not sure whether we should reject the broadcast short_addr here.
+ * Broadcast is a valid setting; comment from 802.15.4:
+ * "A value of 0xfffe indicates that the device has
+ * associated but has not been allocated an address. A
+ * value of 0xffff indicates that the device does not
+ * have a short address."
+ *
+ * We should probably allow these values to be set, but
+ * disallow socket communication with them.
+ */
+ if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) ||
+ short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST))
+ return -EINVAL;
+
return rdev_set_short_addr(rdev, wpan_dev, short_addr);
}
@@ -722,7 +1150,11 @@ nl802154_set_backoff_exponent(struct sk_buff *skb, struct genl_info *info)
max_be = nla_get_u8(info->attrs[NL802154_ATTR_MAX_BE]);
/* check 802.15.4 constraints */
- if (max_be < 3 || max_be > 8 || min_be > max_be)
+ if (min_be < rdev->wpan_phy.supported.min_minbe ||
+ min_be > rdev->wpan_phy.supported.max_minbe ||
+ max_be < rdev->wpan_phy.supported.min_maxbe ||
+ max_be > rdev->wpan_phy.supported.max_maxbe ||
+ min_be > max_be)
return -EINVAL;
return rdev_set_backoff_exponent(rdev, wpan_dev, min_be, max_be);
@@ -747,7 +1179,8 @@ nl802154_set_max_csma_backoffs(struct sk_buff *skb, struct genl_info *info)
info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]);
/* check 802.15.4 constraints */
- if (max_csma_backoffs > 5)
+ if (max_csma_backoffs < rdev->wpan_phy.supported.min_csma_backoffs ||
+ max_csma_backoffs > rdev->wpan_phy.supported.max_csma_backoffs)
return -EINVAL;
return rdev_set_max_csma_backoffs(rdev, wpan_dev, max_csma_backoffs);
@@ -771,7 +1204,8 @@ nl802154_set_max_frame_retries(struct sk_buff *skb, struct genl_info *info)
info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]);
/* check 802.15.4 constraints */
- if (max_frame_retries < -1 || max_frame_retries > 7)
+ if (max_frame_retries < rdev->wpan_phy.supported.min_frame_retries ||
+ max_frame_retries > rdev->wpan_phy.supported.max_frame_retries)
return -EINVAL;
return rdev_set_max_frame_retries(rdev, wpan_dev, max_frame_retries);
@@ -782,7 +1216,7 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info)
struct cfg802154_registered_device *rdev = info->user_ptr[0];
struct net_device *dev = info->user_ptr[1];
struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
- bool mode;
+ int mode;
if (netif_running(dev))
return -EBUSY;
@@ -790,10 +1224,871 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL802154_ATTR_LBT_MODE])
return -EINVAL;
- mode = !!nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]);
+ mode = nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]);
+
+ if (mode != 0 && mode != 1)
+ return -EINVAL;
+
+ if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt))
+ return -EINVAL;
+
return rdev_set_lbt_mode(rdev, wpan_dev, mode);
}
+static int
+nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ int ackreq;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_ACKREQ_DEFAULT])
+ return -EINVAL;
+
+ ackreq = nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]);
+
+ if (ackreq != 0 && ackreq != 1)
+ return -EINVAL;
+
+ return rdev_set_ackreq_default(rdev, wpan_dev, ackreq);
+}
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = {
+ [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 },
+ [NL802154_DEV_ADDR_ATTR_MODE] = { .type = NLA_U32 },
+ [NL802154_DEV_ADDR_ATTR_SHORT] = { .type = NLA_U16 },
+ [NL802154_DEV_ADDR_ATTR_EXTENDED] = { .type = NLA_U64 },
+};
+
+static int
+ieee802154_llsec_parse_dev_addr(struct nlattr *nla,
+ struct ieee802154_addr *addr)
+{
+ struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1];
+
+ if (!nla || nla_parse_nested(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla,
+ nl802154_dev_addr_policy))
+ return -EINVAL;
+
+ if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] &&
+ !attrs[NL802154_DEV_ADDR_ATTR_MODE] &&
+ !(attrs[NL802154_DEV_ADDR_ATTR_SHORT] ||
+ attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]))
+ return -EINVAL;
+
+ addr->pan_id = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_PAN_ID]);
+ addr->mode = nla_get_u32(attrs[NL802154_DEV_ADDR_ATTR_MODE]);
+ switch (addr->mode) {
+ case NL802154_DEV_ADDR_SHORT:
+ addr->short_addr = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_SHORT]);
+ break;
+ case NL802154_DEV_ADDR_EXTENDED:
+ addr->extended_addr = nla_get_le64(attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy nl802154_key_id_policy[NL802154_KEY_ID_ATTR_MAX + 1] = {
+ [NL802154_KEY_ID_ATTR_MODE] = { .type = NLA_U32 },
+ [NL802154_KEY_ID_ATTR_INDEX] = { .type = NLA_U8 },
+ [NL802154_KEY_ID_ATTR_IMPLICIT] = { .type = NLA_NESTED },
+ [NL802154_KEY_ID_ATTR_SOURCE_SHORT] = { .type = NLA_U32 },
+ [NL802154_KEY_ID_ATTR_SOURCE_EXTENDED] = { .type = NLA_U64 },
+};
+
+static int
+ieee802154_llsec_parse_key_id(struct nlattr *nla,
+ struct ieee802154_llsec_key_id *desc)
+{
+ struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1];
+
+ if (!nla || nla_parse_nested(attrs, NL802154_KEY_ID_ATTR_MAX, nla,
+ nl802154_key_id_policy))
+ return -EINVAL;
+
+ if (!attrs[NL802154_KEY_ID_ATTR_MODE])
+ return -EINVAL;
+
+ desc->mode = nla_get_u32(attrs[NL802154_KEY_ID_ATTR_MODE]);
+ switch (desc->mode) {
+ case NL802154_KEY_ID_MODE_IMPLICIT:
+ if (!attrs[NL802154_KEY_ID_ATTR_IMPLICIT])
+ return -EINVAL;
+
+ if (ieee802154_llsec_parse_dev_addr(attrs[NL802154_KEY_ID_ATTR_IMPLICIT],
+ &desc->device_addr) < 0)
+ return -EINVAL;
+ break;
+ case NL802154_KEY_ID_MODE_INDEX:
+ break;
+ case NL802154_KEY_ID_MODE_INDEX_SHORT:
+ if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT])
+ return -EINVAL;
+
+ desc->short_source = nla_get_le32(attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]);
+ break;
+ case NL802154_KEY_ID_MODE_INDEX_EXTENDED:
+ if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED])
+ return -EINVAL;
+
+ desc->extended_source = nla_get_le64(attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) {
+ if (!attrs[NL802154_KEY_ID_ATTR_INDEX])
+ return -EINVAL;
+
+ /* TODO change id to idx */
+ desc->id = nla_get_u8(attrs[NL802154_KEY_ID_ATTR_INDEX]);
+ }
+
+ return 0;
+}
+
+static int nl802154_set_llsec_params(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct ieee802154_llsec_params params;
+ u32 changed = 0;
+ int ret;
+
+ if (info->attrs[NL802154_ATTR_SEC_ENABLED]) {
+ u8 enabled;
+
+ enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]);
+ if (enabled != 0 && enabled != 1)
+ return -EINVAL;
+
+ params.enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]);
+ changed |= IEEE802154_LLSEC_PARAM_ENABLED;
+ }
+
+ if (info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID]) {
+ ret = ieee802154_llsec_parse_key_id(info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID],
+ &params.out_key);
+ if (ret < 0)
+ return ret;
+
+ changed |= IEEE802154_LLSEC_PARAM_OUT_KEY;
+ }
+
+ if (info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]) {
+ params.out_level = nla_get_u32(info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]);
+ if (params.out_level > NL802154_SECLEVEL_MAX)
+ return -EINVAL;
+
+ changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL;
+ }
+
+ if (info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]) {
+ params.frame_counter = nla_get_be32(info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]);
+ changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER;
+ }
+
+ return rdev_set_llsec_params(rdev, wpan_dev, &params, changed);
+}
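
The handler above only forwards the attributes that were actually present, recorded in the changed bitmask. A hypothetical driver-side callback consuming that mask might look like the sketch below; foo_priv, foo_priv_from_phy() and the exact prototype are illustrative assumptions, not part of this patch.

	/* Hypothetical driver callback in the shape rdev_set_llsec_params()
	 * dispatches to; foo_priv/foo_priv_from_phy() are illustrative only.
	 */
	static int foo_set_llsec_params(struct wpan_phy *wpan_phy,
					struct wpan_dev *wpan_dev,
					const struct ieee802154_llsec_params *params,
					int changed)
	{
		struct foo_priv *priv = foo_priv_from_phy(wpan_phy);

		/* only the fields flagged in "changed" carry new data */
		if (changed & IEEE802154_LLSEC_PARAM_ENABLED)
			priv->sec_enabled = params->enabled;
		if (changed & IEEE802154_LLSEC_PARAM_OUT_KEY)
			priv->out_key = params->out_key;
		if (changed & IEEE802154_LLSEC_PARAM_OUT_LEVEL)
			priv->out_level = params->out_level;
		if (changed & IEEE802154_LLSEC_PARAM_FRAME_COUNTER)
			priv->frame_counter = params->frame_counter;

		return 0;
	}
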
+
+static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct net_device *dev,
+ const struct ieee802154_llsec_key_entry *key)
+{
+ void *hdr;
+ u32 commands[NL802154_CMD_FRAME_NR_IDS / 32];
+ struct nlattr *nl_key, *nl_key_id;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -1;
+
+ if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ nl_key = nla_nest_start(msg, NL802154_ATTR_SEC_KEY);
+ if (!nl_key)
+ goto nla_put_failure;
+
+ nl_key_id = nla_nest_start(msg, NL802154_KEY_ATTR_ID);
+ if (!nl_key_id)
+ goto nla_put_failure;
+
+ if (ieee802154_llsec_send_key_id(msg, &key->id) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_key_id);
+
+ if (nla_put_u8(msg, NL802154_KEY_ATTR_USAGE_FRAMES,
+ key->key->frame_types))
+ goto nla_put_failure;
+
+ if (key->key->frame_types & BIT(NL802154_FRAME_CMD)) {
+ /* TODO for each nested */
+ memset(commands, 0, sizeof(commands));
+ commands[7] = key->key->cmd_frame_ids;
+ if (nla_put(msg, NL802154_KEY_ATTR_USAGE_CMDS,
+ sizeof(commands), commands))
+ goto nla_put_failure;
+ }
+
+ if (nla_put(msg, NL802154_KEY_ATTR_BYTES, NL802154_KEY_SIZE,
+ key->key->key))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_key);
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_llsec_key(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev = NULL;
+ struct ieee802154_llsec_key_entry *key;
+ struct ieee802154_llsec_table *table;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ if (!wpan_dev->netdev) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rdev_lock_llsec_table(rdev, wpan_dev);
+ rdev_get_llsec_table(rdev, wpan_dev, &table);
+
+ /* TODO make it like station dump */
+ if (cb->args[2])
+ goto out;
+
+ list_for_each_entry(key, &table->keys, list) {
+ if (nl802154_send_key(skb, NL802154_CMD_NEW_SEC_KEY,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wpan_dev->netdev, key) < 0) {
+ /* TODO */
+ err = -EIO;
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ goto out_err;
+ }
+ }
+
+ cb->args[2] = 1;
+
+out:
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ err = skb->len;
+out_err:
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
+static const struct nla_policy nl802154_key_policy[NL802154_KEY_ATTR_MAX + 1] = {
+ [NL802154_KEY_ATTR_ID] = { NLA_NESTED },
+ /* TODO handle it as for_each_nested and NLA_FLAG? */
+ [NL802154_KEY_ATTR_USAGE_FRAMES] = { NLA_U8 },
+ /* TODO handle it as for_each_nested, not static array? */
+ [NL802154_KEY_ATTR_USAGE_CMDS] = { .len = NL802154_CMD_FRAME_NR_IDS / 8 },
+ [NL802154_KEY_ATTR_BYTES] = { .len = NL802154_KEY_SIZE },
+};
+
+static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1];
+ struct ieee802154_llsec_key key = { };
+ struct ieee802154_llsec_key_id id = { };
+ u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { };
+
+ if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX,
+ info->attrs[NL802154_ATTR_SEC_KEY],
+ nl802154_key_policy))
+ return -EINVAL;
+
+ if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] ||
+ !attrs[NL802154_KEY_ATTR_BYTES])
+ return -EINVAL;
+
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0)
+ return -ENOBUFS;
+
+ key.frame_types = nla_get_u8(attrs[NL802154_KEY_ATTR_USAGE_FRAMES]);
+ if (key.frame_types > BIT(NL802154_FRAME_MAX) ||
+ ((key.frame_types & BIT(NL802154_FRAME_CMD)) &&
+ !attrs[NL802154_KEY_ATTR_USAGE_CMDS]))
+ return -EINVAL;
+
+ if (attrs[NL802154_KEY_ATTR_USAGE_CMDS]) {
+ /* TODO for each nested */
+ nla_memcpy(commands, attrs[NL802154_KEY_ATTR_USAGE_CMDS],
+ NL802154_CMD_FRAME_NR_IDS / 8);
+
+ /* TODO understand the -EINVAL logic here? last condition */
+ if (commands[0] || commands[1] || commands[2] || commands[3] ||
+ commands[4] || commands[5] || commands[6] ||
+ commands[7] > BIT(NL802154_CMD_FRAME_MAX))
+ return -EINVAL;
+
+ key.cmd_frame_ids = commands[7];
+ } else {
+ key.cmd_frame_ids = 0;
+ }
+
+ nla_memcpy(key.key, attrs[NL802154_KEY_ATTR_BYTES], NL802154_KEY_SIZE);
+
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0)
+ return -ENOBUFS;
+
+ return rdev_add_llsec_key(rdev, wpan_dev, &id, &key);
+}
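
For orientation, the nested layout that nl802154_add_llsec_key() parses above can be built from userspace with libnl-3 roughly as sketched below. The msg, family and ifindex values are assumed to be set up elsewhere, the NL802154_* constants are assumed to come from a userspace copy of nl802154.h (as wpan-tools ships), NL802154_FRAME_DATA is assumed to be the data-frame entry of the frame-type enum, and error handling is trimmed.

	/* Userspace sketch (libnl-3): key index 0, usable for data frames. */
	static int build_new_sec_key(struct nl_msg *msg, int family, int ifindex,
				     const uint8_t key_bytes[NL802154_KEY_SIZE])
	{
		struct nlattr *nl_key, *nl_key_id;

		if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
				 NL802154_CMD_NEW_SEC_KEY, 0))
			return -1;

		nla_put_u32(msg, NL802154_ATTR_IFINDEX, ifindex);

		nl_key = nla_nest_start(msg, NL802154_ATTR_SEC_KEY);
		nl_key_id = nla_nest_start(msg, NL802154_KEY_ATTR_ID);
		nla_put_u32(msg, NL802154_KEY_ID_ATTR_MODE, NL802154_KEY_ID_MODE_INDEX);
		nla_put_u8(msg, NL802154_KEY_ID_ATTR_INDEX, 0);
		nla_nest_end(msg, nl_key_id);

		nla_put_u8(msg, NL802154_KEY_ATTR_USAGE_FRAMES,
			   1 << NL802154_FRAME_DATA);
		nla_put(msg, NL802154_KEY_ATTR_BYTES, NL802154_KEY_SIZE, key_bytes);
		nla_nest_end(msg, nl_key);

		return 0;
	}
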
+
+static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1];
+ struct ieee802154_llsec_key_id id;
+
+ if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX,
+ info->attrs[NL802154_ATTR_SEC_KEY],
+ nl802154_key_policy))
+ return -EINVAL;
+
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0)
+ return -ENOBUFS;
+
+ return rdev_del_llsec_key(rdev, wpan_dev, &id);
+}
+
+static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct net_device *dev,
+ const struct ieee802154_llsec_device *dev_desc)
+{
+ void *hdr;
+ struct nlattr *nl_device;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -1;
+
+ if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ nl_device = nla_nest_start(msg, NL802154_ATTR_SEC_DEVICE);
+ if (!nl_device)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL802154_DEV_ATTR_FRAME_COUNTER,
+ dev_desc->frame_counter) ||
+ nla_put_le16(msg, NL802154_DEV_ATTR_PAN_ID, dev_desc->pan_id) ||
+ nla_put_le16(msg, NL802154_DEV_ATTR_SHORT_ADDR,
+ dev_desc->short_addr) ||
+ nla_put_le64(msg, NL802154_DEV_ATTR_EXTENDED_ADDR,
+ dev_desc->hwaddr) ||
+ nla_put_u8(msg, NL802154_DEV_ATTR_SECLEVEL_EXEMPT,
+ dev_desc->seclevel_exempt) ||
+ nla_put_u32(msg, NL802154_DEV_ATTR_KEY_MODE, dev_desc->key_mode))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_device);
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_llsec_dev(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev = NULL;
+ struct ieee802154_llsec_device *dev;
+ struct ieee802154_llsec_table *table;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ if (!wpan_dev->netdev) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rdev_lock_llsec_table(rdev, wpan_dev);
+ rdev_get_llsec_table(rdev, wpan_dev, &table);
+
+ /* TODO make it like station dump */
+ if (cb->args[2])
+ goto out;
+
+ list_for_each_entry(dev, &table->devices, list) {
+ if (nl802154_send_device(skb, NL802154_CMD_NEW_SEC_LEVEL,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wpan_dev->netdev, dev) < 0) {
+ /* TODO */
+ err = -EIO;
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ goto out_err;
+ }
+ }
+
+ cb->args[2] = 1;
+
+out:
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ err = skb->len;
+out_err:
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
+static const struct nla_policy nl802154_dev_policy[NL802154_DEV_ATTR_MAX + 1] = {
+ [NL802154_DEV_ATTR_FRAME_COUNTER] = { NLA_U32 },
+ [NL802154_DEV_ATTR_PAN_ID] = { .type = NLA_U16 },
+ [NL802154_DEV_ATTR_SHORT_ADDR] = { .type = NLA_U16 },
+ [NL802154_DEV_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 },
+ [NL802154_DEV_ATTR_SECLEVEL_EXEMPT] = { NLA_U8 },
+ [NL802154_DEV_ATTR_KEY_MODE] = { NLA_U32 },
+};
+
+static int
+ieee802154_llsec_parse_device(struct nlattr *nla,
+ struct ieee802154_llsec_device *dev)
+{
+ struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1];
+
+ if (!nla || nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, nla,
+ nl802154_dev_policy))
+ return -EINVAL;
+
+ memset(dev, 0, sizeof(*dev));
+
+ if (!attrs[NL802154_DEV_ATTR_FRAME_COUNTER] ||
+ !attrs[NL802154_DEV_ATTR_PAN_ID] ||
+ !attrs[NL802154_DEV_ATTR_SHORT_ADDR] ||
+ !attrs[NL802154_DEV_ATTR_EXTENDED_ADDR] ||
+ !attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT] ||
+ !attrs[NL802154_DEV_ATTR_KEY_MODE])
+ return -EINVAL;
+
+ /* TODO be32 */
+ dev->frame_counter = nla_get_u32(attrs[NL802154_DEV_ATTR_FRAME_COUNTER]);
+ dev->pan_id = nla_get_le16(attrs[NL802154_DEV_ATTR_PAN_ID]);
+ dev->short_addr = nla_get_le16(attrs[NL802154_DEV_ATTR_SHORT_ADDR]);
+ /* TODO rename hwaddr to extended_addr */
+ dev->hwaddr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]);
+ dev->seclevel_exempt = nla_get_u8(attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT]);
+ dev->key_mode = nla_get_u32(attrs[NL802154_DEV_ATTR_KEY_MODE]);
+
+ if (dev->key_mode > NL802154_DEVKEY_MAX ||
+ (dev->seclevel_exempt != 0 && dev->seclevel_exempt != 1))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nl802154_add_llsec_dev(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct ieee802154_llsec_device dev_desc;
+
+ if (ieee802154_llsec_parse_device(info->attrs[NL802154_ATTR_SEC_DEVICE],
+ &dev_desc) < 0)
+ return -EINVAL;
+
+ return rdev_add_device(rdev, wpan_dev, &dev_desc);
+}
+
+static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1];
+ __le64 extended_addr;
+
+ if (nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX,
+ info->attrs[NL802154_ATTR_SEC_DEVICE],
+ nl802154_dev_policy))
+ return -EINVAL;
+
+ if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR])
+ return -EINVAL;
+
+ extended_addr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]);
+ return rdev_del_device(rdev, wpan_dev, extended_addr);
+}
+
+static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct net_device *dev, __le64 extended_addr,
+ const struct ieee802154_llsec_device_key *devkey)
+{
+ void *hdr;
+ struct nlattr *nl_devkey, *nl_key_id;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -1;
+
+ if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ nl_devkey = nla_nest_start(msg, NL802154_ATTR_SEC_DEVKEY);
+ if (!nl_devkey)
+ goto nla_put_failure;
+
+ if (nla_put_le64(msg, NL802154_DEVKEY_ATTR_EXTENDED_ADDR,
+ extended_addr) ||
+ nla_put_u32(msg, NL802154_DEVKEY_ATTR_FRAME_COUNTER,
+ devkey->frame_counter))
+ goto nla_put_failure;
+
+ nl_key_id = nla_nest_start(msg, NL802154_DEVKEY_ATTR_ID);
+ if (!nl_key_id)
+ goto nla_put_failure;
+
+ if (ieee802154_llsec_send_key_id(msg, &devkey->key_id) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_key_id);
+ nla_nest_end(msg, nl_devkey);
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_llsec_devkey(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev = NULL;
+ struct ieee802154_llsec_device_key *kpos;
+ struct ieee802154_llsec_device *dpos;
+ struct ieee802154_llsec_table *table;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ if (!wpan_dev->netdev) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rdev_lock_llsec_table(rdev, wpan_dev);
+ rdev_get_llsec_table(rdev, wpan_dev, &table);
+
+ /* TODO make it like station dump */
+ if (cb->args[2])
+ goto out;
+
+ /* TODO look if remove devkey and do some nested attribute */
+ list_for_each_entry(dpos, &table->devices, list) {
+ list_for_each_entry(kpos, &dpos->keys, list) {
+ if (nl802154_send_devkey(skb,
+ NL802154_CMD_NEW_SEC_LEVEL,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, rdev,
+ wpan_dev->netdev,
+ dpos->hwaddr,
+ kpos) < 0) {
+ /* TODO */
+ err = -EIO;
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ goto out_err;
+ }
+ }
+ }
+
+ cb->args[2] = 1;
+
+out:
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ err = skb->len;
+out_err:
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
+static const struct nla_policy nl802154_devkey_policy[NL802154_DEVKEY_ATTR_MAX + 1] = {
+ [NL802154_DEVKEY_ATTR_FRAME_COUNTER] = { NLA_U32 },
+ [NL802154_DEVKEY_ATTR_EXTENDED_ADDR] = { NLA_U64 },
+ [NL802154_DEVKEY_ATTR_ID] = { NLA_NESTED },
+};
+
+static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1];
+ struct ieee802154_llsec_device_key key;
+ __le64 extended_addr;
+
+ if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] ||
+ nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX,
+ info->attrs[NL802154_ATTR_SEC_DEVKEY],
+ nl802154_devkey_policy) < 0)
+ return -EINVAL;
+
+ if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] ||
+ !attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR])
+ return -EINVAL;
+
+ /* TODO change key.id ? */
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID],
+ &key.key_id) < 0)
+ return -ENOBUFS;
+
+ /* TODO be32 */
+ key.frame_counter = nla_get_u32(attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER]);
+ /* TODO change naming hwaddr -> extended_addr
+ * check unique identifier short+pan OR extended_addr
+ */
+ extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]);
+ return rdev_add_devkey(rdev, wpan_dev, extended_addr, &key);
+}
+
+static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1];
+ struct ieee802154_llsec_device_key key;
+ __le64 extended_addr;
+
+ if (nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX,
+ info->attrs[NL802154_ATTR_SEC_DEVKEY],
+ nl802154_devkey_policy))
+ return -EINVAL;
+
+ if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR])
+ return -EINVAL;
+
+ /* TODO change key.id ? */
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID],
+ &key.key_id) < 0)
+ return -ENOBUFS;
+
+ /* TODO change naming hwaddr -> extended_addr
+ * check unique identifier short+pan OR extended_addr
+ */
+ extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]);
+ return rdev_del_devkey(rdev, wpan_dev, extended_addr, &key);
+}
+
+static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct net_device *dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ void *hdr;
+ struct nlattr *nl_seclevel;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -1;
+
+ if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ nl_seclevel = nla_nest_start(msg, NL802154_ATTR_SEC_LEVEL);
+ if (!nl_seclevel)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_FRAME, sl->frame_type) ||
+ nla_put_u32(msg, NL802154_SECLEVEL_ATTR_LEVELS, sl->sec_levels) ||
+ nla_put_u8(msg, NL802154_SECLEVEL_ATTR_DEV_OVERRIDE,
+ sl->device_override))
+ goto nla_put_failure;
+
+ if (sl->frame_type == NL802154_FRAME_CMD) {
+ if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_CMD_FRAME,
+ sl->cmd_frame_id))
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(msg, nl_seclevel);
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_llsec_seclevel(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev = NULL;
+ struct ieee802154_llsec_seclevel *sl;
+ struct ieee802154_llsec_table *table;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ if (!wpan_dev->netdev) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rdev_lock_llsec_table(rdev, wpan_dev);
+ rdev_get_llsec_table(rdev, wpan_dev, &table);
+
+ /* TODO make it like station dump */
+ if (cb->args[2])
+ goto out;
+
+ list_for_each_entry(sl, &table->security_levels, list) {
+ if (nl802154_send_seclevel(skb, NL802154_CMD_NEW_SEC_LEVEL,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wpan_dev->netdev, sl) < 0) {
+ /* TODO */
+ err = -EIO;
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ goto out_err;
+ }
+ }
+
+ cb->args[2] = 1;
+
+out:
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ err = skb->len;
+out_err:
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
+static const struct nla_policy nl802154_seclevel_policy[NL802154_SECLEVEL_ATTR_MAX + 1] = {
+ [NL802154_SECLEVEL_ATTR_LEVELS] = { .type = NLA_U8 },
+ [NL802154_SECLEVEL_ATTR_FRAME] = { .type = NLA_U32 },
+ [NL802154_SECLEVEL_ATTR_CMD_FRAME] = { .type = NLA_U32 },
+ [NL802154_SECLEVEL_ATTR_DEV_OVERRIDE] = { .type = NLA_U8 },
+};
+
+static int
+llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl)
+{
+ struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1];
+
+ if (!nla || nla_parse_nested(attrs, NL802154_SECLEVEL_ATTR_MAX, nla,
+ nl802154_seclevel_policy))
+ return -EINVAL;
+
+ memset(sl, 0, sizeof(*sl));
+
+ if (!attrs[NL802154_SECLEVEL_ATTR_LEVELS] ||
+ !attrs[NL802154_SECLEVEL_ATTR_FRAME] ||
+ !attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE])
+ return -EINVAL;
+
+ sl->sec_levels = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_LEVELS]);
+ sl->frame_type = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_FRAME]);
+ sl->device_override = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]);
+ if (sl->frame_type > NL802154_FRAME_MAX ||
+ (sl->device_override != 0 && sl->device_override != 1))
+ return -EINVAL;
+
+ if (sl->frame_type == NL802154_FRAME_CMD) {
+ if (!attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME])
+ return -EINVAL;
+
+ sl->cmd_frame_id = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]);
+ if (sl->cmd_frame_id > NL802154_CMD_FRAME_MAX)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nl802154_add_llsec_seclevel(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct ieee802154_llsec_seclevel sl;
+
+ if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL],
+ &sl) < 0)
+ return -EINVAL;
+
+ return rdev_add_seclevel(rdev, wpan_dev, &sl);
+}
+
+static int nl802154_del_llsec_seclevel(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct ieee802154_llsec_seclevel sl;
+
+ if (!info->attrs[NL802154_ATTR_SEC_LEVEL] ||
+ llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL],
+ &sl) < 0)
+ return -EINVAL;
+
+ return rdev_del_seclevel(rdev, wpan_dev, &sl);
+}
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
#define NL802154_FLAG_NEED_WPAN_PHY 0x01
#define NL802154_FLAG_NEED_NETDEV 0x02
#define NL802154_FLAG_NEED_RTNL 0x04
@@ -937,6 +2232,22 @@ static const struct genl_ops nl802154_ops[] = {
NL802154_FLAG_NEED_RTNL,
},
{
+ .cmd = NL802154_CMD_SET_CCA_ED_LEVEL,
+ .doit = nl802154_set_cca_ed_level,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_TX_POWER,
+ .doit = nl802154_set_tx_power,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
.cmd = NL802154_CMD_SET_PAN_ID,
.doit = nl802154_set_pan_id,
.policy = nl802154_policy,
@@ -984,6 +2295,127 @@ static const struct genl_ops nl802154_ops[] = {
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
+ {
+ .cmd = NL802154_CMD_SET_ACKREQ_DEFAULT,
+ .doit = nl802154_set_ackreq_default,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+ {
+ .cmd = NL802154_CMD_SET_SEC_PARAMS,
+ .doit = nl802154_set_llsec_params,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_GET_SEC_KEY,
+ /* TODO .doit by matching key id? */
+ .dumpit = nl802154_dump_llsec_key,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_SEC_KEY,
+ .doit = nl802154_add_llsec_key,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_SEC_KEY,
+ .doit = nl802154_del_llsec_key,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ /* TODO unique identifier must short+pan OR extended_addr */
+ {
+ .cmd = NL802154_CMD_GET_SEC_DEV,
+ /* TODO .doit by matching extended_addr? */
+ .dumpit = nl802154_dump_llsec_dev,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_SEC_DEV,
+ .doit = nl802154_add_llsec_dev,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_SEC_DEV,
+ .doit = nl802154_del_llsec_dev,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ /* TODO remove complete devkey, put it as nested? */
+ {
+ .cmd = NL802154_CMD_GET_SEC_DEVKEY,
+ /* TODO doit by matching ??? */
+ .dumpit = nl802154_dump_llsec_devkey,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_SEC_DEVKEY,
+ .doit = nl802154_add_llsec_devkey,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_SEC_DEVKEY,
+ .doit = nl802154_del_llsec_devkey,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_GET_SEC_LEVEL,
+ /* TODO .doit by matching frame_type? */
+ .dumpit = nl802154_dump_llsec_seclevel,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_SEC_LEVEL,
+ .doit = nl802154_add_llsec_seclevel,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_SEC_LEVEL,
+ /* TODO match frame_type only? */
+ .doit = nl802154_del_llsec_seclevel,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
};
/* initialisation/exit functions */
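
As a usage note for the SET_ACKREQ_DEFAULT command registered above, the operation can be exercised from userspace with libnl-3 roughly as follows; the nl802154 constants are assumed to come from a userspace copy of nl802154.h, and error handling is trimmed. The handler rejects the request with -EBUSY while the interface is running, so the device has to be taken down first.

	/* Illustrative libnl-3 sketch, not part of this patch. */
	#include <netlink/netlink.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>

	static int set_ackreq_default(int ifindex, int on)
	{
		struct nl_sock *sk = nl_socket_alloc();
		struct nl_msg *msg = nlmsg_alloc();
		int family, err;

		genl_connect(sk);
		family = genl_ctrl_resolve(sk, "nl802154");

		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
			    NL802154_CMD_SET_ACKREQ_DEFAULT, 0);
		nla_put_u32(msg, NL802154_ATTR_IFINDEX, ifindex);
		/* the handler accepts only 0 or 1 */
		nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, !!on);

		err = nl_send_auto(sk, msg);
		nlmsg_free(msg);
		nl_socket_free(sk);
		return err < 0 ? err : 0;
	}
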
diff --git a/kernel/net/ieee802154/rdev-ops.h b/kernel/net/ieee802154/rdev-ops.h
index 7b5a9dd94..4441c63b3 100644
--- a/kernel/net/ieee802154/rdev-ops.h
+++ b/kernel/net/ieee802154/rdev-ops.h
@@ -24,6 +24,26 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
}
static inline int
+rdev_suspend(struct cfg802154_registered_device *rdev)
+{
+ int ret;
+ trace_802154_rdev_suspend(&rdev->wpan_phy);
+ ret = rdev->ops->suspend(&rdev->wpan_phy);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_resume(struct cfg802154_registered_device *rdev)
+{
+ int ret;
+ trace_802154_rdev_resume(&rdev->wpan_phy);
+ ret = rdev->ops->resume(&rdev->wpan_phy);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name,
unsigned char name_assign_type,
enum nl802154_iftype type, __le64 extended_addr)
@@ -75,6 +95,29 @@ rdev_set_cca_mode(struct cfg802154_registered_device *rdev,
}
static inline int
+rdev_set_cca_ed_level(struct cfg802154_registered_device *rdev, s32 ed_level)
+{
+ int ret;
+
+ trace_802154_rdev_set_cca_ed_level(&rdev->wpan_phy, ed_level);
+ ret = rdev->ops->set_cca_ed_level(&rdev->wpan_phy, ed_level);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_tx_power(struct cfg802154_registered_device *rdev,
+ s32 power)
+{
+ int ret;
+
+ trace_802154_rdev_set_tx_power(&rdev->wpan_phy, power);
+ ret = rdev->ops->set_tx_power(&rdev->wpan_phy, power);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
rdev_set_pan_id(struct cfg802154_registered_device *rdev,
struct wpan_dev *wpan_dev, __le16 pan_id)
{
@@ -152,4 +195,126 @@ rdev_set_lbt_mode(struct cfg802154_registered_device *rdev,
return ret;
}
+static inline int
+rdev_set_ackreq_default(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, bool ackreq)
+{
+ int ret;
+
+ trace_802154_rdev_set_ackreq_default(&rdev->wpan_phy, wpan_dev,
+ ackreq);
+ ret = rdev->ops->set_ackreq_default(&rdev->wpan_phy, wpan_dev, ackreq);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+/* TODO this is already a nl802154, so move into ieee802154 */
+static inline void
+rdev_get_llsec_table(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_llsec_table **table)
+{
+ rdev->ops->get_llsec_table(&rdev->wpan_phy, wpan_dev, table);
+}
+
+static inline void
+rdev_lock_llsec_table(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ rdev->ops->lock_llsec_table(&rdev->wpan_phy, wpan_dev);
+}
+
+static inline void
+rdev_unlock_llsec_table(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ rdev->ops->unlock_llsec_table(&rdev->wpan_phy, wpan_dev);
+}
+
+static inline int
+rdev_get_llsec_params(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_llsec_params *params)
+{
+ return rdev->ops->get_llsec_params(&rdev->wpan_phy, wpan_dev, params);
+}
+
+static inline int
+rdev_set_llsec_params(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_params *params,
+ u32 changed)
+{
+ return rdev->ops->set_llsec_params(&rdev->wpan_phy, wpan_dev, params,
+ changed);
+}
+
+static inline int
+rdev_add_llsec_key(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_key_id *id,
+ const struct ieee802154_llsec_key *key)
+{
+ return rdev->ops->add_llsec_key(&rdev->wpan_phy, wpan_dev, id, key);
+}
+
+static inline int
+rdev_del_llsec_key(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_key_id *id)
+{
+ return rdev->ops->del_llsec_key(&rdev->wpan_phy, wpan_dev, id);
+}
+
+static inline int
+rdev_add_seclevel(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ return rdev->ops->add_seclevel(&rdev->wpan_phy, wpan_dev, sl);
+}
+
+static inline int
+rdev_del_seclevel(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ return rdev->ops->del_seclevel(&rdev->wpan_phy, wpan_dev, sl);
+}
+
+static inline int
+rdev_add_device(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_device *dev_desc)
+{
+ return rdev->ops->add_device(&rdev->wpan_phy, wpan_dev, dev_desc);
+}
+
+static inline int
+rdev_del_device(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le64 extended_addr)
+{
+ return rdev->ops->del_device(&rdev->wpan_phy, wpan_dev, extended_addr);
+}
+
+static inline int
+rdev_add_devkey(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le64 extended_addr,
+ const struct ieee802154_llsec_device_key *devkey)
+{
+ return rdev->ops->add_devkey(&rdev->wpan_phy, wpan_dev, extended_addr,
+ devkey);
+}
+
+static inline int
+rdev_del_devkey(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le64 extended_addr,
+ const struct ieee802154_llsec_device_key *devkey)
+{
+ return rdev->ops->del_devkey(&rdev->wpan_phy, wpan_dev, extended_addr,
+ devkey);
+}
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
#endif /* __CFG802154_RDEV_OPS */
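
The wrappers added above only trace and dispatch; what a driver plugs into cfg802154_ops is sketched below. The foo_* names are hypothetical, and the power/ED-level arguments are the signed values passed straight through rdev_set_tx_power()/rdev_set_cca_ed_level() (traced as mbm in the trace.h hunk further down). The sysfs PM hooks later in this patch only call suspend/resume when the driver actually provides them, so those two callbacks remain optional.

	/* Illustrative driver glue; foo_* names are hypothetical. */
	static int foo_suspend(struct wpan_phy *wpan_phy)
	{
		/* quiesce the transceiver; invoked under rtnl via rdev_suspend() */
		return 0;
	}

	static int foo_resume(struct wpan_phy *wpan_phy)
	{
		return 0;
	}

	static int foo_set_tx_power(struct wpan_phy *wpan_phy, s32 power)
	{
		/* program the radio with the requested power */
		return 0;
	}

	static int foo_set_cca_ed_level(struct wpan_phy *wpan_phy, s32 ed_level)
	{
		return 0;
	}

	static const struct cfg802154_ops foo_cfg802154_ops = {
		.suspend	  = foo_suspend,
		.resume		  = foo_resume,
		.set_tx_power	  = foo_set_tx_power,
		.set_cca_ed_level = foo_set_cca_ed_level,
		/* remaining mandatory callbacks omitted from this sketch */
	};
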
diff --git a/kernel/net/ieee802154/socket.c b/kernel/net/ieee802154/socket.c
index 627a25376..a548be247 100644
--- a/kernel/net/ieee802154/socket.c
+++ b/kernel/net/ieee802154/socket.c
@@ -64,10 +64,8 @@ ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr)
if (tmp->type != ARPHRD_IEEE802154)
continue;
- pan_id = ieee802154_mlme_ops(tmp)->get_pan_id(tmp);
- short_addr =
- ieee802154_mlme_ops(tmp)->get_short_addr(tmp);
-
+ pan_id = tmp->ieee802154_ptr->pan_id;
+ short_addr = tmp->ieee802154_ptr->short_addr;
if (pan_id == addr->pan_id &&
short_addr == addr->short_addr) {
dev = tmp;
@@ -228,15 +226,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len)
goto out;
}
- if (dev->type != ARPHRD_IEEE802154) {
- err = -ENODEV;
- goto out_put;
- }
-
sk->sk_bound_dev_if = dev->ifindex;
sk_dst_reset(sk);
-out_put:
dev_put(dev);
out:
release_sock(sk);
@@ -281,12 +273,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
goto out;
}
- mtu = dev->mtu;
+ mtu = IEEE802154_MTU;
pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
if (size > mtu) {
pr_debug("size = %Zu, mtu = %u\n", size, mtu);
- err = -EINVAL;
+ err = -EMSGSIZE;
goto out_dev;
}
@@ -645,7 +637,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
err = -ENXIO;
goto out;
}
- mtu = dev->mtu;
+ mtu = IEEE802154_MTU;
pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
if (size > mtu) {
@@ -684,8 +676,8 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
cb->seclevel = ro->seclevel;
cb->seclevel_override = ro->seclevel_override;
- err = dev_hard_header(skb, dev, ETH_P_IEEE802154, &dst_addr,
- ro->bound ? &ro->src_addr : NULL, size);
+ err = wpan_dev_hard_header(skb, dev, &dst_addr,
+ ro->bound ? &ro->src_addr : NULL, size);
if (err < 0)
goto out_skb;
@@ -803,9 +795,9 @@ static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb)
/* Data frame processing */
BUG_ON(dev->type != ARPHRD_IEEE802154);
- pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
- short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev);
- hw_addr = ieee802154_devaddr_from_raw(dev->dev_addr);
+ pan_id = dev->ieee802154_ptr->pan_id;
+ short_addr = dev->ieee802154_ptr->short_addr;
+ hw_addr = dev->ieee802154_ptr->extended_addr;
read_lock(&dgram_lock);
sk_for_each(sk, &dgram_head) {
@@ -1020,7 +1012,7 @@ static int ieee802154_create(struct net *net, struct socket *sock,
}
rc = -ENOMEM;
- sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto);
+ sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern);
if (!sk)
goto out;
rc = 0;
diff --git a/kernel/net/ieee802154/sysfs.c b/kernel/net/ieee802154/sysfs.c
index 133b42806..bd88525b0 100644
--- a/kernel/net/ieee802154/sysfs.c
+++ b/kernel/net/ieee802154/sysfs.c
@@ -14,11 +14,13 @@
*/
#include <linux/device.h>
+#include <linux/rtnetlink.h>
#include <net/cfg802154.h>
#include "core.h"
#include "sysfs.h"
+#include "rdev-ops.h"
static inline struct cfg802154_registered_device *
dev_to_rdev(struct device *dev)
@@ -62,10 +64,46 @@ static struct attribute *pmib_attrs[] = {
};
ATTRIBUTE_GROUPS(pmib);
+#ifdef CONFIG_PM_SLEEP
+static int wpan_phy_suspend(struct device *dev)
+{
+ struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+ int ret = 0;
+
+ if (rdev->ops->suspend) {
+ rtnl_lock();
+ ret = rdev_suspend(rdev);
+ rtnl_unlock();
+ }
+
+ return ret;
+}
+
+static int wpan_phy_resume(struct device *dev)
+{
+ struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+ int ret = 0;
+
+ if (rdev->ops->resume) {
+ rtnl_lock();
+ ret = rdev_resume(rdev);
+ rtnl_unlock();
+ }
+
+ return ret;
+}
+
+static SIMPLE_DEV_PM_OPS(wpan_phy_pm_ops, wpan_phy_suspend, wpan_phy_resume);
+#define WPAN_PHY_PM_OPS (&wpan_phy_pm_ops)
+#else
+#define WPAN_PHY_PM_OPS NULL
+#endif
+
struct class wpan_phy_class = {
.name = "ieee802154",
.dev_release = wpan_phy_release,
.dev_groups = pmib_groups,
+ .pm = WPAN_PHY_PM_OPS,
};
int wpan_phy_sysfs_init(void)
diff --git a/kernel/net/ieee802154/trace.h b/kernel/net/ieee802154/trace.h
index 5ac25eb6e..9a471e41e 100644
--- a/kernel/net/ieee802154/trace.h
+++ b/kernel/net/ieee802154/trace.h
@@ -1,4 +1,4 @@
-/* Based on net/wireless/tracing.h */
+/* Based on net/wireless/trace.h */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cfg802154
@@ -40,6 +40,28 @@
* rdev->ops traces *
*************************************************************/
+DECLARE_EVENT_CLASS(wpan_phy_only_evt,
+ TP_PROTO(struct wpan_phy *wpan_phy),
+ TP_ARGS(wpan_phy),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT, WPAN_PHY_PR_ARG)
+);
+
+DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_suspend,
+ TP_PROTO(struct wpan_phy *wpan_phy),
+ TP_ARGS(wpan_phy)
+);
+
+DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_resume,
+ TP_PROTO(struct wpan_phy *wpan_phy),
+ TP_ARGS(wpan_phy)
+);
+
TRACE_EVENT(802154_rdev_add_virtual_intf,
TP_PROTO(struct wpan_phy *wpan_phy, char *name,
enum nl802154_iftype type, __le64 extended_addr),
@@ -56,7 +78,7 @@ TRACE_EVENT(802154_rdev_add_virtual_intf,
__entry->type = type;
__entry->extended_addr = extended_addr;
),
- TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, ea %llx",
+ TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, extended addr: 0x%llx",
WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type,
__le64_to_cpu(__entry->extended_addr))
);
@@ -93,6 +115,21 @@ TRACE_EVENT(802154_rdev_set_channel,
__entry->page, __entry->channel)
);
+TRACE_EVENT(802154_rdev_set_tx_power,
+ TP_PROTO(struct wpan_phy *wpan_phy, s32 power),
+ TP_ARGS(wpan_phy, power),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(s32, power)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->power = power;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", mbm: %d", WPAN_PHY_PR_ARG,
+ __entry->power)
+);
+
TRACE_EVENT(802154_rdev_set_cca_mode,
TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca),
TP_ARGS(wpan_phy, cca),
@@ -108,6 +145,21 @@ TRACE_EVENT(802154_rdev_set_cca_mode,
WPAN_CCA_PR_ARG)
);
+TRACE_EVENT(802154_rdev_set_cca_ed_level,
+ TP_PROTO(struct wpan_phy *wpan_phy, s32 ed_level),
+ TP_ARGS(wpan_phy, ed_level),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(s32, ed_level)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->ed_level = ed_level;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", ed level: %d", WPAN_PHY_PR_ARG,
+ __entry->ed_level)
+);
+
DECLARE_EVENT_CLASS(802154_le16_template,
TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
__le16 le16arg),
@@ -137,7 +189,7 @@ DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr,
TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
__le16 le16arg),
TP_ARGS(wpan_phy, wpan_dev, le16arg),
- TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", sa: 0x%04x",
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", short addr: 0x%04x",
WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
__le16_to_cpu(__entry->le16arg))
);
@@ -160,7 +212,7 @@ TRACE_EVENT(802154_rdev_set_backoff_exponent,
),
TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
- ", min be: %d, max_be: %d", WPAN_PHY_PR_ARG,
+ ", min be: %d, max be: %d", WPAN_PHY_PR_ARG,
WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be)
);
@@ -223,6 +275,25 @@ TRACE_EVENT(802154_rdev_set_lbt_mode,
WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode))
);
+TRACE_EVENT(802154_rdev_set_ackreq_default,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ bool ackreq),
+ TP_ARGS(wpan_phy, wpan_dev, ackreq),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(bool, ackreq)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->ackreq = ackreq;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", ackreq default: %s", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->ackreq))
+);
+
TRACE_EVENT(802154_rdev_return_int,
TP_PROTO(struct wpan_phy *wpan_phy, int ret),
TP_ARGS(wpan_phy, ret),
diff --git a/kernel/net/ipv4/Kconfig b/kernel/net/ipv4/Kconfig
index bd2901604..416dfa004 100644
--- a/kernel/net/ipv4/Kconfig
+++ b/kernel/net/ipv4/Kconfig
@@ -331,20 +331,6 @@ config NET_FOU_IP_TUNNELS
When this option is enabled IP tunnels can be configured to use
FOU or GUE encapsulation.
-config GENEVE
- tristate "Generic Network Virtualization Encapsulation (Geneve)"
- depends on INET
- select NET_UDP_TUNNEL
- ---help---
- This allows one to create Geneve virtual interfaces that provide
- Layer 2 Networks over Layer 3 Networks. Geneve is often used
- to tunnel virtual network infrastructure in virtualized environments.
- For more information see:
- http://tools.ietf.org/html/draft-gross-geneve-01
-
- To compile this driver as a module, choose M here: the module
- will be called geneve.
-
config INET_AH
tristate "IP: AH transformation"
select XFRM_ALGO
@@ -615,6 +601,22 @@ config TCP_CONG_DCTCP
For further details see:
http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+config TCP_CONG_CDG
+ tristate "CAIA Delay-Gradient (CDG)"
+ default n
+ ---help---
+ CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+ the TCP sender in order to:
+
+ o Use the delay gradient as a congestion signal.
+ o Back off with an average probability that is independent of the RTT.
+ o Coexist with flows that use loss-based congestion control.
+ o Tolerate packet loss unrelated to congestion.
+
+ For further details see:
+ D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -646,6 +648,9 @@ choice
config DEFAULT_DCTCP
bool "DCTCP" if TCP_CONG_DCTCP=y
+ config DEFAULT_CDG
+ bool "CDG" if TCP_CONG_CDG=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
@@ -668,6 +673,7 @@ config DEFAULT_TCP_CONG
default "veno" if DEFAULT_VENO
default "reno" if DEFAULT_RENO
default "dctcp" if DEFAULT_DCTCP
+ default "cdg" if DEFAULT_CDG
default "cubic"
config TCP_MD5SIG
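
As a usage note for the new CDG entry above: once the module is built (and, for unprivileged callers, listed in net.ipv4.tcp_allowed_congestion_control), an application can opt a single socket into it with the standard TCP_CONGESTION socket option. The snippet below is illustrative, not part of this patch.

	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>

	/* Select the CDG congestion control module for one socket. */
	static int use_cdg(int fd)
	{
		static const char name[] = "cdg";

		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
				  name, sizeof(name) - 1);
	}
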
diff --git a/kernel/net/ipv4/Makefile b/kernel/net/ipv4/Makefile
index 518c04ed6..c29809f76 100644
--- a/kernel/net/ipv4/Makefile
+++ b/kernel/net/ipv4/Makefile
@@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -42,6 +43,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
@@ -56,7 +58,6 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
-obj-$(CONFIG_GENEVE) += geneve.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o
diff --git a/kernel/net/ipv4/af_inet.c b/kernel/net/ipv4/af_inet.c
index a5aa54ea6..5c5db6636 100644
--- a/kernel/net/ipv4/af_inet.c
+++ b/kernel/net/ipv4/af_inet.c
@@ -112,12 +112,14 @@
#include <net/raw.h>
#include <net/icmp.h>
#include <net/inet_common.h>
+#include <net/ip_tunnels.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/secure_seq.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
+#include <net/l3mdev.h>
/* The inetsw table contains everything that inet_create needs to
@@ -217,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog)
* shutdown() (rather than close()).
*/
if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
- !inet_csk(sk)->icsk_accept_queue.fastopenq) {
+ !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
- err = fastopen_init_queue(sk, backlog);
+ fastopen_queue_tune(sk, backlog);
else if ((sysctl_tcp_fastopen &
TFO_SERVER_WO_SOCKOPT2) != 0)
- err = fastopen_init_queue(sk,
+ fastopen_queue_tune(sk,
((uint)sysctl_tcp_fastopen) >> 16);
- else
- err = 0;
- if (err)
- goto out;
tcp_fastopen_init_key_once(true);
}
@@ -259,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -319,7 +320,7 @@ lookup_protocol:
WARN_ON(!answer_prot->slab);
err = -ENOBUFS;
- sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
+ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
if (!sk)
goto out;
@@ -426,6 +427,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
+ u32 tb_id = RT_TABLE_LOCAL;
int err;
/* If the socket has its own bind function then use it. (RAW) */
@@ -447,7 +449,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
}
- chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
@@ -490,7 +493,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
- if (sk->sk_prot->get_port(sk, snum)) {
+ if ((snum || !inet->bind_address_no_port) &&
+ sk->sk_prot->get_port(sk, snum)) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
@@ -1038,22 +1042,16 @@ void inet_register_protosw(struct inet_protosw *p)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
- answer = NULL;
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
-
/* Check only the non-wild match. */
- if (INET_PROTOSW_PERMANENT & answer->flags) {
- if (protocol == answer->protocol)
- break;
- last_perm = lh;
- }
-
- answer = NULL;
+ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
+ break;
+ if (protocol == answer->protocol)
+ goto out_permanent;
+ last_perm = lh;
}
- if (answer)
- goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
@@ -1432,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
struct net *net)
{
struct socket *sock;
- int rc = sock_create_kern(family, type, protocol, &sock);
+ int rc = sock_create_kern(net, family, type, protocol, &sock);
if (rc == 0) {
*sk = sock->sk;
@@ -1442,45 +1440,56 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
* we do not wish this socket to see incoming packets.
*/
(*sk)->sk_prot->unhash(*sk);
-
- sk_change_net(*sk, net);
}
return rc;
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt)
+{
+ return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt);
+}
+EXPORT_SYMBOL_GPL(snmp_get_cpu_field);
+
unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
unsigned long res = 0;
int i;
for_each_possible_cpu(i)
- res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
+ res += snmp_get_cpu_field(mib, i, offt);
return res;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
#if BITS_PER_LONG==32
+u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt,
+ size_t syncp_offset)
+{
+ void *bhptr;
+ struct u64_stats_sync *syncp;
+ u64 v;
+ unsigned int start;
+
+ bhptr = per_cpu_ptr(mib, cpu);
+ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin_irq(syncp);
+ v = *(((u64 *)bhptr) + offt);
+ } while (u64_stats_fetch_retry_irq(syncp, start));
+
+ return v;
+}
+EXPORT_SYMBOL_GPL(snmp_get_cpu_field64);
+
u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
{
u64 res = 0;
int cpu;
for_each_possible_cpu(cpu) {
- void *bhptr;
- struct u64_stats_sync *syncp;
- u64 v;
- unsigned int start;
-
- bhptr = per_cpu_ptr(mib, cpu);
- syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
- do {
- start = u64_stats_fetch_begin_irq(syncp);
- v = *(((u64 *) bhptr) + offt);
- } while (u64_stats_fetch_retry_irq(syncp, start));
-
- res += v;
+ res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset);
}
return res;
}
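
The refactor above exports per-CPU accessors so that callers can aggregate individual fields themselves instead of only folding a grand total. A hypothetical consumer might look like this:

	/* Hypothetical consumer of the new accessor: print one MIB field per CPU. */
	static void foo_dump_mib_field(void __percpu *mib, int offt)
	{
		int cpu;

		for_each_possible_cpu(cpu)
			pr_info("cpu%d: %llu\n", cpu,
				(unsigned long long)snmp_get_cpu_field(mib, cpu, offt));
	}
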
@@ -1599,7 +1608,7 @@ static __net_init int inet_init_net(struct net *net)
*/
seqlock_init(&net->ipv4.ip_local_ports.lock);
net->ipv4.ip_local_ports.range[0] = 32768;
- net->ipv4.ip_local_ports.range[1] = 61000;
+ net->ipv4.ip_local_ports.range[1] = 60999;
seqlock_init(&net->ipv4.ping_group_range.lock);
/*
@@ -1781,6 +1790,8 @@ static int __init inet_init(void)
dev_add_pack(&ip_packet_type);
+ ip_tunnel_core_init();
+
rc = 0;
out:
return rc;
diff --git a/kernel/net/ipv4/ah4.c b/kernel/net/ipv4/ah4.c
index ac9a32ec3..f2a71025a 100644
--- a/kernel/net/ipv4/ah4.c
+++ b/kernel/net/ipv4/ah4.c
@@ -360,8 +360,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
ahp->icv_trunc_len + seqhi_len);
- if (!work_iph)
+ if (!work_iph) {
+ err = -ENOMEM;
goto out;
+ }
seqhi = (__be32 *)((char *)work_iph + ihl);
auth_data = ah_tmp_auth(seqhi, seqhi_len);
diff --git a/kernel/net/ipv4/arp.c b/kernel/net/ipv4/arp.c
index 933a92820..59b3e0e8f 100644
--- a/kernel/net/ipv4/arp.c
+++ b/kernel/net/ipv4/arp.c
@@ -113,6 +113,8 @@
#include <net/arp.h>
#include <net/ax25.h>
#include <net/netrom.h>
+#include <net/dst_metadata.h>
+#include <net/ip_tunnels.h>
#include <linux/uaccess.h>
@@ -233,7 +235,7 @@ static int arp_constructor(struct neighbour *neigh)
return -EINVAL;
}
- neigh->type = inet_addr_type(dev_net(dev), addr);
+ neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
parms = in_dev->arp_parms;
__neigh_parms_put(neigh->parms);
@@ -291,6 +293,39 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
kfree_skb(skb);
}
+/* Create and send an arp packet. */
+static void arp_send_dst(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw,
+ struct dst_entry *dst)
+{
+ struct sk_buff *skb;
+
+ /* No arp on this interface; nothing to send. */
+ if (dev->flags & IFF_NOARP)
+ return;
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (!skb)
+ return;
+
+ skb_dst_set(skb, dst_clone(dst));
+ arp_xmit(skb);
+}
+
+void arp_send(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw, const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw,
+ target_hw, NULL);
+}
+EXPORT_SYMBOL(arp_send);
+
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
__be32 saddr = 0;
@@ -299,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
__be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
struct in_device *in_dev;
+ struct dst_entry *dst = NULL;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
@@ -309,7 +345,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
- if (skb && inet_addr_type(dev_net(dev),
+ if (skb && inet_addr_type_dev_table(dev_net(dev), dev,
ip_hdr(skb)->saddr) == RTN_LOCAL)
saddr = ip_hdr(skb)->saddr;
break;
@@ -317,7 +353,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
if (!skb)
break;
saddr = ip_hdr(skb)->saddr;
- if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
+ if (inet_addr_type_dev_table(dev_net(dev), dev,
+ saddr) == RTN_LOCAL) {
/* saddr should be known to target */
if (inet_addr_onlink(in_dev, target, saddr))
break;
@@ -346,8 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
}
}
- arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
- dst_hw, dev->dev_addr, NULL);
+ if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+ dst = skb_dst(skb);
+ arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_hw, dev->dev_addr, NULL, dst);
}
static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
@@ -585,48 +624,28 @@ out:
}
EXPORT_SYMBOL(arp_create);
+static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+
/*
* Send an arp packet.
*/
void arp_xmit(struct sk_buff *skb)
{
/* Send it off, maybe filter it using firewalling first. */
- NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb,
- NULL, skb->dev, dev_queue_xmit_sk);
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT,
+ dev_net(skb->dev), NULL, skb, NULL, skb->dev,
+ arp_xmit_finish);
}
EXPORT_SYMBOL(arp_xmit);
/*
- * Create and send an arp packet.
- */
-void arp_send(int type, int ptype, __be32 dest_ip,
- struct net_device *dev, __be32 src_ip,
- const unsigned char *dest_hw, const unsigned char *src_hw,
- const unsigned char *target_hw)
-{
- struct sk_buff *skb;
-
- /*
- * No arp on this interface.
- */
-
- if (dev->flags&IFF_NOARP)
- return;
-
- skb = arp_create(type, ptype, dest_ip, dev, src_ip,
- dest_hw, src_hw, target_hw);
- if (!skb)
- return;
-
- arp_xmit(skb);
-}
-EXPORT_SYMBOL(arp_send);
-
-/*
* Process an arp request.
*/
-static int arp_process(struct sock *sk, struct sk_buff *skb)
+static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -638,7 +657,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
u16 dev_type = dev->type;
int addr_type;
struct neighbour *n;
- struct net *net = dev_net(dev);
+ struct dst_entry *reply_dst = NULL;
bool is_garp = false;
/* arp_rcv below verifies the ARP header and verifies the device
@@ -739,13 +758,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
* cache.
*/
+ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
+ reply_dst = (struct dst_entry *)
+ iptunnel_metadata_reply(skb_metadata_dst(skb),
+ GFP_ATOMIC);
+
/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- inet_addr_type(net, tip) == RTN_LOCAL &&
+ inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
!arp_ignore(in_dev, sip, tip))
- arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
- dev->dev_addr, sha);
+ arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
+ sha, dev->dev_addr, sha, reply_dst);
goto out;
}
@@ -764,9 +788,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
if (!dont_send) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
- arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
- dev, tip, sha, dev->dev_addr,
- sha);
+ arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
+ sip, dev, tip, sha,
+ dev->dev_addr, sha,
+ reply_dst);
neigh_release(n);
}
}
@@ -784,13 +809,14 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
- arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
- dev, tip, sha, dev->dev_addr,
- sha);
+ arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
+ sip, dev, tip, sha,
+ dev->dev_addr, sha,
+ reply_dst);
} else {
pneigh_enqueue(&arp_tbl,
in_dev->arp_parms, skb);
- return 0;
+ goto out_free_dst;
}
goto out;
}
@@ -802,16 +828,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
if (IN_DEV_ARP_ACCEPT(in_dev)) {
+ unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip);
+
/* Unsolicited ARP is not accepted by default.
It is possible that this option should be enabled for some
devices (strip is a candidate)
*/
is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
- inet_addr_type(net, sip) == RTN_UNICAST;
+ addr_type == RTN_UNICAST;
if (!n &&
((arp->ar_op == htons(ARPOP_REPLY) &&
- inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
+ addr_type == RTN_UNICAST) || is_garp))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
@@ -842,12 +870,14 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
out:
consume_skb(skb);
+out_free_dst:
+ dst_release(reply_dst);
return 0;
}
static void parp_redo(struct sk_buff *skb)
{
- arp_process(NULL, skb);
+ arp_process(dev_net(skb->dev), NULL, skb);
}
@@ -880,8 +910,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
- return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb,
- dev, NULL, arp_process);
+ return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
+ dev_net(dev), NULL, skb, dev, NULL,
+ arp_process);
consumeskb:
consume_skb(skb);
@@ -1017,14 +1048,16 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
neigh = neigh_lookup(&arp_tbl, &ip, dev);
if (neigh) {
- read_lock_bh(&neigh->lock);
- memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
- r->arp_flags = arp_state_to_flags(neigh);
- read_unlock_bh(&neigh->lock);
- r->arp_ha.sa_family = dev->type;
- strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ if (!(neigh->nud_state & NUD_NOARP)) {
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+ r->arp_ha.sa_family = dev->type;
+ strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ err = 0;
+ }
neigh_release(neigh);
- err = 0;
}
return err;
}
diff --git a/kernel/net/ipv4/datagram.c b/kernel/net/ipv4/datagram.c
index 574fad9cc..f915abff1 100644
--- a/kernel/net/ipv4/datagram.c
+++ b/kernel/net/ipv4/datagram.c
@@ -74,7 +74,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
- inet_set_txhash(sk);
+ sk_set_txhash(sk);
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst);
diff --git a/kernel/net/ipv4/devinet.c b/kernel/net/ipv4/devinet.c
index 419d23c53..f6303b175 100644
--- a/kernel/net/ipv4/devinet.c
+++ b/kernel/net/ipv4/devinet.c
@@ -882,7 +882,6 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
queue_delayed_work(system_power_efficient_wq,
&check_lifetime_work, 0);
rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
- blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
}
return 0;
}
@@ -1645,7 +1644,8 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
}
-static size_t inet_get_link_af_size(const struct net_device *dev)
+static size_t inet_get_link_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
@@ -1655,7 +1655,8 @@ static size_t inet_get_link_af_size(const struct net_device *dev)
return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
}
-static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+ u32 ext_filter_mask)
{
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
struct nlattr *nla;
@@ -1740,6 +1741,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (type == -1 || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+ size += nla_total_size(4);
return size;
}
@@ -1780,6 +1783,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+ nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
+ goto nla_put_failure;
nlmsg_end(skb, nlh);
return 0;
@@ -1819,6 +1826,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
[NETCONFA_FORWARDING] = { .len = sizeof(int) },
[NETCONFA_RP_FILTER] = { .len = sizeof(int) },
[NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+ [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
};
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
@@ -1839,7 +1847,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
if (err < 0)
goto errout;
- err = EINVAL;
+ err = -EINVAL;
if (!tb[NETCONFA_IFINDEX])
goto errout;
@@ -2048,6 +2056,12 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
ifindex, cnf);
}
+ if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ ifindex, cnf);
+ }
}
return ret;
@@ -2169,6 +2183,8 @@ static struct devinet_sysctl_table {
"igmpv2_unsolicited_report_interval"),
DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
"igmpv3_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
+ "ignore_routes_with_linkdown"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
@@ -2383,4 +2399,3 @@ void __init devinet_init(void)
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
inet_netconf_dump_devconf, NULL);
}
-
diff --git a/kernel/net/ipv4/esp4.c b/kernel/net/ipv4/esp4.c
index 30b544f02..477937465 100644
--- a/kernel/net/ipv4/esp4.c
+++ b/kernel/net/ipv4/esp4.c
@@ -49,7 +49,7 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
len = ALIGN(len, crypto_tfm_ctx_alignment());
}
- len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+ len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
len = ALIGN(len, __alignof__(struct scatterlist));
len += sizeof(struct scatterlist) * nfrags;
@@ -68,17 +68,6 @@ static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
}
-static inline struct aead_givcrypt_request *esp_tmp_givreq(
- struct crypto_aead *aead, u8 *iv)
-{
- struct aead_givcrypt_request *req;
-
- req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
- crypto_tfm_ctx_alignment());
- aead_givcrypt_set_tfm(req, aead);
- return req;
-}
-
static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
{
struct aead_request *req;
@@ -97,14 +86,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
-static inline struct scatterlist *esp_givreq_sg(
- struct crypto_aead *aead, struct aead_givcrypt_request *req)
-{
- return (void *)ALIGN((unsigned long)(req + 1) +
- crypto_aead_reqsize(aead),
- __alignof__(struct scatterlist));
-}
-
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -113,14 +94,37 @@ static void esp_output_done(struct crypto_async_request *base, int err)
xfrm_output_resume(skb, err);
}
+/* Move ESP header back into place. */
+static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
+{
+ struct ip_esp_hdr *esph = (void *)(skb->data + offset);
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ __be32 *seqhi = esp_tmp_seqhi(tmp);
+
+ esph->seq_no = esph->spi;
+ esph->spi = *seqhi;
+}
+
+static void esp_output_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
+}
+
+static void esp_output_done_esn(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ esp_output_restore_header(skb);
+ esp_output_done(base, err);
+}
+
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
- struct aead_givcrypt_request *req;
+ struct aead_request *req;
struct scatterlist *sg;
- struct scatterlist *asg;
struct sk_buff *trailer;
void *tmp;
u8 *iv;
@@ -129,17 +133,19 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int clen;
int alen;
int plen;
+ int ivlen;
int tfclen;
int nfrags;
int assoclen;
- int sglists;
int seqhilen;
__be32 *seqhi;
+ __be64 seqno;
/* skb is pure payload to encrypt */
aead = x->data;
alen = crypto_aead_authsize(aead);
+ ivlen = crypto_aead_ivsize(aead);
tfclen = 0;
if (x->tfcpad) {
@@ -160,16 +166,14 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
nfrags = err;
assoclen = sizeof(*esph);
- sglists = 1;
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- sglists += 2;
seqhilen += sizeof(__be32);
assoclen += seqhilen;
}
- tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp) {
err = -ENOMEM;
goto error;
@@ -177,9 +181,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
seqhi = esp_tmp_seqhi(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
- req = esp_tmp_givreq(aead, iv);
- asg = esp_givreq_sg(aead, req);
- sg = asg + sglists;
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
/* Fill padding... */
tail = skb_tail_pointer(trailer);
@@ -235,37 +238,53 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
*skb_mac_header(skb) = IPPROTO_UDP;
}
- esph->spi = x->id.spi;
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accommodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
+ *seqhi = esph->spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ }
+
+ esph->spi = x->id.spi;
+
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
- esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
- clen + alen);
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
- if ((x->props.flags & XFRM_STATE_ESN)) {
- sg_init_table(asg, 3);
- sg_set_buf(asg, &esph->spi, sizeof(__be32));
- *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
- sg_set_buf(asg + 1, seqhi, seqhilen);
- sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
- } else
- sg_init_one(asg, esph, sizeof(*esph));
-
- aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
- aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
- aead_givcrypt_set_assoc(req, asg, assoclen);
- aead_givcrypt_set_giv(req, esph->enc_data,
- XFRM_SKB_CB(skb)->seq.output.low +
- ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+ aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+ aead_request_set_ad(req, assoclen);
+
+ seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+ memset(iv, 0, ivlen);
+ memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8),
+ min(ivlen, 8));
ESP_SKB_CB(skb)->tmp = tmp;
- err = crypto_aead_givencrypt(req);
- if (err == -EINPROGRESS)
+ err = crypto_aead_encrypt(req);
+
+ switch (err) {
+ case -EINPROGRESS:
goto error;
- if (err == -EBUSY)
+ case -EBUSY:
err = NET_XMIT_DROP;
+ break;
+
+ case 0:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_output_restore_header(skb);
+ }
kfree(tmp);
@@ -364,6 +383,20 @@ static void esp_input_done(struct crypto_async_request *base, int err)
xfrm_input_resume(skb, esp_input_done2(skb, err));
}
+static void esp_input_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, 0);
+ __skb_pull(skb, 4);
+}
+
+static void esp_input_done_esn(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ esp_input_restore_header(skb);
+ esp_input_done(base, err);
+}
+
/*
* Note: detecting truncated vs. non-truncated authentication data is very
* expensive, so we only support truncated data, which is the recommended
@@ -375,19 +408,18 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
- int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+ int ivlen = crypto_aead_ivsize(aead);
+ int elen = skb->len - sizeof(*esph) - ivlen;
int nfrags;
int assoclen;
- int sglists;
int seqhilen;
__be32 *seqhi;
void *tmp;
u8 *iv;
struct scatterlist *sg;
- struct scatterlist *asg;
int err = -EINVAL;
- if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ if (!pskb_may_pull(skb, sizeof(*esph) + ivlen))
goto out;
if (elen <= 0)
@@ -400,17 +432,15 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
nfrags = err;
assoclen = sizeof(*esph);
- sglists = 1;
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- sglists += 2;
seqhilen += sizeof(__be32);
assoclen += seqhilen;
}
err = -ENOMEM;
- tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp)
goto out;
@@ -418,36 +448,39 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
seqhi = esp_tmp_seqhi(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
- asg = esp_req_sg(aead, req);
- sg = asg + sglists;
+ sg = esp_req_sg(aead, req);
skb->ip_summed = CHECKSUM_NONE;
esph = (struct ip_esp_hdr *)skb->data;
- /* Get ivec. This can be wrong, check against another impls. */
- iv = esph->enc_data;
-
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+ /* For ESN we move the header forward by 4 bytes to
+ * accommodate the high bits. We will move it back after
+ * decryption.
+ */
if ((x->props.flags & XFRM_STATE_ESN)) {
- sg_init_table(asg, 3);
- sg_set_buf(asg, &esph->spi, sizeof(__be32));
- *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
- sg_set_buf(asg + 1, seqhi, seqhilen);
- sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
- } else
- sg_init_one(asg, esph, sizeof(*esph));
+ esph = (void *)skb_push(skb, 4);
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+ aead_request_set_callback(req, 0, esp_input_done_esn, skb);
+ }
- aead_request_set_callback(req, 0, esp_input_done, skb);
- aead_request_set_crypt(req, sg, sg, elen, iv);
- aead_request_set_assoc(req, asg, assoclen);
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
+
+ aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
+ aead_request_set_ad(req, assoclen);
err = crypto_aead_decrypt(req);
if (err == -EINPROGRESS)
goto out;
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_input_restore_header(skb);
+
err = esp_input_done2(skb, err);
out:
@@ -519,10 +552,16 @@ static void esp_destroy(struct xfrm_state *x)
static int esp_init_aead(struct xfrm_state *x)
{
+ char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
- aead = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+ err = -ENAMETOOLONG;
+ if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
+ x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+
+ aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
@@ -561,15 +600,19 @@ static int esp_init_authenc(struct xfrm_state *x)
if ((x->props.flags & XFRM_STATE_ESN)) {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
- "authencesn(%s,%s)",
+ "%s%sauthencesn(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
goto error;
} else {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
- "authenc(%s,%s)",
+ "%s%sauthenc(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
goto error;
}
diff --git a/kernel/net/ipv4/fib_frontend.c b/kernel/net/ipv4/fib_frontend.c
index 872494e6e..473447593 100644
--- a/kernel/net/ipv4/fib_frontend.c
+++ b/kernel/net/ipv4/fib_frontend.c
@@ -45,6 +45,8 @@
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
+#include <net/l3mdev.h>
+#include <trace/events/fib.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -211,12 +213,12 @@ void fib_flush_external(struct net *net)
*/
static inline unsigned int __inet_dev_addr_type(struct net *net,
const struct net_device *dev,
- __be32 addr)
+ __be32 addr, u32 tb_id)
{
struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
unsigned int ret = RTN_BROADCAST;
- struct fib_table *local_table;
+ struct fib_table *table;
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
return RTN_BROADCAST;
@@ -225,10 +227,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
rcu_read_lock();
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
- if (local_table) {
+ table = fib_get_table(net, tb_id);
+ if (table) {
ret = RTN_UNICAST;
- if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
+ if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
}
@@ -238,19 +240,40 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
return ret;
}
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
+{
+ return __inet_dev_addr_type(net, NULL, addr, tb_id);
+}
+EXPORT_SYMBOL(inet_addr_type_table);
+
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
- return __inet_dev_addr_type(net, NULL, addr);
+ return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
}
EXPORT_SYMBOL(inet_addr_type);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
{
- return __inet_dev_addr_type(net, dev, addr);
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+
+ return __inet_dev_addr_type(net, dev, addr, rt_table);
}
EXPORT_SYMBOL(inet_dev_addr_type);
+/* inet_addr_type with dev == NULL but using the table from a dev
+ * if one is associated
+ */
+unsigned int inet_addr_type_dev_table(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
+{
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+
+ return __inet_dev_addr_type(net, NULL, addr, rt_table);
+}
+EXPORT_SYMBOL(inet_addr_type_dev_table);
+
__be32 fib_compute_spec_dst(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
@@ -280,7 +303,8 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_scope = scope;
fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
- if (!fib_lookup(net, &fl4, &res))
+ fl4.flowi4_tun_key.tun_id = 0;
+ if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {
scope = RT_SCOPE_LINK;
@@ -308,18 +332,24 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
bool dev_match;
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
+ if (!fl4.flowi4_iif)
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_tun_key.tun_id = 0;
+ fl4.flowi4_flags = 0;
no_addr = idev->ifa_list == NULL;
fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
+ trace_fib_validate_source(dev, &fl4);
+
net = dev_net(dev);
- if (fib_lookup(net, &fl4, &res))
+ if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
@@ -337,6 +367,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
if (nh->nh_dev == dev) {
dev_match = true;
break;
+ } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
+ dev_match = true;
+ break;
}
}
#else
@@ -354,7 +387,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_oif = dev->ifindex;
ret = 0;
- if (fib_lookup(net, &fl4, &res) == 0) {
+ if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
if (res.type == RTN_UNICAST)
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
}
@@ -494,9 +527,12 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
addr = sk_extract_addr(&rt->rt_gateway);
if (rt->rt_gateway.sa_family == AF_INET && addr) {
+ unsigned int addr_type;
+
cfg->fc_gw = addr;
+ addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
if (rt->rt_flags & RTF_GATEWAY &&
- inet_addr_type(net, addr) == RTN_UNICAST)
+ addr_type == RTN_UNICAST)
cfg->fc_scope = RT_SCOPE_UNIVERSE;
}
@@ -591,6 +627,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_FLOW] = { .type = NLA_U32 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -656,6 +694,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
case RTA_TABLE:
cfg->fc_table = nla_get_u32(attr);
break;
+ case RTA_ENCAP:
+ cfg->fc_encap = attr;
+ break;
+ case RTA_ENCAP_TYPE:
+ cfg->fc_encap_type = nla_get_u16(attr);
+ break;
}
}
@@ -760,6 +804,7 @@ out:
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
+ u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
struct fib_table *tb;
struct fib_config cfg = {
.fc_protocol = RTPROT_KERNEL,
@@ -774,11 +819,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
},
};
- if (type == RTN_UNICAST)
- tb = fib_new_table(net, RT_TABLE_MAIN);
- else
- tb = fib_new_table(net, RT_TABLE_LOCAL);
+ if (!tb_id)
+ tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
+ tb = fib_new_table(net, tb_id);
if (!tb)
return;
@@ -823,9 +867,10 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_NEWROUTE,
- dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- prefix, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
/* Add network specific broadcasts, when it makes sense */
if (ifa->ifa_prefixlen < 31) {
@@ -870,9 +915,10 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
}
} else if (!ipv4_is_zeronet(any) &&
(any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_DELROUTE,
- dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- any, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
subnet = 1;
}
@@ -960,11 +1006,14 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
}
if (!(ok & LOCAL_OK)) {
+ unsigned int addr_type;
+
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
/* Check, that this local address finally disappeared. */
- if (gone &&
- inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+ addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
+ ifa->ifa_local);
+ if (gone && addr_type != RTN_LOCAL) {
/* And the last, but not the least thing.
* We must flush stray FIB entries.
*
@@ -1063,9 +1112,10 @@ static void nl_fib_lookup_exit(struct net *net)
net->ipv4.fibnl = NULL;
}
-static void fib_disable_ip(struct net_device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, unsigned long event,
+ bool force)
{
- if (fib_sync_down_dev(dev, force))
+ if (fib_sync_down_dev(dev, event, force))
fib_flush(dev_net(dev));
rt_cache_flush(dev_net(dev));
arp_ifdown(dev);
@@ -1081,7 +1131,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
case NETDEV_UP:
fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(dev);
+ fib_sync_up(dev, RTNH_F_DEAD);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
rt_cache_flush(dev_net(dev));
@@ -1093,7 +1143,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
/* Last address was deleted from this interface.
* Disable IP.
*/
- fib_disable_ip(dev, 1);
+ fib_disable_ip(dev, event, true);
} else {
rt_cache_flush(dev_net(dev));
}
@@ -1105,11 +1155,13 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_changeupper_info *info;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+ unsigned int flags;
if (event == NETDEV_UNREGISTER) {
- fib_disable_ip(dev, 2);
+ fib_disable_ip(dev, event, true);
rt_flush_dev(dev);
return NOTIFY_DONE;
}
@@ -1124,18 +1176,32 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
fib_add_ifaddr(ifa);
} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(dev);
+ fib_sync_up(dev, RTNH_F_DEAD);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
rt_cache_flush(net);
break;
case NETDEV_DOWN:
- fib_disable_ip(dev, 0);
+ fib_disable_ip(dev, event, false);
break;
- case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
+ flags = dev_get_flags(dev);
+ if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+ fib_sync_up(dev, RTNH_F_LINKDOWN);
+ else
+ fib_sync_down_dev(dev, event, false);
+ /* fall through */
+ case NETDEV_CHANGEMTU:
rt_cache_flush(net);
break;
+ case NETDEV_CHANGEUPPER:
+ info = ptr;
+ /* flush all routes if dev is linked to or unlinked from
+ * an L3 master device (e.g., VRF)
+ */
+ if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+ fib_disable_ip(dev, NETDEV_DOWN, true);
+ break;
}
return NOTIFY_DONE;
}
diff --git a/kernel/net/ipv4/fib_lookup.h b/kernel/net/ipv4/fib_lookup.h
index c6211ed60..9c0292072 100644
--- a/kernel/net/ipv4/fib_lookup.h
+++ b/kernel/net/ipv4/fib_lookup.h
@@ -13,6 +13,7 @@ struct fib_alias {
u8 fa_state;
u8 fa_slen;
u32 tb_id;
+ s16 fa_default;
struct rcu_head rcu;
};
diff --git a/kernel/net/ipv4/fib_rules.c b/kernel/net/ipv4/fib_rules.c
index 56151982f..f2bda9e89 100644
--- a/kernel/net/ipv4/fib_rules.c
+++ b/kernel/net/ipv4/fib_rules.c
@@ -47,11 +47,12 @@ struct fib4_rule {
#endif
};
-int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp,
+ struct fib_result *res, unsigned int flags)
{
struct fib_lookup_arg arg = {
.result = res,
- .flags = FIB_LOOKUP_NOREF,
+ .flags = flags,
};
int err;
@@ -317,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
.delete = fib4_rule_delete,
.compare = fib4_rule_compare,
.fill = fib4_rule_fill,
- .default_pref = fib_default_rule_pref,
.nlmsg_payload = fib4_rule_nlmsg_payload,
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
diff --git a/kernel/net/ipv4/fib_semantics.c b/kernel/net/ipv4/fib_semantics.c
index 8d695b665..d97268e8f 100644
--- a/kernel/net/ipv4/fib_semantics.c
+++ b/kernel/net/ipv4/fib_semantics.c
@@ -42,6 +42,7 @@
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>
+#include <net/lwtunnel.h>
#include "fib_lookup.h"
@@ -56,8 +57,7 @@ static unsigned int fib_info_cnt;
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-static DEFINE_SPINLOCK(fib_multipath_lock);
+u32 fib_multipath_secret __read_mostly;
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
@@ -208,6 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
dev_put(nexthop_nh->nh_dev);
+ lwtstate_put(nexthop_nh->nh_lwtstate);
free_nh_exceptions(nexthop_nh);
rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
@@ -266,7 +267,8 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
- ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
+ lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
+ ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
return -1;
onh++;
} endfor_nexthops(fi);
@@ -318,7 +320,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
nfi->fib_type == fi->fib_type &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
sizeof(u32) * RTAX_MAX) == 0 &&
- ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
+ !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
return fi;
}
@@ -366,6 +368,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
if (fi->fib_nhs) {
+ size_t nh_encapsize = 0;
/* Also handles the special case fib_nhs == 1 */
/* each nexthop is packed in an attribute */
@@ -374,8 +377,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
/* may contain flow and gateway attribute */
nhsize += 2 * nla_total_size(4);
+ /* grab encap info */
+ for_nexthops(fi) {
+ if (nh->nh_lwtstate) {
+ /* RTA_ENCAP_TYPE */
+ nh_encapsize += lwtunnel_get_encap_size(
+ nh->nh_lwtstate);
+ /* RTA_ENCAP */
+ nh_encapsize += nla_total_size(2);
+ }
+ } endfor_nexthops(fi);
+
/* all nexthops are packed in a nested attribute */
- payload += nla_total_size(fi->fib_nhs * nhsize);
+ payload += nla_total_size((fi->fib_nhs * nhsize) +
+ nh_encapsize);
+
}
return payload;
@@ -421,13 +437,15 @@ static int fib_detect_death(struct fib_info *fi, int order,
if (n) {
state = n->nud_state;
neigh_release(n);
+ } else {
+ return 0;
}
if (state == NUD_REACHABLE)
return 0;
if ((state & NUD_VALID) && order != dflt)
return 0;
if ((state & NUD_VALID) ||
- (*last_idx < 0 && order > dflt)) {
+ (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
*last_resort = fi;
*last_idx = order;
}
@@ -452,6 +470,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg)
{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ int ret;
+
change_nexthops(fi) {
int attrlen;
@@ -475,18 +496,130 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
if (nexthop_nh->nh_tclassid)
fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
+ nla = nla_find(attrs, attrlen, RTA_ENCAP);
+ if (nla) {
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+ struct nlattr *nla_entype;
+
+ nla_entype = nla_find(attrs, attrlen,
+ RTA_ENCAP_TYPE);
+ if (!nla_entype)
+ goto err_inval;
+ if (cfg->fc_oif)
+ dev = __dev_get_by_index(net, cfg->fc_oif);
+ ret = lwtunnel_build_state(dev, nla_get_u16(
+ nla_entype),
+ nla, AF_INET, cfg,
+ &lwtstate);
+ if (ret)
+ goto errout;
+ nexthop_nh->nh_lwtstate =
+ lwtstate_get(lwtstate);
+ }
}
rtnh = rtnh_next(rtnh, &remaining);
} endfor_nexthops(fi);
return 0;
+
+err_inval:
+ ret = -EINVAL;
+
+errout:
+ return ret;
}
-#endif
+static void fib_rebalance(struct fib_info *fi)
+{
+ int total;
+ int w;
+ struct in_device *in_dev;
+
+ if (fi->fib_nhs < 2)
+ return;
+
+ total = 0;
+ for_nexthops(fi) {
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+
+ in_dev = __in_dev_get_rtnl(nh->nh_dev);
+
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nh->nh_flags & RTNH_F_LINKDOWN)
+ continue;
+
+ total += nh->nh_weight;
+ } endfor_nexthops(fi);
+
+ w = 0;
+ change_nexthops(fi) {
+ int upper_bound;
+
+ in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
+
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+ upper_bound = -1;
+ } else if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
+ upper_bound = -1;
+ } else {
+ w += nexthop_nh->nh_weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+ total) - 1;
+ }
+
+ atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
+ } endfor_nexthops(fi);
+
+ net_get_random_once(&fib_multipath_secret,
+ sizeof(fib_multipath_secret));
+}
+
+static inline void fib_add_weight(struct fib_info *fi,
+ const struct fib_nh *nh)
+{
+ fi->fib_weight += nh->nh_weight;
+}
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define fib_rebalance(fi) do { } while (0)
+#define fib_add_weight(fi, nh) do { } while (0)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+static int fib_encap_match(struct net *net, u16 encap_type,
+ struct nlattr *encap,
+ int oif, const struct fib_nh *nh,
+ const struct fib_config *cfg)
+{
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+ int ret, result = 0;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE)
+ return 0;
+
+ if (oif)
+ dev = __dev_get_by_index(net, oif);
+ ret = lwtunnel_build_state(dev, encap_type, encap,
+ AF_INET, cfg, &lwtstate);
+ if (!ret) {
+ result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+ lwtstate_free(lwtstate);
+ }
+
+ return result;
+}
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
+ struct net *net = cfg->fc_nlinfo.nl_net;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct rtnexthop *rtnh;
int remaining;
@@ -496,6 +629,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
return 1;
if (cfg->fc_oif || cfg->fc_gw) {
+ if (cfg->fc_encap) {
+ if (fib_encap_match(net, cfg->fc_encap_type,
+ cfg->fc_encap, cfg->fc_oif,
+ fi->fib_nh, cfg))
+ return 1;
+ }
if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
(!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
return 0;
@@ -585,7 +724,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh)
{
- int err;
+ int err = 0;
struct net *net;
struct net_device *dev;
@@ -594,16 +733,20 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
struct fib_result res;
if (nh->nh_flags & RTNH_F_ONLINK) {
+ unsigned int addr_type;
if (cfg->fc_scope >= RT_SCOPE_LINK)
return -EINVAL;
- if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
- return -EINVAL;
dev = __dev_get_by_index(net, nh->nh_oif);
if (!dev)
return -ENODEV;
if (!(dev->flags & IFF_UP))
return -ENETDOWN;
+ addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
+ if (addr_type != RTN_UNICAST)
+ return -EINVAL;
+ if (!netif_carrier_ok(dev))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
nh->nh_dev = dev;
dev_hold(dev);
nh->nh_scope = RT_SCOPE_LINK;
@@ -611,6 +754,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
}
rcu_read_lock();
{
+ struct fib_table *tbl = NULL;
struct flowi4 fl4 = {
.daddr = nh->nh_gw,
.flowi4_scope = cfg->fc_scope + 1,
@@ -621,7 +765,24 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
/* It is not necessary, but requires a bit of thinking */
if (fl4.flowi4_scope < RT_SCOPE_LINK)
fl4.flowi4_scope = RT_SCOPE_LINK;
- err = fib_lookup(net, &fl4, &res);
+
+ if (cfg->fc_table)
+ tbl = fib_get_table(net, cfg->fc_table);
+
+ if (tbl)
+ err = fib_table_lookup(tbl, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE |
+ FIB_LOOKUP_NOREF);
+
+ /* on error or if no table given do full lookup. This
+ * is needed for example when nexthops are in the local
+ * table rather than the given table
+ */
+ if (!tbl || err) {
+ err = fib_lookup(net, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE);
+ }
+
if (err) {
rcu_read_unlock();
return err;
@@ -636,6 +797,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
if (!dev)
goto out;
dev_hold(dev);
+ if (!netif_carrier_ok(dev))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
} else {
struct in_device *in_dev;
@@ -654,6 +817,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
nh->nh_dev = in_dev->dev;
dev_hold(nh->nh_dev);
nh->nh_scope = RT_SCOPE_HOST;
+ if (!netif_carrier_ok(nh->nh_dev))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
err = 0;
}
out:
@@ -713,8 +878,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *dest;
unsigned int new_hash;
- hlist_del(&fi->fib_hash);
-
new_hash = fib_info_hashfn(fi);
dest = &new_info_hash[new_hash];
hlist_add_head(&fi->fib_hash, dest);
@@ -731,8 +894,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *ldest;
unsigned int new_hash;
- hlist_del(&fi->fib_lhash);
-
new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
ldest = &new_laddrhash[new_hash];
hlist_add_head(&fi->fib_lhash, ldest);
@@ -757,6 +918,74 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
return nh->nh_saddr;
}
+static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
+{
+ if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+ fib_prefsrc != cfg->fc_dst) {
+ u32 tb_id = cfg->fc_table;
+ int rc;
+
+ if (tb_id == RT_TABLE_MAIN)
+ tb_id = RT_TABLE_LOCAL;
+
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, tb_id);
+
+ if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, RT_TABLE_LOCAL);
+ }
+
+ if (rc != RTN_LOCAL)
+ return false;
+ }
+ return true;
+}
+
+static int
+fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
+{
+ bool ecn_ca = false;
+ struct nlattr *nla;
+ int remaining;
+
+ if (!cfg->fc_mx)
+ return 0;
+
+ nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+ int type = nla_type(nla);
+ u32 val;
+
+ if (!type)
+ continue;
+ if (type > RTAX_MAX)
+ return -EINVAL;
+
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ if (val == TCP_CA_UNSPEC)
+ return -EINVAL;
+ } else {
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ return -EINVAL;
+ fi->fib_metrics[type - 1] = val;
+ }
+
+ if (ecn_ca)
+ fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
+ return 0;
+}
+
struct fib_info *fib_create_info(struct fib_config *cfg)
{
int err;
@@ -829,36 +1058,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
} endfor_nexthops(fi)
- if (cfg->fc_mx) {
- struct nlattr *nla;
- int remaining;
-
- nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
- int type = nla_type(nla);
-
- if (type) {
- u32 val;
-
- if (type > RTAX_MAX)
- goto err_inval;
- if (type == RTAX_CC_ALGO) {
- char tmp[TCP_CA_NAME_MAX];
-
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(tmp);
- if (val == TCP_CA_UNSPEC)
- goto err_inval;
- } else {
- val = nla_get_u32(nla);
- }
- if (type == RTAX_ADVMSS && val > 65535 - 40)
- val = 65535 - 40;
- if (type == RTAX_MTU && val > 65535 - 15)
- val = 65535 - 15;
- fi->fib_metrics[type - 1] = val;
- }
- }
- }
+ err = fib_convert_metrics(fi, cfg);
+ if (err)
+ goto failure;
if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -879,6 +1081,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
} else {
struct fib_nh *nh = fi->fib_nh;
+ if (cfg->fc_encap) {
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+
+ if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
+ goto err_inval;
+ if (cfg->fc_oif)
+ dev = __dev_get_by_index(net, cfg->fc_oif);
+ err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ cfg->fc_encap, AF_INET, cfg,
+ &lwtstate);
+ if (err)
+ goto failure;
+
+ nh->nh_lwtstate = lwtstate_get(lwtstate);
+ }
nh->nh_oif = cfg->fc_oif;
nh->nh_gw = cfg->fc_gw;
nh->nh_flags = cfg->fc_flags;
@@ -924,24 +1142,29 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (!nh->nh_dev)
goto failure;
} else {
+ int linkdown = 0;
+
change_nexthops(fi) {
err = fib_check_nh(cfg, fi, nexthop_nh);
if (err != 0)
goto failure;
+ if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
+ linkdown++;
} endfor_nexthops(fi)
+ if (linkdown == fi->fib_nhs)
+ fi->fib_flags |= RTNH_F_LINKDOWN;
}
- if (fi->fib_prefsrc) {
- if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
- fi->fib_prefsrc != cfg->fc_dst)
- if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
- goto err_inval;
- }
+ if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc))
+ goto err_inval;
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
+ fib_add_weight(fi, nexthop_nh);
} endfor_nexthops(fi)
+ fib_rebalance(fi);
+
link_it:
ofi = fib_find_info(fi);
if (ofi) {
@@ -1027,17 +1250,27 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
goto nla_put_failure;
if (fi->fib_nhs == 1) {
+ struct in_device *in_dev;
+
if (fi->fib_nh->nh_gw &&
nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
goto nla_put_failure;
if (fi->fib_nh->nh_oif &&
nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
goto nla_put_failure;
+ if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
+ in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev);
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+ rtm->rtm_flags |= RTNH_F_DEAD;
+ }
#ifdef CONFIG_IP_ROUTE_CLASSID
if (fi->fib_nh[0].nh_tclassid &&
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
goto nla_put_failure;
#endif
+ if (fi->fib_nh->nh_lwtstate)
+ lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate);
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (fi->fib_nhs > 1) {
@@ -1049,11 +1282,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
for_nexthops(fi) {
+ struct in_device *in_dev;
+
rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
if (!rtnh)
goto nla_put_failure;
rtnh->rtnh_flags = nh->nh_flags & 0xFF;
+ if (nh->nh_flags & RTNH_F_LINKDOWN) {
+ in_dev = __in_dev_get_rtnl(nh->nh_dev);
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+ rtnh->rtnh_flags |= RTNH_F_DEAD;
+ }
rtnh->rtnh_hops = nh->nh_weight - 1;
rtnh->rtnh_ifindex = nh->nh_oif;
@@ -1065,6 +1306,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
goto nla_put_failure;
#endif
+ if (nh->nh_lwtstate)
+ lwtunnel_fill_encap(skb, nh->nh_lwtstate);
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
} endfor_nexthops(fi);
@@ -1107,7 +1350,13 @@ int fib_sync_down_addr(struct net *net, __be32 local)
return ret;
}
-int fib_sync_down_dev(struct net_device *dev, int force)
+/* Event              force Flags           Description
+ * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
+ * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
+ * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
+ * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
+ */
+int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
int ret = 0;
int scope = RT_SCOPE_NOWHERE;
@@ -1133,49 +1382,79 @@ int fib_sync_down_dev(struct net_device *dev, int force)
dead++;
else if (nexthop_nh->nh_dev == dev &&
nexthop_nh->nh_scope != scope) {
- nexthop_nh->nh_flags |= RTNH_F_DEAD;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- fi->fib_power -= nexthop_nh->nh_power;
- nexthop_nh->nh_power = 0;
- spin_unlock_bh(&fib_multipath_lock);
-#endif
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ nexthop_nh->nh_flags |= RTNH_F_DEAD;
+ /* fall through */
+ case NETDEV_CHANGE:
+ nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
+ break;
+ }
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (force > 1 && nexthop_nh->nh_dev == dev) {
+ if (event == NETDEV_UNREGISTER &&
+ nexthop_nh->nh_dev == dev) {
dead = fi->fib_nhs;
break;
}
#endif
} endfor_nexthops(fi)
if (dead == fi->fib_nhs) {
- fi->fib_flags |= RTNH_F_DEAD;
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ fi->fib_flags |= RTNH_F_DEAD;
+ /* fall through */
+ case NETDEV_CHANGE:
+ fi->fib_flags |= RTNH_F_LINKDOWN;
+ break;
+ }
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
}
/* Must be invoked inside of an RCU protected region. */
-void fib_select_default(struct fib_result *res)
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
struct fib_info *fi = NULL, *last_resort = NULL;
struct hlist_head *fa_head = res->fa_head;
struct fib_table *tb = res->table;
+ u8 slen = 32 - res->prefixlen;
int order = -1, last_idx = -1;
- struct fib_alias *fa;
+ struct fib_alias *fa, *fa1 = NULL;
+ u32 last_prio = res->fi->fib_priority;
+ u8 last_tos = 0;
hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
+ if (fa->fa_slen != slen)
+ continue;
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fa->tb_id != tb->tb_id)
+ continue;
+ if (next_fi->fib_priority > last_prio &&
+ fa->fa_tos == last_tos) {
+ if (last_tos)
+ continue;
+ break;
+ }
+ if (next_fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ last_tos = fa->fa_tos;
+ last_prio = next_fi->fib_priority;
+
if (next_fi->fib_scope != res->scope ||
fa->fa_type != RTN_UNICAST)
continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
@@ -1185,10 +1464,11 @@ void fib_select_default(struct fib_result *res)
if (!fi) {
if (next_fi != res->fi)
break;
+ fa1 = fa;
} else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
+ &last_idx, fa1->fa_default)) {
fib_result_assign(res, fi);
- tb->tb_default = order;
+ fa1->fa_default = order;
goto out;
}
fi = next_fi;
@@ -1196,31 +1476,30 @@ void fib_select_default(struct fib_result *res)
}
if (order <= 0 || !fi) {
- tb->tb_default = -1;
+ if (fa1)
+ fa1->fa_default = -1;
goto out;
}
if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
+ fa1->fa_default)) {
fib_result_assign(res, fi);
- tb->tb_default = order;
+ fa1->fa_default = order;
goto out;
}
if (last_idx >= 0)
fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
+ fa1->fa_default = last_idx;
out:
return;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
/*
* Dead device goes up. We wake up dead nexthops.
* It takes sense only on multipath routes.
*/
-int fib_sync_up(struct net_device *dev)
+int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
{
struct fib_info *prev_fi;
unsigned int hash;
@@ -1231,6 +1510,13 @@ int fib_sync_up(struct net_device *dev)
if (!(dev->flags & IFF_UP))
return 0;
+ if (nh_flags & RTNH_F_DEAD) {
+ unsigned int flags = dev_get_flags(dev);
+
+ if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+ nh_flags |= RTNH_F_LINKDOWN;
+ }
+
prev_fi = NULL;
hash = fib_devindex_hashfn(dev->ifindex);
head = &fib_info_devhash[hash];
@@ -1247,7 +1533,7 @@ int fib_sync_up(struct net_device *dev)
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & nh_flags)) {
alive++;
continue;
}
@@ -1258,71 +1544,57 @@ int fib_sync_up(struct net_device *dev)
!__in_dev_get_rtnl(dev))
continue;
alive++;
- spin_lock_bh(&fib_multipath_lock);
- nexthop_nh->nh_power = 0;
- nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
- spin_unlock_bh(&fib_multipath_lock);
+ nexthop_nh->nh_flags &= ~nh_flags;
} endfor_nexthops(fi)
if (alive > 0) {
- fi->fib_flags &= ~RTNH_F_DEAD;
+ fi->fib_flags &= ~nh_flags;
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
}
-/*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
- */
-void fib_select_multipath(struct fib_result *res)
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+void fib_select_multipath(struct fib_result *res, int hash)
{
struct fib_info *fi = res->fi;
- int w;
- spin_lock_bh(&fib_multipath_lock);
- if (fi->fib_power <= 0) {
- int power = 0;
- change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
- power += nexthop_nh->nh_weight;
- nexthop_nh->nh_power = nexthop_nh->nh_weight;
- }
- } endfor_nexthops(fi);
- fi->fib_power = power;
- if (power <= 0) {
- spin_unlock_bh(&fib_multipath_lock);
- /* Race condition: route has just become dead. */
- res->nh_sel = 0;
- return;
- }
- }
-
-
- /* w should be random number [0..fi->fib_power-1],
- * it is pretty bad approximation.
- */
-
- w = jiffies % fi->fib_power;
+ for_nexthops(fi) {
+ if (hash > atomic_read(&nh->nh_upper_bound))
+ continue;
- change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
- nexthop_nh->nh_power) {
- w -= nexthop_nh->nh_power;
- if (w <= 0) {
- nexthop_nh->nh_power--;
- fi->fib_power--;
- res->nh_sel = nhsel;
- spin_unlock_bh(&fib_multipath_lock);
- return;
- }
- }
+ res->nh_sel = nhsel;
+ return;
} endfor_nexthops(fi);
/* Race condition: route has just become dead. */
res->nh_sel = 0;
- spin_unlock_bh(&fib_multipath_lock);
}
#endif
+
+void fib_select_path(struct net *net, struct fib_result *res,
+ struct flowi4 *fl4, int mp_hash)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+ if (mp_hash < 0)
+ mp_hash = get_hash_from_flowi4(fl4) >> 1;
+
+ fib_select_multipath(res, mp_hash);
+ }
+ else
+#endif
+ if (!res->prefixlen &&
+ res->table->tb_num_default > 1 &&
+ res->type == RTN_UNICAST && !fl4->flowi4_oif)
+ fib_select_default(fl4, res);
+
+ if (!fl4->saddr)
+ fl4->saddr = FIB_RES_PREFSRC(net, *res);
+}
+EXPORT_SYMBOL_GPL(fib_select_path);
diff --git a/kernel/net/ipv4/fib_trie.c b/kernel/net/ipv4/fib_trie.c
index 0ca933db1..744e5936c 100644
--- a/kernel/net/ipv4/fib_trie.c
+++ b/kernel/net/ipv4/fib_trie.c
@@ -72,6 +72,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -80,6 +81,7 @@
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/switchdev.h>
+#include <trace/events/fib.h>
#include "fib_lookup.h"
#define MAX_STAT_DEPTH 32
@@ -324,13 +326,15 @@ static inline void empty_child_dec(struct key_vector *n)
static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
- struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
- struct key_vector *l = kv->kv;
+ struct key_vector *l;
+ struct tnode *kv;
+ kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (!kv)
return NULL;
/* initialize key vector */
+ l = kv->kv;
l->key = key;
l->pos = 0;
l->bits = 0;
@@ -345,24 +349,26 @@ static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
- struct tnode *tnode = tnode_alloc(bits);
unsigned int shift = pos + bits;
- struct key_vector *tn = tnode->kv;
+ struct key_vector *tn;
+ struct tnode *tnode;
/* verify bits and pos their msb bits clear and values are valid */
BUG_ON(!bits || (shift > KEYLENGTH));
- pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
- sizeof(struct key_vector *) << bits);
-
+ tnode = tnode_alloc(bits);
if (!tnode)
return NULL;
+ pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
+ sizeof(struct key_vector *) << bits);
+
if (bits == KEYLENGTH)
tnode->full_children = 1;
else
tnode->empty_children = 1ul << bits;
+ tn = tnode->kv;
tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
tn->pos = pos;
tn->bits = bits;
@@ -1077,6 +1083,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
+ unsigned int nlflags = 0;
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
@@ -1165,14 +1172,15 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_state = state & ~FA_S_ACCESSED;
new_fa->fa_slen = fa->fa_slen;
new_fa->tb_id = tb->tb_id;
+ new_fa->fa_default = -1;
- err = netdev_switch_fib_ipv4_add(key, plen, fi,
- new_fa->fa_tos,
- cfg->fc_type,
- cfg->fc_nlflags,
- tb->tb_id);
+ err = switchdev_fib_ipv4_add(key, plen, fi,
+ new_fa->fa_tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
if (err) {
- netdev_switch_fib_ipv4_abort(fi);
+ switchdev_fib_ipv4_abort(fi);
kmem_cache_free(fn_alias_kmem, new_fa);
goto out;
}
@@ -1196,7 +1204,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (fa_match)
goto out;
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
+ if (cfg->fc_nlflags & NLM_F_APPEND)
+ nlflags = NLM_F_APPEND;
+ else
fa = fa_first;
}
err = -ENOENT;
@@ -1214,14 +1224,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_state = 0;
new_fa->fa_slen = slen;
new_fa->tb_id = tb->tb_id;
+ new_fa->fa_default = -1;
/* (Optionally) offload fib entry to switch hardware. */
- err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
- cfg->fc_type,
- cfg->fc_nlflags,
- tb->tb_id);
+ err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
+ cfg->fc_nlflags, tb->tb_id);
if (err) {
- netdev_switch_fib_ipv4_abort(fi);
+ switchdev_fib_ipv4_abort(fi);
goto out_free_new_fa;
}
@@ -1235,12 +1244,12 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
- &cfg->fc_nlinfo, 0);
+ &cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
out_sw_fib_del:
- netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
+ switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1270,6 +1279,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
unsigned long index;
t_key cindex;
+ trace_fib_table_lookup(tb->tb_id, flp);
+
pn = t->kv;
cindex = 0;
@@ -1406,11 +1417,20 @@ found:
continue;
for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
const struct fib_nh *nh = &fi->fib_nh[nhsel];
+ struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);
if (nh->nh_flags & RTNH_F_DEAD)
continue;
- if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nh->nh_flags & RTNH_F_LINKDOWN &&
+ !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
continue;
+ if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
+ if (flp->flowi4_oif &&
+ flp->flowi4_oif != nh->nh_oif)
+ continue;
+ }
if (!(fib_flags & FIB_LOOKUP_NOREF))
atomic_inc(&fi->fib_clntref);
@@ -1425,6 +1445,8 @@ found:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
+ trace_fib_table_lookup_nh(nh);
+
return err;
}
}
@@ -1518,8 +1540,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
- cfg->fc_type, tb->tb_id);
+ switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
+ cfg->fc_type, tb->tb_id);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1547,7 +1569,7 @@ static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
do {
/* record parent and next child index */
pn = n;
- cindex = key ? get_index(key, pn) : 0;
+ cindex = (key > pn->key) ? get_index(key, pn) : 0;
if (cindex >> pn->bits)
break;
@@ -1768,10 +1790,9 @@ void fib_table_flush_external(struct fib_table *tb)
if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
continue;
- netdev_switch_fib_ipv4_del(n->key,
- KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos,
- fa->fa_type, tb->tb_id);
+ switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id);
}
/* update leaf slen */
@@ -1834,10 +1855,9 @@ int fib_table_flush(struct fib_table *tb)
continue;
}
- netdev_switch_fib_ipv4_del(n->key,
- KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos,
- fa->fa_type, tb->tb_id);
+ switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -1976,7 +1996,6 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
return NULL;
tb->tb_id = id;
- tb->tb_default = -1;
tb->tb_num_default = 0;
tb->tb_data = (alias ? alias->__data : tb->__data);
@@ -2053,11 +2072,12 @@ static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
- struct key_vector *n, *pn = t->kv;
+ struct key_vector *n, *pn;
if (!t)
return NULL;
+ pn = t->kv;
n = rcu_dereference(pn->tnode[0]);
if (!n)
return NULL;
diff --git a/kernel/net/ipv4/fou.c b/kernel/net/ipv4/fou.c
index 34968cd5c..bd903fe0f 100644
--- a/kernel/net/ipv4/fou.c
+++ b/kernel/net/ipv4/fou.c
@@ -24,6 +24,7 @@ struct fou {
u16 type;
struct udp_offload udp_offloads;
struct list_head list;
+ struct rcu_head rcu;
};
#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
@@ -79,7 +80,11 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
__be16 *pd = data;
size_t start = ntohs(pd[0]);
size_t offset = ntohs(pd[1]);
- size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+ size_t plen = sizeof(struct udphdr) + hdrlen +
+ max_t(size_t, offset + sizeof(u16), start);
+
+ if (skb->remcsum_offload)
+ return guehdr;
if (!pskb_may_pull(skb, plen))
return NULL;
@@ -221,29 +226,21 @@ out_unlock:
static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
struct guehdr *guehdr, void *data,
- size_t hdrlen, u8 ipproto,
- struct gro_remcsum *grc, bool nopartial)
+ size_t hdrlen, struct gro_remcsum *grc,
+ bool nopartial)
{
__be16 *pd = data;
size_t start = ntohs(pd[0]);
size_t offset = ntohs(pd[1]);
- size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
if (skb->remcsum_offload)
- return NULL;
+ return guehdr;
if (!NAPI_GRO_CB(skb)->csum_valid)
return NULL;
- /* Pull checksum that will be written */
- if (skb_gro_header_hard(skb, off + plen)) {
- guehdr = skb_gro_header_slow(skb, off + plen, off);
- if (!guehdr)
- return NULL;
- }
-
- skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen,
- start, offset, grc, nopartial);
+ guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen,
+ start, offset, grc, nopartial);
skb->remcsum_offload = 1;
@@ -307,10 +304,10 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
if (flags & GUE_PFLAG_REMCSUM) {
guehdr = gue_gro_remcsum(skb, off, guehdr,
- data + doffset, hdrlen,
- guehdr->proto_ctype, &grc,
+ data + doffset, hdrlen, &grc,
!!(fou->flags &
FOU_F_REMCSUM_NOPARTIAL));
+
if (!guehdr)
goto out;
@@ -351,7 +348,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[guehdr->proto_ctype]);
- if (WARN_ON(!ops || !ops->callbacks.gro_receive))
+ if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive))
goto out_unlock;
pp = ops->callbacks.gro_receive(head, skb);
@@ -421,7 +418,7 @@ static void fou_release(struct fou *fou)
list_del(&fou->list);
udp_tunnel_sock_release(sock);
- kfree(fou);
+ kfree_rcu(fou, rcu);
}
static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
@@ -570,7 +567,7 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_AF]) {
u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
- if (family != AF_INET && family != AF_INET6)
+ if (family != AF_INET)
return -EINVAL;
cfg->udp_config.family = family;
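
The gue_remcsum() change above grows the pskb_may_pull() length by sizeof(struct udphdr), so the pull now covers the UDP header in addition to the GUE header and the remote-checksum region. A small stand-alone sketch of the new arithmetic; the input numbers are illustrative only:

/* Worked example of the new remcsum pull length in gue_remcsum(). */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define UDPHDR_LEN 8	/* sizeof(struct udphdr) */

static size_t remcsum_pull_len(size_t hdrlen, size_t start, size_t offset)
{
	size_t tail = offset + sizeof(uint16_t) > start
		    ? offset + sizeof(uint16_t) : start;

	return UDPHDR_LEN + hdrlen + tail;	/* was: hdrlen + tail */
}

int main(void)
{
	/* GUE header of 8 bytes, checksum start 14, checksum offset 16 */
	printf("%zu\n", remcsum_pull_len(8, 14, 16));	/* 8 + 8 + 18 = 34 */
	return 0;
}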
diff --git a/kernel/net/ipv4/geneve.c b/kernel/net/ipv4/geneve.c
deleted file mode 100644
index 8986e63f3..000000000
--- a/kernel/net/ipv4/geneve.c
+++ /dev/null
@@ -1,453 +0,0 @@
-/*
- * Geneve: Generic Network Virtualization Encapsulation
- *
- * Copyright (c) 2014 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/skbuff.h>
-#include <linux/list.h>
-#include <linux/netdevice.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/igmp.h>
-#include <linux/etherdevice.h>
-#include <linux/if_ether.h>
-#include <linux/if_vlan.h>
-#include <linux/ethtool.h>
-#include <linux/mutex.h>
-#include <net/arp.h>
-#include <net/ndisc.h>
-#include <net/ip.h>
-#include <net/ip_tunnels.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-#include <net/geneve.h>
-#include <net/protocol.h>
-#include <net/udp_tunnel.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#include <net/addrconf.h>
-#include <net/ip6_tunnel.h>
-#include <net/ip6_checksum.h>
-#endif
-
-/* Protects sock_list and refcounts. */
-static DEFINE_MUTEX(geneve_mutex);
-
-/* per-network namespace private data for this module */
-struct geneve_net {
- struct list_head sock_list;
-};
-
-static int geneve_net_id;
-
-static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
-{
- return (struct genevehdr *)(udp_hdr(skb) + 1);
-}
-
-static struct geneve_sock *geneve_find_sock(struct net *net,
- sa_family_t family, __be16 port)
-{
- struct geneve_net *gn = net_generic(net, geneve_net_id);
- struct geneve_sock *gs;
-
- list_for_each_entry(gs, &gn->sock_list, list) {
- if (inet_sk(gs->sock->sk)->inet_sport == port &&
- inet_sk(gs->sock->sk)->sk.sk_family == family)
- return gs;
- }
-
- return NULL;
-}
-
-static void geneve_build_header(struct genevehdr *geneveh,
- __be16 tun_flags, u8 vni[3],
- u8 options_len, u8 *options)
-{
- geneveh->ver = GENEVE_VER;
- geneveh->opt_len = options_len / 4;
- geneveh->oam = !!(tun_flags & TUNNEL_OAM);
- geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
- geneveh->rsvd1 = 0;
- memcpy(geneveh->vni, vni, 3);
- geneveh->proto_type = htons(ETH_P_TEB);
- geneveh->rsvd2 = 0;
-
- memcpy(geneveh->options, options, options_len);
-}
-
-/* Transmit a fully formatted Geneve frame.
- *
- * When calling this function. The skb->data should point
- * to the geneve header which is fully formed.
- *
- * This function will add other UDP tunnel headers.
- */
-int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
- struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
- __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
- __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
- bool csum, bool xnet)
-{
- struct genevehdr *gnvh;
- int min_headroom;
- int err;
-
- min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
- + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
- + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
-
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err)) {
- kfree_skb(skb);
- return err;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (unlikely(!skb))
- return -ENOMEM;
-
- skb = udp_tunnel_handle_offloads(skb, csum);
- if (IS_ERR(skb))
- return PTR_ERR(skb);
-
- gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
- geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
-
- skb_set_inner_protocol(skb, htons(ETH_P_TEB));
-
- return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst,
- tos, ttl, df, src_port, dst_port, xnet,
- !csum);
-}
-EXPORT_SYMBOL_GPL(geneve_xmit_skb);
-
-static int geneve_hlen(struct genevehdr *gh)
-{
- return sizeof(*gh) + gh->opt_len * 4;
-}
-
-static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
- struct sk_buff *skb,
- struct udp_offload *uoff)
-{
- struct sk_buff *p, **pp = NULL;
- struct genevehdr *gh, *gh2;
- unsigned int hlen, gh_len, off_gnv;
- const struct packet_offload *ptype;
- __be16 type;
- int flush = 1;
-
- off_gnv = skb_gro_offset(skb);
- hlen = off_gnv + sizeof(*gh);
- gh = skb_gro_header_fast(skb, off_gnv);
- if (skb_gro_header_hard(skb, hlen)) {
- gh = skb_gro_header_slow(skb, hlen, off_gnv);
- if (unlikely(!gh))
- goto out;
- }
-
- if (gh->ver != GENEVE_VER || gh->oam)
- goto out;
- gh_len = geneve_hlen(gh);
-
- hlen = off_gnv + gh_len;
- if (skb_gro_header_hard(skb, hlen)) {
- gh = skb_gro_header_slow(skb, hlen, off_gnv);
- if (unlikely(!gh))
- goto out;
- }
-
- flush = 0;
-
- for (p = *head; p; p = p->next) {
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
-
- gh2 = (struct genevehdr *)(p->data + off_gnv);
- if (gh->opt_len != gh2->opt_len ||
- memcmp(gh, gh2, gh_len)) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
- }
-
- type = gh->proto_type;
-
- rcu_read_lock();
- ptype = gro_find_receive_by_type(type);
- if (!ptype) {
- flush = 1;
- goto out_unlock;
- }
-
- skb_gro_pull(skb, gh_len);
- skb_gro_postpull_rcsum(skb, gh, gh_len);
- pp = ptype->callbacks.gro_receive(head, skb);
-
-out_unlock:
- rcu_read_unlock();
-out:
- NAPI_GRO_CB(skb)->flush |= flush;
-
- return pp;
-}
-
-static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
- struct udp_offload *uoff)
-{
- struct genevehdr *gh;
- struct packet_offload *ptype;
- __be16 type;
- int gh_len;
- int err = -ENOSYS;
-
- udp_tunnel_gro_complete(skb, nhoff);
-
- gh = (struct genevehdr *)(skb->data + nhoff);
- gh_len = geneve_hlen(gh);
- type = gh->proto_type;
-
- rcu_read_lock();
- ptype = gro_find_complete_by_type(type);
- if (ptype)
- err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
-
- rcu_read_unlock();
- return err;
-}
-
-static void geneve_notify_add_rx_port(struct geneve_sock *gs)
-{
- struct sock *sk = gs->sock->sk;
- sa_family_t sa_family = sk->sk_family;
- int err;
-
- if (sa_family == AF_INET) {
- err = udp_add_offload(&gs->udp_offloads);
- if (err)
- pr_warn("geneve: udp_add_offload failed with status %d\n",
- err);
- }
-}
-
-static void geneve_notify_del_rx_port(struct geneve_sock *gs)
-{
- struct sock *sk = gs->sock->sk;
- sa_family_t sa_family = sk->sk_family;
-
- if (sa_family == AF_INET)
- udp_del_offload(&gs->udp_offloads);
-}
-
-/* Callback from net/ipv4/udp.c to receive packets */
-static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct genevehdr *geneveh;
- struct geneve_sock *gs;
- int opts_len;
-
- /* Need Geneve and inner Ethernet header to be present */
- if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
- goto error;
-
- /* Return packets with reserved bits set */
- geneveh = geneve_hdr(skb);
-
- if (unlikely(geneveh->ver != GENEVE_VER))
- goto error;
-
- if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
- goto error;
-
- opts_len = geneveh->opt_len * 4;
- if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
- htons(ETH_P_TEB)))
- goto drop;
-
- gs = rcu_dereference_sk_user_data(sk);
- if (!gs)
- goto drop;
-
- gs->rcv(gs, skb);
- return 0;
-
-drop:
- /* Consume bad packet */
- kfree_skb(skb);
- return 0;
-
-error:
- /* Let the UDP layer deal with the skb */
- return 1;
-}
-
-static struct socket *geneve_create_sock(struct net *net, bool ipv6,
- __be16 port)
-{
- struct socket *sock;
- struct udp_port_cfg udp_conf;
- int err;
-
- memset(&udp_conf, 0, sizeof(udp_conf));
-
- if (ipv6) {
- udp_conf.family = AF_INET6;
- } else {
- udp_conf.family = AF_INET;
- udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
- }
-
- udp_conf.local_udp_port = port;
-
- /* Open UDP socket */
- err = udp_sock_create(net, &udp_conf, &sock);
- if (err < 0)
- return ERR_PTR(err);
-
- return sock;
-}
-
-/* Create new listen socket if needed */
-static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
- geneve_rcv_t *rcv, void *data,
- bool ipv6)
-{
- struct geneve_net *gn = net_generic(net, geneve_net_id);
- struct geneve_sock *gs;
- struct socket *sock;
- struct udp_tunnel_sock_cfg tunnel_cfg;
-
- gs = kzalloc(sizeof(*gs), GFP_KERNEL);
- if (!gs)
- return ERR_PTR(-ENOMEM);
-
- sock = geneve_create_sock(net, ipv6, port);
- if (IS_ERR(sock)) {
- kfree(gs);
- return ERR_CAST(sock);
- }
-
- gs->sock = sock;
- gs->refcnt = 1;
- gs->rcv = rcv;
- gs->rcv_data = data;
-
- /* Initialize the geneve udp offloads structure */
- gs->udp_offloads.port = port;
- gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive;
- gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
- geneve_notify_add_rx_port(gs);
-
- /* Mark socket as an encapsulation socket */
- tunnel_cfg.sk_user_data = gs;
- tunnel_cfg.encap_type = 1;
- tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
- tunnel_cfg.encap_destroy = NULL;
- setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
-
- list_add(&gs->list, &gn->sock_list);
-
- return gs;
-}
-
-struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
- geneve_rcv_t *rcv, void *data,
- bool no_share, bool ipv6)
-{
- struct geneve_sock *gs;
-
- mutex_lock(&geneve_mutex);
-
- gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
- if (gs) {
- if (!no_share && gs->rcv == rcv)
- gs->refcnt++;
- else
- gs = ERR_PTR(-EBUSY);
- } else {
- gs = geneve_socket_create(net, port, rcv, data, ipv6);
- }
-
- mutex_unlock(&geneve_mutex);
-
- return gs;
-}
-EXPORT_SYMBOL_GPL(geneve_sock_add);
-
-void geneve_sock_release(struct geneve_sock *gs)
-{
- mutex_lock(&geneve_mutex);
-
- if (--gs->refcnt)
- goto unlock;
-
- list_del(&gs->list);
- geneve_notify_del_rx_port(gs);
- udp_tunnel_sock_release(gs->sock);
- kfree_rcu(gs, rcu);
-
-unlock:
- mutex_unlock(&geneve_mutex);
-}
-EXPORT_SYMBOL_GPL(geneve_sock_release);
-
-static __net_init int geneve_init_net(struct net *net)
-{
- struct geneve_net *gn = net_generic(net, geneve_net_id);
-
- INIT_LIST_HEAD(&gn->sock_list);
-
- return 0;
-}
-
-static struct pernet_operations geneve_net_ops = {
- .init = geneve_init_net,
- .id = &geneve_net_id,
- .size = sizeof(struct geneve_net),
-};
-
-static int __init geneve_init_module(void)
-{
- int rc;
-
- rc = register_pernet_subsys(&geneve_net_ops);
- if (rc)
- return rc;
-
- pr_info("Geneve driver\n");
-
- return 0;
-}
-module_init(geneve_init_module);
-
-static void __exit geneve_cleanup_module(void)
-{
- unregister_pernet_subsys(&geneve_net_ops);
-}
-module_exit(geneve_cleanup_module);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
-MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
-MODULE_ALIAS_RTNL_LINK("geneve");
diff --git a/kernel/net/ipv4/gre_demux.c b/kernel/net/ipv4/gre_demux.c
index 4a7b5b2a1..d9c552a72 100644
--- a/kernel/net/ipv4/gre_demux.c
+++ b/kernel/net/ipv4/gre_demux.c
@@ -31,7 +31,6 @@
#include <net/xfrm.h>
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
-static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
@@ -61,197 +60,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
-void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
- int hdr_len)
-{
- struct gre_base_hdr *greh;
-
- skb_push(skb, hdr_len);
-
- skb_reset_transport_header(skb);
- greh = (struct gre_base_hdr *)skb->data;
- greh->flags = tnl_flags_to_gre_flags(tpi->flags);
- greh->protocol = tpi->proto;
-
- if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
-
- if (tpi->flags&TUNNEL_SEQ) {
- *ptr = tpi->seq;
- ptr--;
- }
- if (tpi->flags&TUNNEL_KEY) {
- *ptr = tpi->key;
- ptr--;
- }
- if (tpi->flags&TUNNEL_CSUM &&
- !(skb_shinfo(skb)->gso_type &
- (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
- *ptr = 0;
- *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
- skb->len, 0));
- }
- }
-}
-EXPORT_SYMBOL_GPL(gre_build_header);
-
-static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
- bool *csum_err)
-{
- const struct gre_base_hdr *greh;
- __be32 *options;
- int hdr_len;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
- return -EINVAL;
-
- tpi->flags = gre_flags_to_tnl_flags(greh->flags);
- hdr_len = ip_gre_calc_hlen(tpi->flags);
-
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- tpi->proto = greh->protocol;
-
- options = (__be32 *)(greh + 1);
- if (greh->flags & GRE_CSUM) {
- if (skb_checksum_simple_validate(skb)) {
- *csum_err = true;
- return -EINVAL;
- }
-
- skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
- null_compute_pseudo);
-
- options++;
- }
-
- if (greh->flags & GRE_KEY) {
- tpi->key = *options;
- options++;
- } else
- tpi->key = 0;
-
- if (unlikely(greh->flags & GRE_SEQ)) {
- tpi->seq = *options;
- options++;
- } else
- tpi->seq = 0;
-
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
- tpi->proto = htons(ETH_P_IP);
- if ((*(u8 *)options & 0xF0) != 0x40) {
- hdr_len += 4;
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
- }
- }
-
- return iptunnel_pull_header(skb, hdr_len, tpi->proto);
-}
-
-static int gre_cisco_rcv(struct sk_buff *skb)
-{
- struct tnl_ptk_info tpi;
- int i;
- bool csum_err = false;
-
-#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
- /* Looped back packet, drop it! */
- if (rt_is_output_route(skb_rtable(skb)))
- goto drop;
- }
-#endif
-
- if (parse_gre_header(skb, &tpi, &csum_err) < 0)
- goto drop;
-
- rcu_read_lock();
- for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
- struct gre_cisco_protocol *proto;
- int ret;
-
- proto = rcu_dereference(gre_cisco_proto_list[i]);
- if (!proto)
- continue;
- ret = proto->handler(skb, &tpi);
- if (ret == PACKET_RCVD) {
- rcu_read_unlock();
- return 0;
- }
- }
- rcu_read_unlock();
-
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-drop:
- kfree_skb(skb);
- return 0;
-}
-
-static void gre_cisco_err(struct sk_buff *skb, u32 info)
-{
- /* All the routers (except for Linux) return only
- * 8 bytes of packet payload. It means, that precise relaying of
- * ICMP in the real Internet is absolutely infeasible.
- *
- * Moreover, Cisco "wise men" put GRE key to the third word
- * in GRE header. It makes impossible maintaining even soft
- * state for keyed
- * GRE tunnels with enabled checksum. Tell them "thank you".
- *
- * Well, I wonder, rfc1812 was written by Cisco employee,
- * what the hell these idiots break standards established
- * by themselves???
- */
-
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- struct tnl_ptk_info tpi;
- bool csum_err = false;
- int i;
-
- if (parse_gre_header(skb, &tpi, &csum_err)) {
- if (!csum_err) /* ignore csum errors. */
- return;
- }
-
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- skb->dev->ifindex, 0, IPPROTO_GRE, 0);
- return;
- }
- if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
- IPPROTO_GRE, 0);
- return;
- }
-
- rcu_read_lock();
- for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
- struct gre_cisco_protocol *proto;
-
- proto = rcu_dereference(gre_cisco_proto_list[i]);
- if (!proto)
- continue;
-
- if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
- goto out;
-
- }
-out:
- rcu_read_unlock();
-}
-
static int gre_rcv(struct sk_buff *skb)
{
const struct gre_protocol *proto;
@@ -302,60 +110,19 @@ static const struct net_protocol net_gre_protocol = {
.netns_ok = 1,
};
-static const struct gre_protocol ipgre_protocol = {
- .handler = gre_cisco_rcv,
- .err_handler = gre_cisco_err,
-};
-
-int gre_cisco_register(struct gre_cisco_protocol *newp)
-{
- struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
- &gre_cisco_proto_list[newp->priority];
-
- return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
-}
-EXPORT_SYMBOL_GPL(gre_cisco_register);
-
-int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
-{
- struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
- &gre_cisco_proto_list[del_proto->priority];
- int ret;
-
- ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
-
- if (ret)
- return ret;
-
- synchronize_net();
- return 0;
-}
-EXPORT_SYMBOL_GPL(gre_cisco_unregister);
-
static int __init gre_init(void)
{
pr_info("GRE over IPv4 demultiplexor driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
pr_err("can't add protocol\n");
- goto err;
- }
-
- if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
- pr_info("%s: can't add ipgre handler\n", __func__);
- goto err_gre;
+ return -EAGAIN;
}
-
return 0;
-err_gre:
- inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
-err:
- return -EAGAIN;
}
static void __exit gre_exit(void)
{
- gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
diff --git a/kernel/net/ipv4/gre_offload.c b/kernel/net/ipv4/gre_offload.c
index 5aa46d4b4..5a8ee3282 100644
--- a/kernel/net/ipv4/gre_offload.c
+++ b/kernel/net/ipv4/gre_offload.c
@@ -36,7 +36,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
SKB_GSO_TCP_ECN |
SKB_GSO_GRE |
SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP)))
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT)))
goto out;
if (!skb->encapsulation)
diff --git a/kernel/net/ipv4/icmp.c b/kernel/net/ipv4/icmp.c
index be5fd9b81..74314d95d 100644
--- a/kernel/net/ipv4/icmp.c
+++ b/kernel/net/ipv4/icmp.c
@@ -97,6 +97,7 @@
#include <net/xfrm.h>
#include <net/inet_common.h>
#include <net/ip_fib.h>
+#include <net/l3mdev.h>
/*
* Build xmit assembly blocks
@@ -309,9 +310,10 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
rc = false;
if (icmp_global_allow()) {
+ int vif = l3mdev_master_ifindex(dst->dev);
struct inet_peer *peer;
- peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
rc = inet_peer_xrlim_allow(peer,
net->ipv4.sysctl_icmp_ratelimit);
if (peer)
@@ -426,6 +428,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.flowi4_mark = mark;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
+ fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
@@ -438,6 +441,22 @@ out_unlock:
icmp_xmit_unlock(sk);
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* Source and destination is swapped. See ip_multipath_icmp_hash */
+static int icmp_multipath_hash_skb(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return fib_multipath_hash(iph->daddr, iph->saddr);
+}
+
+#else
+
+#define icmp_multipath_hash_skb(skb) (-1)
+
+#endif
+
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
@@ -459,8 +478,11 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
+ fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev);
+
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
- rt = __ip_route_output_key(net, fl4);
+ rt = __ip_route_output_key_hash(net, fl4,
+ icmp_multipath_hash_skb(skb_in));
if (IS_ERR(rt))
return rt;
@@ -481,7 +503,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
if (err)
goto relookup_failed;
- if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
+ if (inet_addr_type_dev_table(net, skb_in->dev,
+ fl4_dec.saddr) == RTN_LOCAL) {
rt2 = __ip_route_output_key(net, &fl4_dec);
if (IS_ERR(rt2))
err = PTR_ERR(rt2);
@@ -497,6 +520,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
}
/* Ugh! */
orefdst = skb_in->_skb_refdst; /* save old refdst */
+ skb_dst_set(skb_in, NULL);
err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
RT_TOS(tos), rt2->dst.dev);
@@ -829,7 +853,7 @@ static bool icmp_unreach(struct sk_buff *skb)
*/
if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
- inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
+ inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
&ip_hdr(skb)->saddr,
icmph->type, icmph->code,
diff --git a/kernel/net/ipv4/igmp.c b/kernel/net/ipv4/igmp.c
index a3a697f5f..05e4cba14 100644
--- a/kernel/net/ipv4/igmp.c
+++ b/kernel/net/ipv4/igmp.c
@@ -110,6 +110,9 @@
#define IP_MAX_MEMBERSHIPS 20
#define IP_MAX_MSF 10
+/* IGMP reports for link-local multicast groups are enabled by default */
+int sysctl_igmp_llm_reports __read_mostly = 1;
+
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
@@ -394,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return ip_local_out(skb);
+ return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -437,6 +440,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
+ if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports)
+ return skb;
isquery = type == IGMPV3_MODE_IS_INCLUDE ||
type == IGMPV3_MODE_IS_EXCLUDE;
@@ -545,6 +550,9 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
for_each_pmc_rcu(in_dev, pmc) {
if (pmc->multiaddr == IGMP_ALL_HOSTS)
continue;
+ if (ipv4_is_local_multicast(pmc->multiaddr) &&
+ !sysctl_igmp_llm_reports)
+ continue;
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
type = IGMPV3_MODE_IS_EXCLUDE;
@@ -678,7 +686,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
- else if (type == IGMP_HOST_LEAVE_MESSAGE)
+
+ if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+ return 0;
+
+ if (type == IGMP_HOST_LEAVE_MESSAGE)
dst = IGMP_ALL_ROUTER;
else
dst = group;
@@ -727,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
ih->group = group;
ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
static void igmp_gq_timer_expire(unsigned long data)
@@ -851,6 +863,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
if (group == IGMP_ALL_HOSTS)
return false;
+ if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+ return false;
rcu_read_lock();
for_each_pmc_rcu(in_dev, im) {
@@ -957,6 +971,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
continue;
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !sysctl_igmp_llm_reports)
+ continue;
spin_lock_bh(&im->lock);
if (im->tm_running)
im->gsquery = im->gsquery && mark;
@@ -1181,6 +1198,8 @@ static void igmp_group_dropped(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
+ if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+ return;
reporter = im->reporter;
igmp_stop_timer(im);
@@ -1213,6 +1232,8 @@ static void igmp_group_added(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
+ if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+ return;
if (in_dev->dead)
return;
@@ -1339,6 +1360,171 @@ out:
}
EXPORT_SYMBOL(ip_mc_inc_group);
+static int ip_mc_check_iphdr(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ unsigned int len;
+ unsigned int offset = skb_network_offset(skb) + sizeof(*iph);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
+ return -EINVAL;
+
+ offset += ip_hdrlen(skb) - sizeof(*iph);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ return -EINVAL;
+
+ len = skb_network_offset(skb) + ntohs(iph->tot_len);
+ if (skb->len < len || len < offset)
+ return -EINVAL;
+
+ skb_set_transport_header(skb, offset);
+
+ return 0;
+}
+
+static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb);
+
+ len += sizeof(struct igmpv3_report);
+
+ return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ip_mc_check_igmp_query(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb);
+
+ len += sizeof(struct igmphdr);
+ if (skb->len < len)
+ return -EINVAL;
+
+ /* IGMPv{1,2}? */
+ if (skb->len != len) {
+ /* or IGMPv3? */
+ len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr);
+ if (skb->len < len || !pskb_may_pull(skb, len))
+ return -EINVAL;
+ }
+
+ /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
+ * all-systems destination addresses (224.0.0.1) for general queries
+ */
+ if (!igmp_hdr(skb)->group &&
+ ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ip_mc_check_igmp_msg(struct sk_buff *skb)
+{
+ switch (igmp_hdr(skb)->type) {
+ case IGMP_HOST_LEAVE_MESSAGE:
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ /* fall through */
+ return 0;
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ return ip_mc_check_igmp_reportv3(skb);
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ return ip_mc_check_igmp_query(skb);
+ default:
+ return -ENOMSG;
+ }
+}
+
+static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+{
+ return skb_checksum_simple_validate(skb);
+}
+
+static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+
+{
+ struct sk_buff *skb_chk;
+ unsigned int transport_len;
+ unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
+ int ret = -EINVAL;
+
+ transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);
+
+ skb_chk = skb_checksum_trimmed(skb, transport_len,
+ ip_mc_validate_checksum);
+ if (!skb_chk)
+ goto err;
+
+ if (!pskb_may_pull(skb_chk, len))
+ goto err;
+
+ ret = ip_mc_check_igmp_msg(skb_chk);
+ if (ret)
+ goto err;
+
+ if (skb_trimmed)
+ *skb_trimmed = skb_chk;
+ /* free now unneeded clone */
+ else if (skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ ret = 0;
+
+err:
+ if (ret && skb_chk && skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ return ret;
+}
+
+/**
+ * ip_mc_check_igmp - checks whether this is a sane IGMP packet
+ * @skb: the skb to validate
+ * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional)
+ *
+ * Checks whether an IPv4 packet is a valid IGMP packet. If so sets
+ * skb transport header accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ * standard
+ * -ENOMSG: IP header validation succeeded but it is not an IGMP packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Optionally, an skb pointer might be provided via skb_trimmed (or set it
+ * to NULL): After parsing an IGMP packet successfully it will point to
+ * an skb which has its tail aligned to the IP packet end. This might
+ * either be the originally provided skb or a trimmed, cloned version if
+ * the skb frame had data beyond the IP packet. A cloned skb allows us
+ * to leave the original skb and its full frame unchanged (which might be
+ * desirable for layer 2 frame jugglers).
+ *
+ * Caller needs to set the skb network header and free any returned skb if it
+ * differs from the provided skb.
+ */
+int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+{
+ int ret = ip_mc_check_iphdr(skb);
+
+ if (ret < 0)
+ return ret;
+
+ if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
+ return -ENOMSG;
+
+ return __ip_mc_check_igmp(skb, skb_trimmed);
+}
+EXPORT_SYMBOL(ip_mc_check_igmp);
+
/*
* Resend IGMP JOIN report; used by netdev notifier.
*/
@@ -1353,6 +1539,9 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
for_each_pmc_rtnl(in_dev, im) {
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !sysctl_igmp_llm_reports)
+ continue;
/* a failover is happening and switches
* must be notified immediately
@@ -1937,7 +2126,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
ASSERT_RTNL();
in_dev = ip_mc_find_dev(net, imr);
- if (!in_dev) {
+ if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) {
ret = -ENODEV;
goto out;
}
@@ -1958,7 +2147,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
*imlp = iml->next_rcu;
- ip_mc_dec_group(in_dev, group);
+ if (in_dev)
+ ip_mc_dec_group(in_dev, group);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
@@ -2203,11 +2393,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
struct ip_sf_socklist *psl;
struct net *net = sock_net(sk);
+ ASSERT_RTNL();
+
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = 0;
@@ -2228,7 +2418,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
goto done;
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
- rtnl_unlock();
if (!psl) {
len = 0;
count = 0;
@@ -2247,7 +2436,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
return -EFAULT;
return 0;
done:
- rtnl_unlock();
return err;
}
@@ -2261,6 +2449,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ ASSERT_RTNL();
+
psin = (struct sockaddr_in *)&gsf->gf_group;
if (psin->sin_family != AF_INET)
return -EINVAL;
@@ -2268,8 +2458,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
err = -EADDRNOTAVAIL;
for_each_pmc_rtnl(inet, pmc) {
@@ -2281,7 +2469,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
goto done;
gsf->gf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
- rtnl_unlock();
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
@@ -2301,7 +2488,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
}
return 0;
done:
- rtnl_unlock();
return err;
}
@@ -2380,7 +2566,7 @@ void ip_mc_drop_socket(struct sock *sk)
}
/* called with rcu_read_lock() */
-int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto)
{
struct ip_mc_list *im;
struct ip_mc_list __rcu **mc_hash;
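
Several of the igmp.c hunks above gate processing on the new sysctl_igmp_llm_reports knob: when it is 0, IGMP handling skips link-local groups in 224.0.0.0/24. A user-space sketch of that gate, with the address test mirroring ipv4_is_local_multicast() but written for host-order addresses:

/* Stand-alone model of the sysctl_igmp_llm_reports gate. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int sysctl_igmp_llm_reports = 1;	/* default: reports enabled */

static bool is_local_multicast(uint32_t addr)	/* 224.0.0.0/24 */
{
	return (addr & 0xffffff00u) == 0xe0000000u;
}

static bool skip_group(uint32_t group)
{
	return is_local_multicast(group) && !sysctl_igmp_llm_reports;
}

int main(void)
{
	uint32_t mdns = 0xe00000fbu;	/* 224.0.0.251, link-local scope */
	uint32_t ssdp = 0xeffffffau;	/* 239.255.255.250, not link-local */

	sysctl_igmp_llm_reports = 0;
	printf("%d %d\n", skip_group(mdns), skip_group(ssdp));	/* 1 0 */
	return 0;
}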
diff --git a/kernel/net/ipv4/inet_connection_sock.c b/kernel/net/ipv4/inet_connection_sock.c
index b27fc401c..641489148 100644
--- a/kernel/net/ipv4/inet_connection_sock.c
+++ b/kernel/net/ipv4/inet_connection_sock.c
@@ -99,6 +99,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
kuid_t uid = sock_i_uid(sk);
+ int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
local_bh_disable();
if (!snum) {
@@ -106,6 +107,14 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
again:
inet_get_local_port_range(net, &low, &high);
+ if (attempt_half) {
+ int half = low + ((high - low) >> 1);
+
+ if (attempt_half == 1)
+ high = half;
+ else
+ low = half;
+ }
remaining = (high - low) + 1;
smallest_rover = rover = prandom_u32() % remaining + low;
@@ -127,11 +136,6 @@ again:
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
- if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
- !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
- snum = smallest_rover;
- goto tb_found;
- }
}
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = rover;
@@ -159,6 +163,11 @@ again:
snum = smallest_rover;
goto have_snum;
}
+ if (attempt_half == 1) {
+ /* OK we now try the upper half of the range */
+ attempt_half = 2;
+ goto again;
+ }
goto fail;
}
/* OK, here is the one we will use. HEAD is
@@ -321,14 +330,12 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
if (error)
goto out_err;
}
- req = reqsk_queue_remove(queue);
+ req = reqsk_queue_remove(queue, sk);
newsk = req->sk;
- sk_acceptq_removed(sk);
if (sk->sk_protocol == IPPROTO_TCP &&
- tcp_rsk(req)->tfo_listener &&
- queue->fastopenq) {
- spin_lock_bh(&queue->fastopenq->lock);
+ tcp_rsk(req)->tfo_listener) {
+ spin_lock_bh(&queue->fastopenq.lock);
if (tcp_rsk(req)->tfo_listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
@@ -339,7 +346,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
req->sk = NULL;
req = NULL;
}
- spin_unlock_bh(&queue->fastopenq->lock);
+ spin_unlock_bh(&queue->fastopenq.lock);
}
out:
release_sock(sk);
@@ -399,7 +406,7 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
}
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
-struct dst_entry *inet_csk_route_req(struct sock *sk,
+struct dst_entry *inet_csk_route_req(const struct sock *sk,
struct flowi4 *fl4,
const struct request_sock *req)
{
@@ -430,7 +437,7 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);
-struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *newsk,
const struct request_sock *req)
{
@@ -469,65 +476,12 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
- const u32 rnd, const u32 synq_hsize)
-{
- return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
-}
-
#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) true
#endif
-/* Note: this is temporary :
- * req sock will no longer be in listener hash table
-*/
-struct request_sock *inet_csk_search_req(struct sock *sk,
- const __be16 rport,
- const __be32 raddr,
- const __be32 laddr)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req;
- u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
- lopt->nr_table_entries);
-
- spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
- for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
- const struct inet_request_sock *ireq = inet_rsk(req);
-
- if (ireq->ir_rmt_port == rport &&
- ireq->ir_rmt_addr == raddr &&
- ireq->ir_loc_addr == laddr &&
- AF_INET_FAMILY(req->rsk_ops->family)) {
- atomic_inc(&req->rsk_refcnt);
- WARN_ON(req->sk);
- break;
- }
- }
- spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
-
- return req;
-}
-EXPORT_SYMBOL_GPL(inet_csk_search_req);
-
-void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- unsigned long timeout)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
- inet_rsk(req)->ir_rmt_port,
- lopt->hash_rnd, lopt->nr_table_entries);
-
- reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
- inet_csk_reqsk_queue_added(sk, timeout);
-}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
-
/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;
@@ -554,7 +508,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
req->num_timeout >= rskq_defer_accept - 1;
}
-int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
+int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
{
int err = req->rsk_ops->rtx_syn_ack(parent, req);
@@ -564,27 +518,21 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
-/* return true if req was found in the syn_table[] */
+/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock_queue *queue,
struct request_sock *req)
{
- struct listen_sock *lopt = queue->listen_opt;
- struct request_sock **prev;
+ struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
bool found = false;
- spin_lock(&queue->syn_wait_lock);
+ if (sk_hashed(req_to_sk(req))) {
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
- for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
- prev = &(*prev)->dl_next) {
- if (*prev == req) {
- *prev = req->dl_next;
- found = true;
- break;
- }
+ spin_lock(lock);
+ found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
+ spin_unlock(lock);
}
-
- spin_unlock(&queue->syn_wait_lock);
- if (del_timer_sync(&req->rsk_timer))
+ if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
reqsk_put(req);
return found;
}
@@ -598,21 +546,25 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
+{
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_put(req);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
+
static void reqsk_timer_handler(unsigned long data)
{
struct request_sock *req = (struct request_sock *)data;
struct sock *sk_listener = req->rsk_listener;
struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct listen_sock *lopt = queue->listen_opt;
int qlen, expire = 0, resend = 0;
int max_retries, thresh;
u8 defer_accept;
- if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
- reqsk_put(req);
- return;
- }
+ if (sk_state_load(sk_listener) != TCP_LISTEN)
+ goto drop;
max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
thresh = max_retries;
@@ -633,9 +585,9 @@ static void reqsk_timer_handler(unsigned long data)
* embrions; and abort old ones without pity, if old
* ones are about to clog our table.
*/
- qlen = listen_sock_qlen(lopt);
- if (qlen >> (lopt->max_qlen_log - 1)) {
- int young = listen_sock_young(lopt) << 1;
+ qlen = reqsk_queue_len(queue);
+ if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
+ int young = reqsk_queue_len_young(queue) << 1;
while (thresh > 2) {
if (qlen < young)
@@ -657,41 +609,40 @@ static void reqsk_timer_handler(unsigned long data)
unsigned long timeo;
if (req->num_timeout++ == 0)
- atomic_inc(&lopt->young_dec);
+ atomic_dec(&queue->young);
timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
return;
}
- inet_csk_reqsk_queue_drop(sk_listener, req);
- reqsk_put(req);
+drop:
+ inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
}
-void reqsk_queue_hash_req(struct request_sock_queue *queue,
- u32 hash, struct request_sock *req,
- unsigned long timeout)
+static void reqsk_queue_hash_req(struct request_sock *req,
+ unsigned long timeout)
{
- struct listen_sock *lopt = queue->listen_opt;
-
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
+ setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
+
+ inet_ehash_insert(req_to_sk(req), NULL);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
- atomic_set(&req->rsk_refcnt, 2);
- setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
- req->rsk_hash = hash;
-
- spin_lock(&queue->syn_wait_lock);
- req->dl_next = lopt->syn_table[hash];
- lopt->syn_table[hash] = req;
- spin_unlock(&queue->syn_wait_lock);
+ atomic_set(&req->rsk_refcnt, 2 + 1);
+}
- mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+ unsigned long timeout)
+{
+ reqsk_queue_hash_req(req, timeout);
+ inet_csk_reqsk_queue_added(sk);
}
-EXPORT_SYMBOL(reqsk_queue_hash_req);
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
@@ -782,16 +733,14 @@ void inet_csk_prepare_forced_close(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
-int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+int inet_csk_listen_start(struct sock *sk, int backlog)
{
- struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+ struct inet_sock *inet = inet_sk(sk);
- if (rc != 0)
- return rc;
+ reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = 0;
+ sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
@@ -800,7 +749,7 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
- sk->sk_state = TCP_LISTEN;
+ sk_state_store(sk, TCP_LISTEN);
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
inet->inet_sport = htons(inet->inet_num);
@@ -811,11 +760,76 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
}
sk->sk_state = TCP_CLOSE;
- __reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+static void inet_child_forget(struct sock *sk, struct request_sock *req,
+ struct sock *child)
+{
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+ BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(sk != req->rsk_listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ tcp_sk(child)->fastopen_rsk = NULL;
+ }
+ inet_csk_destroy_sock(child);
+ reqsk_put(req);
+}
+
+struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
+ struct request_sock *req,
+ struct sock *child)
+{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+
+ spin_lock(&queue->rskq_lock);
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ inet_child_forget(sk, req, child);
+ child = NULL;
+ } else {
+ req->sk = child;
+ req->dl_next = NULL;
+ if (queue->rskq_accept_head == NULL)
+ queue->rskq_accept_head = req;
+ else
+ queue->rskq_accept_tail->dl_next = req;
+ queue->rskq_accept_tail = req;
+ sk_acceptq_added(sk);
+ }
+ spin_unlock(&queue->rskq_lock);
+ return child;
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
+
+struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
+ struct request_sock *req, bool own_req)
+{
+ if (own_req) {
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ if (inet_csk_reqsk_queue_add(sk, req, child))
+ return child;
+ }
+ /* Too bad, another child took ownership of the request, undo. */
+ bh_unlock_sock(child);
+ sock_put(child);
+ return NULL;
+}
+EXPORT_SYMBOL(inet_csk_complete_hashdance);
+
/*
* This routine closes sockets which have been at least partially
* opened, but not yet accepted.
@@ -824,11 +838,7 @@ void inet_csk_listen_stop(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct request_sock *acc_req;
- struct request_sock *req;
-
- /* make all the listen_opt local to us */
- acc_req = reqsk_queue_yank_acceptq(queue);
+ struct request_sock *next, *req;
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
@@ -838,57 +848,34 @@ void inet_csk_listen_stop(struct sock *sk)
* To be honest, we are not able to make either
* of the variants now. --ANK
*/
- reqsk_queue_destroy(queue);
-
- while ((req = acc_req) != NULL) {
+ while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
struct sock *child = req->sk;
- acc_req = req->dl_next;
-
local_bh_disable();
bh_lock_sock(child);
WARN_ON(sock_owned_by_user(child));
sock_hold(child);
- sk->sk_prot->disconnect(child, O_NONBLOCK);
-
- sock_orphan(child);
-
- percpu_counter_inc(sk->sk_prot->orphan_count);
-
- if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
- BUG_ON(tcp_sk(child)->fastopen_rsk != req);
- BUG_ON(sk != req->rsk_listener);
-
- /* Paranoid, to prevent race condition if
- * an inbound pkt destined for child is
- * blocked by sock lock in tcp_v4_rcv().
- * Also to satisfy an assertion in
- * tcp_v4_destroy_sock().
- */
- tcp_sk(child)->fastopen_rsk = NULL;
- }
- inet_csk_destroy_sock(child);
-
+ inet_child_forget(sk, req, child);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
- sk_acceptq_removed(sk);
- reqsk_put(req);
+ cond_resched();
}
- if (queue->fastopenq) {
+ if (queue->fastopenq.rskq_rst_head) {
/* Free all the reqs queued in rskq_rst_head. */
- spin_lock_bh(&queue->fastopenq->lock);
- acc_req = queue->fastopenq->rskq_rst_head;
- queue->fastopenq->rskq_rst_head = NULL;
- spin_unlock_bh(&queue->fastopenq->lock);
- while ((req = acc_req) != NULL) {
- acc_req = req->dl_next;
+ spin_lock_bh(&queue->fastopenq.lock);
+ req = queue->fastopenq.rskq_rst_head;
+ queue->fastopenq.rskq_rst_head = NULL;
+ spin_unlock_bh(&queue->fastopenq.lock);
+ while (req != NULL) {
+ next = req->dl_next;
reqsk_put(req);
+ req = next;
}
}
- WARN_ON(sk->sk_ack_backlog);
+ WARN_ON_ONCE(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
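
The inet_csk_get_port() hunks above make SK_CAN_REUSE sockets search the lower half of the local port range first and fall back to the upper half only when the lower half is exhausted. A stand-alone sketch of that two-pass search, where port_in_use() stands in for the kernel's bind-bucket conflict checks:

/* Stand-alone model of the attempt_half logic added to inet_csk_get_port(). */
#include <stdbool.h>
#include <stdio.h>

static bool port_in_use(int port)
{
	return port < 47000;		/* pretend everything below 47000 is taken */
}

static int pick_port(int low, int high, bool can_reuse)
{
	int attempt_half = can_reuse ? 1 : 0;
	int lo, hi, port;

again:
	lo = low;
	hi = high;
	if (attempt_half) {
		int half = low + ((high - low) >> 1);

		if (attempt_half == 1)
			hi = half;	/* first pass: lower half only */
		else
			lo = half;	/* second pass: upper half only */
	}

	for (port = lo; port <= hi; port++)
		if (!port_in_use(port))
			return port;

	if (attempt_half == 1) {
		attempt_half = 2;	/* lower half exhausted, retry above */
		goto again;
	}
	return -1;			/* whole range exhausted */
}

int main(void)
{
	printf("%d\n", pick_port(32768, 60999, true));	/* 47000, found in the upper half */
	return 0;
}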
diff --git a/kernel/net/ipv4/inet_diag.c b/kernel/net/ipv4/inet_diag.c
index 4d32262c7..ab9f8a666 100644
--- a/kernel/net/ipv4/inet_diag.c
+++ b/kernel/net/ipv4/inet_diag.c
@@ -151,6 +151,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (nla_put_u8(skb, INET_DIAG_TCLASS,
inet6_sk(sk)->tclass) < 0)
goto errout;
+
+ if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+ nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
+ goto errout;
}
#endif
@@ -200,9 +204,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
}
#undef EXPIRES_IN_MS
- if (ext & (1 << (INET_DIAG_INFO - 1))) {
+ if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
attr = nla_reserve(skb, INET_DIAG_INFO,
- sizeof(struct tcp_info));
+ handler->idiag_info_size);
if (!attr)
goto errout;
@@ -726,91 +730,21 @@ static void twsk_build_assert(void)
#endif
}
-static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct inet_diag_entry entry;
- int j, s_j, reqnum, s_reqnum;
- struct listen_sock *lopt;
- int err = 0;
-
- s_j = cb->args[3];
- s_reqnum = cb->args[4];
-
- if (s_j > 0)
- s_j--;
-
- entry.family = sk->sk_family;
-
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-
- lopt = icsk->icsk_accept_queue.listen_opt;
- if (!lopt || !listen_sock_qlen(lopt))
- goto out;
-
- if (bc) {
- entry.sport = inet->inet_num;
- entry.userlocks = sk->sk_userlocks;
- }
-
- for (j = s_j; j < lopt->nr_table_entries; j++) {
- struct request_sock *req, *head = lopt->syn_table[j];
-
- reqnum = 0;
- for (req = head; req; reqnum++, req = req->dl_next) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- if (reqnum < s_reqnum)
- continue;
- if (r->id.idiag_dport != ireq->ir_rmt_port &&
- r->id.idiag_dport)
- continue;
-
- if (bc) {
- /* Note: entry.sport and entry.userlocks are already set */
- entry_fill_addrs(&entry, req_to_sk(req));
- entry.dport = ntohs(ireq->ir_rmt_port);
-
- if (!inet_diag_bc_run(bc, &entry))
- continue;
- }
-
- err = inet_req_diag_fill(req_to_sk(req), skb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI, cb->nlh);
- if (err < 0) {
- cb->args[3] = j + 1;
- cb->args[4] = reqnum;
- goto out;
- }
- }
-
- s_reqnum = 0;
- }
-
-out:
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-
- return err;
-}
-
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num;
+ u32 idiag_states = r->idiag_states;
+ if (idiag_states & TCPF_SYN_RECV)
+ idiag_states |= TCPF_NEW_SYN_RECV;
s_i = cb->args[1];
s_num = num = cb->args[2];
if (cb->args[0] == 0) {
- if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+ if (!(idiag_states & TCPF_LISTEN))
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
@@ -840,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
- if (!(r->idiag_states & TCPF_LISTEN) ||
- r->id.idiag_dport ||
+ if (r->id.idiag_dport ||
cb->args[3] > 0)
- goto syn_recv;
-
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
- spin_unlock_bh(&ilb->lock);
- goto done;
- }
-
-syn_recv:
- if (!(r->idiag_states & TCPF_SYN_RECV))
goto next_listen;
- if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -875,7 +799,7 @@ skip_listen_ht:
s_i = num = s_num = 0;
}
- if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+ if (!(idiag_states & ~TCPF_LISTEN))
goto out;
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
@@ -902,7 +826,7 @@ skip_listen_ht:
goto next_normal;
state = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_substate : sk->sk_state;
- if (!(r->idiag_states & (1 << state)))
+ if (!(idiag_states & (1 << state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
@@ -1078,14 +1002,62 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
return inet_diag_get_exact(skb, h, nlmsg_data(h));
}
+static
+int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
+{
+ const struct inet_diag_handler *handler;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ struct inet_diag_msg *r;
+ void *info = NULL;
+ int err = 0;
+
+ nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0);
+ if (!nlh)
+ return -ENOMEM;
+
+ r = nlmsg_data(nlh);
+ memset(r, 0, sizeof(*r));
+ inet_diag_msg_common_fill(r, sk);
+ if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM)
+ r->id.idiag_sport = inet_sk(sk)->inet_sport;
+ r->idiag_state = sk->sk_state;
+
+ if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+
+ handler = inet_diag_lock_handler(sk->sk_protocol);
+ if (IS_ERR(handler)) {
+ inet_diag_unlock_handler(handler);
+ nlmsg_cancel(skb, nlh);
+ return PTR_ERR(handler);
+ }
+
+ attr = handler->idiag_info_size
+ ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size)
+ : NULL;
+ if (attr)
+ info = nla_data(attr);
+
+ handler->idiag_get_info(sk, r, info);
+ inet_diag_unlock_handler(handler);
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
static const struct sock_diag_handler inet_diag_handler = {
.family = AF_INET,
.dump = inet_diag_handler_dump,
+ .get_info = inet_diag_handler_get_info,
};
static const struct sock_diag_handler inet6_diag_handler = {
.family = AF_INET6,
.dump = inet_diag_handler_dump,
+ .get_info = inet_diag_handler_get_info,
};
int inet_diag_register(const struct inet_diag_handler *h)
diff --git a/kernel/net/ipv4/inet_fragment.c b/kernel/net/ipv4/inet_fragment.c
index 5e346a082..fe144dae7 100644
--- a/kernel/net/ipv4/inet_fragment.c
+++ b/kernel/net/ipv4/inet_fragment.c
@@ -131,34 +131,22 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
unsigned int evicted = 0;
HLIST_HEAD(expired);
-evict_again:
spin_lock(&hb->chain_lock);
hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
if (!inet_fragq_should_evict(fq))
continue;
- if (!del_timer(&fq->timer)) {
- /* q expiring right now thus increment its refcount so
- * it won't be freed under us and wait until the timer
- * has finished executing then destroy it
- */
- atomic_inc(&fq->refcnt);
- spin_unlock(&hb->chain_lock);
- del_timer_sync(&fq->timer);
- inet_frag_put(fq, f);
- goto evict_again;
- }
+ if (!del_timer(&fq->timer))
+ continue;
- fq->flags |= INET_FRAG_EVICTED;
- hlist_del(&fq->list);
- hlist_add_head(&fq->list, &expired);
+ hlist_add_head(&fq->list_evictor, &expired);
++evicted;
}
spin_unlock(&hb->chain_lock);
- hlist_for_each_entry_safe(fq, n, &expired, list)
+ hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
f->frag_expire((unsigned long) fq);
return evicted;
@@ -221,12 +209,6 @@ int inet_frags_init(struct inet_frags *f)
}
EXPORT_SYMBOL(inet_frags_init);
-void inet_frags_init_net(struct netns_frags *nf)
-{
- init_frag_mem_limit(nf);
-}
-EXPORT_SYMBOL(inet_frags_init_net);
-
void inet_frags_fini(struct inet_frags *f)
{
cancel_work_sync(&f->frags_work);
@@ -240,18 +222,20 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
int i;
nf->low_thresh = 0;
- local_bh_disable();
evict_again:
+ local_bh_disable();
seq = read_seqbegin(&f->rnd_seqlock);
for (i = 0; i < INETFRAGS_HASHSZ ; i++)
inet_evict_bucket(f, &f->hash[i]);
- if (read_seqretry(&f->rnd_seqlock, seq))
- goto evict_again;
-
local_bh_enable();
+ cond_resched();
+
+ if (read_seqretry(&f->rnd_seqlock, seq) ||
+ percpu_counter_sum(&nf->mem))
+ goto evict_again;
percpu_counter_destroy(&nf->mem);
}
@@ -284,8 +268,8 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
struct inet_frag_bucket *hb;
hb = get_frag_bucket_locked(fq, f);
- if (!(fq->flags & INET_FRAG_EVICTED))
- hlist_del(&fq->list);
+ hlist_del(&fq->list);
+ fq->flags |= INET_FRAG_COMPLETE;
spin_unlock(&hb->chain_lock);
}
@@ -297,7 +281,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq_unlink(fq, f);
atomic_dec(&fq->refcnt);
- fq->flags |= INET_FRAG_COMPLETE;
}
}
EXPORT_SYMBOL(inet_frag_kill);
@@ -330,11 +313,12 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
fp = xp;
}
sum = sum_truesize + f->qsize;
- sub_frag_mem_limit(q, sum);
if (f->destructor)
f->destructor(q);
kmem_cache_free(f->frags_cachep, q);
+
+ sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);
@@ -390,7 +374,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
q->net = nf;
f->constructor(q, arg);
- add_frag_mem_limit(q, f->qsize);
+ add_frag_mem_limit(nf, f->qsize);
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
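
The old void inet_frags_init_net() is dropped above; the caller in ip_fragment.c (further below) now checks its return value and has to unwind on a later failure. A minimal userspace sketch of that two-step init/unwind shape; init_counters() and register_sysctls() are made-up stand-ins, not kernel functions:

#include <stdio.h>

/* Made-up stand-ins for inet_frags_init_net() / ip4_frags_ns_ctl_register();
 * they return 0 on success and a negative errno-style value on failure.
 */
static int init_counters(void)    { return 0; }
static void uninit_counters(void) { }
static int register_sysctls(void) { return 0; }

/* Mirrors the shape of ipv4_frags_init_net() after this series: do step one,
 * then step two, and undo step one again if step two fails.
 */
static int frags_init_net(void)
{
        int res;

        res = init_counters();
        if (res)
                return res;

        res = register_sysctls();
        if (res)
                uninit_counters();
        return res;
}

int main(void)
{
        printf("init: %d\n", frags_init_net());
        return 0;
}
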
diff --git a/kernel/net/ipv4/inet_hashtables.c b/kernel/net/ipv4/inet_hashtables.c
index c6fb80bd5..ccc598079 100644
--- a/kernel/net/ipv4/inet_hashtables.c
+++ b/kernel/net/ipv4/inet_hashtables.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
+#include <linux/vmalloc.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
@@ -90,10 +91,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
- atomic_inc(&hashinfo->bsockets);
-
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners);
tb->num_owners++;
@@ -111,8 +108,6 @@ static void __inet_put_port(struct sock *sk)
struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
struct inet_bind_bucket *tb;
- atomic_dec(&hashinfo->bsockets);
-
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
@@ -131,7 +126,7 @@ void inet_put_port(struct sock *sk)
}
EXPORT_SYMBOL(inet_put_port);
-int __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
unsigned short port = inet_sk(child)->inet_num;
@@ -142,6 +137,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
+ if (unlikely(!tb)) {
+ spin_unlock(&head->lock);
+ return -ENOENT;
+ }
if (tb->port != port) {
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
@@ -190,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
}
return score;
}
@@ -348,7 +349,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk2;
const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
- int twrefcnt = 0;
spin_lock(lock);
@@ -376,21 +376,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
- twrefcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
- if (twrefcnt)
- inet_twsk_put(tw);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
} else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw);
-
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
}
return 0;
@@ -399,23 +395,27 @@ not_unique:
return -EADDRNOTAVAIL;
}
-static inline u32 inet_sk_port_offset(const struct sock *sk)
+static u32 inet_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
+
return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
inet->inet_daddr,
inet->inet_dport);
}
-int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+/* insert a socket into ehash, and eventually remove another one
+ * (the other one can be a SYN_RECV or TIMEWAIT)
+ */
+bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
struct inet_ehash_bucket *head;
spinlock_t *lock;
- int twrefcnt = 0;
+ bool ret = true;
- WARN_ON(!sk_unhashed(sk));
+ WARN_ON_ONCE(!sk_unhashed(sk));
sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
@@ -423,25 +423,41 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock(lock);
- __sk_nulls_add_node_rcu(sk, list);
- if (tw) {
- WARN_ON(sk->sk_hash != tw->tw_hash);
- twrefcnt = inet_twsk_unhash(tw);
+ if (osk) {
+ WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+ ret = sk_nulls_del_node_init_rcu(osk);
}
+ if (ret)
+ __sk_nulls_add_node_rcu(sk, list);
spin_unlock(lock);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- return twrefcnt;
+ return ret;
}
-EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
-int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
+{
+ bool ok = inet_ehash_insert(sk, osk);
+
+ if (ok) {
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ } else {
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ sk->sk_state = TCP_CLOSE;
+ sock_set_flag(sk, SOCK_DEAD);
+ inet_csk_destroy_sock(sk);
+ }
+ return ok;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
+
+void __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
- if (sk->sk_state != TCP_LISTEN)
- return __inet_hash_nolisten(sk, tw);
-
+ if (sk->sk_state != TCP_LISTEN) {
+ inet_ehash_nolisten(sk, osk);
+ return;
+ }
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -449,7 +465,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
spin_unlock(&ilb->lock);
- return 0;
}
EXPORT_SYMBOL(__inet_hash);
@@ -496,7 +511,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct inet_bind_bucket *tb;
int ret;
struct net *net = sock_net(sk);
- int twrefcnt = 1;
if (!snum) {
int i, remaining, low, high, port;
@@ -507,8 +521,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
+ /* By starting with offset being an even number,
+ * we tend to leave about 50% of ports for other uses,
+ * like bind(0).
+ */
+ offset &= ~1;
+
local_bh_disable();
- for (i = 1; i <= remaining; i++) {
+ for (i = 0; i < remaining; i++) {
port = low + (i + offset) % remaining;
if (inet_is_local_reserved_port(net, port))
continue;
@@ -552,25 +572,20 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return -EADDRNOTAVAIL;
ok:
- hint += i;
+ hint += (i + 2) & ~1;
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- twrefcnt += __inet_hash_nolisten(sk, tw);
+ inet_ehash_nolisten(sk, (struct sock *)tw);
}
if (tw)
- twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+ inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head->lock);
- if (tw) {
- inet_twsk_deschedule(tw);
- while (twrefcnt) {
- twrefcnt--;
- inet_twsk_put(tw);
- }
- }
+ if (tw)
+ inet_twsk_deschedule_put(tw);
ret = 0;
goto out;
@@ -580,7 +595,7 @@ ok:
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __inet_hash_nolisten(sk, NULL);
+ inet_ehash_nolisten(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
} else {
@@ -599,7 +614,11 @@ out:
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
- return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
+ u32 port_offset = 0;
+
+ if (!inet_sk(sk)->inet_num)
+ port_offset = inet_sk_port_offset(sk);
+ return __inet_hash_connect(death_row, sk, port_offset,
__inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
@@ -608,7 +627,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
{
int i;
- atomic_set(&h->bsockets, 0);
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
@@ -616,3 +634,32 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+
+int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
+{
+ unsigned int locksz = sizeof(spinlock_t);
+ unsigned int i, nblocks = 1;
+
+ if (locksz != 0) {
+ /* allocate 2 cache lines or at least one spinlock per cpu */
+ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
+ nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
+
+ /* no more locks than number of hash buckets */
+ nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+ hashinfo->ehash_locks = kmalloc_array(nblocks, locksz,
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!hashinfo->ehash_locks)
+ hashinfo->ehash_locks = vmalloc(nblocks * locksz);
+
+ if (!hashinfo->ehash_locks)
+ return -ENOMEM;
+
+ for (i = 0; i < nblocks; i++)
+ spin_lock_init(&hashinfo->ehash_locks[i]);
+ }
+ hashinfo->ehash_locks_mask = nblocks - 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
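
The __inet_hash_connect() hunk above forces the starting offset to an even value and keeps the hint even, so connect() tends to consume even ephemeral ports and leaves the odd ones for bind(0), as the added comment says. A small userspace sketch of that arithmetic; the port range and offset below are made-up sample values:

#include <stdio.h>

/* Userspace sketch of the candidate-port walk in __inet_hash_connect() after
 * this patch. low/high model the local port range; "offset" stands in for
 * inet_sk_port_offset() and is forced even, so the first candidate (the one
 * that almost always wins) is an even port.
 */
int main(void)
{
        unsigned int low = 32768, high = 60999;
        unsigned int remaining = (high - low) + 1;
        unsigned int offset = 12345;
        unsigned int i, port;

        offset &= ~1U;                  /* start the search on an even port */

        for (i = 0; i < 6; i++) {       /* show the first few candidates */
                port = low + (i + offset) % remaining;
                printf("try %u -> port %u\n", i, port);
        }

        /* On success the hint advances by an even amount, so the next
         * connect() also starts its search on an even port.
         */
        printf("hint += %u\n", (i + 2) & ~1U);
        return 0;
}
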
diff --git a/kernel/net/ipv4/inet_timewait_sock.c b/kernel/net/ipv4/inet_timewait_sock.c
index 00ec8d5d7..c67f9bd76 100644
--- a/kernel/net/ipv4/inet_timewait_sock.c
+++ b/kernel/net/ipv4/inet_timewait_sock.c
@@ -18,28 +18,6 @@
/**
- * inet_twsk_unhash - unhash a timewait socket from established hash
- * @tw: timewait socket
- *
- * unhash a timewait socket from established hash, if hashed.
- * ehash lock must be held by caller.
- * Returns 1 if caller should call inet_twsk_put() after lock release.
- */
-int inet_twsk_unhash(struct inet_timewait_sock *tw)
-{
- if (hlist_nulls_unhashed(&tw->tw_node))
- return 0;
-
- hlist_nulls_del_rcu(&tw->tw_node);
- sk_nulls_node_init(&tw->tw_node);
- /*
- * We cannot call inet_twsk_put() ourself under lock,
- * caller must call it for us.
- */
- return 1;
-}
-
-/**
* inet_twsk_bind_unhash - unhash a timewait socket from bind hash
* @tw: timewait socket
* @hashinfo: hashinfo pointer
@@ -48,35 +26,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
* bind hash lock must be held by caller.
* Returns 1 if caller should call inet_twsk_put() after lock release.
*/
-int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
struct inet_bind_bucket *tb = tw->tw_tb;
if (!tb)
- return 0;
+ return;
__hlist_del(&tw->tw_bind_node);
tw->tw_tb = NULL;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
- /*
- * We cannot call inet_twsk_put() ourself under lock,
- * caller must call it for us.
- */
- return 1;
+ __sock_put((struct sock *)tw);
}
/* Must be called with locally disabled BHs. */
static void inet_twsk_kill(struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
- struct inet_bind_hashbucket *bhead;
- int refcnt;
- /* Unlink from established hashes. */
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+ struct inet_bind_hashbucket *bhead;
spin_lock(lock);
- refcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
spin_unlock(lock);
/* Disassociate with bind bucket. */
@@ -84,11 +56,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
hashinfo->bhash_size)];
spin_lock(&bhead->lock);
- refcnt += inet_twsk_bind_unhash(tw, hashinfo);
+ inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
- BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
- atomic_sub(refcnt, &tw->tw_refcnt);
atomic_dec(&tw->tw_dr->tw_count);
inet_twsk_put(tw);
}
@@ -153,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
/*
* Step 2: Hash TW into tcp ehash chain.
* Notes :
- * - tw_refcnt is set to 3 because :
+ * - tw_refcnt is set to 4 because :
* - We have one reference from bhash chain.
* - We have one reference from ehash chain.
+ * - We have one reference from timer.
+ * - One reference for ourself (our caller will release it).
* We can use atomic_set() because prior spin_lock()/spin_unlock()
* committed into memory all tw fields.
*/
- atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
+ atomic_set(&tw->tw_refcnt, 4);
inet_twsk_add_node_rcu(tw, &ehead->chain);
/* Step 3: Remove SK from hash chain */
@@ -170,7 +142,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
-void tw_timer_handler(unsigned long data)
+static void tw_timer_handler(unsigned long data)
{
struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
@@ -235,15 +207,19 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc);
* tcp_input.c to verify this.
*/
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void inet_twsk_deschedule(struct inet_timewait_sock *tw)
+/* This is for handling early-kills of TIME_WAIT sockets.
+ * Warning : consume reference.
+ * Caller should not access tw anymore.
+ */
+void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
if (del_timer_sync(&tw->tw_timer))
inet_twsk_kill(tw);
+ inet_twsk_put(tw);
}
-EXPORT_SYMBOL(inet_twsk_deschedule);
+EXPORT_SYMBOL(inet_twsk_deschedule_put);
-void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
+void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
{
/* timeout := RTO * 3.5
*
@@ -271,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
*/
tw->tw_kill = timeo <= 4*HZ;
- if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) {
- atomic_inc(&tw->tw_refcnt);
+ if (!rearm) {
+ BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo));
atomic_inc(&tw->tw_dr->tw_count);
+ } else {
+ mod_timer_pending(&tw->tw_timer, jiffies + timeo);
}
}
-EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family)
@@ -311,9 +289,8 @@ restart:
rcu_read_unlock();
local_bh_disable();
- inet_twsk_deschedule(tw);
+ inet_twsk_deschedule_put(tw);
local_bh_enable();
- inet_twsk_put(tw);
goto restart_rcu;
}
/* If the nulls value we got at the end of this lookup is
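
The timewait rework above replaces the "count how many puts are owed" bookkeeping (twrefcnt) with a fixed scheme: tw_refcnt starts at 4 and each holder (bind hash, ehash chain, timer, caller) drops exactly one reference itself. A generic C11 sketch of that get/put shape, with illustrative names rather than the kernel's sock refcounting API:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative refcounted object, not the kernel's timewait sock: every
 * holder owns exactly one reference and drops it itself, so no path has to
 * remember how many puts it owes later.
 */
struct obj {
        atomic_int refcnt;
};

static void obj_put(struct obj *o)
{
        if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
                printf("last reference dropped, freeing\n");
                free(o);
        }
}

int main(void)
{
        struct obj *o = malloc(sizeof(*o));

        if (!o)
                return 1;
        /* Like the hashdance comment above: one reference per holder,
         * all established up front with a single store.
         */
        atomic_init(&o->refcnt, 4);

        obj_put(o);     /* bind-hash unlink drops its reference   */
        obj_put(o);     /* ehash unlink drops its reference       */
        obj_put(o);     /* timer path drops its reference         */
        obj_put(o);     /* the caller drops the one it was handed */
        return 0;
}
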
diff --git a/kernel/net/ipv4/inetpeer.c b/kernel/net/ipv4/inetpeer.c
index 241afd743..86fa45809 100644
--- a/kernel/net/ipv4/inetpeer.c
+++ b/kernel/net/ipv4/inetpeer.c
@@ -157,22 +157,6 @@ void __init inet_initpeers(void)
INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
}
-static int addr_compare(const struct inetpeer_addr *a,
- const struct inetpeer_addr *b)
-{
- int i, n = (a->family == AF_INET ? 1 : 4);
-
- for (i = 0; i < n; i++) {
- if (a->addr.a6[i] == b->addr.a6[i])
- continue;
- if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
- return -1;
- return 1;
- }
-
- return 0;
-}
-
#define rcu_deref_locked(X, BASE) \
rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
@@ -188,7 +172,7 @@ static int addr_compare(const struct inetpeer_addr *a,
*stackptr++ = &_base->root; \
for (u = rcu_deref_locked(_base->root, _base); \
u != peer_avl_empty;) { \
- int cmp = addr_compare(_daddr, &u->daddr); \
+ int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \
if (cmp == 0) \
break; \
if (cmp == -1) \
@@ -215,7 +199,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
int count = 0;
while (u != peer_avl_empty) {
- int cmp = addr_compare(daddr, &u->daddr);
+ int cmp = inetpeer_addr_cmp(daddr, &u->daddr);
if (cmp == 0) {
/* Before taking a reference, check if this entry was
* deleted (refcnt=-1)
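
addr_compare() is removed above in favour of a shared inetpeer_addr_cmp() helper; both only need a deterministic total order over the 32-bit words of the key (one word for IPv4, four for IPv6) so the AVL lookup can branch left or right. A standalone sketch of that word-wise comparison; the struct and field names are illustrative:

#include <stdint.h>
#include <stdio.h>

/* Illustrative key type: one 32-bit word for IPv4, four for IPv6. The tree
 * only needs a deterministic total order, not numeric address order.
 */
struct peer_key {
        int      is_v6;
        uint32_t a[4];
};

static int peer_key_cmp(const struct peer_key *x, const struct peer_key *y)
{
        int i, n = x->is_v6 ? 4 : 1;

        for (i = 0; i < n; i++) {
                if (x->a[i] == y->a[i])
                        continue;
                return x->a[i] < y->a[i] ? -1 : 1;
        }
        return 0;
}

int main(void)
{
        struct peer_key a = { 0, { 0xc0a80001 } };      /* 192.168.0.1 */
        struct peer_key b = { 0, { 0xc0a80002 } };      /* 192.168.0.2 */

        printf("cmp(a, b) = %d\n", peer_key_cmp(&a, &b));
        printf("cmp(b, a) = %d\n", peer_key_cmp(&b, &a));
        printf("cmp(a, a) = %d\n", peer_key_cmp(&a, &a));
        return 0;
}
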
diff --git a/kernel/net/ipv4/ip_forward.c b/kernel/net/ipv4/ip_forward.c
index 367448494..da0d7ce85 100644
--- a/kernel/net/ipv4/ip_forward.c
+++ b/kernel/net/ipv4/ip_forward.c
@@ -39,17 +39,21 @@
#include <net/route.h>
#include <net/xfrm.h>
-static bool ip_may_fragment(const struct sk_buff *skb)
-{
- return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
- skb->ignore_df;
-}
-
static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
return false;
+ if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0))
+ return false;
+
+ /* original fragment exceeds mtu and DF is set */
+ if (unlikely(IPCB(skb)->frag_max_size > mtu))
+ return true;
+
+ if (skb->ignore_df)
+ return false;
+
if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
return false;
@@ -57,18 +61,18 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
}
-static int ip_forward_finish(struct sock *sk, struct sk_buff *skb)
+static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
skb_sender_cpu_clear(skb);
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
int ip_forward(struct sk_buff *skb)
@@ -77,6 +81,7 @@ int ip_forward(struct sk_buff *skb)
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
+ struct net *net;
/* that should never happen */
if (skb->pkt_type != PACKET_HOST)
@@ -95,6 +100,7 @@ int ip_forward(struct sk_buff *skb)
return NET_RX_SUCCESS;
skb_forward_csum(skb);
+ net = dev_net(skb->dev);
/*
* According to the RFC, we must first decrease the TTL field. If
@@ -114,8 +120,8 @@ int ip_forward(struct sk_buff *skb)
IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
- if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
- IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
+ if (ip_exceeds_mtu(skb, mtu)) {
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
goto drop;
@@ -139,8 +145,9 @@ int ip_forward(struct sk_buff *skb)
skb->priority = rt_tos2priority(iph->tos);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
- skb->dev, rt->dst.dev, ip_forward_finish);
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, rt->dst.dev,
+ ip_forward_finish);
sr_failed:
/*
@@ -151,7 +158,7 @@ sr_failed:
too_many_hops:
/* Tell the sender its packet died... */
- IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
kfree_skb(skb);
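
With ip_may_fragment() folded into ip_exceeds_mtu() above, the forward path makes one decision: packets without DF may always be fragmented, a reassembled datagram whose original fragment was already larger than the MTU is rejected, ignore_df and GSO packets whose segments fit pass through, and everything else triggers ICMP_FRAG_NEEDED. A userspace sketch of that decision tree; the struct below models only the fields the check looks at:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the skb/IPCB state ip_exceeds_mtu() consults; GSO handling is
 * reduced to a precomputed per-segment length for illustration.
 */
struct pkt {
        unsigned int len;            /* total length of this packet          */
        bool         df;             /* IP_DF set in the header              */
        unsigned int frag_max_size;  /* largest original fragment, 0 if none */
        bool         ignore_df;      /* local override (e.g. tunnels)        */
        unsigned int gso_seglen;     /* per-segment length, 0 if not GSO     */
};

static bool exceeds_mtu(const struct pkt *p, unsigned int mtu)
{
        if (p->len <= mtu)
                return false;
        if (!p->df)
                return false;                 /* may simply be fragmented   */
        if (p->frag_max_size > mtu)
                return true;                  /* original fragment too big  */
        if (p->ignore_df)
                return false;
        if (p->gso_seglen && p->gso_seglen <= mtu)
                return false;                 /* segments will fit          */
        return true;
}

int main(void)
{
        struct pkt big_df    = { 2000, true,  0, false, 0 };
        struct pkt big_no_df = { 2000, false, 0, false, 0 };
        unsigned int mtu = 1500;

        printf("DF set:   %s\n", exceeds_mtu(&big_df, mtu) ? "ICMP frag needed" : "forward");
        printf("DF clear: %s\n", exceeds_mtu(&big_no_df, mtu) ? "ICMP frag needed" : "forward");
        return 0;
}
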
diff --git a/kernel/net/ipv4/ip_fragment.c b/kernel/net/ipv4/ip_fragment.c
index cae22a1a8..b8a0607da 100644
--- a/kernel/net/ipv4/ip_fragment.c
+++ b/kernel/net/ipv4/ip_fragment.c
@@ -48,6 +48,7 @@
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
#include <net/inet_ecn.h>
+#include <net/l3mdev.h>
/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -75,7 +76,9 @@ struct ipq {
__be16 id;
u8 protocol;
u8 ecn; /* RFC3168 support */
+ u16 max_df_size; /* largest frag with DF set seen */
int iif;
+ int vif; /* L3 master device index */
unsigned int rid;
struct inet_peer *peer;
};
@@ -98,6 +101,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct ip4_create_arg {
struct iphdr *iph;
u32 user;
+ int vif;
};
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
@@ -126,7 +130,8 @@ static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
qp->saddr == arg->iph->saddr &&
qp->daddr == arg->iph->daddr &&
qp->protocol == arg->iph->protocol &&
- qp->user == arg->user;
+ qp->user == arg->user &&
+ qp->vif == arg->vif;
}
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -143,9 +148,11 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
qp->ecn = ip4_frag_ecn(arg->iph->tos);
qp->saddr = arg->iph->saddr;
qp->daddr = arg->iph->daddr;
+ qp->vif = arg->vif;
qp->user = arg->user;
qp->peer = sysctl_ipfrag_max_dist ?
- inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
+ inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
+ NULL;
}
static void ip4_frag_free(struct inet_frag_queue *q)
@@ -173,6 +180,15 @@ static void ipq_kill(struct ipq *ipq)
inet_frag_kill(&ipq->q, &ip4_frags);
}
+static bool frag_expire_skip_icmp(u32 user)
+{
+ return user == IP_DEFRAG_AF_PACKET ||
+ ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
+ __IP_DEFRAG_CONNTRACK_IN_END) ||
+ ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
+ __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
+}
+
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
@@ -192,7 +208,7 @@ static void ip_expire(unsigned long arg)
ipq_kill(qp);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
- if (!(qp->q.flags & INET_FRAG_EVICTED)) {
+ if (!inet_frag_evicting(&qp->q)) {
struct sk_buff *head = qp->q.fragments;
const struct iphdr *iph;
int err;
@@ -217,10 +233,8 @@ static void ip_expire(unsigned long arg)
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
- if (qp->user == IP_DEFRAG_AF_PACKET ||
- ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
- (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
- (skb_rtable(head)->rt_type != RTN_LOCAL)))
+ if (frag_expire_skip_icmp(qp->user) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL))
goto out_rcu_unlock;
/* Send an ICMP "Fragment Reassembly Timeout" message. */
@@ -236,7 +250,8 @@ out:
/* Find the correct entry in the "incomplete datagrams" queue for
* this IP datagram, and create new one, if nothing is found.
*/
-static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
+static struct ipq *ip_find(struct net *net, struct iphdr *iph,
+ u32 user, int vif)
{
struct inet_frag_queue *q;
struct ip4_create_arg arg;
@@ -244,6 +259,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
arg.iph = iph;
arg.user = user;
+ arg.vif = vif;
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
@@ -301,7 +317,7 @@ static int ip_frag_reinit(struct ipq *qp)
kfree_skb(fp);
fp = xp;
} while (fp);
- sub_frag_mem_limit(&qp->q, sum_truesize);
+ sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
@@ -319,6 +335,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct sk_buff *prev, *next;
struct net_device *dev;
+ unsigned int fragsize;
int flags, offset;
int ihl, end;
int err = -ENOENT;
@@ -446,7 +463,7 @@ found:
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- sub_frag_mem_limit(&qp->q, free_it->truesize);
+ sub_frag_mem_limit(qp->q.net, free_it->truesize);
kfree_skb(free_it);
}
}
@@ -470,13 +487,18 @@ found:
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- add_frag_mem_limit(&qp->q, skb->truesize);
+ add_frag_mem_limit(qp->q.net, skb->truesize);
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
+ fragsize = skb->len + ihl;
+
+ if (fragsize > qp->q.max_size)
+ qp->q.max_size = fragsize;
+
if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
- skb->len + ihl > qp->q.max_size)
- qp->q.max_size = skb->len + ihl;
+ fragsize > qp->max_df_size)
+ qp->max_df_size = fragsize;
if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
qp->q.meat == qp->q.len) {
@@ -508,7 +530,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
int len;
int ihlen;
int err;
- int sum_truesize;
u8 ecn;
ipq_kill(qp);
@@ -573,47 +594,47 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(&qp->q, clone->truesize);
+ add_frag_mem_limit(qp->q.net, clone->truesize);
}
+ skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- sum_truesize = head->truesize;
- for (fp = head->next; fp;) {
- bool headstolen;
- int delta;
- struct sk_buff *next = fp->next;
-
- sum_truesize += fp->truesize;
+ for (fp = head->next; fp; fp = fp->next) {
+ head->data_len += fp->len;
+ head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
-
- if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
- kfree_skb_partial(fp, headstolen);
- } else {
- if (!skb_shinfo(head)->frag_list)
- skb_shinfo(head)->frag_list = fp;
- head->data_len += fp->len;
- head->len += fp->len;
- head->truesize += fp->truesize;
- }
- fp = next;
+ head->truesize += fp->truesize;
}
- sub_frag_mem_limit(&qp->q, sum_truesize);
+ sub_frag_mem_limit(qp->q.net, head->truesize);
head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
- IPCB(head)->frag_max_size = qp->q.max_size;
+ IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
iph = ip_hdr(head);
- /* max_size != 0 implies at least one fragment had IP_DF set */
- iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
iph->tot_len = htons(len);
iph->tos |= ecn;
+ /* When we set IP_DF on a refragmented skb we must also force a
+ * call to ip_fragment to avoid forwarding a DF-skb of size s while
+ * original sender only sent fragments of size f (where f < s).
+ *
+ * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
+ * frag seen to avoid sending tiny DF-fragments in case skb was built
+ * from one very small df-fragment and one large non-df frag.
+ */
+ if (qp->max_df_size == qp->q.max_size) {
+ IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+ iph->frag_off = htons(IP_DF);
+ } else {
+ iph->frag_off = 0;
+ }
+
ip_send_check(iph);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
@@ -633,16 +654,17 @@ out_fail:
}
/* Process an incoming IP datagram fragment. */
-int ip_defrag(struct sk_buff *skb, u32 user)
+int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
+ struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
+ int vif = l3mdev_master_ifindex_rcu(dev);
struct ipq *qp;
- struct net *net;
- net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+ skb_orphan(skb);
/* Lookup (or create) queue header */
- qp = ip_find(net, ip_hdr(skb), user);
+ qp = ip_find(net, ip_hdr(skb), user, vif);
if (qp) {
int ret;
@@ -661,7 +683,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_defrag);
-struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
+struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
struct iphdr iph;
int netoff;
@@ -690,7 +712,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
if (pskb_trim_rcsum(skb, netoff + len))
return skb;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- if (ip_defrag(skb, user))
+ if (ip_defrag(net, skb, user))
return NULL;
skb_clear_hash(skb);
}
@@ -818,6 +840,8 @@ static void __init ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
+ int res;
+
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@@ -841,9 +865,13 @@ static int __net_init ipv4_frags_init_net(struct net *net)
*/
net->ipv4.frags.timeout = IP_FRAG_TIME;
- inet_frags_init_net(&net->ipv4.frags);
-
- return ip4_frags_ns_ctl_register(net);
+ res = inet_frags_init_net(&net->ipv4.frags);
+ if (res)
+ return res;
+ res = ip4_frags_ns_ctl_register(net);
+ if (res)
+ inet_frags_uninit_net(&net->ipv4.frags);
+ return res;
}
static void __net_exit ipv4_frags_exit_net(struct net *net)
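
The reassembly hunks above track two sizes per queue: the largest fragment seen (q.max_size) and the largest fragment that carried IP_DF (max_df_size), and only re-mark the rebuilt datagram DF when the largest fragment itself had DF, per the long comment in ip_frag_reasm(). A small sketch of that bookkeeping on made-up fragment sizes:

#include <stdbool.h>
#include <stdio.h>

/* Each entry models one received fragment: its size and whether IP_DF was
 * set. The sizes below are sample input, not taken from real traffic.
 */
struct frag { unsigned int size; bool df; };

int main(void)
{
        struct frag frags[] = { { 1500, true }, { 1500, true }, { 400, false } };
        unsigned int max_size = 0, max_df_size = 0;
        unsigned int i;

        for (i = 0; i < sizeof(frags) / sizeof(frags[0]); i++) {
                if (frags[i].size > max_size)
                        max_size = frags[i].size;
                if (frags[i].df && frags[i].size > max_df_size)
                        max_df_size = frags[i].size;
        }

        printf("frag_max_size = %u\n", max_size > max_df_size ? max_size : max_df_size);
        if (max_df_size == max_size)
                printf("set IP_DF on the reassembled datagram\n");
        else
                printf("leave IP_DF clear (avoid tiny DF refragments)\n");
        return 0;
}
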
diff --git a/kernel/net/ipv4/ip_gre.c b/kernel/net/ipv4/ip_gre.c
index 5fd706473..614521437 100644
--- a/kernel/net/ipv4/ip_gre.c
+++ b/kernel/net/ipv4/ip_gre.c
@@ -25,6 +25,7 @@
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
+#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
@@ -47,6 +48,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
+#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -121,8 +123,127 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
-static int ipgre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
+static int ip_gre_calc_hlen(__be16 o_flags)
+{
+ int addend = 4;
+
+ if (o_flags & TUNNEL_CSUM)
+ addend += 4;
+ if (o_flags & TUNNEL_KEY)
+ addend += 4;
+ if (o_flags & TUNNEL_SEQ)
+ addend += 4;
+ return addend;
+}
+
+static __be16 gre_flags_to_tnl_flags(__be16 flags)
+{
+ __be16 tflags = 0;
+
+ if (flags & GRE_CSUM)
+ tflags |= TUNNEL_CSUM;
+ if (flags & GRE_ROUTING)
+ tflags |= TUNNEL_ROUTING;
+ if (flags & GRE_KEY)
+ tflags |= TUNNEL_KEY;
+ if (flags & GRE_SEQ)
+ tflags |= TUNNEL_SEQ;
+ if (flags & GRE_STRICT)
+ tflags |= TUNNEL_STRICT;
+ if (flags & GRE_REC)
+ tflags |= TUNNEL_REC;
+ if (flags & GRE_VERSION)
+ tflags |= TUNNEL_VERSION;
+
+ return tflags;
+}
+
+static __be16 tnl_flags_to_gre_flags(__be16 tflags)
+{
+ __be16 flags = 0;
+
+ if (tflags & TUNNEL_CSUM)
+ flags |= GRE_CSUM;
+ if (tflags & TUNNEL_ROUTING)
+ flags |= GRE_ROUTING;
+ if (tflags & TUNNEL_KEY)
+ flags |= GRE_KEY;
+ if (tflags & TUNNEL_SEQ)
+ flags |= GRE_SEQ;
+ if (tflags & TUNNEL_STRICT)
+ flags |= GRE_STRICT;
+ if (tflags & TUNNEL_REC)
+ flags |= GRE_REC;
+ if (tflags & TUNNEL_VERSION)
+ flags |= GRE_VERSION;
+
+ return flags;
+}
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err)
+{
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = ip_gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (skb_checksum_simple_validate(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else {
+ tpi->key = 0;
+ }
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else {
+ tpi->seq = 0;
+ }
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ hdr_len += 4;
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+ }
+ }
+ return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+}
+
+static void ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
@@ -148,14 +269,14 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return PACKET_RCVD;
+ return;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return PACKET_RCVD;
+ return;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -164,9 +285,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
break;
}
break;
+
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return PACKET_RCVD;
+ return;
break;
case ICMP_REDIRECT:
@@ -183,26 +305,85 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
iph->daddr, iph->saddr, tpi->key);
if (!t)
- return PACKET_REJECT;
+ return;
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- return PACKET_RCVD;
+ return;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return PACKET_RCVD;
+ return;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
- return PACKET_RCVD;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ *
+ * Moreover, Cisco "wise men" put GRE key to the third word
+ * in GRE header. It makes impossible maintaining even soft
+ * state for keyed
+ * GRE tunnels with enabled checksum. Tell them "thank you".
+ *
+ * Well, I wonder, rfc1812 was written by Cisco employee,
+ * what the hell these idiots break standards established
+ * by themselves???
+ */
+
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+
+ if (parse_gre_header(skb, &tpi, &csum_err)) {
+ if (!csum_err) /* ignore csum errors. */
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ return;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
+ IPPROTO_GRE, 0);
+ return;
+ }
+
+ ipgre_err(skb, info, &tpi);
+}
+
+static __be64 key_to_tunnel_id(__be32 key)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be64)((__force u32)key);
+#else
+ return (__force __be64)((__force u64)key << 32);
+#endif
+}
+
+/* Returns the least-significant 32 bits of a __be64. */
+static __be32 tunnel_id_to_key(__be64 x)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be32)x;
+#else
+ return (__force __be32)((__force u64)x >> 32);
+#endif
}
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{
struct net *net = dev_net(skb->dev);
+ struct metadata_dst *tun_dst = NULL;
struct ip_tunnel_net *itn;
const struct iphdr *iph;
struct ip_tunnel *tunnel;
@@ -218,40 +399,211 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
if (tunnel) {
skb_pop_mac_header(skb);
- ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
+ if (tunnel->collect_md) {
+ __be16 flags;
+ __be64 tun_id;
+
+ flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ tun_id = key_to_tunnel_id(tpi->key);
+ tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
+ if (!tun_dst)
+ return PACKET_REJECT;
+ }
+
+ ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
return PACKET_RCVD;
}
return PACKET_REJECT;
}
+static int gre_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ }
+#endif
+
+ if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+ goto drop;
+
+ if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
+ __be16 proto, __be32 key, __be32 seq)
+{
+ struct gre_base_hdr *greh;
+
+ skb_push(skb, hdr_len);
+
+ skb_reset_transport_header(skb);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(flags);
+ greh->protocol = proto;
+
+ if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+ if (flags & TUNNEL_SEQ) {
+ *ptr = seq;
+ ptr--;
+ }
+ if (flags & TUNNEL_KEY) {
+ *ptr = key;
+ ptr--;
+ }
+ if (flags & TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type &
+ (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
+ *ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
+ }
+ }
+}
+
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params,
__be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct tnl_ptk_info tpi;
- tpi.flags = tunnel->parms.o_flags;
- tpi.proto = proto;
- tpi.key = tunnel->parms.o_key;
if (tunnel->parms.o_flags & TUNNEL_SEQ)
tunnel->o_seqno++;
- tpi.seq = htonl(tunnel->o_seqno);
/* Push GRE header. */
- gre_build_header(skb, &tpi, tunnel->tun_hlen);
-
- skb_set_inner_protocol(skb, tpi.proto);
+ build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+ proto, tunnel->parms.o_key, htonl(tunnel->o_seqno));
+ skb_set_inner_protocol(skb, proto);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
+static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
+ bool csum)
+{
+ return iptunnel_handle_offloads(skb, csum,
+ csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+}
+
+static struct rtable *gre_get_rt(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi4 *fl,
+ const struct ip_tunnel_key *key)
+{
+ struct net *net = dev_net(dev);
+
+ memset(fl, 0, sizeof(*fl));
+ fl->daddr = key->u.ipv4.dst;
+ fl->saddr = key->u.ipv4.src;
+ fl->flowi4_tos = RT_TOS(key->tos);
+ fl->flowi4_mark = skb->mark;
+ fl->flowi4_proto = IPPROTO_GRE;
+
+ return ip_route_output_key(net, fl);
+}
+
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ struct flowi4 fl;
+ struct rtable *rt;
+ int min_headroom;
+ int tunnel_hlen;
+ __be16 df, flags;
+ int err;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto err_free_skb;
+
+ key = &tun_info->key;
+ rt = gre_get_rt(skb, dev, &fl, key);
+ if (IS_ERR(rt))
+ goto err_free_skb;
+
+ tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
+
+ min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ + tunnel_hlen + sizeof(struct iphdr);
+ if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+ int head_delta = SKB_DATA_ALIGN(min_headroom -
+ skb_headroom(skb) +
+ 16);
+ err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+ 0, GFP_ATOMIC);
+ if (unlikely(err))
+ goto err_free_rt;
+ }
+
+ /* Push Tunnel header. */
+ skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM));
+ if (IS_ERR(skb)) {
+ skb = NULL;
+ goto err_free_rt;
+ }
+
+ flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
+ tunnel_id_to_key(tun_info->key.tun_id), 0);
+
+ df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
+ key->u.ipv4.dst, IPPROTO_GRE,
+ key->tos, key->ttl, df, false);
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+ return;
+
+err_free_rt:
+ ip_rt_put(rt);
+err_free_skb:
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+}
+
+static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ if (ip_tunnel_info_af(info) != AF_INET)
+ return -EINVAL;
+
+ rt = gre_get_rt(skb, dev, &fl4, &info->key);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ ip_rt_put(rt);
+ info->key.u.ipv4.src = fl4.saddr;
+ return 0;
+}
+
static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tnl_params;
+ if (tunnel->collect_md) {
+ gre_fb_xmit(skb, dev);
+ return NETDEV_TX_OK;
+ }
+
if (dev->header_ops) {
/* Need space for new headers */
if (skb_cow_head(skb, dev->needed_headroom -
@@ -277,7 +629,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
goto out;
__gre_xmit(skb, dev, tnl_params, skb->protocol);
-
return NETDEV_TX_OK;
free_skb:
@@ -292,6 +643,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ if (tunnel->collect_md) {
+ gre_fb_xmit(skb, dev);
+ return NETDEV_TX_OK;
+ }
+
skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
if (IS_ERR(skb))
goto out;
@@ -300,7 +656,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
goto free_skb;
__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
-
return NETDEV_TX_OK;
free_skb:
@@ -530,10 +885,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
return ip_tunnel_init(dev);
}
-static struct gre_cisco_protocol ipgre_protocol = {
- .handler = ipgre_rcv,
- .err_handler = ipgre_err,
- .priority = 0,
+static const struct gre_protocol ipgre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
};
static int __net_init ipgre_init_net(struct net *net)
@@ -596,8 +950,10 @@ out:
return ipgre_tunnel_validate(tb, data);
}
-static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
- struct ip_tunnel_parm *parms)
+static void ipgre_netlink_parms(struct net_device *dev,
+ struct nlattr *data[],
+ struct nlattr *tb[],
+ struct ip_tunnel_parm *parms)
{
memset(parms, 0, sizeof(*parms));
@@ -635,6 +991,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF);
+
+ if (data[IFLA_GRE_COLLECT_METADATA]) {
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ t->collect_md = true;
+ }
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -688,6 +1050,7 @@ static const struct net_device_ops gre_tap_netdev_ops = {
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_fill_metadata_dst = gre_fill_metadata_dst,
};
static void ipgre_tap_setup(struct net_device *dev)
@@ -712,7 +1075,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
return err;
}
- ipgre_netlink_parms(data, tb, &p);
+ ipgre_netlink_parms(dev, data, tb, &p);
return ip_tunnel_newlink(dev, tb, &p);
}
@@ -730,7 +1093,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
- ipgre_netlink_parms(data, tb, &p);
+ ipgre_netlink_parms(dev, data, tb, &p);
return ip_tunnel_changelink(dev, tb, &p);
}
@@ -765,6 +1128,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_GRE_ENCAP_DPORT */
nla_total_size(2) +
+ /* IFLA_GRE_COLLECT_METADATA */
+ nla_total_size(0) +
0;
}
@@ -796,6 +1161,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
t->encap.flags))
goto nla_put_failure;
+ if (t->collect_md) {
+ if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+ goto nla_put_failure;
+ }
+
return 0;
nla_put_failure:
@@ -817,6 +1187,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
@@ -849,9 +1220,38 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
.get_link_net = ip_tunnel_get_link_net,
};
+struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
+ u8 name_assign_type)
+{
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ struct ip_tunnel *t;
+ int err;
+
+ memset(&tb, 0, sizeof(tb));
+
+ dev = rtnl_create_link(net, name, name_assign_type,
+ &ipgre_tap_ops, tb);
+ if (IS_ERR(dev))
+ return dev;
+
+ /* Configure flow based GRE device. */
+ t = netdev_priv(dev);
+ t->collect_md = true;
+
+ err = ipgre_newlink(net, dev, tb, NULL);
+ if (err < 0)
+ goto out;
+ return dev;
+out:
+ free_netdev(dev);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
+
static int __net_init ipgre_tap_init_net(struct net *net)
{
- return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}
static void __net_exit ipgre_tap_exit_net(struct net *net)
@@ -881,7 +1281,7 @@ static int __init ipgre_init(void)
if (err < 0)
goto pnet_tap_faied;
- err = gre_cisco_register(&ipgre_protocol);
+ err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) {
pr_info("%s: can't add protocol\n", __func__);
goto add_proto_failed;
@@ -900,7 +1300,7 @@ static int __init ipgre_init(void)
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
- gre_cisco_unregister(&ipgre_protocol);
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_faied:
@@ -912,7 +1312,7 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
- gre_cisco_unregister(&ipgre_protocol);
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops);
}
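
parse_gre_header() and build_header() above walk a GRE header whose length depends on which optional words are present: 4 bytes of base header plus 4 bytes each for checksum, key and sequence number, in that order on the wire, which is exactly what ip_gre_calc_hlen() adds up (build_header() then fills those words from the end backwards). A standalone sketch of the sizing; the F_* flag values are local to the example, not the kernel's TUNNEL_* bits:

#include <stdio.h>

#define F_CSUM  0x1
#define F_KEY   0x2
#define F_SEQ   0x4

/* Same shape as ip_gre_calc_hlen(): a 4-byte base header, plus 4 bytes for
 * each optional word that the flags enable.
 */
static int gre_hlen(unsigned int flags)
{
        int hlen = 4;                   /* flags + protocol */

        if (flags & F_CSUM)
                hlen += 4;              /* checksum + reserved */
        if (flags & F_KEY)
                hlen += 4;              /* key */
        if (flags & F_SEQ)
                hlen += 4;              /* sequence number */
        return hlen;
}

int main(void)
{
        printf("plain GRE:        %d bytes\n", gre_hlen(0));
        printf("keyed GRE:        %d bytes\n", gre_hlen(F_KEY));
        printf("csum + key + seq: %d bytes\n", gre_hlen(F_CSUM | F_KEY | F_SEQ));
        return 0;
}
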
diff --git a/kernel/net/ipv4/ip_input.c b/kernel/net/ipv4/ip_input.c
index 2db4c8773..b1209b633 100644
--- a/kernel/net/ipv4/ip_input.c
+++ b/kernel/net/ipv4/ip_input.c
@@ -146,6 +146,7 @@
#include <net/xfrm.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
+#include <net/dst_metadata.h>
/*
* Process Router Attention IP option (RFC 2113)
@@ -156,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
u8 protocol = ip_hdr(skb)->protocol;
struct sock *last = NULL;
struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
@@ -166,9 +168,9 @@ bool ip_call_ra_chain(struct sk_buff *skb)
if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
- net_eq(sock_net(sk), dev_net(dev))) {
+ net_eq(sock_net(sk), net)) {
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
+ if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
return true;
}
if (last) {
@@ -187,10 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb)
return false;
}
-static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
-
__skb_pull(skb, skb_network_header_len(skb));
rcu_read_lock();
@@ -247,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb)
/*
* Reassemble IP fragments.
*/
+ struct net *net = dev_net(skb->dev);
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
+ if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
- return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb,
- skb->dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
+ net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
@@ -310,7 +311,7 @@ drop:
int sysctl_ip_early_demux __read_mostly = 1;
EXPORT_SYMBOL(sysctl_ip_early_demux);
-static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
@@ -331,13 +332,12 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
- if (!skb_dst(skb)) {
+ if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EXDEV)
- NET_INC_STATS_BH(dev_net(skb->dev),
- LINUX_MIB_IPRPFILTER);
+ NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
}
@@ -358,11 +358,9 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
- skb->len);
+ IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
- skb->len);
+ IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
return dst_input(skb);
@@ -377,6 +375,7 @@ drop:
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
const struct iphdr *iph;
+ struct net *net;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
@@ -386,11 +385,12 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
goto drop;
- IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
+ net = dev_net(dev);
+ IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
goto out;
}
@@ -416,7 +416,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
- IP_ADD_STATS_BH(dev_net(dev),
+ IP_ADD_STATS_BH(net,
IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
@@ -430,7 +430,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
len = ntohs(iph->tot_len);
if (skb->len < len) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
@@ -440,7 +440,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -452,14 +452,14 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
- dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
ip_rcv_finish);
csum_error:
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
diff --git a/kernel/net/ipv4/ip_output.c b/kernel/net/ipv4/ip_output.c
index c65b93a7b..49f028563 100644
--- a/kernel/net/ipv4/ip_output.c
+++ b/kernel/net/ipv4/ip_output.c
@@ -83,6 +83,11 @@
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
+static int
+ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *));
+
/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
{
@@ -91,32 +96,28 @@ void ip_send_check(struct iphdr *iph)
}
EXPORT_SYMBOL(ip_send_check);
-int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
- skb_dst(skb)->dev, dst_output_sk);
-}
-
-int __ip_local_out(struct sk_buff *skb)
-{
- return __ip_local_out_sk(skb->sk, skb);
+ return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
}
-int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int err;
- err = __ip_local_out(skb);
+ err = __ip_local_out(net, sk, skb);
if (likely(err == 1))
- err = dst_output_sk(sk, skb);
+ err = dst_output(net, sk, skb);
return err;
}
-EXPORT_SYMBOL_GPL(ip_local_out_sk);
+EXPORT_SYMBOL_GPL(ip_local_out);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
@@ -131,11 +132,12 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
* Add an ip header to a skbuff and send it out.
*
*/
-int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
struct iphdr *iph;
/* Build the IP header. */
@@ -145,15 +147,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->version = 4;
iph->ihl = 5;
iph->tos = inet->tos;
- if (ip_dont_fragment(sk, &rt->dst))
- iph->frag_off = htons(IP_DF);
- else
- iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- ip_select_ident(sock_net(sk), skb, sk);
+ if (ip_dont_fragment(sk, &rt->dst)) {
+ iph->frag_off = htons(IP_DF);
+ iph->id = 0;
+ } else {
+ iph->frag_off = 0;
+ __ip_select_ident(net, iph, 1);
+ }
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
@@ -164,11 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
skb->mark = sk->sk_mark;
/* Send it out. */
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
-static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
@@ -178,9 +182,9 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
u32 nexthop;
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
@@ -216,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
return -EINVAL;
}
-static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
{
netdev_features_t features;
struct sk_buff *segs;
@@ -224,8 +229,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
/* common case: locally created skb or seglen is <= mtu */
if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
- skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
- return ip_finish_output2(sk, skb);
+ skb_gso_network_seglen(skb) <= mtu)
+ return ip_finish_output2(net, sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
*
@@ -235,6 +240,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
* from host network stack.
*/
features = netif_skb_features(skb);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
@@ -248,7 +254,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
int err;
segs->next = NULL;
- err = ip_fragment(sk, segs, ip_finish_output2);
+ err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
@@ -258,25 +264,28 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
return ret;
}
-static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ unsigned int mtu;
+
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
+ mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
- return ip_finish_output_gso(sk, skb);
+ return ip_finish_output_gso(net, sk, skb, mtu);
- if (skb->len > ip_skb_dst_mtu(skb))
- return ip_fragment(sk, skb, ip_finish_output2);
+ if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+ return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
- return ip_finish_output2(sk, skb);
+ return ip_finish_output2(net, sk, skb);
}
-int ip_mc_output(struct sock *sk, struct sk_buff *skb)
+int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct net_device *dev = rt->dst.dev;
@@ -284,7 +293,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
/*
* If the indicated interface is up and running, send the packet.
*/
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
@@ -312,7 +321,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- sk, newskb, NULL, newskb->dev,
+ net, sk, newskb, NULL, newskb->dev,
dev_loopback_xmit);
}
@@ -327,26 +336,28 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
if (rt->rt_flags&RTCF_BROADCAST) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
- NULL, newskb->dev, dev_loopback_xmit);
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, newskb, NULL, newskb->dev,
+ dev_loopback_xmit);
}
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
- skb->dev, ip_finish_output,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb->dev,
+ ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_output(struct sock *sk, struct sk_buff *skb)
+int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
- NULL, dev,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -369,6 +380,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
@@ -399,7 +411,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
@@ -436,20 +448,20 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_segs(sock_net(sk), skb, sk,
+ ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- res = ip_local_out(skb);
+ res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
@@ -478,6 +490,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
+static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ if ((iph->frag_off & htons(IP_DF)) == 0)
+ return ip_do_fragment(net, sk, skb, output);
+
+ if (unlikely(!skb->ignore_df ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return ip_do_fragment(net, sk, skb, output);
+}
+
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
@@ -485,8 +519,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
* single device frame, and queue such a frame for sending.
*/
-int ip_fragment(struct sock *sk, struct sk_buff *skb,
- int (*output)(struct sock *, struct sk_buff *))
+int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
int ptr;
@@ -500,6 +534,11 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
dev = rt->dst.dev;
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto fail;
+
/*
* Point into the IP datagram header.
*/
@@ -507,15 +546,8 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(skb);
- if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
- (IPCB(skb)->frag_max_size &&
- IPCB(skb)->frag_max_size > mtu))) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- kfree_skb(skb);
- return -EMSGSIZE;
- }
+ if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+ mtu = IPCB(skb)->frag_max_size;
/*
* Setup starting values.
@@ -523,10 +555,6 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
hlen = iph->ihl * 4;
mtu = mtu - hlen; /* Size of data space */
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge)
- mtu -= nf_bridge_mtu_reduction(skb);
-#endif
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
@@ -599,10 +627,10 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
ip_send_check(iph);
}
- err = output(sk, skb);
+ err = output(net, sk, skb);
if (!err)
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
if (err || !frag)
break;
@@ -612,7 +640,7 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
}
if (err == 0) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return 0;
}
@@ -621,7 +649,7 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
kfree_skb(frag);
frag = skb;
}
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
slow_path_clean:
@@ -635,9 +663,6 @@ slow_path_clean:
}
slow_path:
- /* for offloaded checksums cleanup checksum before fragmentation */
- if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
- goto fail;
iph = ip_hdr(skb);
left = skb->len - hlen; /* Space per frame */
@@ -711,6 +736,9 @@ slow_path:
iph = ip_hdr(skb2);
iph->frag_off = htons((offset >> 3));
+ if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+ iph->frag_off |= htons(IP_DF);
+
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
@@ -736,22 +764,22 @@ slow_path:
ip_send_check(iph);
- err = output(sk, skb2);
+ err = output(net, sk, skb2);
if (err)
goto fail;
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
}
consume_skb(skb);
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return err;
fail:
kfree_skb(skb);
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
}
-EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_do_fragment);
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
@@ -886,6 +914,7 @@ static int __ip_append_data(struct sock *sk,
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & NETIF_F_V4_CSUM &&
+ !(flags & MSG_MORE) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
@@ -893,7 +922,7 @@ static int __ip_append_data(struct sock *sk,
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
- (sk->sk_type == SOCK_DGRAM)) {
+ (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
maxfraglen, flags);
@@ -1217,11 +1246,9 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
}
while (size > 0) {
- int i;
-
- if (skb_is_gso(skb))
+ if (skb_is_gso(skb)) {
len = size;
- else {
+ } else {
/* Check if the remaining data fits into current packet. */
len = mtu - skb->len;
@@ -1273,15 +1300,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
continue;
}
- i = skb_shinfo(skb)->nr_frags;
if (len > size)
len = size;
- if (skb_can_coalesce(skb, i, page, offset)) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
- } else if (i < MAX_SKB_FRAGS) {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, len);
- } else {
+
+ if (skb_append_pagefrags(skb, page, offset, len)) {
err = -EMSGSIZE;
goto error;
}
@@ -1416,7 +1438,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb)
{
int err;
- err = ip_local_out(skb);
+ err = ip_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
@@ -1524,6 +1546,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
struct net *net = sock_net(sk);
struct sk_buff *nskb;
int err;
+ int oif;
if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
return;
@@ -1541,7 +1564,11 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
daddr = replyopts.opt.opt.faddr;
}
- flowi4_init_output(&fl4, arg->bound_dev_if,
+ oif = arg->bound_dev_if;
+ if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+ oif = skb->skb_iif;
+
+ flowi4_init_output(&fl4, oif,
IP4_REPLY_MARK(net, skb->mark),
RT_TOS(arg->tos),
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
@@ -1573,7 +1600,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4);
}
out:
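A minimal sketch (not part of the patch; names are hypothetical) of the shape a private output callback takes after this conversion — ip_do_fragment() and the finish handlers now receive the netns explicitly instead of deriving it from skb_dst():

	static int example_output(struct net *net, struct sock *sk,
				  struct sk_buff *skb)
	{
		/* matches the new callback type
		 * int (*output)(struct net *, struct sock *, struct sk_buff *)
		 * expected by ip_do_fragment() above
		 */
		return dst_output(net, sk, skb);
	}

	/* caller side: the netns is threaded through explicitly */
	err = ip_do_fragment(net, sk, skb, example_output);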
diff --git a/kernel/net/ipv4/ip_sockglue.c b/kernel/net/ipv4/ip_sockglue.c
index 6ddde8999..a50124260 100644
--- a/kernel/net/ipv4/ip_sockglue.c
+++ b/kernel/net/ipv4/ip_sockglue.c
@@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+
+ /* Our caller is responsible for freeing ipc->opt */
err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
err < 40 ? err : 40);
if (err)
@@ -591,6 +593,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_TRANSPARENT:
case IP_MINTTL:
case IP_NODEFRAG:
+ case IP_BIND_ADDRESS_NO_PORT:
case IP_UNICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_ALL:
@@ -741,6 +744,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
inet->nodefrag = val ? 1 : 0;
break;
+ case IP_BIND_ADDRESS_NO_PORT:
+ inet->bind_address_no_port = val ? 1 : 0;
+ break;
case IP_MTU_DISCOVER:
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
goto e_inval;
@@ -1247,11 +1253,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
* the _received_ ones. The set sets the _sent_ ones.
*/
+static bool getsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_MSFILTER:
+ case MCAST_MSFILTER:
+ return true;
+ }
+ return false;
+}
+
static int do_ip_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
- int val;
+ bool needs_rtnl = getsockopt_needs_rtnl(optname);
+ int val, err = 0;
int len;
if (level != SOL_IP)
@@ -1265,6 +1282,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (len < 0)
return -EINVAL;
+ if (needs_rtnl)
+ rtnl_lock();
lock_sock(sk);
switch (optname) {
@@ -1333,6 +1352,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_NODEFRAG:
val = inet->nodefrag;
break;
+ case IP_BIND_ADDRESS_NO_PORT:
+ val = inet->bind_address_no_port;
+ break;
case IP_MTU_DISCOVER:
val = inet->pmtudisc;
break;
@@ -1379,39 +1401,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MSFILTER:
{
struct ip_msfilter msf;
- int err;
if (len < IP_MSFILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
err = ip_mc_msfget(sk, &msf,
(struct ip_msfilter __user *)optval, optlen);
- release_sock(sk);
- return err;
+ goto out;
}
case MCAST_MSFILTER:
{
struct group_filter gsf;
- int err;
if (len < GROUP_FILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
err = ip_mc_gsfget(sk, &gsf,
(struct group_filter __user *)optval,
optlen);
- release_sock(sk);
- return err;
+ goto out;
}
case IP_MULTICAST_ALL:
val = inet->mc_all;
@@ -1478,6 +1496,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
}
return 0;
+
+out:
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return err;
}
int ip_getsockopt(struct sock *sk, int level,
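A userspace sketch of the IP_BIND_ADDRESS_NO_PORT option added above (hypothetical helper; assumes a libc/uapi that exposes the constant from <linux/in.h>): the source address is bound without reserving an ephemeral port, which is only allocated at connect() time.

	#include <netinet/in.h>
	#include <sys/socket.h>

	static int connect_from(int fd, const struct sockaddr_in *src,
				const struct sockaddr_in *dst)
	{
		int one = 1;

		if (setsockopt(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT,
			       &one, sizeof(one)) < 0)
			return -1;
		/* src->sin_port stays 0; no ephemeral port is consumed here */
		if (bind(fd, (const struct sockaddr *)src, sizeof(*src)) < 0)
			return -1;
		/* the local port is chosen here instead */
		return connect(fd, (const struct sockaddr *)dst, sizeof(*dst));
	}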
diff --git a/kernel/net/ipv4/ip_tunnel.c b/kernel/net/ipv4/ip_tunnel.c
index 626d9e56a..cbb51f3fa 100644
--- a/kernel/net/ipv4/ip_tunnel.c
+++ b/kernel/net/ipv4/ip_tunnel.c
@@ -230,10 +230,13 @@ skip_key_lookup:
if (cand)
return cand;
+ t = rcu_dereference(itn->collect_md_tun);
+ if (t)
+ return t;
+
if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
return netdev_priv(itn->fb_tunnel_dev);
-
return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
@@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
struct hlist_head *head = ip_bucket(itn, &t->parms);
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, t);
hlist_add_head_rcu(&t->hash_node, head);
}
-static void ip_tunnel_del(struct ip_tunnel *t)
+static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, NULL);
hlist_del_init_rcu(&t->hash_node);
}
@@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
}
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
- const struct tnl_ptk_info *tpi, bool log_ecn_error)
+ const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
+ bool log_ecn_error)
{
struct pcpu_sw_netstats *tstats;
const struct iphdr *iph = ip_hdr(skb);
@@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
skb->dev = tunnel->dev;
}
+ if (tun_dst)
+ skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
struct ip_tunnel_parm *p,
bool set_mtu)
{
- ip_tunnel_del(t);
+ ip_tunnel_del(itn, t);
t->parms.iph.saddr = p->iph.saddr;
t->parms.iph.daddr = p->iph.daddr;
t->parms.i_key = p->i_key;
@@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
if (itn->fb_tunnel_dev != dev) {
- ip_tunnel_del(netdev_priv(dev));
+ ip_tunnel_del(itn, netdev_priv(dev));
unregister_netdevice_queue(dev, head);
}
}
@@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
nt = netdev_priv(dev);
itn = net_generic(net, nt->ip_tnl_net_id);
- if (ip_tunnel_find(itn, p, dev->type))
- return -EEXIST;
+ if (nt->collect_md) {
+ if (rtnl_dereference(itn->collect_md_tun))
+ return -EEXIST;
+ } else {
+ if (ip_tunnel_find(itn, p, dev->type))
+ return -EEXIST;
+ }
nt->net = net;
nt->parms = *p;
@@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
dev->mtu = mtu;
ip_tunnel_add(itn, nt);
-
out:
return err;
}
@@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev)
iph->version = 4;
iph->ihl = 5;
+ if (tunnel->collect_md) {
+ dev->features |= NETIF_F_NETNS_LOCAL;
+ netif_keep_dst(dev);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
@@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev)
itn = net_generic(net, tunnel->ip_tnl_net_id);
/* fb_tunnel_dev will be unregistered in the net-exit call. */
if (itn->fb_tunnel_dev != dev)
- ip_tunnel_del(netdev_priv(dev));
+ ip_tunnel_del(itn, netdev_priv(dev));
ip_tunnel_dst_reset_all(tunnel);
}
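A rough receive-path sketch for a collect_md (metadata-mode) tunnel using the new tun_dst argument (hypothetical fragment, 'key' and the drop label are assumptions; real drivers fill in the complete key from the outer header):

	struct metadata_dst *tun_dst;

	tun_dst = metadata_dst_alloc(0, GFP_ATOMIC);
	if (!tun_dst)
		goto drop;
	/* rx metadata: IP_TUNNEL_INFO_TX is not set */
	tun_dst->u.tun_info.key.tun_id = key;	/* taken from the packet */
	tun_dst->u.tun_info.mode = 0;

	return ip_tunnel_rcv(tunnel, skb, &tpi, tun_dst, log_ecn_error);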
diff --git a/kernel/net/ipv4/ip_tunnel_core.c b/kernel/net/ipv4/ip_tunnel_core.c
index ce63ab21b..6cb9009c3 100644
--- a/kernel/net/ipv4/ip_tunnel_core.c
+++ b/kernel/net/ipv4/ip_tunnel_core.c
@@ -32,6 +32,7 @@
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
+#include <linux/static_key.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -45,12 +46,14 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
+#include <net/dst_metadata.h>
int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
__u8 tos, __u8 ttl, __be16 df, bool xnet)
{
- int pkt_len = skb->len;
+ int pkt_len = skb->len - skb_inner_network_offset(skb);
+ struct net *net = dev_net(rt->dst.dev);
struct iphdr *iph;
int err;
@@ -74,10 +77,9 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
- __ip_select_ident(dev_net(rt->dst.dev), iph,
- skb_shinfo(skb)->gso_segs ?: 1);
+ __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
- err = ip_local_out_sk(sk, skb);
+ err = ip_local_out(net, sk, skb);
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
return pkt_len;
@@ -98,7 +100,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
return -ENOMEM;
eh = (struct ethhdr *)skb->data;
- if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
+ if (likely(eth_proto_is_802_3(eh->h_proto)))
skb->protocol = eh->h_proto;
else
skb->protocol = htons(ETH_P_802_2);
@@ -118,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
}
EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
+ gfp_t flags)
+{
+ struct metadata_dst *res;
+ struct ip_tunnel_info *dst, *src;
+
+ if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
+ return NULL;
+
+ res = metadata_dst_alloc(0, flags);
+ if (!res)
+ return NULL;
+
+ dst = &res->u.tun_info;
+ src = &md->u.tun_info;
+ dst->key.tun_id = src->key.tun_id;
+ if (src->mode & IP_TUNNEL_INFO_IPV6)
+ memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
+ sizeof(struct in6_addr));
+ else
+ dst->key.u.ipv4.dst = src->key.u.ipv4.src;
+ dst->mode = src->mode | IP_TUNNEL_INFO_TX;
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
+
struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
bool csum_help,
int gso_type_mask)
@@ -165,6 +194,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
{
int i;
+ netdev_stats_to_stats64(tot, &dev->stats);
+
for_each_possible_cpu(i) {
const struct pcpu_sw_netstats *tstats =
per_cpu_ptr(dev->tstats, i);
@@ -185,22 +216,211 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
tot->tx_bytes += tx_bytes;
}
- tot->multicast = dev->stats.multicast;
+ return tot;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
- tot->rx_crc_errors = dev->stats.rx_crc_errors;
- tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
- tot->rx_length_errors = dev->stats.rx_length_errors;
- tot->rx_frame_errors = dev->stats.rx_frame_errors;
- tot->rx_errors = dev->stats.rx_errors;
+static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
+ [LWTUNNEL_IP_ID] = { .type = NLA_U64 },
+ [LWTUNNEL_IP_DST] = { .type = NLA_U32 },
+ [LWTUNNEL_IP_SRC] = { .type = NLA_U32 },
+ [LWTUNNEL_IP_TTL] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_TOS] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
+};
+
+static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct ip_tunnel_info *tun_info;
+ struct lwtunnel_state *new_state;
+ struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
+ int err;
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
+ err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy);
+ if (err < 0)
+ return err;
- tot->collisions = dev->stats.collisions;
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ if (!new_state)
+ return -ENOMEM;
- return tot;
+ new_state->type = LWTUNNEL_ENCAP_IP;
+
+ tun_info = lwt_tun_info(new_state);
+
+ if (tb[LWTUNNEL_IP_ID])
+ tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]);
+
+ if (tb[LWTUNNEL_IP_DST])
+ tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]);
+
+ if (tb[LWTUNNEL_IP_SRC])
+ tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]);
+
+ if (tb[LWTUNNEL_IP_TTL])
+ tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
+
+ if (tb[LWTUNNEL_IP_TOS])
+ tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
+
+ if (tb[LWTUNNEL_IP_FLAGS])
+ tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]);
+
+ tun_info->mode = IP_TUNNEL_INFO_TX;
+ tun_info->options_len = 0;
+
+ *ts = new_state;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
+
+static int ip_tun_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
+ nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
+ nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
+ nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
+ nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
+ nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ return nla_total_size(8) /* LWTUNNEL_IP_ID */
+ + nla_total_size(4) /* LWTUNNEL_IP_DST */
+ + nla_total_size(4) /* LWTUNNEL_IP_SRC */
+ + nla_total_size(1) /* LWTUNNEL_IP_TOS */
+ + nla_total_size(1) /* LWTUNNEL_IP_TTL */
+ + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */
+}
+
+static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ return memcmp(lwt_tun_info(a), lwt_tun_info(b),
+ sizeof(struct ip_tunnel_info));
+}
+
+static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
+ .build_state = ip_tun_build_state,
+ .fill_encap = ip_tun_fill_encap_info,
+ .get_encap_size = ip_tun_encap_nlsize,
+ .cmp_encap = ip_tun_cmp_encap,
+};
+
+static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
+ [LWTUNNEL_IP6_ID] = { .type = NLA_U64 },
+ [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) },
+ [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) },
+ [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 },
+ [LWTUNNEL_IP6_TC] = { .type = NLA_U8 },
+ [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
+};
+
+static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct ip_tunnel_info *tun_info;
+ struct lwtunnel_state *new_state;
+ struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy);
+ if (err < 0)
+ return err;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ if (!new_state)
+ return -ENOMEM;
+
+ new_state->type = LWTUNNEL_ENCAP_IP6;
+
+ tun_info = lwt_tun_info(new_state);
+
+ if (tb[LWTUNNEL_IP6_ID])
+ tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]);
+
+ if (tb[LWTUNNEL_IP6_DST])
+ tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
+
+ if (tb[LWTUNNEL_IP6_SRC])
+ tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]);
+
+ if (tb[LWTUNNEL_IP6_HOPLIMIT])
+ tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]);
+
+ if (tb[LWTUNNEL_IP6_TC])
+ tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
+
+ if (tb[LWTUNNEL_IP6_FLAGS])
+ tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]);
+
+ tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
+ tun_info->options_len = 0;
+
+ *ts = new_state;
+
+ return 0;
+}
+
+static int ip6_tun_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) ||
+ nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
+ nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
+ nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) ||
+ nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) ||
+ nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ return nla_total_size(8) /* LWTUNNEL_IP6_ID */
+ + nla_total_size(16) /* LWTUNNEL_IP6_DST */
+ + nla_total_size(16) /* LWTUNNEL_IP6_SRC */
+ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */
+ + nla_total_size(1) /* LWTUNNEL_IP6_TC */
+ + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */
+}
+
+static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
+ .build_state = ip6_tun_build_state,
+ .fill_encap = ip6_tun_fill_encap_info,
+ .get_encap_size = ip6_tun_encap_nlsize,
+ .cmp_encap = ip_tun_cmp_encap,
+};
+
+void __init ip_tunnel_core_init(void)
+{
+ lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
+ lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
+}
+
+struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
+
+void ip_tunnel_need_metadata(void)
+{
+ static_key_slow_inc(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
+
+void ip_tunnel_unneed_metadata(void)
+{
+ static_key_slow_dec(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
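A sketch of how a metadata-based tunnel driver would use the new static-key helpers exported above (hypothetical module init/exit; the key gates metadata handling in the fast path while such tunnels exist):

	static int __init example_init(void)
	{
		/* held for as long as metadata-mode tunnels may exist */
		ip_tunnel_need_metadata();
		return 0;
	}

	static void __exit example_exit(void)
	{
		ip_tunnel_unneed_metadata();
	}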
diff --git a/kernel/net/ipv4/ip_vti.c b/kernel/net/ipv4/ip_vti.c
index 0c152087c..4d8f0b698 100644
--- a/kernel/net/ipv4/ip_vti.c
+++ b/kernel/net/ipv4/ip_vti.c
@@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
- err = dst_output(skb);
+ err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
err = skb->len;
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
diff --git a/kernel/net/ipv4/ipconfig.c b/kernel/net/ipv4/ipconfig.c
index 8e7328c6a..0bc7412d9 100644
--- a/kernel/net/ipv4/ipconfig.c
+++ b/kernel/net/ipv4/ipconfig.c
@@ -94,7 +94,7 @@
/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
#define CONF_SEND_RETRIES 6 /* Send six requests per open */
-#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
+#define CONF_INTER_TIMEOUT (HZ) /* Inter-device timeout: 1 second */
#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
@@ -146,6 +146,10 @@ u8 root_server_path[256] = { 0, }; /* Path to mount as root */
/* vendor class identifier */
static char vendor_class_identifier[253] __initdata;
+#if defined(CONFIG_IP_PNP_DHCP)
+static char dhcp_client_identifier[253] __initdata;
+#endif
+
/* Persistent data: */
static int ic_proto_used; /* Protocol used, if any */
@@ -728,6 +732,16 @@ ic_dhcp_init_options(u8 *options)
memcpy(e, vendor_class_identifier, len);
e += len;
}
+ len = strlen(dhcp_client_identifier + 1);
+ /* the minimum length of the identifier is 2 (the type byte plus at
+ * least one byte of value), and it cannot be larger than the remaining
+ * options space
+ */
+ if (len >= 1 && len < 312 - (e - options) - 1) {
+ *e++ = 61;
+ *e++ = len + 1;
+ memcpy(e, dhcp_client_identifier, len + 1);
+ e += len + 1;
+ }
}
*e++ = 255; /* End of the list */
@@ -1557,8 +1571,24 @@ static int __init ic_proto_name(char *name)
return 0;
}
#ifdef CONFIG_IP_PNP_DHCP
- else if (!strcmp(name, "dhcp")) {
+ else if (!strncmp(name, "dhcp", 4)) {
+ char *client_id;
+
ic_proto_enabled &= ~IC_RARP;
+ client_id = strstr(name, "dhcp,");
+ if (client_id) {
+ char *v;
+
+ client_id = client_id + 5;
+ v = strchr(client_id, ',');
+ if (!v)
+ return 1;
+ *v = 0;
+ if (kstrtou8(client_id, 0, dhcp_client_identifier))
+ DBG("DHCP: Invalid client identifier type\n");
+ strncpy(dhcp_client_identifier + 1, v + 1, 251);
+ *v = ',';
+ }
return 1;
}
#endif
diff --git a/kernel/net/ipv4/ipip.c b/kernel/net/ipv4/ipip.c
index ff96396eb..a09fb0dec 100644
--- a/kernel/net/ipv4/ipip.c
+++ b/kernel/net/ipv4/ipip.c
@@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb)
goto drop;
if (iptunnel_pull_header(skb, 0, tpi.proto))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
}
return -1;
@@ -251,10 +251,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return -EINVAL;
}
- p.i_key = p.o_key = p.i_flags = p.o_flags = 0;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
+ p.i_key = p.o_key = 0;
+ p.i_flags = p.o_flags = 0;
err = ip_tunnel_ioctl(dev, &p, cmd);
if (err)
return err;
diff --git a/kernel/net/ipv4/ipmr.c b/kernel/net/ipv4/ipmr.c
index 3a2c0162c..c3a38353f 100644
--- a/kernel/net/ipv4/ipmr.c
+++ b/kernel/net/ipv4/ipmr.c
@@ -134,7 +134,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mfc_cache *c, struct rtmsg *rtm);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
-static void mroute_clean_tables(struct mr_table *mrt);
+static void mroute_clean_tables(struct mr_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
.match = ipmr_rule_match,
.configure = ipmr_rule_configure,
.compare = ipmr_rule_compare,
- .default_pref = fib_default_rule_pref,
.fill = ipmr_rule_fill,
.nlgroup = RTNLGRP_IPV4_RULE,
.policy = ipmr_rule_policy,
@@ -351,7 +350,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
static void ipmr_free_table(struct mr_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, true);
kfree(mrt);
}
@@ -442,10 +441,6 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
@@ -541,10 +536,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
@@ -1209,7 +1200,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
* Close the multicast socket, and clear the vif tables etc
*/
-static void mroute_clean_tables(struct mr_table *mrt)
+static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
int i;
LIST_HEAD(list);
@@ -1218,8 +1209,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
/* Shut down all active vif entries */
for (i = 0; i < mrt->maxvif; i++) {
- if (!(mrt->vif_table[i].flags & VIFF_STATIC))
- vif_delete(mrt, i, 0, &list);
+ if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
+ continue;
+ vif_delete(mrt, i, 0, &list);
}
unregister_netdevice_many(&list);
@@ -1227,7 +1219,7 @@ static void mroute_clean_tables(struct mr_table *mrt)
for (i = 0; i < MFC_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (c->mfc_flags & MFC_STATIC)
+ if (!all && (c->mfc_flags & MFC_STATIC))
continue;
list_del_rcu(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
@@ -1262,7 +1254,7 @@ static void mrtsock_destruct(struct sock *sk)
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, false);
}
}
rtnl_unlock();
@@ -1679,17 +1671,18 @@ static void ip_encap(struct net *net, struct sk_buff *skb,
nf_reset(skb);
}
-static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb)
+static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
/*
@@ -1746,7 +1739,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* to blackhole.
*/
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
ip_rt_put(rt);
goto out_free;
}
@@ -1788,8 +1781,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
- skb->dev, dev,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, dev,
ipmr_forward_finish);
return;
diff --git a/kernel/net/ipv4/netfilter.c b/kernel/net/ipv4/netfilter.c
index 65de0684e..c3776ff67 100644
--- a/kernel/net/ipv4/netfilter.c
+++ b/kernel/net/ipv4/netfilter.c
@@ -17,9 +17,8 @@
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
+int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi4 fl4 = {};
@@ -104,7 +103,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb,
}
}
-static int nf_ip_reroute(struct sk_buff *skb,
+static int nf_ip_reroute(struct net *net, struct sk_buff *skb,
const struct nf_queue_entry *entry)
{
const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
@@ -116,7 +115,7 @@ static int nf_ip_reroute(struct sk_buff *skb,
skb->mark == rt_info->mark &&
iph->daddr == rt_info->daddr &&
iph->saddr == rt_info->saddr))
- return ip_route_me_harder(skb, RTN_UNSPEC);
+ return ip_route_me_harder(net, skb, RTN_UNSPEC);
}
return 0;
}
@@ -197,11 +196,4 @@ static int __init ipv4_netfilter_init(void)
{
return nf_register_afinfo(&nf_ip_afinfo);
}
-
-static void __exit ipv4_netfilter_fini(void)
-{
- nf_unregister_afinfo(&nf_ip_afinfo);
-}
-
-module_init(ipv4_netfilter_init);
-module_exit(ipv4_netfilter_fini);
+subsys_initcall(ipv4_netfilter_init);
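Caller sketch after this signature change (hypothetical hook/target fragment): the netns is passed in explicitly, typically from nf_hook_state, rather than being derived from skb_dst():

	if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
		return NF_DROP;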
diff --git a/kernel/net/ipv4/netfilter/Kconfig b/kernel/net/ipv4/netfilter/Kconfig
index fb20f3631..c187c60e3 100644
--- a/kernel/net/ipv4/netfilter/Kconfig
+++ b/kernel/net/ipv4/netfilter/Kconfig
@@ -58,6 +58,13 @@ config NFT_REJECT_IPV4
default NFT_REJECT
tristate
+config NFT_DUP_IPV4
+ tristate "IPv4 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ select NF_DUP_IPV4
+ help
+ This module enables IPv4 packet duplication support for nf_tables.
+
endif # NF_TABLES_IPV4
config NF_TABLES_ARP
@@ -67,6 +74,13 @@ config NF_TABLES_ARP
endif # NF_TABLES
+config NF_DUP_IPV4
+ tristate "Netfilter IPv4 packet duplication to alternate destination"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ help
+ This option enables the nf_dup_ipv4 core, which duplicates an IPv4
+ packet to be rerouted to another destination.
+
config NF_LOG_ARP
tristate "ARP packet logging"
default m if NETFILTER_ADVANCED=n
@@ -195,7 +209,8 @@ config IP_NF_MATCH_ECN
config IP_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
- depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW)
+ depends on NETFILTER_ADVANCED
+ depends on IP_NF_MANGLE || IP_NF_RAW
---help---
This option allows you to match packets whose replies would
go out via the interface the packet came in.
diff --git a/kernel/net/ipv4/netfilter/Makefile b/kernel/net/ipv4/netfilter/Makefile
index 7fe6c7035..87b073da1 100644
--- a/kernel/net/ipv4/netfilter/Makefile
+++ b/kernel/net/ipv4/netfilter/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
+obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
# generic IP tables
@@ -70,3 +71,5 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
# just filtering instance of ARP tables for now
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
+
+obj-$(CONFIG_NF_DUP_IPV4) += nf_dup_ipv4.o
diff --git a/kernel/net/ipv4/netfilter/arp_tables.c b/kernel/net/ipv4/netfilter/arp_tables.c
index a61200754..11dccba47 100644
--- a/kernel/net/ipv4/netfilter/arp_tables.c
+++ b/kernel/net/ipv4/netfilter/arp_tables.c
@@ -186,7 +186,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, arpinfo->iniface,
- arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+ arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : "");
return 0;
}
@@ -195,7 +195,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, arpinfo->outiface,
- arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+ arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : "");
return 0;
}
@@ -240,23 +240,24 @@ get_entry(const void *base, unsigned int offset)
return (struct arpt_entry *)(base + offset);
}
-static inline __pure
+static inline
struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
{
return (void *)entry + entry->next_offset;
}
unsigned int arpt_do_table(struct sk_buff *skb,
- unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table)
{
+ unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
unsigned int verdict = NF_DROP;
const struct arphdr *arp;
- struct arpt_entry *e, *back;
+ struct arpt_entry *e, **jumpstack;
const char *indev, *outdev;
- void *table_base;
+ const void *table_base;
+ unsigned int cpu, stackidx = 0;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
@@ -270,16 +271,21 @@ unsigned int arpt_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
+ cpu = smp_processor_id();
/*
* Ensure we load private-> members after we've fetched the base
* pointer.
*/
smp_read_barrier_depends();
- table_base = private->entries[smp_processor_id()];
+ table_base = private->entries;
+ jumpstack = (struct arpt_entry **)private->jumpstack[cpu];
+ /* No TEE support for arptables, so no need to switch to alternate
+ * stack. All targets that reenter must return absolute verdicts.
+ */
e = get_entry(table_base, private->hook_entry[hook]);
- back = get_entry(table_base, private->underflow[hook]);
+ acpar.net = state->net;
acpar.in = state->in;
acpar.out = state->out;
acpar.hooknum = hook;
@@ -289,13 +295,15 @@ unsigned int arpt_do_table(struct sk_buff *skb,
arp = arp_hdr(skb);
do {
const struct xt_entry_target *t;
+ struct xt_counters *counter;
if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
e = arpt_next_entry(e);
continue;
}
- ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1);
t = arpt_get_target_c(e);
@@ -310,27 +318,24 @@ unsigned int arpt_do_table(struct sk_buff *skb,
verdict = (unsigned int)(-v) - 1;
break;
}
- e = back;
- back = get_entry(table_base, back->comefrom);
+ if (stackidx == 0) {
+ e = get_entry(table_base,
+ private->underflow[hook]);
+ } else {
+ e = jumpstack[--stackidx];
+ e = arpt_next_entry(e);
+ }
continue;
}
if (table_base + v
!= arpt_next_entry(e)) {
- /* Save old back ptr in next entry */
- struct arpt_entry *next = arpt_next_entry(e);
- next->comefrom = (void *)back - table_base;
-
- /* set back pointer to next entry */
- back = next;
+ jumpstack[stackidx++] = e;
}
e = get_entry(table_base, v);
continue;
}
- /* Targets which reenter must return
- * abs. verdicts
- */
acpar.target = t->u.kernel.target;
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
@@ -463,7 +468,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
- next:
+next:
duprintf("Finished chain %u\n", hook);
}
return 1;
@@ -521,6 +526,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
if (ret)
return ret;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
t->u.user.revision);
@@ -538,6 +547,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
err:
module_put(t->u.kernel.target->me);
out:
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -614,13 +625,14 @@ static inline void cleanup_entry(struct arpt_entry *e)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
+ xt_percpu_counter_free(e->counters.pcnt);
}
/* Checks and translates the user-supplied table segment (held in
* newinfo).
*/
static int translate_table(struct xt_table_info *newinfo, void *entry0,
- const struct arpt_replace *repl)
+ const struct arpt_replace *repl)
{
struct arpt_entry *iter;
unsigned int i;
@@ -702,12 +714,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
- }
-
return ret;
}
@@ -722,14 +728,16 @@ static void get_counters(const struct xt_table_info *t,
seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
u64 bcnt, pcnt;
unsigned int start;
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -774,7 +782,7 @@ static int copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
/* ... then copy entire thing ... */
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
@@ -863,16 +871,16 @@ static int compat_table_info(const struct xt_table_info *info,
struct xt_table_info *newinfo)
{
struct arpt_entry *iter;
- void *loc_cpu_entry;
+ const void *loc_cpu_entry;
int ret;
if (!newinfo || !info)
return -EINVAL;
- /* we dont care about newinfo->entries[] */
+ /* we dont care about newinfo->entries */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries;
xt_compat_init_offsets(NFPROTO_ARP, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -884,7 +892,7 @@ static int compat_table_info(const struct xt_table_info *info,
#endif
static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+ const int *len, int compat)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1037,7 +1045,7 @@ static int __do_replace(struct net *net, const char *name,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ loc_cpu_old_entry = oldinfo->entries;
xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
cleanup_entry(iter);
@@ -1061,7 +1069,7 @@ static int __do_replace(struct net *net, const char *name,
}
static int do_replace(struct net *net, const void __user *user,
- unsigned int len)
+ unsigned int len)
{
int ret;
struct arpt_replace tmp;
@@ -1084,8 +1092,7 @@ static int do_replace(struct net *net, const void __user *user,
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1115,7 +1122,7 @@ static int do_replace(struct net *net, const void __user *user,
static int do_add_counters(struct net *net, const void __user *user,
unsigned int len, int compat)
{
- unsigned int i, curcpu;
+ unsigned int i;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1125,7 +1132,6 @@ static int do_add_counters(struct net *net, const void __user *user,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- void *loc_cpu_entry;
struct arpt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
@@ -1181,12 +1187,13 @@ static int do_add_counters(struct net *net, const void __user *user,
}
i = 0;
- /* Choose the copy that is on our node */
- curcpu = smp_processor_id();
- loc_cpu_entry = private->entries[curcpu];
+
addend = xt_write_recseq_begin();
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
++i;
}
xt_write_recseq_end(addend);
@@ -1396,7 +1403,7 @@ static int translate_compat_table(const char *name,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries;
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1416,9 +1423,17 @@ static int translate_compat_table(const char *name,
i = 0;
xt_entry_foreach(iter1, entry1, newinfo->size) {
+ iter1->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(iter1->counters.pcnt)) {
+ ret = -ENOMEM;
+ break;
+ }
+
ret = check_target(iter1, name);
- if (ret != 0)
+ if (ret != 0) {
+ xt_percpu_counter_free(iter1->counters.pcnt);
break;
+ }
++i;
if (strcmp(arpt_get_target(iter1)->u.user.name,
XT_ERROR_TARGET) == 0)
@@ -1448,11 +1463,6 @@ static int translate_compat_table(const char *name,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
- if (newinfo->entries[i] && newinfo->entries[i] != entry1)
- memcpy(newinfo->entries[i], entry1, newinfo->size);
-
*pinfo = newinfo;
*pentry0 = entry1;
xt_free_table_info(info);
@@ -1511,8 +1521,7 @@ static int compat_do_replace(struct net *net, void __user *user,
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1609,7 +1618,6 @@ static int compat_copy_entries_to_user(unsigned int total_size,
void __user *pos;
unsigned int size;
int ret = 0;
- void *loc_cpu_entry;
unsigned int i = 0;
struct arpt_entry *iter;
@@ -1617,11 +1625,9 @@ static int compat_copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy on our node/cpu */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ xt_entry_foreach(iter, private->entries, total_size) {
ret = compat_copy_entry_to_user(iter, &pos,
&size, counters, i++);
if (ret != 0)
@@ -1790,8 +1796,7 @@ struct xt_table *arpt_register_table(struct net *net,
goto out;
}
- /* choose the copy on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(newinfo, loc_cpu_entry, repl);
@@ -1822,7 +1827,7 @@ void arpt_unregister_table(struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter);
if (private->number > private->initial_entries)
diff --git a/kernel/net/ipv4/netfilter/arptable_filter.c b/kernel/net/ipv4/netfilter/arptable_filter.c
index 93876d031..1897ee160 100644
--- a/kernel/net/ipv4/netfilter/arptable_filter.c
+++ b/kernel/net/ipv4/netfilter/arptable_filter.c
@@ -27,13 +27,10 @@ static const struct xt_table packet_filter = {
/* The work comes in here from netfilter.c */
static unsigned int
-arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+arptable_filter_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net = dev_net(state->in ? state->in : state->out);
-
- return arpt_do_table(skb, ops->hooknum, state,
- net->ipv4.arptable_filter);
+ return arpt_do_table(skb, state, state->net->ipv4.arptable_filter);
}
static struct nf_hook_ops *arpfilter_ops __read_mostly;
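The same pattern for any other table would look like the hypothetical hook below (sketch only, mirroring the conversion above and the ipt_do_table() change in the ip_tables.c hunks that follow): the hook number and the netns now come from nf_hook_state instead of the ops argument.

	static unsigned int example_filter_hook(void *priv, struct sk_buff *skb,
						const struct nf_hook_state *state)
	{
		return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
	}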
diff --git a/kernel/net/ipv4/netfilter/ip_tables.c b/kernel/net/ipv4/netfilter/ip_tables.c
index 2d0e265fe..b99affad6 100644
--- a/kernel/net/ipv4/netfilter/ip_tables.c
+++ b/kernel/net/ipv4/netfilter/ip_tables.c
@@ -102,7 +102,7 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, ipinfo->iniface,
- ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+ ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : "");
return false;
}
@@ -111,7 +111,7 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, ipinfo->outiface,
- ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+ ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : "");
return false;
}
@@ -120,7 +120,7 @@ ip_packet_match(const struct iphdr *ip,
FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
dprintf("Packet protocol %hi does not match %hi.%s\n",
ip->protocol, ipinfo->proto,
- ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+ ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : "");
return false;
}
@@ -246,7 +246,8 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
return 0;
}
-static void trace_packet(const struct sk_buff *skb,
+static void trace_packet(struct net *net,
+ const struct sk_buff *skb,
unsigned int hook,
const struct net_device *in,
const struct net_device *out,
@@ -254,15 +255,12 @@ static void trace_packet(const struct sk_buff *skb,
const struct xt_table_info *private,
const struct ipt_entry *e)
{
- const void *table_base;
const struct ipt_entry *root;
const char *hookname, *chainname, *comment;
const struct ipt_entry *iter;
unsigned int rulenum = 0;
- struct net *net = dev_net(in ? in : out);
- table_base = private->entries[smp_processor_id()];
- root = get_entry(table_base, private->hook_entry[hook]);
+ root = get_entry(private->entries, private->hook_entry[hook]);
hookname = chainname = hooknames[hook];
comment = comments[NF_IP_TRACE_COMMENT_RULE];
@@ -278,7 +276,7 @@ static void trace_packet(const struct sk_buff *skb,
}
#endif
-static inline __pure
+static inline
struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
{
return (void *)entry + entry->next_offset;
@@ -287,10 +285,10 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
ipt_do_table(struct sk_buff *skb,
- unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table)
{
+ unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
/* Initializing verdict to NF_DROP keeps gcc happy. */
@@ -298,12 +296,13 @@ ipt_do_table(struct sk_buff *skb,
const char *indev, *outdev;
const void *table_base;
struct ipt_entry *e, **jumpstack;
- unsigned int *stackptr, origptr, cpu;
+ unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
/* Initialization */
+ stackidx = 0;
ip = ip_hdr(skb);
indev = state->in ? state->in->name : nulldevname;
outdev = state->out ? state->out->name : nulldevname;
@@ -316,6 +315,7 @@ ipt_do_table(struct sk_buff *skb,
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
+ acpar.net = state->net;
acpar.in = state->in;
acpar.out = state->out;
acpar.family = NFPROTO_IPV4;
@@ -331,20 +331,29 @@ ipt_do_table(struct sk_buff *skb,
* pointer.
*/
smp_read_barrier_depends();
- table_base = private->entries[cpu];
+ table_base = private->entries;
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
- stackptr = per_cpu_ptr(private->stackptr, cpu);
- origptr = *stackptr;
+
+ /* Switch to alternate jumpstack if we're being invoked via TEE.
+ * TEE issues XT_CONTINUE verdict on original skb so we must not
+ * clobber the jumpstack.
+ *
+ * For recursion via REJECT or SYNPROXY the stack will be clobbered
+ * but it is no problem since absolute verdict is issued by these.
+ */
+ if (static_key_false(&xt_tee_enabled))
+ jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
e = get_entry(table_base, private->hook_entry[hook]);
- pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
- table->name, hook, origptr,
+ pr_debug("Entering %s(hook %u), UF %p\n",
+ table->name, hook,
get_entry(table_base, private->underflow[hook]));
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
+ struct xt_counters *counter;
IP_NF_ASSERT(e);
if (!ip_packet_match(ip, indev, outdev,
@@ -361,7 +370,8 @@ ipt_do_table(struct sk_buff *skb,
goto no_match;
}
- ADD_COUNTER(e->counters, skb->len, 1);
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, skb->len, 1);
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -369,8 +379,8 @@ ipt_do_table(struct sk_buff *skb,
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
- trace_packet(skb, hook, state->in, state->out,
- table->name, private, e);
+ trace_packet(state->net, skb, hook, state->in,
+ state->out, table->name, private, e);
#endif
/* Standard target? */
if (!t->u.kernel.target->target) {
@@ -383,28 +393,24 @@ ipt_do_table(struct sk_buff *skb,
verdict = (unsigned int)(-v) - 1;
break;
}
- if (*stackptr <= origptr) {
+ if (stackidx == 0) {
e = get_entry(table_base,
private->underflow[hook]);
pr_debug("Underflow (this is normal) "
"to %p\n", e);
} else {
- e = jumpstack[--*stackptr];
+ e = jumpstack[--stackidx];
pr_debug("Pulled %p out from pos %u\n",
- e, *stackptr);
+ e, stackidx);
e = ipt_next_entry(e);
}
continue;
}
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
- if (*stackptr >= private->stacksize) {
- verdict = NF_DROP;
- break;
- }
- jumpstack[(*stackptr)++] = e;
+ jumpstack[stackidx++] = e;
pr_debug("Pushed %p into pos %u\n",
- e, *stackptr - 1);
+ e, stackidx - 1);
}
e = get_entry(table_base, v);
@@ -423,11 +429,10 @@ ipt_do_table(struct sk_buff *skb,
/* Verdict */
break;
} while (!acpar.hotdrop);
- pr_debug("Exiting %s; resetting sp from %u to %u\n",
- __func__, *stackptr, origptr);
- *stackptr = origptr;
- xt_write_recseq_end(addend);
- local_bh_enable();
+ pr_debug("Exiting %s; sp at %u\n", __func__, stackidx);
+
+ xt_write_recseq_end(addend);
+ local_bh_enable();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -479,7 +484,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
- XT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
@@ -544,7 +549,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
- next:
+next:
duprintf("Finished chain %u\n", hook);
}
return 1;
@@ -665,6 +670,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
if (ret)
return ret;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
j = 0;
mtpar.net = net;
mtpar.table = name;
@@ -691,6 +700,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
ret = check_target(e, net, name);
if (ret)
goto err;
+
return 0;
err:
module_put(t->u.kernel.target->me);
@@ -700,6 +710,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
break;
cleanup_match(ematch, net);
}
+
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -784,13 +797,14 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
+ xt_percpu_counter_free(e->counters.pcnt);
}
/* Checks and translates the user-supplied table segment (held in
newinfo) */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
- const struct ipt_replace *repl)
+ const struct ipt_replace *repl)
{
struct ipt_entry *iter;
unsigned int i;
@@ -866,12 +880,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
- }
-
return ret;
}
@@ -887,14 +895,16 @@ get_counters(const struct xt_table_info *t,
seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
u64 bcnt, pcnt;
unsigned int start;
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -939,11 +949,7 @@ copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
@@ -1051,16 +1057,16 @@ static int compat_table_info(const struct xt_table_info *info,
struct xt_table_info *newinfo)
{
struct ipt_entry *iter;
- void *loc_cpu_entry;
+ const void *loc_cpu_entry;
int ret;
if (!newinfo || !info)
return -EINVAL;
- /* we dont care about newinfo->entries[] */
+ /* we dont care about newinfo->entries */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries;
xt_compat_init_offsets(AF_INET, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1072,7 +1078,7 @@ static int compat_table_info(const struct xt_table_info *info,
#endif
static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+ const int *len, int compat)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1181,7 +1187,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct xt_table *t;
struct xt_table_info *oldinfo;
struct xt_counters *counters;
- void *loc_cpu_old_entry;
struct ipt_entry *iter;
ret = 0;
@@ -1224,8 +1229,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
cleanup_entry(iter, net);
xt_free_table_info(oldinfo);
@@ -1271,8 +1275,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1301,9 +1304,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
static int
do_add_counters(struct net *net, const void __user *user,
- unsigned int len, int compat)
+ unsigned int len, int compat)
{
- unsigned int i, curcpu;
+ unsigned int i;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1313,7 +1316,6 @@ do_add_counters(struct net *net, const void __user *user,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- void *loc_cpu_entry;
struct ipt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
@@ -1369,12 +1371,12 @@ do_add_counters(struct net *net, const void __user *user,
}
i = 0;
- /* Choose the copy that is on our node */
- curcpu = smp_processor_id();
- loc_cpu_entry = private->entries[curcpu];
addend = xt_write_recseq_begin();
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
++i;
}
xt_write_recseq_end(addend);
@@ -1444,7 +1446,6 @@ static int
compat_find_calc_match(struct xt_entry_match *m,
const char *name,
const struct ipt_ip *ip,
- unsigned int hookmask,
int *size)
{
struct xt_match *match;
@@ -1513,8 +1514,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
entry_offset = (void *)e - (void *)base;
j = 0;
xt_ematch_foreach(ematch, e) {
- ret = compat_find_calc_match(ematch, name,
- &e->ip, e->comefrom, &off);
+ ret = compat_find_calc_match(ematch, name, &e->ip, &off);
if (ret != 0)
goto release_matches;
++j;
@@ -1610,6 +1610,10 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
unsigned int j;
int ret = 0;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
j = 0;
mtpar.net = net;
mtpar.table = name;
@@ -1634,6 +1638,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
break;
cleanup_match(ematch, net);
}
+
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -1718,7 +1725,7 @@ translate_compat_table(struct net *net,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries;
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1770,11 +1777,6 @@ translate_compat_table(struct net *net,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
- if (newinfo->entries[i] && newinfo->entries[i] != entry1)
- memcpy(newinfo->entries[i], entry1, newinfo->size);
-
*pinfo = newinfo;
*pentry0 = entry1;
xt_free_table_info(info);
@@ -1821,8 +1823,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1893,7 +1894,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
void __user *pos;
unsigned int size;
int ret = 0;
- const void *loc_cpu_entry;
unsigned int i = 0;
struct ipt_entry *iter;
@@ -1901,14 +1901,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ xt_entry_foreach(iter, private->entries, total_size) {
ret = compat_copy_entry_to_user(iter, &pos,
&size, counters, i++);
if (ret != 0)
@@ -2083,8 +2078,7 @@ struct xt_table *ipt_register_table(struct net *net,
goto out;
}
- /* choose the copy on our node/cpu, but dont care about preemption */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2115,7 +2109,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter, net);
if (private->number > private->initial_entries)
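The ip_tables.c hunks above follow the xtables switch from one full rule-blob copy per CPU to a single shared blob whose entries carry per-CPU counters (xt_percpu_counter_alloc / xt_get_this_cpu_counter, folded by get_counters()), plus a jumpstack offset so a TEE-duplicated packet cannot clobber the stack of the original traversal. As a rough userspace analogy only — the struct and function names below are invented and this is not the kernel API — the counting scheme looks like: each context bumps its own slot locklessly, and a reader sums every slot when totals are requested.

/* analogy: one counter slot per "cpu"; the hot path touches only its own
 * slot, the reader folds all slots, mirroring get_counters() after the
 * per-CPU counter conversion. */
#include <stdio.h>

#define NR_CPUS 4

struct pkt_counter {
	unsigned long long packets;
	unsigned long long bytes;
};

static struct pkt_counter percpu[NR_CPUS];

static void count_packet(int cpu, unsigned int len)
{
	percpu[cpu].packets += 1;	/* no lock: the slot is private to this cpu */
	percpu[cpu].bytes += len;
}

static struct pkt_counter fold_counters(void)
{
	struct pkt_counter total = { 0, 0 };
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {	/* reader walks every slot */
		total.packets += percpu[cpu].packets;
		total.bytes += percpu[cpu].bytes;
	}
	return total;
}

int main(void)
{
	struct pkt_counter t;

	count_packet(0, 1500);
	count_packet(2, 40);
	t = fold_counters();
	printf("%llu packets, %llu bytes\n", t.packets, t.bytes);
	return 0;
}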
diff --git a/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c b/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 771ab3d01..4a9e6db9d 100644
--- a/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -367,6 +367,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
struct clusterip_config *config;
int ret;
+ if (par->nft_compat) {
+ pr_err("cannot use CLUSTERIP target from nftables compat\n");
+ return -EOPNOTSUPP;
+ }
+
if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
@@ -487,14 +492,14 @@ static void arp_print(struct arp_payload *payload)
{
#define HBUFFERLEN 30
char hbuffer[HBUFFERLEN];
- int j,k;
+ int j, k;
- for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) {
hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
- hbuffer[k++]=':';
+ hbuffer[k++] = ':';
}
- hbuffer[--k]='\0';
+ hbuffer[--k] = '\0';
pr_debug("src %pI4@%s, dst %pI4\n",
&payload->src_ip, hbuffer, &payload->dst_ip);
@@ -502,14 +507,14 @@ static void arp_print(struct arp_payload *payload)
#endif
static unsigned int
-arp_mangle(const struct nf_hook_ops *ops,
+arp_mangle(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
- struct net *net = dev_net(state->in ? state->in : state->out);
+ struct net *net = state->net;
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
diff --git a/kernel/net/ipv4/netfilter/ipt_ECN.c b/kernel/net/ipv4/netfilter/ipt_ECN.c
index 4bf3dc49a..270765236 100644
--- a/kernel/net/ipv4/netfilter/ipt_ECN.c
+++ b/kernel/net/ipv4/netfilter/ipt_ECN.c
@@ -72,7 +72,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
tcph->cwr = einfo->proto.tcp.cwr;
inet_proto_csum_replace2(&tcph->check, skb,
- oldval, ((__be16 *)tcph)[6], 0);
+ oldval, ((__be16 *)tcph)[6], false);
return true;
}
diff --git a/kernel/net/ipv4/netfilter/ipt_REJECT.c b/kernel/net/ipv4/netfilter/ipt_REJECT.c
index 87907d4bd..1d16c0f28 100644
--- a/kernel/net/ipv4/netfilter/ipt_REJECT.c
+++ b/kernel/net/ipv4/netfilter/ipt_REJECT.c
@@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(skb, hook);
+ nf_send_reset(par->net, skb, hook);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c b/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c
index e9e677930..5fdc55651 100644
--- a/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -18,7 +18,7 @@
#include <net/netfilter/nf_conntrack_synproxy.h>
static struct iphdr *
-synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
+synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
struct iphdr *iph;
@@ -39,11 +39,14 @@ synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
}
static void
-synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+synproxy_send_tcp(const struct synproxy_net *snet,
+ const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct iphdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
+ struct net *net = nf_ct_net(snet->tmpl);
+
nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
nskb->ip_summed = CHECKSUM_PARTIAL;
nskb->csum_start = (unsigned char *)nth - nskb->head;
@@ -51,7 +54,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
skb_dst_set_noref(nskb, skb_dst(skb));
nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
if (nfct) {
@@ -60,7 +63,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
nf_conntrack_get(nfct);
}
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
free_nskb:
@@ -68,7 +71,8 @@ free_nskb:
}
static void
-synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+synproxy_send_client_synack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
@@ -104,7 +108,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
@@ -148,7 +152,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
niph, nth, tcp_hdr_size);
}
@@ -188,7 +192,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
static void
@@ -220,13 +224,14 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
nth->ack_seq = th->ack_seq;
tcp_flag_word(nth) = TCP_FLAG_ACK;
nth->doff = tcp_hdr_size / 4;
- nth->window = ntohs(htons(th->window) >> opts->wscale);
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
}
static bool
@@ -257,7 +262,7 @@ static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+ struct synproxy_net *snet = synproxy_pernet(par->net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
@@ -286,7 +291,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(skb, th, &opts);
+ synproxy_send_client_synack(snet, skb, th, &opts);
return NF_DROP;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
@@ -298,11 +303,11 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
+static unsigned int ipv4_synproxy_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *nhs)
{
- struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out));
+ struct synproxy_net *snet = synproxy_pernet(nhs->net);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
@@ -432,14 +437,12 @@ static struct xt_target synproxy_tg4_reg __read_mostly = {
static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
{
.hook = ipv4_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
},
{
.hook = ipv4_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
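One small detail in the synproxy_send_client_ack() hunk: the advertised window is now written as htons(ntohs(th->window) >> opts->wscale), i.e. the scaling shift is applied to the host-order value and the result converted back to network order. A standalone refresher — not kernel code — on why the shift has to happen on the host-order value:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	uint16_t w_net = htons(8193);	/* window as carried in the TCP header */
	unsigned int wscale = 2;

	/* convert to host order, scale, convert back */
	uint16_t scaled = htons((uint16_t)(ntohs(w_net) >> wscale));

	/* shifting the raw network-order representation mangles the value
	 * on little-endian machines */
	uint16_t raw = (uint16_t)(w_net >> wscale);

	printf("host-order shift: %u, raw shift: %u\n",
	       ntohs(scaled), ntohs(raw));
	return 0;
}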
diff --git a/kernel/net/ipv4/netfilter/ipt_ah.c b/kernel/net/ipv4/netfilter/ipt_ah.c
index 14a2aa8b8..a787d07f6 100644
--- a/kernel/net/ipv4/netfilter/ipt_ah.c
+++ b/kernel/net/ipv4/netfilter/ipt_ah.c
@@ -25,7 +25,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
bool r;
pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
invert ? '!' : ' ', min, spi, max);
- r=(spi >= min && spi <= max) ^ invert;
+ r = (spi >= min && spi <= max) ^ invert;
pr_debug(" result %s\n", r ? "PASS" : "FAILED");
return r;
}
diff --git a/kernel/net/ipv4/netfilter/ipt_rpfilter.c b/kernel/net/ipv4/netfilter/ipt_rpfilter.c
index 4bfaedf9b..78cc64edd 100644
--- a/kernel/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/kernel/net/ipv4/netfilter/ipt_rpfilter.c
@@ -32,15 +32,14 @@ static __be32 rpfilter_get_saddr(__be32 addr)
return addr;
}
-static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
+static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
const struct net_device *dev, u8 flags)
{
struct fib_result res;
bool dev_match;
- struct net *net = dev_net(dev);
int ret __maybe_unused;
- if (fib_lookup(net, fl4, &res))
+ if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
return false;
if (res.type != RTN_UNICAST) {
@@ -61,9 +60,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
if (FIB_RES_DEV(res) == dev)
dev_match = true;
#endif
- if (dev_match || flags & XT_RPFILTER_LOOSE)
- return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
- return dev_match;
+ return dev_match || flags & XT_RPFILTER_LOOSE;
}
static bool rpfilter_is_local(const struct sk_buff *skb)
@@ -98,7 +95,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.flowi4_tos = RT_TOS(iph->tos);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
- return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
+ return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert;
}
static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/kernel/net/ipv4/netfilter/iptable_filter.c b/kernel/net/ipv4/netfilter/iptable_filter.c
index a0f3beca5..397ef2dd1 100644
--- a/kernel/net/ipv4/netfilter/iptable_filter.c
+++ b/kernel/net/ipv4/netfilter/iptable_filter.c
@@ -33,19 +33,16 @@ static const struct xt_table packet_filter = {
};
static unsigned int
-iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_filter_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net;
-
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (state->hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net(state->in ? state->in : state->out);
- return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
}
static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/kernel/net/ipv4/netfilter/iptable_mangle.c b/kernel/net/ipv4/netfilter/iptable_mangle.c
index 62cbb8c5f..ba5d392a1 100644
--- a/kernel/net/ipv4/netfilter/iptable_mangle.c
+++ b/kernel/net/ipv4/netfilter/iptable_mangle.c
@@ -39,7 +39,6 @@ static const struct xt_table packet_mangler = {
static unsigned int
ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
{
- struct net_device *out = state->out;
unsigned int ret;
const struct iphdr *iph;
u_int8_t tos;
@@ -59,8 +58,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
daddr = iph->daddr;
tos = iph->tos;
- ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state,
- dev_net(out)->ipv4.iptable_mangle);
+ ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
/* Reroute for ANY change. */
if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
@@ -69,7 +67,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos) {
- err = ip_route_me_harder(skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -80,18 +78,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_mangle_hook(const struct nf_hook_ops *ops,
+iptable_mangle_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (ops->hooknum == NF_INET_LOCAL_OUT)
+ if (state->hook == NF_INET_LOCAL_OUT)
return ipt_mangle_out(skb, state);
- if (ops->hooknum == NF_INET_POST_ROUTING)
- return ipt_do_table(skb, ops->hooknum, state,
- dev_net(state->out)->ipv4.iptable_mangle);
+ if (state->hook == NF_INET_POST_ROUTING)
+ return ipt_do_table(skb, state,
+ state->net->ipv4.iptable_mangle);
/* PREROUTING/INPUT/FORWARD: */
- return ipt_do_table(skb, ops->hooknum, state,
- dev_net(state->in)->ipv4.iptable_mangle);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
}
static struct nf_hook_ops *mangle_ops __read_mostly;
diff --git a/kernel/net/ipv4/netfilter/iptable_nat.c b/kernel/net/ipv4/netfilter/iptable_nat.c
index 0d4d9cdf9..ae2cd2752 100644
--- a/kernel/net/ipv4/netfilter/iptable_nat.c
+++ b/kernel/net/ipv4/netfilter/iptable_nat.c
@@ -28,49 +28,46 @@ static const struct xt_table nf_nat_ipv4_table = {
.af = NFPROTO_IPV4,
};
-static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_do_chain(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct)
{
- struct net *net = nf_ct_net(ct);
-
- return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table);
+ return ipt_do_table(skb, state, state->net->ipv4.nat_table);
}
-static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain);
}
-static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain);
}
-static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_out(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain);
}
-static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
}
static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
@@ -78,7 +75,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = iptable_nat_ipv4_out,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
@@ -86,7 +82,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_local_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
@@ -94,7 +89,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = iptable_nat_ipv4_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
diff --git a/kernel/net/ipv4/netfilter/iptable_raw.c b/kernel/net/ipv4/netfilter/iptable_raw.c
index 0356e6da4..1ba02811a 100644
--- a/kernel/net/ipv4/netfilter/iptable_raw.c
+++ b/kernel/net/ipv4/netfilter/iptable_raw.c
@@ -20,19 +20,16 @@ static const struct xt_table packet_raw = {
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_raw_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net;
-
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (state->hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net(state->in ? state->in : state->out);
- return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_raw);
}
static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/kernel/net/ipv4/netfilter/iptable_security.c b/kernel/net/ipv4/netfilter/iptable_security.c
index 4bce3980c..c2e23d5e9 100644
--- a/kernel/net/ipv4/netfilter/iptable_security.c
+++ b/kernel/net/ipv4/netfilter/iptable_security.c
@@ -37,20 +37,16 @@ static const struct xt_table security_table = {
};
static unsigned int
-iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_security_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net;
-
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (state->hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* Somebody is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net(state->in ? state->in : state->out);
- return ipt_do_table(skb, ops->hooknum, state,
- net->ipv4.iptable_security);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_security);
}
static struct nf_hook_ops *sectbl_ops __read_mostly;
@@ -83,7 +79,7 @@ static int __init iptable_security_init(void)
int ret;
ret = register_pernet_subsys(&iptable_security_net_ops);
- if (ret < 0)
+ if (ret < 0)
return ret;
sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
diff --git a/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 30ad9554b..461ca926f 100644
--- a/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
return NF_ACCEPT;
}
-static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
+static unsigned int ipv4_helper(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -119,7 +119,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
ct, ctinfo);
}
-static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
+static unsigned int ipv4_confirm(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -143,14 +143,14 @@ out:
return nf_conntrack_confirm(skb);
}
-static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
-static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_local(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -158,7 +158,7 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
/* Connection tracking may drop packets, but never alters them, so
@@ -166,42 +166,36 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
{
.hook = ipv4_conntrack_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_conntrack_local,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
.hook = ipv4_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
@@ -280,7 +274,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
return -EINVAL;
}
- h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
+ h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
if (h) {
struct sockaddr_in sin;
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
diff --git a/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 80d5554b9..c567e1b5d 100644
--- a/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -30,7 +30,7 @@ static inline struct nf_icmp_net *icmp_pernet(struct net *net)
}
static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct icmphdr *hp;
struct icmphdr _hdr;
@@ -134,15 +134,17 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_l4proto *innerproto;
const struct nf_conntrack_tuple_hash *h;
- u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ const struct nf_conntrack_zone *zone;
+ struct nf_conntrack_zone tmp;
NF_CT_ASSERT(skb->nfct == NULL);
+ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */
if (!nf_ct_get_tuplepr(skb,
skb_network_offset(skb) + ip_hdrlen(skb)
+ sizeof(struct icmphdr),
- PF_INET, &origtuple)) {
+ PF_INET, net, &origtuple)) {
pr_debug("icmp_error_message: failed to get tuple\n");
return -NF_ACCEPT;
}
diff --git a/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c b/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c
index c88b7d434..a04dee536 100644
--- a/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -22,14 +22,13 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
-static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
+ u_int32_t user)
{
int err;
- skb_orphan(skb);
-
local_bh_disable();
- err = ip_defrag(skb, user);
+ err = ip_defrag(net, skb, user);
local_bh_enable();
if (!err) {
@@ -43,33 +42,32 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
struct sk_buff *skb)
{
- u16 zone = NF_CT_DEFAULT_ZONE;
-
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- if (skb->nfct)
- zone = nf_ct_zone((struct nf_conn *)skb->nfct);
-#endif
+ if (skb->nfct) {
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge &&
- skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
- return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+ }
#endif
+ if (nf_bridge_in_prerouting(skb))
+ return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;
+
if (hooknum == NF_INET_PRE_ROUTING)
- return IP_DEFRAG_CONNTRACK_IN + zone;
+ return IP_DEFRAG_CONNTRACK_IN + zone_id;
else
- return IP_DEFRAG_CONNTRACK_OUT + zone;
+ return IP_DEFRAG_CONNTRACK_OUT + zone_id;
}
-static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_defrag(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (sk && (sk->sk_family == PF_INET) &&
- inet->nodefrag)
+ if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
+ inet_sk(sk)->nodefrag)
return NF_ACCEPT;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -83,9 +81,9 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
/* Gather fragments. */
if (ip_is_fragment(ip_hdr(skb))) {
enum ip_defrag_users user =
- nf_ct_defrag_user(ops->hooknum, skb);
+ nf_ct_defrag_user(state->hook, skb);
- if (nf_ct_ipv4_gather_frags(skb, user))
+ if (nf_ct_ipv4_gather_frags(state->net, skb, user))
return NF_STOLEN;
}
return NF_ACCEPT;
@@ -94,14 +92,12 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
static struct nf_hook_ops ipv4_defrag_ops[] = {
{
.hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
diff --git a/kernel/net/ipv4/netfilter/nf_dup_ipv4.c b/kernel/net/ipv4/netfilter/nf_dup_ipv4.c
new file mode 100644
index 000000000..ceb187308
--- /dev/null
+++ b/kernel/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -0,0 +1,106 @@
+/*
+ * (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
+ * (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Extracted from xt_TEE.c
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 or later, as
+ * published by the Free Software Foundation.
+ */
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
+ const struct in_addr *gw, int oif)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ memset(&fl4, 0, sizeof(fl4));
+ if (oif != -1)
+ fl4.flowi4_oif = oif;
+
+ fl4.daddr = gw->s_addr;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return false;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ skb->dev = rt->dst.dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ return true;
+}
+
+void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
+ const struct in_addr *gw, int oif)
+{
+ struct iphdr *iph;
+
+ if (this_cpu_read(nf_skb_duplicated))
+ return;
+ /*
+ * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
+ * the original skb, which should continue on its way as if nothing has
+ * happened. The copy should be independently delivered to the gateway.
+ */
+ skb = pskb_copy(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Avoid counting cloned packets towards the original connection. */
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get(skb->nfct);
+#endif
+ /*
+ * If we are in PREROUTING/INPUT, the checksum must be recalculated
+ * since the length could have changed as a result of defragmentation.
+ *
+ * We also decrease the TTL to mitigate potential loops between two
+ * hosts.
+ *
+ * Set %IP_DF so that the original source is notified of a potentially
+ * decreased MTU on the clone route. IPv6 does this too.
+ */
+ iph = ip_hdr(skb);
+ iph->frag_off |= htons(IP_DF);
+ if (hooknum == NF_INET_PRE_ROUTING ||
+ hooknum == NF_INET_LOCAL_IN)
+ --iph->ttl;
+ ip_send_check(iph);
+
+ if (nf_dup_ipv4_route(net, skb, gw, oif)) {
+ __this_cpu_write(nf_skb_duplicated, true);
+ ip_local_out(net, skb->sk, skb);
+ __this_cpu_write(nf_skb_duplicated, false);
+ } else {
+ kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_dup_ipv4);
+
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("nf_dup_ipv4: Duplicate IPv4 packet");
+MODULE_LICENSE("GPL");
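The comment block inside nf_dup_ipv4() above describes the core trick: clone the skb, re-route and re-inject the clone, and rely on the per-CPU nf_skb_duplicated flag to stop the clone from being duplicated again when it traverses the same hook. A compact userspace sketch of that re-entrancy guard, using a thread-local flag in place of a per-CPU variable (all names invented; not the kernel code):

#include <stdio.h>
#include <stdbool.h>

static _Thread_local bool duplicating;

static void inject(int depth);

static void duplicate(int depth)
{
	if (duplicating)		/* the clone re-entered the hook: stop here */
		return;

	duplicating = true;
	inject(depth + 1);		/* hand the copy back to the stack */
	duplicating = false;
}

static void inject(int depth)
{
	printf("packet seen at depth %d\n", depth);
	duplicate(depth);		/* hook runs again for the injected copy */
}

int main(void)
{
	inject(0);			/* prints depth 0 and depth 1, then stops */
	return 0;
}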
diff --git a/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index e59cc05c0..5075b7ecd 100644
--- a/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -120,7 +120,7 @@ static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
oldip = iph->daddr;
newip = t->dst.u3.ip;
}
- inet_proto_csum_replace4(check, skb, oldip, newip, 1);
+ inet_proto_csum_replace4(check, skb, oldip, newip, true);
}
static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
@@ -151,7 +151,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
}
} else
inet_proto_csum_replace2(check, skb,
- htons(oldlen), htons(datalen), 1);
+ htons(oldlen), htons(datalen), true);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -255,9 +255,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
unsigned int
-nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -266,7 +266,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
- enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
/* We never see fragments: conntrack defrags on pre-routing
* and local-out, and nf_nat_out protects post-routing.
@@ -295,7 +295,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
case IP_CT_RELATED_REPLY:
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- ops->hooknum))
+ state->hook))
return NF_DROP;
else
return NF_ACCEPT;
@@ -308,21 +308,21 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
if (!nf_nat_initialized(ct, maniptype)) {
unsigned int ret;
- ret = do_chain(ops, skb, state, ct);
+ ret = do_chain(priv, skb, state, ct);
if (ret != NF_ACCEPT)
return ret;
- if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
+ if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
break;
- ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+ ret = nf_nat_alloc_null_binding(ct, state->hook);
if (ret != NF_ACCEPT)
return ret;
} else {
pr_debug("Already setup manip %s for ct %p\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat,
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat,
state->out))
goto oif_changed;
}
@@ -332,11 +332,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
/* ESTABLISHED */
NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
ctinfo == IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
goto oif_changed;
}
- return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+ return nf_nat_packet(ct, ctinfo, state->hook, skb);
oif_changed:
nf_ct_kill_acct(ct, ctinfo, skb);
@@ -345,9 +345,9 @@ oif_changed:
EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
unsigned int
-nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -355,7 +355,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
- ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
daddr != ip_hdr(skb)->daddr)
skb_dst_drop(skb);
@@ -365,9 +365,9 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
unsigned int
-nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -384,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -396,7 +396,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
(ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.src.u.all !=
ct->tuplehash[!dir].tuple.dst.u.all)) {
- err = nf_xfrm_me_harder(skb, AF_INET);
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -407,9 +407,9 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
unsigned int
-nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -424,14 +424,14 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -440,7 +440,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.dst.u.all !=
ct->tuplehash[!dir].tuple.src.u.all) {
- err = nf_xfrm_me_harder(skb, AF_INET);
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/kernel/net/ipv4/netfilter/nf_nat_pptp.c b/kernel/net/ipv4/netfilter/nf_nat_pptp.c
index 657d2307f..b3ca21b2b 100644
--- a/kernel/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/kernel/net/ipv4/netfilter/nf_nat_pptp.c
@@ -45,7 +45,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
struct net *net = nf_ct_net(ct);
const struct nf_conn *master = ct->master;
struct nf_conntrack_expect *other_exp;
- struct nf_conntrack_tuple t;
+ struct nf_conntrack_tuple t = {};
const struct nf_ct_pptp_master *ct_pptp_info;
const struct nf_nat_pptp *nat_pptp_info;
struct nf_nat_range range;
diff --git a/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c b/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 4557b4ab8..7b98baa13 100644
--- a/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -67,7 +67,7 @@ icmp_manip_pkt(struct sk_buff *skb,
hdr = (struct icmphdr *)(skb->data + hdroff);
inet_proto_csum_replace2(&hdr->checksum, skb,
- hdr->un.echo.id, tuple->src.u.icmp.id, 0);
+ hdr->un.echo.id, tuple->src.u.icmp.id, false);
hdr->un.echo.id = tuple->src.u.icmp.id;
return true;
}
diff --git a/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c b/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 7c6766713..ddb894ac1 100644
--- a/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1156,7 +1156,7 @@ static int snmp_parse_mangle(unsigned char *msg,
}
if (obj->type == SNMP_IPADDR)
- mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
+ mangle_address(ctx.begin, ctx.pointer - 4, map, check);
kfree(obj->id);
kfree(obj);
diff --git a/kernel/net/ipv4/netfilter/nf_reject_ipv4.c b/kernel/net/ipv4/netfilter/nf_reject_ipv4.c
index 3262e41ff..c747b2d9e 100644
--- a/kernel/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -99,7 +99,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
/* Send RST reply */
-void nf_send_reset(struct sk_buff *oldskb, int hook)
+void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
{
struct sk_buff *nskb;
const struct iphdr *oiph;
@@ -129,7 +129,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
ip4_dst_hoplimit(skb_dst(nskb)));
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
- if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
/* "Never happens" */
@@ -157,7 +157,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
dev_queue_xmit(nskb);
} else
#endif
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
diff --git a/kernel/net/ipv4/netfilter/nf_tables_arp.c b/kernel/net/ipv4/netfilter/nf_tables_arp.c
index 8412268bb..9d09d4f59 100644
--- a/kernel/net/ipv4/netfilter/nf_tables_arp.c
+++ b/kernel/net/ipv4/netfilter/nf_tables_arp.c
@@ -15,15 +15,15 @@
#include <net/netfilter/nf_tables.h>
static unsigned int
-nft_do_chain_arp(const struct nf_hook_ops *ops,
+nft_do_chain_arp(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo(&pkt, ops, skb, state);
+ nft_set_pktinfo(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
static struct nft_af_info nft_af_arp __read_mostly = {
diff --git a/kernel/net/ipv4/netfilter/nf_tables_ipv4.c b/kernel/net/ipv4/netfilter/nf_tables_ipv4.c
index aa180d3a6..ca9dc3c46 100644
--- a/kernel/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -18,18 +18,18 @@
#include <net/ip.h>
#include <net/netfilter/nf_tables_ipv4.h>
-static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+static unsigned int nft_do_chain_ipv4(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
+static unsigned int nft_ipv4_output(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -41,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
return NF_ACCEPT;
}
- return nft_do_chain_ipv4(ops, skb, state);
+ return nft_do_chain_ipv4(priv, skb, state);
}
struct nft_af_info nft_af_ipv4 __read_mostly = {
diff --git a/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index bf5c30ae1..f5c66a7a4 100644
--- a/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -26,44 +26,44 @@
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/ip.h>
-static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_do_chain(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_out(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
}
static const struct nf_chain_type nft_chain_nat_ipv4 = {
diff --git a/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c b/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c
index e335b0afd..2375b0a8b 100644
--- a/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -21,7 +21,7 @@
#include <net/route.h>
#include <net/ip.h>
-static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+static unsigned int nf_route_table_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -37,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb, state);
mark = skb->mark;
iph = ip_hdr(skb);
@@ -45,7 +45,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
daddr = iph->daddr;
tos = iph->tos;
- ret = nft_do_chain(&pkt, ops);
+ ret = nft_do_chain(&pkt, priv);
if (ret != NF_DROP && ret != NF_QUEUE) {
iph = ip_hdr(skb);
@@ -53,7 +53,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos)
- if (ip_route_me_harder(skb, RTN_UNSPEC))
+ if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
ret = NF_DROP;
}
return ret;
diff --git a/kernel/net/ipv4/netfilter/nft_dup_ipv4.c b/kernel/net/ipv4/netfilter/nft_dup_ipv4.c
new file mode 100644
index 000000000..bf855e64f
--- /dev/null
+++ b/kernel/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+
+struct nft_dup_ipv4 {
+ enum nft_registers sreg_addr:8;
+ enum nft_registers sreg_dev:8;
+};
+
+static void nft_dup_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+ struct in_addr gw = {
+ .s_addr = (__force __be32)regs->data[priv->sreg_addr],
+ };
+ int oif = regs->data[priv->sreg_dev];
+
+ nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif);
+}
+
+static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+ int err;
+
+ if (tb[NFTA_DUP_SREG_ADDR] == NULL)
+ return -EINVAL;
+
+ priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
+ err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DUP_SREG_DEV] != NULL) {
+ priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
+ return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ }
+ return 0;
+}
+
+static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) ||
+ nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_dup_ipv4_type;
+static const struct nft_expr_ops nft_dup_ipv4_ops = {
+ .type = &nft_dup_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)),
+ .eval = nft_dup_ipv4_eval,
+ .init = nft_dup_ipv4_init,
+ .dump = nft_dup_ipv4_dump,
+};
+
+static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = {
+ [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 },
+ [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_dup_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "dup",
+ .ops = &nft_dup_ipv4_ops,
+ .policy = nft_dup_ipv4_policy,
+ .maxattr = NFTA_DUP_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_dup_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_dup_ipv4_type);
+}
+
+static void __exit nft_dup_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_dup_ipv4_type);
+}
+
+module_init(nft_dup_ipv4_module_init);
+module_exit(nft_dup_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup");
diff --git a/kernel/net/ipv4/netfilter/nft_masq_ipv4.c b/kernel/net/ipv4/netfilter/nft_masq_ipv4.c
index 40e414c4c..b72ffc58e 100644
--- a/kernel/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -26,7 +26,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
&range, pkt->out);
}
diff --git a/kernel/net/ipv4/netfilter/nft_redir_ipv4.c b/kernel/net/ipv4/netfilter/nft_redir_ipv4.c
index d8d795df9..c09d43814 100644
--- a/kernel/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -36,7 +36,7 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
mr.range[0].flags |= priv->flags;
regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
- pkt->ops->hooknum);
+ pkt->hook);
}
static struct nft_expr_type nft_redir_ipv4_type;
diff --git a/kernel/net/ipv4/netfilter/nft_reject_ipv4.c b/kernel/net/ipv4/netfilter/nft_reject_ipv4.c
index b07e58b51..c24f41c81 100644
--- a/kernel/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -27,11 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach(pkt->skb, priv->icmp_code,
- pkt->ops->hooknum);
+ nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook);
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->skb, pkt->ops->hooknum);
+ nf_send_reset(pkt->net, pkt->skb, pkt->hook);
break;
default:
break;
diff --git a/kernel/net/ipv4/ping.c b/kernel/net/ipv4/ping.c
index 05ff44b75..aa67e0e64 100644
--- a/kernel/net/ipv4/ping.c
+++ b/kernel/net/ipv4/ping.c
@@ -363,7 +363,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
scoped);
rcu_read_unlock();
- if (!(isk->freebind || isk->transparent || has_addr ||
+ if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
+ isk->freebind || isk->transparent || has_addr ||
addr_type == IPV6_ADDR_ANY))
return -EADDRNOTAVAIL;
@@ -745,8 +746,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
return err;
+ }
if (ipc.opt)
free = 1;
}
diff --git a/kernel/net/ipv4/proc.c b/kernel/net/ipv4/proc.c
index e1f3b911d..3abd9d7a3 100644
--- a/kernel/net/ipv4/proc.c
+++ b/kernel/net/ipv4/proc.c
@@ -298,6 +298,10 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
+ SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
+ SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
+ SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
+ SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
SNMP_MIB_SENTINEL
};
diff --git a/kernel/net/ipv4/raw.c b/kernel/net/ipv4/raw.c
index 561cd4b8f..7113bae4e 100644
--- a/kernel/net/ipv4/raw.c
+++ b/kernel/net/ipv4/raw.c
@@ -406,13 +406,16 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
ip_select_ident(net, skb, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ skb->transport_header += iphlen;
+ if (iph->protocol == IPPROTO_ICMP &&
+ length >= iphlen + sizeof(struct icmphdr))
+ icmp_out_count(net, ((struct icmphdr *)
+ skb_transport_header(skb))->type);
}
- if (iph->protocol == IPPROTO_ICMP)
- icmp_out_count(net, ((struct icmphdr *)
- skb_transport_header(skb))->type);
- err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb,
- NULL, rt->dst.dev, dst_output_sk);
+ err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, rt->dst.dev,
+ dst_output);
if (err > 0)
err = net_xmit_errno(err);
if (err)
@@ -483,6 +486,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
struct flowi4 fl4;
@@ -542,9 +546,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
- if (err)
+ err = ip_cmsg_send(net, msg, &ipc, false);
+ if (unlikely(err)) {
+ kfree(ipc.opt);
goto out;
+ }
if (ipc.opt)
free = 1;
}
@@ -597,6 +603,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, &fl4);
+ if (err < 0)
+ goto done;
+ }
+
if (!inet->hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
@@ -607,7 +619,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
- rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
diff --git a/kernel/net/ipv4/route.c b/kernel/net/ipv4/route.c
index f45f2a12f..02c62299d 100644
--- a/kernel/net/ipv4/route.c
+++ b/kernel/net/ipv4/route.c
@@ -91,6 +91,7 @@
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
@@ -102,6 +103,7 @@
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
+#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
@@ -109,6 +111,8 @@
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
+#include <net/ip_tunnels.h>
+#include <net/l3mdev.h>
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -125,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
/*
* Interface to generic destination cache.
*/
@@ -457,12 +462,9 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
}
#define IP_IDENTS_SZ 2048u
-struct ip_ident_bucket {
- atomic_t id;
- u32 stamp32;
-};
-static struct ip_ident_bucket *ip_idents __read_mostly;
+static atomic_t *ip_idents __read_mostly;
+static u32 *ip_tstamps __read_mostly;
/* In order to protect privacy, we add a perturbation to identifiers
* if one generator is seldom used. This makes hard for an attacker
@@ -470,15 +472,16 @@ static struct ip_ident_bucket *ip_idents __read_mostly;
*/
u32 ip_idents_reserve(u32 hash, int segs)
{
- struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
- u32 old = ACCESS_ONCE(bucket->stamp32);
+ u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
+ atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
+ u32 old = ACCESS_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
u32 delta = 0;
- if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
+ if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
- return atomic_add_return(segs + delta, &bucket->id) - segs;
+ return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
@@ -749,11 +752,11 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
if (!(n->nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
} else {
- if (fib_lookup(net, fl4, &res) == 0) {
+ if (fib_lookup(net, fl4, &res, 0) == 0) {
struct fib_nh *nh = &FIB_RES_NH(res);
update_or_create_fnhe(nh, fl4->daddr, new_gw,
- 0, 0);
+ 0, jiffies + ip_rt_gc_timeout);
}
if (kill_route)
rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -836,6 +839,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
struct inet_peer *peer;
struct net *net;
int log_martians;
+ int vif;
rcu_read_lock();
in_dev = __in_dev_get_rcu(rt->dst.dev);
@@ -844,10 +848,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
return;
}
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
+ vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
rcu_read_unlock();
net = dev_net(rt->dst.dev);
- peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
if (!peer) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
rt_nexthop(rt, ip_hdr(skb)->daddr));
@@ -936,7 +941,8 @@ static int ip_error(struct sk_buff *skb)
break;
}
- peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
+ l3mdev_master_ifindex(skb->dev), 1);
send = true;
if (peer) {
@@ -977,7 +983,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
return;
rcu_read_lock();
- if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
+ if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
struct fib_nh *nh = &FIB_RES_NH(res);
update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
@@ -1147,7 +1153,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
dst_set_expires(&rt->dst, 0);
}
-static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
+static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
pr_debug("%s: %pI4 -> %pI4, %s\n",
__func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1188,7 +1194,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
fl4.flowi4_mark = skb->mark;
rcu_read_lock();
- if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+ if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
else
src = inet_select_addr(rt->dst.dev,
@@ -1405,6 +1411,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
+ rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
if (unlikely(fnhe))
cached = rt_bind_exception(rt, fnhe, daddr);
else if (!(rt->dst.flags & DST_NOCACHE))
@@ -1432,12 +1439,34 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
+ unsigned int flags, u16 type,
bool nopolicy, bool noxfrm, bool will_cache)
{
- return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
- (nopolicy ? DST_NOPOLICY : 0) |
- (noxfrm ? DST_NOXFRM : 0));
+ struct rtable *rt;
+
+ rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (nopolicy ? DST_NOPOLICY : 0) |
+ (noxfrm ? DST_NOXFRM : 0));
+
+ if (rt) {
+ rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+ rt->rt_flags = flags;
+ rt->rt_type = type;
+ rt->rt_is_input = 0;
+ rt->rt_iif = 0;
+ rt->rt_pmtu = 0;
+ rt->rt_gateway = 0;
+ rt->rt_uses_gateway = 0;
+ rt->rt_table_id = 0;
+ INIT_LIST_HEAD(&rt->rt_uncached);
+
+ rt->dst.output = ip_output;
+ if (flags & RTCF_LOCAL)
+ rt->dst.input = ip_local_deliver;
+ }
+
+ return rt;
}
/* called in rcu_read_lock() section */
@@ -1446,6 +1475,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
struct rtable *rth;
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ unsigned int flags = RTCF_MULTICAST;
u32 itag = 0;
int err;
@@ -1458,9 +1488,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
skb->protocol != htons(ETH_P_IP))
goto e_inval;
- if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
- if (ipv4_is_loopback(saddr))
- goto e_inval;
+ if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
+ goto e_inval;
if (ipv4_is_zeronet(saddr)) {
if (!ipv4_is_local_multicast(daddr))
@@ -1471,7 +1500,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (err < 0)
goto e_err;
}
- rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
+ if (our)
+ flags |= RTCF_LOCAL;
+
+ rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
if (!rth)
goto e_nobufs;
@@ -1480,20 +1512,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->dst.tclassid = itag;
#endif
rth->dst.output = ip_rt_bug;
-
- rth->rt_genid = rt_genid_ipv4(dev_net(dev));
- rth->rt_flags = RTCF_MULTICAST;
- rth->rt_type = RTN_MULTICAST;
rth->rt_is_input= 1;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
- if (our) {
- rth->dst.input= ip_local_deliver;
- rth->rt_flags |= RTCF_LOCAL;
- }
#ifdef CONFIG_IP_MROUTE
if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
@@ -1538,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev,
#endif
}
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+ u32 hval = fnhe_hashfun(daddr);
+
+ spin_lock_bh(&fnhe_lock);
+
+ hash = rcu_dereference_protected(nh->nh_exceptions,
+ lockdep_is_held(&fnhe_lock));
+ hash += hval;
+
+ fnhe_p = &hash->chain;
+ fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+ while (fnhe) {
+ if (fnhe->fnhe_daddr == daddr) {
+ rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+ fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+ fnhe_flush_routes(fnhe);
+ kfree_rcu(fnhe, rcu);
+ break;
+ }
+ fnhe_p = &fnhe->fnhe_next;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+ lockdep_is_held(&fnhe_lock));
+ }
+
+ spin_unlock_bh(&fnhe_lock);
+}
+
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1548,7 +1597,6 @@ static int __mkroute_input(struct sk_buff *skb,
struct rtable *rth;
int err;
struct in_device *out_dev;
- unsigned int flags = 0;
bool do_cache;
u32 itag = 0;
@@ -1592,18 +1640,27 @@ static int __mkroute_input(struct sk_buff *skb,
fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) {
- if (fnhe)
+ if (fnhe) {
rth = rcu_dereference(fnhe->fnhe_rth_input);
- else
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+ if (rth && rth->dst.expires &&
+ time_after(jiffies, rth->dst.expires)) {
+ ip_del_fnhe(&FIB_RES_NH(*res), daddr);
+ fnhe = NULL;
+ } else {
+ goto rt_cache;
+ }
+ }
+
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+rt_cache:
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
}
}
- rth = rt_dst_alloc(out_dev->dev,
+ rth = rt_dst_alloc(out_dev->dev, 0, res->type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
if (!rth) {
@@ -1611,21 +1668,22 @@ static int __mkroute_input(struct sk_buff *skb,
goto cleanup;
}
- rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
- rth->rt_flags = flags;
- rth->rt_type = res->type;
rth->rt_is_input = 1;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
+ if (res->table)
+ rth->rt_table_id = res->table->tb_id;
RT_CACHE_STAT_INC(in_slow_tot);
rth->dst.input = ip_forward;
- rth->dst.output = ip_output;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_output = rth->dst.output;
+ rth->dst.output = lwtunnel_output;
+ }
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1633,6 +1691,48 @@ out:
return err;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* To make ICMP packets follow the right flow, the multipath hash is
+ * calculated from the inner IP addresses in reverse order.
+ */
+static int ip_multipath_icmp_hash(struct sk_buff *skb)
+{
+ const struct iphdr *outer_iph = ip_hdr(skb);
+ struct icmphdr _icmph;
+ const struct icmphdr *icmph;
+ struct iphdr _inner_iph;
+ const struct iphdr *inner_iph;
+
+ if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+ goto standard_hash;
+
+ icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
+ &_icmph);
+ if (!icmph)
+ goto standard_hash;
+
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_REDIRECT &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB) {
+ goto standard_hash;
+ }
+
+ inner_iph = skb_header_pointer(skb,
+ outer_iph->ihl * 4 + sizeof(_icmph),
+ sizeof(_inner_iph), &_inner_iph);
+ if (!inner_iph)
+ goto standard_hash;
+
+ return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+
+standard_hash:
+ return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
@@ -1640,8 +1740,15 @@ static int ip_mkroute_input(struct sk_buff *skb,
__be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1)
- fib_select_multipath(res);
+ if (res->fi && res->fi->fib_nhs > 1) {
+ int h;
+
+ if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
+ h = ip_multipath_icmp_hash(skb);
+ else
+ h = fib_multipath_hash(saddr, daddr);
+ fib_select_multipath(res, h);
+ }
#endif
/* create a routing cache entry */
@@ -1664,6 +1771,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct ip_tunnel_info *tun_info;
struct flowi4 fl4;
unsigned int flags = 0;
u32 itag = 0;
@@ -1681,10 +1789,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
by fib_lookup.
*/
+ tun_info = skb_tunnel_info(skb);
+ if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
+ fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
+ else
+ fl4.flowi4_tun_key.tun_id = 0;
+ skb_dst_drop(skb);
+
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
res.fi = NULL;
+ res.table = NULL;
if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
@@ -1712,13 +1828,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
* Now we are ready to route packet.
*/
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = dev->ifindex;
+ fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = 0;
fl4.daddr = daddr;
fl4.saddr = saddr;
- err = fib_lookup(net, &fl4, &res);
+ err = fib_lookup(net, &fl4, &res, 0);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
err = -EHOSTUNREACH;
@@ -1732,7 +1849,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
err = fib_validate_source(skb, saddr, daddr, tos,
0, dev, in_dev, &itag);
if (err < 0)
- goto martian_source_keep_err;
+ goto martian_source;
goto local_input;
}
@@ -1754,7 +1871,7 @@ brd_input:
err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
in_dev, &itag);
if (err < 0)
- goto martian_source_keep_err;
+ goto martian_source;
}
flags |= RTCF_BROADCAST;
res.type = RTN_BROADCAST;
@@ -1774,26 +1891,19 @@ local_input:
}
}
- rth = rt_dst_alloc(net->loopback_dev,
+ rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
- rth->dst.input= ip_local_deliver;
rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
-
- rth->rt_genid = rt_genid_ipv4(net);
- rth->rt_flags = flags|RTCF_LOCAL;
- rth->rt_type = res.type;
rth->rt_is_input = 1;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
+ if (res.table)
+ rth->rt_table_id = res.table->tb_id;
+
RT_CACHE_STAT_INC(in_slow_tot);
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
@@ -1814,6 +1924,7 @@ no_route:
RT_CACHE_STAT_INC(in_no_route);
res.type = RTN_UNREACHABLE;
res.fi = NULL;
+ res.table = NULL;
goto local_input;
/*
@@ -1836,8 +1947,6 @@ e_nobufs:
goto out;
martian_source:
- err = -EINVAL;
-martian_source_keep_err:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
goto out;
}
@@ -1945,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
struct fib_nh *nh = &FIB_RES_NH(*res);
fnhe = find_exception(nh, fl4->daddr);
- if (fnhe)
+ if (fnhe) {
prth = &fnhe->fnhe_rth_output;
- else {
- if (unlikely(fl4->flowi4_flags &
- FLOWI_FLAG_KNOWN_NH &&
- !(nh->nh_gw &&
- nh->nh_scope == RT_SCOPE_LINK))) {
- do_cache = false;
- goto add;
+ rth = rcu_dereference(*prth);
+ if (rth && rth->dst.expires &&
+ time_after(jiffies, rth->dst.expires)) {
+ ip_del_fnhe(nh, fl4->daddr);
+ fnhe = NULL;
+ } else {
+ goto rt_cache;
}
- prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
}
+
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nh->nh_gw &&
+ nh->nh_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
+ }
+ prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
rth = rcu_dereference(*prth);
+
+rt_cache:
if (rt_cache_valid(rth)) {
dst_hold(&rth->dst);
return rth;
@@ -1965,29 +2084,19 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
}
add:
- rth = rt_dst_alloc(dev_out,
+ rth = rt_dst_alloc(dev_out, flags, type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(in_dev, NOXFRM),
do_cache);
if (!rth)
return ERR_PTR(-ENOBUFS);
- rth->dst.output = ip_output;
-
- rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
- rth->rt_flags = flags;
- rth->rt_type = type;
- rth->rt_is_input = 0;
rth->rt_iif = orig_oif ? : 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
+ if (res->table)
+ rth->rt_table_id = res->table->tb_id;
RT_CACHE_STAT_INC(out_slow_tot);
- if (flags & RTCF_LOCAL)
- rth->dst.input = ip_local_deliver;
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
@@ -2006,6 +2115,8 @@ add:
}
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+ if (lwtunnel_output_redirect(rth->dst.lwtstate))
+ rth->dst.output = lwtunnel_output;
return rth;
}
@@ -2014,7 +2125,8 @@ add:
* Major route resolver routine.
*/
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+ int mp_hash)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
@@ -2022,6 +2134,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
struct fib_result res;
struct rtable *rth;
int orig_oif;
+ int err = -ENETUNREACH;
res.tclassid = 0;
res.fi = NULL;
@@ -2097,7 +2210,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto out;
}
if (ipv4_is_local_multicast(fl4->daddr) ||
- ipv4_is_lbcast(fl4->daddr)) {
+ ipv4_is_lbcast(fl4->daddr) ||
+ fl4->flowi4_proto == IPPROTO_IGMP) {
if (!fl4->saddr)
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
@@ -2111,6 +2225,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
+
+ rth = l3mdev_get_rtable(dev_out, fl4);
+ if (rth)
+ goto out;
}
if (!fl4->daddr) {
@@ -2124,10 +2242,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto make_route;
}
- if (fib_lookup(net, fl4, &res)) {
+ err = fib_lookup(net, fl4, &res, 0);
+ if (err) {
res.fi = NULL;
res.table = NULL;
- if (fl4->flowi4_oif) {
+ if (fl4->flowi4_oif &&
+ !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2152,7 +2272,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
res.type = RTN_UNICAST;
goto make_route;
}
- rth = ERR_PTR(-ENETUNREACH);
+ rth = ERR_PTR(err);
goto out;
}
@@ -2169,18 +2289,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto make_route;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
- fib_select_multipath(&res);
- else
-#endif
- if (!res.prefixlen &&
- res.table->tb_num_default > 1 &&
- res.type == RTN_UNICAST && !fl4->flowi4_oif)
- fib_select_default(&res);
-
- if (!fl4->saddr)
- fl4->saddr = FIB_RES_PREFSRC(net, res);
+ fib_select_path(net, &res, fl4, mp_hash);
dev_out = FIB_RES_DEV(res);
fl4->flowi4_oif = dev_out->ifindex;
@@ -2193,7 +2302,7 @@ out:
rcu_read_unlock();
return rth;
}
-EXPORT_SYMBOL_GPL(__ip_route_output_key);
+EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
@@ -2245,7 +2354,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->__use = 1;
new->input = dst_discard;
- new->output = dst_discard_sk;
+ new->output = dst_discard_out;
new->dev = ort->dst.dev;
if (new->dev)
@@ -2262,7 +2371,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_uses_gateway = ort->rt_uses_gateway;
INIT_LIST_HEAD(&rt->rt_uncached);
-
dst_free(new);
}
@@ -2272,7 +2380,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
}
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
- struct sock *sk)
+ const struct sock *sk)
{
struct rtable *rt = __ip_route_output_key(net, flp4);
@@ -2288,7 +2396,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
-static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
u32 seq, int event, int nowait, unsigned int flags)
{
@@ -2308,8 +2416,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
r->rtm_tos = fl4->flowi4_tos;
- r->rtm_table = RT_TABLE_MAIN;
- if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
+ r->rtm_table = table_id;
+ if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
r->rtm_type = rt->rt_type;
r->rtm_scope = RT_SCOPE_UNIVERSE;
@@ -2414,6 +2522,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
int err;
int mark;
struct sk_buff *skb;
+ u32 table_id = RT_TABLE_MAIN;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
@@ -2449,6 +2558,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ if (netif_index_is_l3_master(net, fl4.flowi4_oif))
+ fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
+
if (iif) {
struct net_device *dev;
@@ -2483,7 +2595,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
- err = rt_fill_info(net, dst, src, &fl4, skb,
+ if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
+ table_id = rt->rt_table_id;
+
+ err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
RTM_NEWROUTE, 0, 0);
if (err < 0)
@@ -2504,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev)
}
#ifdef CONFIG_SYSCTL
-static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
@@ -2742,6 +2856,10 @@ int __init ip_rt_init(void)
prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+ ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
+ if (!ip_tstamps)
+ panic("IP: failed to allocate ip_tstamps\n");
+
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
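A worked example for the ICMP-aware multipath hash introduced in ip_mkroute_input() above: for a flow A -> B forwarded over several nexthops, ordinary packets are spread by fib_multipath_hash(A, B). If a downstream hop returns, say, a time-exceeded error about one of those packets, the error arrives here addressed back to A with the offending A -> B header quoted inside; hashing the inner addresses in reverse order gives fib_multipath_hash(B, A), the same value an ordinary B -> A packet of that flow gets, so the error rides the flow's return path rather than an arbitrary nexthop. Fragments and ICMP types other than the four error types checked above fall back to the plain outer-header hash.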
diff --git a/kernel/net/ipv4/syncookies.c b/kernel/net/ipv4/syncookies.c
index df849e5a1..4cbe9f0a4 100644
--- a/kernel/net/ipv4/syncookies.c
+++ b/kernel/net/ipv4/syncookies.c
@@ -192,15 +192,11 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
-__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
- __u16 *mssp)
+__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
{
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
- tcp_synq_overflow(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
return __cookie_v4_init_sequence(iph, th, mssp);
}
@@ -219,23 +215,26 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_check);
-static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst)
+struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
+ bool own_req;
- child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+ NULL, &own_req);
if (child) {
atomic_set(&req->rsk_refcnt, 1);
+ sock_rps_save_rxhash(child, skb);
inet_csk_reqsk_queue_add(sk, req, child);
} else {
reqsk_free(req);
}
return child;
}
-
+EXPORT_SYMBOL(tcp_get_cookie_sock);
/*
* when syncookies are in effect and tcp timestamps are enabled we stored
@@ -288,6 +287,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
}
EXPORT_SYMBOL(cookie_ecn_ok);
+/* On input, sk is a listener.
+ * Output is listener if incoming packet would not create a child
+ * NULL if memory could not be allocated.
+ */
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
@@ -326,7 +329,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
if (!req)
goto out;
@@ -345,7 +348,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->wscale_ok = tcp_opt.wscale_ok;
ireq->tstamp_ok = tcp_opt.saw_tstamp;
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
- treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+ treq->snt_synack.v64 = 0;
treq->tfo_listener = false;
ireq->ir_iif = sk->sk_bound_dev_if;
@@ -381,17 +384,17 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
/* Try to redo what tcp_v4_send_synack did. */
- req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
- &req->rcv_wnd, &req->window_clamp,
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
- ret = get_cookie_sock(sk, skb, req, &rt->dst);
+ ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
/* ip_queue_xmit() depends on our flow being setup
* Normal sockets get it right from inet_csk_route_child_sock()
*/
diff --git a/kernel/net/ipv4/sysctl_net_ipv4.c b/kernel/net/ipv4/sysctl_net_ipv4.c
index 143f5f380..1866f9102 100644
--- a/kernel/net/ipv4/sysctl_net_ipv4.c
+++ b/kernel/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
static int zero;
static int one = 1;
static int four = 4;
+static int thousand = 1000;
static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
@@ -45,10 +46,16 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
- write_seqlock(&net->ipv4.ip_local_ports.lock);
+ bool same_parity = !((range[0] ^ range[1]) & 1);
+
+ write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
+ if (same_parity && !net->ipv4.ip_local_ports.warned) {
+ net->ipv4.ip_local_ports.warned = true;
+ pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
+ }
net->ipv4.ip_local_ports.range[0] = range[0];
net->ipv4.ip_local_ports.range[1] = range[1];
- write_sequnlock(&net->ipv4.ip_local_ports.lock);
+ write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
}
/* Validate changes from /proc interface. */
@@ -489,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_recovery",
+ .data = &sysctl_tcp_recovery,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_reordering",
.data = &sysctl_tcp_reordering,
.maxlen = sizeof(int),
@@ -570,6 +584,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_min_rtt_wlen",
+ .data = &sysctl_tcp_min_rtt_wlen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
@@ -702,10 +723,28 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = &one,
.extra2 = &gso_max_segs,
},
{
+ .procname = "tcp_pacing_ss_ratio",
+ .data = &sysctl_tcp_pacing_ss_ratio,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &thousand,
+ },
+ {
+ .procname = "tcp_pacing_ca_ratio",
+ .data = &sysctl_tcp_pacing_ca_ratio,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &thousand,
+ },
+ {
.procname = "tcp_autocorking",
.data = &sysctl_tcp_autocorking,
.maxlen = sizeof(int),
@@ -828,6 +867,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_ecn_fallback",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "ip_local_port_range",
.maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
.data = &init_net.ipv4.ip_local_ports.range,
@@ -904,6 +950,13 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "igmp_link_local_mcast_reports",
+ .data = &sysctl_igmp_llm_reports,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
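The new tcp_pacing_ss_ratio/tcp_pacing_ca_ratio entries only declare the 0..1000 bounds; the values are consumed in tcp_update_pacing_rate(), where (as an assumption taken from that consumer, not from this hunk) they act as percentages of the estimated rate, e.g. a value of 200 paces at twice the current rate during slow start.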
diff --git a/kernel/net/ipv4/tcp.c b/kernel/net/ipv4/tcp.c
index bb2ce74f6..036a76ba2 100644
--- a/kernel/net/ipv4/tcp.c
+++ b/kernel/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <asm/unaligned.h>
#include <net/busy_poll.h>
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
@@ -388,6 +389,7 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ tp->rtt_min[0].rtt = ~0U;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -450,11 +452,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
unsigned int mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+ int state;
sock_rps_record_flow(sk);
sock_poll_wait(file, sk_sleep(sk), wait);
- if (sk->sk_state == TCP_LISTEN)
+
+ state = sk_state_load(sk);
+ if (state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
/* Socket is not locked. We are protected from async events
@@ -491,14 +496,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
* blocking on fresh not-connected or disconnected socket. --ANK
*/
- if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+ if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= POLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLIN | POLLRDNORM | POLLRDHUP;
/* Connected or passive Fast Open socket? */
- if (sk->sk_state != TCP_SYN_SENT &&
- (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ if (state != TCP_SYN_SENT &&
+ (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
if (tp->urg_seq == tp->copied_seq &&
@@ -506,9 +511,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
tp->urg_data)
target++;
- /* Potential race condition. If read of tp below will
- * escape above sk->sk_state, we can be illegally awaken
- * in SYN_* states. */
if (tp->rcv_nxt - tp->copied_seq >= target)
mask |= POLLIN | POLLRDNORM;
@@ -516,8 +518,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
- set_bit(SOCK_ASYNC_NOSPACE,
- &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
/* Race breaker. If space is freed after
@@ -627,6 +628,8 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb)
sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
+
+ tcp_slow_start_after_idle_check(sk);
}
static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
@@ -695,8 +698,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
struct tcp_splice_state *tss = rd_desc->arg.data;
int ret;
- ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
- tss->flags);
+ ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
+ min(rd_desc->count, len), tss->flags,
+ skb_socket_splice);
if (ret > 0)
rd_desc->count -= ret;
return ret;
@@ -779,7 +783,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
ret = -EAGAIN;
break;
}
- sk_wait_data(sk, &timeo);
+ sk_wait_data(sk, &timeo, NULL);
if (signal_pending(current)) {
ret = sock_intr_errno(timeo);
break;
@@ -809,16 +813,28 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_SYMBOL(tcp_splice_read);
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+ bool force_schedule)
{
struct sk_buff *skb;
/* The TCP header must be at least 32-bit aligned. */
size = ALIGN(size, 4);
+ if (unlikely(tcp_under_memory_pressure(sk)))
+ sk_mem_reclaim_partial(sk);
+
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
- if (skb) {
- if (sk_wmem_schedule(sk, skb->truesize)) {
+ if (likely(skb)) {
+ bool mem_scheduled;
+
+ if (force_schedule) {
+ mem_scheduled = true;
+ sk_forced_mem_schedule(sk, skb->truesize);
+ } else {
+ mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
+ }
+ if (likely(mem_scheduled)) {
skb_reserve(skb, sk->sk_prot->max_header);
/*
* Make sure that we have exactly size bytes
@@ -885,11 +901,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto out_err;
}
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
copied = 0;
@@ -908,7 +925,8 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ skb_queue_empty(&sk->sk_write_queue));
if (!skb)
goto wait_for_memory;
@@ -921,7 +939,7 @@ new_segment:
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+ if (!can_coalesce && i >= sysctl_max_skb_frags) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -951,7 +969,8 @@ new_segment:
copied += copy;
offset += copy;
- if (!(size -= copy)) {
+ size -= copy;
+ if (!size) {
tcp_tx_timestamp(sk, skb);
goto out;
}
@@ -972,7 +991,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -987,6 +1007,9 @@ do_error:
if (copied)
goto out;
out_err:
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ sk->sk_write_space(sk);
return sk_stream_error(sk, flags, err);
}
@@ -1092,7 +1115,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto do_error;
}
@@ -1110,7 +1134,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
}
/* This should be in poll */
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1144,7 +1168,8 @@ new_segment:
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
- sk->sk_allocation);
+ sk->sk_allocation,
+ skb_queue_empty(&sk->sk_write_queue));
if (!skb)
goto wait_for_memory;
@@ -1187,7 +1212,7 @@ new_segment:
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
- if (i == MAX_SKB_FRAGS || !sg) {
+ if (i == sysctl_max_skb_frags || !sg) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1247,7 +1272,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1275,6 +1301,9 @@ do_error:
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ sk->sk_write_space(sk);
release_sock(sk);
return err;
}
@@ -1554,7 +1583,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
- struct sk_buff *skb;
+ struct sk_buff *skb, *last;
u32 urg_hole = 0;
if (unlikely(flags & MSG_ERRQUEUE))
@@ -1614,7 +1643,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* Next get a buffer. */
+ last = skb_peek_tail(&sk->sk_receive_queue);
skb_queue_walk(&sk->sk_receive_queue, skb) {
+ last = skb;
/* Now that we have two receive queues this
* shouldn't happen.
*/
@@ -1733,15 +1764,17 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* Do not sleep, just process backlog. */
release_sock(sk);
lock_sock(sk);
- } else
- sk_wait_data(sk, &timeo);
+ } else {
+ sk_wait_data(sk, &timeo, last);
+ }
if (user_recv) {
int chunk;
/* __ Restore normal policy in scheduler __ */
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
@@ -1752,7 +1785,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
do_prequeue:
tcp_prequeue_process(sk);
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
@@ -1900,7 +1934,7 @@ void tcp_set_state(struct sock *sk, int state)
/* Change state AFTER socket is unhashed to avoid closed
* socket sitting in hash tables.
*/
- sk->sk_state = state;
+ sk_state_store(sk, state);
#ifdef STATE_TRACE
SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
@@ -2204,7 +2238,8 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
- if ((tp->write_seq += tp->max_window + 2) == 0)
+ tp->write_seq += tp->max_window + 2;
+ if (tp->write_seq == 0)
tp->write_seq = 1;
icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
@@ -2227,13 +2262,6 @@ int tcp_disconnect(struct sock *sk, int flags)
}
EXPORT_SYMBOL(tcp_disconnect);
-void tcp_sock_destruct(struct sock *sk)
-{
- inet_sock_destruct(sk);
-
- kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
-}
-
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
@@ -2483,6 +2511,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
icsk->icsk_syn_retries = val;
break;
+ case TCP_SAVE_SYN:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else
+ tp->save_syn = val;
+ break;
+
case TCP_LINGER2:
if (val < 0)
tp->linger2 = -1;
@@ -2548,7 +2583,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
TCPF_LISTEN))) {
tcp_fastopen_init_key_once(true);
- err = fastopen_init_queue(sk, val);
+ fastopen_queue_tune(sk, val);
} else {
err = -EINVAL;
}
@@ -2599,15 +2634,19 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
- const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
unsigned int start;
+ u64 rate64;
u32 rate;
memset(info, 0, sizeof(*info));
+ if (sk->sk_type != SOCK_STREAM)
+ return;
+
- info->tcpi_state = sk->sk_state;
+ info->tcpi_state = sk_state_load(sk);
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
@@ -2635,7 +2674,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
- if (sk->sk_state == TCP_LISTEN) {
+ if (info->tcpi_state == TCP_LISTEN) {
info->tcpi_unacked = sk->sk_ack_backlog;
info->tcpi_sacked = sk->sk_max_ack_backlog;
} else {
@@ -2665,16 +2704,20 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_total_retrans = tp->total_retrans;
rate = READ_ONCE(sk->sk_pacing_rate);
- info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ put_unaligned(rate64, &info->tcpi_pacing_rate);
rate = READ_ONCE(sk->sk_max_pacing_rate);
- info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ put_unaligned(rate64, &info->tcpi_max_pacing_rate);
do {
start = u64_stats_fetch_begin_irq(&tp->syncp);
- info->tcpi_bytes_acked = tp->bytes_acked;
- info->tcpi_bytes_received = tp->bytes_received;
+ put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
+ put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
+ info->tcpi_segs_out = tp->segs_out;
+ info->tcpi_segs_in = tp->segs_in;
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2812,10 +2855,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_FASTOPEN:
- if (icsk->icsk_accept_queue.fastopenq)
- val = icsk->icsk_accept_queue.fastopenq->max_qlen;
- else
- val = 0;
+ val = icsk->icsk_accept_queue.fastopenq.max_qlen;
break;
case TCP_TIMESTAMP:
@@ -2824,6 +2864,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_NOTSENT_LOWAT:
val = tp->notsent_lowat;
break;
+ case TCP_SAVE_SYN:
+ val = tp->save_syn;
+ break;
+ case TCP_SAVED_SYN: {
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+ if (tp->saved_syn) {
+ if (len < tp->saved_syn[0]) {
+ if (put_user(tp->saved_syn[0], optlen)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ release_sock(sk);
+ return -EINVAL;
+ }
+ len = tp->saved_syn[0];
+ if (put_user(len, optlen)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ tcp_saved_syn_free(tp);
+ release_sock(sk);
+ } else {
+ release_sock(sk);
+ len = 0;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ }
+ return 0;
+ }
default:
return -ENOPROTOOPT;
}
@@ -3028,11 +3104,12 @@ __setup("thash_entries=", set_thash_entries);
static void __init tcp_init_mem(void)
{
- unsigned long limit = nr_free_buffer_pages() / 8;
+ unsigned long limit = nr_free_buffer_pages() / 16;
+
limit = max(limit, 128UL);
- sysctl_tcp_mem[0] = limit / 4 * 3;
- sysctl_tcp_mem[1] = limit;
- sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+ sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */
+ sysctl_tcp_mem[1] = limit; /* 6.25 % */
+ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
}
void __init tcp_init(void)
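The TCP_SAVE_SYN/TCP_SAVED_SYN pair added to do_tcp_setsockopt()/do_tcp_getsockopt() above is the main user-visible API change in this file. A minimal userspace sketch of the intended flow follows; the header providing the option constants is an assumption, and per the getsockopt code above the saved headers are freed after the first successful read (a later call reports length 0):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>   /* assumed to define TCP_SAVE_SYN / TCP_SAVED_SYN */

    /* Arm the listener: every accepted child keeps a copy of the network and
     * TCP headers of the SYN that created it. */
    static int enable_save_syn(int listen_fd)
    {
            int one = 1;

            return setsockopt(listen_fd, IPPROTO_TCP, TCP_SAVE_SYN,
                              &one, sizeof(one));
    }

    /* Fetch the saved SYN from an accepted socket.  A too-small buffer makes
     * getsockopt() fail with EINVAL and writes the required length back. */
    static ssize_t read_saved_syn(int conn_fd, unsigned char *buf, socklen_t len)
    {
            if (getsockopt(conn_fd, IPPROTO_TCP, TCP_SAVED_SYN, buf, &len) < 0)
                    return -1;
            return len;
    }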
diff --git a/kernel/net/ipv4/tcp_bic.c b/kernel/net/ipv4/tcp_bic.c
index c037644ea..fd1405d37 100644
--- a/kernel/net/ipv4/tcp_bic.c
+++ b/kernel/net/ipv4/tcp_bic.c
@@ -146,7 +146,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
diff --git a/kernel/net/ipv4/tcp_cdg.c b/kernel/net/ipv4/tcp_cdg.c
new file mode 100644
index 000000000..167b6a3e1
--- /dev/null
+++ b/kernel/net/ipv4/tcp_cdg.c
@@ -0,0 +1,433 @@
+/*
+ * CAIA Delay-Gradient (CDG) congestion control
+ *
+ * This implementation is based on the paper:
+ * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011.
+ *
+ * Scavenger traffic (Less-than-Best-Effort) should disable coexistence
+ * heuristics using parameters use_shadow=0 and use_ineff=0.
+ *
+ * Parameters window, backoff_beta, and backoff_factor are crucial for
+ * throughput and delay. Future work is needed to determine better defaults,
+ * and to provide guidelines for use in different environments/contexts.
+ *
+ * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/.
+ * Parameter window is only configurable when loading tcp_cdg as a module.
+ *
+ * Notable differences from paper/FreeBSD:
+ * o Using Hybrid Slow start and Proportional Rate Reduction.
+ * o Add toggle for shadow window mechanism. Suggested by David Hayes.
+ * o Add toggle for non-congestion loss tolerance.
+ * o Scaling parameter G is changed to a backoff factor;
+ * conversion is given by: backoff_factor = 1000/(G * window).
+ * o Limit shadow window to 2 * cwnd, or to cwnd when application limited.
+ * o More accurate e^-x.
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define HYSTART_ACK_TRAIN 1
+#define HYSTART_DELAY 2
+
+static int window __read_mostly = 8;
+static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */
+static unsigned int backoff_factor __read_mostly = 42;
+static unsigned int hystart_detect __read_mostly = 3;
+static unsigned int use_ineff __read_mostly = 5;
+static bool use_shadow __read_mostly = true;
+static bool use_tolerance __read_mostly;
+
+module_param(window, int, 0444);
+MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)");
+module_param(backoff_beta, uint, 0644);
+MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)");
+module_param(backoff_factor, uint, 0644);
+MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor");
+module_param(hystart_detect, uint, 0644);
+MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start "
+ "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)");
+module_param(use_ineff, uint, 0644);
+MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)");
+module_param(use_shadow, bool, 0644);
+MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
+module_param(use_tolerance, bool, 0644);
+MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
+
+struct minmax {
+ union {
+ struct {
+ s32 min;
+ s32 max;
+ };
+ u64 v64;
+ };
+};
+
+enum cdg_state {
+ CDG_UNKNOWN = 0,
+ CDG_NONFULL = 1,
+ CDG_FULL = 2,
+ CDG_BACKOFF = 3,
+};
+
+struct cdg {
+ struct minmax rtt;
+ struct minmax rtt_prev;
+ struct minmax *gradients;
+ struct minmax gsum;
+ bool gfilled;
+ u8 tail;
+ u8 state;
+ u8 delack;
+ u32 rtt_seq;
+ u32 undo_cwnd;
+ u32 shadow_wnd;
+ u16 backoff_cnt;
+ u16 sample_cnt;
+ s32 delay_min;
+ u32 last_ack;
+ u32 round_start;
+};
+
+/**
+ * nexp_u32 - negative base-e exponential
+ * @ux: x in units of micro
+ *
+ * Returns exp(ux * -1e-6) * U32_MAX.
+ */
+static u32 __pure nexp_u32(u32 ux)
+{
+ static const u16 v[] = {
+ /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */
+ 65535,
+ 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422,
+ 61378, 57484, 50423, 38795, 22965, 8047, 987, 14,
+ };
+ u32 msb = ux >> 8;
+ u32 res;
+ int i;
+
+ /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */
+ if (msb > U16_MAX)
+ return 0;
+
+ /* Scale first eight bits linearly: */
+ res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000);
+
+ /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */
+ for (i = 1; msb; i++, msb >>= 1) {
+ u32 y = v[i & -(msb & 1)] + U32_C(1);
+
+ res = ((u64)res * y) >> 16;
+ }
+
+ return res;
+}
+
+/* Based on the HyStart algorithm (by Ha et al.) that is implemented in
+ * tcp_cubic. Differences/experimental changes:
+ * o Using Hayes' delayed ACK filter.
+ * o Using a usec clock for the ACK train.
+ * o Reset ACK train when application limited.
+ * o Invoked at any cwnd (i.e. also when cwnd < 16).
+ * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh).
+ */
+static void tcp_cdg_hystart_update(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min);
+ if (ca->delay_min == 0)
+ return;
+
+ if (hystart_detect & HYSTART_ACK_TRAIN) {
+ u32 now_us = div_u64(local_clock(), NSEC_PER_USEC);
+
+ if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
+ ca->last_ack = now_us;
+ ca->round_start = now_us;
+ } else if (before(now_us, ca->last_ack + 3000)) {
+ u32 base_owd = max(ca->delay_min / 2U, 125U);
+
+ ca->last_ack = now_us;
+ if (after(now_us, ca->round_start + base_owd)) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ return;
+ }
+ }
+ }
+
+ if (hystart_detect & HYSTART_DELAY) {
+ if (ca->sample_cnt < 8) {
+ ca->sample_cnt++;
+ } else {
+ s32 thresh = max(ca->delay_min + ca->delay_min / 8U,
+ 125U);
+
+ if (ca->rtt.min > thresh) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+ }
+ }
+}
+
+static s32 tcp_cdg_grad(struct cdg *ca)
+{
+ s32 gmin = ca->rtt.min - ca->rtt_prev.min;
+ s32 gmax = ca->rtt.max - ca->rtt_prev.max;
+ s32 grad;
+
+ if (ca->gradients) {
+ ca->gsum.min += gmin - ca->gradients[ca->tail].min;
+ ca->gsum.max += gmax - ca->gradients[ca->tail].max;
+ ca->gradients[ca->tail].min = gmin;
+ ca->gradients[ca->tail].max = gmax;
+ ca->tail = (ca->tail + 1) & (window - 1);
+ gmin = ca->gsum.min;
+ gmax = ca->gsum.max;
+ }
+
+ /* We keep sums to ignore gradients during cwnd reductions;
+ * the paper's smoothed gradients otherwise simplify to:
+ * (rtt_latest - rtt_oldest) / window.
+ *
+ * We also drop division by window here.
+ */
+ grad = gmin > 0 ? gmin : gmax;
+
+ /* Extrapolate missing values in gradient window: */
+ if (!ca->gfilled) {
+ if (!ca->gradients && window > 1)
+ grad *= window; /* Memory allocation failed. */
+ else if (ca->tail == 0)
+ ca->gfilled = true;
+ else
+ grad = (grad * window) / (int)ca->tail;
+ }
+
+ /* Backoff was effectual: */
+ if (gmin <= -32 || gmax <= -32)
+ ca->backoff_cnt = 0;
+
+ if (use_tolerance) {
+ /* Reduce small variations to zero: */
+ gmin = DIV_ROUND_CLOSEST(gmin, 64);
+ gmax = DIV_ROUND_CLOSEST(gmax, 64);
+
+ if (gmin > 0 && gmax <= 0)
+ ca->state = CDG_FULL;
+ else if ((gmin > 0 && gmax > 0) || gmax < 0)
+ ca->state = CDG_NONFULL;
+ }
+ return grad;
+}
+
+static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (prandom_u32() <= nexp_u32(grad * backoff_factor))
+ return false;
+
+ if (use_ineff) {
+ ca->backoff_cnt++;
+ if (ca->backoff_cnt > use_ineff)
+ return false;
+ }
+
+ ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd);
+ ca->state = CDG_BACKOFF;
+ tcp_enter_cwr(sk);
+ return true;
+}
+
+/* Not called in CWR or Recovery state. */
+static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_snd_cwnd;
+ u32 incr;
+
+ if (tcp_in_slow_start(tp) && hystart_detect)
+ tcp_cdg_hystart_update(sk);
+
+ if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
+ s32 grad = 0;
+
+ if (ca->rtt_prev.v64)
+ grad = tcp_cdg_grad(ca);
+ ca->rtt_seq = tp->snd_nxt;
+ ca->rtt_prev = ca->rtt;
+ ca->rtt.v64 = 0;
+ ca->last_ack = 0;
+ ca->sample_cnt = 0;
+
+ if (grad > 0 && tcp_cdg_backoff(sk, grad))
+ return;
+ }
+
+ if (!tcp_is_cwnd_limited(sk)) {
+ ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd);
+ return;
+ }
+
+ prior_snd_cwnd = tp->snd_cwnd;
+ tcp_reno_cong_avoid(sk, ack, acked);
+
+ incr = tp->snd_cwnd - prior_snd_cwnd;
+ ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
+}
+
+static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (rtt_us <= 0)
+ return;
+
+ /* A heuristic for filtering delayed ACKs, adapted from:
+ * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support
+ * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
+ */
+ if (tp->sacked_out == 0) {
+ if (num_acked == 1 && ca->delack) {
+ /* A delayed ACK is only used for the minimum if it is
+ * provenly lower than an existing non-zero minimum.
+ */
+ ca->rtt.min = min(ca->rtt.min, rtt_us);
+ ca->delack--;
+ return;
+ } else if (num_acked > 1 && ca->delack < 5) {
+ ca->delack++;
+ }
+ }
+
+ ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us);
+ ca->rtt.max = max(ca->rtt.max, rtt_us);
+}
+
+static u32 tcp_cdg_ssthresh(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->undo_cwnd = tp->snd_cwnd;
+
+ if (ca->state == CDG_BACKOFF)
+ return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10);
+
+ if (ca->state == CDG_NONFULL && use_tolerance)
+ return tp->snd_cwnd;
+
+ ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd);
+ if (use_shadow)
+ return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1);
+ return max(2U, tp->snd_cwnd >> 1);
+}
+
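A worked example of the shadow window interplay in tcp_cdg_ssthresh() above, with use_shadow set: if a loss arrives while snd_cwnd = 20 and shadow_wnd = 36 (the shadow kept growing through earlier delay-based backoffs), shadow_wnd becomes min(36 >> 1, 20) = 18 and ssthresh = max3(2, 18, 20 >> 1) = 18, so the delay backoff and the loss response do not compound into a double reduction.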
+static u32 tcp_cdg_undo_cwnd(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd);
+}
+
+static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct minmax *gradients;
+
+ switch (ev) {
+ case CA_EVENT_CWND_RESTART:
+ gradients = ca->gradients;
+ if (gradients)
+ memset(gradients, 0, window * sizeof(gradients[0]));
+ memset(ca, 0, sizeof(*ca));
+
+ ca->gradients = gradients;
+ ca->rtt_seq = tp->snd_nxt;
+ ca->shadow_wnd = tp->snd_cwnd;
+ break;
+ case CA_EVENT_COMPLETE_CWR:
+ ca->state = CDG_UNKNOWN;
+ ca->rtt_seq = tp->snd_nxt;
+ ca->rtt_prev = ca->rtt;
+ ca->rtt.v64 = 0;
+ break;
+ default:
+ break;
+ }
+}
+
+static void tcp_cdg_init(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* We silently fall back to window = 1 if allocation fails. */
+ if (window > 1)
+ ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
+ GFP_NOWAIT | __GFP_NOWARN);
+ ca->rtt_seq = tp->snd_nxt;
+ ca->shadow_wnd = tp->snd_cwnd;
+}
+
+static void tcp_cdg_release(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+
+ kfree(ca->gradients);
+}
+
+struct tcp_congestion_ops tcp_cdg __read_mostly = {
+ .cong_avoid = tcp_cdg_cong_avoid,
+ .cwnd_event = tcp_cdg_cwnd_event,
+ .pkts_acked = tcp_cdg_acked,
+ .undo_cwnd = tcp_cdg_undo_cwnd,
+ .ssthresh = tcp_cdg_ssthresh,
+ .release = tcp_cdg_release,
+ .init = tcp_cdg_init,
+ .owner = THIS_MODULE,
+ .name = "cdg",
+};
+
+static int __init tcp_cdg_register(void)
+{
+ if (backoff_beta > 1024 || window < 1 || window > 256)
+ return -ERANGE;
+ if (!is_power_of_2(window))
+ return -EINVAL;
+
+ BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_cdg);
+ return 0;
+}
+
+static void __exit tcp_cdg_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_cdg);
+}
+
+module_init(tcp_cdg_register);
+module_exit(tcp_cdg_unregister);
+MODULE_AUTHOR("Kenneth Klette Jonassen");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP CDG");
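Once the module is registered, "cdg" can be selected like any other congestion control: system-wide via the net.ipv4.tcp_congestion_control sysctl, or per socket with the TCP_CONGESTION socket option. A minimal per-socket userspace sketch, assuming the module is loaded:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <string.h>
    #include <sys/socket.h>

    static int use_cdg(int fd)
    {
            /* Ask the kernel to use CDG for this socket only. */
            return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                              "cdg", strlen("cdg"));
    }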
diff --git a/kernel/net/ipv4/tcp_cong.c b/kernel/net/ipv4/tcp_cong.c
index 84be008c9..882caa4e7 100644
--- a/kernel/net/ipv4/tcp_cong.c
+++ b/kernel/net/ipv4/tcp_cong.c
@@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
-u32 tcp_ca_get_key_by_name(const char *name)
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
- u32 key;
+ u32 key = TCP_CA_UNSPEC;
might_sleep();
rcu_read_lock();
ca = __tcp_ca_find_autoload(name);
- key = ca ? ca->key : TCP_CA_UNSPEC;
+ if (ca) {
+ key = ca->key;
+ *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
+ }
rcu_read_unlock();
return key;
@@ -170,6 +173,10 @@ out:
*/
if (ca->get_info)
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ if (ca->flags & TCP_CONG_NEEDS_ECN)
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
}
void tcp_init_congestion_control(struct sock *sk)
@@ -178,6 +185,10 @@ void tcp_init_congestion_control(struct sock *sk)
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
}
static void tcp_reinit_congestion_control(struct sock *sk,
@@ -189,8 +200,8 @@ static void tcp_reinit_congestion_control(struct sock *sk,
icsk->icsk_ca_ops = ca;
icsk->icsk_ca_setsockopt = 1;
- if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
- icsk->icsk_ca_ops->init(sk);
+ if (sk->sk_state != TCP_CLOSE)
+ tcp_init_congestion_control(sk);
}
/* Manage refcounts on socket close. */
@@ -365,10 +376,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
*/
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- u32 cwnd = tp->snd_cwnd + acked;
+ u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
- if (cwnd > tp->snd_ssthresh)
- cwnd = tp->snd_ssthresh + 1;
acked -= cwnd - tp->snd_cwnd;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
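The rewritten tcp_slow_start() now caps growth exactly at snd_ssthresh (instead of ssthresh + 1) and hands any leftover ACKs back to the caller. Worked example: snd_cwnd = 8, snd_ssthresh = 10, acked = 5 gives cwnd = min(8 + 5, 10) = 10 and acked = 5 - (10 - 8) = 3; the remaining 3 ACKs are then consumed by the caller's congestion-avoidance path (see tcp_reno_cong_avoid() below).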
@@ -413,7 +422,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
/* In "safe" area, increase. */
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
diff --git a/kernel/net/ipv4/tcp_cubic.c b/kernel/net/ipv4/tcp_cubic.c
index 06d3d665a..448c2615f 100644
--- a/kernel/net/ipv4/tcp_cubic.c
+++ b/kernel/net/ipv4/tcp_cubic.c
@@ -151,6 +151,27 @@ static void bictcp_init(struct sock *sk)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
+static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_TX_START) {
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 now = tcp_time_stamp;
+ s32 delta;
+
+ delta = now - tcp_sk(sk)->lsndtime;
+
+ /* We were application limited (idle) for a while.
+ * Shift epoch_start to keep cwnd growth to cubic curve.
+ */
+ if (ca->epoch_start && delta > 0) {
+ ca->epoch_start += delta;
+ if (after(ca->epoch_start, now))
+ ca->epoch_start = now;
+ }
+ return;
+ }
+}
+
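Note: cubic evaluates its growth curve at (roughly) t = now - epoch_start, so advancing epoch_start by the idle time keeps the curve where it was when the application stopped sending. For example, after 2 seconds of idle, epoch_start moves forward by 2 s and W(t) continues from the pre-idle point instead of jumping ahead along the cubic curve.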
/* calculate the cubic root of x using a table lookup followed by one
* Newton-Raphson iteration.
* Avg err ~= 0.195%
@@ -320,7 +341,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
acked = tcp_slow_start(tp, acked);
@@ -439,7 +460,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
ca->delay_min = delay;
/* hystart triggers when cwnd is larger than some threshold */
- if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
+ if (hystart && tcp_in_slow_start(tp) &&
tp->snd_cwnd >= hystart_low_window)
hystart_update(sk, delay);
}
@@ -450,6 +471,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.cong_avoid = bictcp_cong_avoid,
.set_state = bictcp_state,
.undo_cwnd = bictcp_undo_cwnd,
+ .cwnd_event = bictcp_cwnd_event,
.pkts_acked = bictcp_acked,
.owner = THIS_MODULE,
.name = "cubic",
diff --git a/kernel/net/ipv4/tcp_dctcp.c b/kernel/net/ipv4/tcp_dctcp.c
index 4c41c1287..7e538f71f 100644
--- a/kernel/net/ipv4/tcp_dctcp.c
+++ b/kernel/net/ipv4/tcp_dctcp.c
@@ -204,20 +204,26 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
/* Expired RTT */
if (!before(tp->snd_una, ca->next_seq)) {
- /* For avoiding denominator == 1. */
- if (ca->acked_bytes_total == 0)
- ca->acked_bytes_total = 1;
+ u64 bytes_ecn = ca->acked_bytes_ecn;
+ u32 alpha = ca->dctcp_alpha;
/* alpha = (1 - g) * alpha + g * F */
- ca->dctcp_alpha = ca->dctcp_alpha -
- (ca->dctcp_alpha >> dctcp_shift_g) +
- (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
- ca->acked_bytes_total;
- if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
- /* Clamp dctcp_alpha to max. */
- ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
+ if (bytes_ecn) {
+ /* If dctcp_shift_g == 1, a 32bit value would overflow
+ * after 8 Mbytes.
+ */
+ bytes_ecn <<= (10 - dctcp_shift_g);
+ do_div(bytes_ecn, max(1U, ca->acked_bytes_total));
+ alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA);
+ }
+ /* dctcp_alpha can be read from dctcp_get_info() without
+ * synchro, so we ask compiler to not use dctcp_alpha
+ * as a temporary variable in prior operations.
+ */
+ WRITE_ONCE(ca->dctcp_alpha, alpha);
dctcp_reset(tp, ca);
}
}
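A worked example of the fixed-point update above (alpha in units of 2^-10, clamped to DCTCP_MAX_ALPHA): with dctcp_shift_g = 4, alpha = 512, acked_bytes_ecn = 30000 and acked_bytes_total = 60000, the decay removes 512 >> 4 = 32 and the ECN term adds (30000 << 6) / 60000 = 32, leaving alpha at 512 -- exactly (1 - 1/16) * 0.5 + (1/16) * 0.5 = 0.5 for F = 1/2. Keeping bytes_ecn in a u64 and using do_div() avoids the 32-bit overflow the comment warns about for dctcp_shift_g == 1.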
diff --git a/kernel/net/ipv4/tcp_diag.c b/kernel/net/ipv4/tcp_diag.c
index 79b34a0f4..b31604086 100644
--- a/kernel/net/ipv4/tcp_diag.c
+++ b/kernel/net/ipv4/tcp_diag.c
@@ -19,13 +19,14 @@
static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
void *_info)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_info *info = _info;
- if (sk->sk_state == TCP_LISTEN) {
+ if (sk_state_load(sk) == TCP_LISTEN) {
r->idiag_rqueue = sk->sk_ack_backlog;
r->idiag_wqueue = sk->sk_max_ack_backlog;
- } else {
+ } else if (sk->sk_type == SOCK_STREAM) {
+ const struct tcp_sock *tp = tcp_sk(sk);
+
r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
r->idiag_wqueue = tp->write_seq - tp->snd_una;
}
@@ -50,6 +51,7 @@ static const struct inet_diag_handler tcp_diag_handler = {
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_type = IPPROTO_TCP,
+ .idiag_info_size = sizeof(struct tcp_info),
};
static int __init tcp_diag_init(void)
diff --git a/kernel/net/ipv4/tcp_fastopen.c b/kernel/net/ipv4/tcp_fastopen.c
index f9c0fb84e..55be6ac70 100644
--- a/kernel/net/ipv4/tcp_fastopen.c
+++ b/kernel/net/ipv4/tcp_fastopen.c
@@ -124,27 +124,29 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
return false;
}
-static bool tcp_fastopen_create_child(struct sock *sk,
- struct sk_buff *skb,
- struct dst_entry *dst,
- struct request_sock *req)
+static struct sock *tcp_fastopen_create_child(struct sock *sk,
+ struct sk_buff *skb,
+ struct dst_entry *dst,
+ struct request_sock *req)
{
struct tcp_sock *tp;
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
struct sock *child;
u32 end_seq;
+ bool own_req;
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ NULL, &own_req);
if (!child)
- return false;
+ return NULL;
- spin_lock(&queue->fastopenq->lock);
- queue->fastopenq->qlen++;
- spin_unlock(&queue->fastopenq->lock);
+ spin_lock(&queue->fastopenq.lock);
+ queue->fastopenq.qlen++;
+ spin_unlock(&queue->fastopenq.lock);
/* Initialize the child socket. Have to fix some values to take
* into account the child is a Fast Open socket and is created
@@ -161,15 +163,13 @@ static bool tcp_fastopen_create_child(struct sock *sk,
tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
/* Activate the retrans timer so that SYNACK can be retransmitted.
- * The request socket is not added to the SYN table of the parent
+ * The request socket is not added to the ehash
* because it's been added to the accept queue directly.
*/
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
- atomic_set(&req->rsk_refcnt, 1);
- /* Add the child socket directly into the accept queue */
- inet_csk_reqsk_queue_add(sk, req, child);
+ atomic_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
inet_csk(child)->icsk_af_ops->rebuild_header(child);
@@ -178,12 +178,10 @@ static bool tcp_fastopen_create_child(struct sock *sk,
tcp_init_metrics(child);
tcp_init_buffer_space(child);
- /* Queue the data carried in the SYN packet. We need to first
- * bump skb's refcnt because the caller will attempt to free it.
- * Note that IPv6 might also have used skb_get() trick
- * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts)
- * So we need to eventually get a clone of the packet,
- * before inserting it in sk_receive_queue.
+ /* Queue the data carried in the SYN packet.
+ * We used to play tricky games with skb_get().
+ * With lockless listener, it is a dead end.
+ * Do not think about it.
*
* XXX (TFO) - we honor a zero-payload TFO request for now,
* (any reason not to?) but no need to queue the skb since
@@ -191,12 +189,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
*/
end_seq = TCP_SKB_CB(skb)->end_seq;
if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
- struct sk_buff *skb2;
-
- if (unlikely(skb_shared(skb)))
- skb2 = skb_clone(skb, GFP_ATOMIC);
- else
- skb2 = skb_get(skb);
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (likely(skb2)) {
skb_dst_drop(skb2);
@@ -214,11 +207,10 @@ static bool tcp_fastopen_create_child(struct sock *sk,
}
}
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
- sk->sk_data_ready(sk);
- bh_unlock_sock(child);
- sock_put(child);
- WARN_ON(!req->sk);
- return true;
+ /* tcp_conn_request() is sending the SYNACK,
+ * and queues the child into listener accept queue.
+ */
+ return child;
}
static bool tcp_fastopen_queue_check(struct sock *sk)
@@ -235,8 +227,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* between qlen overflow causing Fast Open to be disabled
* temporarily vs a server not supporting Fast Open at all.
*/
- fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
- if (!fastopenq || fastopenq->max_qlen == 0)
+ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+ if (fastopenq->max_qlen == 0)
return false;
if (fastopenq->qlen >= fastopenq->max_qlen) {
@@ -261,13 +253,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* may be updated and return the client in the SYN-ACK later. E.g., Fast Open
* cookie request (foc->len == 0).
*/
-bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst)
+struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct dst_entry *dst)
{
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+ struct sock *child;
if (foc->len == 0) /* Client requests a cookie */
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
@@ -276,7 +269,7 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
(syn_data || foc->len >= 0) &&
tcp_fastopen_queue_check(sk))) {
foc->len = -1;
- return false;
+ return NULL;
}
if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
@@ -296,11 +289,12 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
* data in SYN_RECV state.
*/
fastopen:
- if (tcp_fastopen_create_child(sk, skb, dst, req)) {
+ child = tcp_fastopen_create_child(sk, skb, dst, req);
+ if (child) {
foc->len = -1;
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVE);
- return true;
+ return child;
}
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
} else if (foc->len > 0) /* Client presents an invalid cookie */
@@ -308,6 +302,5 @@ fastopen:
valid_foc.exp = foc->exp;
*foc = valid_foc;
- return false;
+ return NULL;
}
-EXPORT_SYMBOL(tcp_try_fastopen);
diff --git a/kernel/net/ipv4/tcp_highspeed.c b/kernel/net/ipv4/tcp_highspeed.c
index 882c08aae..db7842495 100644
--- a/kernel/net/ipv4/tcp_highspeed.c
+++ b/kernel/net/ipv4/tcp_highspeed.c
@@ -116,7 +116,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
/* Update AIMD parameters.
diff --git a/kernel/net/ipv4/tcp_htcp.c b/kernel/net/ipv4/tcp_htcp.c
index 58469fff6..82f0d9ed6 100644
--- a/kernel/net/ipv4/tcp_htcp.c
+++ b/kernel/net/ipv4/tcp_htcp.c
@@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
/* In dangerous area, increase slowly.
diff --git a/kernel/net/ipv4/tcp_hybla.c b/kernel/net/ipv4/tcp_hybla.c
index f963b274f..083831e35 100644
--- a/kernel/net/ipv4/tcp_hybla.c
+++ b/kernel/net/ipv4/tcp_hybla.c
@@ -112,7 +112,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
rho_fractions = ca->rho_3ls - (ca->rho << 3);
- if (tp->snd_cwnd < tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
/*
* slow start
* INC = 2^RHO - 1
diff --git a/kernel/net/ipv4/tcp_illinois.c b/kernel/net/ipv4/tcp_illinois.c
index f71002e4d..2ab9bbb6f 100644
--- a/kernel/net/ipv4/tcp_illinois.c
+++ b/kernel/net/ipv4/tcp_illinois.c
@@ -268,7 +268,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
/* In slow start */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
diff --git a/kernel/net/ipv4/tcp_input.c b/kernel/net/ipv4/tcp_input.c
index c9ab96418..d4c511584 100644
--- a/kernel/net/ipv4/tcp_input.c
+++ b/kernel/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
+int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
int sysctl_tcp_thin_dupack __read_mostly;
@@ -109,6 +110,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
+#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -196,11 +198,13 @@ static void tcp_enter_quickack_mode(struct sock *sk)
* and the session is not interactive.
*/
-static inline bool tcp_in_quickack_mode(const struct sock *sk)
+static bool tcp_in_quickack_mode(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
- return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+ return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
+ (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
}
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
@@ -359,7 +363,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) &&
- !sk_under_memory_pressure(sk)) {
+ !tcp_under_memory_pressure(sk)) {
int incr;
/* Check #2. Increase window, if skb with such overhead
@@ -446,7 +450,7 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !sk_under_memory_pressure(sk) &&
+ !tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
@@ -750,13 +754,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
* TCP pacing, to smooth the burst on large writes when packets
* in flight is significantly lower than cwnd (or rwin)
*/
+int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
+int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
+
static void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
u64 rate;
/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
- rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+ rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+ /* current rate is (cwnd * mss) / srtt
+ * In Slow Start [1], set sk_pacing_rate to 200 % of the current rate.
+ * In Congestion Avoidance phase, set it to 120 % of the current rate.
+ *
+ * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+ * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+ * end of slow start and should slow down.
+ */
+ if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+ rate *= sysctl_tcp_pacing_ss_ratio;
+ else
+ rate *= sysctl_tcp_pacing_ca_ratio;
rate *= max(tp->snd_cwnd, tp->packets_out);
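Combined with the existing division by the smoothed RTT later in this function, this works out to roughly

    sk_pacing_rate ~= (ratio / 100) * mss * max(snd_cwnd, packets_out) / srtt

i.e. about 2x the current sending rate early in slow start (ss_ratio = 200) and 1.2x once snd_cwnd has reached half of ssthresh or the flow is in congestion avoidance (ca_ratio = 120), per the comment above.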
@@ -861,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
if (metric > 0)
tcp_disable_early_retrans(tp);
+ tp->rack.reord = 1;
}
/* This must be called before lost_out is incremented */
@@ -886,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
}
}
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
- struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
@@ -1028,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
return !before(start_seq, end_seq - tp->max_window);
}
-/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "B". Later note: FACK people cheated me again 8), we have to account
- * for reordering! Ugly, but should help.
- *
- * Search retransmitted skbs from write_queue that were sent when snd_nxt was
- * less than what is now known to be received by the other end (derived from
- * highest SACK block). Also calculate the lowest snd_nxt among the remaining
- * retransmitted skbs to avoid some costly processing per ACKs.
- */
-static void tcp_mark_lost_retrans(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt = 0;
- u32 new_low_seq = tp->snd_nxt;
- u32 received_upto = tcp_highest_sack_seq(tp);
-
- if (!tcp_is_fack(tp) || !tp->retrans_out ||
- !after(received_upto, tp->lost_retrans_low) ||
- icsk->icsk_ca_state != TCP_CA_Recovery)
- return;
-
- tcp_for_write_queue(skb, sk) {
- u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
-
- if (skb == tcp_send_head(sk))
- break;
- if (cnt == tp->retrans_out)
- break;
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
- continue;
-
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
- continue;
-
- /* TODO: We would like to get rid of tcp_is_fack(tp) only
- * constraint here (see above) but figuring out that at
- * least tp->reordering SACK blocks reside between ack_seq
- * and received_upto is not easy task to do cheaply with
- * the available datastructures.
- *
- * Whether FACK should check here for tp->reordering segs
- * in-between one could argue for either way (it would be
- * rather simple to implement as we could count fack_count
- * during the walk and do tp->fackets_out - fack_count).
- */
- if (after(received_upto, ack_seq)) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
-
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
- } else {
- if (before(ack_seq, new_low_seq))
- new_low_seq = ack_seq;
- cnt += tcp_skb_pcount(skb);
- }
- }
-
- if (tp->retrans_out)
- tp->lost_retrans_low = new_low_seq;
-}
-
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
u32 prior_snd_una)
@@ -1130,7 +1086,12 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sacktag_state {
int reord;
int fack_count;
- long rtt_us; /* RTT measured by SACKing never-retransmitted data */
+ /* Timestamps for earliest and latest never-retransmitted segment
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
+ * but congestion control should still get an accurate delay signal.
+ */
+ struct skb_mstamp first_sackt;
+ struct skb_mstamp last_sackt;
int flag;
};
@@ -1212,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
+ tcp_rack_advance(tp, xmit_time, sacked);
+
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
* we do not clear RETRANS, believing
@@ -1233,14 +1196,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
state->reord);
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
- /* Pick the earliest sequence sacked for RTT */
- if (state->rtt_us < 0) {
- struct skb_mstamp now;
-
- skb_mstamp_get(&now);
- state->rtt_us = skb_mstamp_us_delta(&now,
- xmit_time);
- }
+ if (state->first_sackt.v64 == 0)
+ state->first_sackt = *xmit_time;
+ state->last_sackt = *xmit_time;
}
if (sacked & TCPCB_LOST) {
@@ -1316,16 +1274,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
* code can come after this skb later on it's better to keep
* setting gso_size to something.
*/
- if (!skb_shinfo(prev)->gso_size) {
- skb_shinfo(prev)->gso_size = mss;
- skb_shinfo(prev)->gso_type = sk->sk_gso_type;
- }
+ if (!TCP_SKB_CB(prev)->tcp_gso_size)
+ TCP_SKB_CB(prev)->tcp_gso_size = mss;
/* CHECKME: To clear or not to clear? Mimics normal skb currently */
- if (tcp_skb_pcount(skb) <= 1) {
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
- }
+ if (tcp_skb_pcount(skb) <= 1)
+ TCP_SKB_CB(skb)->tcp_gso_size = 0;
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1634,7 +1588,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
- u32 prior_snd_una, long *sack_rtt_us)
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1642,7 +1596,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
struct tcp_sack_block sp[TCP_NUM_SACKS];
struct tcp_sack_block *cache;
- struct tcp_sacktag_state state;
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
@@ -1650,9 +1603,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
int i, j;
int first_sack_index;
- state.flag = 0;
- state.reord = tp->packets_out;
- state.rtt_us = -1L;
+ state->flag = 0;
+ state->reord = tp->packets_out;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
@@ -1663,7 +1615,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
if (found_dup_sack)
- state.flag |= FLAG_DSACKING_ACK;
+ state->flag |= FLAG_DSACKING_ACK;
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1728,7 +1680,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
skb = tcp_write_queue_head(sk);
- state.fack_count = 0;
+ state->fack_count = 0;
i = 0;
if (!tp->sacked_out) {
@@ -1762,10 +1714,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
/* Head todo? */
if (before(start_seq, cache->start_seq)) {
- skb = tcp_sacktag_skip(skb, sk, &state,
+ skb = tcp_sacktag_skip(skb, sk, state,
start_seq);
skb = tcp_sacktag_walk(skb, sk, next_dup,
- &state,
+ state,
start_seq,
cache->start_seq,
dup_sack);
@@ -1776,7 +1728,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
goto advance_sp;
skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
- &state,
+ state,
cache->end_seq);
/* ...tail remains todo... */
@@ -1785,12 +1737,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
skb = tcp_highest_sack(sk);
if (!skb)
break;
- state.fack_count = tp->fackets_out;
+ state->fack_count = tp->fackets_out;
cache++;
goto walk;
}
- skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
+ skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
/* Check overlap against next cached too (past this one already) */
cache++;
continue;
@@ -1800,12 +1752,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
skb = tcp_highest_sack(sk);
if (!skb)
break;
- state.fack_count = tp->fackets_out;
+ state->fack_count = tp->fackets_out;
}
- skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
+ skb = tcp_sacktag_skip(skb, sk, state, start_seq);
walk:
- skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
+ skb = tcp_sacktag_walk(skb, sk, next_dup, state,
start_seq, end_seq, dup_sack);
advance_sp:
@@ -1820,11 +1772,10 @@ advance_sp:
for (j = 0; j < used_sacks; j++)
tp->recv_sack_cache[i++] = sp[j];
- if ((state.reord < tp->fackets_out) &&
+ if ((state->reord < tp->fackets_out) &&
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
- tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
+ tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
- tcp_mark_lost_retrans(sk);
tcp_verify_left_out(tp);
out:
@@ -1834,8 +1785,7 @@ out:
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
- *sack_rtt_us = state.rtt_us;
- return state.flag;
+ return state->flag;
}
/* Limits sacked_out so that sum with lost_out isn't ever larger than
@@ -1924,14 +1874,13 @@ void tcp_enter_loss(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- bool new_recovery = false;
+ bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
- new_recovery = true;
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2255,7 +2204,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
(oldcnt >= packets))
break;
- mss = skb_shinfo(skb)->gso_size;
+ mss = tcp_skb_mss(skb);
err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
mss, GFP_ATOMIC);
if (err < 0)
@@ -2303,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
+static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
+{
+ return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ before(tp->rx_opt.rcv_tsecr, when);
+}
+
+/* An skb was spuriously retransmitted if the returned timestamp echo
+ * reply predates the skb's transmission time
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
+ tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+}
+
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
- (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
+ tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
}
/* Undo procedures. */
@@ -2482,15 +2446,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
return false;
}
-/* The cwnd reduction in CWR and Recovery use the PRR algorithm
- * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
* It computes the number of packets to send (sndcnt) based on packets newly
* delivered:
* 1) If the packets in flight is larger than ssthresh, PRR spreads the
* cwnd reductions across a full RTT.
- * 2) If packets in flight is lower than ssthresh (such as due to excess
- * losses and/or application stalls), do not perform any further cwnd
- * reductions, but instead slow start up to ssthresh.
+ * 2) Otherwise PRR uses packet conservation to send as much as delivered.
+ * But when the retransmits are acked without further losses, PRR
+ * slow starts cwnd up to ssthresh to speed up the recovery.
*/
static void tcp_init_cwnd_reduction(struct sock *sk)
{
@@ -2507,7 +2470,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
}
static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
- int fast_rexmit)
+ int fast_rexmit, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2515,17 +2478,22 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
int newly_acked_sacked = prior_unsacked -
(tp->packets_out - tp->sacked_out);
+ if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
+ return;
+
tp->prr_delivered += newly_acked_sacked;
- if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+ if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else {
+ } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
+ !(flag & FLAG_LOST_RETRANS)) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
+ } else {
+ sndcnt = min(delta, newly_acked_sacked);
}
-
sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
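A worked example of the proportional branch above (delta < 0, i.e. more packets in flight than ssthresh): with prior_cwnd = 20, snd_ssthresh = 10, prr_delivered = 4 and prr_out = 1, sndcnt = (10 * 4 + 20 - 1) / 20 - 1 = 1, so roughly one new segment may go out for every two delivered and cwnd converges toward ssthresh over about one RTT, as RFC 6937 intends. The two new branches only differ in how cwnd is rebuilt once losses stop: slow start toward ssthresh when retransmits are acked without further losses, plain packet conservation otherwise.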
@@ -2555,6 +2523,7 @@ void tcp_enter_cwr(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
+EXPORT_SYMBOL(tcp_enter_cwr);
static void tcp_try_keep_open(struct sock *sk)
{
@@ -2585,7 +2554,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
} else {
- tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
}
}
@@ -2595,6 +2564,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
static void tcp_mtup_probe_success(struct sock *sk)
@@ -2614,6 +2584,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
/* Do a simple retransmit without using the backoff mechanisms in
@@ -2682,7 +2653,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!tcp_in_cwnd_reduction(sk)) {
if (!ece_ack)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tcp_init_cwnd_reduction(sk);
@@ -2742,7 +2713,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
/* Undo during fast recovery after partial ACK. */
static bool tcp_try_undo_partial(struct sock *sk, const int acked,
- const int prior_unsacked)
+ const int prior_unsacked, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2758,7 +2729,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* mark more packets lost or retransmit more.
*/
if (tp->retrans_out) {
- tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
return true;
}
@@ -2838,6 +2809,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
}
+ /* Use RACK to detect loss */
+ if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+ tcp_rack_mark_lost(sk))
+ flag |= FLAG_LOST_RETRANS;
+
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2845,7 +2821,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
- if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+ if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
@@ -2858,9 +2834,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack);
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state != TCP_CA_Open &&
+ !(flag & FLAG_LOST_RETRANS))
return;
- /* Fall through to processing in Open state. */
+ /* Change state if cwnd is undone or retransmits are lost */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2895,12 +2872,73 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
+ tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
tcp_xmit_retransmit_queue(sk);
}
+/* Kathleen Nichols' algorithm for tracking the minimum value of
+ * a data stream over some fixed time interval. (E.g., the minimum
+ * RTT over the past five minutes.) It uses constant space and constant
+ * time per update yet almost always delivers the same minimum as an
+ * implementation that has to keep all the data in the window.
+ *
+ * The algorithm keeps track of the best, 2nd best & 3rd best min
+ * values, maintaining an invariant that the measurement time of the
+ * n'th best >= n-1'th best. It also makes sure that the three values
+ * are widely separated in the time window since that bounds the worst
+ * case error when the data is monotonically increasing over the window.
+ *
+ * Upon getting a new min, we can forget everything earlier because it
+ * has no value - the new min is <= everything else in the window by
+ * definition and it's the most recent. So we restart fresh on every new min
+ * and overwrite the 2nd & 3rd choices. The same property holds for the 2nd & 3rd
+ * best.
+ */
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+{
+ const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
+ struct rtt_meas *m = tcp_sk(sk)->rtt_min;
+ struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
+ u32 elapsed;
+
+ /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
+ if (unlikely(rttm.rtt <= m[0].rtt))
+ m[0] = m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[1].rtt)
+ m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[2].rtt)
+ m[2] = rttm;
+
+ elapsed = now - m[0].ts;
+ if (unlikely(elapsed > wlen)) {
+ /* Passed entire window without a new min so make 2nd choice
+ * the new min & 3rd choice the new 2nd. So forth and so on.
+ */
+ m[0] = m[1];
+ m[1] = m[2];
+ m[2] = rttm;
+ if (now - m[0].ts > wlen) {
+ m[0] = m[1];
+ m[1] = rttm;
+ if (now - m[0].ts > wlen)
+ m[0] = rttm;
+ }
+ } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
+ /* Passed a quarter of the window without a new min so
+ * take 2nd choice from the 2nd quarter of the window.
+ */
+ m[2] = m[1] = rttm;
+ } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
+ /* Passed half the window without a new min so take the 3rd
+ * choice from the last half of the window.
+ */
+ m[2] = rttm;
+ }
+}
+
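For instance, with the default sysctl_tcp_min_rtt_wlen of 300 seconds: a sample lower than the current minimum resets all three slots to it, while a 40 ms sample seen when the 30 ms minimum (and its copies in slots 1 and 2) is already 90 s old takes the elapsed > wlen/4 branch and becomes the 2nd and 3rd choice, so that when the 30 ms entry finally ages out of the window a reasonably fresh value is promoted rather than nothing at all.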
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
- long seq_rtt_us, long sack_rtt_us)
+ long seq_rtt_us, long sack_rtt_us,
+ long ca_rtt_us)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2909,9 +2947,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* Karn's algorithm forbids taking RTT if some retransmitted data
* is acked (RFC6298).
*/
- if (flag & FLAG_RETRANS_DATA_ACKED)
- seq_rtt_us = -1L;
-
if (seq_rtt_us < 0)
seq_rtt_us = sack_rtt_us;
@@ -2923,11 +2958,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED)
- seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-
+ seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
+ tp->rx_opt.rcv_tsecr);
if (seq_rtt_us < 0)
return false;
+ /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
+ * always taken together with ACK, SACK, or TS-opts. Any negative
+ * values will be skipped with the seq_rtt_us < 0 check above.
+ */
+ tcp_update_rtt_min(sk, ca_rtt_us);
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
@@ -2937,21 +2977,21 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
}
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
+void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
- struct tcp_sock *tp = tcp_sk(sk);
- long seq_rtt_us = -1L;
+ long rtt_us = -1L;
- if (synack_stamp && !tp->total_retrans)
- seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
+ if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) {
+ struct skb_mstamp now;
- /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
- * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
- */
- if (!tp->srtt_us)
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
+ skb_mstamp_get(&now);
+ rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
+ }
+
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
}
+
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3055,7 +3095,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una, long sack_rtt_us)
+ u32 prior_snd_una,
+ struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt, now;
@@ -3063,8 +3104,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
bool fully_acked = true;
- long ca_seq_rtt_us = -1L;
+ long sack_rtt_us = -1L;
long seq_rtt_us = -1L;
+ long ca_rtt_us = -1L;
struct sk_buff *skb;
u32 pkts_acked = 0;
bool rtt_update;
@@ -3113,6 +3155,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount;
+ else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
+ tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3151,17 +3195,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_SACK_RENEGING;
skb_mstamp_get(&now);
- if (likely(first_ackt.v64)) {
+ if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
- ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ }
+ if (sack->first_sackt.v64) {
+ sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
+ ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
}
- rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
+ ca_rtt_us);
if (flag & FLAG_ACKED) {
- const struct tcp_congestion_ops *ca_ops
- = inet_csk(sk)->icsk_ca_ops;
-
tcp_rearm_rto(sk);
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
@@ -3184,11 +3230,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
- if (ca_ops->pkts_acked) {
- long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
- ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
- }
-
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
@@ -3198,6 +3239,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_rearm_rto(sk);
}
+ if (icsk->icsk_ca_ops->pkts_acked)
+ icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
@@ -3238,7 +3282,7 @@ static void tcp_ack_probe(struct sock *sk)
* This function is not for random using!
*/
} else {
- unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+ unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
when, TCP_RTO_MAX);
@@ -3331,6 +3375,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
tp->pred_flags = 0;
tcp_fast_path_check(sk);
+ if (tcp_send_head(sk))
+ tcp_slow_start_after_idle_check(sk);
+
if (nwin > tp->max_window) {
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
@@ -3466,6 +3513,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sacktag_state sack_state;
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -3474,7 +3522,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
- long sack_rtt_us = -1L;
+
+ sack_state.first_sackt.v64 = 0;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
@@ -3538,7 +3587,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt_us);
+ &sack_state);
if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
flag |= FLAG_ECE;
@@ -3563,13 +3612,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
acked = tp->packets_out;
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
- sack_rtt_us);
+ &sack_state);
acked -= tp->packets_out;
- /* Advance cwnd if state allows */
- if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, acked);
-
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, prior_unsacked,
@@ -3578,6 +3623,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, acked);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
@@ -3615,7 +3664,7 @@ old_ack:
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt_us);
+ &sack_state);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
}
@@ -3951,7 +4000,6 @@ void tcp_reset(struct sock *sk)
static void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- const struct dst_entry *dst;
inet_csk_schedule_ack(sk);
@@ -3963,9 +4011,7 @@ static void tcp_fin(struct sock *sk)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- dst = __sk_dst_get(sk);
- if (!dst || !dst_metric(dst, RTAX_QUICKACK))
- inet_csk(sk)->icsk_ack.pingpong = 1;
+ inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
@@ -4438,19 +4484,34 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_buff *skb;
+ int err = -ENOMEM;
+ int data_len = 0;
bool fragstolen;
if (size == 0)
return 0;
- skb = alloc_skb(size, sk->sk_allocation);
+ if (size > PAGE_SIZE) {
+ int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
+
+ data_len = npages << PAGE_SHIFT;
+ size = data_len + (size & ~PAGE_MASK);
+ }
+ skb = alloc_skb_with_frags(size - data_len, data_len,
+ PAGE_ALLOC_COSTLY_ORDER,
+ &err, sk->sk_allocation);
if (!skb)
goto err;
+ skb_put(skb, size - data_len);
+ skb->data_len = data_len;
+ skb->len = size;
+
if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto err_free;
- if (memcpy_from_msg(skb_put(skb, size), msg, size))
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+ if (err)
goto err_free;
TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
@@ -4466,7 +4527,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
err_free:
kfree_skb(skb);
err:
- return -ENOMEM;
+ return err;
+
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
@@ -4514,10 +4576,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
- if (eaten < 0 &&
- tcp_try_rmem_schedule(sk, skb, skb->truesize))
- goto drop;
-
+ if (eaten < 0) {
+ if (skb_queue_len(&sk->sk_receive_queue) == 0)
+ sk_forced_mem_schedule(sk, skb->truesize);
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+ }
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
@@ -4788,7 +4852,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
- else if (sk_under_memory_pressure(sk))
+ else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
@@ -4832,7 +4896,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return false;
/* If we are under global TCP memory pressure, do not expand. */
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
return false;
/* If we are under soft global TCP memory pressure, do not expand. */
@@ -5451,7 +5515,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
}
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -5622,6 +5686,7 @@ discard:
}
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->copied_seq = tp->rcv_nxt;
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -5677,15 +5742,14 @@ reset_and_undo:
* address independent.
*/
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
int queued = 0;
bool acceptable;
- u32 synack_stamp;
tp->rx_opt.saw_tstamp = 0;
@@ -5729,7 +5793,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
case TCP_SYN_SENT:
- queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
@@ -5764,15 +5828,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (!acceptable)
return 1;
+ if (!tp->srtt_us)
+ tcp_synack_rtt_meas(sk, req);
+
/* Once we leave TCP_SYN_RECV, we no longer need req
* so release it.
*/
if (req) {
- synack_stamp = tcp_rsk(req)->snt_synack;
tp->total_retrans = req->num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
- synack_stamp = tp->lsndtime;
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_congestion_control(sk);
@@ -5795,7 +5860,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_synack_rtt_meas(sk, synack_stamp);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -5982,14 +6046,17 @@ static void tcp_ecn_create_request(struct request_sock *req,
const struct net *net = sock_net(listen_sk);
bool th_ecn = th->ece && th->cwr;
bool ect, ecn_ok;
+ u32 ecn_ok_dst;
if (!th_ecn)
return;
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
- ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
+ ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
+ ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
- if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
+ if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+ (ecn_ok_dst & DST_FEATURE_ECN_CA))
inet_rsk(req)->ecn_ok = 1;
}
@@ -5999,11 +6066,11 @@ static void tcp_openreq_init(struct request_sock *req,
{
struct inet_request_sock *ireq = inet_rsk(req);
- req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ skb_mstamp_get(&tcp_rsk(req)->snt_synack);
tcp_rsk(req)->last_oow_ack_time = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
@@ -6019,9 +6086,11 @@ static void tcp_openreq_init(struct request_sock *req,
}
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
- struct sock *sk_listener)
+ struct sock *sk_listener,
+ bool attach_listener)
{
- struct request_sock *req = reqsk_alloc(ops, sk_listener);
+ struct request_sock *req = reqsk_alloc(ops, sk_listener,
+ attach_listener);
if (req) {
struct inet_request_sock *ireq = inet_rsk(req);
@@ -6041,13 +6110,13 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(struct sock *sk,
+static bool tcp_syn_flood_action(const struct sock *sk,
const struct sk_buff *skb,
const char *proto)
{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
bool want_cookie = false;
- struct listen_sock *lopt;
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
@@ -6058,29 +6127,45 @@ static bool tcp_syn_flood_action(struct sock *sk,
#endif
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
- if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
- lopt->synflood_warned = 1;
+ if (!queue->synflood_warned &&
+ sysctl_tcp_syncookies != 2 &&
+ xchg(&queue->synflood_warned, 1) == 0)
pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg);
- }
+
return want_cookie;
}
+static void tcp_reqsk_record_syn(const struct sock *sk,
+ struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ if (tcp_sk(sk)->save_syn) {
+ u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
+ u32 *copy;
+
+ copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
+ if (copy) {
+ copy[0] = len;
+ memcpy(&copy[1], skb_network_header(skb), len);
+ req->saved_syn = copy;
+ }
+ }
+}
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
{
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
- struct request_sock *req;
struct tcp_sock *tp = tcp_sk(sk);
+ struct sock *fastopen_sk = NULL;
struct dst_entry *dst = NULL;
- __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
- bool want_cookie = false, fastopen;
+ struct request_sock *req;
+ bool want_cookie = false;
struct flowi fl;
- struct tcp_fastopen_cookie foc = { .len = -1 };
- int err;
-
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
@@ -6104,7 +6189,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
- req = inet_reqsk_alloc(rsk_ops, sk);
+ req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if (!req)
goto drop;
@@ -6187,19 +6272,30 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
tcp_rsk(req)->snt_isn = isn;
+ tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
- fastopen = !want_cookie &&
- tcp_try_fastopen(sk, skb, req, &foc, dst);
- err = af_ops->send_synack(sk, dst, &fl, req,
- skb_get_queue_mapping(skb), &foc);
- if (!fastopen) {
- if (err || want_cookie)
- goto drop_and_free;
-
+ if (!want_cookie) {
+ tcp_reqsk_record_syn(sk, req, skb);
+ fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
+ }
+ if (fastopen_sk) {
+ af_ops->send_synack(fastopen_sk, dst, &fl, req,
+ &foc, false);
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(fastopen_sk);
+ sock_put(fastopen_sk);
+ } else {
tcp_rsk(req)->tfo_listener = false;
- af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ if (!want_cookie)
+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ af_ops->send_synack(sk, dst, &fl, req,
+ &foc, !want_cookie);
+ if (want_cookie)
+ goto drop_and_free;
}
-
+ reqsk_put(req);
return 0;
drop_and_release:
diff --git a/kernel/net/ipv4/tcp_ipv4.c b/kernel/net/ipv4/tcp_ipv4.c
index 441ca6f38..8c7e63163 100644
--- a/kernel/net/ipv4/tcp_ipv4.c
+++ b/kernel/net/ipv4/tcp_ipv4.c
@@ -222,7 +222,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err)
goto failure;
- inet_set_txhash(sk);
+ sk_set_txhash(sk);
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
inet->inet_sport, inet->inet_dport, sk);
@@ -312,7 +312,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
-void tcp_req_err(struct sock *sk, u32 seq)
+void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
struct request_sock *req = inet_reqsk(sk);
struct net *net = sock_net(sk);
@@ -324,17 +324,17 @@ void tcp_req_err(struct sock *sk, u32 seq)
if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
- reqsk_put(req);
- } else {
+ } else if (abort) {
/*
* Still in SYN_RECV, just remove it silently.
* There is no good way to pass the error to the newly
* created socket, and POSIX does not want network
* errors returned from accept().
*/
- NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
}
+ reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
@@ -384,7 +384,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
seq = ntohl(th->seq);
if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq);
+ return tcp_req_err(sk, seq,
+ type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -576,7 +581,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
* Exception: precedence violation. We do not implement it in any case.
*/
-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
@@ -705,7 +710,8 @@ release_sk1:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+static void tcp_v4_send_ack(struct net *net,
+ struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
@@ -720,7 +726,6 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
];
} rep;
struct ip_reply_arg arg;
- struct net *net = dev_net(skb_dst(skb)->dev);
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -782,7 +787,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcp_v4_send_ack(sock_net(sk), skb,
+ tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
@@ -795,15 +801,17 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
inet_twsk_put(tw);
}
-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
- tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
- tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
+ tcp_sk(sk)->snd_nxt;
+
+ tcp_v4_send_ack(sock_net(sk), skb, seq,
+ tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp,
req->ts_recent,
0,
@@ -818,11 +826,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
* This still operates on a request_sock only, not on a big
* socket.
*/
-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
- u16 queue_mapping,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ bool attach_req)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -833,12 +841,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc);
+ skb = tcp_make_synack(sk, dst, req, foc, attach_req);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
- skb_set_queue_mapping(skb, queue_mapping);
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
ireq->opt);
@@ -865,7 +872,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
*/
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
+struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
const union tcp_md5_addr *addr,
int family)
{
@@ -877,7 +884,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
/* caller either holds rcu_read_lock() or socket lock */
md5sig = rcu_dereference_check(tp->md5sig_info,
sock_owned_by_user(sk) ||
- lockdep_is_held(&sk->sk_lock.slock));
+ lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
if (!md5sig)
return NULL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -894,7 +901,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
-struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
+struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
const union tcp_md5_addr *addr;
@@ -922,7 +929,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
}
md5sig = rcu_dereference_protected(tp->md5sig_info,
- sock_owned_by_user(sk));
+ sock_owned_by_user(sk) ||
+ lockdep_is_held(&sk->sk_lock.slock));
if (!md5sig) {
md5sig = kmalloc(sizeof(*md5sig), gfp);
if (!md5sig)
@@ -1112,10 +1120,13 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+#endif
+
/* Called with rcu_read_lock() */
-static bool tcp_v4_inbound_md5_hash(struct sock *sk,
+static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
const struct sk_buff *skb)
{
+#ifdef CONFIG_TCP_MD5SIG
/*
* This gets called for each TCP segment that arrives
* so we want to be efficient.
@@ -1165,10 +1176,12 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk,
return true;
}
return false;
-}
#endif
+ return false;
+}
-static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
+static void tcp_v4_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
@@ -1179,7 +1192,8 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
ireq->opt = tcp_v4_save_options(skb);
}
-static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
+static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+ struct flowi *fl,
const struct request_sock *req,
bool *strict)
{
@@ -1218,7 +1232,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.route_req = tcp_v4_route_req,
.init_seq = tcp_v4_init_sequence,
.send_synack = tcp_v4_send_synack,
- .queue_hash_add = inet_csk_reqsk_queue_hash_add,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -1241,9 +1254,11 @@ EXPORT_SYMBOL(tcp_v4_conn_request);
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*/
-struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst)
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
@@ -1277,7 +1292,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->mc_ttl = ip_hdr(skb)->ttl;
newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- inet_set_txhash(newsk);
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = newtp->write_seq ^ jiffies;
@@ -1320,7 +1334,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
- __inet_hash_nolisten(newsk, NULL);
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ if (*own_req)
+ tcp_move_syn(newtp, req);
return newsk;
@@ -1338,34 +1354,11 @@ put_and_exit:
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
+#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
- const struct iphdr *iph = ip_hdr(skb);
- struct request_sock *req;
- struct sock *nsk;
- req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
- if (req) {
- nsk = tcp_check_req(sk, skb, req, false);
- if (!nsk || nsk == sk)
- reqsk_put(req);
- return nsk;
- }
-
- nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
- th->source, iph->daddr, th->dest, inet_iif(skb));
-
- if (nsk) {
- if (nsk->sk_state != TCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-
-#ifdef CONFIG_SYN_COOKIES
if (!th->syn)
sk = cookie_v4_check(sk, skb);
#endif
@@ -1373,7 +1366,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
}
/* The socket must have its spinlock held when we get
- * here.
+ * here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
@@ -1400,17 +1393,17 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
}
- if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+ if (tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
- struct sock *nsk = tcp_v4_hnd_req(sk, skb);
+ struct sock *nsk = tcp_v4_cookie_check(sk, skb);
+
if (!nsk)
goto discard;
-
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
- sk_mark_napi_id(sk, skb);
+ sk_mark_napi_id(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
@@ -1420,7 +1413,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
} else
sock_rps_save_rxhash(sk, skb);
- if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
+ if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
@@ -1508,7 +1501,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
if (likely(sk->sk_rx_dst))
skb_dst_drop(skb);
else
- skb_dst_force(skb);
+ skb_dst_force_safe(skb);
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
@@ -1590,6 +1583,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
+lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
if (!sk)
goto no_tcp_socket;
@@ -1598,6 +1592,35 @@ process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ struct sock *nsk;
+
+ sk = req->rsk_listener;
+ if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+ reqsk_put(req);
+ goto discard_it;
+ }
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sock_hold(sk);
+ nsk = tcp_check_req(sk, skb, req, false);
+ if (!nsk) {
+ reqsk_put(req);
+ goto discard_and_relse;
+ }
+ if (nsk == sk) {
+ reqsk_put(req);
+ } else if (tcp_child_process(sk, nsk, skb)) {
+ tcp_v4_send_reset(nsk, skb);
+ goto discard_and_relse;
+ } else {
+ sock_put(sk);
+ return 0;
+ }
+ }
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
@@ -1606,26 +1629,25 @@ process:
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * We really want to reject the packet as early as possible
- * if:
- * o We're expecting an MD5'd packet and this is no MD5 tcp option
- * o There is an MD5 option and we're not expecting one
- */
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard_and_relse;
-#endif
nf_reset(skb);
if (sk_filter(sk, skb))
goto discard_and_relse;
- sk_incoming_cpu_update(sk);
skb->dev = NULL;
+ if (sk->sk_state == TCP_LISTEN) {
+ ret = tcp_v4_do_rcv(sk, skb);
+ goto put_and_return;
+ }
+
+ sk_incoming_cpu_update(sk);
+
bh_lock_sock_nested(sk);
+ tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
@@ -1638,6 +1660,7 @@ process:
}
bh_unlock_sock(sk);
+put_and_return:
sock_put(sk);
return ret;
@@ -1646,7 +1669,7 @@ no_tcp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
- if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+ if (tcp_checksum_complete(skb)) {
csum_error:
TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
@@ -1670,10 +1693,6 @@ do_time_wait:
goto discard_it;
}
- if (skb->len < (th->doff << 2)) {
- inet_twsk_put(inet_twsk(sk));
- goto bad_packet;
- }
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
@@ -1686,8 +1705,7 @@ do_time_wait:
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
goto process;
}
@@ -1713,8 +1731,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- if (dst) {
- dst_hold(dst);
+ if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
@@ -1802,6 +1819,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
/* If socket is aborted during connect operation */
tcp_free_fastopen_req(tp);
+ tcp_saved_syn_free(tp);
sk_sockets_allocated_dec(sk);
sock_release_memcg(sk);
@@ -1836,35 +1854,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- if (st->state == TCP_SEQ_STATE_OPENREQ) {
- struct request_sock *req = cur;
-
- icsk = inet_csk(st->syn_wait_sk);
- req = req->dl_next;
- while (1) {
- while (req) {
- if (req->rsk_ops->family == st->family) {
- cur = req;
- goto out;
- }
- req = req->dl_next;
- }
- if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
- break;
-get_req:
- req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
- }
- sk = sk_nulls_next(st->syn_wait_sk);
- st->state = TCP_SEQ_STATE_LISTENING;
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- } else {
- icsk = inet_csk(sk);
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue))
- goto start_req;
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- sk = sk_nulls_next(sk);
- }
+ sk = sk_nulls_next(sk);
get_sk:
sk_nulls_for_each_from(sk, node) {
if (!net_eq(sock_net(sk), net))
@@ -1874,16 +1864,6 @@ get_sk:
goto out;
}
icsk = inet_csk(sk);
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
-start_req:
- st->uid = sock_i_uid(sk);
- st->syn_wait_sk = sk;
- st->state = TCP_SEQ_STATE_OPENREQ;
- st->sbucket = 0;
- goto get_req;
- }
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
spin_unlock_bh(&ilb->lock);
st->offset = 0;
@@ -2015,7 +1995,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
void *rc = NULL;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
if (st->bucket >= INET_LHTABLE_SIZE)
break;
@@ -2074,7 +2053,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
@@ -2099,11 +2077,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
struct tcp_iter_state *st = seq->private;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
- if (v) {
- struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- }
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
@@ -2157,7 +2130,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
- struct seq_file *f, int i, kuid_t uid)
+ struct seq_file *f, int i)
{
const struct inet_request_sock *ireq = inet_rsk(req);
long delta = req->rsk_timer.expires - jiffies;
@@ -2174,7 +2147,8 @@ static void get_openreq4(const struct request_sock *req,
1, /* timers active (only the expire timer) */
jiffies_delta_to_clock_t(delta),
req->num_timeout,
- from_kuid_munged(seq_user_ns(f), uid),
+ from_kuid_munged(seq_user_ns(f),
+ sock_i_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0,
@@ -2188,12 +2162,13 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct inet_sock *inet = inet_sk(sk);
- struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+ const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
__be32 dest = inet->inet_daddr;
__be32 src = inet->inet_rcv_saddr;
__u16 destp = ntohs(inet->inet_dport);
__u16 srcp = ntohs(inet->inet_sport);
int rx_queue;
+ int state;
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
@@ -2211,17 +2186,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
timer_expires = jiffies;
}
- if (sk->sk_state == TCP_LISTEN)
+ state = sk_state_load(sk);
+ if (state == TCP_LISTEN)
rx_queue = sk->sk_ack_backlog;
else
- /*
- * because we dont lock socket, we might find a transient negative value
+ /* Because we don't lock the socket,
+ * we might find a transient negative value.
*/
rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
- i, src, srcp, dest, destp, sk->sk_state,
+ i, src, srcp, dest, destp, state,
tp->write_seq - tp->snd_una,
rx_queue,
timer_active,
@@ -2235,8 +2211,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
- sk->sk_state == TCP_LISTEN ?
- (fastopenq ? fastopenq->max_qlen : 0) :
+ state == TCP_LISTEN ?
+ fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
@@ -2275,18 +2251,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
}
st = seq->private;
- switch (st->state) {
- case TCP_SEQ_STATE_LISTENING:
- case TCP_SEQ_STATE_ESTABLISHED:
- if (sk->sk_state == TCP_TIME_WAIT)
- get_timewait4_sock(v, seq, st->num);
- else
- get_tcp4_sock(v, seq, st->num);
- break;
- case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(v, seq, st->num, st->uid);
- break;
- }
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait4_sock(v, seq, st->num);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ get_openreq4(v, seq, st->num);
+ else
+ get_tcp4_sock(v, seq, st->num);
out:
seq_pad(seq, '\n');
return 0;
@@ -2410,12 +2380,15 @@ static int __net_init tcp_sk_init(struct net *net)
goto fail;
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
}
+
net->ipv4.sysctl_tcp_ecn = 2;
+ net->ipv4.sysctl_tcp_ecn_fallback = 1;
+
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
- return 0;
+ return 0;
fail:
tcp_sk_exit(net);
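
The tcp_v4_err() hunk above only asks tcp_req_err() to abort a TCP_NEW_SYN_RECV request for a subset of ICMP errors. Read as a standalone predicate, the condition looks like the sketch below; the ICMP_* constants are the standard values from <linux/icmp.h>, the helper name is made up for illustration:

    #include <linux/icmp.h>
    #include <stdbool.h>

    /* Sketch of the abort decision passed to tcp_req_err() by tcp_v4_err():
     * hard errors (parameter problem, TTL exceeded, net/host unreachable)
     * drop the request socket, all other ICMP errors leave it in place.
     */
    static bool icmp_aborts_new_syn_recv(int type, int code)
    {
        return type == ICMP_PARAMETERPROB ||
               type == ICMP_TIME_EXCEEDED ||
               (type == ICMP_DEST_UNREACH &&
                (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH));
    }
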
diff --git a/kernel/net/ipv4/tcp_metrics.c b/kernel/net/ipv4/tcp_metrics.c
index a51d63a43..c8cbc2b4b 100644
--- a/kernel/net/ipv4/tcp_metrics.c
+++ b/kernel/net/ipv4/tcp_metrics.c
@@ -81,11 +81,7 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
static bool addr_same(const struct inetpeer_addr *a,
const struct inetpeer_addr *b)
{
- if (a->family != b->family)
- return false;
- if (a->family == AF_INET)
- return a->addr.a4 == b->addr.a4;
- return ipv6_addr_equal(&a->addr.in6, &b->addr.in6);
+ return inetpeer_addr_cmp(a, b) == 0;
}
struct tcpm_hash_bucket {
@@ -247,14 +243,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
daddr.family = req->rsk_ops->family;
switch (daddr.family) {
case AF_INET:
- saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
- daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr);
+ inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr);
+ hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr);
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
- daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr;
+ inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr);
+ inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr);
hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
break;
#endif
@@ -285,25 +281,19 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
struct net *net;
if (tw->tw_family == AF_INET) {
- saddr.family = AF_INET;
- saddr.addr.a4 = tw->tw_rcv_saddr;
- daddr.family = AF_INET;
- daddr.addr.a4 = tw->tw_daddr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
+ inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
+ hash = ipv4_addr_hash(tw->tw_daddr);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (tw->tw_family == AF_INET6) {
if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
- saddr.family = AF_INET;
- saddr.addr.a4 = tw->tw_rcv_saddr;
- daddr.family = AF_INET;
- daddr.addr.a4 = tw->tw_daddr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
+ inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
+ hash = ipv4_addr_hash(tw->tw_daddr);
} else {
- saddr.family = AF_INET6;
- saddr.addr.in6 = tw->tw_v6_rcv_saddr;
- daddr.family = AF_INET6;
- daddr.addr.in6 = tw->tw_v6_daddr;
+ inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr);
+ inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr);
hash = ipv6_addr_hash(&tw->tw_v6_daddr);
}
}
@@ -335,25 +325,19 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
struct net *net;
if (sk->sk_family == AF_INET) {
- saddr.family = AF_INET;
- saddr.addr.a4 = inet_sk(sk)->inet_saddr;
- daddr.family = AF_INET;
- daddr.addr.a4 = inet_sk(sk)->inet_daddr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+ inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
+ hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
- saddr.family = AF_INET;
- saddr.addr.a4 = inet_sk(sk)->inet_saddr;
- daddr.family = AF_INET;
- daddr.addr.a4 = inet_sk(sk)->inet_daddr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+ inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
+ hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
} else {
- saddr.family = AF_INET6;
- saddr.addr.in6 = sk->sk_v6_rcv_saddr;
- daddr.family = AF_INET6;
- daddr.addr.in6 = sk->sk_v6_daddr;
+ inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr);
+ inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr);
hash = ipv6_addr_hash(&sk->sk_v6_daddr);
}
}
@@ -461,7 +445,7 @@ void tcp_update_metrics(struct sock *sk)
tcp_metric_set(tm, TCP_METRIC_CWND,
tp->snd_cwnd);
}
- } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ } else if (!tcp_in_slow_start(tp) &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
@@ -796,18 +780,18 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
switch (tm->tcpm_daddr.family) {
case AF_INET:
if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
- tm->tcpm_daddr.addr.a4) < 0)
+ inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0)
goto nla_put_failure;
if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
- tm->tcpm_saddr.addr.a4) < 0)
+ inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0)
goto nla_put_failure;
break;
case AF_INET6:
if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
- &tm->tcpm_daddr.addr.in6) < 0)
+ inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0)
goto nla_put_failure;
if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
- &tm->tcpm_saddr.addr.in6) < 0)
+ inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0)
goto nla_put_failure;
break;
default:
@@ -956,20 +940,21 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
a = info->attrs[v4];
if (a) {
- addr->family = AF_INET;
- addr->addr.a4 = nla_get_in_addr(a);
+ inetpeer_set_addr_v4(addr, nla_get_in_addr(a));
if (hash)
- *hash = (__force unsigned int) addr->addr.a4;
+ *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr));
return 0;
}
a = info->attrs[v6];
if (a) {
+ struct in6_addr in6;
+
if (nla_len(a) != sizeof(struct in6_addr))
return -EINVAL;
- addr->family = AF_INET6;
- addr->addr.in6 = nla_get_in6_addr(a);
+ in6 = nla_get_in6_addr(a);
+ inetpeer_set_addr_v6(addr, &in6);
if (hash)
- *hash = ipv6_addr_hash(&addr->addr.in6);
+ *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr));
return 0;
}
return optional ? 1 : -EAFNOSUPPORT;
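
The tcp_metrics.c hunks above replace the open-coded family/address assignments and comparisons with the inetpeer_set_addr_v4/v6(), inetpeer_get_addr_v4/v6() and inetpeer_addr_cmp() accessors. The equality semantics addr_same() must preserve are small enough to sketch; struct peer_addr below is an illustrative stand-in, not the kernel's struct inetpeer_addr:

    #include <stdbool.h>
    #include <string.h>
    #include <netinet/in.h>

    /* Simplified stand-in for an inetpeer-style address: a family plus
     * either an IPv4 or an IPv6 address (assumed layout, for illustration
     * only).
     */
    struct peer_addr {
        int family;                 /* AF_INET or AF_INET6 */
        union {
            struct in_addr  a4;
            struct in6_addr in6;
        };
    };

    /* Equality check matching the removed open-coded addr_same(): different
     * families never compare equal, otherwise compare the address bytes.
     */
    static bool peer_addr_same(const struct peer_addr *a,
                               const struct peer_addr *b)
    {
        if (a->family != b->family)
            return false;
        if (a->family == AF_INET)
            return a->a4.s_addr == b->a4.s_addr;
        return memcmp(&a->in6, &b->in6, sizeof(a->in6)) == 0;
    }
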
diff --git a/kernel/net/ipv4/tcp_minisocks.c b/kernel/net/ipv4/tcp_minisocks.c
index 17e7339ee..ac6b1961f 100644
--- a/kernel/net/ipv4/tcp_minisocks.c
+++ b/kernel/net/ipv4/tcp_minisocks.c
@@ -147,8 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (!th->fin ||
TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
return TCP_TW_RST;
}
@@ -163,9 +162,9 @@ kill_with_rst:
if (tcp_death_row.sysctl_tw_recycle &&
tcptw->tw_ts_recent_stamp &&
tcp_tw_remember_stamp(tw))
- inet_twsk_schedule(tw, tw->tw_timeout);
+ inet_twsk_reschedule(tw, tw->tw_timeout);
else
- inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
return TCP_TW_ACK;
}
@@ -198,12 +197,11 @@ kill_with_rst:
*/
if (sysctl_tcp_rfc1337 == 0) {
kill:
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
return TCP_TW_SUCCESS;
}
}
- inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
if (tmp_opt.saw_tstamp) {
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
@@ -253,7 +251,7 @@ kill:
* Do not reschedule in the last case.
*/
if (paws_reject || th->ack)
- inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
@@ -324,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
} while (0);
#endif
- /* Linkage updates. */
- __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
-
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
timeo = rto;
@@ -340,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
}
inet_twsk_schedule(tw, timeo);
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
@@ -364,30 +361,38 @@ void tcp_twsk_destructor(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+/* Warning: This function is called without sk_listener being locked.
+ * Be sure to read socket fields once, as their value could change under us.
+ */
void tcp_openreq_init_rwin(struct request_sock *req,
- struct sock *sk, struct dst_entry *dst)
+ const struct sock *sk_listener,
+ const struct dst_entry *dst)
{
struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_sock *tp = tcp_sk(sk);
- __u8 rcv_wscale;
+ const struct tcp_sock *tp = tcp_sk(sk_listener);
+ u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ int full_space = tcp_full_space(sk_listener);
int mss = dst_metric_advmss(dst);
+ u32 window_clamp;
+ __u8 rcv_wscale;
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
- mss = tp->rx_opt.user_mss;
+ if (user_mss && user_mss < mss)
+ mss = user_mss;
+ window_clamp = READ_ONCE(tp->window_clamp);
/* Set this up on the first call only */
- req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
/* limit the window selection if the user enforce a smaller rx buffer */
- if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
- (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
- req->window_clamp = tcp_full_space(sk);
+ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
/* tcp_full_space because it is guaranteed to be the first packet */
- tcp_select_initial_window(tcp_full_space(sk),
+ tcp_select_initial_window(full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
- &req->rcv_wnd,
- &req->window_clamp,
+ &req->rsk_rcv_wnd,
+ &req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
@@ -436,7 +441,9 @@ EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
* Actually, we could do lots of memory writes here. tp of listening
* socket contains all necessary default parameters.
*/
-struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
+struct sock *tcp_create_openreq_child(const struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *skb)
{
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
@@ -451,6 +458,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rcv_wup = newtp->copied_seq =
newtp->rcv_nxt = treq->rcv_isn + 1;
+ newtp->segs_in = 0;
newtp->snd_sml = newtp->snd_una =
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
@@ -462,6 +470,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ newtp->rtt_min[0].rtt = ~0U;
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -471,7 +480,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0;
- newtp->lsndtime = treq->snt_synack;
+ newtp->lsndtime = treq->snt_synack.stamp_jiffies;
+ newsk->sk_txhash = treq->txhash;
newtp->last_oow_ack_time = 0;
newtp->total_retrans = req->num_retrans;
@@ -503,9 +513,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
if (sysctl_tcp_fack)
tcp_enable_fack(newtp);
}
- newtp->window_clamp = req->window_clamp;
- newtp->rcv_ssthresh = req->rcv_wnd;
- newtp->rcv_wnd = req->rcv_wnd;
+ newtp->window_clamp = req->rsk_window_clamp;
+ newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
@@ -538,6 +548,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_rsk = NULL;
newtp->syn_data_acked = 0;
+ newtp->rack.mstamp.v64 = 0;
+ newtp->rack.advanced = 0;
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
@@ -565,8 +577,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
bool paws_reject = false;
-
- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
+ bool own_req;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
@@ -697,7 +708,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* RFC793: "first check sequence number". */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
+ tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_ack(sk, skb, req);
@@ -754,16 +765,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*/
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ req, &own_req);
if (!child)
goto listen_overflow;
- inet_csk_reqsk_queue_drop(sk, req);
- inet_csk_reqsk_queue_add(sk, req, child);
- /* Warning: caller must not call reqsk_put(req);
- * child stole last reference on it.
- */
- return child;
+ sock_rps_save_rxhash(child, skb);
+ tcp_synack_rtt_meas(child, req);
+ return inet_csk_complete_hashdance(sk, child, req, own_req);
listen_overflow:
if (!sysctl_tcp_abort_on_overflow) {
@@ -810,8 +819,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
int state = child->sk_state;
if (!sock_owned_by_user(child)) {
- ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
- skb->len);
+ ret = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
parent->sk_data_ready(parent);
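
tcp_openreq_init_rwin() above now runs on an unlocked listener, so it snapshots tp->rx_opt.user_mss and tp->window_clamp with READ_ONCE() and bases every later decision on those locals. A small sketch of that read-once pattern for the MSS clamp (READ_ONCE() is approximated with a volatile cast; the helper name is hypothetical):

    #include <stdint.h>

    /* Read the shared, unlocked field exactly once and use only the local
     * copy afterwards, so a concurrent writer cannot make two reads within
     * the same call disagree.
     */
    static uint32_t clamp_advertised_mss(const uint32_t *user_mss_field,
                                         uint32_t dst_advmss)
    {
        uint32_t user_mss = *(volatile const uint32_t *)user_mss_field;

        return (user_mss && user_mss < dst_advmss) ? user_mss : dst_advmss;
    }
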
diff --git a/kernel/net/ipv4/tcp_offload.c b/kernel/net/ipv4/tcp_offload.c
index 3f7c2fca5..9864a2dba 100644
--- a/kernel/net/ipv4/tcp_offload.c
+++ b/kernel/net/ipv4/tcp_offload.c
@@ -77,7 +77,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
oldlen = (u16)~skb->len;
__skb_pull(skb, thlen);
- mss = tcp_skb_mss(skb);
+ mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
@@ -242,7 +242,7 @@ found:
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
- mss = tcp_skb_mss(p);
+ mss = skb_shinfo(p)->gso_size;
flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
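
After the tcp_output.c change below moves the transmit-side MSS into TCP_SKB_CB(skb)->tcp_gso_size, the offload paths here read skb_shinfo()->gso_size directly instead of going through tcp_skb_mss(). The segment count implied by a given gso_size is the usual ceiling division, as tcp_set_skb_tso_segs() below also computes; a tiny sketch:

    #include <stdint.h>

    /* Number of wire segments a GSO/TSO packet of payload_len bytes yields
     * for a given gso_size; mirrors the DIV_ROUND_UP() used by
     * tcp_set_skb_tso_segs() in the tcp_output.c hunks below.
     */
    static uint32_t gso_segment_count(uint32_t payload_len, uint32_t gso_size)
    {
        if (payload_len <= gso_size)
            return 1;       /* small packets are not segmented */
        return (payload_len + gso_size - 1) / gso_size;
    }
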
diff --git a/kernel/net/ipv4/tcp_output.c b/kernel/net/ipv4/tcp_output.c
index 986440b24..9bfc39ff2 100644
--- a/kernel/net/ipv4/tcp_output.c
+++ b/kernel/net/ipv4/tcp_output.c
@@ -50,8 +50,8 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
*/
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
-/* Default TSQ limit of two TSO segments */
-int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+/* Default TSQ limit of four TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
@@ -137,12 +137,12 @@ static __u16 tcp_advertise_mss(struct sock *sk)
}
/* RFC2861. Reset CWND after an idle period longer than RTO to "restart window".
- * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
+ * This is the first part of cwnd validation mechanism.
+ */
+void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
struct tcp_sock *tp = tcp_sk(sk);
- s32 delta = tcp_time_stamp - tp->lsndtime;
- u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+ u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
u32 cwnd = tp->snd_cwnd;
tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
@@ -163,20 +163,17 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
- const struct dst_entry *dst = __sk_dst_get(sk);
- if (sysctl_tcp_slow_start_after_idle &&
- (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
- tcp_cwnd_restart(sk, __sk_dst_get(sk));
+ if (tcp_packets_in_flight(tp) == 0)
+ tcp_ca_event(sk, CA_EVENT_TX_START);
tp->lsndtime = now;
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
- (!dst || !dst_metric(dst, RTAX_QUICKACK)))
- icsk->icsk_ack.pingpong = 1;
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ icsk->icsk_ack.pingpong = 1;
}
/* Account for an ACK we sent. */
@@ -350,15 +347,20 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
}
}
+static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
+{
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
+ /* tp->ecn_flags are cleared at a later point in time when
+ * SYN ACK is ultimately being received.
+ */
+ TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
+}
+
static void
-tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
- struct sock *sk)
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
- if (inet_rsk(req)->ecn_ok) {
+ if (inet_rsk(req)->ecn_ok)
th->ece = 1;
- if (tcp_ca_needs_ecn(sk))
- INET_ECN_xmit(sk);
- }
}
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -393,8 +395,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
*/
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
- struct skb_shared_info *shinfo = skb_shinfo(skb);
-
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
@@ -402,8 +402,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
TCP_SKB_CB(skb)->sacked = 0;
tcp_skb_pcount_set(skb, 1);
- shinfo->gso_size = 0;
- shinfo->gso_type = 0;
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -610,12 +608,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
/* Set up TCP options for SYN-ACKs. */
-static unsigned int tcp_synack_options(struct sock *sk,
- struct request_sock *req,
- unsigned int mss, struct sk_buff *skb,
- struct tcp_out_options *opts,
- const struct tcp_md5sig_key *md5,
- struct tcp_fastopen_cookie *foc)
+static unsigned int tcp_synack_options(struct request_sock *req,
+ unsigned int mss, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ const struct tcp_md5sig_key *md5,
+ struct tcp_fastopen_cookie *foc)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -941,9 +938,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
&md5);
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
- if (tcp_packets_in_flight(tp) == 0)
- tcp_ca_event(sk, CA_EVENT_TX_START);
-
/* if no packet is in qdisc/device queue, then allow XPS to select
* another queue. We can be called from tcp_tsq_handler()
* which holds one reference to sk_wmem_alloc.
@@ -994,6 +988,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
}
tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
tcp_ecn_send(sk, skb, tcp_header_size);
@@ -1018,8 +1013,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
- /* OK, its time to fill skb_shinfo(skb)->gso_segs */
+ tp->segs_out += tcp_skb_pcount(skb);
+ /* OK, it's time to fill skb_shinfo(skb)->gso_{segs|size} */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+ skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
/* Our usage of tstamp should remain private */
skb->tstamp.tv64 = 0;
@@ -1056,25 +1053,17 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
}
/* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
- unsigned int mss_now)
+static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
- struct skb_shared_info *shinfo = skb_shinfo(skb);
-
- /* Make sure we own this skb before messing gso_size/gso_segs */
- WARN_ON_ONCE(skb_cloned(skb));
-
if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
tcp_skb_pcount_set(skb, 1);
- shinfo->gso_size = 0;
- shinfo->gso_type = 0;
+ TCP_SKB_CB(skb)->tcp_gso_size = 0;
} else {
tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
- shinfo->gso_size = mss_now;
- shinfo->gso_type = sk->sk_gso_type;
+ TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
}
}
@@ -1163,7 +1152,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
return -ENOMEM;
/* Get a new skb... force flag on. */
- buff = sk_stream_alloc_skb(sk, nsize, gfp);
+ buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
if (!buff)
return -ENOMEM; /* We'll just try again later. */
@@ -1206,8 +1195,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
old_factor = tcp_skb_pcount(skb);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- tcp_set_skb_tso_segs(sk, buff, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
+ tcp_set_skb_tso_segs(buff, mss_now);
/* If this packet has been sent out already, we must
* adjust the various packet counters.
@@ -1287,7 +1276,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
/* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
+ tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
return 0;
}
@@ -1619,13 +1608,12 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
- unsigned int mss_now)
+static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
- tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
return tso_segs;
@@ -1680,7 +1668,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
const struct tcp_sock *tp = tcp_sk(sk);
unsigned int cwnd_quota;
- tcp_init_tso_segs(sk, skb, cur_mss);
+ tcp_init_tso_segs(skb, cur_mss);
if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
return 0;
@@ -1722,7 +1710,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (skb->len != skb->data_len)
return tcp_fragment(sk, skb, len, mss_now, gfp);
- buff = sk_stream_alloc_skb(sk, 0, gfp);
+ buff = sk_stream_alloc_skb(sk, 0, gfp, true);
if (unlikely(!buff))
return -ENOMEM;
@@ -1749,8 +1737,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
tcp_fragment_tstamp(skb, buff);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- tcp_set_skb_tso_segs(sk, buff, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
+ tcp_set_skb_tso_segs(buff, mss_now);
/* Link BUFF into the send queue. */
__skb_header_release(buff);
@@ -1777,7 +1765,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto send_now;
- if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
+ if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
/* Avoid bursty behavior by allowing defer
@@ -1834,7 +1822,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
/* Ok, it looks like it is advisable to defer. */
- if (cong_win < send_win && cong_win < skb->len)
+ if (cong_win < send_win && cong_win <= skb->len)
*is_cwnd_limited = true;
return true;
@@ -1941,7 +1929,7 @@ static int tcp_mtu_probe(struct sock *sk)
}
/* We're allowed to probe. Build it now. */
- nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
+ nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
if (!nskb)
return -1;
sk->sk_wmem_queued += nskb->truesize;
@@ -1984,7 +1972,7 @@ static int tcp_mtu_probe(struct sock *sk)
skb->len, 0);
} else {
__pskb_trim_head(skb, copy);
- tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
}
TCP_SKB_CB(skb)->seq += copy;
}
@@ -1994,7 +1982,7 @@ static int tcp_mtu_probe(struct sock *sk)
if (len >= probe_size)
break;
}
- tcp_init_tso_segs(sk, nskb, nskb->len);
+ tcp_init_tso_segs(nskb, nskb->len);
/* We're ready to send. If this fails, the probe will
* be resegmented into mss-sized pieces by tcp_write_xmit().
@@ -2056,7 +2044,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
- tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+ tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
@@ -2067,7 +2055,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
- is_cwnd_limited = true;
if (push_one == 2)
/* Force out a loss probe pkt. */
cwnd_quota = 1;
@@ -2078,7 +2065,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
- if (tso_segs == 1 || !max_segs) {
+ if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
@@ -2091,7 +2078,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
limit = mss_now;
- if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp))
+ if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
@@ -2149,10 +2136,11 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk);
+ is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
- return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+ return !tp->packets_out && tcp_send_head(sk);
}
bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2172,7 +2160,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
- if (sk->sk_state == TCP_SYN_RECV)
+ if (tp->fastopen_rsk)
return false;
/* TLP is only scheduled when next timer event is RTO. */
@@ -2182,7 +2170,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
+ if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
!tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
return false;
@@ -2191,9 +2179,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
return false;
/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
- * for delayed ack when there's one outstanding packet.
+ * for delayed ack when there's one outstanding packet. If no RTT
+ * sample is available then probe after TCP_TIMEOUT_INIT.
*/
- timeout = rtt << 1;
+ timeout = rtt << 1 ? : TCP_TIMEOUT_INIT;
if (tp->packets_out == 1)
timeout = max_t(u32, timeout,
(rtt + (rtt >> 1) + TCP_DELACK_MAX));
@@ -2229,7 +2218,7 @@ static bool skb_still_in_host_queue(const struct sock *sk,
return false;
}
-/* When probe timeout (PTO) fires, send a new segment if one exists, else
+/* When probe timeout (PTO) fires, try to send a new segment if possible, else
* retransmit the last segment.
*/
void tcp_send_loss_probe(struct sock *sk)
@@ -2238,11 +2227,19 @@ void tcp_send_loss_probe(struct sock *sk)
struct sk_buff *skb;
int pcount;
int mss = tcp_current_mss(sk);
- int err = -1;
- if (tcp_send_head(sk)) {
- err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
- goto rearm_timer;
+ skb = tcp_send_head(sk);
+ if (skb) {
+ if (tcp_snd_wnd_test(tp, skb, mss)) {
+ pcount = tp->packets_out;
+ tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ if (tp->packets_out > pcount)
+ goto probe_sent;
+ goto rearm_timer;
+ }
+ skb = tcp_write_queue_prev(sk, skb);
+ } else {
+ skb = tcp_write_queue_tail(sk);
}
/* At most one outstanding TLP retransmission. */
@@ -2250,7 +2247,6 @@ void tcp_send_loss_probe(struct sock *sk)
goto rearm_timer;
/* Retransmit last segment. */
- skb = tcp_write_queue_tail(sk);
if (WARN_ON(!skb))
goto rearm_timer;
@@ -2265,26 +2261,24 @@ void tcp_send_loss_probe(struct sock *sk)
if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
GFP_ATOMIC)))
goto rearm_timer;
- skb = tcp_write_queue_tail(sk);
+ skb = tcp_write_queue_next(sk, skb);
}
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
- err = __tcp_retransmit_skb(sk, skb);
+ if (__tcp_retransmit_skb(sk, skb))
+ goto rearm_timer;
/* Record snd_nxt for loss detection. */
- if (likely(!err))
- tp->tlp_high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = tp->snd_nxt;
+probe_sent:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+ /* Reset s.t. tcp_rearm_rto will restart timer from now */
+ inet_csk(sk)->icsk_pending = 0;
rearm_timer:
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
-
- if (likely(!err))
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPLOSSPROBES);
+ tcp_rearm_rto(sk);
}
/* Push out any pending frames which were held back due to
@@ -2392,7 +2386,7 @@ u32 __tcp_select_window(struct sock *sk)
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss);
@@ -2610,11 +2604,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (unlikely(oldpcount > 1)) {
if (skb_unclone(skb, GFP_ATOMIC))
return -ENOMEM;
- tcp_init_tso_segs(sk, skb, cur_mss);
+ tcp_init_tso_segs(skb, cur_mss);
tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
}
}
+ /* RFC3168, section 6.1.1.1. ECN fallback */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
+ tcp_ecn_clear_syn(sk, skb);
+
tcp_retrans_try_collapse(sk, skb, cur_mss);
/* Make a copy, if the first transmission SKB clone we made
@@ -2657,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
- if (!tp->retrans_out)
- tp->lost_retrans_low = tp->snd_nxt;
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
@@ -2666,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
- /* snd_nxt is stored to detect loss of retransmitted segment,
- * see tcp_input.c tcp_sacktag_write_queue().
- */
- TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
} else if (err != -EBUSY) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
@@ -2816,8 +2808,10 @@ begin_fwd:
* connection tear down and (memory) recovery.
* Otherwise tcp_send_fin() could be tempted to either delay FIN
* or even be forced to close flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge trigger epoll()
*/
-static void sk_forced_wmem_schedule(struct sock *sk, int size)
+void sk_forced_mem_schedule(struct sock *sk, int size)
{
int amt, status;
@@ -2841,7 +2835,7 @@ void tcp_send_fin(struct sock *sk)
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
- if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+ if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
coalesce:
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
@@ -2864,7 +2858,7 @@ coalesce:
return;
}
skb_reserve(skb, MAX_TCP_HEADER);
- sk_forced_wmem_schedule(sk, skb->truesize);
+ sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
@@ -2945,20 +2939,22 @@ int tcp_send_synack(struct sock *sk)
* Allocate one skb and build a SYNACK packet.
* @dst is consumed : Caller should not use it again.
*/
-struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ bool attach_req)
{
- struct tcp_out_options opts;
struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcphdr *th;
- struct sk_buff *skb;
+ const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *md5 = NULL;
+ struct tcp_out_options opts;
+ struct sk_buff *skb;
int tcp_header_size;
+ struct tcphdr *th;
+ u16 user_mss;
int mss;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
@@ -2966,11 +2962,21 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
+ if (attach_req) {
+ skb_set_owner_w(skb, req_to_sk(req));
+ } else {
+ /* sk is a const pointer, because we want to express that multiple
+ * CPUs might call us concurrently.
+ * sk->sk_wmem_alloc is an atomic, so we can promote it to rw.
+ */
+ skb_set_owner_w(skb, (struct sock *)sk);
+ }
skb_dst_set(skb, dst);
mss = dst_metric_advmss(dst);
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
- mss = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss && user_mss < mss)
+ mss = user_mss;
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
@@ -2984,8 +2990,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
rcu_read_lock();
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
- tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
- foc) + sizeof(*th);
+ skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+ tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
+ sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -2994,7 +3001,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- tcp_ecn_make_synack(req, th, sk);
+ tcp_ecn_make_synack(req, th);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
@@ -3008,8 +3015,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
- th->window = htons(min(req->rcv_wnd, 65535U));
- tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ th->window = htons(min(req->rsk_rcv_wnd, 65535U));
+ tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
@@ -3143,7 +3150,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0, copied;
+ int syn_loss = 0, space, err = 0;
unsigned long last_syn_loss = 0;
struct sk_buff *syn_data;
@@ -3176,22 +3183,23 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
/* limit to order-0 allocations */
space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
- syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+ syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
if (!syn_data)
goto fallback;
syn_data->ip_summed = CHECKSUM_PARTIAL;
memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
- copied = copy_from_iter(skb_put(syn_data, space), space,
- &fo->data->msg_iter);
- if (unlikely(!copied)) {
- kfree_skb(syn_data);
- goto fallback;
- }
- if (copied != space) {
- skb_trim(syn_data, copied);
- space = copied;
+ if (space) {
+ int copied = copy_from_iter(skb_put(syn_data, space), space,
+ &fo->data->msg_iter);
+ if (unlikely(!copied)) {
+ kfree_skb(syn_data);
+ goto fallback;
+ }
+ if (copied != space) {
+ skb_trim(syn_data, copied);
+ space = copied;
+ }
}
-
/* No more data pending in inet_wait_for_connect() */
if (space == fo->size)
fo->data = NULL;
@@ -3242,7 +3250,7 @@ int tcp_connect(struct sock *sk)
return 0;
}
- buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
if (unlikely(!buff))
return -ENOBUFS;
@@ -3383,7 +3391,7 @@ EXPORT_SYMBOL_GPL(tcp_send_ack);
* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
* out-of-date with SND.UNA-1 to probe window.
*/
-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -3401,6 +3409,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
*/
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
skb_mstamp_get(&skb->skb_mstamp);
+ NET_INC_STATS(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
@@ -3408,12 +3417,12 @@ void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
- tcp_xmit_probe_skb(sk, 0);
+ tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
}
}
/* Initiate keepalive or window probe from timer. */
-int tcp_write_wakeup(struct sock *sk)
+int tcp_write_wakeup(struct sock *sk, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -3441,7 +3450,7 @@ int tcp_write_wakeup(struct sock *sk)
if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
return -1;
} else if (!tcp_skb_pcount(skb))
- tcp_set_skb_tso_segs(sk, skb, mss);
+ tcp_set_skb_tso_segs(skb, mss);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
@@ -3450,8 +3459,8 @@ int tcp_write_wakeup(struct sock *sk)
return err;
} else {
if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
- tcp_xmit_probe_skb(sk, 1);
- return tcp_xmit_probe_skb(sk, 0);
+ tcp_xmit_probe_skb(sk, 1, mib);
+ return tcp_xmit_probe_skb(sk, 0, mib);
}
}
@@ -3465,7 +3474,7 @@ void tcp_send_probe0(struct sock *sk)
unsigned long probe_max;
int err;
- err = tcp_write_wakeup(sk);
+ err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
if (tp->packets_out || !tcp_send_head(sk)) {
/* Cancel probe timer, if it is not required. */
@@ -3491,17 +3500,18 @@ void tcp_send_probe0(struct sock *sk)
probe_max = TCP_RESOURCE_PROBE_INTERVAL;
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- inet_csk_rto_backoff(icsk, probe_max),
+ tcp_probe0_when(sk, probe_max),
TCP_RTO_MAX);
}
-int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
{
const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
struct flowi fl;
int res;
- res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+ tcp_rsk(req)->txhash = net_tx_rndhash();
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
if (!res) {
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
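Note on the tcp_make_synack() hunk above: the comment in the hunk restates the RFC 1323 rule that the window field of a SYN or SYN-ACK is never scaled, so whatever receive window the request socket holds must be clamped to the 16-bit maximum before it goes on the wire; the patch only changes where that value comes from (req->rsk_rcv_wnd). A minimal user-space sketch of the clamp, using made-up values rather than kernel state:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: the raw TCP header window field is 16 bits wide and
     * window scaling never applies to SYN/SYN-ACK segments, so the request
     * socket's receive window is capped at 65535 when the SYN-ACK is built.
     */
    uint16_t synack_window(uint32_t rsk_rcv_wnd)
    {
            return rsk_rcv_wnd < 65535U ? (uint16_t)rsk_rcv_wnd : 65535U;
    }

    int main(void)
    {
            printf("%u\n", synack_window(1U << 20)); /* prints 65535 */
            printf("%u\n", synack_window(29200));    /* prints 29200 */
            return 0;
    }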
diff --git a/kernel/net/ipv4/tcp_recovery.c b/kernel/net/ipv4/tcp_recovery.c
new file mode 100644
index 000000000..5353085fd
--- /dev/null
+++ b/kernel/net/ipv4/tcp_recovery.c
@@ -0,0 +1,109 @@
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+
+/* Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * The current version is only used after recovery starts but can be
+ * easily extended to detect the first loss.
+ */
+int tcp_rack_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ u32 reo_wnd, prior_retrans = tp->retrans_out;
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+ return 0;
+
+ /* Reset the advanced flag to avoid unnecessary queue scanning */
+ tp->rack.advanced = 0;
+
+ /* To be more reordering resilient, allow min_rtt/4 settling delay
+ * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
+ * RTT because reordering is often a path property and less related
+ * to queuing or delayed ACKs.
+ *
+ * TODO: measure and adapt to the observed reordering delay, and
+ * use a timer to retransmit like the delayed early retransmit.
+ */
+ reo_wnd = 1000;
+ if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+ reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+
+ tcp_for_write_queue(skb, sk) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+ if (skb == tcp_send_head(sk))
+ break;
+
+ /* Skip ones already (s)acked */
+ if (!after(scb->end_seq, tp->snd_una) ||
+ scb->sacked & TCPCB_SACKED_ACKED)
+ continue;
+
+ if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+
+ if (skb_mstamp_us_delta(&tp->rack.mstamp,
+ &skb->skb_mstamp) <= reo_wnd)
+ continue;
+
+ /* skb is lost if packet sent later is sacked */
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ if (scb->sacked & TCPCB_SACKED_RETRANS) {
+ scb->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSTRETRANSMIT);
+ }
+ } else if (!(scb->sacked & TCPCB_RETRANS)) {
+ /* Original data are sent sequentially so stop early
+ * b/c the rest are all sent after rack_sent
+ */
+ break;
+ }
+ }
+ return prior_retrans - tp->retrans_out;
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets */
+void tcp_rack_advance(struct tcp_sock *tp,
+ const struct skb_mstamp *xmit_time, u8 sacked)
+{
+ if (tp->rack.mstamp.v64 &&
+ !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+ return;
+
+ if (sacked & TCPCB_RETRANS) {
+ struct skb_mstamp now;
+
+ /* If the sacked packet was retransmitted, it's ambiguous
+ * whether the retransmission or the original (or the prior
+ * retransmission) was sacked.
+ *
+ * If the original is lost, there is no ambiguity. Otherwise
+ * we assume the original can be delayed up to aRTT + min_rtt.
+ * the aRTT term is bounded by the fast recovery or timeout,
+ * so it's at least one RTT (i.e., retransmission is at least
+ * an RTT later).
+ */
+ skb_mstamp_get(&now);
+ if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+ return;
+ }
+
+ tp->rack.mstamp = *xmit_time;
+ tp->rack.advanced = 1;
+}
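The comment block at the top of the new tcp_recovery.c describes the RACK rule: instead of counting duplicate ACKs (dupthresh) or comparing sequence numbers (FACK), a packet still in flight is declared lost once some packet sent later has been (s)acked and the gap between their send times exceeds a settling window, reo_wnd, of at least 1000 us and at most min_rtt/4 (the wider value is only used once reordering has actually been observed via tp->rack.reord). A user-space sketch of that rule with plain microsecond timestamps; the names and types are illustrative, not the kernel's skb_mstamp machinery, and the reord flag is ignored here:

    #include <stdbool.h>
    #include <stdint.h>

    /* Settling window: lower-bounded to 1000 us, widened to min_rtt/4 when
     * that is larger (the kernel only widens it after seeing reordering).
     */
    uint64_t rack_reo_wnd_us(uint64_t min_rtt_us)
    {
            uint64_t reo_wnd = 1000;

            if (min_rtt_us / 4 > reo_wnd)
                    reo_wnd = min_rtt_us / 4;
            return reo_wnd;
    }

    /* A packet is RACK-lost if a packet sent later was already (s)acked and
     * the send-time gap exceeds the settling window.
     */
    bool rack_is_lost(uint64_t pkt_sent_us,
                      uint64_t latest_delivered_sent_us,
                      uint64_t min_rtt_us)
    {
            if (latest_delivered_sent_us <= pkt_sent_us)
                    return false;   /* nothing sent later has been delivered */
            return latest_delivered_sent_us - pkt_sent_us >
                   rack_reo_wnd_us(min_rtt_us);
    }

The second half of the file (tcp_rack_advance) guards against the retransmission ambiguity spelled out in its comment: a sacked retransmitted packet only advances the RACK timestamp if enough time (at least min_rtt) has passed since it was sent, so the ACK cannot plausibly belong to the earlier transmission.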
diff --git a/kernel/net/ipv4/tcp_scalable.c b/kernel/net/ipv4/tcp_scalable.c
index 333bcb241..bf5ea9e9b 100644
--- a/kernel/net/ipv4/tcp_scalable.c
+++ b/kernel/net/ipv4/tcp_scalable.c
@@ -22,7 +22,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
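Here, and in the Vegas and Veno hunks further down, the open-coded comparison of snd_cwnd against snd_ssthresh is replaced by the tcp_in_slow_start() helper. A rough, non-authoritative sketch of the dispatch this expresses, with toy types standing in for struct tcp_sock and an assumed strict comparison:

    #include <stdbool.h>
    #include <stdint.h>

    /* Toy model, not the kernel definition: slow start applies while the
     * congestion window is still below the slow-start threshold; otherwise
     * the module runs its additive-increase (congestion avoidance) logic.
     */
    struct toy_tcp_sock {
            uint32_t snd_cwnd;
            uint32_t snd_ssthresh;
    };

    bool toy_in_slow_start(const struct toy_tcp_sock *tp)
    {
            return tp->snd_cwnd < tp->snd_ssthresh;   /* assumed semantics */
    }

    void toy_cong_avoid(struct toy_tcp_sock *tp, uint32_t acked)
    {
            if (toy_in_slow_start(tp))
                    tp->snd_cwnd += acked;   /* exponential growth phase */
            else
                    tp->snd_cwnd += 1;       /* linear growth phase */
    }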
diff --git a/kernel/net/ipv4/tcp_timer.c b/kernel/net/ipv4/tcp_timer.c
index 8c65dc147..193ba1fa8 100644
--- a/kernel/net/ipv4/tcp_timer.c
+++ b/kernel/net/ipv4/tcp_timer.c
@@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
}
/* Calculate maximal number or retries on an orphaned socket. */
-static int tcp_orphan_retries(struct sock *sk, int alive)
+static int tcp_orphan_retries(struct sock *sk, bool alive)
{
int retries = sysctl_tcp_orphan_retries; /* May be zero. */
@@ -168,7 +168,7 @@ static int tcp_write_timeout(struct sock *sk)
dst_negative_advice(sk);
if (tp->syn_fastopen || tp->syn_data)
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
- if (tp->syn_data)
+ if (tp->syn_data && icsk->icsk_retransmits == 1)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
@@ -176,6 +176,18 @@ static int tcp_write_timeout(struct sock *sk)
syn_set = true;
} else {
if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+ /* Some middle-boxes may black-hole Fast Open _after_
+ * the handshake. Therefore we conservatively disable
+ * Fast Open on this path on recurring timeouts with
+ * few or zero bytes acked after Fast Open.
+ */
+ if (tp->syn_data_acked &&
+ tp->bytes_acked <= tp->rx_opt.mss_clamp) {
+ tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
+ if (icsk->icsk_retransmits == sysctl_tcp_retries1)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -184,7 +196,7 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = icsk->icsk_rto < TCP_RTO_MAX;
+ const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
@@ -247,7 +259,7 @@ void tcp_delack_timer_handler(struct sock *sk)
}
out:
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
sk_mem_reclaim(sk);
}
@@ -298,7 +310,7 @@ static void tcp_probe_timer(struct sock *sk)
max_probes = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+ const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
max_probes = tcp_orphan_retries(sk, alive);
if (!alive && icsk->icsk_backoff >= max_probes)
@@ -616,7 +628,7 @@ static void tcp_keepalive_timer (unsigned long data)
tcp_write_err(sk);
goto out;
}
- if (tcp_write_wakeup(sk) <= 0) {
+ if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp);
} else {
@@ -649,4 +661,3 @@ void tcp_init_xmit_timers(struct sock *sk)
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
}
-EXPORT_SYMBOL(tcp_init_xmit_timers);
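The tcp_write_timeout() hunk above adds the defensive heuristic its comment describes: if retransmissions keep timing out on a connection that used Fast Open, and essentially nothing beyond the SYN data was ever acknowledged, the path is treated as one where a middle-box black-holes data-carrying SYNs, and Fast Open is disabled for that destination. A condensed restatement of the trigger condition, with placeholder field names mirroring the ones in the hunk:

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative only: after sysctl_tcp_retries1 timeouts, if the peer
     * acked the SYN data but total bytes acked never exceeded one MSS,
     * stop offering Fast Open to this destination.
     */
    struct toy_conn {
            bool syn_data_acked;    /* SYN carried data and it was acked */
            uint64_t bytes_acked;   /* bytes acked over the connection */
            uint32_t mss_clamp;     /* negotiated MSS upper bound */
    };

    bool fastopen_looks_blackholed(const struct toy_conn *c)
    {
            return c->syn_data_acked && c->bytes_acked <= c->mss_clamp;
    }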
diff --git a/kernel/net/ipv4/tcp_vegas.c b/kernel/net/ipv4/tcp_vegas.c
index a6cea1d5e..13951c408 100644
--- a/kernel/net/ipv4/tcp_vegas.c
+++ b/kernel/net/ipv4/tcp_vegas.c
@@ -225,7 +225,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
*/
diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
- if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (diff > gamma && tcp_in_slow_start(tp)) {
/* Going too fast. Time to slow down
* and switch to congestion avoidance.
*/
@@ -240,7 +240,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
- } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ } else if (tcp_in_slow_start(tp)) {
/* Slow start. */
tcp_slow_start(tp, acked);
} else {
@@ -281,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
vegas->minRTT = 0x7fffffff;
}
/* Use normal slow start */
- else if (tp->snd_cwnd <= tp->snd_ssthresh)
+ else if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
}
diff --git a/kernel/net/ipv4/tcp_veno.c b/kernel/net/ipv4/tcp_veno.c
index 112151eee..0d094b995 100644
--- a/kernel/net/ipv4/tcp_veno.c
+++ b/kernel/net/ipv4/tcp_veno.c
@@ -150,7 +150,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
/* Slow start. */
tcp_slow_start(tp, acked);
} else {
diff --git a/kernel/net/ipv4/tcp_yeah.c b/kernel/net/ipv4/tcp_yeah.c
index 17d356629..3e6a472e6 100644
--- a/kernel/net/ipv4/tcp_yeah.c
+++ b/kernel/net/ipv4/tcp_yeah.c
@@ -219,7 +219,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
- return tp->snd_cwnd - reduction;
+ return max_t(int, tp->snd_cwnd - reduction, 2);
}
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
diff --git a/kernel/net/ipv4/udp.c b/kernel/net/ipv4/udp.c
index 1b8c5ba7d..7f8ab46ad 100644
--- a/kernel/net/ipv4/udp.c
+++ b/kernel/net/ipv4/udp.c
@@ -100,7 +100,6 @@
#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
-#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
@@ -375,7 +374,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
-
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
return score;
}
@@ -419,6 +419,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
+
return score;
}
@@ -963,8 +966,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc,
sk->sk_family == AF_INET6);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
return err;
+ }
if (ipc.opt)
free = 1;
connected = 0;
@@ -1013,13 +1018,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!rt) {
struct net *net = sock_net(sk);
+ __u8 flow_flags = inet_sk_flowi_flags(sk);
fl4 = &fl4_stack;
+
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk),
+ flow_flags,
faddr, saddr, dport, inet->inet_sport);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, fl4);
+ if (err < 0)
+ goto out;
+ }
+
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
diff --git a/kernel/net/ipv4/udp_diag.c b/kernel/net/ipv4/udp_diag.c
index b763c39ae..6116604bf 100644
--- a/kernel/net/ipv4/udp_diag.c
+++ b/kernel/net/ipv4/udp_diag.c
@@ -170,6 +170,7 @@ static const struct inet_diag_handler udp_diag_handler = {
.dump_one = udp_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDP,
+ .idiag_info_size = 0,
};
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
@@ -190,6 +191,7 @@ static const struct inet_diag_handler udplite_diag_handler = {
.dump_one = udplite_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDPLITE,
+ .idiag_info_size = 0,
};
static int __init udp_diag_init(void)
diff --git a/kernel/net/ipv4/udp_tunnel.c b/kernel/net/ipv4/udp_tunnel.c
index 6bb98cc19..aba428626 100644
--- a/kernel/net/ipv4/udp_tunnel.c
+++ b/kernel/net/ipv4/udp_tunnel.c
@@ -4,9 +4,10 @@
#include <linux/udp.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <net/dst_metadata.h>
+#include <net/net_namespace.h>
#include <net/udp.h>
#include <net/udp_tunnel.h>
-#include <net/net_namespace.h>
int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
struct socket **sockp)
@@ -15,12 +16,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
struct socket *sock = NULL;
struct sockaddr_in udp_addr;
- err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
+ err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock);
if (err < 0)
goto error;
- sk_change_net(sock->sk, net);
-
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
@@ -47,7 +46,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
error:
if (sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sock->sk);
+ sock_release(sock);
}
*sockp = NULL;
return err;
@@ -101,8 +100,30 @@ void udp_tunnel_sock_release(struct socket *sock)
{
rcu_assign_sk_user_data(sock->sk, NULL);
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sock->sk);
+ sock_release(sock);
}
EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
+struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
+ __be16 flags, __be64 tunnel_id, int md_size)
+{
+ struct metadata_dst *tun_dst;
+ struct ip_tunnel_info *info;
+
+ if (family == AF_INET)
+ tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size);
+ else
+ tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size);
+ if (!tun_dst)
+ return NULL;
+
+ info = &tun_dst->u.tun_info;
+ info->key.tp_src = udp_hdr(skb)->source;
+ info->key.tp_dst = udp_hdr(skb)->dest;
+ if (udp_hdr(skb)->check)
+ info->key.tun_flags |= TUNNEL_CSUM;
+ return tun_dst;
+}
+EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
+
MODULE_LICENSE("GPL");
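The new udp_tun_rx_dst() helper above fills the tunnel metadata from the outer UDP header: the transport ports become tp_src/tp_dst in the tunnel key, and a non-zero UDP checksum field sets the TUNNEL_CSUM flag. A plain-C restatement of that extraction, using toy structs rather than the kernel's sk_buff and ip_tunnel_info types (and ignoring byte order):

    #include <stdbool.h>
    #include <stdint.h>

    struct toy_udphdr { uint16_t source, dest, check; };

    struct toy_tun_key {
            uint16_t tp_src, tp_dst;
            bool csum;
    };

    /* Record the outer UDP ports and whether a checksum was present. */
    void fill_tun_key(struct toy_tun_key *key, const struct toy_udphdr *uh)
    {
            key->tp_src = uh->source;      /* outer UDP source port */
            key->tp_dst = uh->dest;        /* outer UDP destination port */
            key->csum = uh->check != 0;    /* TUNNEL_CSUM in the patch */
    }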
diff --git a/kernel/net/ipv4/xfrm4_input.c b/kernel/net/ipv4/xfrm4_input.c
index 60b032f58..62e1e72db 100644
--- a/kernel/net/ipv4/xfrm4_input.c
+++ b/kernel/net/ipv4/xfrm4_input.c
@@ -22,7 +22,8 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
return xfrm4_extract_header(skb);
}
-static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb)
+static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
if (!skb_dst(skb)) {
const struct iphdr *iph = ip_hdr(skb);
@@ -52,8 +53,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
- skb->dev, NULL,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
xfrm4_rcv_encap_finish);
return 0;
}
diff --git a/kernel/net/ipv4/xfrm4_output.c b/kernel/net/ipv4/xfrm4_output.c
index 2878dbfff..7ee6518af 100644
--- a/kernel/net/ipv4/xfrm4_output.c
+++ b/kernel/net/ipv4/xfrm4_output.c
@@ -30,6 +30,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
mtu = dst_mtu(skb_dst(skb));
if (skb->len > mtu) {
+ skb->protocol = htons(ETH_P_IP);
+
if (skb->sk)
xfrm_local_error(skb, mtu);
else
@@ -80,24 +82,25 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
return xfrm_output(sk, skb);
}
-static int __xfrm4_output(struct sock *sk, struct sk_buff *skb)
+static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct xfrm_state *x = skb_dst(skb)->xfrm;
#ifdef CONFIG_NETFILTER
if (!x) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
return x->outer_mode->afinfo->output_finish(sk, skb);
}
-int xfrm4_output(struct sock *sk, struct sk_buff *skb)
+int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
- NULL, skb_dst(skb)->dev, __xfrm4_output,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb_dst(skb)->dev,
+ __xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/kernel/net/ipv4/xfrm4_policy.c b/kernel/net/ipv4/xfrm4_policy.c
index bff69746e..7b0edb37a 100644
--- a/kernel/net/ipv4/xfrm4_policy.c
+++ b/kernel/net/ipv4/xfrm4_policy.c
@@ -15,11 +15,12 @@
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#include <net/l3mdev.h>
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
- int tos,
+ int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr)
{
@@ -28,9 +29,12 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
memset(fl4, 0, sizeof(*fl4));
fl4->daddr = daddr->a4;
fl4->flowi4_tos = tos;
+ fl4->flowi4_oif = oif;
if (saddr)
fl4->saddr = saddr->a4;
+ fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF;
+
rt = __ip_route_output_key(net, fl4);
if (!IS_ERR(rt))
return &rt->dst;
@@ -38,22 +42,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
return ERR_CAST(rt);
}
-static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
+static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr)
{
struct flowi4 fl4;
- return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
+ return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr);
}
-static int xfrm4_get_saddr(struct net *net,
+static int xfrm4_get_saddr(struct net *net, int oif,
xfrm_address_t *saddr, xfrm_address_t *daddr)
{
struct dst_entry *dst;
struct flowi4 fl4;
- dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);
+ dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr);
if (IS_ERR(dst))
return -EHOSTUNREACH;
@@ -93,6 +97,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_gateway = rt->rt_gateway;
xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+ xdst->u.rt.rt_table_id = rt->rt_table_id;
INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
return 0;
@@ -107,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
int oif = 0;
if (skb_dst(skb))
- oif = skb_dst(skb)->dev->ifindex;
+ oif = l3mdev_fib_oif(skb_dst(skb)->dev);
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
@@ -122,7 +127,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
case IPPROTO_DCCP:
if (xprth + 4 < skb->data ||
pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ports = (__be16 *)xprth;
+ __be16 *ports;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ports = (__be16 *)xprth;
fl4->fl4_sport = ports[!!reverse];
fl4->fl4_dport = ports[!reverse];
@@ -130,8 +138,12 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
break;
case IPPROTO_ICMP:
- if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
- u8 *icmp = xprth;
+ if (xprth + 2 < skb->data ||
+ pskb_may_pull(skb, xprth + 2 - skb->data)) {
+ u8 *icmp;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ icmp = xprth;
fl4->fl4_icmp_type = icmp[0];
fl4->fl4_icmp_code = icmp[1];
@@ -139,33 +151,50 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
break;
case IPPROTO_ESP:
- if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be32 *ehdr = (__be32 *)xprth;
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be32 *ehdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ehdr = (__be32 *)xprth;
fl4->fl4_ipsec_spi = ehdr[0];
}
break;
case IPPROTO_AH:
- if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
- __be32 *ah_hdr = (__be32 *)xprth;
+ if (xprth + 8 < skb->data ||
+ pskb_may_pull(skb, xprth + 8 - skb->data)) {
+ __be32 *ah_hdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ah_hdr = (__be32 *)xprth;
fl4->fl4_ipsec_spi = ah_hdr[1];
}
break;
case IPPROTO_COMP:
- if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ipcomp_hdr = (__be16 *)xprth;
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be16 *ipcomp_hdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ipcomp_hdr = (__be16 *)xprth;
fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
}
break;
case IPPROTO_GRE:
- if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
- __be16 *greflags = (__be16 *)xprth;
- __be32 *gre_hdr = (__be32 *)xprth;
+ if (xprth + 12 < skb->data ||
+ pskb_may_pull(skb, xprth + 12 - skb->data)) {
+ __be16 *greflags;
+ __be32 *gre_hdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ greflags = (__be16 *)xprth;
+ gre_hdr = (__be32 *)xprth;
if (greflags[0] & GRE_KEY) {
if (greflags[0] & GRE_CSUM)
@@ -230,7 +259,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
xfrm_dst_ifdown(dst, dev);
}
-static struct dst_ops xfrm4_dst_ops = {
+static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
@@ -239,12 +268,12 @@ static struct dst_ops xfrm4_dst_ops = {
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = 32768,
+ .gc_thresh = INT_MAX,
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.family = AF_INET,
- .dst_ops = &xfrm4_dst_ops,
+ .dst_ops = &xfrm4_dst_ops_template,
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
.decode_session = _decode_session4,
@@ -266,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = {
{ }
};
-static int __net_init xfrm4_net_init(struct net *net)
+static int __net_init xfrm4_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -294,7 +323,7 @@ err_alloc:
return -ENOMEM;
}
-static void __net_exit xfrm4_net_exit(struct net *net)
+static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
@@ -306,12 +335,44 @@ static void __net_exit xfrm4_net_exit(struct net *net)
if (!net_eq(net, &init_net))
kfree(table);
}
+#else /* CONFIG_SYSCTL */
+static int inline xfrm4_net_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void inline xfrm4_net_sysctl_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm4_net_init(struct net *net)
+{
+ int ret;
+
+ memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template,
+ sizeof(xfrm4_dst_ops_template));
+ ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops);
+ if (ret)
+ return ret;
+
+ ret = xfrm4_net_sysctl_init(net);
+ if (ret)
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+
+ return ret;
+}
+
+static void __net_exit xfrm4_net_exit(struct net *net)
+{
+ xfrm4_net_sysctl_exit(net);
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+}
static struct pernet_operations __net_initdata xfrm4_net_ops = {
.init = xfrm4_net_init,
.exit = xfrm4_net_exit,
};
-#endif
static void __init xfrm4_policy_init(void)
{
@@ -320,13 +381,9 @@ static void __init xfrm4_policy_init(void)
void __init xfrm4_init(void)
{
- dst_entries_init(&xfrm4_dst_ops);
-
xfrm4_state_init();
xfrm4_policy_init();
xfrm4_protocol_init();
-#ifdef CONFIG_SYSCTL
register_pernet_subsys(&xfrm4_net_ops);
-#endif
}
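The xfrm4_policy.c changes turn the single global xfrm4_dst_ops into a template that each network namespace copies during pernet init, pairing dst_entries_init() with dst_entries_destroy() on the exit path and rolling back if the sysctl registration fails. The same init/rollback shape, reduced to a generic sketch; setup_a/setup_b and their teardown counterparts are placeholders, not kernel APIs:

    #include <errno.h>

    int setup_a(void)     { return 0; }        /* dst_entries_init() role */
    void teardown_a(void) { }                  /* dst_entries_destroy() role */
    int setup_b(void)     { return -ENOMEM; }  /* sysctl registration role */
    void teardown_b(void) { }

    /* Initialise A, then B; if B fails, undo A so nothing leaks. */
    int example_net_init(void)
    {
            int ret = setup_a();

            if (ret)
                    return ret;

            ret = setup_b();
            if (ret)
                    teardown_a();
            return ret;
    }

    /* Exit tears the resources down in reverse order of creation. */
    void example_net_exit(void)
    {
            teardown_b();
            teardown_a();
    }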
diff --git a/kernel/net/ipv6/Kconfig b/kernel/net/ipv6/Kconfig
index 438a73aa7..983bb9997 100644
--- a/kernel/net/ipv6/Kconfig
+++ b/kernel/net/ipv6/Kconfig
@@ -5,16 +5,15 @@
# IPv6 as module will cause a CRASH if you try to unload it
menuconfig IPV6
tristate "The IPv6 protocol"
- default m
+ default y
---help---
- This is complemental support for the IP version 6.
- You will still be able to do traditional IPv4 networking as well.
+ Support for IP version 6 (IPv6).
For general information about IPv6, see
<https://en.wikipedia.org/wiki/IPv6>.
- For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
- For specific information about IPv6 under Linux, read the HOWTO at
- <http://www.bieringer.de/linux/IPv6/>.
+ For specific information about IPv6 under Linux, see
+ Documentation/networking/ipv6.txt and read the HOWTO at
+ <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
To compile this protocol support as a module, choose M here: the
module will be called ipv6.
@@ -93,6 +92,25 @@ config IPV6_MIP6
If unsure, say N.
+config IPV6_ILA
+ tristate "IPv6: Identifier Locator Addressing (ILA)"
+ select LWTUNNEL
+ ---help---
+ Support for IPv6 Identifier Locator Addressing (ILA).
+
+ ILA is a mechanism to do network virtualization without
+ encapsulation. The basic concept of ILA is that we split an
+ IPv6 address into a 64 bit locator and 64 bit identifier. The
+ identifier is the identity of an entity in communication
+ ("who") and the locator expresses the location of the
+ entity ("where").
+
+ ILA can be configured using the "encap ila" option with
+ "ip -6 route" command. ILA is described in
+ https://tools.ietf.org/html/draft-herbert-nvo3-ila-00.
+
+ If unsure, say N.
+
config INET6_XFRM_TUNNEL
tristate
select INET6_TUNNEL
diff --git a/kernel/net/ipv6/Makefile b/kernel/net/ipv6/Makefile
index 2e8c06108..2c900c7b7 100644
--- a/kernel/net/ipv6/Makefile
+++ b/kernel/net/ipv6/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o
obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o
obj-$(CONFIG_IPV6_MIP6) += mip6.o
+obj-$(CONFIG_IPV6_ILA) += ila.o
obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
@@ -48,4 +49,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
ifneq ($(CONFIG_IPV6),)
obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o
+obj-y += mcast_snoop.o
endif
diff --git a/kernel/net/ipv6/addrconf.c b/kernel/net/ipv6/addrconf.c
index 37b70e82b..e8d3da081 100644
--- a/kernel/net/ipv6/addrconf.c
+++ b/kernel/net/ipv6/addrconf.c
@@ -81,6 +81,7 @@
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/l3mdev.h>
#include <linux/if_tunnel.h>
#include <linux/rtnetlink.h>
#include <linux/netconf.h>
@@ -195,6 +196,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
.accept_ra_from_local = 0,
+ .accept_ra_min_hop_limit= 1,
.accept_ra_pinfo = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
.accept_ra_rtr_pref = 1,
@@ -211,7 +213,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.accept_ra_mtu = 1,
.stable_secret = {
.initialized = false,
- }
+ },
+ .use_oif_addrs_only = 0,
+ .ignore_routes_with_linkdown = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -236,6 +240,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
.accept_ra_from_local = 0,
+ .accept_ra_min_hop_limit= 1,
.accept_ra_pinfo = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
.accept_ra_rtr_pref = 1,
@@ -253,6 +258,8 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.stable_secret = {
.initialized = false,
},
+ .use_oif_addrs_only = 0,
+ .ignore_routes_with_linkdown = 0,
};
/* Check if a valid qdisc is available */
@@ -343,6 +350,12 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
setup_timer(&ndev->rs_timer, addrconf_rs_timer,
(unsigned long)ndev);
memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
+
+ if (ndev->cnf.stable_secret.initialized)
+ ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+ else
+ ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
+
ndev->cnf.mtu6 = dev->mtu;
ndev->cnf.sysctl = NULL;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -411,6 +424,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
if (err) {
ipv6_mc_destroy_dev(ndev);
del_timer(&ndev->regen_timer);
+ snmp6_unregister_dev(ndev);
goto err_release;
}
/* protected by rtnl_lock */
@@ -468,6 +482,9 @@ static int inet6_netconf_msgsize_devconf(int type)
if (type == -1 || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+ size += nla_total_size(4);
+
return size;
}
@@ -504,6 +521,11 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+ nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ devconf->ignore_routes_with_linkdown) < 0)
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -540,6 +562,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = {
[NETCONFA_IFINDEX] = { .len = sizeof(int) },
[NETCONFA_FORWARDING] = { .len = sizeof(int) },
[NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+ [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
};
static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
@@ -560,7 +583,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
if (err < 0)
goto errout;
- err = EINVAL;
+ err = -EINVAL;
if (!tb[NETCONFA_IFINDEX])
goto errout;
@@ -762,6 +785,63 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
rt6_purge_dflt_routers(net);
return 1;
}
+
+static void addrconf_linkdown_change(struct net *net, __s32 newf)
+{
+ struct net_device *dev;
+ struct inet6_dev *idev;
+
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);
+
+ idev->cnf.ignore_routes_with_linkdown = newf;
+ if (changed)
+ inet6_netconf_notify_devconf(dev_net(dev),
+ NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ dev->ifindex,
+ &idev->cnf);
+ }
+ }
+}
+
+static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf)
+{
+ struct net *net;
+ int old;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ net = (struct net *)table->extra2;
+ old = *p;
+ *p = newf;
+
+ if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) {
+ if ((!newf) ^ (!old))
+ inet6_netconf_notify_devconf(net,
+ NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+ rtnl_unlock();
+ return 0;
+ }
+
+ if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) {
+ net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf;
+ addrconf_linkdown_change(net, newf);
+ if ((!newf) ^ (!old))
+ inet6_netconf_notify_devconf(net,
+ NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+ }
+ rtnl_unlock();
+
+ return 1;
+}
+
#endif
/* Nobody refers to this ifaddr, destroy it */
@@ -1358,15 +1438,96 @@ out:
return ret;
}
+static int __ipv6_dev_get_saddr(struct net *net,
+ struct ipv6_saddr_dst *dst,
+ struct inet6_dev *idev,
+ struct ipv6_saddr_score *scores,
+ int hiscore_idx)
+{
+ struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
+ int i;
+
+ /*
+ * - Tentative Address (RFC2462 section 5.4)
+ * - A tentative address is not considered
+ * "assigned to an interface" in the traditional
+ * sense, unless it is also flagged as optimistic.
+ * - Candidate Source Address (section 4)
+ * - In any case, anycast addresses, multicast
+ * addresses, and the unspecified address MUST
+ * NOT be included in a candidate set.
+ */
+ if ((score->ifa->flags & IFA_F_TENTATIVE) &&
+ (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
+ continue;
+
+ score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+
+ if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
+ score->addr_type & IPV6_ADDR_MULTICAST)) {
+ net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
+ idev->dev->name);
+ continue;
+ }
+
+ score->rule = -1;
+ bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
+
+ for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
+ int minihiscore, miniscore;
+
+ minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i);
+ miniscore = ipv6_get_saddr_eval(net, score, dst, i);
+
+ if (minihiscore > miniscore) {
+ if (i == IPV6_SADDR_RULE_SCOPE &&
+ score->scopedist > 0) {
+ /*
+ * special case:
+ * each remaining entry
+ * has too small (not enough)
+ * scope, because ifa entries
+ * are sorted by their scope
+ * values.
+ */
+ goto out;
+ }
+ break;
+ } else if (minihiscore < miniscore) {
+ if (hiscore->ifa)
+ in6_ifa_put(hiscore->ifa);
+
+ in6_ifa_hold(score->ifa);
+
+ swap(hiscore, score);
+ hiscore_idx = 1 - hiscore_idx;
+
+ /* restore our iterator */
+ score->ifa = hiscore->ifa;
+
+ break;
+ }
+ }
+ }
+out:
+ read_unlock_bh(&idev->lock);
+ return hiscore_idx;
+}
+
int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
const struct in6_addr *daddr, unsigned int prefs,
struct in6_addr *saddr)
{
- struct ipv6_saddr_score scores[2],
- *score = &scores[0], *hiscore = &scores[1];
+ struct ipv6_saddr_score scores[2], *hiscore;
struct ipv6_saddr_dst dst;
+ struct inet6_dev *idev;
struct net_device *dev;
int dst_type;
+ bool use_oif_addr = false;
+ int hiscore_idx = 0;
dst_type = __ipv6_addr_type(daddr);
dst.addr = daddr;
@@ -1375,105 +1536,50 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
dst.prefs = prefs;
- hiscore->rule = -1;
- hiscore->ifa = NULL;
+ scores[hiscore_idx].rule = -1;
+ scores[hiscore_idx].ifa = NULL;
rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- struct inet6_dev *idev;
-
- /* Candidate Source Address (section 4)
- * - multicast and link-local destination address,
- * the set of candidate source address MUST only
- * include addresses assigned to interfaces
- * belonging to the same link as the outgoing
- * interface.
- * (- For site-local destination addresses, the
- * set of candidate source addresses MUST only
- * include addresses assigned to interfaces
- * belonging to the same site as the outgoing
- * interface.)
- */
- if (((dst_type & IPV6_ADDR_MULTICAST) ||
- dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
- dst.ifindex && dev->ifindex != dst.ifindex)
- continue;
-
- idev = __in6_dev_get(dev);
- if (!idev)
- continue;
-
- read_lock_bh(&idev->lock);
- list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
- int i;
-
- /*
- * - Tentative Address (RFC2462 section 5.4)
- * - A tentative address is not considered
- * "assigned to an interface" in the traditional
- * sense, unless it is also flagged as optimistic.
- * - Candidate Source Address (section 4)
- * - In any case, anycast addresses, multicast
- * addresses, and the unspecified address MUST
- * NOT be included in a candidate set.
- */
- if ((score->ifa->flags & IFA_F_TENTATIVE) &&
- (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
- continue;
-
- score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+ /* Candidate Source Address (section 4)
+ * - multicast and link-local destination address,
+ * the set of candidate source address MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same link as the outgoing
+ * interface.
+ * (- For site-local destination addresses, the
+ * set of candidate source addresses MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same site as the outgoing
+ * interface.)
+ * - "It is RECOMMENDED that the candidate source addresses
+ * be the set of unicast addresses assigned to the
+ * interface that will be used to send to the destination
+ * (the 'outgoing' interface)." (RFC 6724)
+ */
+ if (dst_dev) {
+ idev = __in6_dev_get(dst_dev);
+ if ((dst_type & IPV6_ADDR_MULTICAST) ||
+ dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
+ (idev && idev->cnf.use_oif_addrs_only)) {
+ use_oif_addr = true;
+ }
+ }
- if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
- score->addr_type & IPV6_ADDR_MULTICAST)) {
- net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
- dev->name);
+ if (use_oif_addr) {
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
+ } else {
+ for_each_netdev_rcu(net, dev) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
continue;
- }
-
- score->rule = -1;
- bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
-
- for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
- int minihiscore, miniscore;
-
- minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i);
- miniscore = ipv6_get_saddr_eval(net, score, &dst, i);
-
- if (minihiscore > miniscore) {
- if (i == IPV6_SADDR_RULE_SCOPE &&
- score->scopedist > 0) {
- /*
- * special case:
- * each remaining entry
- * has too small (not enough)
- * scope, because ifa entries
- * are sorted by their scope
- * values.
- */
- goto try_nextdev;
- }
- break;
- } else if (minihiscore < miniscore) {
- if (hiscore->ifa)
- in6_ifa_put(hiscore->ifa);
-
- in6_ifa_hold(score->ifa);
-
- swap(hiscore, score);
-
- /* restore our iterator */
- score->ifa = hiscore->ifa;
-
- break;
- }
- }
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
}
-try_nextdev:
- read_unlock_bh(&idev->lock);
}
rcu_read_unlock();
+ hiscore = &scores[hiscore_idx];
if (!hiscore->ifa)
return -EADDRNOTAVAIL;
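The rewritten ipv6_dev_get_saddr() above splits the per-device scoring into __ipv6_dev_get_saddr() and, following the RFC 6724 recommendation quoted in the hunk, restricts the candidate set to addresses on the outgoing interface when the destination is multicast or link-local scoped, or when use_oif_addrs_only is set; otherwise it still walks every device as before. A schematic of that dispatch, with toy types and a score_device() stub standing in for the kernel structures and scoring loop:

    #include <stdbool.h>

    struct toy_dev { bool use_oif_addrs_only; };

    enum dst_kind { DST_UNICAST_GLOBAL, DST_MULTICAST, DST_LINKLOCAL };

    void score_device(const struct toy_dev *dev) { (void)dev; }

    void pick_source(const struct toy_dev *out_dev,
                     const struct toy_dev *all_devs, int ndevs,
                     enum dst_kind kind)
    {
            bool use_oif_only = out_dev &&
                                (kind == DST_MULTICAST ||
                                 kind == DST_LINKLOCAL ||
                                 out_dev->use_oif_addrs_only);

            if (use_oif_only) {
                    /* candidates: the outgoing interface only */
                    score_device(out_dev);
            } else {
                    /* candidates: every device in the namespace */
                    for (int i = 0; i < ndevs; i++)
                            score_device(&all_devs[i]);
            }
    }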
@@ -1845,37 +1951,6 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
__ipv6_dev_ac_dec(ifp->idev, &addr);
}
-static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
-{
- if (dev->addr_len != ETH_ALEN)
- return -1;
- memcpy(eui, dev->dev_addr, 3);
- memcpy(eui + 5, dev->dev_addr + 3, 3);
-
- /*
- * The zSeries OSA network cards can be shared among various
- * OS instances, but the OSA cards have only one MAC address.
- * This leads to duplicate address conflicts in conjunction
- * with IPv6 if more than one instance uses the same card.
- *
- * The driver for these cards can deliver a unique 16-bit
- * identifier for each instance sharing the same card. It is
- * placed instead of 0xFFFE in the interface identifier. The
- * "u" bit of the interface identifier is not inverted in this
- * case. Hence the resulting interface identifier has local
- * scope according to RFC2373.
- */
- if (dev->dev_id) {
- eui[3] = (dev->dev_id >> 8) & 0xFF;
- eui[4] = dev->dev_id & 0xFF;
- } else {
- eui[3] = 0xFF;
- eui[4] = 0xFE;
- eui[0] ^= 2;
- }
- return 0;
-}
-
static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev)
{
if (dev->addr_len != IEEE802154_ADDR_LEN)
@@ -2079,7 +2154,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
unsigned long expires, u32 flags)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_PREFIX,
+ .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
.fc_metric = IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_expires = expires,
@@ -2112,8 +2187,9 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
struct fib6_node *fn;
struct rt6_info *rt = NULL;
struct fib6_table *table;
+ u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
- table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX);
+ table = fib6_get_table(dev_net(dev), tb_id);
if (!table)
return NULL;
@@ -2121,6 +2197,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0);
if (!fn)
goto out;
+
+ noflags |= RTF_CACHE;
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
if (rt->dst.dev->ifindex != dev->ifindex)
continue;
@@ -2142,7 +2220,7 @@ out:
static void addrconf_add_mroute(struct net_device *dev)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_LOCAL,
+ .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_LOCAL,
.fc_metric = IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_dst_len = 8,
@@ -2383,7 +2461,7 @@ ok:
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
if (in6_dev->cnf.optimistic_dad &&
!net->ipv6.devconf_all->forwarding && sllao)
- addr_flags = IFA_F_OPTIMISTIC;
+ addr_flags |= IFA_F_OPTIMISTIC;
#endif
/* Do not allow to create too much of autoconfigured
@@ -2960,6 +3038,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
{
struct in6_addr addr;
+ /* no link local addresses on L3 master devices */
+ if (netif_is_l3_master(idev->dev))
+ return;
+
ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) {
@@ -3050,6 +3132,8 @@ static void addrconf_gre_config(struct net_device *dev)
}
addrconf_addr_gen(idev, true);
+ if (dev->flags & IFF_POINTOPOINT)
+ addrconf_add_mroute(dev);
}
#endif
@@ -3070,6 +3154,32 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
}
break;
+ case NETDEV_CHANGEMTU:
+ /* if MTU under IPV6_MIN_MTU stop IPv6 on this interface. */
+ if (dev->mtu < IPV6_MIN_MTU) {
+ addrconf_ifdown(dev, 1);
+ break;
+ }
+
+ if (idev) {
+ rt6_mtu_change(dev, dev->mtu);
+ idev->cnf.mtu6 = dev->mtu;
+ break;
+ }
+
+ /* allocate new idev */
+ idev = ipv6_add_dev(dev);
+ if (IS_ERR(idev))
+ break;
+
+ /* device is still not ready */
+ if (!(idev->if_flags & IF_READY))
+ break;
+
+ run_pending = 1;
+
+ /* fall through */
+
case NETDEV_UP:
case NETDEV_CHANGE:
if (dev->flags & IFF_SLAVE)
@@ -3093,7 +3203,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
idev->if_flags |= IF_READY;
run_pending = 1;
}
- } else {
+ } else if (event == NETDEV_CHANGE) {
if (!addrconf_qdisc_ok(dev)) {
/* device is still not ready. */
break;
@@ -3158,24 +3268,6 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
}
break;
- case NETDEV_CHANGEMTU:
- if (idev && dev->mtu >= IPV6_MIN_MTU) {
- rt6_mtu_change(dev, dev->mtu);
- idev->cnf.mtu6 = dev->mtu;
- break;
- }
-
- if (!idev && dev->mtu >= IPV6_MIN_MTU) {
- idev = ipv6_add_dev(dev);
- if (!IS_ERR(idev))
- break;
- }
-
- /*
- * if MTU under IPV6_MIN_MTU.
- * Stop IPv6 on this interface.
- */
-
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
/*
@@ -3414,6 +3506,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
+ bool notify = false;
addrconf_join_solict(dev, &ifp->addr);
@@ -3459,7 +3552,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
/* Because optimistic nodes can use this address,
* notify listeners. If DAD fails, RTM_DELADDR is sent.
*/
- ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ notify = true;
}
}
@@ -3467,6 +3560,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
out:
spin_unlock(&ifp->lock);
read_unlock_bh(&idev->lock);
+ if (notify)
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
}
static void addrconf_dad_start(struct inet6_ifaddr *ifp)
@@ -3556,7 +3651,7 @@ static void addrconf_dad_work(struct work_struct *w)
/* send a neighbour solicitation for our addr */
addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
- ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any);
+ ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any);
out:
in6_ifa_put(ifp);
rtnl_unlock();
@@ -4558,6 +4653,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+ array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit;
array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
#ifdef CONFIG_IPV6_ROUTER_PREF
array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
@@ -4583,7 +4679,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
+ array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
/* we omit DEVCONF_STABLE_SECRET for now */
+ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
}
static inline size_t inet6_ifla6_size(void)
@@ -4603,6 +4701,7 @@ static inline size_t inet6_if_nlmsg_size(void)
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ nla_total_size(4) /* IFLA_MTU */
+ nla_total_size(4) /* IFLA_LINK */
+ + nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */
}
@@ -4622,18 +4721,24 @@ static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
}
static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib,
- int items, int bytes, size_t syncpoff)
+ int bytes, size_t syncpoff)
{
- int i;
- int pad = bytes - sizeof(u64) * items;
+ int i, c;
+ u64 buff[IPSTATS_MIB_MAX];
+ int pad = bytes - sizeof(u64) * IPSTATS_MIB_MAX;
+
BUG_ON(pad < 0);
- /* Use put_unaligned() because stats may not be aligned for u64. */
- put_unaligned(items, &stats[0]);
- for (i = 1; i < items; i++)
- put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]);
+ memset(buff, 0, sizeof(buff));
+ buff[0] = IPSTATS_MIB_MAX;
- memset(&stats[items], 0, pad);
+ for_each_possible_cpu(c) {
+ for (i = 1; i < IPSTATS_MIB_MAX; i++)
+ buff[i] += snmp_get_cpu_field64(mib, c, i, syncpoff);
+ }
+
+ memcpy(stats, buff, IPSTATS_MIB_MAX * sizeof(u64));
+ memset(&stats[IPSTATS_MIB_MAX], 0, pad);
}
static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
@@ -4641,8 +4746,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
{
switch (attrtype) {
case IFLA_INET6_STATS:
- __snmp6_fill_stats64(stats, idev->stats.ipv6,
- IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp));
+ __snmp6_fill_stats64(stats, idev->stats.ipv6, bytes,
+ offsetof(struct ipstats_mib, syncp));
break;
case IFLA_INET6_ICMP6STATS:
__snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes);
@@ -4650,7 +4755,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
}
}
-static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
+static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
+ u32 ext_filter_mask)
{
struct nlattr *nla;
struct ifla_cacheinfo ci;
@@ -4670,6 +4776,9 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
/* XXX - MC not implemented */
+ if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS)
+ return 0;
+
nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
if (!nla)
goto nla_put_failure;
@@ -4697,7 +4806,8 @@ nla_put_failure:
return -EMSGSIZE;
}
-static size_t inet6_get_link_af_size(const struct net_device *dev)
+static size_t inet6_get_link_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
if (!__in6_dev_get(dev))
return 0;
@@ -4705,14 +4815,15 @@ static size_t inet6_get_link_af_size(const struct net_device *dev)
return inet6_ifla6_size();
}
-static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+ u32 ext_filter_mask)
{
struct inet6_dev *idev = __in6_dev_get(dev);
if (!idev)
return -ENODATA;
- if (inet6_fill_ifla6_attrs(skb, idev) < 0)
+ if (inet6_fill_ifla6_attrs(skb, idev, ext_filter_mask) < 0)
return -EMSGSIZE;
return 0;
@@ -4859,13 +4970,15 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
(dev->ifindex != dev_get_iflink(dev) &&
- nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) ||
+ nla_put_u8(skb, IFLA_OPERSTATE,
+ netif_running(dev) ? dev->operstate : IF_OPER_DOWN))
goto nla_put_failure;
protoinfo = nla_nest_start(skb, IFLA_PROTINFO);
if (!protoinfo)
goto nla_put_failure;
- if (inet6_fill_ifla6_attrs(skb, idev) < 0)
+ if (inet6_fill_ifla6_attrs(skb, idev, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, protoinfo);
@@ -5046,13 +5159,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
ifp->idev->dev, 0, 0);
- if (rt && ip6_del_rt(rt))
- dst_free(&rt->dst);
+ if (rt)
+ ip6_del_rt(rt);
}
dst_hold(&ifp->rt->dst);
- if (ip6_del_rt(ifp->rt))
- dst_free(&ifp->rt->dst);
+ ip6_del_rt(ifp->rt);
rt_genid_bump_ipv6(net);
break;
@@ -5260,13 +5372,10 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
goto out;
}
- if (!write) {
- err = snprintf(str, sizeof(str), "%pI6",
- &secret->secret);
- if (err >= sizeof(str)) {
- err = -EIO;
- goto out;
- }
+ err = snprintf(str, sizeof(str), "%pI6", &secret->secret);
+ if (err >= sizeof(str)) {
+ err = -EIO;
+ goto out;
}
err = proc_dostring(&lctl, write, buffer, lenp, ppos);
@@ -5304,6 +5413,34 @@ out:
return err;
}
+static
+int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ loff_t pos = *ppos;
+ struct ctl_table lctl;
+ int ret;
+
+ /* ctl->data points to idev->cnf.ignore_routes_when_linkdown
+ * we should not modify it until we get the rtnl lock.
+ */
+ lctl = *ctl;
+ lctl.data = &val;
+
+ ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
+
+ if (write)
+ ret = addrconf_fixup_linkdown(ctl, valp, val);
+ if (ret)
+ *ppos = pos;
+ return ret;
+}
+
static struct addrconf_sysctl_table
{
struct ctl_table_header *sysctl_header;
@@ -5454,6 +5591,13 @@ static struct addrconf_sysctl_table
.proc_handler = proc_dointvec,
},
{
+ .procname = "accept_ra_min_hop_limit",
+ .data = &ipv6_devconf.accept_ra_min_hop_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "accept_ra_pinfo",
.data = &ipv6_devconf.accept_ra_pinfo,
.maxlen = sizeof(int),
@@ -5583,6 +5727,20 @@ static struct addrconf_sysctl_table
.proc_handler = addrconf_sysctl_stable_secret,
},
{
+ .procname = "use_oif_addrs_only",
+ .data = &ipv6_devconf.use_oif_addrs_only,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ignore_routes_with_linkdown",
+ .data = &ipv6_devconf.ignore_routes_with_linkdown,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown,
+ },
+ {
/* sentinel */
}
},
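The new ignore_routes_with_linkdown sysctl handler in the addrconf.c diff follows the pattern its comment points at: the ctl_table is copied and pointed at a local value so that proc_dointvec() never writes to idev->cnf directly, and only after the write has been parsed does addrconf_fixup_linkdown() take the rtnl lock, propagate the value, and notify netconf listeners when the effective setting actually flips. A reduced sketch of that shadow-copy-then-commit shape; lock_subsys, unlock_subsys and notify_change are placeholders, not kernel interfaces:

    #include <stdbool.h>

    void lock_subsys(void)          { }   /* rtnl_lock() in the patch */
    void unlock_subsys(void)        { }   /* rtnl_unlock() in the patch */
    void notify_change(int newval)  { (void)newval; }

    /* Commit the parsed value under the subsystem lock, notifying only
     * when the boolean sense of the setting really changes.
     */
    int commit_setting(int *stored, int newval)
    {
            lock_subsys();
            if ((!newval) ^ (!*stored))
                    notify_change(newval);
            *stored = newval;
            unlock_subsys();
            return 0;
    }

    /* Parse into a shadow copy first, then commit. */
    int handle_write(int *stored, int requested)
    {
            int val = requested;

            return commit_setting(stored, val);
    }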
diff --git a/kernel/net/ipv6/addrconf_core.c b/kernel/net/ipv6/addrconf_core.c
index ca09bf49a..bfa941fc1 100644
--- a/kernel/net/ipv6/addrconf_core.c
+++ b/kernel/net/ipv6/addrconf_core.c
@@ -107,7 +107,16 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v)
}
EXPORT_SYMBOL(inet6addr_notifier_call_chain);
-const struct ipv6_stub *ipv6_stub __read_mostly;
+static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
+ struct dst_entry **u2,
+ struct flowi6 *u3)
+{
+ return -EAFNOSUPPORT;
+}
+
+const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
+ .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+};
EXPORT_SYMBOL_GPL(ipv6_stub);
/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
diff --git a/kernel/net/ipv6/addrlabel.c b/kernel/net/ipv6/addrlabel.c
index 882124ebb..a8f6986dc 100644
--- a/kernel/net/ipv6/addrlabel.c
+++ b/kernel/net/ipv6/addrlabel.c
@@ -552,7 +552,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh)
rcu_read_lock();
p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
- if (p && ip6addrlbl_hold(p))
+ if (p && !ip6addrlbl_hold(p))
p = NULL;
lseq = ip6addrlbl_table.seq;
rcu_read_unlock();
diff --git a/kernel/net/ipv6/af_inet6.c b/kernel/net/ipv6/af_inet6.c
index eef63b394..9f5137cd6 100644
--- a/kernel/net/ipv6/af_inet6.c
+++ b/kernel/net/ipv6/af_inet6.c
@@ -109,6 +109,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
@@ -167,7 +170,7 @@ lookup_protocol:
WARN_ON(!answer_prot->slab);
err = -ENOBUFS;
- sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot);
+ sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern);
if (!sk)
goto out;
@@ -197,6 +200,7 @@ lookup_protocol:
np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
np->mc_loop = 1;
np->pmtudisc = IPV6_PMTUDISC_WANT;
+ np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk));
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
/* Init the ipv4 part of the socket since we can have sockets
@@ -342,7 +346,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST)) {
- if (!(inet->freebind || inet->transparent) &&
+ if (!net->ipv6.sysctl.ip_nonlocal_bind &&
+ !(inet->freebind || inet->transparent) &&
!ipv6_chk_addr(net, &addr->sin6_addr,
dev, 0)) {
err = -EADDRNOTAVAIL;
@@ -362,7 +367,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
np->saddr = addr->sin6_addr;
/* Make sure we are allowed to bind here. */
- if (sk->sk_prot->get_port(sk, snum)) {
+ if ((snum || !inet->bind_address_no_port) &&
+ sk->sk_prot->get_port(sk, snum)) {
inet_reset_saddr(sk);
err = -EADDRINUSE;
goto out;
@@ -425,9 +431,11 @@ void inet6_destroy_sock(struct sock *sk)
/* Free tx options */
- opt = xchg(&np->opt, NULL);
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL);
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
}
EXPORT_SYMBOL_GPL(inet6_destroy_sock);
@@ -656,7 +664,10 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.fl6_sport = inet->inet_sport;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt),
+ &final);
+ rcu_read_unlock();
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
@@ -665,7 +676,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
return PTR_ERR(dst);
}
- __ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, NULL, NULL);
}
return 0;
@@ -678,8 +689,8 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
const struct ipv6_pinfo *np = inet6_sk(sk);
if (np->rxopt.all) {
- if ((opt->hop && (np->rxopt.bits.hopopts ||
- np->rxopt.bits.ohopopts)) ||
+ if (((opt->flags & IP6SKB_HOPBYHOP) &&
+ (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
(ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) &&
np->rxopt.bits.rxflow) ||
(opt->srcrt && (np->rxopt.bits.srcrt ||
@@ -765,9 +776,10 @@ static int __net_init inet6_net_init(struct net *net)
net->ipv6.sysctl.bindv6only = 0;
net->ipv6.sysctl.icmpv6_time = 1*HZ;
net->ipv6.sysctl.flowlabel_consistency = 1;
- net->ipv6.sysctl.auto_flowlabels = 0;
+ net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
net->ipv6.sysctl.idgen_retries = 3;
net->ipv6.sysctl.idgen_delay = 1 * HZ;
+ net->ipv6.sysctl.flowlabel_state_ranges = 0;
atomic_set(&net->ipv6.fib6_sernum, 1);
err = ipv6_init_mibs(net);
diff --git a/kernel/net/ipv6/ah6.c b/kernel/net/ipv6/ah6.c
index ed7d4e3f9..0630a4d5d 100644
--- a/kernel/net/ipv6/ah6.c
+++ b/kernel/net/ipv6/ah6.c
@@ -577,8 +577,10 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len +
ahp->icv_trunc_len + seqhi_len);
- if (!work_iph)
+ if (!work_iph) {
+ err = -ENOMEM;
goto out;
+ }
auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len);
seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len);
diff --git a/kernel/net/ipv6/datagram.c b/kernel/net/ipv6/datagram.c
index b10a88986..428162155 100644
--- a/kernel/net/ipv6/datagram.c
+++ b/kernel/net/ipv6/datagram.c
@@ -162,13 +162,18 @@ ipv4_connected:
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
+ if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+
if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST))
fl6.flowi6_oif = np->mcast_oif;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- opt = flowlabel ? flowlabel->opt : np->opt;
+ rcu_read_lock();
+ opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
final_p = fl6_update_dst(&fl6, opt, &final);
+ rcu_read_unlock();
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
err = 0;
@@ -199,7 +204,7 @@ ipv4_connected:
NULL);
sk->sk_state = TCP_ESTABLISHED;
- ip6_set_txhash(sk);
+ sk_set_txhash(sk);
out:
fl6_sock_release(flowlabel);
return err;
@@ -263,7 +268,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
struct sock_exterr_skb *serr;
struct ipv6hdr *iph;
struct sk_buff *skb;
@@ -568,8 +573,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
}
/* HbH is allowed only once */
- if (np->rxopt.bits.hopopts && opt->hop) {
- u8 *ptr = nh + opt->hop;
+ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+ u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
}
@@ -630,8 +635,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
int hlim = ipv6_hdr(skb)->hop_limit;
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
}
- if (np->rxopt.bits.ohopopts && opt->hop) {
- u8 *ptr = nh + opt->hop;
+ if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+ u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
}
if (np->rxopt.bits.odstopts && opt->dst0) {
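
The two rxopt hunks above replace the stored opt->hop offset with the IP6SKB_HOPBYHOP flag: a Hop-by-Hop block can only sit directly behind the fixed header, so its offset is the constant sizeof(struct ipv6hdr) and only its length still comes from the extension header itself, via (ptr[1] + 1) << 3. A purely illustrative userspace sketch of that length computation (names and the toy packet are assumptions):

#include <stdint.h>
#include <stdio.h>

#define IPV6_FIXED_HDR_LEN 40

/* The Hop-by-Hop block starts right after the 40-byte fixed header; its
 * total size is (Hdr Ext Len + 1) * 8 octets, the same arithmetic the
 * put_cmsg() calls above use. */
static size_t hbh_total_len(const uint8_t *pkt)
{
	const uint8_t *hbh = pkt + IPV6_FIXED_HDR_LEN;

	return ((size_t)hbh[1] + 1) * 8;
}

int main(void)
{
	uint8_t pkt[48] = {0};

	pkt[41] = 0;	/* Hdr Ext Len = 0 -> 8 bytes total */
	printf("%zu\n", hbh_total_len(pkt));	/* prints 8 */
	return 0;
}
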
diff --git a/kernel/net/ipv6/esp6.c b/kernel/net/ipv6/esp6.c
index 7c07ce36a..060a60b2f 100644
--- a/kernel/net/ipv6/esp6.c
+++ b/kernel/net/ipv6/esp6.c
@@ -76,7 +76,7 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen)
len = ALIGN(len, crypto_tfm_ctx_alignment());
}
- len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+ len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
len = ALIGN(len, __alignof__(struct scatterlist));
len += sizeof(struct scatterlist) * nfrags;
@@ -96,17 +96,6 @@ static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
}
-static inline struct aead_givcrypt_request *esp_tmp_givreq(
- struct crypto_aead *aead, u8 *iv)
-{
- struct aead_givcrypt_request *req;
-
- req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
- crypto_tfm_ctx_alignment());
- aead_givcrypt_set_tfm(req, aead);
- return req;
-}
-
static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
{
struct aead_request *req;
@@ -125,14 +114,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
-static inline struct scatterlist *esp_givreq_sg(
- struct crypto_aead *aead, struct aead_givcrypt_request *req)
-{
- return (void *)ALIGN((unsigned long)(req + 1) +
- crypto_aead_reqsize(aead),
- __alignof__(struct scatterlist));
-}
-
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -141,32 +122,57 @@ static void esp_output_done(struct crypto_async_request *base, int err)
xfrm_output_resume(skb, err);
}
+/* Move ESP header back into place. */
+static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
+{
+ struct ip_esp_hdr *esph = (void *)(skb->data + offset);
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ __be32 *seqhi = esp_tmp_seqhi(tmp);
+
+ esph->seq_no = esph->spi;
+ esph->spi = *seqhi;
+}
+
+static void esp_output_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
+}
+
+static void esp_output_done_esn(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ esp_output_restore_header(skb);
+ esp_output_done(base, err);
+}
+
static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
- struct aead_givcrypt_request *req;
+ struct aead_request *req;
struct scatterlist *sg;
- struct scatterlist *asg;
struct sk_buff *trailer;
void *tmp;
int blksize;
int clen;
int alen;
int plen;
+ int ivlen;
int tfclen;
int nfrags;
int assoclen;
- int sglists;
int seqhilen;
u8 *iv;
u8 *tail;
__be32 *seqhi;
+ __be64 seqno;
/* skb is pure payload to encrypt */
aead = x->data;
alen = crypto_aead_authsize(aead);
+ ivlen = crypto_aead_ivsize(aead);
tfclen = 0;
if (x->tfcpad) {
@@ -187,16 +193,14 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
nfrags = err;
assoclen = sizeof(*esph);
- sglists = 1;
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- sglists += 2;
seqhilen += sizeof(__be32);
assoclen += seqhilen;
}
- tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp) {
err = -ENOMEM;
goto error;
@@ -204,9 +208,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
seqhi = esp_tmp_seqhi(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
- req = esp_tmp_givreq(aead, iv);
- asg = esp_givreq_sg(aead, req);
- sg = asg + sglists;
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
/* Fill padding... */
tail = skb_tail_pointer(trailer);
@@ -227,37 +230,53 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
esph = ip_esp_hdr(skb);
*skb_mac_header(skb) = IPPROTO_ESP;
- esph->spi = x->id.spi;
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accommodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
+ *seqhi = esph->spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ }
+
+ esph->spi = x->id.spi;
+
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
- esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
- clen + alen);
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
- if ((x->props.flags & XFRM_STATE_ESN)) {
- sg_init_table(asg, 3);
- sg_set_buf(asg, &esph->spi, sizeof(__be32));
- *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
- sg_set_buf(asg + 1, seqhi, seqhilen);
- sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
- } else
- sg_init_one(asg, esph, sizeof(*esph));
-
- aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
- aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
- aead_givcrypt_set_assoc(req, asg, assoclen);
- aead_givcrypt_set_giv(req, esph->enc_data,
- XFRM_SKB_CB(skb)->seq.output.low +
- ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+ aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+ aead_request_set_ad(req, assoclen);
+
+ seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+ memset(iv, 0, ivlen);
+ memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8),
+ min(ivlen, 8));
ESP_SKB_CB(skb)->tmp = tmp;
- err = crypto_aead_givencrypt(req);
- if (err == -EINPROGRESS)
+ err = crypto_aead_encrypt(req);
+
+ switch (err) {
+ case -EINPROGRESS:
goto error;
- if (err == -EBUSY)
+ case -EBUSY:
err = NET_XMIT_DROP;
+ break;
+
+ case 0:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_output_restore_header(skb);
+ }
kfree(tmp);
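
With the givcrypt interface gone, the rewritten esp6_output() seeds the AEAD IV itself from the 64-bit extended sequence number: the value is laid out big-endian and its low-order bytes are copied into the tail of the IV, zero-padding the front when the IV is longer than 8 bytes. A minimal userspace sketch of just that byte arithmetic; htobe64 from <endian.h> and the helper name are assumptions, not kernel API:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same layout as the memset/memcpy pair in the hunk above: trailing
 * min(ivlen, 8) bytes of the IV carry the big-endian sequence number. */
static void fill_iv_from_seqno(uint8_t *iv, size_t ivlen,
			       uint32_t seq_hi, uint32_t seq_lo)
{
	uint64_t seqno = htobe64(((uint64_t)seq_hi << 32) | seq_lo);
	size_t n = ivlen < 8 ? ivlen : 8;

	memset(iv, 0, ivlen);
	memcpy(iv + ivlen - n, (uint8_t *)&seqno + 8 - n, n);
}

int main(void)
{
	uint8_t iv[8];
	size_t i;

	fill_iv_from_seqno(iv, sizeof(iv), 0x1, 0x2);
	for (i = 0; i < sizeof(iv); i++)
		printf("%02x", iv[i]);
	printf("\n");	/* prints 0000000100000002 */
	return 0;
}
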
@@ -318,25 +337,38 @@ static void esp_input_done(struct crypto_async_request *base, int err)
xfrm_input_resume(skb, esp_input_done2(skb, err));
}
+static void esp_input_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, 0);
+ __skb_pull(skb, 4);
+}
+
+static void esp_input_done_esn(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ esp_input_restore_header(skb);
+ esp_input_done(base, err);
+}
+
static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
{
struct ip_esp_hdr *esph;
struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
- int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+ int ivlen = crypto_aead_ivsize(aead);
+ int elen = skb->len - sizeof(*esph) - ivlen;
int nfrags;
int assoclen;
- int sglists;
int seqhilen;
int ret = 0;
void *tmp;
__be32 *seqhi;
u8 *iv;
struct scatterlist *sg;
- struct scatterlist *asg;
- if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) {
+ if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) {
ret = -EINVAL;
goto out;
}
@@ -355,16 +387,14 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
ret = -ENOMEM;
assoclen = sizeof(*esph);
- sglists = 1;
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- sglists += 2;
seqhilen += sizeof(__be32);
assoclen += seqhilen;
}
- tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp)
goto out;
@@ -372,36 +402,39 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
seqhi = esp_tmp_seqhi(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
- asg = esp_req_sg(aead, req);
- sg = asg + sglists;
+ sg = esp_req_sg(aead, req);
skb->ip_summed = CHECKSUM_NONE;
esph = (struct ip_esp_hdr *)skb->data;
- /* Get ivec. This can be wrong, check against another impls. */
- iv = esph->enc_data;
-
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+ /* For ESN we move the header forward by 4 bytes to
+ * accommodate the high bits. We will move it back after
+ * decryption.
+ */
if ((x->props.flags & XFRM_STATE_ESN)) {
- sg_init_table(asg, 3);
- sg_set_buf(asg, &esph->spi, sizeof(__be32));
- *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
- sg_set_buf(asg + 1, seqhi, seqhilen);
- sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
- } else
- sg_init_one(asg, esph, sizeof(*esph));
+ esph = (void *)skb_push(skb, 4);
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+ aead_request_set_callback(req, 0, esp_input_done_esn, skb);
+ }
- aead_request_set_callback(req, 0, esp_input_done, skb);
- aead_request_set_crypt(req, sg, sg, elen, iv);
- aead_request_set_assoc(req, asg, assoclen);
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
+
+ aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
+ aead_request_set_ad(req, assoclen);
ret = crypto_aead_decrypt(req);
if (ret == -EINPROGRESS)
goto out;
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_input_restore_header(skb);
+
ret = esp_input_done2(skb, ret);
out:
@@ -461,10 +494,16 @@ static void esp6_destroy(struct xfrm_state *x)
static int esp_init_aead(struct xfrm_state *x)
{
+ char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
- aead = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+ err = -ENAMETOOLONG;
+ if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
+ x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+
+ aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
@@ -503,15 +542,19 @@ static int esp_init_authenc(struct xfrm_state *x)
if ((x->props.flags & XFRM_STATE_ESN)) {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
- "authencesn(%s,%s)",
+ "%s%sauthencesn(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
goto error;
} else {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
- "authenc(%s,%s)",
+ "%s%sauthenc(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
goto error;
}
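
The esp_init_aead()/esp_init_authenc() hunks above build the transform name by wrapping the algorithm in the negotiated IV generator (x->geniv) and treat a truncated snprintf() as -ENAMETOOLONG rather than silently using a clipped name. A small userspace sketch of the simple AEAD case only; CRYPTO_MAX_ALG_NAME is assumed to be 64 here, and the helper name is hypothetical:

#include <stdio.h>
#include <string.h>

#define CRYPTO_MAX_ALG_NAME 64	/* assumption: value in this kernel era */

/* Wrap the AEAD algorithm in its IV generator; reject names that would
 * not fit instead of passing a truncated string to the crypto layer. */
static int build_aead_name(char *buf, const char *geniv, const char *alg)
{
	if (snprintf(buf, CRYPTO_MAX_ALG_NAME, "%s(%s)", geniv, alg)
	    >= CRYPTO_MAX_ALG_NAME)
		return -1;	/* the kernel returns -ENAMETOOLONG here */
	return 0;
}

int main(void)
{
	char name[CRYPTO_MAX_ALG_NAME];

	if (build_aead_name(name, "seqiv", "rfc4106(gcm(aes))") == 0)
		printf("%s\n", name);	/* seqiv(rfc4106(gcm(aes))) */
	return 0;
}
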
diff --git a/kernel/net/ipv6/exthdrs.c b/kernel/net/ipv6/exthdrs.c
index a7bbbe455..ea7c4d64a 100644
--- a/kernel/net/ipv6/exthdrs.c
+++ b/kernel/net/ipv6/exthdrs.c
@@ -632,7 +632,7 @@ int ipv6_parse_hopopts(struct sk_buff *skb)
return -1;
}
- opt->hop = sizeof(struct ipv6hdr);
+ opt->flags |= IP6SKB_HOPBYHOP;
if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3;
opt = IP6CB(skb);
@@ -727,6 +727,7 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
*((char **)&opt2->dst1opt) += dif;
if (opt2->srcrt)
*((char **)&opt2->srcrt) += dif;
+ atomic_set(&opt2->refcnt, 1);
}
return opt2;
}
@@ -790,7 +791,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
return ERR_PTR(-ENOBUFS);
memset(opt2, 0, tot_len);
-
+ atomic_set(&opt2->refcnt, 1);
opt2->tot_len = tot_len;
p = (char *)(opt2 + 1);
diff --git a/kernel/net/ipv6/fib6_rules.c b/kernel/net/ipv6/fib6_rules.c
index 2367a16ea..ed33abf57 100644
--- a/kernel/net/ipv6/fib6_rules.c
+++ b/kernel/net/ipv6/fib6_rules.c
@@ -32,6 +32,7 @@ struct fib6_rule {
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
int flags, pol_lookup_t lookup)
{
+ struct rt6_info *rt;
struct fib_lookup_arg arg = {
.lookup_ptr = lookup,
.flags = FIB_LOOKUP_NOREF,
@@ -40,11 +41,21 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
fib_rules_lookup(net->ipv6.fib6_rules_ops,
flowi6_to_flowi(fl6), flags, &arg);
- if (arg.result)
- return arg.result;
+ rt = arg.result;
- dst_hold(&net->ipv6.ip6_null_entry->dst);
- return &net->ipv6.ip6_null_entry->dst;
+ if (!rt) {
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
+ return &net->ipv6.ip6_null_entry->dst;
+ }
+
+ if (rt->rt6i_flags & RTF_REJECT &&
+ rt->dst.error == -EAGAIN) {
+ ip6_rt_put(rt);
+ rt = net->ipv6.ip6_null_entry;
+ dst_hold(&rt->dst);
+ }
+
+ return &rt->dst;
}
static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
@@ -258,11 +269,6 @@ nla_put_failure:
return -ENOBUFS;
}
-static u32 fib6_rule_default_pref(struct fib_rules_ops *ops)
-{
- return 0x3FFF;
-}
-
static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(16) /* dst */
@@ -279,7 +285,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
.configure = fib6_rule_configure,
.compare = fib6_rule_compare,
.fill = fib6_rule_fill,
- .default_pref = fib6_rule_default_pref,
.nlmsg_payload = fib6_rule_nlmsg_payload,
.nlgroup = RTNLGRP_IPV6_RULE,
.policy = fib6_rule_policy,
diff --git a/kernel/net/ipv6/icmp.c b/kernel/net/ipv6/icmp.c
index 2c2b5d51f..0a37ddc7a 100644
--- a/kernel/net/ipv6/icmp.c
+++ b/kernel/net/ipv6/icmp.c
@@ -68,6 +68,7 @@
#include <net/xfrm.h>
#include <net/inet_common.h>
#include <net/dsfield.h>
+#include <net/l3mdev.h>
#include <asm/uaccess.h>
@@ -207,7 +208,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
struct inet_peer *peer;
peer = inet_getpeer_v6(net->ipv6.peers,
- &rt->rt6i_dst.addr, 1);
+ &fl6->daddr, 1);
res = inet_peer_xrlim_allow(peer, tmo);
if (peer)
inet_putpeer(peer);
@@ -329,7 +330,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
struct flowi6 fl2;
int err;
- err = ip6_dst_lookup(sk, &dst, fl6);
+ err = ip6_dst_lookup(net, sk, &dst, fl6);
if (err)
return ERR_PTR(err);
@@ -337,7 +338,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
* We won't send icmp if the destination is known
* anycast.
*/
- if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) {
+ if (ipv6_anycast_destination(dst, &fl6->daddr)) {
net_dbg_ratelimited("icmp6_send: acast source\n");
dst_release(dst);
return ERR_PTR(-EINVAL);
@@ -361,7 +362,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
if (err)
goto relookup_failed;
- err = ip6_dst_lookup(sk, &dst2, &fl2);
+ err = ip6_dst_lookup(net, sk, &dst2, &fl2);
if (err)
goto relookup_failed;
@@ -452,7 +453,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
* and anycast addresses will be checked later.
*/
if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
- net_dbg_ratelimited("icmp6_send: addr_any/mcast source\n");
+ net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n",
+ &hdr->saddr, &hdr->daddr);
return;
}
@@ -460,7 +462,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
* Never answer to a ICMP packet.
*/
if (is_ineligible(skb)) {
- net_dbg_ratelimited("icmp6_send: no reply to icmp error\n");
+ net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n",
+ &hdr->saddr, &hdr->daddr);
return;
}
@@ -496,6 +499,9 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
+ if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = l3mdev_master_ifindex(skb->dev);
+
dst = icmpv6_route_lookup(net, skb, sk, &fl6);
if (IS_ERR(dst))
goto out;
@@ -509,7 +515,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
len = skb->len - msg.offset;
len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr));
if (len < 0) {
- net_dbg_ratelimited("icmp: len problem\n");
+ net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n",
+ &hdr->saddr, &hdr->daddr);
goto out_dst_release;
}
@@ -564,7 +571,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
if (!ipv6_unicast_destination(skb) &&
!(net->ipv6.sysctl.anycast_src_echo_reply &&
- ipv6_anycast_destination(skb)))
+ ipv6_anycast_destination(skb_dst(skb), saddr)))
saddr = NULL;
memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
@@ -575,7 +582,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.daddr = ipv6_hdr(skb)->saddr;
if (saddr)
fl6.saddr = *saddr;
- fl6.flowi6_oif = skb->dev->ifindex;
+ fl6.flowi6_oif = l3mdev_fib_oif(skb->dev);
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
@@ -591,7 +598,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- err = ip6_dst_lookup(sk, &dst, &fl6);
+ err = ip6_dst_lookup(net, sk, &dst, &fl6);
if (err)
goto out;
dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
@@ -781,7 +788,8 @@ static int icmpv6_rcv(struct sk_buff *skb)
if (type & ICMPV6_INFOMSG_MASK)
break;
- net_dbg_ratelimited("icmpv6: msg of unknown type\n");
+ net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n",
+ saddr, daddr);
/*
* error of unknown type.
@@ -826,11 +834,6 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
}
-/*
- * Special lock-class for __icmpv6_sk:
- */
-static struct lock_class_key icmpv6_socket_sk_dst_lock_key;
-
static int __net_init icmpv6_sk_init(struct net *net)
{
struct sock *sk;
@@ -852,15 +855,6 @@ static int __net_init icmpv6_sk_init(struct net *net)
net->ipv6.icmp_sk[i] = sk;
- /*
- * Split off their lock-class, because sk->sk_dst_lock
- * gets used from softirqs, which is safe for
- * __icmpv6_sk (because those never get directly used
- * via userspace syscalls), but unsafe for normal sockets.
- */
- lockdep_set_class(&sk->sk_dst_lock,
- &icmpv6_socket_sk_dst_lock_key);
-
/* Enough space for 2 64K ICMP packets, including
* sk_buff struct overhead.
*/
diff --git a/kernel/net/ipv6/ila.c b/kernel/net/ipv6/ila.c
new file mode 100644
index 000000000..1a6852e1a
--- /dev/null
+++ b/kernel/net/ipv6/ila.c
@@ -0,0 +1,229 @@
+#include <linux/errno.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/ip6_fib.h>
+#include <net/lwtunnel.h>
+#include <net/protocol.h>
+#include <uapi/linux/ila.h>
+
+struct ila_params {
+ __be64 locator;
+ __be64 locator_match;
+ __wsum csum_diff;
+};
+
+static inline struct ila_params *ila_params_lwtunnel(
+ struct lwtunnel_state *lwstate)
+{
+ return (struct ila_params *)lwstate->data;
+}
+
+static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], to[0], to[1],
+ };
+
+ return csum_partial(diff, sizeof(diff), 0);
+}
+
+static inline __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p)
+{
+ if (*(__be64 *)&ip6h->daddr == p->locator_match)
+ return p->csum_diff;
+ else
+ return compute_csum_diff8((__be32 *)&ip6h->daddr,
+ (__be32 *)&p->locator);
+}
+
+static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
+{
+ __wsum diff;
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ size_t nhoff = sizeof(struct ipv6hdr);
+
+ /* First update checksum */
+ switch (ip6h->nexthdr) {
+ case NEXTHDR_TCP:
+ if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) {
+ struct tcphdr *th = (struct tcphdr *)
+ (skb_network_header(skb) + nhoff);
+
+ diff = get_csum_diff(ip6h, p);
+ inet_proto_csum_replace_by_diff(&th->check, skb,
+ diff, true);
+ }
+ break;
+ case NEXTHDR_UDP:
+ if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) {
+ struct udphdr *uh = (struct udphdr *)
+ (skb_network_header(skb) + nhoff);
+
+ if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ diff = get_csum_diff(ip6h, p);
+ inet_proto_csum_replace_by_diff(&uh->check, skb,
+ diff, true);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+ }
+ break;
+ case NEXTHDR_ICMP:
+ if (likely(pskb_may_pull(skb,
+ nhoff + sizeof(struct icmp6hdr)))) {
+ struct icmp6hdr *ih = (struct icmp6hdr *)
+ (skb_network_header(skb) + nhoff);
+
+ diff = get_csum_diff(ip6h, p);
+ inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb,
+ diff, true);
+ }
+ break;
+ }
+
+ /* Now change destination address */
+ *(__be64 *)&ip6h->daddr = p->locator;
+}
+
+static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ goto drop;
+
+ update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
+
+ return dst->lwtstate->orig_output(net, sk, skb);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int ila_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ goto drop;
+
+ update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
+
+ return dst->lwtstate->orig_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+ [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
+};
+
+static int ila_build_state(struct net_device *dev, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct ila_params *p;
+ struct nlattr *tb[ILA_ATTR_MAX + 1];
+ size_t encap_len = sizeof(*p);
+ struct lwtunnel_state *newts;
+ const struct fib6_config *cfg6 = cfg;
+ int ret;
+
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla,
+ ila_nl_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[ILA_ATTR_LOCATOR])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(encap_len);
+ if (!newts)
+ return -ENOMEM;
+
+ newts->len = encap_len;
+ p = ila_params_lwtunnel(newts);
+
+ p->locator = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
+
+ if (cfg6->fc_dst_len > sizeof(__be64)) {
+ /* Precompute checksum difference for translation since we
+ * know both the old locator and the new one.
+ */
+ p->locator_match = *(__be64 *)&cfg6->fc_dst;
+ p->csum_diff = compute_csum_diff8(
+ (__be32 *)&p->locator_match, (__be32 *)&p->locator);
+ }
+
+ newts->type = LWTUNNEL_ENCAP_ILA;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
+ LWTUNNEL_STATE_INPUT_REDIRECT;
+
+ *ts = newts;
+
+ return 0;
+}
+
+static int ila_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ila_params *p = ila_params_lwtunnel(lwtstate);
+
+ if (nla_put_u64(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int ila_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ /* No encapsulation overhead */
+ return 0;
+}
+
+static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct ila_params *a_p = ila_params_lwtunnel(a);
+ struct ila_params *b_p = ila_params_lwtunnel(b);
+
+ return (a_p->locator != b_p->locator);
+}
+
+static const struct lwtunnel_encap_ops ila_encap_ops = {
+ .build_state = ila_build_state,
+ .output = ila_output,
+ .input = ila_input,
+ .fill_encap = ila_fill_encap_info,
+ .get_encap_size = ila_encap_nlsize,
+ .cmp_encap = ila_encap_cmp,
+};
+
+static int __init ila_init(void)
+{
+ return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA);
+}
+
+static void __exit ila_fini(void)
+{
+ lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA);
+}
+
+module_init(ila_init);
+module_exit(ila_fini);
+MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
+MODULE_LICENSE("GPL");
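
The new ila.c rewrites the upper 64 bits of the destination address (the locator) and keeps TCP/UDP/ICMPv6 checksums correct by folding in a precomputed ones'-complement difference instead of recomputing the whole checksum. The sketch below reproduces the idea of compute_csum_diff8() in plain userspace C; the checksum helper is a simplified RFC 1071-style sum, not the kernel's csum_partial(), and all names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Ones'-complement sum over big-endian 16-bit words, folded to 16 bits. */
static uint32_t csum_add(uint32_t sum, const uint8_t *p, size_t len)
{
	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}

/* Summing the bitwise complement of the old 8-byte locator together with
 * the new locator gives a delta that can be added to an existing
 * transport checksum when the locator is swapped, the same trick as
 * compute_csum_diff8() above. */
static uint32_t locator_csum_diff(const uint8_t old_loc[8],
				  const uint8_t new_loc[8])
{
	uint8_t diff[16];
	size_t i;

	for (i = 0; i < 8; i++)
		diff[i] = (uint8_t)~old_loc[i];
	memcpy(diff + 8, new_loc, 8);
	return csum_add(0, diff, sizeof(diff));
}

int main(void)
{
	uint8_t old_loc[8] = { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 1 };
	uint8_t new_loc[8] = { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 2 };

	printf("csum delta: 0x%04x\n", locator_csum_diff(old_loc, new_loc));
	return 0;
}
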
diff --git a/kernel/net/ipv6/inet6_connection_sock.c b/kernel/net/ipv6/inet6_connection_sock.c
index 6927f3fb5..a7ca2cde2 100644
--- a/kernel/net/ipv6/inet6_connection_sock.c
+++ b/kernel/net/ipv6/inet6_connection_sock.c
@@ -65,19 +65,22 @@ int inet6_csk_bind_conflict(const struct sock *sk,
}
EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
-struct dst_entry *inet6_csk_route_req(struct sock *sk,
+struct dst_entry *inet6_csk_route_req(const struct sock *sk,
struct flowi6 *fl6,
- const struct request_sock *req)
+ const struct request_sock *req,
+ u8 proto)
{
struct inet_request_sock *ireq = inet_rsk(req);
- struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *final_p, final;
struct dst_entry *dst;
memset(fl6, 0, sizeof(*fl6));
- fl6->flowi6_proto = IPPROTO_TCP;
+ fl6->flowi6_proto = proto;
fl6->daddr = ireq->ir_v6_rmt_addr;
- final_p = fl6_update_dst(fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
fl6->saddr = ireq->ir_v6_loc_addr;
fl6->flowi6_oif = ireq->ir_iif;
fl6->flowi6_mark = ireq->ir_mark;
@@ -91,73 +94,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
return dst;
}
-
-/*
- * request_sock (formerly open request) hash tables.
- */
-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
- const u32 rnd, const u32 synq_hsize)
-{
- u32 c;
-
- c = jhash_3words((__force u32)raddr->s6_addr32[0],
- (__force u32)raddr->s6_addr32[1],
- (__force u32)raddr->s6_addr32[2],
- rnd);
-
- c = jhash_2words((__force u32)raddr->s6_addr32[3],
- (__force u32)rport,
- c);
-
- return c & (synq_hsize - 1);
-}
-
-struct request_sock *inet6_csk_search_req(struct sock *sk,
- const __be16 rport,
- const struct in6_addr *raddr,
- const struct in6_addr *laddr,
- const int iif)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req;
- u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd,
- lopt->nr_table_entries);
-
- spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
- for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
- const struct inet_request_sock *ireq = inet_rsk(req);
-
- if (ireq->ir_rmt_port == rport &&
- req->rsk_ops->family == AF_INET6 &&
- ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) &&
- ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) &&
- (!ireq->ir_iif || ireq->ir_iif == iif)) {
- atomic_inc(&req->rsk_refcnt);
- WARN_ON(req->sk != NULL);
- break;
- }
- }
- spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
-
- return req;
-}
-EXPORT_SYMBOL_GPL(inet6_csk_search_req);
-
-void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
- struct request_sock *req,
- const unsigned long timeout)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
- inet_rsk(req)->ir_rmt_port,
- lopt->hash_rnd, lopt->nr_table_entries);
-
- reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
- inet_csk_reqsk_queue_added(sk, timeout);
-}
-EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
+EXPORT_SYMBOL(inet6_csk_route_req);
void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
@@ -174,14 +111,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
static inline
-void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr)
-{
- __ip6_dst_store(sk, dst, daddr, saddr);
-}
-
-static inline
struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
{
return __sk_dst_check(sk, cookie);
@@ -207,14 +136,16 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->fl6_dport = inet->inet_dport;
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
- final_p = fl6_update_dst(fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
dst = __inet6_csk_dst_check(sk, np->dst_cookie);
if (!dst) {
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
if (!IS_ERR(dst))
- __inet6_csk_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, NULL, NULL);
}
return dst;
}
@@ -240,7 +171,8 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
/* Restore final destination back after routing done */
fl6.daddr = sk->sk_v6_daddr;
- res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
+ res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
+ np->tclass);
rcu_read_unlock();
return res;
}
diff --git a/kernel/net/ipv6/inet6_hashtables.c b/kernel/net/ipv6/inet6_hashtables.c
index 871641bc1..21ace5a2b 100644
--- a/kernel/net/ipv6/inet6_hashtables.c
+++ b/kernel/net/ipv6/inet6_hashtables.c
@@ -114,6 +114,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score++;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
}
return score;
}
@@ -207,7 +209,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk2;
const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
- int twrefcnt = 0;
spin_lock(lock);
@@ -234,21 +235,17 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
- twrefcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
- if (twrefcnt)
- inet_twsk_put(tw);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
} else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw);
-
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
}
return 0;
@@ -257,7 +254,7 @@ not_unique:
return -EADDRNOTAVAIL;
}
-static inline u32 inet6_sk_port_offset(const struct sock *sk)
+static u32 inet6_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -269,7 +266,11 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk)
int inet6_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
- return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk),
+ u32 port_offset = 0;
+
+ if (!inet_sk(sk)->inet_num)
+ port_offset = inet6_sk_port_offset(sk);
+ return __inet_hash_connect(death_row, sk, port_offset,
__inet6_check_established);
}
EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/kernel/net/ipv6/ip6_fib.c b/kernel/net/ipv6/ip6_fib.c
index bde57b113..0c7e276c2 100644
--- a/kernel/net/ipv6/ip6_fib.c
+++ b/kernel/net/ipv6/ip6_fib.c
@@ -32,6 +32,7 @@
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
+#include <net/lwtunnel.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
@@ -154,10 +155,39 @@ static void node_free(struct fib6_node *fn)
kmem_cache_free(fib6_node_kmem, fn);
}
+static void rt6_rcu_free(struct rt6_info *rt)
+{
+ call_rcu(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
+{
+ int cpu;
+
+ if (!non_pcpu_rt->rt6i_pcpu)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
+ pcpu_rt = *ppcpu_rt;
+ if (pcpu_rt) {
+ rt6_rcu_free(pcpu_rt);
+ *ppcpu_rt = NULL;
+ }
+ }
+
+ non_pcpu_rt->rt6i_pcpu = NULL;
+}
+
static void rt6_release(struct rt6_info *rt)
{
- if (atomic_dec_and_test(&rt->rt6i_ref))
- dst_free(&rt->dst);
+ if (atomic_dec_and_test(&rt->rt6i_ref)) {
+ rt6_free_pcpu(rt);
+ rt6_rcu_free(rt);
+ }
}
static void fib6_link_table(struct net *net, struct fib6_table *tb)
@@ -234,6 +264,7 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id)
return NULL;
}
+EXPORT_SYMBOL_GPL(fib6_get_table);
static void __net_init fib6_tables_init(struct net *net)
{
@@ -255,7 +286,17 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id)
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
int flags, pol_lookup_t lookup)
{
- return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
+ struct rt6_info *rt;
+
+ rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
+ if (rt->rt6i_flags & RTF_REJECT &&
+ rt->dst.error == -EAGAIN) {
+ ip6_rt_put(rt);
+ rt = net->ipv6.ip6_null_entry;
+ dst_hold(&rt->dst);
+ }
+
+ return &rt->dst;
}
static void __net_init fib6_tables_init(struct net *net)
@@ -738,6 +779,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
rt6_clean_expires(iter);
else
rt6_set_expires(iter, rt->dst.expires);
+ iter->rt6i_pmtu = rt->rt6i_pmtu;
return -EEXIST;
}
/* If we have the same destination and the same metric,
@@ -820,7 +862,7 @@ add:
*ins = rt;
rt->rt6i_node = fn;
atomic_inc(&rt->rt6i_ref);
- inet6_rt_notify(RTM_NEWROUTE, rt, info);
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, 0);
info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
if (!(fn->fn_flags & RTN_RTINFO)) {
@@ -846,7 +888,7 @@ add:
rt->rt6i_node = fn;
rt->dst.rt6_next = iter->dst.rt6_next;
atomic_inc(&rt->rt6i_ref);
- inet6_rt_notify(RTM_NEWROUTE, rt, info);
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
@@ -907,6 +949,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
int replace_required = 0;
int sernum = fib6_new_sernum(info->nl_net);
+ if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) &&
+ !atomic_read(&rt->dst.__refcnt)))
+ return -EINVAL;
+
if (info->nlh) {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
allow_create = 0;
@@ -999,6 +1045,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
fib6_start_gc(info->nl_net, rt);
if (!(rt->rt6i_flags & RTF_CACHE))
fib6_prune_clones(info->nl_net, pn);
+ rt->dst.flags &= ~DST_NOCACHE;
}
out:
@@ -1023,7 +1070,8 @@ out:
atomic_inc(&pn->leaf->rt6i_ref);
}
#endif
- dst_free(&rt->dst);
+ if (!(rt->dst.flags & DST_NOCACHE))
+ dst_free(&rt->dst);
}
return err;
@@ -1034,7 +1082,8 @@ out:
st_failure:
if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
fib6_repair_tree(info->nl_net, fn);
- dst_free(&rt->dst);
+ if (!(rt->dst.flags & DST_NOCACHE))
+ dst_free(&rt->dst);
return err;
#endif
}
@@ -1384,7 +1433,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
fib6_purge_rt(rt, fn, net);
- inet6_rt_notify(RTM_DELROUTE, rt, info);
+ inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
rt6_release(rt);
}
diff --git a/kernel/net/ipv6/ip6_flowlabel.c b/kernel/net/ipv6/ip6_flowlabel.c
index d49112501..dc2db4f7b 100644
--- a/kernel/net/ipv6/ip6_flowlabel.c
+++ b/kernel/net/ipv6/ip6_flowlabel.c
@@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
}
spin_lock_bh(&ip6_sk_fl_lock);
for (sflp = &np->ipv6_fl_list;
- (sfl = rcu_dereference(*sflp)) != NULL;
+ (sfl = rcu_dereference_protected(*sflp,
+ lockdep_is_held(&ip6_sk_fl_lock))) != NULL;
sflp = &sfl->next) {
if (sfl->fl->label == freq.flr_label) {
if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
np->flow_label &= ~IPV6_FLOWLABEL_MASK;
- *sflp = rcu_dereference(sfl->next);
+ *sflp = sfl->next;
spin_unlock_bh(&ip6_sk_fl_lock);
fl_release(sfl->fl);
kfree_rcu(sfl, rcu);
@@ -595,6 +596,10 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
if (freq.flr_label & ~IPV6_FLOWLABEL_MASK)
return -EINVAL;
+ if (net->ipv6.sysctl.flowlabel_state_ranges &&
+ (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
+ return -ERANGE;
+
fl = fl_create(net, sk, &freq, optval, optlen, &err);
if (!fl)
return err;
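
The ipv6_flowlabel_opt() hunk above adds the flowlabel_state_ranges sysctl: when enabled, the upper half of the 20-bit flow label space is reserved for stateless (automatic) labels, and a stateful IPV6_FLOWLABEL_MGR request for a label in that half is refused with -ERANGE. A tiny sketch of the range test; the constant mirrors the kernel's IPV6_FLOWLABEL_STATELESS_FLAG and is an assumption here:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed to match the kernel: the stateless half of the 20-bit flow
 * label range is marked by the label's top bit (network byte order). */
#define FLOWLABEL_STATELESS_FLAG htonl(0x00080000)

static int label_reserved_for_stateless(uint32_t flr_label)
{
	return (flr_label & FLOWLABEL_STATELESS_FLAG) != 0;
}

int main(void)
{
	printf("%d\n", label_reserved_for_stateless(htonl(0x80001)));	/* 1 */
	printf("%d\n", label_reserved_for_stateless(htonl(0x00001)));	/* 0 */
	return 0;
}
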
diff --git a/kernel/net/ipv6/ip6_gre.c b/kernel/net/ipv6/ip6_gre.c
index 69f4f689f..e5ea177d3 100644
--- a/kernel/net/ipv6/ip6_gre.c
+++ b/kernel/net/ipv6/ip6_gre.c
@@ -404,13 +404,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct ipv6_tlv_tnl_enc_lim *tel;
__u32 mtu;
case ICMPV6_DEST_UNREACH:
- net_warn_ratelimited("%s: Path to destination invalid or inactive!\n",
- t->parms.name);
+ net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
+ t->parms.name);
break;
case ICMPV6_TIME_EXCEED:
if (code == ICMPV6_EXC_HOPLIMIT) {
- net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
- t->parms.name);
+ net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
+ t->parms.name);
}
break;
case ICMPV6_PARAMPROB:
@@ -421,12 +421,12 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (teli && teli == be32_to_cpu(info) - 2) {
tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
if (tel->encap_limit == 0) {
- net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
- t->parms.name);
+ net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
+ t->parms.name);
}
} else {
- net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
- t->parms.name);
+ net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
+ t->parms.name);
}
break;
case ICMPV6_PKT_TOOBIG:
@@ -634,20 +634,20 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
}
if (!fl6->flowi6_mark)
- dst = ip6_tnl_dst_check(tunnel);
+ dst = ip6_tnl_dst_get(tunnel);
if (!dst) {
- ndst = ip6_route_output(net, NULL, fl6);
+ dst = ip6_route_output(net, NULL, fl6);
- if (ndst->error)
+ if (dst->error)
goto tx_err_link_failure;
- ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0);
- if (IS_ERR(ndst)) {
- err = PTR_ERR(ndst);
- ndst = NULL;
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
goto tx_err_link_failure;
}
- dst = ndst;
+ ndst = dst;
}
tdev = dst->dev;
@@ -702,12 +702,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
skb = new_skb;
}
- if (fl6->flowi6_mark) {
- skb_dst_set(skb, dst);
- ndst = NULL;
- } else {
- skb_dst_set_noref(skb, dst);
- }
+ if (!fl6->flowi6_mark && ndst)
+ ip6_tnl_dst_set(tunnel, ndst);
+ skb_dst_set(skb, dst);
proto = NEXTHDR_GRE;
if (encap_limit >= 0) {
@@ -729,7 +726,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
*/
ipv6h = ipv6_hdr(skb);
ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
- ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
+ ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
ipv6h->hop_limit = tunnel->parms.hop_limit;
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
@@ -762,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
skb_set_inner_protocol(skb, protocol);
ip6tunnel_xmit(NULL, skb, dev);
- if (ndst)
- ip6_tnl_dst_store(tunnel, ndst);
return 0;
tx_err_link_failure:
stats->tx_carrier_errors++;
dst_link_failure(skb);
tx_err_dst_release:
- dst_release(ndst);
+ dst_release(dst);
return err;
}
@@ -1183,7 +1178,8 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
ip6_flow_hdr(ipv6h, 0,
ip6_make_flowlabel(dev_net(dev), skb,
- t->fl.u.ip6.flowlabel, false));
+ t->fl.u.ip6.flowlabel, true,
+ &t->fl.u.ip6));
ipv6h->hop_limit = t->parms.hop_limit;
ipv6h->nexthdr = NEXTHDR_GRE;
ipv6h->saddr = t->parms.laddr;
@@ -1222,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = {
static void ip6gre_dev_free(struct net_device *dev)
{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ ip6_tnl_dst_destroy(t);
free_percpu(dev->tstats);
free_netdev(dev);
}
@@ -1244,9 +1243,10 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
netif_keep_dst(dev);
}
-static int ip6gre_tunnel_init(struct net_device *dev)
+static int ip6gre_tunnel_init_common(struct net_device *dev)
{
struct ip6_tnl *tunnel;
+ int ret;
tunnel = netdev_priv(dev);
@@ -1254,16 +1254,37 @@ static int ip6gre_tunnel_init(struct net_device *dev)
tunnel->net = dev_net(dev);
strcpy(tunnel->parms.name, dev->name);
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ ret = ip6_tnl_dst_init(tunnel);
+ if (ret) {
+ free_percpu(dev->tstats);
+ dev->tstats = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int ip6gre_tunnel_init(struct net_device *dev)
+{
+ struct ip6_tnl *tunnel;
+ int ret;
+
+ ret = ip6gre_tunnel_init_common(dev);
+ if (ret)
+ return ret;
+
+ tunnel = netdev_priv(dev);
+
memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));
if (ipv6_addr_any(&tunnel->parms.raddr))
dev->header_ops = &ip6gre_header_ops;
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
-
return 0;
}
@@ -1459,19 +1480,16 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
static int ip6gre_tap_init(struct net_device *dev)
{
struct ip6_tnl *tunnel;
+ int ret;
- tunnel = netdev_priv(dev);
+ ret = ip6gre_tunnel_init_common(dev);
+ if (ret)
+ return ret;
- tunnel->dev = dev;
- tunnel->net = dev_net(dev);
- strcpy(tunnel->parms.name, dev->name);
+ tunnel = netdev_priv(dev);
ip6gre_tnl_link_config(tunnel, 1);
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
-
return 0;
}
@@ -1553,13 +1571,11 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
return -EEXIST;
} else {
t = nt;
-
- ip6gre_tunnel_unlink(ign, t);
- ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
- ip6gre_tunnel_link(ign, t);
- netdev_state_change(dev);
}
+ ip6gre_tunnel_unlink(ign, t);
+ ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
+ ip6gre_tunnel_link(ign, t);
return 0;
}
diff --git a/kernel/net/ipv6/ip6_input.c b/kernel/net/ipv6/ip6_input.c
index 57990c929..9075acf08 100644
--- a/kernel/net/ipv6/ip6_input.c
+++ b/kernel/net/ipv6/ip6_input.c
@@ -45,8 +45,9 @@
#include <net/addrconf.h>
#include <net/xfrm.h>
#include <net/inet_ecn.h>
+#include <net/dst_metadata.h>
-int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb)
+int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
const struct inet6_protocol *ipprot;
@@ -55,7 +56,7 @@ int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb)
if (ipprot && ipprot->early_demux)
ipprot->early_demux(skb);
}
- if (!skb_dst(skb))
+ if (!skb_valid_dst(skb))
ip6_route_input(skb);
return dst_input(skb);
@@ -98,7 +99,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
* arrived via the sending interface (ethX), because of the
* nature of scoping architecture. --yoshfuji
*/
- IP6CB(skb)->iif = skb_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex;
+ IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex;
if (unlikely(!pskb_may_pull(skb, sizeof(*hdr))))
goto err;
@@ -108,7 +109,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (hdr->version != 6)
goto err;
- IP6_ADD_STATS_BH(dev_net(dev), idev,
+ IP6_ADD_STATS_BH(net, idev,
IPSTATS_MIB_NOECTPKTS +
(ipv6_get_dsfield(hdr) & INET_ECN_MASK),
max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
@@ -182,8 +183,8 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb,
- dev, NULL,
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
ip6_rcv_finish);
err:
IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
@@ -198,9 +199,8 @@ drop:
*/
-static int ip6_input_finish(struct sock *sk, struct sk_buff *skb)
+static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
const struct inet6_protocol *ipprot;
struct inet6_dev *idev;
unsigned int nhoff;
@@ -277,8 +277,8 @@ discard:
int ip6_input(struct sk_buff *skb)
{
- return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, NULL, skb,
- skb->dev, NULL,
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
ip6_input_finish);
}
diff --git a/kernel/net/ipv6/ip6_offload.c b/kernel/net/ipv6/ip6_offload.c
index 08b62047c..eeca943f1 100644
--- a/kernel/net/ipv6/ip6_offload.c
+++ b/kernel/net/ipv6/ip6_offload.c
@@ -264,6 +264,9 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
int err = -ENOSYS;
+ if (skb->encapsulation)
+ skb_set_inner_network_header(skb, nhoff);
+
iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
rcu_read_lock();
@@ -280,6 +283,13 @@ out_unlock:
return err;
}
+static int sit_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_SIT;
+ return ipv6_gro_complete(skb, nhoff);
+}
+
static struct packet_offload ipv6_packet_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
.callbacks = {
@@ -292,6 +302,8 @@ static struct packet_offload ipv6_packet_offload __read_mostly = {
static const struct net_offload sit_offload = {
.callbacks = {
.gso_segment = ipv6_gso_segment,
+ .gro_receive = ipv6_gro_receive,
+ .gro_complete = sit_gro_complete,
},
};
diff --git a/kernel/net/ipv6/ip6_output.c b/kernel/net/ipv6/ip6_output.c
index bc09cb97b..31144c486 100644
--- a/kernel/net/ipv6/ip6_output.c
+++ b/kernel/net/ipv6/ip6_output.c
@@ -55,8 +55,9 @@
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
+#include <net/l3mdev.h>
-static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
+static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct net_device *dev = dst->dev;
@@ -71,7 +72,7 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
- ((mroute6_socket(dev_net(dev), skb) &&
+ ((mroute6_socket(net, skb) &&
!(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
&ipv6_hdr(skb)->saddr))) {
@@ -82,19 +83,18 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
*/
if (newskb)
NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
- sk, newskb, NULL, newskb->dev,
+ net, sk, newskb, NULL, newskb->dev,
dev_loopback_xmit);
if (ipv6_hdr(skb)->hop_limit == 0) {
- IP6_INC_STATS(dev_net(dev), idev,
+ IP6_INC_STATS(net, idev,
IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
return 0;
}
}
- IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
- skb->len);
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
IPV6_ADDR_SCOPE_NODELOCAL &&
@@ -105,7 +105,7 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
}
rcu_read_lock_bh();
- nexthop = rt6_nexthop((struct rt6_info *)dst);
+ nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
@@ -116,48 +116,49 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
}
rcu_read_unlock_bh();
- IP6_INC_STATS(dev_net(dst->dev),
- ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EINVAL;
}
-static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
+static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
dst_allfrag(skb_dst(skb)) ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
- return ip6_fragment(sk, skb, ip6_finish_output2);
+ return ip6_fragment(net, sk, skb, ip6_finish_output2);
else
- return ip6_finish_output2(sk, skb);
+ return ip6_finish_output2(net, sk, skb);
}
-int ip6_output(struct sock *sk, struct sk_buff *skb)
+int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+
if (unlikely(idev->cnf.disable_ipv6)) {
- IP6_INC_STATS(dev_net(dev), idev,
- IPSTATS_MIB_OUTDISCARDS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
return 0;
}
- return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
- NULL, dev,
+ return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, dev,
ip6_finish_output,
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
/*
- * xmit an sk_buff (used by TCP, SCTP and DCCP)
+ * xmit an sk_buff (used by TCP, SCTP and DCCP)
+ * Note : socket lock is not held for SYNACK packets, but might be modified
+ * by calls to skb_set_owner_w() and ipv6_local_error(),
+ * which are using proper atomic operations or spinlocks.
*/
-
-int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
+int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
struct ipv6_txoptions *opt, int tclass)
{
struct net *net = sock_net(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *first_hop = &fl6->daddr;
struct dst_entry *dst = skb_dst(skb);
struct ipv6hdr *hdr;
@@ -186,7 +187,10 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
}
consume_skb(skb);
skb = skb2;
- skb_set_owner_w(skb, sk);
+ /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
+ * it is safe to call in our context (socket lock not held)
+ */
+ skb_set_owner_w(skb, (struct sock *)sk);
}
if (opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
@@ -207,7 +211,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
hlimit = ip6_dst_hoplimit(dst);
ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
- np->autoflowlabel));
+ np->autoflowlabel, fl6));
hdr->payload_len = htons(seg_len);
hdr->nexthdr = proto;
@@ -224,12 +228,20 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_OUT, skb->len);
- return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
- NULL, dst->dev, dst_output_sk);
+ /* hooks should never assume socket lock is held.
+ * we promote our socket to non const
+ */
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, (struct sock *)sk, skb, NULL, dst->dev,
+ dst_output);
}
skb->dev = dst->dev;
- ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
+ /* ipv6_local_error() does not require socket lock,
+ * we promote our socket to non const
+ */
+ ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
+
IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
return -EMSGSIZE;
@@ -317,10 +329,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
return 0;
}
-static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
+static inline int ip6_forward_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
skb_sender_cpu_clear(skb);
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
@@ -376,6 +389,9 @@ int ip6_forward(struct sk_buff *skb)
if (skb->pkt_type != PACKET_HOST)
goto drop;
+ if (unlikely(skb->sk))
+ goto drop;
+
if (skb_warn_if_lro(skb))
goto drop;
@@ -459,7 +475,7 @@ int ip6_forward(struct sk_buff *skb)
else
target = &hdr->daddr;
- peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+ peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
/* Limit redirects both by destination (here)
and by source (inside ndisc_send_redirect)
@@ -512,8 +528,8 @@ int ip6_forward(struct sk_buff *skb)
IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
- return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
- skb->dev, dst->dev,
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, dst->dev,
ip6_forward_finish);
error:
@@ -540,8 +556,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
-int ip6_fragment(struct sock *sk, struct sk_buff *skb,
- int (*output)(struct sock *, struct sk_buff *))
+int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct sk_buff *frag;
struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
@@ -551,10 +567,9 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
struct frag_hdr *fh;
unsigned int mtu, hlen, left, len;
int hroom, troom;
- __be32 frag_id = 0;
+ __be32 frag_id;
int ptr, offset = 0, err = 0;
u8 *prevhdr, nexthdr = 0;
- struct net *net = dev_net(skb_dst(skb)->dev);
hlen = ip6_find_1stfragopt(skb, &prevhdr);
nexthdr = *prevhdr;
@@ -564,40 +579,50 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
/* We must not fragment if the socket is set to force MTU discovery
* or if the skb it not generated by a local socket.
*/
- if (unlikely(!skb->ignore_df && skb->len > mtu) ||
- (IP6CB(skb)->frag_max_size &&
- IP6CB(skb)->frag_max_size > mtu)) {
- if (skb->sk && dst_allfrag(skb_dst(skb)))
- sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
+ if (unlikely(!skb->ignore_df && skb->len > mtu))
+ goto fail_toobig;
- skb->dev = skb_dst(skb)->dev;
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_FRAGFAILS);
- kfree_skb(skb);
- return -EMSGSIZE;
+ if (IP6CB(skb)->frag_max_size) {
+ if (IP6CB(skb)->frag_max_size > mtu)
+ goto fail_toobig;
+
+ /* don't send fragments larger than what we received */
+ mtu = IP6CB(skb)->frag_max_size;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
}
if (np && np->frag_size < mtu) {
if (np->frag_size)
mtu = np->frag_size;
}
+ if (mtu < hlen + sizeof(struct frag_hdr) + 8)
+ goto fail_toobig;
mtu -= hlen + sizeof(struct frag_hdr);
+ frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+ &ipv6_hdr(skb)->saddr);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto fail;
+
+ hroom = LL_RESERVED_SPACE(rt->dst.dev);
if (skb_has_frag_list(skb)) {
int first_len = skb_pagelen(skb);
struct sk_buff *frag2;
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
- skb_cloned(skb))
+ skb_cloned(skb) ||
+ skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
goto slow_path;
skb_walk_frags(skb, frag) {
/* Correct geometry. */
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
- skb_headroom(frag) < hlen)
+ skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
goto slow_path_clean;
/* Partially cloned skb? */
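
The hunk above reworks how ip6_fragment() derives the usable per-fragment size: instead of failing outright, a recorded frag_max_size now caps the fragment size (clamped up to the IPv6 minimum MTU), and a fragment must still have room for the unfragmentable part plus the 8-byte fragment header. A small userspace recomputation of that logic, with illustrative names only:

#include <stdio.h>

#define IPV6_MIN_MTU 1280
#define FRAG_HDR_LEN 8

/* Returns the payload budget per fragment, or -1 for the fail_toobig
 * cases in the hunk above (fragment larger than the path MTU, or an MTU
 * too small to carry the headers plus 8 bytes of data). */
static int frag_payload(unsigned int mtu, unsigned int frag_max_size,
			unsigned int hlen)
{
	if (frag_max_size) {
		if (frag_max_size > mtu)
			return -1;
		/* don't send fragments larger than what we received */
		mtu = frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}
	if (mtu < hlen + FRAG_HDR_LEN + 8)
		return -1;
	return (int)(mtu - hlen - FRAG_HDR_LEN);
}

int main(void)
{
	printf("%d\n", frag_payload(1500, 0, 40));	/* 1452 */
	printf("%d\n", frag_payload(1500, 1280, 40));	/* 1232 */
	return 0;
}
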
@@ -614,8 +639,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
err = 0;
offset = 0;
- frag = skb_shinfo(skb)->frag_list;
- skb_frag_list_init(skb);
/* BUILD HEADER */
*prevhdr = NEXTHDR_FRAGMENT;
@@ -623,8 +646,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
if (!tmp_hdr) {
IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_FRAGFAILS);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto fail;
}
+ frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
__skb_pull(skb, hlen);
fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
@@ -632,11 +658,10 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
skb_reset_network_header(skb);
memcpy(skb_network_header(skb), tmp_hdr, hlen);
- ipv6_select_ident(net, fh, rt);
fh->nexthdr = nexthdr;
fh->reserved = 0;
fh->frag_off = htons(IP6_MF);
- frag_id = fh->identification;
+ fh->identification = frag_id;
first_len = skb_pagelen(skb);
skb->data_len = first_len - skb_headlen(skb);
@@ -670,7 +695,7 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
ip6_copy_metadata(frag, skb);
}
- err = output(sk, skb);
+ err = output(net, sk, skb);
if (!err)
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGCREATES);
@@ -710,10 +735,6 @@ slow_path_clean:
}
slow_path:
- if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
- skb_checksum_help(skb))
- goto fail;
-
left = skb->len - hlen; /* Space per frame */
ptr = hlen; /* Where to start from */
@@ -722,7 +743,6 @@ slow_path:
*/
*prevhdr = NEXTHDR_FRAGMENT;
- hroom = LL_RESERVED_SPACE(rt->dst.dev);
troom = rt->dst.dev->needed_tailroom;
/*
@@ -778,11 +798,7 @@ slow_path:
*/
fh->nexthdr = nexthdr;
fh->reserved = 0;
- if (!frag_id) {
- ipv6_select_ident(net, fh, rt);
- frag_id = fh->identification;
- } else
- fh->identification = frag_id;
+ fh->identification = frag_id;
/*
* Copy a block of the IP datagram.
@@ -803,7 +819,7 @@ slow_path:
/*
* Put this fragment into the sending queue.
*/
- err = output(sk, frag);
+ err = output(net, sk, frag);
if (err)
goto fail;
@@ -815,6 +831,14 @@ slow_path:
consume_skb(skb);
return err;
+fail_toobig:
+ if (skb->sk && dst_allfrag(skb_dst(skb)))
+ sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
+
+ skb->dev = skb_dst(skb)->dev;
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ err = -EMSGSIZE;
+
fail:
IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_FRAGFAILS);
@@ -867,7 +891,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
#ifdef CONFIG_IPV6_SUBTREES
ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
- (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
+ (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
+ (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
dst_release(dst);
dst = NULL;
}
@@ -876,15 +901,15 @@ out:
return dst;
}
-static int ip6_dst_lookup_tail(struct sock *sk,
+static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
struct dst_entry **dst, struct flowi6 *fl6)
{
- struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
struct neighbour *n;
struct rt6_info *rt;
#endif
int err;
+ int flags = 0;
/* The correct way to handle this would be to do
* ip6_route_get_saddr, and then ip6_route_output; however,
@@ -916,10 +941,13 @@ static int ip6_dst_lookup_tail(struct sock *sk,
dst_release(*dst);
*dst = NULL;
}
+
+ if (fl6->flowi6_oif)
+ flags |= RT6_LOOKUP_F_IFACE;
}
if (!*dst)
- *dst = ip6_route_output(net, sk, fl6);
+ *dst = ip6_route_output_flags(net, sk, fl6, flags);
err = (*dst)->error;
if (err)
@@ -936,7 +964,8 @@ static int ip6_dst_lookup_tail(struct sock *sk,
*/
rt = (struct rt6_info *) *dst;
rcu_read_lock_bh();
- n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
+ n = __ipv6_neigh_lookup_noref(rt->dst.dev,
+ rt6_nexthop(rt, &fl6->daddr));
err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
rcu_read_unlock_bh();
@@ -988,10 +1017,11 @@ out_err_release:
*
* It returns zero on success, or a standard errno code on error.
*/
-int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
+int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
+ struct flowi6 *fl6)
{
*dst = NULL;
- return ip6_dst_lookup_tail(sk, dst, fl6);
+ return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
@@ -1006,17 +1036,19 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup);
* It returns a valid dst pointer on success, or a pointer encoded
* error code.
*/
-struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
const struct in6_addr *final_dst)
{
struct dst_entry *dst = NULL;
int err;
- err = ip6_dst_lookup_tail(sk, &dst, fl6);
+ err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
if (err)
return ERR_PTR(err);
if (final_dst)
fl6->daddr = *final_dst;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
@@ -1044,7 +1076,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
dst = ip6_sk_dst_check(sk, dst, fl6);
- err = ip6_dst_lookup_tail(sk, &dst, fl6);
+ err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
if (err)
return ERR_PTR(err);
if (final_dst)
@@ -1060,11 +1092,10 @@ static inline int ip6_ufo_append_data(struct sock *sk,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
int transhdrlen, int mtu, unsigned int flags,
- struct rt6_info *rt)
+ const struct flowi6 *fl6)
{
struct sk_buff *skb;
- struct frag_hdr fhdr;
int err;
/* There is support for UDP large send offload by network
@@ -1106,8 +1137,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
sizeof(struct frag_hdr)) & ~7;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
- ipv6_select_ident(sock_net(sk), &fhdr, rt);
- skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
+ skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
+ &fl6->daddr,
+ &fl6->saddr);
append:
return skb_append_datato_frags(sk, skb, getfrag, from,
@@ -1242,6 +1274,7 @@ static int __ip6_append_data(struct sock *sk,
struct rt6_info *rt = (struct rt6_info *)cork->dst;
struct ipv6_txoptions *opt = v6_cork->opt;
int csummode = CHECKSUM_NONE;
+ unsigned int maxnonfragsize, headersize;
skb = skb_peek_tail(queue);
if (!skb) {
@@ -1259,38 +1292,43 @@ static int __ip6_append_data(struct sock *sk,
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
sizeof(struct frag_hdr);
- if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
- unsigned int maxnonfragsize, headersize;
-
- headersize = sizeof(struct ipv6hdr) +
- (opt ? opt->opt_flen + opt->opt_nflen : 0) +
- (dst_allfrag(&rt->dst) ?
- sizeof(struct frag_hdr) : 0) +
- rt->rt6i_nfheader_len;
-
- if (ip6_sk_ignore_df(sk))
- maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
- else
- maxnonfragsize = mtu;
+ headersize = sizeof(struct ipv6hdr) +
+ (opt ? opt->opt_flen + opt->opt_nflen : 0) +
+ (dst_allfrag(&rt->dst) ?
+ sizeof(struct frag_hdr) : 0) +
+ rt->rt6i_nfheader_len;
+
+ if (cork->length + length > mtu - headersize && dontfrag &&
+ (sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_RAW)) {
+ ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
+ sizeof(struct ipv6hdr));
+ goto emsgsize;
+ }
- /* dontfrag active */
- if ((cork->length + length > mtu - headersize) && dontfrag &&
- (sk->sk_protocol == IPPROTO_UDP ||
- sk->sk_protocol == IPPROTO_RAW)) {
- ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
- sizeof(struct ipv6hdr));
- goto emsgsize;
- }
+ if (ip6_sk_ignore_df(sk))
+ maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
+ else
+ maxnonfragsize = mtu;
- if (cork->length + length > maxnonfragsize - headersize) {
+ if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
- ipv6_local_error(sk, EMSGSIZE, fl6,
- mtu - headersize +
- sizeof(struct ipv6hdr));
- return -EMSGSIZE;
- }
+ ipv6_local_error(sk, EMSGSIZE, fl6,
+ mtu - headersize +
+ sizeof(struct ipv6hdr));
+ return -EMSGSIZE;
}
+ /* CHECKSUM_PARTIAL only with no extension headers and when
+ * we are not going to fragment
+ */
+ if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
+ headersize == sizeof(struct ipv6hdr) &&
+ length < mtu - headersize &&
+ !(flags & MSG_MORE) &&
+ rt->dst.dev->features & NETIF_F_V6_CSUM)
+ csummode = CHECKSUM_PARTIAL;
+
if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
sock_tx_timestamp(sk, &tx_flags);
if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
@@ -1298,16 +1336,6 @@ emsgsize:
tskey = sk->sk_tskey++;
}
- /* If this is the first and only packet and device
- * supports checksum offloading, let's use it.
- * Use transhdrlen, same as IPv4, because partial
- * sums only work when transhdrlen is set.
- */
- if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
- length + fragheaderlen < mtu &&
- rt->dst.dev->features & NETIF_F_V6_CSUM &&
- !exthdrlen)
- csummode = CHECKSUM_PARTIAL;
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
@@ -1329,10 +1357,10 @@ emsgsize:
(skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) &&
- (sk->sk_type == SOCK_DGRAM)) {
+ (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen,
- transhdrlen, mtu, flags, rt);
+ transhdrlen, mtu, flags, fl6);
if (err)
goto error;
return 0;
@@ -1641,7 +1669,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
ip6_flow_hdr(hdr, v6_cork->tclass,
ip6_make_flowlabel(net, skb, fl6->flowlabel,
- np->autoflowlabel));
+ np->autoflowlabel, fl6));
hdr->hop_limit = v6_cork->hop_limit;
hdr->nexthdr = proto;
hdr->saddr = fl6->saddr;
@@ -1670,7 +1698,7 @@ int ip6_send_skb(struct sk_buff *skb)
struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
int err;
- err = ip6_local_out(skb);
+ err = ip6_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
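
The ip6_fragment() rework above centres on a simple sizing rule: the unfragmentable headers plus a fragment header must still fit in the MTU with room for at least one 8-byte payload slice, and every non-final fragment carries a multiple of 8 bytes. A minimal standalone sketch of that arithmetic (assuming the usual 40-byte IPv6 header and 8-byte fragment header; illustrative only, not taken from the patch):

#include <stdio.h>

#define IPV6_HDR_LEN   40	/* sizeof(struct ipv6hdr) */
#define FRAG_HDR_LEN    8	/* sizeof(struct frag_hdr) */

int main(void)
{
	unsigned int mtu = 1280;		/* IPV6_MIN_MTU, as an example */
	unsigned int hlen = IPV6_HDR_LEN;	/* unfragmentable part (40 here) */

	/* Mirrors the new early check: bail out before building anything. */
	if (mtu < hlen + FRAG_HDR_LEN + 8) {
		puts("fail_toobig: MTU cannot hold even one 8-byte slice");
		return 1;
	}

	/* Payload room per fragment, rounded down to a multiple of 8. */
	unsigned int room = (mtu - hlen - FRAG_HDR_LEN) & ~7u;

	printf("per-fragment payload: %u bytes\n", room);	/* 1232 for MTU 1280 */
	return 0;
}
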
diff --git a/kernel/net/ipv6/ip6_tunnel.c b/kernel/net/ipv6/ip6_tunnel.c
index 5cafd92c2..137fca42a 100644
--- a/kernel/net/ipv6/ip6_tunnel.c
+++ b/kernel/net/ipv6/ip6_tunnel.c
@@ -126,36 +126,92 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
* Locking : hash tables are protected by RCU and RTNL
*/
-struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
+static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst,
+ struct dst_entry *dst)
{
- struct dst_entry *dst = t->dst_cache;
+ write_seqlock_bh(&idst->lock);
+ dst_release(rcu_dereference_protected(
+ idst->dst,
+ lockdep_is_held(&idst->lock.lock)));
+ if (dst) {
+ dst_hold(dst);
+ idst->cookie = rt6_get_cookie((struct rt6_info *)dst);
+ } else {
+ idst->cookie = 0;
+ }
+ rcu_assign_pointer(idst->dst, dst);
+ write_sequnlock_bh(&idst->lock);
+}
+
+struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t)
+{
+ struct ip6_tnl_dst *idst;
+ struct dst_entry *dst;
+ unsigned int seq;
+ u32 cookie;
- if (dst && dst->obsolete &&
- !dst->ops->check(dst, t->dst_cookie)) {
- t->dst_cache = NULL;
+ idst = raw_cpu_ptr(t->dst_cache);
+
+ rcu_read_lock();
+ do {
+ seq = read_seqbegin(&idst->lock);
+ dst = rcu_dereference(idst->dst);
+ cookie = idst->cookie;
+ } while (read_seqretry(&idst->lock, seq));
+
+ if (dst && !atomic_inc_not_zero(&dst->__refcnt))
+ dst = NULL;
+ rcu_read_unlock();
+
+ if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) {
+ ip6_tnl_per_cpu_dst_set(idst, NULL);
dst_release(dst);
- return NULL;
+ dst = NULL;
}
-
return dst;
}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_check);
+EXPORT_SYMBOL_GPL(ip6_tnl_dst_get);
void ip6_tnl_dst_reset(struct ip6_tnl *t)
{
- dst_release(t->dst_cache);
- t->dst_cache = NULL;
+ int i;
+
+ for_each_possible_cpu(i)
+ ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
}
EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset);
-void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
+void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst)
+{
+ ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst);
+
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_dst_set);
+
+void ip6_tnl_dst_destroy(struct ip6_tnl *t)
{
- struct rt6_info *rt = (struct rt6_info *) dst;
- t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- dst_release(t->dst_cache);
- t->dst_cache = dst;
+ if (!t->dst_cache)
+ return;
+
+ ip6_tnl_dst_reset(t);
+ free_percpu(t->dst_cache);
}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_store);
+EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy);
+
+int ip6_tnl_dst_init(struct ip6_tnl *t)
+{
+ int i;
+
+ t->dst_cache = alloc_percpu(struct ip6_tnl_dst);
+ if (!t->dst_cache)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i)
+ seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_dst_init);
/**
* ip6_tnl_lookup - fetch tunnel matching the end-point addresses
@@ -271,6 +327,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
static void ip6_dev_free(struct net_device *dev)
{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ ip6_tnl_dst_destroy(t);
free_percpu(dev->tstats);
free_netdev(dev);
}
@@ -510,14 +569,14 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
struct ipv6_tlv_tnl_enc_lim *tel;
__u32 mtu;
case ICMPV6_DEST_UNREACH:
- net_warn_ratelimited("%s: Path to destination invalid or inactive!\n",
- t->parms.name);
+ net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
+ t->parms.name);
rel_msg = 1;
break;
case ICMPV6_TIME_EXCEED:
if ((*code) == ICMPV6_EXC_HOPLIMIT) {
- net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
- t->parms.name);
+ net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
+ t->parms.name);
rel_msg = 1;
}
break;
@@ -529,13 +588,13 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
if (teli && teli == *info - 2) {
tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
if (tel->encap_limit == 0) {
- net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
- t->parms.name);
+ net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
+ t->parms.name);
rel_msg = 1;
}
} else {
- net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
- t->parms.name);
+ net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
+ t->parms.name);
}
break;
case ICMPV6_PKT_TOOBIG:
@@ -1010,23 +1069,23 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
neigh_release(neigh);
} else if (!fl6->flowi6_mark)
- dst = ip6_tnl_dst_check(t);
+ dst = ip6_tnl_dst_get(t);
if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
goto tx_err_link_failure;
if (!dst) {
- ndst = ip6_route_output(net, NULL, fl6);
+ dst = ip6_route_output(net, NULL, fl6);
- if (ndst->error)
+ if (dst->error)
goto tx_err_link_failure;
- ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0);
- if (IS_ERR(ndst)) {
- err = PTR_ERR(ndst);
- ndst = NULL;
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
goto tx_err_link_failure;
}
- dst = ndst;
+ ndst = dst;
}
tdev = dst->dev;
@@ -1072,12 +1131,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
consume_skb(skb);
skb = new_skb;
}
- if (fl6->flowi6_mark) {
- skb_dst_set(skb, dst);
- ndst = NULL;
- } else {
- skb_dst_set_noref(skb, dst);
- }
+
+ if (!fl6->flowi6_mark && ndst)
+ ip6_tnl_dst_set(t, ndst);
+ skb_dst_set(skb, dst);
+
skb->transport_header = skb->network_header;
proto = fl6->flowi6_proto;
@@ -1095,20 +1153,18 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
- ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
+ ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
ipv6h->hop_limit = t->parms.hop_limit;
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr;
ip6tunnel_xmit(NULL, skb, dev);
- if (ndst)
- ip6_tnl_dst_store(t, ndst);
return 0;
tx_err_link_failure:
stats->tx_carrier_errors++;
dst_link_failure(skb);
tx_err_dst_release:
- dst_release(ndst);
+ dst_release(dst);
return err;
}
@@ -1573,12 +1629,21 @@ static inline int
ip6_tnl_dev_init_gen(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
+ int ret;
t->dev = dev;
t->net = dev_net(dev);
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
+
+ ret = ip6_tnl_dst_init(t);
+ if (ret) {
+ free_percpu(dev->tstats);
+ dev->tstats = NULL;
+ return ret;
+ }
+
return 0;
}
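
ip6_tnl_dst_get() above pairs a per-cpu cache with a seqlock so readers never block writers: they snapshot the pointer and the cookie, retry if the sequence moved underneath them, and only then try to take a reference. A simplified, single-threaded userspace sketch of that read/retry idiom (illustrative only; the kernel uses its own seqlock and RCU primitives):

#include <stdatomic.h>
#include <stdio.h>

struct dst_cache {
	atomic_uint seq;	/* even = stable, odd = write in progress */
	void *dst;		/* cached pointer */
	unsigned int cookie;	/* validity cookie captured at store time */
};

static struct dst_cache cache;	/* the kernel keeps one of these per cpu */

/* Write side: in the kernel this runs under write_seqlock_bh(). */
static void cache_set(void *dst, unsigned int cookie)
{
	atomic_fetch_add(&cache.seq, 1);	/* begin write (seq goes odd) */
	cache.dst = dst;
	cache.cookie = cookie;
	atomic_fetch_add(&cache.seq, 1);	/* end write (seq goes even)  */
}

/* Read side: snapshot, then retry if a writer raced with us. */
static void *cache_get(unsigned int *cookie)
{
	unsigned int start;
	void *dst;

	do {
		start = atomic_load(&cache.seq);
		dst = cache.dst;
		*cookie = cache.cookie;
	} while (atomic_load(&cache.seq) != start || (start & 1));

	return dst;	/* the kernel still validates and refcounts it */
}

int main(void)
{
	unsigned int cookie;
	void *dst;

	cache_set(&cache, 42);
	dst = cache_get(&cookie);
	printf("dst=%p cookie=%u\n", dst, cookie);
	return 0;
}
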
diff --git a/kernel/net/ipv6/ip6_udp_tunnel.c b/kernel/net/ipv6/ip6_udp_tunnel.c
index bba8903e8..14dacf1df 100644
--- a/kernel/net/ipv6/ip6_udp_tunnel.c
+++ b/kernel/net/ipv6/ip6_udp_tunnel.c
@@ -19,11 +19,18 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
int err;
struct socket *sock = NULL;
- err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock);
+ err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock);
if (err < 0)
goto error;
- sk_change_net(sock->sk, net);
+ if (cfg->ipv6_v6only) {
+ int val = 1;
+
+ err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+ (char *) &val, sizeof(val));
+ if (err < 0)
+ goto error;
+ }
udp6_addr.sin6_family = AF_INET6;
memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
@@ -55,7 +62,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
error:
if (sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sock->sk);
+ sock_release(sock);
}
*sockp = NULL;
return err;
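
The hunk above makes udp_sock_create6() honour an ipv6_v6only flag by setting IPV6_V6ONLY on the freshly created kernel socket. The userspace equivalent of that step, shown here purely for illustration (port number is an arbitrary example), is the ordinary setsockopt() call before bind():

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	/* Restrict the socket to IPv6 traffic only. */
	if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)) < 0)
		perror("setsockopt(IPV6_V6ONLY)");

	struct sockaddr_in6 addr;
	memset(&addr, 0, sizeof(addr));
	addr.sin6_family = AF_INET6;
	addr.sin6_addr = in6addr_any;
	addr.sin6_port = htons(4789);	/* arbitrary example port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");

	close(fd);
	return 0;
}
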
diff --git a/kernel/net/ipv6/ip6_vti.c b/kernel/net/ipv6/ip6_vti.c
index 0224c032d..0a8610b33 100644
--- a/kernel/net/ipv6/ip6_vti.c
+++ b/kernel/net/ipv6/ip6_vti.c
@@ -482,7 +482,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
return -EMSGSIZE;
}
- err = dst_output(skb);
+ err = dst_output(t->net, skb->sk, skb);
if (net_xmit_eval(err) == 0) {
struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
diff --git a/kernel/net/ipv6/ip6mr.c b/kernel/net/ipv6/ip6mr.c
index 5f36266b1..a10e77103 100644
--- a/kernel/net/ipv6/ip6mr.c
+++ b/kernel/net/ipv6/ip6mr.c
@@ -118,7 +118,7 @@ static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc,
int cmd);
static int ip6mr_rtm_dumproute(struct sk_buff *skb,
struct netlink_callback *cb);
-static void mroute_clean_tables(struct mr6_table *mrt);
+static void mroute_clean_tables(struct mr6_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
@@ -217,7 +217,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
.match = ip6mr_rule_match,
.configure = ip6mr_rule_configure,
.compare = ip6mr_rule_compare,
- .default_pref = fib_default_rule_pref,
.fill = ip6mr_rule_fill,
.nlgroup = RTNLGRP_IPV6_RULE,
.policy = ip6mr_rule_policy,
@@ -335,7 +334,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
static void ip6mr_free_table(struct mr6_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, true);
kfree(mrt);
}
@@ -766,10 +765,6 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
@@ -1543,7 +1538,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
* Close the multicast socket, and clear the vif tables etc
*/
-static void mroute_clean_tables(struct mr6_table *mrt)
+static void mroute_clean_tables(struct mr6_table *mrt, bool all)
{
int i;
LIST_HEAD(list);
@@ -1553,8 +1548,9 @@ static void mroute_clean_tables(struct mr6_table *mrt)
* Shut down all active vif entries
*/
for (i = 0; i < mrt->maxvif; i++) {
- if (!(mrt->vif6_table[i].flags & VIFF_STATIC))
- mif6_delete(mrt, i, &list);
+ if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC))
+ continue;
+ mif6_delete(mrt, i, &list);
}
unregister_netdevice_many(&list);
@@ -1563,7 +1559,7 @@ static void mroute_clean_tables(struct mr6_table *mrt)
*/
for (i = 0; i < MFC6_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) {
- if (c->mfc_flags & MFC_STATIC)
+ if (!all && (c->mfc_flags & MFC_STATIC))
continue;
write_lock_bh(&mrt_lock);
list_del(&c->list);
@@ -1626,7 +1622,7 @@ int ip6mr_sk_done(struct sock *sk)
net->ipv6.devconf_all);
write_unlock_bh(&mrt_lock);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, false);
err = 0;
break;
}
@@ -1986,13 +1982,13 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
}
#endif
-static inline int ip6mr_forward2_finish(struct sock *sk, struct sk_buff *skb)
+static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)),
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP6_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)),
+ IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_OUTOCTETS, skb->len);
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
/*
@@ -2064,8 +2060,8 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt,
IP6CB(skb)->flags |= IP6SKB_FORWARDED;
- return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
- skb->dev, dev,
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, dev,
ip6mr_forward2_finish);
out_free:
diff --git a/kernel/net/ipv6/ipv6_sockglue.c b/kernel/net/ipv6/ipv6_sockglue.c
index 63e695691..4449ad1f8 100644
--- a/kernel/net/ipv6/ipv6_sockglue.c
+++ b/kernel/net/ipv6/ipv6_sockglue.c
@@ -111,7 +111,8 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
}
- opt = xchg(&inet6_sk(sk)->opt, opt);
+ opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt,
+ opt);
sk_dst_reset(sk);
return opt;
@@ -231,9 +232,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
sk->sk_socket->ops = &inet_dgram_ops;
sk->sk_family = PF_INET;
}
- opt = xchg(&np->opt, NULL);
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ opt = xchg((__force struct ipv6_txoptions **)&np->opt,
+ NULL);
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
pktopt = xchg(&np->pktoptions, NULL);
kfree_skb(pktopt);
@@ -403,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
break;
- opt = ipv6_renew_options(sk, np->opt, optname,
+ opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ opt = ipv6_renew_options(sk, opt, optname,
(struct ipv6_opt_hdr __user *)optval,
optlen);
if (IS_ERR(opt)) {
@@ -432,8 +437,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
retv = 0;
opt = ipv6_update_options(sk, opt);
sticky_done:
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
break;
}
@@ -486,6 +493,7 @@ sticky_done:
break;
memset(opt, 0, sizeof(*opt));
+ atomic_set(&opt->refcnt, 1);
opt->tot_len = sizeof(*opt) + optlen;
retv = -EFAULT;
if (copy_from_user(opt+1, optval, optlen))
@@ -502,8 +510,10 @@ update:
retv = 0;
opt = ipv6_update_options(sk, opt);
done:
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
break;
}
case IPV6_UNICAST_HOPS:
@@ -1110,10 +1120,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
case IPV6_RTHDR:
case IPV6_DSTOPTS:
{
+ struct ipv6_txoptions *opt;
lock_sock(sk);
- len = ipv6_getsockopt_sticky(sk, np->opt,
- optname, optval, len);
+ opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len);
release_sock(sk);
/* check if ipv6_getsockopt_sticky() returns err code */
if (len < 0)
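
The ipv6_sockglue.c hunks above turn the per-socket tx options into a refcounted object reached through an RCU-protected pointer: setters swap the published pointer and drop their reference with txopt_put() instead of freeing outright. A standalone sketch of that swap-and-put pattern (the names mirror the kernel's, but this is an illustration, not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct txopt {
	atomic_int refcnt;
	size_t tot_len;
};

static void txopt_put(struct txopt *opt)
{
	if (opt && atomic_fetch_sub(&opt->refcnt, 1) == 1) {
		printf("last reference gone, freeing %zu bytes\n", opt->tot_len);
		free(opt);
	}
}

int main(void)
{
	struct txopt *cur = malloc(sizeof(*cur));
	_Atomic(struct txopt *) published;
	struct txopt *old;

	if (!cur)
		return 1;
	atomic_init(&cur->refcnt, 1);		/* as in the new setsockopt path */
	cur->tot_len = sizeof(*cur) + 16;
	atomic_init(&published, cur);		/* publish (rcu_assign_pointer in the kernel) */

	/* Teardown path: unpublish, then drop the reference we held. */
	old = atomic_exchange(&published, NULL);
	txopt_put(old);
	return 0;
}
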
diff --git a/kernel/net/ipv6/mcast.c b/kernel/net/ipv6/mcast.c
index 083b2927f..5ee56d0a8 100644
--- a/kernel/net/ipv6/mcast.c
+++ b/kernel/net/ipv6/mcast.c
@@ -1645,13 +1645,12 @@ static void mld_sendpack(struct sk_buff *skb)
payload_len = skb->len;
err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
- net->ipv6.igmp_sk, skb, NULL, skb->dev,
- dst_output_sk);
+ net, net->ipv6.igmp_sk, skb, NULL, skb->dev,
+ dst_output);
out:
if (!err) {
ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT);
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
- IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, payload_len);
} else {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
}
@@ -2008,13 +2007,13 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
}
skb_dst_set(skb, dst);
- err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
- NULL, skb->dev, dst_output_sk);
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, skb->dev,
+ dst_output);
out:
if (!err) {
ICMP6MSGOUT_INC_STATS(net, idev, type);
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
- IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len);
} else
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
diff --git a/kernel/net/ipv6/mcast_snoop.c b/kernel/net/ipv6/mcast_snoop.c
new file mode 100644
index 000000000..9405b04ee
--- /dev/null
+++ b/kernel/net/ipv6/mcast_snoop.c
@@ -0,0 +1,216 @@
+/* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ * Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki.
+ */
+
+#include <linux/skbuff.h>
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/addrconf.h>
+#include <net/ip6_checksum.h>
+
+static int ipv6_mc_check_ip6hdr(struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6h;
+ unsigned int len;
+ unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ ip6h = ipv6_hdr(skb);
+
+ if (ip6h->version != 6)
+ return -EINVAL;
+
+ len = offset + ntohs(ip6h->payload_len);
+ if (skb->len < len || len <= offset)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ipv6_mc_check_exthdrs(struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6h;
+ int offset;
+ u8 nexthdr;
+ __be16 frag_off;
+
+ ip6h = ipv6_hdr(skb);
+
+ if (ip6h->nexthdr != IPPROTO_HOPOPTS)
+ return -ENOMSG;
+
+ nexthdr = ip6h->nexthdr;
+ offset = skb_network_offset(skb) + sizeof(*ip6h);
+ offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
+
+ if (offset < 0)
+ return -EINVAL;
+
+ if (nexthdr != IPPROTO_ICMPV6)
+ return -ENOMSG;
+
+ skb_set_transport_header(skb, offset);
+
+ return 0;
+}
+
+static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb);
+
+ len += sizeof(struct mld2_report);
+
+ return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ipv6_mc_check_mld_query(struct sk_buff *skb)
+{
+ struct mld_msg *mld;
+ unsigned int len = skb_transport_offset(skb);
+
+ /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
+ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
+ return -EINVAL;
+
+ len += sizeof(struct mld_msg);
+ if (skb->len < len)
+ return -EINVAL;
+
+ /* MLDv1? */
+ if (skb->len != len) {
+ /* or MLDv2? */
+ len += sizeof(struct mld2_query) - sizeof(struct mld_msg);
+ if (skb->len < len || !pskb_may_pull(skb, len))
+ return -EINVAL;
+ }
+
+ mld = (struct mld_msg *)skb_transport_header(skb);
+
+ /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer
+ * all-nodes destination address (ff02::1) for general queries
+ */
+ if (ipv6_addr_any(&mld->mld_mca) &&
+ !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
+{
+ struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb);
+
+ switch (mld->mld_type) {
+ case ICMPV6_MGM_REDUCTION:
+ case ICMPV6_MGM_REPORT:
+ /* fall through */
+ return 0;
+ case ICMPV6_MLD2_REPORT:
+ return ipv6_mc_check_mld_reportv2(skb);
+ case ICMPV6_MGM_QUERY:
+ return ipv6_mc_check_mld_query(skb);
+ default:
+ return -ENOMSG;
+ }
+}
+
+static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb)
+{
+ return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo);
+}
+
+static int __ipv6_mc_check_mld(struct sk_buff *skb,
+ struct sk_buff **skb_trimmed)
+
+{
+ struct sk_buff *skb_chk = NULL;
+ unsigned int transport_len;
+ unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg);
+ int ret = -EINVAL;
+
+ transport_len = ntohs(ipv6_hdr(skb)->payload_len);
+ transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr);
+
+ skb_chk = skb_checksum_trimmed(skb, transport_len,
+ ipv6_mc_validate_checksum);
+ if (!skb_chk)
+ goto err;
+
+ if (!pskb_may_pull(skb_chk, len))
+ goto err;
+
+ ret = ipv6_mc_check_mld_msg(skb_chk);
+ if (ret)
+ goto err;
+
+ if (skb_trimmed)
+ *skb_trimmed = skb_chk;
+ /* free now unneeded clone */
+ else if (skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ ret = 0;
+
+err:
+ if (ret && skb_chk && skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ return ret;
+}
+
+/**
+ * ipv6_mc_check_mld - checks whether this is a sane MLD packet
+ * @skb: the skb to validate
+ * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional)
+ *
+ * Checks whether an IPv6 packet is a valid MLD packet. If so sets
+ * skb transport header accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ * standard
+ * -ENOMSG: IP header validation succeeded but it is not an MLD packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Optionally, an skb pointer might be provided via skb_trimmed (or set it
+ * to NULL): After parsing an MLD packet successfully it will point to
+ * an skb which has its tail aligned to the IP packet end. This might
+ * either be the originally provided skb or a trimmed, cloned version if
+ * the skb frame had data beyond the IP packet. A cloned skb allows us
+ * to leave the original skb and its full frame unchanged (which might be
+ * desirable for layer 2 frame jugglers).
+ *
+ * Caller needs to set the skb network header and free any returned skb if it
+ * differs from the provided skb.
+ */
+int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+{
+ int ret;
+
+ ret = ipv6_mc_check_ip6hdr(skb);
+ if (ret < 0)
+ return ret;
+
+ ret = ipv6_mc_check_exthdrs(skb);
+ if (ret < 0)
+ return ret;
+
+ return __ipv6_mc_check_mld(skb, skb_trimmed);
+}
+EXPORT_SYMBOL(ipv6_mc_check_mld);
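
The contract documented above is easiest to see from the caller's side. A hedged, kernel-style sketch of how an MLD snooper might consume ipv6_mc_check_mld(), written against the comment above rather than lifted from an actual caller (it is not a standalone program):

static int snoop_mld(struct sk_buff *skb)
{
	struct sk_buff *trimmed = NULL;
	int err;

	/* the caller is assumed to have set the skb network header already */
	err = ipv6_mc_check_mld(skb, &trimmed);
	if (err == -ENOMSG)
		return 0;		/* well-formed IPv6, just not MLD */
	if (err < 0)
		return err;		/* broken packet, or -ENOMEM */

	/* ... inspect the MLD message via skb_transport_header(trimmed) ... */

	if (trimmed != skb)
		kfree_skb(trimmed);	/* free the trimmed clone we were handed */
	return 0;
}
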
diff --git a/kernel/net/ipv6/mip6.c b/kernel/net/ipv6/mip6.c
index b9779d441..60c79a08e 100644
--- a/kernel/net/ipv6/mip6.c
+++ b/kernel/net/ipv6/mip6.c
@@ -118,7 +118,7 @@ static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
struct mip6_report_rate_limiter {
spinlock_t lock;
- struct timeval stamp;
+ ktime_t stamp;
int iif;
struct in6_addr src;
struct in6_addr dst;
@@ -184,20 +184,18 @@ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb)
return 0;
}
-static inline int mip6_report_rl_allow(struct timeval *stamp,
+static inline int mip6_report_rl_allow(ktime_t stamp,
const struct in6_addr *dst,
const struct in6_addr *src, int iif)
{
int allow = 0;
spin_lock_bh(&mip6_report_rl.lock);
- if (mip6_report_rl.stamp.tv_sec != stamp->tv_sec ||
- mip6_report_rl.stamp.tv_usec != stamp->tv_usec ||
+ if (!ktime_equal(mip6_report_rl.stamp, stamp) ||
mip6_report_rl.iif != iif ||
!ipv6_addr_equal(&mip6_report_rl.src, src) ||
!ipv6_addr_equal(&mip6_report_rl.dst, dst)) {
- mip6_report_rl.stamp.tv_sec = stamp->tv_sec;
- mip6_report_rl.stamp.tv_usec = stamp->tv_usec;
+ mip6_report_rl.stamp = stamp;
mip6_report_rl.iif = iif;
mip6_report_rl.src = *src;
mip6_report_rl.dst = *dst;
@@ -216,7 +214,7 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
struct ipv6_destopt_hao *hao = NULL;
struct xfrm_selector sel;
int offset;
- struct timeval stamp;
+ ktime_t stamp;
int err = 0;
if (unlikely(fl6->flowi6_proto == IPPROTO_MH &&
@@ -230,9 +228,9 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
(skb_network_header(skb) + offset);
}
- skb_get_timestamp(skb, &stamp);
+ stamp = skb_get_ktime(skb);
- if (!mip6_report_rl_allow(&stamp, &ipv6_hdr(skb)->daddr,
+ if (!mip6_report_rl_allow(stamp, &ipv6_hdr(skb)->daddr,
hao ? &hao->addr : &ipv6_hdr(skb)->saddr,
opt->iif))
goto out;
diff --git a/kernel/net/ipv6/ndisc.c b/kernel/net/ipv6/ndisc.c
index 96f153c08..84afb9a77 100644
--- a/kernel/net/ipv6/ndisc.c
+++ b/kernel/net/ipv6/ndisc.c
@@ -67,6 +67,7 @@
#include <net/flow.h>
#include <net/ip6_checksum.h>
#include <net/inet_common.h>
+#include <net/l3mdev.h>
#include <linux/proc_fs.h>
#include <linux/netfilter.h>
@@ -147,6 +148,7 @@ struct neigh_table nd_tbl = {
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
};
+EXPORT_SYMBOL_GPL(nd_tbl);
static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data)
{
@@ -441,8 +443,11 @@ static void ndisc_send_skb(struct sk_buff *skb,
if (!dst) {
struct flowi6 fl6;
+ int oif = l3mdev_fib_oif(skb->dev);
- icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex);
+ icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
+ if (oif != skb->dev->ifindex)
+ fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
dst = icmp6_dst_alloc(skb->dev, &fl6);
if (IS_ERR(dst)) {
kfree_skb(skb);
@@ -463,9 +468,9 @@ static void ndisc_send_skb(struct sk_buff *skb,
idev = __in6_dev_get(dst->dev);
IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
- err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
- NULL, dst->dev,
- dst_output_sk);
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, dst->dev,
+ dst_output);
if (!err) {
ICMP6MSGOUT_INC_STATS(net, idev, type);
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
@@ -474,8 +479,7 @@ static void ndisc_send_skb(struct sk_buff *skb,
rcu_read_unlock();
}
-void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
- const struct in6_addr *daddr,
+void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
const struct in6_addr *solicited_addr,
bool router, bool solicited, bool override, bool inc_opt)
{
@@ -541,7 +545,7 @@ static void ndisc_send_unsol_na(struct net_device *dev)
read_lock_bh(&idev->lock);
list_for_each_entry(ifa, &idev->addr_list, if_list) {
- ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &ifa->addr,
+ ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr,
/*router=*/ !!idev->cnf.forwarding,
/*solicited=*/ false, /*override=*/ true,
/*inc_opt=*/ true);
@@ -551,8 +555,7 @@ static void ndisc_send_unsol_na(struct net_device *dev)
in6_dev_put(idev);
}
-void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
- const struct in6_addr *solicit,
+void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
const struct in6_addr *daddr, const struct in6_addr *saddr)
{
struct sk_buff *skb;
@@ -675,12 +678,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
"%s: trying to ucast probe in NUD_INVALID: %pI6\n",
__func__, target);
}
- ndisc_send_ns(dev, neigh, target, target, saddr);
+ ndisc_send_ns(dev, target, target, saddr);
} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
neigh_app_ns(neigh);
} else {
addrconf_addr_solict_mult(target, &mcaddr);
- ndisc_send_ns(dev, NULL, target, &mcaddr, saddr);
+ ndisc_send_ns(dev, target, &mcaddr, saddr);
}
}
@@ -764,7 +767,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
if (ifp) {
-
+have_ifp:
if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
if (dad) {
/*
@@ -790,6 +793,18 @@ static void ndisc_recv_ns(struct sk_buff *skb)
} else {
struct net *net = dev_net(dev);
+ /* perhaps an address on the master device */
+ if (netif_is_l3_slave(dev)) {
+ struct net_device *mdev;
+
+ mdev = netdev_master_upper_dev_get_rcu(dev);
+ if (mdev) {
+ ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1);
+ if (ifp)
+ goto have_ifp;
+ }
+ }
+
idev = in6_dev_get(dev);
if (!idev) {
/* XXX: count this drop? */
@@ -824,7 +839,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
is_router = idev->cnf.forwarding;
if (dad) {
- ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target,
+ ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target,
!!is_router, false, (ifp != NULL), true);
goto out;
}
@@ -845,8 +860,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE);
if (neigh || !dev->header_ops) {
- ndisc_send_na(dev, neigh, saddr, &msg->target,
- !!is_router,
+ ndisc_send_na(dev, saddr, &msg->target, !!is_router,
true, (ifp != NULL && inc), inc);
if (neigh)
neigh_release(neigh);
@@ -1074,6 +1088,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
struct ndisc_options ndopts;
int optlen;
unsigned int pref = 0;
+ __u32 old_if_flags;
+ bool send_ifinfo_notify = false;
__u8 *opt = (__u8 *)(ra_msg + 1);
@@ -1144,6 +1160,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
* Remember the managed/otherconf flags from most recently
* received RA message (RFC 2462) -- yoshfuji
*/
+ old_if_flags = in6_dev->if_flags;
in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED |
IF_RA_OTHERCONF)) |
(ra_msg->icmph.icmp6_addrconf_managed ?
@@ -1151,6 +1168,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
(ra_msg->icmph.icmp6_addrconf_other ?
IF_RA_OTHERCONF : 0);
+ if (old_if_flags != in6_dev->if_flags)
+ send_ifinfo_notify = true;
+
if (!in6_dev->cnf.accept_ra_defrtr) {
ND_PRINTK(2, info,
"RA: %s, defrtr is false for dev: %s\n",
@@ -1163,7 +1183,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
*/
if (!in6_dev->cnf.accept_ra_from_local &&
ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
- NULL, 0)) {
+ in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: default router ignored\n",
skb->dev->name);
@@ -1225,18 +1245,16 @@ static void ndisc_router_discovery(struct sk_buff *skb)
if (rt)
rt6_set_expires(rt, jiffies + (HZ * lifetime));
- if (ra_msg->icmph.icmp6_hop_limit) {
- /* Only set hop_limit on the interface if it is higher than
- * the current hop_limit.
- */
- if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) {
+ if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
+ ra_msg->icmph.icmp6_hop_limit) {
+ if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+ if (rt)
+ dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
+ ra_msg->icmph.icmp6_hop_limit);
} else {
- ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n");
+ ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
}
- if (rt)
- dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
- ra_msg->icmph.icmp6_hop_limit);
}
skip_defrtr:
@@ -1254,7 +1272,7 @@ skip_defrtr:
rtime = HZ/10;
NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
in6_dev->tstamp = jiffies;
- inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+ send_ifinfo_notify = true;
}
rtime = ntohl(ra_msg->reachable_time);
@@ -1271,11 +1289,17 @@ skip_defrtr:
GC_STALETIME, 3 * rtime);
in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
in6_dev->tstamp = jiffies;
- inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+ send_ifinfo_notify = true;
}
}
}
+ /*
+ * Send a notify if RA changed managed/otherconf flags or timer settings
+ */
+ if (send_ifinfo_notify)
+ inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+
skip_linkparms:
/*
@@ -1313,7 +1337,7 @@ skip_linkparms:
#ifdef CONFIG_IPV6_ROUTE_INFO
if (!in6_dev->cnf.accept_ra_from_local &&
ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
- NULL, 0)) {
+ in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: router info ignored.\n",
skb->dev->name);
@@ -1472,6 +1496,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
struct flowi6 fl6;
int rd_len;
u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+ int oif = l3mdev_fib_oif(dev);
bool ret;
if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
@@ -1488,7 +1513,10 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
}
icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
- &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
+ &saddr_buf, &ipv6_hdr(skb)->saddr, oif);
+
+ if (oif != skb->dev->ifindex)
+ fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
@@ -1506,7 +1534,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
"Redirect: destination is not a neighbour\n");
goto release;
}
- peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+ peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1);
ret = inet_peer_xrlim_allow(peer, 1*HZ);
if (peer)
inet_putpeer(peer);
@@ -1650,6 +1678,7 @@ int ndisc_rcv(struct sk_buff *skb)
static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
struct net *net = dev_net(dev);
struct inet6_dev *idev;
@@ -1664,6 +1693,11 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
ndisc_send_unsol_na(dev);
in6_dev_put(idev);
break;
+ case NETDEV_CHANGE:
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ neigh_changeaddr(&nd_tbl, dev);
+ break;
case NETDEV_DOWN:
neigh_ifdown(&nd_tbl, dev);
fib6_run_gc(0, net, false);
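
The hop-limit handling above replaces the old "only raise, never lower" rule with a configurable floor: an RA-supplied hop limit is applied only when it is at least accept_ra_min_hop_limit, and a floor of 256 disables RA hop-limit processing entirely, since ICMPv6 carries an 8-bit value. A small standalone sketch of that decision (illustrative, not kernel code):

#include <stdio.h>

static int ra_hop_limit_accepted(unsigned int min_hop_limit,
				 unsigned char ra_hop_limit)
{
	if (min_hop_limit >= 256 || !ra_hop_limit)
		return 0;		/* processing disabled, or RA carries no value */
	return min_hop_limit <= ra_hop_limit;
}

int main(void)
{
	printf("%d\n", ra_hop_limit_accepted(1, 64));	/* 1: applied         */
	printf("%d\n", ra_hop_limit_accepted(100, 64));	/* 0: below the floor */
	printf("%d\n", ra_hop_limit_accepted(256, 64));	/* 0: disabled        */
	return 0;
}
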
diff --git a/kernel/net/ipv6/netfilter.c b/kernel/net/ipv6/netfilter.c
index d958718b5..d11c46833 100644
--- a/kernel/net/ipv6/netfilter.c
+++ b/kernel/net/ipv6/netfilter.c
@@ -18,9 +18,8 @@
#include <net/ip6_checksum.h>
#include <net/netfilter/nf_queue.h>
-int ip6_route_me_harder(struct sk_buff *skb)
+int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
const struct ipv6hdr *iph = ipv6_hdr(skb);
unsigned int hh_len;
struct dst_entry *dst;
@@ -93,7 +92,7 @@ static void nf_ip6_saveroute(const struct sk_buff *skb,
}
}
-static int nf_ip6_reroute(struct sk_buff *skb,
+static int nf_ip6_reroute(struct net *net, struct sk_buff *skb,
const struct nf_queue_entry *entry)
{
struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
@@ -103,7 +102,7 @@ static int nf_ip6_reroute(struct sk_buff *skb,
if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
!ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
skb->mark != rt_info->mark)
- return ip6_route_me_harder(skb);
+ return ip6_route_me_harder(net, skb);
}
return 0;
}
@@ -191,6 +190,8 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
static const struct nf_ipv6_ops ipv6ops = {
.chk_addr = ipv6_chk_addr,
+ .route_input = ip6_route_input,
+ .fragment = ip6_fragment
};
static const struct nf_afinfo nf_ip6_afinfo = {
diff --git a/kernel/net/ipv6/netfilter/Kconfig b/kernel/net/ipv6/netfilter/Kconfig
index ca6998345..e10a04c9c 100644
--- a/kernel/net/ipv6/netfilter/Kconfig
+++ b/kernel/net/ipv6/netfilter/Kconfig
@@ -47,9 +47,23 @@ config NFT_REJECT_IPV6
default NFT_REJECT
tristate
+config NFT_DUP_IPV6
+ tristate "IPv6 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ select NF_DUP_IPV6
+ help
+ This module enables IPv6 packet duplication support for nf_tables.
+
endif # NF_TABLES_IPV6
endif # NF_TABLES
+config NF_DUP_IPV6
+ tristate "Netfilter IPv6 packet duplication to alternate destination"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ help
+ This option enables the nf_dup_ipv6 core, which duplicates an IPv6
+ packet to be rerouted to another destination.
+
config NF_REJECT_IPV6
tristate "IPv6 packet rejection"
default m if NETFILTER_ADVANCED=n
@@ -186,7 +200,8 @@ config IP6_NF_MATCH_MH
config IP6_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
- depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW)
+ depends on NETFILTER_ADVANCED
+ depends on IP6_NF_MANGLE || IP6_NF_RAW
---help---
This option allows you to match packets whose replies would
go out via the interface the packet came in.
diff --git a/kernel/net/ipv6/netfilter/Makefile b/kernel/net/ipv6/netfilter/Makefile
index c36e0a549..b4f7d0b4e 100644
--- a/kernel/net/ipv6/netfilter/Makefile
+++ b/kernel/net/ipv6/netfilter/Makefile
@@ -30,6 +30,8 @@ obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
# reject
obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o
+obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o
+
# nf_tables
obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
@@ -37,6 +39,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o
obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o
obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o
+obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
# matches
obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/kernel/net/ipv6/netfilter/ip6_tables.c b/kernel/net/ipv6/netfilter/ip6_tables.c
index 62f5b0d0b..99425cf28 100644
--- a/kernel/net/ipv6/netfilter/ip6_tables.c
+++ b/kernel/net/ipv6/netfilter/ip6_tables.c
@@ -117,7 +117,7 @@ ip6_packet_match(const struct sk_buff *skb,
if (FWINV(ret != 0, IP6T_INV_VIA_IN)) {
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, ip6info->iniface,
- ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":"");
+ ip6info->invflags & IP6T_INV_VIA_IN ? " (INV)" : "");
return false;
}
@@ -126,14 +126,14 @@ ip6_packet_match(const struct sk_buff *skb,
if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) {
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, ip6info->outiface,
- ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":"");
+ ip6info->invflags & IP6T_INV_VIA_OUT ? " (INV)" : "");
return false;
}
/* ... might want to do something with class and flowlabel here ... */
/* look for the desired protocol header */
- if((ip6info->flags & IP6T_F_PROTO)) {
+ if (ip6info->flags & IP6T_F_PROTO) {
int protohdr;
unsigned short _frag_off;
@@ -151,9 +151,9 @@ ip6_packet_match(const struct sk_buff *skb,
ip6info->proto);
if (ip6info->proto == protohdr) {
- if(ip6info->invflags & IP6T_INV_PROTO) {
+ if (ip6info->invflags & IP6T_INV_PROTO)
return false;
- }
+
return true;
}
@@ -275,7 +275,8 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
return 0;
}
-static void trace_packet(const struct sk_buff *skb,
+static void trace_packet(struct net *net,
+ const struct sk_buff *skb,
unsigned int hook,
const struct net_device *in,
const struct net_device *out,
@@ -283,15 +284,12 @@ static void trace_packet(const struct sk_buff *skb,
const struct xt_table_info *private,
const struct ip6t_entry *e)
{
- const void *table_base;
const struct ip6t_entry *root;
const char *hookname, *chainname, *comment;
const struct ip6t_entry *iter;
unsigned int rulenum = 0;
- struct net *net = dev_net(in ? in : out);
- table_base = private->entries[smp_processor_id()];
- root = get_entry(table_base, private->hook_entry[hook]);
+ root = get_entry(private->entries, private->hook_entry[hook]);
hookname = chainname = hooknames[hook];
comment = comments[NF_IP6_TRACE_COMMENT_RULE];
@@ -307,7 +305,7 @@ static void trace_packet(const struct sk_buff *skb,
}
#endif
-static inline __pure struct ip6t_entry *
+static inline struct ip6t_entry *
ip6t_next_entry(const struct ip6t_entry *entry)
{
return (void *)entry + entry->next_offset;
@@ -316,22 +314,23 @@ ip6t_next_entry(const struct ip6t_entry *entry)
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
ip6t_do_table(struct sk_buff *skb,
- unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table)
{
+ unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
const void *table_base;
struct ip6t_entry *e, **jumpstack;
- unsigned int *stackptr, origptr, cpu;
+ unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
/* Initialization */
+ stackidx = 0;
indev = state->in ? state->in->name : nulldevname;
outdev = state->out ? state->out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
@@ -341,6 +340,7 @@ ip6t_do_table(struct sk_buff *skb,
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
acpar.hotdrop = false;
+ acpar.net = state->net;
acpar.in = state->in;
acpar.out = state->out;
acpar.family = NFPROTO_IPV6;
@@ -357,16 +357,25 @@ ip6t_do_table(struct sk_buff *skb,
*/
smp_read_barrier_depends();
cpu = smp_processor_id();
- table_base = private->entries[cpu];
+ table_base = private->entries;
jumpstack = (struct ip6t_entry **)private->jumpstack[cpu];
- stackptr = per_cpu_ptr(private->stackptr, cpu);
- origptr = *stackptr;
+
+ /* Switch to alternate jumpstack if we're being invoked via TEE.
+ * TEE issues XT_CONTINUE verdict on original skb so we must not
+ * clobber the jumpstack.
+ *
+ * For recursion via REJECT or SYNPROXY the stack will be clobbered
+ * but it is no problem since absolute verdict is issued by these.
+ */
+ if (static_key_false(&xt_tee_enabled))
+ jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
e = get_entry(table_base, private->hook_entry[hook]);
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
+ struct xt_counters *counter;
IP_NF_ASSERT(e);
acpar.thoff = 0;
@@ -384,7 +393,8 @@ ip6t_do_table(struct sk_buff *skb,
goto no_match;
}
- ADD_COUNTER(e->counters, skb->len, 1);
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, skb->len, 1);
t = ip6t_get_target_c(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -392,8 +402,8 @@ ip6t_do_table(struct sk_buff *skb,
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
- trace_packet(skb, hook, state->in, state->out,
- table->name, private, e);
+ trace_packet(state->net, skb, hook, state->in,
+ state->out, table->name, private, e);
#endif
/* Standard target? */
if (!t->u.kernel.target->target) {
@@ -406,20 +416,16 @@ ip6t_do_table(struct sk_buff *skb,
verdict = (unsigned int)(-v) - 1;
break;
}
- if (*stackptr <= origptr)
+ if (stackidx == 0)
e = get_entry(table_base,
private->underflow[hook]);
else
- e = ip6t_next_entry(jumpstack[--*stackptr]);
+ e = ip6t_next_entry(jumpstack[--stackidx]);
continue;
}
if (table_base + v != ip6t_next_entry(e) &&
!(e->ipv6.flags & IP6T_F_GOTO)) {
- if (*stackptr >= private->stacksize) {
- verdict = NF_DROP;
- break;
- }
- jumpstack[(*stackptr)++] = e;
+ jumpstack[stackidx++] = e;
}
e = get_entry(table_base, v);
@@ -437,10 +443,8 @@ ip6t_do_table(struct sk_buff *skb,
break;
} while (!acpar.hotdrop);
- *stackptr = origptr;
-
- xt_write_recseq_end(addend);
- local_bh_enable();
+ xt_write_recseq_end(addend);
+ local_bh_enable();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -557,7 +561,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
- next:
+next:
duprintf("Finished chain %u\n", hook);
}
return 1;
@@ -679,6 +683,10 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
if (ret)
return ret;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
j = 0;
mtpar.net = net;
mtpar.table = name;
@@ -714,6 +722,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
break;
cleanup_match(ematch, net);
}
+
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -797,13 +808,15 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
+
+ xt_percpu_counter_free(e->counters.pcnt);
}
/* Checks and translates the user-supplied table segment (held in
newinfo) */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
- const struct ip6t_replace *repl)
+ const struct ip6t_replace *repl)
{
struct ip6t_entry *iter;
unsigned int i;
@@ -879,12 +892,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
- }
-
return ret;
}
@@ -900,14 +907,16 @@ get_counters(const struct xt_table_info *t,
seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
u64 bcnt, pcnt;
unsigned int start;
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -952,11 +961,7 @@ copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
@@ -1064,16 +1069,16 @@ static int compat_table_info(const struct xt_table_info *info,
struct xt_table_info *newinfo)
{
struct ip6t_entry *iter;
- void *loc_cpu_entry;
+ const void *loc_cpu_entry;
int ret;
if (!newinfo || !info)
return -EINVAL;
- /* we dont care about newinfo->entries[] */
+ /* we dont care about newinfo->entries */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries;
xt_compat_init_offsets(AF_INET6, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1085,7 +1090,7 @@ static int compat_table_info(const struct xt_table_info *info,
#endif
static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+ const int *len, int compat)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1147,7 +1152,7 @@ static int get_info(struct net *net, void __user *user,
static int
get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
- const int *len)
+ const int *len)
{
int ret;
struct ip6t_get_entries get;
@@ -1194,7 +1199,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct xt_table *t;
struct xt_table_info *oldinfo;
struct xt_counters *counters;
- const void *loc_cpu_old_entry;
struct ip6t_entry *iter;
ret = 0;
@@ -1237,8 +1241,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
cleanup_entry(iter, net);
xt_free_table_info(oldinfo);
@@ -1284,8 +1287,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1316,7 +1318,7 @@ static int
do_add_counters(struct net *net, const void __user *user, unsigned int len,
int compat)
{
- unsigned int i, curcpu;
+ unsigned int i;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1326,7 +1328,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- const void *loc_cpu_entry;
struct ip6t_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
@@ -1374,7 +1375,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
goto free;
}
-
local_bh_disable();
private = t->private;
if (private->number != num_counters) {
@@ -1383,16 +1383,15 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
}
i = 0;
- /* Choose the copy that is on our node */
- curcpu = smp_processor_id();
addend = xt_write_recseq_begin();
- loc_cpu_entry = private->entries[curcpu];
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
++i;
}
xt_write_recseq_end(addend);
-
unlock_up_free:
local_bh_enable();
xt_table_unlock(t);
@@ -1459,7 +1458,6 @@ static int
compat_find_calc_match(struct xt_entry_match *m,
const char *name,
const struct ip6t_ip6 *ipv6,
- unsigned int hookmask,
int *size)
{
struct xt_match *match;
@@ -1528,8 +1526,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
entry_offset = (void *)e - (void *)base;
j = 0;
xt_ematch_foreach(ematch, e) {
- ret = compat_find_calc_match(ematch, name,
- &e->ipv6, e->comefrom, &off);
+ ret = compat_find_calc_match(ematch, name, &e->ipv6, &off);
if (ret != 0)
goto release_matches;
++j;
@@ -1623,6 +1620,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net,
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
j = 0;
mtpar.net = net;
mtpar.table = name;
@@ -1647,6 +1647,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net,
break;
cleanup_match(ematch, net);
}
+
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -1731,7 +1734,7 @@ translate_compat_table(struct net *net,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries;
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1783,11 +1786,6 @@ translate_compat_table(struct net *net,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
- if (newinfo->entries[i] && newinfo->entries[i] != entry1)
- memcpy(newinfo->entries[i], entry1, newinfo->size);
-
*pinfo = newinfo;
*pentry0 = entry1;
xt_free_table_info(info);
@@ -1834,8 +1832,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1906,7 +1903,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
void __user *pos;
unsigned int size;
int ret = 0;
- const void *loc_cpu_entry;
unsigned int i = 0;
struct ip6t_entry *iter;
@@ -1914,14 +1910,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ xt_entry_foreach(iter, private->entries, total_size) {
ret = compat_copy_entry_to_user(iter, &pos,
&size, counters, i++);
if (ret != 0)
@@ -2096,8 +2087,7 @@ struct xt_table *ip6t_register_table(struct net *net,
goto out;
}
- /* choose the copy on our node/cpu, but dont care about preemption */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2127,7 +2117,7 @@ void ip6t_unregister_table(struct net *net, struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter, net);
if (private->number > private->initial_entries)
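[editor's note] The ip6_tables hunks above track the xtables rework: the per-CPU copies of the ruleset (entries[raw_smp_processor_id()]) are gone, every CPU walks the single newinfo->entries blob, and only the packet/byte counters remain per CPU via xt_percpu_counter_alloc()/xt_get_this_cpu_counter(). A rough user-space sketch of that counter layout follows; it is illustrative only, the names are made up, and none of it is kernel code.

/* Illustrative sketch of per-CPU counters: each CPU slot is written only by
 * its owner, and a reader folds all slots into one total, which is roughly
 * what get_counters()/do_add_counters() do with the real per-cpu counters. */
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

struct pcpu_counter {
	uint64_t pcnt;	/* packets */
	uint64_t bcnt;	/* bytes */
};

static struct pcpu_counter counters[NR_CPUS];

static void count_packet(int cpu, uint64_t bytes)
{
	counters[cpu].pcnt++;		/* only this "CPU" touches its slot */
	counters[cpu].bcnt += bytes;
}

static struct pcpu_counter sum_counters(void)
{
	struct pcpu_counter sum = { 0, 0 };

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {	/* reader folds all slots */
		sum.pcnt += counters[cpu].pcnt;
		sum.bcnt += counters[cpu].bcnt;
	}
	return sum;
}

int main(void)
{
	count_packet(0, 1500);
	count_packet(1, 40);
	struct pcpu_counter total = sum_counters();
	printf("packets=%llu bytes=%llu\n",
	       (unsigned long long)total.pcnt, (unsigned long long)total.bcnt);
	return 0;
}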
diff --git a/kernel/net/ipv6/netfilter/ip6t_REJECT.c b/kernel/net/ipv6/netfilter/ip6t_REJECT.c
index 12331efd4..db29bbf41 100644
--- a/kernel/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/kernel/net/ipv6/netfilter/ip6t_REJECT.c
@@ -35,14 +35,12 @@ MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6");
MODULE_LICENSE("GPL");
-
static unsigned int
reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_reject_info *reject = par->targinfo;
- struct net *net = dev_net((par->in != NULL) ? par->in : par->out);
+ struct net *net = par->net;
- pr_debug("%s: medium point\n", __func__);
switch (reject->with) {
case IP6T_ICMP6_NO_ROUTE:
nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum);
@@ -65,8 +63,11 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
case IP6T_TCP_RESET:
nf_send_reset6(net, skb, par->hooknum);
break;
- default:
- net_info_ratelimited("case %u not handled yet\n", reject->with);
+ case IP6T_ICMP6_POLICY_FAIL:
+ nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, par->hooknum);
+ break;
+ case IP6T_ICMP6_REJECT_ROUTE:
+ nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, par->hooknum);
break;
}
diff --git a/kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c b/kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 6edb7b106..3deed5860 100644
--- a/kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/kernel/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -37,12 +37,13 @@ synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr,
}
static void
-synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+synproxy_send_tcp(const struct synproxy_net *snet,
+ const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct ipv6hdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
- struct net *net = nf_ct_net((struct nf_conn *)nfct);
+ struct net *net = nf_ct_net(snet->tmpl);
struct dst_entry *dst;
struct flowi6 fl6;
@@ -75,7 +76,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
nf_conntrack_get(nfct);
}
- ip6_local_out(nskb);
+ ip6_local_out(net, nskb->sk, nskb);
return;
free_nskb:
@@ -83,7 +84,8 @@ free_nskb:
}
static void
-synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+synproxy_send_client_synack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
@@ -119,7 +121,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
@@ -163,7 +165,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
niph, nth, tcp_hdr_size);
}
@@ -203,7 +205,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
static void
@@ -235,13 +237,14 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
nth->ack_seq = th->ack_seq;
tcp_flag_word(nth) = TCP_FLAG_ACK;
nth->doff = tcp_hdr_size / 4;
- nth->window = ntohs(htons(th->window) >> opts->wscale);
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
}
static bool
@@ -272,7 +275,7 @@ static unsigned int
synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+ struct synproxy_net *snet = synproxy_pernet(par->net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
@@ -301,7 +304,7 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(skb, th, &opts);
+ synproxy_send_client_synack(snet, skb, th, &opts);
return NF_DROP;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
@@ -313,11 +316,11 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops,
+static unsigned int ipv6_synproxy_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *nhs)
{
- struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out));
+ struct synproxy_net *snet = synproxy_pernet(nhs->net);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
@@ -455,14 +458,12 @@ static struct xt_target synproxy_tg6_reg __read_mostly = {
static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = {
{
.hook = ipv6_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
},
{
.hook = ipv6_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
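[editor's note] One detail in the SYNPROXY hunks: synproxy_send_client_ack() now builds the window as htons(ntohs(th->window) >> opts->wscale), converting the wire-order field to host order before applying the window-scale shift and converting back, which keeps the conversions consistent with the __be16 field. A stand-alone sketch of that computation, with made-up values, that compiles in user space:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t th_window = htons(8192);	/* window as carried in the TCP header */
	unsigned int wscale = 7;		/* negotiated window scale */

	/* Convert from network order, apply the scale shift, convert back. */
	uint16_t nth_window = htons(ntohs(th_window) >> wscale);

	printf("raw=%u scaled=%u\n", ntohs(th_window), ntohs(nth_window));
	return 0;	/* prints: raw=8192 scaled=64 */
}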
diff --git a/kernel/net/ipv6/netfilter/ip6t_rpfilter.c b/kernel/net/ipv6/netfilter/ip6t_rpfilter.c
index 790e0c6b1..1ee1b25df 100644
--- a/kernel/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/kernel/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -26,7 +26,7 @@ static bool rpfilter_addr_unicast(const struct in6_addr *addr)
return addr_type & IPV6_ADDR_UNICAST;
}
-static bool rpfilter_lookup_reverse6(const struct sk_buff *skb,
+static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
const struct net_device *dev, u8 flags)
{
struct rt6_info *rt;
@@ -53,7 +53,7 @@ static bool rpfilter_lookup_reverse6(const struct sk_buff *skb,
lookup_flags |= RT6_LOOKUP_F_IFACE;
}
- rt = (void *) ip6_route_lookup(dev_net(dev), &fl6, lookup_flags);
+ rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags);
if (rt->dst.error)
goto out;
@@ -93,7 +93,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (unlikely(saddrtype == IPV6_ADDR_ANY))
return true ^ invert; /* not routable: forward path will drop it */
- return rpfilter_lookup_reverse6(skb, par->in, info->flags) ^ invert;
+ return rpfilter_lookup_reverse6(par->net, skb, par->in, info->flags) ^ invert;
}
static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/kernel/net/ipv6/netfilter/ip6table_filter.c b/kernel/net/ipv6/netfilter/ip6table_filter.c
index 5c33d8abc..8b277b983 100644
--- a/kernel/net/ipv6/netfilter/ip6table_filter.c
+++ b/kernel/net/ipv6/netfilter/ip6table_filter.c
@@ -32,12 +32,10 @@ static const struct xt_table packet_filter = {
/* The work comes in here from netfilter.c. */
static unsigned int
-ip6table_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip6table_filter_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net = dev_net(state->in ? state->in : state->out);
-
- return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_filter);
+ return ip6t_do_table(skb, state, state->net->ipv6.ip6table_filter);
}
static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/kernel/net/ipv6/netfilter/ip6table_mangle.c b/kernel/net/ipv6/netfilter/ip6table_mangle.c
index b551f5b79..abe278b07 100644
--- a/kernel/net/ipv6/netfilter/ip6table_mangle.c
+++ b/kernel/net/ipv6/netfilter/ip6table_mangle.c
@@ -57,8 +57,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
/* flowlabel and prio (includes version, which shouldn't change either */
flowlabel = *((u_int32_t *)ipv6_hdr(skb));
- ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, state,
- dev_net(state->out)->ipv6.ip6table_mangle);
+ ret = ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle);
if (ret != NF_DROP && ret != NF_STOLEN &&
(!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) ||
@@ -66,7 +65,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
- err = ip6_route_me_harder(skb);
+ err = ip6_route_me_harder(state->net, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -76,17 +75,16 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
/* The work comes in here from netfilter.c. */
static unsigned int
-ip6table_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip6table_mangle_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (ops->hooknum == NF_INET_LOCAL_OUT)
+ if (state->hook == NF_INET_LOCAL_OUT)
return ip6t_mangle_out(skb, state);
- if (ops->hooknum == NF_INET_POST_ROUTING)
- return ip6t_do_table(skb, ops->hooknum, state,
- dev_net(state->out)->ipv6.ip6table_mangle);
+ if (state->hook == NF_INET_POST_ROUTING)
+ return ip6t_do_table(skb, state,
+ state->net->ipv6.ip6table_mangle);
/* INPUT/FORWARD */
- return ip6t_do_table(skb, ops->hooknum, state,
- dev_net(state->in)->ipv6.ip6table_mangle);
+ return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle);
}
static struct nf_hook_ops *mangle_ops __read_mostly;
diff --git a/kernel/net/ipv6/netfilter/ip6table_nat.c b/kernel/net/ipv6/netfilter/ip6table_nat.c
index c3a7f7af0..de2a10a56 100644
--- a/kernel/net/ipv6/netfilter/ip6table_nat.c
+++ b/kernel/net/ipv6/netfilter/ip6table_nat.c
@@ -30,49 +30,46 @@ static const struct xt_table nf_nat_ipv6_table = {
.af = NFPROTO_IPV6,
};
-static unsigned int ip6table_nat_do_chain(const struct nf_hook_ops *ops,
+static unsigned int ip6table_nat_do_chain(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct)
{
- struct net *net = nf_ct_net(ct);
-
- return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_nat);
+ return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat);
}
-static unsigned int ip6table_nat_fn(const struct nf_hook_ops *ops,
+static unsigned int ip6table_nat_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv6_fn(ops, skb, state, ip6table_nat_do_chain);
+ return nf_nat_ipv6_fn(priv, skb, state, ip6table_nat_do_chain);
}
-static unsigned int ip6table_nat_in(const struct nf_hook_ops *ops,
+static unsigned int ip6table_nat_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv6_in(ops, skb, state, ip6table_nat_do_chain);
+ return nf_nat_ipv6_in(priv, skb, state, ip6table_nat_do_chain);
}
-static unsigned int ip6table_nat_out(const struct nf_hook_ops *ops,
+static unsigned int ip6table_nat_out(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv6_out(ops, skb, state, ip6table_nat_do_chain);
+ return nf_nat_ipv6_out(priv, skb, state, ip6table_nat_do_chain);
}
-static unsigned int ip6table_nat_local_fn(const struct nf_hook_ops *ops,
+static unsigned int ip6table_nat_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv6_local_fn(ops, skb, state, ip6table_nat_do_chain);
+ return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain);
}
static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = ip6table_nat_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP6_PRI_NAT_DST,
@@ -80,7 +77,6 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = ip6table_nat_out,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP6_PRI_NAT_SRC,
@@ -88,7 +84,6 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = ip6table_nat_local_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST,
@@ -96,7 +91,6 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = ip6table_nat_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC,
diff --git a/kernel/net/ipv6/netfilter/ip6table_raw.c b/kernel/net/ipv6/netfilter/ip6table_raw.c
index 0b33caad2..902196356 100644
--- a/kernel/net/ipv6/netfilter/ip6table_raw.c
+++ b/kernel/net/ipv6/netfilter/ip6table_raw.c
@@ -19,12 +19,10 @@ static const struct xt_table packet_raw = {
/* The work comes in here from netfilter.c. */
static unsigned int
-ip6table_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip6table_raw_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net = dev_net(state->in ? state->in : state->out);
-
- return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_raw);
+ return ip6t_do_table(skb, state, state->net->ipv6.ip6table_raw);
}
static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/kernel/net/ipv6/netfilter/ip6table_security.c b/kernel/net/ipv6/netfilter/ip6table_security.c
index fcef83c25..0d856fedf 100644
--- a/kernel/net/ipv6/netfilter/ip6table_security.c
+++ b/kernel/net/ipv6/netfilter/ip6table_security.c
@@ -36,13 +36,10 @@ static const struct xt_table security_table = {
};
static unsigned int
-ip6table_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip6table_security_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net = dev_net(state->in ? state->in : state->out);
-
- return ip6t_do_table(skb, ops->hooknum, state,
- net->ipv6.ip6table_security);
+ return ip6t_do_table(skb, state, state->net->ipv6.ip6table_security);
}
static struct nf_hook_ops *sectbl_ops __read_mostly;
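[editor's note] All of the ip6table_* hooks above converge on the same 4.4 calling convention: hooks receive void *priv instead of the nf_hook_ops pointer, the namespace and hook number come from the nf_hook_state, and .owner disappears from struct nf_hook_ops. A condensed sketch of that shape; the _example names are hypothetical, only the signatures and the ip6t_do_table() call mirror the real code:

static unsigned int
ip6table_example_hook(void *priv, struct sk_buff *skb,
		      const struct nf_hook_state *state)
{
	/* The hook state carries the namespace directly, so there is no
	 * more dev_net(state->in ? state->in : state->out) dance. */
	return ip6t_do_table(skb, state, state->net->ipv6.ip6table_filter);
}

static struct nf_hook_ops example_ops[] __read_mostly = {
	{
		.hook		= ip6table_example_hook,
		/* .owner = THIS_MODULE is gone; the core pins the module. */
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP6_PRI_FILTER,
	},
};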
diff --git a/kernel/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/kernel/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 4ba0c34c6..1aa584876 100644
--- a/kernel/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/kernel/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -95,7 +95,7 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
return NF_ACCEPT;
}
-static unsigned int ipv6_helper(const struct nf_hook_ops *ops,
+static unsigned int ipv6_helper(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -131,7 +131,7 @@ static unsigned int ipv6_helper(const struct nf_hook_ops *ops,
return helper->help(skb, protoff, ct, ctinfo);
}
-static unsigned int ipv6_confirm(const struct nf_hook_ops *ops,
+static unsigned int ipv6_confirm(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -165,14 +165,14 @@ out:
return nf_conntrack_confirm(skb);
}
-static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops,
+static unsigned int ipv6_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_conntrack_in(dev_net(state->in), PF_INET6, ops->hooknum, skb);
+ return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
}
-static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops,
+static unsigned int ipv6_conntrack_local(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -181,48 +181,42 @@ static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops,
net_notice_ratelimited("ipv6_conntrack_local: packet too short\n");
return NF_ACCEPT;
}
- return nf_conntrack_in(dev_net(state->out), PF_INET6, ops->hooknum, skb);
+ return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
}
static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
{
.hook = ipv6_conntrack_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP6_PRI_CONNTRACK,
},
{
.hook = ipv6_conntrack_local,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_CONNTRACK,
},
{
.hook = ipv6_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP6_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv6_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP6_PRI_LAST,
},
{
.hook = ipv6_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv6_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_LAST-1,
@@ -251,7 +245,7 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
if (*len < 0 || (unsigned int) *len < sizeof(sin6))
return -EINVAL;
- h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
+ h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
if (!h) {
pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
&tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
diff --git a/kernel/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/kernel/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 90388d606..660bc10c7 100644
--- a/kernel/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/kernel/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -36,6 +36,7 @@ static inline struct nf_icmp_net *icmpv6_pernet(struct net *net)
static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
+ struct net *net,
struct nf_conntrack_tuple *tuple)
{
const struct icmp6hdr *hp;
@@ -56,12 +57,12 @@ static const u_int8_t invmap[] = {
[ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1,
[ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1,
[ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1,
- [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY +1
+ [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1
};
static const u_int8_t noct_valid_new[] = {
[ICMPV6_MGM_QUERY - 130] = 1,
- [ICMPV6_MGM_REPORT -130] = 1,
+ [ICMPV6_MGM_REPORT - 130] = 1,
[ICMPV6_MGM_REDUCTION - 130] = 1,
[NDISC_ROUTER_SOLICITATION - 130] = 1,
[NDISC_ROUTER_ADVERTISEMENT - 130] = 1,
@@ -150,7 +151,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_tuple intuple, origtuple;
const struct nf_conntrack_tuple_hash *h;
const struct nf_conntrack_l4proto *inproto;
- u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ struct nf_conntrack_zone tmp;
NF_CT_ASSERT(skb->nfct == NULL);
@@ -159,7 +160,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
skb_network_offset(skb)
+ sizeof(struct ipv6hdr)
+ sizeof(struct icmp6hdr),
- PF_INET6, &origtuple)) {
+ PF_INET6, net, &origtuple)) {
pr_debug("icmpv6_error: Can't get tuple\n");
return -NF_ACCEPT;
}
@@ -177,7 +178,8 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
*ctinfo = IP_CT_RELATED;
- h = nf_conntrack_find_get(net, zone, &intuple);
+ h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
+ &intuple);
if (!h) {
pr_debug("icmpv6_error: no match\n");
return -NF_ACCEPT;
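[editor's note] The conntrack zone here changes from a bare u16 (NF_CT_DEFAULT_ZONE) to a struct nf_conntrack_zone that can differ per direction, resolved from the template via nf_ct_zone_tmpl(). A minimal sketch of a lookup in the new style; example_lookup is a hypothetical helper, the called functions are the ones used in the hunk above:

static struct nf_conn *example_lookup(struct net *net, struct nf_conn *tmpl,
				      struct sk_buff *skb,
				      const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_zone tmp;
	const struct nf_conntrack_tuple_hash *h;

	/* The zone argument is now a structure pointer, not a u16. */
	h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), tuple);
	if (!h)
		return NULL;
	return nf_ct_tuplehash_to_ctrack(h);
}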
diff --git a/kernel/net/ipv6/netfilter/nf_conntrack_reasm.c b/kernel/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6f187c8d8..bab4441ed 100644
--- a/kernel/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/kernel/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -59,7 +59,7 @@ struct nf_ct_frag6_skb_cb
struct sk_buff *orig;
};
-#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb))
+#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb *)((skb)->cb))
static struct inet_frags nf_frags;
@@ -190,7 +190,7 @@ static void nf_ct_frag6_expire(unsigned long data)
/* Creation primitives. */
static inline struct frag_queue *fq_find(struct net *net, __be32 id,
u32 user, struct in6_addr *src,
- struct in6_addr *dst, u8 ecn)
+ struct in6_addr *dst, int iif, u8 ecn)
{
struct inet_frag_queue *q;
struct ip6_create_arg arg;
@@ -200,6 +200,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id,
arg.user = user;
arg.src = src;
arg.dst = dst;
+ arg.iif = iif;
arg.ecn = ecn;
local_bh_disable();
@@ -348,7 +349,7 @@ found:
fq->ecn |= ecn;
if (payload_len > fq->q.max_size)
fq->q.max_size = payload_len;
- add_frag_mem_limit(&fq->q, skb->truesize);
+ add_frag_mem_limit(fq->q.net, skb->truesize);
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
@@ -430,7 +431,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
clone->ip_summed = head->ip_summed;
NFCT_FRAG6_CB(clone)->orig = NULL;
- add_frag_mem_limit(&fq->q, clone->truesize);
+ add_frag_mem_limit(fq->q.net, clone->truesize);
}
/* We have to remove fragment header from datagram and to relocate
@@ -445,7 +446,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
skb_reset_transport_header(head);
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
+ for (fp = head->next; fp; fp = fp->next) {
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
@@ -454,7 +455,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
}
- sub_frag_mem_limit(&fq->q, head->truesize);
+ sub_frag_mem_limit(fq->q.net, head->truesize);
head->ignore_df = 1;
head->next = NULL;
@@ -563,12 +564,10 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
return 0;
}
-struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
+struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
{
struct sk_buff *clone;
struct net_device *dev = skb->dev;
- struct net *net = skb_dst(skb) ? dev_net(skb_dst(skb)->dev)
- : dev_net(skb->dev);
struct frag_hdr *fhdr;
struct frag_queue *fq;
struct ipv6hdr *hdr;
@@ -603,7 +602,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
fhdr = (struct frag_hdr *)skb_transport_header(clone);
fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
- ip6_frag_ecn(hdr));
+ skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
if (fq == NULL) {
pr_debug("Can't find and can't create new queue\n");
goto ret_orig;
@@ -633,6 +632,7 @@ ret_orig:
kfree_skb(clone);
return skb;
}
+EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
void nf_ct_frag6_consume_orig(struct sk_buff *skb)
{
@@ -645,15 +645,22 @@ void nf_ct_frag6_consume_orig(struct sk_buff *skb)
s = s2;
}
}
+EXPORT_SYMBOL_GPL(nf_ct_frag6_consume_orig);
static int nf_ct_net_init(struct net *net)
{
+ int res;
+
net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
- inet_frags_init_net(&net->nf_frag.frags);
-
- return nf_ct_frag6_sysctl_register(net);
+ res = inet_frags_init_net(&net->nf_frag.frags);
+ if (res)
+ return res;
+ res = nf_ct_frag6_sysctl_register(net);
+ if (res)
+ inet_frags_uninit_net(&net->nf_frag.frags);
+ return res;
}
static void nf_ct_net_exit(struct net *net)
diff --git a/kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index a45db0b47..4fdbed5eb 100644
--- a/kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/kernel/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -33,26 +33,25 @@
static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
struct sk_buff *skb)
{
- u16 zone = NF_CT_DEFAULT_ZONE;
-
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- if (skb->nfct)
- zone = nf_ct_zone((struct nf_conn *)skb->nfct);
-#endif
+ if (skb->nfct) {
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge &&
- skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
- return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+ }
#endif
+ if (nf_bridge_in_prerouting(skb))
+ return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;
+
if (hooknum == NF_INET_PRE_ROUTING)
- return IP6_DEFRAG_CONNTRACK_IN + zone;
+ return IP6_DEFRAG_CONNTRACK_IN + zone_id;
else
- return IP6_DEFRAG_CONNTRACK_OUT + zone;
-
+ return IP6_DEFRAG_CONNTRACK_OUT + zone_id;
}
-static unsigned int ipv6_defrag(const struct nf_hook_ops *ops,
+static unsigned int ipv6_defrag(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -64,7 +63,8 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops,
return NF_ACCEPT;
#endif
- reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(ops->hooknum, skb));
+ reasm = nf_ct_frag6_gather(state->net, skb,
+ nf_ct6_defrag_user(state->hook, skb));
/* queued */
if (reasm == NULL)
return NF_STOLEN;
@@ -75,7 +75,7 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops,
nf_ct_frag6_consume_orig(reasm);
- NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, state->sk, reasm,
+ NF_HOOK_THRESH(NFPROTO_IPV6, state->hook, state->net, state->sk, reasm,
state->in, state->out,
state->okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
@@ -85,14 +85,12 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops,
static struct nf_hook_ops ipv6_defrag_ops[] = {
{
.hook = ipv6_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv6_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
diff --git a/kernel/net/ipv6/netfilter/nf_dup_ipv6.c b/kernel/net/ipv6/netfilter/nf_dup_ipv6.c
new file mode 100644
index 000000000..6989c70ae
--- /dev/null
+++ b/kernel/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -0,0 +1,82 @@
+/*
+ * (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
+ * (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Extracted from xt_TEE.c
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 or later, as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/netfilter/ipv6/nf_dup_ipv6.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb,
+ const struct in6_addr *gw, int oif)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ if (oif != -1)
+ fl6.flowi6_oif = oif;
+
+ fl6.daddr = *gw;
+ fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) |
+ (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]);
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ dst_release(dst);
+ return false;
+ }
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ skb->dev = dst->dev;
+ skb->protocol = htons(ETH_P_IPV6);
+
+ return true;
+}
+
+void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
+ const struct in6_addr *gw, int oif)
+{
+ if (this_cpu_read(nf_skb_duplicated))
+ return;
+ skb = pskb_copy(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get(skb->nfct);
+#endif
+ if (hooknum == NF_INET_PRE_ROUTING ||
+ hooknum == NF_INET_LOCAL_IN) {
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ --iph->hop_limit;
+ }
+ if (nf_dup_ipv6_route(net, skb, gw, oif)) {
+ __this_cpu_write(nf_skb_duplicated, true);
+ ip6_local_out(net, skb->sk, skb);
+ __this_cpu_write(nf_skb_duplicated, false);
+ } else {
+ kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_dup_ipv6);
+
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("nf_dup_ipv6: IPv6 packet duplication");
+MODULE_LICENSE("GPL");
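[editor's note] nf_dup_ipv6() is the helper split out of xt_TEE: it clones the skb, marks the copy untracked, and re-routes it towards the given gateway before ip6_local_out(). A hedged sketch of a caller; example_tee_tg6 and the loopback gateway are placeholders, only nf_dup_ipv6()'s signature is taken from the new file above (nft_dup_ipv6, added later in this patch, calls it the same way):

/* Hypothetical xtables target, mirroring what xt_TEE does with the helper. */
static unsigned int
example_tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
	const struct in6_addr gw = IN6ADDR_LOOPBACK_INIT;	/* placeholder gateway */

	/* Duplicate the packet and route the copy towards gw on any interface. */
	nf_dup_ipv6(par->net, skb, par->hooknum, &gw, -1);
	return XT_CONTINUE;
}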
diff --git a/kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index e76900e0a..238e70c3f 100644
--- a/kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/kernel/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -124,7 +124,7 @@ static void nf_nat_ipv6_csum_update(struct sk_buff *skb,
newip = &t->dst.u3.in6;
}
inet_proto_csum_replace16(check, skb, oldip->s6_addr32,
- newip->s6_addr32, 1);
+ newip->s6_addr32, true);
}
static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
@@ -155,7 +155,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
}
} else
inet_proto_csum_replace2(check, skb,
- htons(oldlen), htons(datalen), 1);
+ htons(oldlen), htons(datalen), true);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -262,9 +262,9 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation);
unsigned int
-nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -272,7 +272,7 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
- enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
__be16 frag_off;
int hdrlen;
u8 nexthdr;
@@ -303,7 +303,7 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
- ops->hooknum,
+ state->hook,
hdrlen))
return NF_DROP;
else
@@ -317,21 +317,21 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
if (!nf_nat_initialized(ct, maniptype)) {
unsigned int ret;
- ret = do_chain(ops, skb, state, ct);
+ ret = do_chain(priv, skb, state, ct);
if (ret != NF_ACCEPT)
return ret;
- if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
+ if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
break;
- ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+ ret = nf_nat_alloc_null_binding(ct, state->hook);
if (ret != NF_ACCEPT)
return ret;
} else {
pr_debug("Already setup manip %s for ct %p\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
goto oif_changed;
}
break;
@@ -340,11 +340,11 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
/* ESTABLISHED */
NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
ctinfo == IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
goto oif_changed;
}
- return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+ return nf_nat_packet(ct, ctinfo, state->hook, skb);
oif_changed:
nf_ct_kill_acct(ct, ctinfo, skb);
@@ -353,9 +353,9 @@ oif_changed:
EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn);
unsigned int
-nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -363,7 +363,7 @@ nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int ret;
struct in6_addr daddr = ipv6_hdr(skb)->daddr;
- ret = nf_nat_ipv6_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
skb_dst_drop(skb);
@@ -373,9 +373,9 @@ nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_ipv6_in);
unsigned int
-nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -391,7 +391,7 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
if (skb->len < sizeof(struct ipv6hdr))
return NF_ACCEPT;
- ret = nf_nat_ipv6_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
@@ -403,7 +403,7 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
(ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 &&
ct->tuplehash[dir].tuple.src.u.all !=
ct->tuplehash[!dir].tuple.dst.u.all)) {
- err = nf_xfrm_me_harder(skb, AF_INET6);
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET6);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -414,9 +414,9 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_ipv6_out);
unsigned int
-nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -430,14 +430,14 @@ nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
if (skb->len < sizeof(struct ipv6hdr))
return NF_ACCEPT;
- ret = nf_nat_ipv6_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
&ct->tuplehash[!dir].tuple.src.u3)) {
- err = ip6_route_me_harder(skb);
+ err = ip6_route_me_harder(state->net, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -446,7 +446,7 @@ nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 &&
ct->tuplehash[dir].tuple.dst.u.all !=
ct->tuplehash[!dir].tuple.src.u.all) {
- err = nf_xfrm_me_harder(skb, AF_INET6);
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET6);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index 774560966..31ba7ca19 100644
--- a/kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/kernel/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -34,7 +34,7 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY));
- if (ipv6_dev_get_saddr(dev_net(out), out,
+ if (ipv6_dev_get_saddr(nf_ct_net(ct), out,
&ipv6_hdr(skb)->daddr, 0, &src) < 0)
return NF_DROP;
diff --git a/kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
index 2205e8eee..57593b00c 100644
--- a/kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ b/kernel/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -73,7 +73,7 @@ icmpv6_manip_pkt(struct sk_buff *skb,
hdr->icmp6_type == ICMPV6_ECHO_REPLY) {
inet_proto_csum_replace2(&hdr->icmp6_cksum, skb,
hdr->icmp6_identifier,
- tuple->src.u.icmp.id, 0);
+ tuple->src.u.icmp.id, false);
hdr->icmp6_identifier = tuple->src.u.icmp.id;
}
return true;
diff --git a/kernel/net/ipv6/netfilter/nf_reject_ipv6.c b/kernel/net/ipv6/netfilter/nf_reject_ipv6.c
index 94b4c6dfb..e0f922b77 100644
--- a/kernel/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/kernel/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -26,7 +26,7 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
int tcphoff;
proto = oip6h->nexthdr;
- tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data),
+ tcphoff = ipv6_skip_exthdr(oldskb, ((u8 *)(oip6h + 1) - oldskb->data),
&proto, &frag_off);
if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
@@ -206,7 +206,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
dev_queue_xmit(nskb);
} else
#endif
- ip6_local_out(nskb);
+ ip6_local_out(net, nskb->sk, nskb);
}
EXPORT_SYMBOL_GPL(nf_send_reset6);
@@ -224,7 +224,7 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook)
return true;
proto = ip6h->nexthdr;
- thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
+ thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo);
if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
return false;
diff --git a/kernel/net/ipv6/netfilter/nf_tables_ipv6.c b/kernel/net/ipv6/netfilter/nf_tables_ipv6.c
index c8148ba76..120ea9131 100644
--- a/kernel/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/kernel/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -16,20 +16,20 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_ipv6.h>
-static unsigned int nft_do_chain_ipv6(const struct nf_hook_ops *ops,
+static unsigned int nft_do_chain_ipv6(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
/* malformed packet, drop it */
- if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0)
+ if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
return NF_DROP;
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
+static unsigned int nft_ipv6_output(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -40,7 +40,7 @@ static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
return NF_ACCEPT;
}
- return nft_do_chain_ipv6(ops, skb, state);
+ return nft_do_chain_ipv6(priv, skb, state);
}
struct nft_af_info nft_af_ipv6 __read_mostly = {
diff --git a/kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index 951bb458b..443cd306c 100644
--- a/kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/kernel/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -24,44 +24,44 @@
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/ipv6.h>
-static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_do_chain(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv6(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv6(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv6_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv6_fn(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv6_fn(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv6_in(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv6_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv6_in(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv6_in(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv6_out(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv6_out(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv6_out(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv6_out(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv6_local_fn(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv6_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv6_local_fn(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain);
}
static const struct nf_chain_type nft_chain_nat_ipv6 = {
diff --git a/kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c b/kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 0dafdaac5..71d995ff3 100644
--- a/kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/kernel/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -22,7 +22,7 @@
#include <net/netfilter/nf_tables_ipv6.h>
#include <net/route.h>
-static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+static unsigned int nf_route_table_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -33,7 +33,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
u32 mark, flowlabel;
/* malformed packet, drop it */
- if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0)
+ if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
return NF_DROP;
/* save source/dest address, mark, hoplimit, flowlabel, priority */
@@ -45,14 +45,14 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
/* flowlabel and prio (includes version, which shouldn't change either */
flowlabel = *((u32 *)ipv6_hdr(skb));
- ret = nft_do_chain(&pkt, ops);
+ ret = nft_do_chain(&pkt, priv);
if (ret != NF_DROP && ret != NF_QUEUE &&
(memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
- return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP;
+ return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP;
return ret;
}
@@ -61,11 +61,11 @@ static const struct nf_chain_type nft_chain_route_ipv6 = {
.name = "route",
.type = NFT_CHAIN_T_ROUTE,
.family = NFPROTO_IPV6,
- .owner = THIS_MODULE,
+ .owner = THIS_MODULE,
.hook_mask = (1 << NF_INET_LOCAL_OUT),
.hooks = {
- [NF_INET_LOCAL_OUT] = nf_route_table_hook,
- },
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook,
+ },
};
static int __init nft_chain_route_init(void)
diff --git a/kernel/net/ipv6/netfilter/nft_dup_ipv6.c b/kernel/net/ipv6/netfilter/nft_dup_ipv6.c
new file mode 100644
index 000000000..8bfd470cb
--- /dev/null
+++ b/kernel/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv6/nf_dup_ipv6.h>
+
+struct nft_dup_ipv6 {
+ enum nft_registers sreg_addr:8;
+ enum nft_registers sreg_dev:8;
+};
+
+static void nft_dup_ipv6_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
+ struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr];
+ int oif = regs->data[priv->sreg_dev];
+
+ nf_dup_ipv6(pkt->net, pkt->skb, pkt->hook, gw, oif);
+}
+
+static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
+ int err;
+
+ if (tb[NFTA_DUP_SREG_ADDR] == NULL)
+ return -EINVAL;
+
+ priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
+ err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DUP_SREG_DEV] != NULL) {
+ priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
+ return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ }
+ return 0;
+}
+
+static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) ||
+ nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_dup_ipv6_type;
+static const struct nft_expr_ops nft_dup_ipv6_ops = {
+ .type = &nft_dup_ipv6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv6)),
+ .eval = nft_dup_ipv6_eval,
+ .init = nft_dup_ipv6_init,
+ .dump = nft_dup_ipv6_dump,
+};
+
+static const struct nla_policy nft_dup_ipv6_policy[NFTA_DUP_MAX + 1] = {
+ [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 },
+ [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_dup_ipv6_type __read_mostly = {
+ .family = NFPROTO_IPV6,
+ .name = "dup",
+ .ops = &nft_dup_ipv6_ops,
+ .policy = nft_dup_ipv6_policy,
+ .maxattr = NFTA_DUP_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_dup_ipv6_module_init(void)
+{
+ return nft_register_expr(&nft_dup_ipv6_type);
+}
+
+static void __exit nft_dup_ipv6_module_exit(void)
+{
+ nft_unregister_expr(&nft_dup_ipv6_type);
+}
+
+module_init(nft_dup_ipv6_module_init);
+module_exit(nft_dup_ipv6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "dup");
diff --git a/kernel/net/ipv6/netfilter/nft_redir_ipv6.c b/kernel/net/ipv6/netfilter/nft_redir_ipv6.c
index effd393bd..aca44e89a 100644
--- a/kernel/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/kernel/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -35,8 +35,7 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
range.flags |= priv->flags;
- regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range,
- pkt->ops->hooknum);
+ regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, pkt->hook);
}
static struct nft_expr_type nft_redir_ipv6_type;
diff --git a/kernel/net/ipv6/netfilter/nft_reject_ipv6.c b/kernel/net/ipv6/netfilter/nft_reject_ipv6.c
index d0d1540ec..533cd5719 100644
--- a/kernel/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/kernel/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -24,15 +24,14 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
struct nft_reject *priv = nft_expr_priv(expr);
- struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out);
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach6(net, pkt->skb, priv->icmp_code,
- pkt->ops->hooknum);
+ nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code,
+ pkt->hook);
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(net, pkt->skb, pkt->ops->hooknum);
+ nf_send_reset6(pkt->net, pkt->skb, pkt->hook);
break;
default:
break;
diff --git a/kernel/net/ipv6/output_core.c b/kernel/net/ipv6/output_core.c
index 85892af57..462f2a76b 100644
--- a/kernel/net/ipv6/output_core.c
+++ b/kernel/net/ipv6/output_core.c
@@ -8,9 +8,11 @@
#include <net/ip6_fib.h>
#include <net/addrconf.h>
#include <net/secure_seq.h>
+#include <linux/netfilter.h>
static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
- struct in6_addr *dst, struct in6_addr *src)
+ const struct in6_addr *dst,
+ const struct in6_addr *src)
{
u32 hash, id;
@@ -60,17 +62,17 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
-void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr,
- struct rt6_info *rt)
+__be32 ipv6_select_ident(struct net *net,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
static u32 ip6_idents_hashrnd __read_mostly;
u32 id;
net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
- id = __ipv6_select_ident(net, ip6_idents_hashrnd, &rt->rt6i_dst.addr,
- &rt->rt6i_src.addr);
- fhdr->identification = htonl(id);
+ id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr);
+ return htonl(id);
}
EXPORT_SYMBOL(ipv6_select_ident);
@@ -136,7 +138,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst)
EXPORT_SYMBOL(ip6_dst_hoplimit);
#endif
-static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int len;
@@ -146,30 +148,20 @@ static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
ipv6_hdr(skb)->payload_len = htons(len);
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
- return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
- NULL, skb_dst(skb)->dev, dst_output_sk);
-}
-
-int __ip6_local_out(struct sk_buff *skb)
-{
- return __ip6_local_out_sk(skb->sk, skb);
+ return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
}
EXPORT_SYMBOL_GPL(__ip6_local_out);
-int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int err;
- err = __ip6_local_out_sk(sk, skb);
+ err = __ip6_local_out(net, sk, skb);
if (likely(err == 1))
- err = dst_output_sk(sk, skb);
+ err = dst_output(net, sk, skb);
return err;
}
-EXPORT_SYMBOL_GPL(ip6_local_out_sk);
-
-int ip6_local_out(struct sk_buff *skb)
-{
- return ip6_local_out_sk(skb->sk, skb);
-}
EXPORT_SYMBOL_GPL(ip6_local_out);
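[editor's note] With the sock-only wrappers removed, ip6_local_out() has exactly one form, ip6_local_out(net, sk, skb), and every caller in this patch (SYNPROXY, nf_reject_ipv6, nf_dup_ipv6) passes the namespace explicitly instead of deriving it from the skb. A tiny sketch of the resulting call site; example_xmit is hypothetical, the helper's signature is the one exported above:

/* Sketch of the new calling convention: the caller supplies the netns. */
static int example_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* skb must already carry its dst; ip6_local_out() runs the
	 * NF_INET_LOCAL_OUT hooks and then dst_output(net, sk, skb). */
	return ip6_local_out(net, sk, skb);
}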
diff --git a/kernel/net/ipv6/raw.c b/kernel/net/ipv6/raw.c
index 8072bd413..99140986e 100644
--- a/kernel/net/ipv6/raw.c
+++ b/kernel/net/ipv6/raw.c
@@ -295,7 +295,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* unspecified and mapped address have a v4 equivalent.
*/
v4addr = LOOPBACK4_IPV6;
- if (!(addr_type & IPV6_ADDR_MULTICAST)) {
+ if (!(addr_type & IPV6_ADDR_MULTICAST) &&
+ !sock_net(sk)->ipv6.sysctl.ip_nonlocal_bind) {
err = -EADDRNOTAVAIL;
if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
dev, 0)) {
@@ -613,6 +614,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
unsigned int flags)
{
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
struct ipv6hdr *iph;
struct sk_buff *skb;
int err;
@@ -651,9 +653,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
if (err)
goto error_fault;
- IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
- err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
- NULL, rt->dst.dev, dst_output_sk);
+ IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
+ NULL, rt->dst.dev, dst_output);
if (err > 0)
err = net_xmit_errno(err);
if (err)
@@ -665,7 +667,7 @@ error_fault:
err = -EFAULT;
kfree_skb(skb);
error:
- IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+ IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
if (err == -ENOBUFS && !np->recverr)
err = 0;
return err;
@@ -731,6 +733,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ struct ipv6_txoptions *opt_to_free = NULL;
struct ipv6_txoptions opt_space;
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
struct in6_addr *daddr, *final_p, final;
@@ -837,8 +840,10 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!(opt->opt_nflen|opt->opt_flen))
opt = NULL;
}
- if (!opt)
- opt = np->opt;
+ if (!opt) {
+ opt = txopt_get(np);
+ opt_to_free = opt;
+ }
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
@@ -865,6 +870,9 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_oif = np->ucast_oif;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ if (inet->hdrincl)
+ fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;
+
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
@@ -901,6 +909,7 @@ done:
dst_release(dst);
out:
fl6_sock_release(flowlabel);
+ txopt_put(opt_to_free);
return err < 0 ? err : len;
do_confirm:
dst_confirm(dst);
@@ -1324,13 +1333,7 @@ static struct inet_protosw rawv6_protosw = {
int __init rawv6_init(void)
{
- int ret;
-
- ret = inet6_register_protosw(&rawv6_protosw);
- if (ret)
- goto out;
-out:
- return ret;
+ return inet6_register_protosw(&rawv6_protosw);
}
void rawv6_exit(void)
diff --git a/kernel/net/ipv6/reassembly.c b/kernel/net/ipv6/reassembly.c
index 8ffa2c8cc..45f5ae51d 100644
--- a/kernel/net/ipv6/reassembly.c
+++ b/kernel/net/ipv6/reassembly.c
@@ -108,7 +108,10 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
return fq->id == arg->id &&
fq->user == arg->user &&
ipv6_addr_equal(&fq->saddr, arg->src) &&
- ipv6_addr_equal(&fq->daddr, arg->dst);
+ ipv6_addr_equal(&fq->daddr, arg->dst) &&
+ (arg->iif == fq->iif ||
+ !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
+ IPV6_ADDR_LINKLOCAL)));
}
EXPORT_SYMBOL(ip6_frag_match);
@@ -144,7 +147,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
- if (fq->q.flags & INET_FRAG_EVICTED)
+ if (inet_frag_evicting(&fq->q))
goto out_rcu_unlock;
IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
@@ -180,7 +183,7 @@ static void ip6_frag_expire(unsigned long data)
static struct frag_queue *
fq_find(struct net *net, __be32 id, const struct in6_addr *src,
- const struct in6_addr *dst, u8 ecn)
+ const struct in6_addr *dst, int iif, u8 ecn)
{
struct inet_frag_queue *q;
struct ip6_create_arg arg;
@@ -190,6 +193,7 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src,
arg.user = IP6_DEFRAG_LOCAL_DELIVER;
arg.src = src;
arg.dst = dst;
+ arg.iif = iif;
arg.ecn = ecn;
hash = inet6_hash_frag(id, src, dst);
@@ -330,7 +334,7 @@ found:
fq->q.stamp = skb->tstamp;
fq->q.meat += skb->len;
fq->ecn |= ecn;
- add_frag_mem_limit(&fq->q, skb->truesize);
+ add_frag_mem_limit(fq->q.net, skb->truesize);
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
@@ -443,7 +447,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(&fq->q, clone->truesize);
+ add_frag_mem_limit(fq->q.net, clone->truesize);
}
/* We have to remove fragment header from datagram and to relocate
@@ -481,7 +485,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
}
fp = next;
}
- sub_frag_mem_limit(&fq->q, sum_truesize);
+ sub_frag_mem_limit(fq->q.net, sum_truesize);
head->next = NULL;
head->dev = dev;
@@ -551,7 +555,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
}
fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
- ip6_frag_ecn(hdr));
+ skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
if (fq) {
int ret;
@@ -706,13 +710,19 @@ static void ip6_frags_sysctl_unregister(void)
static int __net_init ipv6_frags_init_net(struct net *net)
{
+ int res;
+
net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
- inet_frags_init_net(&net->ipv6.frags);
-
- return ip6_frags_ns_sysctl_register(net);
+ res = inet_frags_init_net(&net->ipv6.frags);
+ if (res)
+ return res;
+ res = ip6_frags_ns_sysctl_register(net);
+ if (res)
+ inet_frags_uninit_net(&net->ipv6.frags);
+ return res;
}
static void __net_exit ipv6_frags_exit_net(struct net *net)
diff --git a/kernel/net/ipv6/route.c b/kernel/net/ipv6/route.c
index f371fefa7..3f164d3aa 100644
--- a/kernel/net/ipv6/route.c
+++ b/kernel/net/ipv6/route.c
@@ -54,10 +54,14 @@
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
+#include <net/lwtunnel.h>
+#include <net/ip_tunnels.h>
+#include <net/l3mdev.h>
#include <asm/uaccess.h>
@@ -72,8 +76,7 @@ enum rt6_nud_state {
RT6_NUD_SUCCEED = 1
};
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
- const struct in6_addr *dest);
+static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
@@ -84,14 +87,15 @@ static void ip6_dst_ifdown(struct dst_entry *,
static int ip6_dst_gc(struct dst_ops *ops);
static int ip6_pkt_discard(struct sk_buff *skb);
-static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
+static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
-static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
+static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
+static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -104,65 +108,83 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *gwaddr, int ifindex);
#endif
-static void rt6_bind_peer(struct rt6_info *rt, int create)
+struct uncached_list {
+ spinlock_t lock;
+ struct list_head head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
+
+static void rt6_uncached_list_add(struct rt6_info *rt)
{
- struct inet_peer_base *base;
- struct inet_peer *peer;
+ struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
- base = inetpeer_base_ptr(rt->_rt6i_peer);
- if (!base)
- return;
+ rt->dst.flags |= DST_NOCACHE;
+ rt->rt6i_uncached_list = ul;
- peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
- if (peer) {
- if (!rt6_set_peer(rt, peer))
- inet_putpeer(peer);
- }
+ spin_lock_bh(&ul->lock);
+ list_add_tail(&rt->rt6i_uncached, &ul->head);
+ spin_unlock_bh(&ul->lock);
}
-static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create)
+static void rt6_uncached_list_del(struct rt6_info *rt)
{
- if (rt6_has_peer(rt))
- return rt6_peer_ptr(rt);
+ if (!list_empty(&rt->rt6i_uncached)) {
+ struct uncached_list *ul = rt->rt6i_uncached_list;
- rt6_bind_peer(rt, create);
- return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL);
+ spin_lock_bh(&ul->lock);
+ list_del(&rt->rt6i_uncached);
+ spin_unlock_bh(&ul->lock);
+ }
}
-static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt)
+static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
- return __rt6_get_peer(rt, 1);
-}
+ struct net_device *loopback_dev = net->loopback_dev;
+ int cpu;
-static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
-{
- struct rt6_info *rt = (struct rt6_info *) dst;
- struct inet_peer *peer;
- u32 *p = NULL;
-
- if (!(rt->dst.flags & DST_HOST))
- return dst_cow_metrics_generic(dst, old);
+ if (dev == loopback_dev)
+ return;
- peer = rt6_get_peer_create(rt);
- if (peer) {
- u32 *old_p = __DST_METRICS_PTR(old);
- unsigned long prev, new;
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+ struct rt6_info *rt;
- p = peer->metrics;
- if (inet_metrics_new(peer) ||
- (old & DST_METRICS_FORCE_OVERWRITE))
- memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+ spin_lock_bh(&ul->lock);
+ list_for_each_entry(rt, &ul->head, rt6i_uncached) {
+ struct inet6_dev *rt_idev = rt->rt6i_idev;
+ struct net_device *rt_dev = rt->dst.dev;
- new = (unsigned long) p;
- prev = cmpxchg(&dst->_metrics, old, new);
+ if (rt_idev->dev == dev) {
+ rt->rt6i_idev = in6_dev_get(loopback_dev);
+ in6_dev_put(rt_idev);
+ }
- if (prev != old) {
- p = __DST_METRICS_PTR(prev);
- if (prev & DST_METRICS_READ_ONLY)
- p = NULL;
+ if (rt_dev == dev) {
+ rt->dst.dev = loopback_dev;
+ dev_hold(rt->dst.dev);
+ dev_put(rt_dev);
+ }
}
+ spin_unlock_bh(&ul->lock);
}
- return p;
+}
+
+static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
+{
+ return dst_metrics_write_ptr(rt->dst.from);
+}
+
+static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ struct rt6_info *rt = (struct rt6_info *)dst;
+
+ if (rt->rt6i_flags & RTF_PCPU)
+ return rt6_pcpu_cow_metrics(rt);
+ else if (rt->rt6i_flags & RTF_CACHE)
+ return NULL;
+ else
+ return dst_cow_metrics_generic(dst, old);
}
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
@@ -227,12 +249,6 @@ static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
{
}
-static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
- unsigned long old)
-{
- return NULL;
-}
-
static struct dst_ops ip6_dst_blackhole_ops = {
.family = AF_INET6,
.destroy = ip6_dst_destroy,
@@ -241,7 +257,7 @@ static struct dst_ops ip6_dst_blackhole_ops = {
.default_advmss = ip6_default_advmss,
.update_pmtu = ip6_rt_blackhole_update_pmtu,
.redirect = ip6_rt_blackhole_redirect,
- .cow_metrics = ip6_rt_blackhole_cow_metrics,
+ .cow_metrics = dst_cow_metrics_generic,
.neigh_lookup = ip6_neigh_lookup,
};
@@ -288,7 +304,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EINVAL,
.input = dst_discard,
- .output = dst_discard_sk,
+ .output = dst_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
.rt6i_protocol = RTPROT_KERNEL,
@@ -298,34 +314,67 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
#endif
+static void rt6_info_init(struct rt6_info *rt)
+{
+ struct dst_entry *dst = &rt->dst;
+
+ memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
+ INIT_LIST_HEAD(&rt->rt6i_siblings);
+ INIT_LIST_HEAD(&rt->rt6i_uncached);
+}
+
/* allocate dst with ip6_dst_ops */
-static inline struct rt6_info *ip6_dst_alloc(struct net *net,
- struct net_device *dev,
- int flags,
- struct fib6_table *table)
+static struct rt6_info *__ip6_dst_alloc(struct net *net,
+ struct net_device *dev,
+ int flags)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
0, DST_OBSOLETE_FORCE_CHK, flags);
+ if (rt)
+ rt6_info_init(rt);
+
+ return rt;
+}
+
+static struct rt6_info *ip6_dst_alloc(struct net *net,
+ struct net_device *dev,
+ int flags)
+{
+ struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
+
if (rt) {
- struct dst_entry *dst = &rt->dst;
+ rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
+ if (rt->rt6i_pcpu) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **p;
- memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
- rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
- INIT_LIST_HEAD(&rt->rt6i_siblings);
+ p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+ /* no one shares rt */
+ *p = NULL;
+ }
+ } else {
+ dst_destroy((struct dst_entry *)rt);
+ return NULL;
+ }
}
+
return rt;
}
static void ip6_dst_destroy(struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *)dst;
- struct inet6_dev *idev = rt->rt6i_idev;
struct dst_entry *from = dst->from;
+ struct inet6_dev *idev;
- if (!(rt->dst.flags & DST_HOST))
- dst_destroy_metrics_generic(dst);
+ dst_destroy_metrics_generic(dst);
+ free_percpu(rt->rt6i_pcpu);
+ rt6_uncached_list_del(rt);
+ idev = rt->rt6i_idev;
if (idev) {
rt->rt6i_idev = NULL;
in6_dev_put(idev);
@@ -333,11 +382,6 @@ static void ip6_dst_destroy(struct dst_entry *dst)
dst->from = NULL;
dst_release(from);
-
- if (rt6_has_peer(rt)) {
- struct inet_peer *peer = rt6_peer_ptr(rt);
- inet_putpeer(peer);
- }
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -360,6 +404,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
}
}
+static bool __rt6_check_expired(const struct rt6_info *rt)
+{
+ if (rt->rt6i_flags & RTF_EXPIRES)
+ return time_after(jiffies, rt->dst.expires);
+ else
+ return false;
+}
+
static bool rt6_check_expired(const struct rt6_info *rt)
{
if (rt->rt6i_flags & RTF_EXPIRES) {
@@ -378,31 +430,7 @@ static bool rt6_check_expired(const struct rt6_info *rt)
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
const struct flowi6 *fl6)
{
- unsigned int val = fl6->flowi6_proto;
-
- val ^= ipv6_addr_hash(&fl6->daddr);
- val ^= ipv6_addr_hash(&fl6->saddr);
-
- /* Work only if this not encapsulated */
- switch (fl6->flowi6_proto) {
- case IPPROTO_UDP:
- case IPPROTO_TCP:
- case IPPROTO_SCTP:
- val ^= (__force u16)fl6->fl6_sport;
- val ^= (__force u16)fl6->fl6_dport;
- break;
-
- case IPPROTO_ICMPV6:
- val ^= (__force u16)fl6->fl6_icmp_type;
- val ^= (__force u16)fl6->fl6_icmp_code;
- break;
- }
- /* RFC6438 recommands to use flowlabel */
- val ^= (__force u32)fl6->flowlabel;
-
- /* Perhaps, we need to tune, this function? */
- val = val ^ (val >> 7) ^ (val >> 12);
- return val % candidate_count;
+ return get_hash_from_flowi6(fl6) % candidate_count;
}
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
@@ -455,10 +483,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
if (dev->flags & IFF_LOOPBACK) {
if (!sprt->rt6i_idev ||
sprt->rt6i_idev->dev->ifindex != oif) {
- if (flags & RT6_LOOKUP_F_IFACE && oif)
+ if (flags & RT6_LOOKUP_F_IFACE)
continue;
- if (local && (!oif ||
- local->rt6i_idev->dev->ifindex == oif))
+ if (local &&
+ local->rt6i_idev->dev->ifindex == oif)
continue;
}
local = sprt;
@@ -495,13 +523,14 @@ static void rt6_probe_deferred(struct work_struct *w)
container_of(w, struct __rt6_probe_work, work);
addrconf_addr_solict_mult(&work->target, &mcaddr);
- ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
+ ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
dev_put(work->dev);
kfree(work);
}
static void rt6_probe(struct rt6_info *rt)
{
+ struct __rt6_probe_work *work;
struct neighbour *neigh;
/*
* Okay, this does not seem to be appropriate
@@ -516,34 +545,33 @@ static void rt6_probe(struct rt6_info *rt)
rcu_read_lock_bh();
neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
if (neigh) {
- write_lock(&neigh->lock);
if (neigh->nud_state & NUD_VALID)
goto out;
- }
-
- if (!neigh ||
- time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
- struct __rt6_probe_work *work;
+ work = NULL;
+ write_lock(&neigh->lock);
+ if (!(neigh->nud_state & NUD_VALID) &&
+ time_after(jiffies,
+ neigh->updated +
+ rt->rt6i_idev->cnf.rtr_probe_interval)) {
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work)
+ __neigh_set_probe_once(neigh);
+ }
+ write_unlock(&neigh->lock);
+ } else {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ }
- if (neigh && work)
- __neigh_set_probe_once(neigh);
-
- if (neigh)
- write_unlock(&neigh->lock);
+ if (work) {
+ INIT_WORK(&work->work, rt6_probe_deferred);
+ work->target = rt->rt6i_gateway;
+ dev_hold(rt->dst.dev);
+ work->dev = rt->dst.dev;
+ schedule_work(&work->work);
+ }
- if (work) {
- INIT_WORK(&work->work, rt6_probe_deferred);
- work->target = rt->rt6i_gateway;
- dev_hold(rt->dst.dev);
- work->dev = rt->dst.dev;
- schedule_work(&work->work);
- }
- } else {
out:
- write_unlock(&neigh->lock);
- }
rcu_read_unlock_bh();
}
#else
@@ -622,6 +650,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
{
int m;
bool match_do_rr = false;
+ struct inet6_dev *idev = rt->rt6i_idev;
+ struct net_device *dev = rt->dst.dev;
+
+ if (dev && !netif_carrier_ok(dev) &&
+ idev->cnf.ignore_routes_with_linkdown)
+ goto out;
if (rt6_check_expired(rt))
goto out;
@@ -652,15 +686,33 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
u32 metric, int oif, int strict,
bool *do_rr)
{
- struct rt6_info *rt, *match;
+ struct rt6_info *rt, *match, *cont;
int mpri = -1;
match = NULL;
- for (rt = rr_head; rt && rt->rt6i_metric == metric;
- rt = rt->dst.rt6_next)
+ cont = NULL;
+ for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
+ if (rt->rt6i_metric != metric) {
+ cont = rt;
+ break;
+ }
+
match = find_match(rt, oif, strict, &mpri, match, do_rr);
- for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
- rt = rt->dst.rt6_next)
+ }
+
+ for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
+ if (rt->rt6i_metric != metric) {
+ cont = rt;
+ break;
+ }
+
+ match = find_match(rt, oif, strict, &mpri, match, do_rr);
+ }
+
+ if (match || !cont)
+ return match;
+
+ for (rt = cont; rt; rt = rt->dst.rt6_next)
match = find_match(rt, oif, strict, &mpri, match, do_rr);
return match;
@@ -694,6 +746,11 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
return match ? match : net->ipv6.ip6_null_entry;
}
+static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
+{
+ return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+}
+
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
const struct in6_addr *gwaddr)
@@ -872,9 +929,9 @@ int ip6_ins_rt(struct rt6_info *rt)
return __ip6_ins_rt(rt, &info, &mxc);
}
-static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct rt6_info *rt;
@@ -882,15 +939,25 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
* Clone the route.
*/
- rt = ip6_rt_copy(ort, daddr);
+ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
+ ort = (struct rt6_info *)ort->dst.from;
- if (rt) {
+ rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
+
+ if (!rt)
+ return NULL;
+
+ ip6_rt_copy_init(rt, ort);
+ rt->rt6i_flags |= RTF_CACHE;
+ rt->rt6i_metric = 0;
+ rt->dst.flags |= DST_HOST;
+ rt->rt6i_dst.addr = *daddr;
+ rt->rt6i_dst.plen = 128;
+
+ if (!rt6_is_gw_or_nonexthop(ort)) {
if (ort->rt6i_dst.plen != 128 &&
ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
rt->rt6i_flags |= RTF_ANYCAST;
-
- rt->rt6i_flags |= RTF_CACHE;
-
#ifdef CONFIG_IPV6_SUBTREES
if (rt->rt6i_src.plen && saddr) {
rt->rt6i_src.addr = *saddr;
@@ -902,35 +969,93 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
return rt;
}
-static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
- const struct in6_addr *daddr)
+static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
- struct rt6_info *rt = ip6_rt_copy(ort, daddr);
+ struct rt6_info *pcpu_rt;
- if (rt)
- rt->rt6i_flags |= RTF_CACHE;
- return rt;
+ pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
+ rt->dst.dev, rt->dst.flags);
+
+ if (!pcpu_rt)
+ return NULL;
+ ip6_rt_copy_init(pcpu_rt, rt);
+ pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
+ pcpu_rt->rt6i_flags |= RTF_PCPU;
+ return pcpu_rt;
+}
+
+/* It should be called with read_lock_bh(&tb6_lock) acquired */
+static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
+{
+ struct rt6_info *pcpu_rt, **p;
+
+ p = this_cpu_ptr(rt->rt6i_pcpu);
+ pcpu_rt = *p;
+
+ if (pcpu_rt) {
+ dst_hold(&pcpu_rt->dst);
+ rt6_dst_from_metrics_check(pcpu_rt);
+ }
+ return pcpu_rt;
+}
+
+static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
+{
+ struct fib6_table *table = rt->rt6i_table;
+ struct rt6_info *pcpu_rt, *prev, **p;
+
+ pcpu_rt = ip6_rt_pcpu_alloc(rt);
+ if (!pcpu_rt) {
+ struct net *net = dev_net(rt->dst.dev);
+
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
+ return net->ipv6.ip6_null_entry;
+ }
+
+ read_lock_bh(&table->tb6_lock);
+ if (rt->rt6i_pcpu) {
+ p = this_cpu_ptr(rt->rt6i_pcpu);
+ prev = cmpxchg(p, NULL, pcpu_rt);
+ if (prev) {
+ /* If someone did it before us, return prev instead */
+ dst_destroy(&pcpu_rt->dst);
+ pcpu_rt = prev;
+ }
+ } else {
+ /* rt has been removed from the fib6 tree
+ * before we have a chance to acquire the read_lock.
+ * In this case, don't bother to create a pcpu rt
+ * since rt is going away anyway. The next
+ * dst_check() will trigger a re-lookup.
+ */
+ dst_destroy(&pcpu_rt->dst);
+ pcpu_rt = rt;
+ }
+ dst_hold(&pcpu_rt->dst);
+ rt6_dst_from_metrics_check(pcpu_rt);
+ read_unlock_bh(&table->tb6_lock);
+ return pcpu_rt;
}
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
struct flowi6 *fl6, int flags)
{
struct fib6_node *fn, *saved_fn;
- struct rt6_info *rt, *nrt;
+ struct rt6_info *rt;
int strict = 0;
- int attempts = 3;
- int err;
strict |= flags & RT6_LOOKUP_F_IFACE;
if (net->ipv6.devconf_all->forwarding == 0)
strict |= RT6_LOOKUP_F_REACHABLE;
-redo_fib6_lookup_lock:
read_lock_bh(&table->tb6_lock);
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
+ if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
+ oif = 0;
+
redo_rt6_select:
rt = rt6_select(fn, oif, strict);
if (rt->rt6i_nsiblings)
@@ -944,51 +1069,65 @@ redo_rt6_select:
strict &= ~RT6_LOOKUP_F_REACHABLE;
fn = saved_fn;
goto redo_rt6_select;
- } else {
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- goto out2;
}
}
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- if (rt->rt6i_flags & RTF_CACHE)
- goto out2;
+ if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
+ dst_use(&rt->dst, jiffies);
+ read_unlock_bh(&table->tb6_lock);
- if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
- nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
- else if (!(rt->dst.flags & DST_HOST))
- nrt = rt6_alloc_clone(rt, &fl6->daddr);
- else
- goto out2;
+ rt6_dst_from_metrics_check(rt);
+ return rt;
+ } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
+ !(rt->rt6i_flags & RTF_GATEWAY))) {
+ /* Create a RTF_CACHE clone which will not be
+ * owned by the fib6 tree. It is for the special case where
+ * the daddr in the skb during the neighbor look-up is different
+ * from the fl6->daddr used to look-up route here.
+ */
- ip6_rt_put(rt);
- rt = nrt ? : net->ipv6.ip6_null_entry;
+ struct rt6_info *uncached_rt;
- dst_hold(&rt->dst);
- if (nrt) {
- err = ip6_ins_rt(nrt);
- if (!err)
- goto out2;
- }
+ dst_use(&rt->dst, jiffies);
+ read_unlock_bh(&table->tb6_lock);
- if (--attempts <= 0)
- goto out2;
+ uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
+ dst_release(&rt->dst);
- /*
- * Race condition! In the gap, when table->tb6_lock was
- * released someone could insert this route. Relookup.
- */
- ip6_rt_put(rt);
- goto redo_fib6_lookup_lock;
+ if (uncached_rt)
+ rt6_uncached_list_add(uncached_rt);
+ else
+ uncached_rt = net->ipv6.ip6_null_entry;
-out2:
- rt->dst.lastuse = jiffies;
- rt->dst.__use++;
+ dst_hold(&uncached_rt->dst);
+ return uncached_rt;
- return rt;
+ } else {
+ /* Get a percpu copy */
+
+ struct rt6_info *pcpu_rt;
+
+ rt->dst.lastuse = jiffies;
+ rt->dst.__use++;
+ pcpu_rt = rt6_get_pcpu_route(rt);
+
+ if (pcpu_rt) {
+ read_unlock_bh(&table->tb6_lock);
+ } else {
+ /* We have to do the read_unlock first
+ * because rt6_make_pcpu_route() may trigger
+ * ip6_dst_gc() which will take the write_lock.
+ */
+ dst_hold(&rt->dst);
+ read_unlock_bh(&table->tb6_lock);
+ pcpu_rt = rt6_make_pcpu_route(rt);
+ dst_release(&rt->dst);
+ }
+
+ return pcpu_rt;
+
+ }
}
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
@@ -1012,8 +1151,9 @@ void ip6_route_input(struct sk_buff *skb)
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
int flags = RT6_LOOKUP_F_HAS_SADDR;
+ struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
- .flowi6_iif = skb->dev->ifindex,
+ .flowi6_iif = l3mdev_fib_oif(skb->dev),
.daddr = iph->daddr,
.saddr = iph->saddr,
.flowlabel = ip6_flowinfo(iph),
@@ -1021,6 +1161,10 @@ void ip6_route_input(struct sk_buff *skb)
.flowi6_proto = iph->nexthdr,
};
+ tun_info = skb_tunnel_info(skb);
+ if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
+ fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+ skb_dst_drop(skb);
skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
@@ -1030,24 +1174,31 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table
return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
-struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
- struct flowi6 *fl6)
+struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
+ struct flowi6 *fl6, int flags)
{
- int flags = 0;
+ struct dst_entry *dst;
+ bool any_src;
+
+ dst = l3mdev_rt6_dst_by_oif(net, fl6);
+ if (dst)
+ return dst;
fl6->flowi6_iif = LOOPBACK_IFINDEX;
- if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
+ any_src = ipv6_addr_any(&fl6->saddr);
+ if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
+ (fl6->flowi6_oif && any_src))
flags |= RT6_LOOKUP_F_IFACE;
- if (!ipv6_addr_any(&fl6->saddr))
+ if (!any_src)
flags |= RT6_LOOKUP_F_HAS_SADDR;
else if (sk)
flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
-EXPORT_SYMBOL(ip6_route_output);
+EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
@@ -1056,25 +1207,20 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
if (rt) {
- new = &rt->dst;
-
- memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
- rt6_init_peer(rt, net->ipv6.peers);
+ rt6_info_init(rt);
+ new = &rt->dst;
new->__use = 1;
new->input = dst_discard;
- new->output = dst_discard_sk;
+ new->output = dst_discard_out;
- if (dst_metrics_read_only(&ort->dst))
- new->_metrics = ort->dst._metrics;
- else
- dst_copy_metrics(new, &ort->dst);
+ dst_copy_metrics(new, &ort->dst);
rt->rt6i_idev = ort->rt6i_idev;
if (rt->rt6i_idev)
in6_dev_hold(rt->rt6i_idev);
rt->rt6i_gateway = ort->rt6i_gateway;
- rt->rt6i_flags = ort->rt6i_flags;
+ rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
rt->rt6i_metric = 0;
memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
@@ -1093,6 +1239,34 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
* Destination cache support functions
*/
+static void rt6_dst_from_metrics_check(struct rt6_info *rt)
+{
+ if (rt->dst.from &&
+ dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
+ dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
+}
+
+static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
+{
+ if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+ return NULL;
+
+ if (rt6_check_expired(rt))
+ return NULL;
+
+ return &rt->dst;
+}
+
+static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
+{
+ if (!__rt6_check_expired(rt) &&
+ rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ rt6_check((struct rt6_info *)(rt->dst.from), cookie))
+ return &rt->dst;
+ else
+ return NULL;
+}
+
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
struct rt6_info *rt;
@@ -1103,13 +1277,14 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
* DST_OBSOLETE_FORCE_CHK which forces validation calls down
* into this function always.
*/
- if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
- return NULL;
- if (rt6_check_expired(rt))
- return NULL;
+ rt6_dst_from_metrics_check(rt);
- return dst;
+ if (rt->rt6i_flags & RTF_PCPU ||
+ (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
+ return rt6_dst_from_check(rt, cookie);
+ else
+ return rt6_check(rt, cookie);
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
@@ -1140,32 +1315,76 @@ static void ip6_link_failure(struct sk_buff *skb)
if (rt) {
if (rt->rt6i_flags & RTF_CACHE) {
dst_hold(&rt->dst);
- if (ip6_del_rt(rt))
- dst_free(&rt->dst);
+ ip6_del_rt(rt);
} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
rt->rt6i_node->fn_sernum = -1;
}
}
}
-static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
+{
+ struct net *net = dev_net(rt->dst.dev);
+
+ rt->rt6i_flags |= RTF_MODIFIED;
+ rt->rt6i_pmtu = mtu;
+ rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
+}
+
+static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
+{
+ return !(rt->rt6i_flags & RTF_CACHE) &&
+ (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
+}
+
+static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
+ const struct ipv6hdr *iph, u32 mtu)
{
struct rt6_info *rt6 = (struct rt6_info *)dst;
- dst_confirm(dst);
- if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
- struct net *net = dev_net(dst->dev);
+ if (rt6->rt6i_flags & RTF_LOCAL)
+ return;
- rt6->rt6i_flags |= RTF_MODIFIED;
- if (mtu < IPV6_MIN_MTU)
- mtu = IPV6_MIN_MTU;
+ dst_confirm(dst);
+ mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+ if (mtu >= dst_mtu(dst))
+ return;
- dst_metric_set(dst, RTAX_MTU, mtu);
- rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+ if (!rt6_cache_allowed_for_pmtu(rt6)) {
+ rt6_do_update_pmtu(rt6, mtu);
+ } else {
+ const struct in6_addr *daddr, *saddr;
+ struct rt6_info *nrt6;
+
+ if (iph) {
+ daddr = &iph->daddr;
+ saddr = &iph->saddr;
+ } else if (sk) {
+ daddr = &sk->sk_v6_daddr;
+ saddr = &inet6_sk(sk)->saddr;
+ } else {
+ return;
+ }
+ nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
+ if (nrt6) {
+ rt6_do_update_pmtu(nrt6, mtu);
+
+ /* ip6_ins_rt(nrt6) will bump the
+ * rt6->rt6i_node->fn_sernum
+ * which will fail the next rt6_check() and
+ * invalidate the sk->sk_dst_cache.
+ */
+ ip6_ins_rt(nrt6);
+ }
}
}
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
+}
+
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
int oif, u32 mark)
{
@@ -1182,7 +1401,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
- ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
+ __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
@@ -1341,12 +1560,17 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
+ const struct rt6_info *rt = (const struct rt6_info *)dst;
+ unsigned int mtu = rt->rt6i_pmtu;
struct inet6_dev *idev;
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
if (mtu)
goto out;
+ mtu = dst_metric_raw(dst, RTAX_MTU);
+ if (mtu)
+ goto out;
+
mtu = IPV6_MIN_MTU;
rcu_read_lock();
@@ -1373,7 +1597,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
if (unlikely(!idev))
return ERR_PTR(-ENODEV);
- rt = ip6_dst_alloc(net, dev, 0, NULL);
+ rt = ip6_dst_alloc(net, dev, 0);
if (unlikely(!rt)) {
in6_dev_put(idev);
dst = ERR_PTR(-ENOMEM);
@@ -1472,6 +1696,7 @@ out:
static int ip6_convert_metrics(struct mx6_config *mxc,
const struct fib6_config *cfg)
{
+ bool ecn_ca = false;
struct nlattr *nla;
int remaining;
u32 *mp;
@@ -1485,51 +1710,57 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
int type = nla_type(nla);
+ u32 val;
+
+ if (!type)
+ continue;
+ if (unlikely(type > RTAX_MAX))
+ goto err;
- if (type) {
- u32 val;
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
- if (unlikely(type > RTAX_MAX))
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ if (val == TCP_CA_UNSPEC)
goto err;
- if (type == RTAX_CC_ALGO) {
- char tmp[TCP_CA_NAME_MAX];
+ } else {
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ goto err;
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(tmp);
- if (val == TCP_CA_UNSPEC)
- goto err;
- } else {
- val = nla_get_u32(nla);
- }
+ mp[type - 1] = val;
+ __set_bit(type - 1, mxc->mx_valid);
+ }
- mp[type - 1] = val;
- __set_bit(type - 1, mxc->mx_valid);
- }
+ if (ecn_ca) {
+ __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
+ mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
}
mxc->mx = mp;
-
return 0;
err:
kfree(mp);
return -EINVAL;
}
-int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
+static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
{
- int err;
struct net *net = cfg->fc_nlinfo.nl_net;
struct rt6_info *rt = NULL;
struct net_device *dev = NULL;
struct inet6_dev *idev = NULL;
struct fib6_table *table;
int addr_type;
+ int err = -EINVAL;
if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
- return -EINVAL;
+ goto out;
#ifndef CONFIG_IPV6_SUBTREES
if (cfg->fc_src_len)
- return -EINVAL;
+ goto out;
#endif
if (cfg->fc_ifindex) {
err = -ENODEV;
@@ -1559,7 +1790,8 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
if (!table)
goto out;
- rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
+ rt = ip6_dst_alloc(net, NULL,
+ (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
if (!rt) {
err = -ENOMEM;
@@ -1587,12 +1819,29 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
rt->dst.output = ip6_output;
+ if (cfg->fc_encap) {
+ struct lwtunnel_state *lwtstate;
+
+ err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ cfg->fc_encap, AF_INET6, cfg,
+ &lwtstate);
+ if (err)
+ goto out;
+ rt->dst.lwtstate = lwtstate_get(lwtstate);
+ if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
+ rt->dst.lwtstate->orig_output = rt->dst.output;
+ rt->dst.output = lwtunnel_output;
+ }
+ if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
+ rt->dst.lwtstate->orig_input = rt->dst.input;
+ rt->dst.input = lwtunnel_input;
+ }
+ }
+
ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
rt->rt6i_dst.plen = cfg->fc_dst_len;
- if (rt->rt6i_dst.plen == 128) {
+ if (rt->rt6i_dst.plen == 128)
rt->dst.flags |= DST_HOST;
- dst_metrics_set_force_overwrite(&rt->dst);
- }
#ifdef CONFIG_IPV6_SUBTREES
ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
@@ -1626,7 +1875,7 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
switch (cfg->fc_type) {
case RTN_BLACKHOLE:
rt->dst.error = -EINVAL;
- rt->dst.output = dst_discard_sk;
+ rt->dst.output = dst_discard_out;
rt->dst.input = dst_discard;
break;
case RTN_PROHIBIT:
@@ -1635,9 +1884,11 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
rt->dst.input = ip6_pkt_prohibit;
break;
case RTN_THROW:
+ case RTN_UNREACHABLE:
default:
rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
- : -ENETUNREACH;
+ : (cfg->fc_type == RTN_UNREACHABLE)
+ ? -EHOSTUNREACH : -ENETUNREACH;
rt->dst.output = ip6_pkt_discard_out;
rt->dst.input = ip6_pkt_discard;
break;
@@ -1650,9 +1901,21 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
int gwa_type;
gw_addr = &cfg->fc_gateway;
- rt->rt6i_gateway = *gw_addr;
gwa_type = ipv6_addr_type(gw_addr);
+ /* if gw_addr is local we will fail to detect this in case
+ * address is still TENTATIVE (DAD in progress). rt6_lookup()
+ * will return already-added prefix route via interface that
+ * prefix route was assigned to, which might be non-loopback.
+ */
+ err = -EINVAL;
+ if (ipv6_chk_addr_and_flags(net, gw_addr,
+ gwa_type & IPV6_ADDR_LINKLOCAL ?
+ dev : NULL, 0, 0))
+ goto out;
+
+ rt->rt6i_gateway = *gw_addr;
+
if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
struct rt6_info *grt;
@@ -1663,7 +1926,6 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
(SIT, PtP, NBMA NOARP links) it is handy to allow
some exceptions. --ANK
*/
- err = -EINVAL;
if (!(gwa_type & IPV6_ADDR_UNICAST))
goto out;
@@ -1718,9 +1980,7 @@ install_route:
cfg->fc_nlinfo.nl_net = dev_net(dev);
- *rt_ret = rt;
-
- return 0;
+ return rt;
out:
if (dev)
dev_put(dev);
@@ -1729,20 +1989,21 @@ out:
if (rt)
dst_free(&rt->dst);
- *rt_ret = NULL;
-
- return err;
+ return ERR_PTR(err);
}
int ip6_route_add(struct fib6_config *cfg)
{
struct mx6_config mxc = { .mx = NULL, };
- struct rt6_info *rt = NULL;
+ struct rt6_info *rt;
int err;
- err = ip6_route_info_create(cfg, &rt);
- if (err)
+ rt = ip6_route_info_create(cfg);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto out;
+ }
err = ip6_convert_metrics(&mxc, cfg);
if (err)
@@ -1766,7 +2027,8 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
struct fib6_table *table;
struct net *net = dev_net(rt->dst.dev);
- if (rt == net->ipv6.ip6_null_entry) {
+ if (rt == net->ipv6.ip6_null_entry ||
+ rt->dst.flags & DST_NOCACHE) {
err = -ENOENT;
goto out;
}
@@ -1808,6 +2070,9 @@ static int ip6_route_del(struct fib6_config *cfg)
if (fn) {
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+ if ((rt->rt6i_flags & RTF_CACHE) &&
+ !(cfg->fc_flags & RTF_CACHE))
+ continue;
if (cfg->fc_ifindex &&
(!rt->dst.dev ||
rt->dst.dev->ifindex != cfg->fc_ifindex))
@@ -1830,7 +2095,6 @@ static int ip6_route_del(struct fib6_config *cfg)
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
struct netevent_redirect netevent;
struct rt6_info *rt, *nrt = NULL;
struct ndisc_options ndopts;
@@ -1891,7 +2155,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
}
rt = (struct rt6_info *) dst;
- if (rt == net->ipv6.ip6_null_entry) {
+ if (rt->rt6i_flags & RTF_REJECT) {
net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
return;
}
@@ -1917,7 +2181,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
NEIGH_UPDATE_F_ISROUTER))
);
- nrt = ip6_rt_copy(rt, &msg->dest);
+ nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
if (!nrt)
goto out;
@@ -1949,42 +2213,36 @@ out:
* Misc support functions
*/
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
- const struct in6_addr *dest)
+static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
- struct net *net = dev_net(ort->dst.dev);
- struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
- ort->rt6i_table);
+ BUG_ON(from->dst.from);
- if (rt) {
- rt->dst.input = ort->dst.input;
- rt->dst.output = ort->dst.output;
- rt->dst.flags |= DST_HOST;
-
- rt->rt6i_dst.addr = *dest;
- rt->rt6i_dst.plen = 128;
- dst_copy_metrics(&rt->dst, &ort->dst);
- rt->dst.error = ort->dst.error;
- rt->rt6i_idev = ort->rt6i_idev;
- if (rt->rt6i_idev)
- in6_dev_hold(rt->rt6i_idev);
- rt->dst.lastuse = jiffies;
-
- if (ort->rt6i_flags & RTF_GATEWAY)
- rt->rt6i_gateway = ort->rt6i_gateway;
- else
- rt->rt6i_gateway = *dest;
- rt->rt6i_flags = ort->rt6i_flags;
- rt6_set_from(rt, ort);
- rt->rt6i_metric = 0;
+ rt->rt6i_flags &= ~RTF_EXPIRES;
+ dst_hold(&from->dst);
+ rt->dst.from = &from->dst;
+ dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
+}
+static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
+{
+ rt->dst.input = ort->dst.input;
+ rt->dst.output = ort->dst.output;
+ rt->rt6i_dst = ort->rt6i_dst;
+ rt->dst.error = ort->dst.error;
+ rt->rt6i_idev = ort->rt6i_idev;
+ if (rt->rt6i_idev)
+ in6_dev_hold(rt->rt6i_idev);
+ rt->dst.lastuse = jiffies;
+ rt->rt6i_gateway = ort->rt6i_gateway;
+ rt->rt6i_flags = ort->rt6i_flags;
+ rt6_set_from(rt, ort);
+ rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
- memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+ rt->rt6i_src = ort->rt6i_src;
#endif
- memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
- rt->rt6i_table = ort->rt6i_table;
- }
- return rt;
+ rt->rt6i_prefsrc = ort->rt6i_prefsrc;
+ rt->rt6i_table = ort->rt6i_table;
+ rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -2026,7 +2284,6 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
unsigned int pref)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_INFO,
.fc_metric = IP6_RT_PRIO_USER,
.fc_ifindex = ifindex,
.fc_dst_len = prefixlen,
@@ -2037,6 +2294,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
.fc_nlinfo.nl_net = net,
};
+ cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
cfg.fc_dst = *prefix;
cfg.fc_gateway = *gwaddr;
@@ -2077,7 +2335,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
unsigned int pref)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_DFLT,
+ .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
.fc_metric = IP6_RT_PRIO_USER,
.fc_ifindex = dev->ifindex,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
@@ -2124,7 +2382,8 @@ static void rtmsg_to_fib6_config(struct net *net,
{
memset(cfg, 0, sizeof(*cfg));
- cfg->fc_table = RT6_TABLE_MAIN;
+ cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
+ : RT6_TABLE_MAIN;
cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
cfg->fc_metric = rtmsg->rtmsg_metric;
cfg->fc_expires = rtmsg->rtmsg_info;
@@ -2208,7 +2467,7 @@ static int ip6_pkt_discard(struct sk_buff *skb)
return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
-static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
+static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
skb->dev = skb_dst(skb)->dev;
return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
@@ -2219,7 +2478,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb)
return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
-static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
+static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
skb->dev = skb_dst(skb)->dev;
return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
@@ -2233,9 +2492,10 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
const struct in6_addr *addr,
bool anycast)
{
+ u32 tb_id;
struct net *net = dev_net(idev->dev);
struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
- DST_NOCOUNT, NULL);
+ DST_NOCOUNT);
if (!rt)
return ERR_PTR(-ENOMEM);
@@ -2255,7 +2515,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
rt->rt6i_gateway = *addr;
rt->rt6i_dst.addr = *addr;
rt->rt6i_dst.plen = 128;
- rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
+ tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
+ rt->rt6i_table = fib6_get_table(net, tb_id);
+ rt->dst.flags |= DST_NOCACHE;
atomic_set(&rt->dst.__refcnt, 1);
@@ -2359,6 +2621,8 @@ void rt6_ifdown(struct net *net, struct net_device *dev)
fib6_clean_all(net, fib6_ifdown, &adn);
icmp6_clean_all(fib6_ifdown, &adn);
+ if (dev)
+ rt6_uncached_list_flush_dev(net, dev);
}
struct rt6_mtu_change_arg {
@@ -2396,11 +2660,20 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
PMTU discovery.
*/
if (rt->dst.dev == arg->dev &&
- !dst_metric_locked(&rt->dst, RTAX_MTU) &&
- (dst_mtu(&rt->dst) >= arg->mtu ||
- (dst_mtu(&rt->dst) < arg->mtu &&
- dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
- dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+ !dst_metric_locked(&rt->dst, RTAX_MTU)) {
+ if (rt->rt6i_flags & RTF_CACHE) {
+ /* For RTF_CACHE with rt6i_pmtu == 0
+ * (i.e. a redirected route),
+ * the metrics of its rt->dst.from have already
+ * been updated.
+ */
+ if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
+ rt->rt6i_pmtu = arg->mtu;
+ } else if (dst_mtu(&rt->dst) >= arg->mtu ||
+ (dst_mtu(&rt->dst) < arg->mtu &&
+ dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
+ dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+ }
}
return 0;
}
@@ -2423,6 +2696,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_PREF] = { .type = NLA_U8 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2457,6 +2732,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (rtm->rtm_type == RTN_LOCAL)
cfg->fc_flags |= RTF_LOCAL;
+ if (rtm->rtm_flags & RTM_F_CLONED)
+ cfg->fc_flags |= RTF_CACHE;
+
cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
cfg->fc_nlinfo.nlh = nlh;
cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
@@ -2514,6 +2792,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags |= RTF_PREF(pref);
}
+ if (tb[RTA_ENCAP])
+ cfg->fc_encap = tb[RTA_ENCAP];
+
+ if (tb[RTA_ENCAP_TYPE])
+ cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+
err = 0;
errout:
return err;
@@ -2605,11 +2889,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
+ r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+ if (nla)
+ r_cfg.fc_encap_type = nla_get_u16(nla);
}
- err = ip6_route_info_create(&r_cfg, &rt);
- if (err)
+ rt = ip6_route_info_create(&r_cfg);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto cleanup;
+ }
err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
if (err) {
@@ -2658,8 +2949,7 @@ cleanup:
list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
if (nh->rt6_info)
dst_free(&nh->rt6_info->dst);
- if (nh->mxc.mx)
- kfree(nh->mxc.mx);
+ kfree(nh->mxc.mx);
list_del(&nh->next);
kfree(nh);
}
@@ -2734,7 +3024,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
return ip6_route_add(&cfg);
}
-static inline size_t rt6_nlmsg_size(void)
+static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
{
return NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(16) /* RTA_SRC */
@@ -2748,7 +3038,8 @@ static inline size_t rt6_nlmsg_size(void)
+ RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
- + nla_total_size(1); /* RTA_PREF */
+ + nla_total_size(1) /* RTA_PREF */
+ + lwtunnel_get_encap_size(rt->dst.lwtstate);
}
static int rt6_fill_node(struct net *net,
@@ -2757,6 +3048,7 @@ static int rt6_fill_node(struct net *net,
int iif, int type, u32 portid, u32 seq,
int prefix, int nowait, unsigned int flags)
{
+ u32 metrics[RTAX_MAX];
struct rtmsg *rtm;
struct nlmsghdr *nlh;
long expires;
@@ -2808,6 +3100,11 @@ static int rt6_fill_node(struct net *net,
else
rtm->rtm_type = RTN_UNICAST;
rtm->rtm_flags = 0;
+ if (!netif_carrier_ok(rt->dst.dev)) {
+ rtm->rtm_flags |= RTNH_F_LINKDOWN;
+ if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+ rtm->rtm_flags |= RTNH_F_DEAD;
+ }
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
rtm->rtm_protocol = rt->rt6i_protocol;
if (rt->rt6i_flags & RTF_DYNAMIC)
@@ -2870,7 +3167,10 @@ static int rt6_fill_node(struct net *net,
goto nla_put_failure;
}
- if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+ memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+ if (rt->rt6i_pmtu)
+ metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
+ if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
if (rt->rt6i_flags & RTF_GATEWAY) {
@@ -2892,6 +3192,8 @@ static int rt6_fill_node(struct net *net,
if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
goto nla_put_failure;
+ lwtunnel_fill_encap(skb, rt->dst.lwtstate);
+
nlmsg_end(skb, nlh);
return 0;
@@ -2977,6 +3279,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
} else {
fl6.flowi6_oif = oif;
+ if (netif_index_is_l3_master(net, oif)) {
+ fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
+ FLOWI_FLAG_SKIP_NH_OIF;
+ }
+
rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
}
@@ -3008,7 +3315,8 @@ errout:
return err;
}
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+ unsigned int nlm_flags)
{
struct sk_buff *skb;
struct net *net = info->nl_net;
@@ -3018,12 +3326,12 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
- skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
if (!skb)
goto errout;
err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
- event, info->portid, seq, 0, 0, 0);
+ event, info->portid, seq, 0, 0, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -3365,6 +3673,7 @@ static struct notifier_block ip6_route_dev_notifier = {
int __init ip6_route_init(void)
{
int ret;
+ int cpu;
ret = -ENOMEM;
ip6_dst_ops_template.kmem_cachep =
@@ -3424,6 +3733,13 @@ int __init ip6_route_init(void)
if (ret)
goto out_register_late_subsys;
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+
+ INIT_LIST_HEAD(&ul->head);
+ spin_lock_init(&ul->lock);
+ }
+
out:
return ret;
diff --git a/kernel/net/ipv6/sit.c b/kernel/net/ipv6/sit.c
index ac35a2859..dcccae861 100644
--- a/kernel/net/ipv6/sit.c
+++ b/kernel/net/ipv6/sit.c
@@ -742,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb)
goto drop;
if (iptunnel_pull_header(skb, 0, tpi.proto))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
}
return 1;
@@ -1394,34 +1394,20 @@ static int ipip6_tunnel_init(struct net_device *dev)
return 0;
}
-static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
+static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
struct net *net = dev_net(dev);
struct sit_net *sitn = net_generic(net, sit_net_id);
- tunnel->dev = dev;
- tunnel->net = dev_net(dev);
-
iph->version = 4;
iph->protocol = IPPROTO_IPV6;
iph->ihl = 5;
iph->ttl = 64;
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
- if (!tunnel->dst_cache) {
- free_percpu(dev->tstats);
- return -ENOMEM;
- }
-
dev_hold(dev);
rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
- return 0;
}
static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1831,23 +1817,19 @@ static int __net_init sit_init_net(struct net *net)
*/
sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
- err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
- if (err)
- goto err_dev_free;
-
- ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
err = register_netdev(sitn->fb_tunnel_dev);
if (err)
goto err_reg_dev;
+ ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
+ ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
+
t = netdev_priv(sitn->fb_tunnel_dev);
strcpy(t->parms.name, sitn->fb_tunnel_dev->name);
return 0;
err_reg_dev:
- dev_put(sitn->fb_tunnel_dev);
-err_dev_free:
ipip6_dev_free(sitn->fb_tunnel_dev);
err_alloc_dev:
return err;
diff --git a/kernel/net/ipv6/syncookies.c b/kernel/net/ipv6/syncookies.c
index 21bc2eb53..eaf7ac496 100644
--- a/kernel/net/ipv6/syncookies.c
+++ b/kernel/net/ipv6/syncookies.c
@@ -41,23 +41,6 @@ static __u16 const msstab[] = {
9000 - 60,
};
-static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct sock *child;
-
- child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
- if (child) {
- atomic_set(&req->rsk_refcnt, 1);
- inet_csk_reqsk_queue_add(sk, req, child);
- } else {
- reqsk_free(req);
- }
- return child;
-}
-
static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
ipv6_cookie_scratch);
@@ -131,14 +114,11 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
}
EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence);
-__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp)
+__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
- tcp_synq_overflow(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
return __cookie_v6_init_sequence(iph, th, mssp);
}
@@ -190,7 +170,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk);
+ req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false);
if (!req)
goto out;
@@ -227,7 +207,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
ireq->wscale_ok = tcp_opt.wscale_ok;
ireq->tstamp_ok = tcp_opt.saw_tstamp;
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
- treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+ treq->snt_synack.v64 = 0;
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
@@ -242,7 +222,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_TCP;
fl6.daddr = ireq->ir_v6_rmt_addr;
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
fl6.saddr = ireq->ir_v6_loc_addr;
fl6.flowi6_oif = sk->sk_bound_dev_if;
fl6.flowi6_mark = ireq->ir_mark;
@@ -255,16 +235,16 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
goto out_free;
}
- req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
+ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
- &req->rcv_wnd, &req->window_clamp,
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
- ret = get_cookie_sock(sk, skb, req, dst);
+ ret = tcp_get_cookie_sock(sk, skb, req, dst);
out:
return ret;
out_free:
diff --git a/kernel/net/ipv6/sysctl_net_ipv6.c b/kernel/net/ipv6/sysctl_net_ipv6.c
index abcc79f64..45243bbe5 100644
--- a/kernel/net/ipv6/sysctl_net_ipv6.c
+++ b/kernel/net/ipv6/sysctl_net_ipv6.c
@@ -17,6 +17,9 @@
#include <net/inet_frag.h>
static int one = 1;
+static int auto_flowlabels_min;
+static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
+
static struct ctl_table ipv6_table_template[] = {
{
@@ -45,7 +48,9 @@ static struct ctl_table ipv6_table_template[] = {
.data = &init_net.ipv6.sysctl.auto_flowlabels,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &auto_flowlabels_min,
+ .extra2 = &auto_flowlabels_max
},
{
.procname = "fwmark_reflect",
@@ -68,6 +73,20 @@ static struct ctl_table ipv6_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "flowlabel_state_ranges",
+ .data = &init_net.ipv6.sysctl.flowlabel_state_ranges,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_nonlocal_bind",
+ .data = &init_net.ipv6.sysctl.ip_nonlocal_bind,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
@@ -109,6 +128,8 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect;
ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
+ ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
+ ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
diff --git a/kernel/net/ipv6/tcp_ipv6.c b/kernel/net/ipv6/tcp_ipv6.c
index e541d68db..b8d405623 100644
--- a/kernel/net/ipv6/tcp_ipv6.c
+++ b/kernel/net/ipv6/tcp_ipv6.c
@@ -70,8 +70,8 @@
#include <linux/crypto.h>
#include <linux/scatterlist.h>
-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
+static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
@@ -82,7 +82,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific;
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
#else
-static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
+static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
const struct in6_addr *addr)
{
return NULL;
@@ -93,14 +93,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- if (dst) {
+ if (dst && dst_hold_safe(dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst;
- dst_hold(dst);
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
- if (rt->rt6i_node)
- inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
+ inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
}
}
@@ -121,7 +119,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct ipv6_pinfo *np = inet6_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct in6_addr *saddr = NULL, *final_p, final;
- struct rt6_info *rt;
+ struct ipv6_txoptions *opt;
struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
@@ -237,7 +235,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ final_p = fl6_update_dst(&fl6, opt, &final);
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
@@ -257,18 +256,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
sk->sk_gso_type = SKB_GSO_TCPV6;
- __ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, NULL, NULL);
- rt = (struct rt6_info *) dst;
if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp &&
- ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr))
+ ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr))
tcp_fetch_timewait_stamp(sk, dst);
icsk->icsk_ext_hdr_len = 0;
- if (np->opt)
- icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
- np->opt->opt_nflen);
+ if (opt)
+ icsk->icsk_ext_hdr_len = opt->opt_flen +
+ opt->opt_nflen;
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
@@ -279,7 +277,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (err)
goto late_failure;
- ip6_set_txhash(sk);
+ sk_set_txhash(sk);
if (!tp->write_seq && likely(!tp->repair))
tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
@@ -330,6 +328,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct tcp_sock *tp;
__u32 seq, snd_una;
struct sock *sk;
+ bool fatal;
int err;
sk = __inet6_lookup_established(net, &tcp_hashinfo,
@@ -348,8 +347,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return;
}
seq = ntohl(th->seq);
+ fatal = icmpv6_err_convert(type, code, &err);
if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq);
+ return tcp_req_err(sk, seq, fatal);
bh_lock_sock(sk);
if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -403,7 +403,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out;
}
- icmpv6_err_convert(type, code, &err);
/* Might be for an request_sock */
switch (sk->sk_state) {
@@ -437,11 +436,11 @@ out:
}
-static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
+static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
- u16 queue_mapping,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ bool attach_req)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -450,10 +449,11 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
int err = -ENOMEM;
/* First, grab a route. */
- if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL)
+ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
+ IPPROTO_TCP)) == NULL)
goto done;
- skb = tcp_make_synack(sk, dst, req, foc);
+ skb = tcp_make_synack(sk, dst, req, foc, attach_req);
if (skb) {
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
@@ -463,8 +463,10 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
if (np->repflow && ireq->pktopts)
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
- skb_set_queue_mapping(skb, queue_mapping);
- err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
+ rcu_read_lock();
+ err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt),
+ np->tclass);
+ rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -479,13 +481,13 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req)
}
#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
+static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
const struct in6_addr *addr)
{
return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6);
}
-static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk,
+static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr);
@@ -624,8 +626,12 @@ clear_hash_noput:
return 1;
}
-static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+#endif
+
+static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
+ const struct sk_buff *skb)
{
+#ifdef CONFIG_TCP_MD5SIG
const __u8 *hash_location = NULL;
struct tcp_md5sig_key *hash_expected;
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
@@ -662,26 +668,27 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
&ip6h->daddr, ntohs(th->dest));
return true;
}
+#endif
return false;
}
-#endif
-static void tcp_v6_init_req(struct request_sock *req, struct sock *sk,
+static void tcp_v6_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
- struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk_listener);
ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
/* So that link locals have meaning */
- if (!sk->sk_bound_dev_if &&
+ if (!sk_listener->sk_bound_dev_if &&
ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
ireq->ir_iif = tcp_v6_iif(skb);
if (!TCP_SKB_CB(skb)->tcp_tw_isn &&
- (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
+ (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) ||
np->rxopt.bits.rxinfo ||
np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
np->rxopt.bits.rxohlim || np->repflow)) {
@@ -690,13 +697,14 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk,
}
}
-static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
+static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
+ struct flowi *fl,
const struct request_sock *req,
bool *strict)
{
if (strict)
*strict = true;
- return inet6_csk_route_req(sk, &fl->u.ip6, req);
+ return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP);
}
struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
@@ -723,10 +731,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.route_req = tcp_v6_route_req,
.init_seq = tcp_v6_init_sequence,
.send_synack = tcp_v6_send_synack,
- .queue_hash_add = inet6_csk_reqsk_queue_hash_add,
};
-static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq,
+static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
u8 tclass, u32 label)
@@ -825,7 +832,7 @@ static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq,
kfree_skb(buff);
}
-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
u32 seq = 0, ack_seq = 0;
@@ -896,7 +903,7 @@ release_sk1:
#endif
}
-static void tcp_v6_send_ack(struct sock *sk, struct sk_buff *skb, u32 seq,
+static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
u32 label)
@@ -919,7 +926,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
inet_twsk_put(tw);
}
-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
@@ -927,44 +934,18 @@ static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
0, 0);
}
-static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
+static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
{
+#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
- struct request_sock *req;
- struct sock *nsk;
-
- /* Find possible connection requests. */
- req = inet6_csk_search_req(sk, th->source,
- &ipv6_hdr(skb)->saddr,
- &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb));
- if (req) {
- nsk = tcp_check_req(sk, skb, req, false);
- if (!nsk || nsk == sk)
- reqsk_put(req);
- return nsk;
- }
- nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
- &ipv6_hdr(skb)->saddr, th->source,
- &ipv6_hdr(skb)->daddr, ntohs(th->dest),
- tcp_v6_iif(skb));
-
- if (nsk) {
- if (nsk->sk_state != TCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-#ifdef CONFIG_SYN_COOKIES
if (!th->syn)
sk = cookie_v6_check(sk, skb);
#endif
@@ -987,12 +968,16 @@ drop:
return 0; /* don't send reset */
}
-static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst)
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
{
struct inet_request_sock *ireq;
- struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+ struct ipv6_pinfo *newnp;
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt;
struct tcp6_sock *newtcp6sk;
struct inet_sock *newinet;
struct tcp_sock *newtp;
@@ -1007,7 +992,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
* v6 mapped
*/
- newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst);
+ newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst,
+ req_unhash, own_req);
if (!newsk)
return NULL;
@@ -1060,7 +1046,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
goto out_overflow;
if (!dst) {
- dst = inet6_csk_route_req(sk, &fl6, req);
+ dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP);
if (!dst)
goto out;
}
@@ -1076,7 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
*/
newsk->sk_gso_type = SKB_GSO_TCPV6;
- __ip6_dst_store(newsk, dst, NULL, NULL);
+ ip6_dst_store(newsk, dst, NULL, NULL);
inet6_sk_rx_dst_set(newsk, skb);
newtcp6sk = (struct tcp6_sock *)newsk;
@@ -1093,8 +1079,6 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
newsk->sk_bound_dev_if = ireq->ir_iif;
- ip6_set_txhash(newsk);
-
/* Now IPv6 options...
First: no IPv4 options.
@@ -1106,16 +1090,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
- /* Clone pktoptions received with SYN */
newnp->pktoptions = NULL;
- if (ireq->pktopts) {
- newnp->pktoptions = skb_clone(ireq->pktopts,
- sk_gfp_atomic(sk, GFP_ATOMIC));
- consume_skb(ireq->pktopts);
- ireq->pktopts = NULL;
- if (newnp->pktoptions)
- skb_set_owner_r(newnp->pktoptions, newsk);
- }
newnp->opt = NULL;
newnp->mcast_oif = tcp_v6_iif(skb);
newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
@@ -1129,13 +1104,15 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
but we do one more thing there: reattach optmem
to newsk.
*/
- if (np->opt)
- newnp->opt = ipv6_dup_options(newsk, np->opt);
-
+ opt = rcu_dereference(np->opt);
+ if (opt) {
+ opt = ipv6_dup_options(newsk, opt);
+ RCU_INIT_POINTER(newnp->opt, opt);
+ }
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (newnp->opt)
- inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
- newnp->opt->opt_flen);
+ if (opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
+ opt->opt_flen;
tcp_ca_openreq_child(newsk, dst);
@@ -1170,7 +1147,20 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
tcp_done(newsk);
goto out;
}
- __inet_hash(newsk, NULL);
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ if (*own_req) {
+ tcp_move_syn(newtp, req);
+
+ /* Clone pktoptions received with SYN, if we own the req */
+ if (ireq->pktopts) {
+ newnp->pktoptions = skb_clone(ireq->pktopts,
+ sk_gfp_atomic(sk, GFP_ATOMIC));
+ consume_skb(ireq->pktopts);
+ ireq->pktopts = NULL;
+ if (newnp->pktoptions)
+ skb_set_owner_r(newnp->pktoptions, newsk);
+ }
+ }
return newsk;
@@ -1184,7 +1174,7 @@ out:
}
/* The socket must have its spinlock held when we get
- * here.
+ * here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
@@ -1251,22 +1241,18 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
}
- if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+ if (tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
- struct sock *nsk = tcp_v6_hnd_req(sk, skb);
+ struct sock *nsk = tcp_v6_cookie_check(sk, skb);
+
if (!nsk)
goto discard;
- /*
- * Queue it on the new socket if the new socket is active,
- * otherwise we just shortcircuit this and continue with
- * the new socket..
- */
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
- sk_mark_napi_id(sk, skb);
+ sk_mark_napi_id(nsk, skb);
if (tcp_child_process(sk, nsk, skb))
goto reset;
if (opt_skb)
@@ -1276,7 +1262,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
} else
sock_rps_save_rxhash(sk, skb);
- if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len))
+ if (tcp_rcv_state_process(sk, skb))
goto reset;
if (opt_skb)
goto ipv6_pktoptions;
@@ -1390,6 +1376,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
th = tcp_hdr(skb);
hdr = ipv6_hdr(skb);
+lookup:
sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest,
inet6_iif(skb));
if (!sk)
@@ -1399,6 +1386,37 @@ process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ struct sock *nsk;
+
+ sk = req->rsk_listener;
+ tcp_v6_fill_cb(skb, hdr, th);
+ if (tcp_v6_inbound_md5_hash(sk, skb)) {
+ reqsk_put(req);
+ goto discard_it;
+ }
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sock_hold(sk);
+ nsk = tcp_check_req(sk, skb, req, false);
+ if (!nsk) {
+ reqsk_put(req);
+ goto discard_and_relse;
+ }
+ if (nsk == sk) {
+ reqsk_put(req);
+ tcp_v6_restore_cb(skb);
+ } else if (tcp_child_process(sk, nsk, skb)) {
+ tcp_v6_send_reset(nsk, skb);
+ goto discard_and_relse;
+ } else {
+ sock_put(sk);
+ return 0;
+ }
+ }
if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
@@ -1409,18 +1427,23 @@ process:
tcp_v6_fill_cb(skb, hdr, th);
-#ifdef CONFIG_TCP_MD5SIG
if (tcp_v6_inbound_md5_hash(sk, skb))
goto discard_and_relse;
-#endif
if (sk_filter(sk, skb))
goto discard_and_relse;
- sk_incoming_cpu_update(sk);
skb->dev = NULL;
+ if (sk->sk_state == TCP_LISTEN) {
+ ret = tcp_v6_do_rcv(sk, skb);
+ goto put_and_return;
+ }
+
+ sk_incoming_cpu_update(sk);
+
bh_lock_sock_nested(sk);
+ tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
@@ -1433,6 +1456,7 @@ process:
}
bh_unlock_sock(sk);
+put_and_return:
sock_put(sk);
return ret ? -1 : 0;
@@ -1442,7 +1466,7 @@ no_tcp_socket:
tcp_v6_fill_cb(skb, hdr, th);
- if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
+ if (tcp_checksum_complete(skb)) {
csum_error:
TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
@@ -1467,10 +1491,6 @@ do_time_wait:
tcp_v6_fill_cb(skb, hdr, th);
- if (skb->len < (th->doff<<2)) {
- inet_twsk_put(inet_twsk(sk));
- goto bad_packet;
- }
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
@@ -1487,8 +1507,7 @@ do_time_wait:
ntohs(th->dest), tcp_v6_iif(skb));
if (sk2) {
struct inet_timewait_sock *tw = inet_twsk(sk);
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
sk = sk2;
tcp_v6_restore_cb(skb);
goto process;
@@ -1638,7 +1657,7 @@ static void tcp_v6_destroy_sock(struct sock *sk)
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCPv6 sock list dumping. */
static void get_openreq6(struct seq_file *seq,
- struct request_sock *req, int i, kuid_t uid)
+ const struct request_sock *req, int i)
{
long ttd = req->rsk_timer.expires - jiffies;
const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr;
@@ -1662,7 +1681,8 @@ static void get_openreq6(struct seq_file *seq,
1, /* timers active (only the expire timer) */
jiffies_to_clock_t(ttd),
req->num_timeout,
- from_kuid_munged(seq_user_ns(seq), uid),
+ from_kuid_munged(seq_user_ns(seq),
+ sock_i_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0, req);
@@ -1677,7 +1697,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
const struct inet_sock *inet = inet_sk(sp);
const struct tcp_sock *tp = tcp_sk(sp);
const struct inet_connection_sock *icsk = inet_csk(sp);
- struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+ const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
+ int rx_queue;
+ int state;
dest = &sp->sk_v6_daddr;
src = &sp->sk_v6_rcv_saddr;
@@ -1698,6 +1720,15 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
timer_expires = jiffies;
}
+ state = sk_state_load(sp);
+ if (state == TCP_LISTEN)
+ rx_queue = sp->sk_ack_backlog;
+ else
+ /* Because we don't lock the socket,
+ * we might find a transient negative value.
+ */
+ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
"%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n",
@@ -1706,9 +1737,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
src->s6_addr32[2], src->s6_addr32[3], srcp,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
- sp->sk_state,
- tp->write_seq-tp->snd_una,
- (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
+ state,
+ tp->write_seq - tp->snd_una,
+ rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
icsk->icsk_retransmits,
@@ -1720,8 +1751,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
- sp->sk_state == TCP_LISTEN ?
- (fastopenq ? fastopenq->max_qlen : 0) :
+ state == TCP_LISTEN ?
+ fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
);
}
@@ -1767,18 +1798,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v)
}
st = seq->private;
- switch (st->state) {
- case TCP_SEQ_STATE_LISTENING:
- case TCP_SEQ_STATE_ESTABLISHED:
- if (sk->sk_state == TCP_TIME_WAIT)
- get_timewait6_sock(seq, v, st->num);
- else
- get_tcp6_sock(seq, v, st->num);
- break;
- case TCP_SEQ_STATE_OPENREQ:
- get_openreq6(seq, v, st->num, st->uid);
- break;
- }
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait6_sock(seq, v, st->num);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ get_openreq6(seq, v, st->num);
+ else
+ get_tcp6_sock(seq, v, st->num);
out:
return 0;
}
diff --git a/kernel/net/ipv6/tunnel6.c b/kernel/net/ipv6/tunnel6.c
index 3c758007b..dae25cad0 100644
--- a/kernel/net/ipv6/tunnel6.c
+++ b/kernel/net/ipv6/tunnel6.c
@@ -144,6 +144,16 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
break;
}
+static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnel46_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ break;
+}
+
static const struct inet6_protocol tunnel6_protocol = {
.handler = tunnel6_rcv,
.err_handler = tunnel6_err,
@@ -152,7 +162,7 @@ static const struct inet6_protocol tunnel6_protocol = {
static const struct inet6_protocol tunnel46_protocol = {
.handler = tunnel46_rcv,
- .err_handler = tunnel6_err,
+ .err_handler = tunnel46_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
diff --git a/kernel/net/ipv6/udp.c b/kernel/net/ipv6/udp.c
index e51fc3eee..9da3287a3 100644
--- a/kernel/net/ipv6/udp.c
+++ b/kernel/net/ipv6/udp.c
@@ -182,10 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
score++;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
+
return score;
}
-#define SCORE2_MAX (1 + 1 + 1)
static inline int compute_score2(struct sock *sk, struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr,
@@ -223,6 +225,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
score++;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
+
return score;
}
@@ -251,8 +256,7 @@ begin:
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
matches = 1;
- } else if (score == SCORE2_MAX)
- goto exact_match;
+ }
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -269,7 +273,6 @@ begin:
goto begin;
if (result) {
-exact_match:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -1107,6 +1110,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
struct in6_addr *daddr, *final_p, final;
struct ipv6_txoptions *opt = NULL;
+ struct ipv6_txoptions *opt_to_free = NULL;
struct ip6_flowlabel *flowlabel = NULL;
struct flowi6 fl6;
struct dst_entry *dst;
@@ -1260,8 +1264,10 @@ do_udp_sendmsg:
opt = NULL;
connected = 0;
}
- if (!opt)
- opt = np->opt;
+ if (!opt) {
+ opt = txopt_get(np);
+ opt_to_free = opt;
+ }
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
@@ -1370,6 +1376,7 @@ release_dst:
out:
dst_release(dst);
fl6_sock_release(flowlabel);
+ txopt_put(opt_to_free);
if (!err)
return len;
/*
@@ -1496,7 +1503,8 @@ int __net_init udp6_proc_init(struct net *net)
return udp_proc_register(net, &udp6_seq_afinfo);
}
-void udp6_proc_exit(struct net *net) {
+void udp6_proc_exit(struct net *net)
+{
udp_proc_unregister(net, &udp6_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
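
For orientation, here is a minimal sketch of the scoring change in the two udp.c hunks above. The helper name and the base_score parameter are hypothetical; only the sk_incoming_cpu bonus is taken from the patch. Once that bonus exists, a socket matching the full 4-tuple is no longer guaranteed to hold the maximum possible score, which is why the SCORE2_MAX/exact_match shortcut had to be removed as well.

#include <linux/smp.h>
#include <net/sock.h>

/* Hypothetical helper, not the real compute_score2(): the address and
 * port comparisons are folded into base_score.  The only new piece is
 * the CPU-affinity bonus added by this patch.
 */
static int udp6_score_sketch(const struct sock *sk, int base_score)
{
        int score = base_score;

        if (sk->sk_incoming_cpu == raw_smp_processor_id())
                score++;        /* prefer the socket last serviced on this CPU */

        return score;
}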
diff --git a/kernel/net/ipv6/xfrm6_input.c b/kernel/net/ipv6/xfrm6_input.c
index 74bd17882..0eaab1fa6 100644
--- a/kernel/net/ipv6/xfrm6_input.c
+++ b/kernel/net/ipv6/xfrm6_input.c
@@ -42,8 +42,8 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
ipv6_hdr(skb)->payload_len = htons(skb->len);
__skb_push(skb, skb->data - skb_network_header(skb));
- NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb,
- skb->dev, NULL,
+ NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
ip6_rcv_finish);
return -1;
}
diff --git a/kernel/net/ipv6/xfrm6_mode_tunnel.c b/kernel/net/ipv6/xfrm6_mode_tunnel.c
index 901ef6f8a..372855eea 100644
--- a/kernel/net/ipv6/xfrm6_mode_tunnel.c
+++ b/kernel/net/ipv6/xfrm6_mode_tunnel.c
@@ -20,11 +20,10 @@
static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
{
- const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
struct ipv6hdr *inner_iph = ipipv6_hdr(skb);
- if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
- IP6_ECN_set_ce(inner_iph);
+ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+ IP6_ECN_set_ce(skb, inner_iph);
}
/* Add encapsulation header.
diff --git a/kernel/net/ipv6/xfrm6_output.c b/kernel/net/ipv6/xfrm6_output.c
index 09c76a7b4..4d09ce6fa 100644
--- a/kernel/net/ipv6/xfrm6_output.c
+++ b/kernel/net/ipv6/xfrm6_output.c
@@ -79,6 +79,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)
if (!skb->ignore_df && skb->len > mtu) {
skb->dev = dst->dev;
+ skb->protocol = htons(ETH_P_IPV6);
if (xfrm6_local_dontfrag(skb))
xfrm6_local_rxpmtu(skb, mtu);
@@ -131,44 +132,57 @@ int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb)
return xfrm_output(sk, skb);
}
-static int __xfrm6_output(struct sock *sk, struct sk_buff *skb)
+static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+ return x->outer_mode->afinfo->output_finish(sk, skb);
+}
+
+static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct xfrm_state *x = dst->xfrm;
int mtu;
+ bool toobig;
#ifdef CONFIG_NETFILTER
if (!x) {
IP6CB(skb)->flags |= IP6SKB_REROUTED;
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
+ if (x->props.mode != XFRM_MODE_TUNNEL)
+ goto skip_frag;
+
if (skb->protocol == htons(ETH_P_IPV6))
mtu = ip6_skb_dst_mtu(skb);
else
mtu = dst_mtu(skb_dst(skb));
- if (skb->len > mtu && xfrm6_local_dontfrag(skb)) {
+ toobig = skb->len > mtu && !skb_is_gso(skb);
+
+ if (toobig && xfrm6_local_dontfrag(skb)) {
xfrm6_local_rxpmtu(skb, mtu);
return -EMSGSIZE;
- } else if (!skb->ignore_df && skb->len > mtu && skb->sk) {
+ } else if (!skb->ignore_df && toobig && skb->sk) {
xfrm_local_error(skb, mtu);
return -EMSGSIZE;
}
- if (x->props.mode == XFRM_MODE_TUNNEL &&
- ((skb->len > mtu && !skb_is_gso(skb)) ||
- dst_allfrag(skb_dst(skb)))) {
- return ip6_fragment(sk, skb,
- x->outer_mode->afinfo->output_finish);
- }
+ if (toobig || dst_allfrag(skb_dst(skb)))
+ return ip6_fragment(net, sk, skb,
+ __xfrm6_output_finish);
+
+skip_frag:
return x->outer_mode->afinfo->output_finish(sk, skb);
}
-int xfrm6_output(struct sock *sk, struct sk_buff *skb)
+int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
- NULL, skb_dst(skb)->dev, __xfrm6_output,
+ return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb_dst(skb)->dev,
+ __xfrm6_output,
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
diff --git a/kernel/net/ipv6/xfrm6_policy.c b/kernel/net/ipv6/xfrm6_policy.c
index f337a908a..c074771a1 100644
--- a/kernel/net/ipv6/xfrm6_policy.c
+++ b/kernel/net/ipv6/xfrm6_policy.c
@@ -20,13 +20,14 @@
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
+#include <net/l3mdev.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
-static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos,
+static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr)
{
@@ -35,6 +36,8 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos,
int err;
memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = oif;
+ fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;
memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
if (saddr)
memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr));
@@ -50,13 +53,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos,
return dst;
}
-static int xfrm6_get_saddr(struct net *net,
+static int xfrm6_get_saddr(struct net *net, int oif,
xfrm_address_t *saddr, xfrm_address_t *daddr)
{
struct dst_entry *dst;
struct net_device *dev;
- dst = xfrm6_dst_lookup(net, 0, NULL, daddr);
+ dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr);
if (IS_ERR(dst))
return -EHOSTUNREACH;
@@ -71,20 +74,12 @@ static int xfrm6_get_tos(const struct flowi *fl)
return 0;
}
-static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst)
-{
- struct rt6_info *rt = (struct rt6_info *)xdst;
-
- rt6_init_peer(rt, net->ipv6.peers);
-}
-
static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst,
int nfheader_len)
{
if (dst->ops->family == AF_INET6) {
struct rt6_info *rt = (struct rt6_info *)dst;
- if (rt->rt6i_node)
- path->path_cookie = rt->rt6i_node->fn_sernum;
+ path->path_cookie = rt6_get_cookie(rt);
}
path->u.rt6.rt6i_nfheader_len = nfheader_len;
@@ -106,16 +101,13 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
return -ENODEV;
}
- rt6_transfer_peer(&xdst->u.rt6, rt);
-
/* Sheit... I remember I did this right. Apparently,
* it was magically lost, so this code needs audit */
xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST |
RTF_LOCAL);
xdst->u.rt6.rt6i_metric = rt->rt6i_metric;
xdst->u.rt6.rt6i_node = rt->rt6i_node;
- if (rt->rt6i_node)
- xdst->route_cookie = rt->rt6i_node->fn_sernum;
+ xdst->route_cookie = rt6_get_cookie(rt);
xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
xdst->u.rt6.rt6i_src = rt->rt6i_src;
@@ -142,7 +134,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
nexthdr = nh[nhoff];
if (skb_dst(skb))
- oif = skb_dst(skb)->dev->ifindex;
+ oif = l3mdev_fib_oif(skb_dst(skb)->dev);
memset(fl6, 0, sizeof(struct flowi6));
fl6->flowi6_mark = skb->mark;
@@ -185,7 +177,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
return;
case IPPROTO_ICMPV6:
- if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) {
+ if (!onlyproto && (nh + offset + 2 < skb->data ||
+ pskb_may_pull(skb, nh + offset + 2 - skb->data))) {
u8 *icmp;
nh = skb_network_header(skb);
@@ -199,7 +192,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
#if IS_ENABLED(CONFIG_IPV6_MIP6)
case IPPROTO_MH:
offset += ipv6_optlen(exthdr);
- if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) {
+ if (!onlyproto && (nh + offset + 3 < skb->data ||
+ pskb_may_pull(skb, nh + offset + 3 - skb->data))) {
struct ip6_mh *mh;
nh = skb_network_header(skb);
@@ -255,10 +249,6 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
if (likely(xdst->u.rt6.rt6i_idev))
in6_dev_put(xdst->u.rt6.rt6i_idev);
dst_destroy_metrics_generic(dst);
- if (rt6_has_peer(&xdst->u.rt6)) {
- struct inet_peer *peer = rt6_peer_ptr(&xdst->u.rt6);
- inet_putpeer(peer);
- }
xfrm_dst_destroy(xdst);
}
@@ -289,7 +279,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
xfrm_dst_ifdown(dst, dev);
}
-static struct dst_ops xfrm6_dst_ops = {
+static struct dst_ops xfrm6_dst_ops_template = {
.family = AF_INET6,
.gc = xfrm6_garbage_collect,
.update_pmtu = xfrm6_update_pmtu,
@@ -298,17 +288,16 @@ static struct dst_ops xfrm6_dst_ops = {
.destroy = xfrm6_dst_destroy,
.ifdown = xfrm6_dst_ifdown,
.local_out = __ip6_local_out,
- .gc_thresh = 32768,
+ .gc_thresh = INT_MAX,
};
static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
.family = AF_INET6,
- .dst_ops = &xfrm6_dst_ops,
+ .dst_ops = &xfrm6_dst_ops_template,
.dst_lookup = xfrm6_dst_lookup,
.get_saddr = xfrm6_get_saddr,
.decode_session = _decode_session6,
.get_tos = xfrm6_get_tos,
- .init_dst = xfrm6_init_dst,
.init_path = xfrm6_init_path,
.fill_dst = xfrm6_fill_dst,
.blackhole_route = ip6_blackhole_route,
@@ -336,7 +325,7 @@ static struct ctl_table xfrm6_policy_table[] = {
{ }
};
-static int __net_init xfrm6_net_init(struct net *net)
+static int __net_init xfrm6_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -364,7 +353,7 @@ err_alloc:
return -ENOMEM;
}
-static void __net_exit xfrm6_net_exit(struct net *net)
+static void __net_exit xfrm6_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
@@ -376,24 +365,52 @@ static void __net_exit xfrm6_net_exit(struct net *net)
if (!net_eq(net, &init_net))
kfree(table);
}
+#else /* CONFIG_SYSCTL */
+static int inline xfrm6_net_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void inline xfrm6_net_sysctl_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm6_net_init(struct net *net)
+{
+ int ret;
+
+ memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template,
+ sizeof(xfrm6_dst_ops_template));
+ ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops);
+ if (ret)
+ return ret;
+
+ ret = xfrm6_net_sysctl_init(net);
+ if (ret)
+ dst_entries_destroy(&net->xfrm.xfrm6_dst_ops);
+
+ return ret;
+}
+
+static void __net_exit xfrm6_net_exit(struct net *net)
+{
+ xfrm6_net_sysctl_exit(net);
+ dst_entries_destroy(&net->xfrm.xfrm6_dst_ops);
+}
static struct pernet_operations xfrm6_net_ops = {
.init = xfrm6_net_init,
.exit = xfrm6_net_exit,
};
-#endif
int __init xfrm6_init(void)
{
int ret;
- dst_entries_init(&xfrm6_dst_ops);
-
ret = xfrm6_policy_init();
- if (ret) {
- dst_entries_destroy(&xfrm6_dst_ops);
+ if (ret)
goto out;
- }
ret = xfrm6_state_init();
if (ret)
goto out_policy;
@@ -402,9 +419,7 @@ int __init xfrm6_init(void)
if (ret)
goto out_state;
-#ifdef CONFIG_SYSCTL
register_pernet_subsys(&xfrm6_net_ops);
-#endif
out:
return ret;
out_state:
@@ -416,11 +431,8 @@ out_policy:
void xfrm6_fini(void)
{
-#ifdef CONFIG_SYSCTL
unregister_pernet_subsys(&xfrm6_net_ops);
-#endif
xfrm6_protocol_fini();
xfrm6_policy_fini();
xfrm6_state_fini();
- dst_entries_destroy(&xfrm6_dst_ops);
}
diff --git a/kernel/net/ipx/af_ipx.c b/kernel/net/ipx/af_ipx.c
index 4ea5d7497..48d0dc89b 100644
--- a/kernel/net/ipx/af_ipx.c
+++ b/kernel/net/ipx/af_ipx.c
@@ -1347,7 +1347,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol,
goto out;
rc = -ENOMEM;
- sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto);
+ sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto, kern);
if (!sk)
goto out;
diff --git a/kernel/net/irda/af_irda.c b/kernel/net/irda/af_irda.c
index ee0ea25c8..923abd6b3 100644
--- a/kernel/net/irda/af_irda.c
+++ b/kernel/net/irda/af_irda.c
@@ -1086,6 +1086,9 @@ static int irda_create(struct net *net, struct socket *sock, int protocol,
struct sock *sk;
struct irda_sock *self;
+ if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ return -EINVAL;
+
if (net != &init_net)
return -EAFNOSUPPORT;
@@ -1100,7 +1103,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol,
}
/* Allocate networking socket */
- sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto);
+ sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto, kern);
if (sk == NULL)
return -ENOMEM;
@@ -2123,8 +2126,7 @@ static int irda_setsockopt(struct socket *sock, int level, int optname,
}
/* Unregister any old registration */
- if (self->skey)
- irlmp_unregister_service(self->skey);
+ irlmp_unregister_service(self->skey);
self->skey = irlmp_register_service((__u16) opt);
break;
diff --git a/kernel/net/irda/ircomm/ircomm_tty.c b/kernel/net/irda/ircomm/ircomm_tty.c
index 683346d2d..a4237707f 100644
--- a/kernel/net/irda/ircomm/ircomm_tty.c
+++ b/kernel/net/irda/ircomm/ircomm_tty.c
@@ -335,8 +335,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
* specified, we cannot return before the IrCOMM link is
* ready
*/
- if (!test_bit(ASYNCB_CLOSING, &port->flags) &&
- (do_clocal || tty_port_carrier_raised(port)) &&
+ if ((do_clocal || tty_port_carrier_raised(port)) &&
self->state == IRCOMM_TTY_READY)
{
break;
@@ -443,34 +442,6 @@ static int ircomm_tty_open(struct tty_struct *tty, struct file *filp)
/* Not really used by us, but lets do it anyway */
self->port.low_latency = (self->port.flags & ASYNC_LOW_LATENCY) ? 1 : 0;
- /*
- * If the port is the middle of closing, bail out now
- */
- if (test_bit(ASYNCB_CLOSING, &self->port.flags)) {
-
- /* Hm, why are we blocking on ASYNC_CLOSING if we
- * do return -EAGAIN/-ERESTARTSYS below anyway?
- * IMHO it's either not needed in the first place
- * or for some reason we need to make sure the async
- * closing has been finished - if so, wouldn't we
- * probably better sleep uninterruptible?
- */
-
- if (wait_event_interruptible(self->port.close_wait,
- !test_bit(ASYNCB_CLOSING, &self->port.flags))) {
- net_warn_ratelimited("%s - got signal while blocking on ASYNC_CLOSING!\n",
- __func__);
- return -ERESTARTSYS;
- }
-
-#ifdef SERIAL_DO_RESTART
- return (self->port.flags & ASYNC_HUP_NOTIFY) ?
- -EAGAIN : -ERESTARTSYS;
-#else
- return -EAGAIN;
-#endif
- }
-
/* Check if this is a "normal" ircomm device, or an irlpt device */
if (self->line < 0x10) {
self->service_type = IRCOMM_3_WIRE | IRCOMM_9_WIRE;
diff --git a/kernel/net/irda/irlmp.c b/kernel/net/irda/irlmp.c
index a26c401ef..43964594a 100644
--- a/kernel/net/irda/irlmp.c
+++ b/kernel/net/irda/irlmp.c
@@ -1839,7 +1839,7 @@ static void *irlmp_seq_hb_idx(struct irlmp_iter_state *iter, loff_t *off)
for (element = hashbin_get_first(iter->hashbin);
element != NULL;
element = hashbin_get_next(iter->hashbin)) {
- if (!off || *off-- == 0) {
+ if (!off || (*off)-- == 0) {
/* NB: hashbin left locked */
return element;
}
diff --git a/kernel/net/irda/timer.c b/kernel/net/irda/timer.c
index 0c4c115a5..f2280f73b 100644
--- a/kernel/net/irda/timer.c
+++ b/kernel/net/irda/timer.c
@@ -60,8 +60,8 @@ void irlap_start_query_timer(struct irlap_cb *self, int S, int s)
 * to avoid messing with incoming connection requests and
* to accommodate devices that perform discovery slower than us.
* Jean II */
- timeout = ((sysctl_slot_timeout * HZ / 1000) * (S - s)
- + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT);
+ timeout = msecs_to_jiffies(sysctl_slot_timeout) * (S - s)
+ + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT;
/* Set or re-set the timer. We reset the timer for each received
 * discovery query, which allows us to automatically adjust to
diff --git a/kernel/net/iucv/af_iucv.c b/kernel/net/iucv/af_iucv.c
index 6daa52a18..20ab7b2ec 100644
--- a/kernel/net/iucv/af_iucv.c
+++ b/kernel/net/iucv/af_iucv.c
@@ -95,11 +95,10 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify);
/* Call Back functions */
static void iucv_callback_rx(struct iucv_path *, struct iucv_message *);
static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *);
-static void iucv_callback_connack(struct iucv_path *, u8 ipuser[16]);
-static int iucv_callback_connreq(struct iucv_path *, u8 ipvmid[8],
- u8 ipuser[16]);
-static void iucv_callback_connrej(struct iucv_path *, u8 ipuser[16]);
-static void iucv_callback_shutdown(struct iucv_path *, u8 ipuser[16]);
+static void iucv_callback_connack(struct iucv_path *, u8 *);
+static int iucv_callback_connreq(struct iucv_path *, u8 *, u8 *);
+static void iucv_callback_connrej(struct iucv_path *, u8 *);
+static void iucv_callback_shutdown(struct iucv_path *, u8 *);
static struct iucv_sock_list iucv_sk_list = {
.lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock),
@@ -535,12 +534,12 @@ static void iucv_sock_init(struct sock *sk, struct sock *parent)
sk->sk_type = parent->sk_type;
}
-static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio)
+static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern)
{
struct sock *sk;
struct iucv_sock *iucv;
- sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto);
+ sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto, kern);
if (!sk)
return NULL;
iucv = iucv_sk(sk);
@@ -602,7 +601,7 @@ static int iucv_sock_create(struct net *net, struct socket *sock, int protocol,
return -ESOCKTNOSUPPORT;
}
- sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL);
+ sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern);
if (!sk)
return -ENOMEM;
@@ -709,6 +708,9 @@ static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr,
if (!addr || addr->sa_family != AF_IUCV)
return -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_iucv))
+ return -EINVAL;
+
lock_sock(sk);
if (sk->sk_state != IUCV_OPEN) {
err = -EBADFD;
@@ -1484,7 +1486,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock,
if (sock_writeable(sk) && iucv_below_msglim(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
@@ -1723,7 +1725,7 @@ static int iucv_callback_connreq(struct iucv_path *path,
}
/* Create the new socket */
- nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC);
+ nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
if (!nsk) {
err = pr_iucv->path_sever(path, user_data);
iucv_path_free(path);
@@ -1933,7 +1935,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
goto out;
}
- nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC);
+ nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
bh_lock_sock(sk);
if ((sk->sk_state != IUCV_LISTEN) ||
sk_acceptq_is_full(sk) ||
diff --git a/kernel/net/iucv/iucv.c b/kernel/net/iucv/iucv.c
index 2a6a1fdd6..7eaa000c9 100644
--- a/kernel/net/iucv/iucv.c
+++ b/kernel/net/iucv/iucv.c
@@ -713,7 +713,7 @@ static struct notifier_block __refdata iucv_cpu_notifier = {
*
* Sever an iucv path to free up the pathid. Used internally.
*/
-static int iucv_sever_pathid(u16 pathid, u8 userdata[16])
+static int iucv_sever_pathid(u16 pathid, u8 *userdata)
{
union iucv_param *parm;
@@ -876,7 +876,7 @@ static struct notifier_block iucv_reboot_notifier = {
* Returns the result of the CP IUCV call.
*/
int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler,
- u8 userdata[16], void *private)
+ u8 *userdata, void *private)
{
union iucv_param *parm;
int rc;
@@ -923,7 +923,7 @@ EXPORT_SYMBOL(iucv_path_accept);
* Returns the result of the CP IUCV call.
*/
int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler,
- u8 userid[8], u8 system[8], u8 userdata[16],
+ u8 *userid, u8 *system, u8 *userdata,
void *private)
{
union iucv_param *parm;
@@ -985,7 +985,7 @@ EXPORT_SYMBOL(iucv_path_connect);
*
* Returns the result from the CP IUCV call.
*/
-int iucv_path_quiesce(struct iucv_path *path, u8 userdata[16])
+int iucv_path_quiesce(struct iucv_path *path, u8 *userdata)
{
union iucv_param *parm;
int rc;
@@ -1017,7 +1017,7 @@ EXPORT_SYMBOL(iucv_path_quiesce);
*
* Returns the result from the CP IUCV call.
*/
-int iucv_path_resume(struct iucv_path *path, u8 userdata[16])
+int iucv_path_resume(struct iucv_path *path, u8 *userdata)
{
union iucv_param *parm;
int rc;
@@ -1047,7 +1047,7 @@ out:
*
* Returns the result from the CP IUCV call.
*/
-int iucv_path_sever(struct iucv_path *path, u8 userdata[16])
+int iucv_path_sever(struct iucv_path *path, u8 *userdata)
{
int rc;
diff --git a/kernel/net/key/af_key.c b/kernel/net/key/af_key.c
index f0d52d721..f9c9ecb0c 100644
--- a/kernel/net/key/af_key.c
+++ b/kernel/net/key/af_key.c
@@ -149,7 +149,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
return -EPROTONOSUPPORT;
err = -ENOMEM;
- sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto);
+ sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern);
if (sk == NULL)
goto out;
@@ -219,7 +219,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
#define BROADCAST_ONE 1
#define BROADCAST_REGISTERED 2
#define BROADCAST_PROMISC_ONLY 4
-static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
+static int pfkey_broadcast(struct sk_buff *skb,
int broadcast_flags, struct sock *one_sk,
struct net *net)
{
@@ -244,7 +244,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
* socket.
*/
if (pfk->promisc)
- pfkey_broadcast_one(skb, &skb2, allocation, sk);
+ pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk);
/* the exact target will be processed later */
if (sk == one_sk)
@@ -259,9 +259,9 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
continue;
}
- err2 = pfkey_broadcast_one(skb, &skb2, allocation, sk);
+ err2 = pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk);
- /* Error is cleare after succecful sending to at least one
+ /* Error is cleared after successful sending to at least one
* registered KM */
if ((broadcast_flags & BROADCAST_REGISTERED) && err)
err = err2;
@@ -269,7 +269,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
rcu_read_unlock();
if (one_sk != NULL)
- err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
+ err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk);
kfree_skb(skb2);
kfree_skb(skb);
@@ -292,7 +292,7 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
hdr = (struct sadb_msg *) pfk->dump.skb->data;
hdr->sadb_msg_seq = 0;
hdr->sadb_msg_errno = rc;
- pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+ pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE,
&pfk->sk, sock_net(&pfk->sk));
pfk->dump.skb = NULL;
}
@@ -333,7 +333,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk)
hdr->sadb_msg_len = (sizeof(struct sadb_msg) /
sizeof(uint64_t));
- pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk));
+ pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk));
return 0;
}
@@ -1190,6 +1190,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
memcpy(x->ealg->alg_key, key+1, keysize);
}
x->props.ealgo = sa->sadb_sa_encrypt;
+ x->geniv = a->uinfo.encr.geniv;
}
}
/* x->algo.flags = sa->sadb_sa_flags; */
@@ -1364,7 +1365,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
xfrm_state_put(x);
- pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net);
+ pfkey_broadcast(resp_skb, BROADCAST_ONE, sk, net);
return 0;
}
@@ -1451,7 +1452,7 @@ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c)
hdr->sadb_msg_seq = c->seq;
hdr->sadb_msg_pid = c->portid;
- pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x));
+ pfkey_broadcast(skb, BROADCAST_ALL, NULL, xs_net(x));
return 0;
}
@@ -1564,7 +1565,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg
out_hdr->sadb_msg_reserved = 0;
out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
- pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk));
+ pfkey_broadcast(out_skb, BROADCAST_ONE, sk, sock_net(sk));
return 0;
}
@@ -1669,7 +1670,7 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad
return -ENOBUFS;
}
- pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk));
+ pfkey_broadcast(supp_skb, BROADCAST_REGISTERED, sk, sock_net(sk));
return 0;
}
@@ -1688,7 +1689,7 @@ static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr)
hdr->sadb_msg_errno = (uint8_t) 0;
hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
- return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk));
+ return pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk));
}
static int key_notify_sa_flush(const struct km_event *c)
@@ -1709,7 +1710,7 @@ static int key_notify_sa_flush(const struct km_event *c)
hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
hdr->sadb_msg_reserved = 0;
- pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
+ pfkey_broadcast(skb, BROADCAST_ALL, NULL, c->net);
return 0;
}
@@ -1766,7 +1767,7 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr)
out_hdr->sadb_msg_pid = pfk->dump.msg_portid;
if (pfk->dump.skb)
- pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+ pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE,
&pfk->sk, sock_net(&pfk->sk));
pfk->dump.skb = out_skb;
@@ -1846,7 +1847,7 @@ static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb
new_hdr->sadb_msg_errno = 0;
}
- pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk));
+ pfkey_broadcast(skb, BROADCAST_ALL, NULL, sock_net(sk));
return 0;
}
@@ -2180,7 +2181,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev
out_hdr->sadb_msg_errno = 0;
out_hdr->sadb_msg_seq = c->seq;
out_hdr->sadb_msg_pid = c->portid;
- pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp));
+ pfkey_broadcast(out_skb, BROADCAST_ALL, NULL, xp_net(xp));
return 0;
}
@@ -2400,7 +2401,7 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc
out_hdr->sadb_msg_errno = 0;
out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
- pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp));
+ pfkey_broadcast(out_skb, BROADCAST_ONE, sk, xp_net(xp));
err = 0;
out:
@@ -2654,7 +2655,7 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
out_hdr->sadb_msg_pid = pfk->dump.msg_portid;
if (pfk->dump.skb)
- pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+ pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE,
&pfk->sk, sock_net(&pfk->sk));
pfk->dump.skb = out_skb;
@@ -2707,7 +2708,7 @@ static int key_notify_policy_flush(const struct km_event *c)
hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
hdr->sadb_msg_reserved = 0;
- pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
+ pfkey_broadcast(skb_out, BROADCAST_ALL, NULL, c->net);
return 0;
}
@@ -2769,7 +2770,7 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb
void *ext_hdrs[SADB_EXT_MAX];
int err;
- pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
+ pfkey_broadcast(skb_clone(skb, GFP_KERNEL),
BROADCAST_PROMISC_ONLY, NULL, sock_net(sk));
memset(ext_hdrs, 0, sizeof(ext_hdrs));
@@ -2991,7 +2992,7 @@ static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c)
out_hdr->sadb_msg_seq = 0;
out_hdr->sadb_msg_pid = 0;
- pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+ pfkey_broadcast(out_skb, BROADCAST_REGISTERED, NULL, xs_net(x));
return 0;
}
@@ -3181,7 +3182,7 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
xfrm_ctx->ctx_len);
}
- return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+ return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x));
}
static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt,
@@ -3379,7 +3380,7 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
n_port->sadb_x_nat_t_port_port = sport;
n_port->sadb_x_nat_t_port_reserved = 0;
- return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+ return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x));
}
#ifdef CONFIG_NET_KEY_MIGRATE
@@ -3571,7 +3572,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
}
/* broadcast migrate message to sockets */
- pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net);
+ pfkey_broadcast(skb, BROADCAST_ALL, NULL, &init_net);
return 0;
diff --git a/kernel/net/l2tp/l2tp_core.c b/kernel/net/l2tp/l2tp_core.c
index a29a50449..afca2eb4d 100644
--- a/kernel/net/l2tp/l2tp_core.c
+++ b/kernel/net/l2tp/l2tp_core.c
@@ -1319,7 +1319,7 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
tunnel = container_of(work, struct l2tp_tunnel, del_work);
sk = l2tp_tunnel_sock_lookup(tunnel);
if (!sk)
- return;
+ goto out;
sock = sk->sk_socket;
@@ -1334,12 +1334,15 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
if (sock)
inet_shutdown(sock, 2);
} else {
- if (sock)
+ if (sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sk);
+ sock_release(sock);
+ }
}
l2tp_tunnel_sock_put(sk);
+out:
+ l2tp_tunnel_dec_refcount(tunnel);
}
/* Create a socket for the tunnel, if one isn't set up by
@@ -1399,13 +1402,11 @@ static int l2tp_tunnel_sock_create(struct net *net,
if (cfg->local_ip6 && cfg->peer_ip6) {
struct sockaddr_l2tpip6 ip6_addr = {0};
- err = sock_create_kern(AF_INET6, SOCK_DGRAM,
+ err = sock_create_kern(net, AF_INET6, SOCK_DGRAM,
IPPROTO_L2TP, &sock);
if (err < 0)
goto out;
- sk_change_net(sock->sk, net);
-
ip6_addr.l2tp_family = AF_INET6;
memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6,
sizeof(ip6_addr.l2tp_addr));
@@ -1429,13 +1430,11 @@ static int l2tp_tunnel_sock_create(struct net *net,
{
struct sockaddr_l2tpip ip_addr = {0};
- err = sock_create_kern(AF_INET, SOCK_DGRAM,
+ err = sock_create_kern(net, AF_INET, SOCK_DGRAM,
IPPROTO_L2TP, &sock);
if (err < 0)
goto out;
- sk_change_net(sock->sk, net);
-
ip_addr.l2tp_family = AF_INET;
ip_addr.l2tp_addr = cfg->local_ip;
ip_addr.l2tp_conn_id = tunnel_id;
@@ -1462,7 +1461,7 @@ out:
*sockp = sock;
if ((err < 0) && sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sock->sk);
+ sock_release(sock);
*sockp = NULL;
}
@@ -1639,8 +1638,13 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
*/
int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
{
+ l2tp_tunnel_inc_refcount(tunnel);
l2tp_tunnel_closeall(tunnel);
- return (false == queue_work(l2tp_wq, &tunnel->del_work));
+ if (false == queue_work(l2tp_wq, &tunnel->del_work)) {
+ l2tp_tunnel_dec_refcount(tunnel);
+ return 1;
+ }
+ return 0;
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_delete);
diff --git a/kernel/net/l2tp/l2tp_core.h b/kernel/net/l2tp/l2tp_core.h
index 68aa9ffd4..5871537af 100644
--- a/kernel/net/l2tp/l2tp_core.h
+++ b/kernel/net/l2tp/l2tp_core.h
@@ -321,4 +321,7 @@ do { \
#define l2tp_dbg(ptr, type, fmt, ...) \
l2tp_printk(ptr, type, pr_debug, fmt, ##__VA_ARGS__)
+#define MODULE_ALIAS_L2TP_PWTYPE(type) \
+ MODULE_ALIAS("net-l2tp-type-" __stringify(type))
+
#endif /* _L2TP_CORE_H_ */
diff --git a/kernel/net/l2tp/l2tp_eth.c b/kernel/net/l2tp/l2tp_eth.c
index 4b552873b..e253c26f3 100644
--- a/kernel/net/l2tp/l2tp_eth.c
+++ b/kernel/net/l2tp/l2tp_eth.c
@@ -358,3 +358,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
MODULE_DESCRIPTION("L2TP ethernet pseudowire driver");
MODULE_VERSION("1.0");
+MODULE_ALIAS_L2TP_PWTYPE(5);
diff --git a/kernel/net/l2tp/l2tp_ip.c b/kernel/net/l2tp/l2tp_ip.c
index 79649937e..ec22078b0 100644
--- a/kernel/net/l2tp/l2tp_ip.c
+++ b/kernel/net/l2tp/l2tp_ip.c
@@ -655,3 +655,4 @@ MODULE_VERSION("1.0");
* enums
*/
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP);
+MODULE_ALIAS_NET_PF_PROTO(PF_INET, IPPROTO_L2TP);
diff --git a/kernel/net/l2tp/l2tp_ip6.c b/kernel/net/l2tp/l2tp_ip6.c
index d1ded3777..a2c8747d2 100644
--- a/kernel/net/l2tp/l2tp_ip6.c
+++ b/kernel/net/l2tp/l2tp_ip6.c
@@ -486,6 +486,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name);
struct in6_addr *daddr, *final_p, final;
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt_to_free = NULL;
struct ipv6_txoptions *opt = NULL;
struct ip6_flowlabel *flowlabel = NULL;
struct dst_entry *dst = NULL;
@@ -575,8 +576,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt = NULL;
}
- if (opt == NULL)
- opt = np->opt;
+ if (!opt) {
+ opt = txopt_get(np);
+ opt_to_free = opt;
+ }
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
@@ -631,6 +634,7 @@ done:
dst_release(dst);
out:
fl6_sock_release(flowlabel);
+ txopt_put(opt_to_free);
return err < 0 ? err : len;
@@ -801,3 +805,4 @@ MODULE_VERSION("1.0");
* enums
*/
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 2, IPPROTO_L2TP);
+MODULE_ALIAS_NET_PF_PROTO(PF_INET6, IPPROTO_L2TP);
diff --git a/kernel/net/l2tp/l2tp_netlink.c b/kernel/net/l2tp/l2tp_netlink.c
index 9e13c2ff8..2caaa84ce 100644
--- a/kernel/net/l2tp/l2tp_netlink.c
+++ b/kernel/net/l2tp/l2tp_netlink.c
@@ -124,8 +124,13 @@ static int l2tp_tunnel_notify(struct genl_family *family,
ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq,
NLM_F_ACK, tunnel, cmd);
- if (ret >= 0)
- return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ if (ret >= 0) {
+ ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ /* We don't care if no one is listening */
+ if (ret == -ESRCH)
+ ret = 0;
+ return ret;
+ }
nlmsg_free(msg);
@@ -147,8 +152,13 @@ static int l2tp_session_notify(struct genl_family *family,
ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq,
NLM_F_ACK, session, cmd);
- if (ret >= 0)
- return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ if (ret >= 0) {
+ ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ /* We don't care if no one is listening */
+ if (ret == -ESRCH)
+ ret = 0;
+ return ret;
+ }
nlmsg_free(msg);
@@ -576,6 +586,13 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
if (info->attrs[L2TP_ATTR_MRU])
cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
+#ifdef CONFIG_MODULES
+ if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) {
+ genl_unlock();
+ request_module("net-l2tp-type-%u", cfg.pw_type);
+ genl_lock();
+ }
+#endif
if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) ||
(l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) {
ret = -EPROTONOSUPPORT;
diff --git a/kernel/net/l2tp/l2tp_ppp.c b/kernel/net/l2tp/l2tp_ppp.c
index e9b0dec56..1ad18c550 100644
--- a/kernel/net/l2tp/l2tp_ppp.c
+++ b/kernel/net/l2tp/l2tp_ppp.c
@@ -542,12 +542,12 @@ static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb)
/* socket() handler. Initialize a new struct sock.
*/
-static int pppol2tp_create(struct net *net, struct socket *sock)
+static int pppol2tp_create(struct net *net, struct socket *sock, int kern)
{
int error = -ENOMEM;
struct sock *sk;
- sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto);
+ sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern);
if (!sk)
goto out;
@@ -1863,3 +1863,4 @@ MODULE_DESCRIPTION("PPP over L2TP over UDP");
MODULE_LICENSE("GPL");
MODULE_VERSION(PPPOL2TP_DRV_VERSION);
MODULE_ALIAS("pppox-proto-" __stringify(PX_PROTO_OL2TP));
+MODULE_ALIAS_L2TP_PWTYPE(11);
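
Taken together, the l2tp hunks above wire up on-demand loading of pseudowire modules. A sketch of the two halves (example_autoload() is hypothetical; the macro and the request_module() call are the ones added by the patch):

#include <linux/module.h>
#include <linux/stringify.h>
#include <linux/kmod.h>

/* From l2tp_core.h: each pseudowire driver advertises its numeric type. */
#define MODULE_ALIAS_L2TP_PWTYPE(type) \
        MODULE_ALIAS("net-l2tp-type-" __stringify(type))

MODULE_ALIAS_L2TP_PWTYPE(5);    /* as l2tp_eth.c does above */

/* From l2tp_netlink.c: session creation can then autoload the handler
 * for an unregistered pseudowire type before checking l2tp_nl_cmd_ops[].
 */
static void example_autoload(unsigned int pw_type)
{
        request_module("net-l2tp-type-%u", pw_type);
}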
diff --git a/kernel/net/l3mdev/Kconfig b/kernel/net/l3mdev/Kconfig
new file mode 100644
index 000000000..5d4732503
--- /dev/null
+++ b/kernel/net/l3mdev/Kconfig
@@ -0,0 +1,10 @@
+#
+# Configuration for L3 master device support
+#
+
+config NET_L3_MASTER_DEV
+ bool "L3 Master device support"
+ depends on INET || IPV6
+ ---help---
+ This module provides glue between core networking code and device
+ drivers to support L3 master devices like VRF.
diff --git a/kernel/net/l3mdev/Makefile b/kernel/net/l3mdev/Makefile
new file mode 100644
index 000000000..84a53a6f6
--- /dev/null
+++ b/kernel/net/l3mdev/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the L3 device API
+#
+
+obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev.o
diff --git a/kernel/net/l3mdev/l3mdev.c b/kernel/net/l3mdev/l3mdev.c
new file mode 100644
index 000000000..8e5ead366
--- /dev/null
+++ b/kernel/net/l3mdev/l3mdev.c
@@ -0,0 +1,92 @@
+/*
+ * net/l3mdev/l3mdev.c - L3 master device implementation
+ * Copyright (c) 2015 Cumulus Networks
+ * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/netdevice.h>
+#include <net/l3mdev.h>
+
+/**
+ * l3mdev_master_ifindex - get index of L3 master device
+ * @dev: targeted interface
+ */
+
+int l3mdev_master_ifindex_rcu(struct net_device *dev)
+{
+ int ifindex = 0;
+
+ if (!dev)
+ return 0;
+
+ if (netif_is_l3_master(dev)) {
+ ifindex = dev->ifindex;
+ } else if (netif_is_l3_slave(dev)) {
+ struct net_device *master;
+
+ master = netdev_master_upper_dev_get_rcu(dev);
+ if (master)
+ ifindex = master->ifindex;
+ }
+
+ return ifindex;
+}
+EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu);
+
+/**
+ * l3mdev_fib_table - get FIB table id associated with an L3
+ * master interface
+ * @dev: targeted interface
+ */
+
+u32 l3mdev_fib_table_rcu(const struct net_device *dev)
+{
+ u32 tb_id = 0;
+
+ if (!dev)
+ return 0;
+
+ if (netif_is_l3_master(dev)) {
+ if (dev->l3mdev_ops->l3mdev_fib_table)
+ tb_id = dev->l3mdev_ops->l3mdev_fib_table(dev);
+ } else if (netif_is_l3_slave(dev)) {
+ /* Users of netdev_master_upper_dev_get_rcu need non-const,
+ * but current inet_*type functions take a const
+ */
+ struct net_device *_dev = (struct net_device *) dev;
+ const struct net_device *master;
+
+ master = netdev_master_upper_dev_get_rcu(_dev);
+ if (master &&
+ master->l3mdev_ops->l3mdev_fib_table)
+ tb_id = master->l3mdev_ops->l3mdev_fib_table(master);
+ }
+
+ return tb_id;
+}
+EXPORT_SYMBOL_GPL(l3mdev_fib_table_rcu);
+
+u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+ u32 tb_id = 0;
+
+ if (!ifindex)
+ return 0;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (dev)
+ tb_id = l3mdev_fib_table_rcu(dev);
+
+ rcu_read_unlock();
+
+ return tb_id;
+}
+EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
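
A minimal usage sketch (not part of the patch) for the l3mdev helpers added above: a hypothetical caller pick_fib_table() resolves the VRF FIB table for an incoming ifindex and falls back to the main table when the interface is not enslaved to an L3 master device.

/* Sketch only: resolve the FIB table via the helpers introduced in
 * l3mdev.c above; pick_fib_table() is an invented caller.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>	/* RT_TABLE_MAIN */
#include <net/l3mdev.h>

static u32 pick_fib_table(struct net *net, int ifindex)
{
	u32 tb_id = l3mdev_fib_table_by_index(net, ifindex);

	/* 0 means "no L3 master device"; use the main table */
	return tb_id ? tb_id : RT_TABLE_MAIN;
}
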
diff --git a/kernel/net/llc/af_llc.c b/kernel/net/llc/af_llc.c
index 17a8dff06..8dab4e569 100644
--- a/kernel/net/llc/af_llc.c
+++ b/kernel/net/llc/af_llc.c
@@ -168,7 +168,7 @@ static int llc_ui_create(struct net *net, struct socket *sock, int protocol,
if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) {
rc = -ENOMEM;
- sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto);
+ sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern);
if (sk) {
rc = 0;
llc_ui_sk_init(sock, sk);
@@ -613,7 +613,7 @@ static int llc_wait_data(struct sock *sk, long timeo)
if (signal_pending(current))
break;
rc = 0;
- if (sk_wait_data(sk, &timeo))
+ if (sk_wait_data(sk, &timeo, NULL))
break;
}
return rc;
@@ -802,7 +802,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
release_sock(sk);
lock_sock(sk);
} else
- sk_wait_data(sk, &timeo);
+ sk_wait_data(sk, &timeo, NULL);
if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) {
net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n",
diff --git a/kernel/net/llc/llc_conn.c b/kernel/net/llc/llc_conn.c
index 81a61fce3..3e821daf9 100644
--- a/kernel/net/llc/llc_conn.c
+++ b/kernel/net/llc/llc_conn.c
@@ -768,7 +768,7 @@ static struct sock *llc_create_incoming_sock(struct sock *sk,
struct llc_addr *daddr)
{
struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC,
- sk->sk_prot);
+ sk->sk_prot, 0);
struct llc_sock *newllc, *llc = llc_sk(sk);
if (!newsk)
@@ -931,9 +931,9 @@ static void llc_sk_init(struct sock *sk)
* Allocates a LLC sock and initializes it. Returns the new LLC sock
* or %NULL if there's no memory available for one
*/
-struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot)
+struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern)
{
- struct sock *sk = sk_alloc(net, family, priority, prot);
+ struct sock *sk = sk_alloc(net, family, priority, prot, kern);
if (!sk)
goto out;
diff --git a/kernel/net/mac80211/Kconfig b/kernel/net/mac80211/Kconfig
index 64a012a0c..3891cbd2a 100644
--- a/kernel/net/mac80211/Kconfig
+++ b/kernel/net/mac80211/Kconfig
@@ -7,7 +7,6 @@ config MAC80211
select CRYPTO_CCM
select CRYPTO_GCM
select CRC32
- select AVERAGE
---help---
This option enables the hardware independent IEEE 802.11
networking stack.
@@ -302,6 +301,20 @@ config MAC80211_DEBUG_COUNTERS
---help---
Selecting this option causes mac80211 to keep additional
and very verbose statistics about TX and RX handler use
- and show them in debugfs.
+ as well as a few selected dot11 counters. These will be
+ exposed in debugfs.
+
+ Note that some of the counters are not concurrency safe
+ and may thus not always be accurate.
If unsure, say N.
+
+config MAC80211_STA_HASH_MAX_SIZE
+ int "Station hash table maximum size" if MAC80211_DEBUG_MENU
+ default 0
+ ---help---
+ Setting this option to a low value (e.g. 4) allows testing the
+ hash table with collisions relatively deterministically (just
+ connect more stations than the number selected here.)
+
+ If unsure, leave the default of 0.
diff --git a/kernel/net/mac80211/Makefile b/kernel/net/mac80211/Makefile
index 3275f0188..f9137a834 100644
--- a/kernel/net/mac80211/Makefile
+++ b/kernel/net/mac80211/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_MAC80211) += mac80211.o
# mac80211 objects
mac80211-y := \
main.o status.o \
+ driver-ops.o \
sta_info.o \
wep.o \
wpa.o \
@@ -26,7 +27,6 @@ mac80211-y := \
key.o \
util.o \
wme.o \
- event.o \
chan.o \
trace.o mlme.o \
tdls.o \
diff --git a/kernel/net/mac80211/aes_ccm.c b/kernel/net/mac80211/aes_ccm.c
index 208df7c0b..7663c28ba 100644
--- a/kernel/net/mac80211/aes_ccm.c
+++ b/kernel/net/mac80211/aes_ccm.c
@@ -11,9 +11,8 @@
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/crypto.h>
#include <linux/err.h>
-#include <crypto/aes.h>
+#include <crypto/aead.h>
#include <net/mac80211.h>
#include "key.h"
@@ -23,7 +22,7 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
u8 *data, size_t data_len, u8 *mic,
size_t mic_len)
{
- struct scatterlist assoc, pt, ct[2];
+ struct scatterlist sg[3];
char aead_req_data[sizeof(struct aead_request) +
crypto_aead_reqsize(tfm)]
@@ -32,15 +31,14 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
memset(aead_req, 0, sizeof(aead_req_data));
- sg_init_one(&pt, data, data_len);
- sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad));
- sg_init_table(ct, 2);
- sg_set_buf(&ct[0], data, data_len);
- sg_set_buf(&ct[1], mic, mic_len);
+ sg_init_table(sg, 3);
+ sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_set_buf(&sg[1], data, data_len);
+ sg_set_buf(&sg[2], mic, mic_len);
aead_request_set_tfm(aead_req, tfm);
- aead_request_set_assoc(aead_req, &assoc, assoc.length);
- aead_request_set_crypt(aead_req, &pt, ct, data_len, b_0);
+ aead_request_set_crypt(aead_req, sg, sg, data_len, b_0);
+ aead_request_set_ad(aead_req, sg[0].length);
crypto_aead_encrypt(aead_req);
}
@@ -49,7 +47,7 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
u8 *data, size_t data_len, u8 *mic,
size_t mic_len)
{
- struct scatterlist assoc, pt, ct[2];
+ struct scatterlist sg[3];
char aead_req_data[sizeof(struct aead_request) +
crypto_aead_reqsize(tfm)]
__aligned(__alignof__(struct aead_request));
@@ -60,15 +58,14 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
memset(aead_req, 0, sizeof(aead_req_data));
- sg_init_one(&pt, data, data_len);
- sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad));
- sg_init_table(ct, 2);
- sg_set_buf(&ct[0], data, data_len);
- sg_set_buf(&ct[1], mic, mic_len);
+ sg_init_table(sg, 3);
+ sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_set_buf(&sg[1], data, data_len);
+ sg_set_buf(&sg[2], mic, mic_len);
aead_request_set_tfm(aead_req, tfm);
- aead_request_set_assoc(aead_req, &assoc, assoc.length);
- aead_request_set_crypt(aead_req, ct, &pt, data_len + mic_len, b_0);
+ aead_request_set_crypt(aead_req, sg, sg, data_len + mic_len, b_0);
+ aead_request_set_ad(aead_req, sg[0].length);
return crypto_aead_decrypt(aead_req);
}
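
The conversion above follows the newer AEAD interface: a single scatterlist carries AAD, payload and MIC, the AAD length is declared with aead_request_set_ad(), and the same list serves as both source and destination. A minimal sketch of that pattern outside mac80211, assuming the caller has already allocated and keyed the crypto_aead transform and that the buffers are not on the stack:

/* Sketch only: new-style AEAD request setup as used in the hunks above.
 * Error paths and tfm setup (crypto_alloc_aead, setkey, setauthsize)
 * are omitted.
 */
#include <crypto/aead.h>
#include <linux/scatterlist.h>

static int aead_encrypt_one(struct crypto_aead *tfm, u8 *iv,
			    u8 *aad, size_t aad_len,
			    u8 *data, size_t data_len,
			    u8 *mic, size_t mic_len)
{
	struct scatterlist sg[3];
	struct aead_request *req;
	int ret;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	sg_init_table(sg, 3);
	sg_set_buf(&sg[0], aad, aad_len);	/* associated data */
	sg_set_buf(&sg[1], data, data_len);	/* plaintext in, ciphertext out */
	sg_set_buf(&sg[2], mic, mic_len);	/* authentication tag */

	aead_request_set_crypt(req, sg, sg, data_len, iv);
	aead_request_set_ad(req, aad_len);

	ret = crypto_aead_encrypt(req);
	aead_request_free(req);
	return ret;
}
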
diff --git a/kernel/net/mac80211/aes_cmac.c b/kernel/net/mac80211/aes_cmac.c
index 4192806be..bdf0790d8 100644
--- a/kernel/net/mac80211/aes_cmac.c
+++ b/kernel/net/mac80211/aes_cmac.c
@@ -145,20 +145,3 @@ void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm)
{
crypto_free_cipher(tfm);
}
-
-void ieee80211_aes_cmac_calculate_k1_k2(struct ieee80211_key_conf *keyconf,
- u8 *k1, u8 *k2)
-{
- u8 l[AES_BLOCK_SIZE] = {};
- struct ieee80211_key *key =
- container_of(keyconf, struct ieee80211_key, conf);
-
- crypto_cipher_encrypt_one(key->u.aes_cmac.tfm, l, l);
-
- memcpy(k1, l, AES_BLOCK_SIZE);
- gf_mulx(k1);
-
- memcpy(k2, k1, AES_BLOCK_SIZE);
- gf_mulx(k2);
-}
-EXPORT_SYMBOL(ieee80211_aes_cmac_calculate_k1_k2);
diff --git a/kernel/net/mac80211/aes_gcm.c b/kernel/net/mac80211/aes_gcm.c
index fd278bbe1..3afe361fd 100644
--- a/kernel/net/mac80211/aes_gcm.c
+++ b/kernel/net/mac80211/aes_gcm.c
@@ -8,9 +8,8 @@
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/crypto.h>
#include <linux/err.h>
-#include <crypto/aes.h>
+#include <crypto/aead.h>
#include <net/mac80211.h>
#include "key.h"
@@ -19,7 +18,7 @@
void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
u8 *data, size_t data_len, u8 *mic)
{
- struct scatterlist assoc, pt, ct[2];
+ struct scatterlist sg[3];
char aead_req_data[sizeof(struct aead_request) +
crypto_aead_reqsize(tfm)]
@@ -28,15 +27,14 @@ void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
memset(aead_req, 0, sizeof(aead_req_data));
- sg_init_one(&pt, data, data_len);
- sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad));
- sg_init_table(ct, 2);
- sg_set_buf(&ct[0], data, data_len);
- sg_set_buf(&ct[1], mic, IEEE80211_GCMP_MIC_LEN);
+ sg_init_table(sg, 3);
+ sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_set_buf(&sg[1], data, data_len);
+ sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN);
aead_request_set_tfm(aead_req, tfm);
- aead_request_set_assoc(aead_req, &assoc, assoc.length);
- aead_request_set_crypt(aead_req, &pt, ct, data_len, j_0);
+ aead_request_set_crypt(aead_req, sg, sg, data_len, j_0);
+ aead_request_set_ad(aead_req, sg[0].length);
crypto_aead_encrypt(aead_req);
}
@@ -44,7 +42,7 @@ void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
u8 *data, size_t data_len, u8 *mic)
{
- struct scatterlist assoc, pt, ct[2];
+ struct scatterlist sg[3];
char aead_req_data[sizeof(struct aead_request) +
crypto_aead_reqsize(tfm)]
__aligned(__alignof__(struct aead_request));
@@ -55,16 +53,15 @@ int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
memset(aead_req, 0, sizeof(aead_req_data));
- sg_init_one(&pt, data, data_len);
- sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad));
- sg_init_table(ct, 2);
- sg_set_buf(&ct[0], data, data_len);
- sg_set_buf(&ct[1], mic, IEEE80211_GCMP_MIC_LEN);
+ sg_init_table(sg, 3);
+ sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_set_buf(&sg[1], data, data_len);
+ sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN);
aead_request_set_tfm(aead_req, tfm);
- aead_request_set_assoc(aead_req, &assoc, assoc.length);
- aead_request_set_crypt(aead_req, ct, &pt,
+ aead_request_set_crypt(aead_req, sg, sg,
data_len + IEEE80211_GCMP_MIC_LEN, j_0);
+ aead_request_set_ad(aead_req, sg[0].length);
return crypto_aead_decrypt(aead_req);
}
diff --git a/kernel/net/mac80211/aes_gmac.c b/kernel/net/mac80211/aes_gmac.c
index f1321b7d6..3ddd927aa 100644
--- a/kernel/net/mac80211/aes_gmac.c
+++ b/kernel/net/mac80211/aes_gmac.c
@@ -9,8 +9,8 @@
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/crypto.h>
#include <linux/err.h>
+#include <crypto/aead.h>
#include <crypto/aes.h>
#include <net/mac80211.h>
@@ -24,7 +24,7 @@
int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
const u8 *data, size_t data_len, u8 *mic)
{
- struct scatterlist sg[3], ct[1];
+ struct scatterlist sg[4];
char aead_req_data[sizeof(struct aead_request) +
crypto_aead_reqsize(tfm)]
__aligned(__alignof__(struct aead_request));
@@ -37,21 +37,19 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
memset(aead_req, 0, sizeof(aead_req_data));
memset(zero, 0, GMAC_MIC_LEN);
- sg_init_table(sg, 3);
+ sg_init_table(sg, 4);
sg_set_buf(&sg[0], aad, AAD_LEN);
sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN);
sg_set_buf(&sg[2], zero, GMAC_MIC_LEN);
+ sg_set_buf(&sg[3], mic, GMAC_MIC_LEN);
memcpy(iv, nonce, GMAC_NONCE_LEN);
memset(iv + GMAC_NONCE_LEN, 0, sizeof(iv) - GMAC_NONCE_LEN);
iv[AES_BLOCK_SIZE - 1] = 0x01;
- sg_init_table(ct, 1);
- sg_set_buf(&ct[0], mic, GMAC_MIC_LEN);
-
aead_request_set_tfm(aead_req, tfm);
- aead_request_set_assoc(aead_req, sg, AAD_LEN + data_len);
- aead_request_set_crypt(aead_req, NULL, ct, 0, iv);
+ aead_request_set_crypt(aead_req, sg, sg, 0, iv);
+ aead_request_set_ad(aead_req, AAD_LEN + data_len);
crypto_aead_encrypt(aead_req);
diff --git a/kernel/net/mac80211/agg-rx.c b/kernel/net/mac80211/agg-rx.c
index 5c564a68f..367784be5 100644
--- a/kernel/net/mac80211/agg-rx.c
+++ b/kernel/net/mac80211/agg-rx.c
@@ -79,7 +79,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
(int)reason);
if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP,
- &sta->sta, tid, NULL, 0))
+ &sta->sta, tid, NULL, 0, false))
sdata_info(sta->sdata,
"HW problem - can not stop rx aggregation for %pM tid %d\n",
sta->sta.addr, tid);
@@ -189,6 +189,7 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
+ bool amsdu = ieee80211_hw_check(&local->hw, SUPPORTS_AMSDU_IN_AMPDU);
u16 capab;
skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
@@ -217,7 +218,8 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP;
mgmt->u.action.u.addba_resp.dialog_token = dialog_token;
- capab = (u16)(policy << 1); /* bit 1 aggregation policy */
+ capab = (u16)(amsdu << 0); /* bit 0 A-MSDU support */
+ capab |= (u16)(policy << 1); /* bit 1 aggregation policy */
capab |= (u16)(tid << 2); /* bit 5:2 TID number */
capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */
@@ -289,7 +291,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
}
/* prepare A-MPDU MLME for Rx aggregation */
- tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
+ tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL);
if (!tid_agg_rx)
goto end;
@@ -321,7 +323,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
__skb_queue_head_init(&tid_agg_rx->reorder_buf[i]);
ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START,
- &sta->sta, tid, &start_seq_num, 0);
+ &sta->sta, tid, &start_seq_num, 0, false);
ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n",
sta->sta.addr, tid, ret);
if (ret) {
diff --git a/kernel/net/mac80211/agg-tx.c b/kernel/net/mac80211/agg-tx.c
index cce9d425c..ff757181b 100644
--- a/kernel/net/mac80211/agg-tx.c
+++ b/kernel/net/mac80211/agg-tx.c
@@ -97,7 +97,8 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ;
mgmt->u.action.u.addba_req.dialog_token = dialog_token;
- capab = (u16)(1 << 1); /* bit 1 aggregation policy */
+ capab = (u16)(1 << 0); /* bit 0 A-MSDU support */
+ capab |= (u16)(1 << 1); /* bit 1 aggregation policy */
capab |= (u16)(tid << 2); /* bit 5:2 TID number */
capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggregation */
@@ -331,7 +332,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
return -EALREADY;
ret = drv_ampdu_action(local, sta->sdata,
IEEE80211_AMPDU_TX_STOP_FLUSH_CONT,
- &sta->sta, tid, NULL, 0);
+ &sta->sta, tid, NULL, 0, false);
WARN_ON_ONCE(ret);
return 0;
}
@@ -381,7 +382,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST;
ret = drv_ampdu_action(local, sta->sdata, action,
- &sta->sta, tid, NULL, 0);
+ &sta->sta, tid, NULL, 0, false);
/* HW shall not deny going back to legacy */
if (WARN_ON(ret)) {
@@ -469,7 +470,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
start_seq_num = sta->tid_seq[tid] >> 4;
ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START,
- &sta->sta, tid, &start_seq_num, 0);
+ &sta->sta, tid, &start_seq_num, 0, false);
if (ret) {
ht_dbg(sdata,
"BA request denied - HW unavailable for %pM tid %d\n",
@@ -499,7 +500,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
/* send AddBA request */
ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
tid_tx->dialog_token, start_seq_num,
- local->hw.max_tx_aggregation_subframes,
+ IEEE80211_MAX_AMPDU_BUF,
tid_tx->timeout);
}
@@ -564,8 +565,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
return -EINVAL;
if ((tid >= IEEE80211_NUM_TIDS) ||
- !(local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) ||
- (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW))
+ !ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) ||
+ ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW))
return -EINVAL;
ht_dbg(sdata, "Open BA session requested for %pM tid %u\n",
@@ -693,7 +694,8 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
drv_ampdu_action(local, sta->sdata,
IEEE80211_AMPDU_TX_OPERATIONAL,
- &sta->sta, tid, NULL, tid_tx->buf_size);
+ &sta->sta, tid, NULL, tid_tx->buf_size,
+ tid_tx->amsdu);
/*
* synchronize with TX path, while splicing the TX path
@@ -918,10 +920,13 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
struct tid_ampdu_tx *tid_tx;
u16 capab, tid;
u8 buf_size;
+ bool amsdu;
capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
+ amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK;
tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+ buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
mutex_lock(&sta->ampdu_mlme.mtx);
@@ -968,6 +973,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
}
tid_tx->buf_size = buf_size;
+ tid_tx->amsdu = amsdu;
if (test_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))
ieee80211_agg_tx_operational(local, sta, tid);
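
The ADDBA capability field touched above is a single 16-bit word: bit 0 A-MSDU support, bit 1 aggregation policy, bits 5:2 the TID, bits 15:6 the buffer size. A small worked sketch of the packing and unpacking arithmetic, with helper names invented for illustration:

/* Illustration only: the capab layout used by ieee80211_send_addba_request()
 * and ieee80211_process_addba_resp() in the hunks above.
 */
#include <linux/bitops.h>
#include <linux/types.h>

static u16 addba_capab_pack(bool amsdu, bool policy, u8 tid, u16 buf_size)
{
	u16 capab;

	capab  = (u16)(amsdu << 0);	/* bit 0: A-MSDU support */
	capab |= (u16)(policy << 1);	/* bit 1: aggregation policy */
	capab |= (u16)(tid << 2);	/* bits 5:2: TID */
	capab |= (u16)(buf_size << 6);	/* bits 15:6: buffer size */

	return capab;
}

static void addba_capab_unpack(u16 capab, bool *amsdu, u8 *tid, u16 *buf_size)
{
	*amsdu = capab & BIT(0);
	*tid = (capab & GENMASK(5, 2)) >> 2;
	*buf_size = (capab & GENMASK(15, 6)) >> 6;
}
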
diff --git a/kernel/net/mac80211/cfg.c b/kernel/net/mac80211/cfg.c
index f06d42267..c12f34813 100644
--- a/kernel/net/mac80211/cfg.c
+++ b/kernel/net/mac80211/cfg.c
@@ -2,7 +2,7 @@
* mac80211 configuration hooks for cfg80211
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
- * Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2013-2015 Intel Mobile Communications GmbH
*
* This file is GPLv2 as found in COPYING.
*/
@@ -17,7 +17,6 @@
#include <net/cfg80211.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
-#include "cfg.h"
#include "rate.h"
#include "mesh.h"
#include "wme.h"
@@ -137,6 +136,9 @@ static int ieee80211_set_noack_map(struct wiphy *wiphy,
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
sdata->noack_map = noack_map;
+
+ ieee80211_check_fast_xmit_iface(sdata);
+
return 0;
}
@@ -309,6 +311,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
u32 iv32;
u16 iv16;
int err = -ENOENT;
+ struct ieee80211_key_seq kseq = {};
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -339,10 +342,12 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
iv32 = key->u.tkip.tx.iv32;
iv16 = key->u.tkip.tx.iv16;
- if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)
- drv_get_tkip_seq(sdata->local,
- key->conf.hw_key_idx,
- &iv32, &iv16);
+ if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
+ !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
+ drv_get_key_seq(sdata->local, key, &kseq);
+ iv32 = kseq.tkip.iv32;
+ iv16 = kseq.tkip.iv16;
+ }
seq[0] = iv16 & 0xff;
seq[1] = (iv16 >> 8) & 0xff;
@@ -355,52 +360,44 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
break;
case WLAN_CIPHER_SUITE_CCMP:
case WLAN_CIPHER_SUITE_CCMP_256:
- pn64 = atomic64_read(&key->u.ccmp.tx_pn);
- seq[0] = pn64;
- seq[1] = pn64 >> 8;
- seq[2] = pn64 >> 16;
- seq[3] = pn64 >> 24;
- seq[4] = pn64 >> 32;
- seq[5] = pn64 >> 40;
- params.seq = seq;
- params.seq_len = 6;
- break;
case WLAN_CIPHER_SUITE_AES_CMAC:
case WLAN_CIPHER_SUITE_BIP_CMAC_256:
- pn64 = atomic64_read(&key->u.aes_cmac.tx_pn);
- seq[0] = pn64;
- seq[1] = pn64 >> 8;
- seq[2] = pn64 >> 16;
- seq[3] = pn64 >> 24;
- seq[4] = pn64 >> 32;
- seq[5] = pn64 >> 40;
- params.seq = seq;
- params.seq_len = 6;
- break;
+ BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
+ offsetof(typeof(kseq), aes_cmac));
case WLAN_CIPHER_SUITE_BIP_GMAC_128:
case WLAN_CIPHER_SUITE_BIP_GMAC_256:
- pn64 = atomic64_read(&key->u.aes_gmac.tx_pn);
- seq[0] = pn64;
- seq[1] = pn64 >> 8;
- seq[2] = pn64 >> 16;
- seq[3] = pn64 >> 24;
- seq[4] = pn64 >> 32;
- seq[5] = pn64 >> 40;
- params.seq = seq;
- params.seq_len = 6;
- break;
+ BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
+ offsetof(typeof(kseq), aes_gmac));
case WLAN_CIPHER_SUITE_GCMP:
case WLAN_CIPHER_SUITE_GCMP_256:
- pn64 = atomic64_read(&key->u.gcmp.tx_pn);
- seq[0] = pn64;
- seq[1] = pn64 >> 8;
- seq[2] = pn64 >> 16;
- seq[3] = pn64 >> 24;
- seq[4] = pn64 >> 32;
- seq[5] = pn64 >> 40;
+ BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
+ offsetof(typeof(kseq), gcmp));
+
+ if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
+ !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
+ drv_get_key_seq(sdata->local, key, &kseq);
+ memcpy(seq, kseq.ccmp.pn, 6);
+ } else {
+ pn64 = atomic64_read(&key->conf.tx_pn);
+ seq[0] = pn64;
+ seq[1] = pn64 >> 8;
+ seq[2] = pn64 >> 16;
+ seq[3] = pn64 >> 24;
+ seq[4] = pn64 >> 32;
+ seq[5] = pn64 >> 40;
+ }
params.seq = seq;
params.seq_len = 6;
break;
+ default:
+ if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+ break;
+ if (WARN_ON(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV))
+ break;
+ drv_get_key_seq(sdata->local, key, &kseq);
+ params.seq = kseq.hw.seq;
+ params.seq_len = kseq.hw.seq_len;
+ break;
}
params.key = key->conf.key;
@@ -471,45 +468,6 @@ void sta_set_rate_info_tx(struct sta_info *sta,
rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
}
-void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
-{
- rinfo->flags = 0;
-
- if (sta->last_rx_rate_flag & RX_FLAG_HT) {
- rinfo->flags |= RATE_INFO_FLAGS_MCS;
- rinfo->mcs = sta->last_rx_rate_idx;
- } else if (sta->last_rx_rate_flag & RX_FLAG_VHT) {
- rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS;
- rinfo->nss = sta->last_rx_rate_vht_nss;
- rinfo->mcs = sta->last_rx_rate_idx;
- } else {
- struct ieee80211_supported_band *sband;
- int shift = ieee80211_vif_get_shift(&sta->sdata->vif);
- u16 brate;
-
- sband = sta->local->hw.wiphy->bands[
- ieee80211_get_sdata_band(sta->sdata)];
- brate = sband->bitrates[sta->last_rx_rate_idx].bitrate;
- rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
- }
-
- if (sta->last_rx_rate_flag & RX_FLAG_SHORT_GI)
- rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
-
- if (sta->last_rx_rate_flag & RX_FLAG_5MHZ)
- rinfo->bw = RATE_INFO_BW_5;
- else if (sta->last_rx_rate_flag & RX_FLAG_10MHZ)
- rinfo->bw = RATE_INFO_BW_10;
- else if (sta->last_rx_rate_flag & RX_FLAG_40MHZ)
- rinfo->bw = RATE_INFO_BW_40;
- else if (sta->last_rx_rate_vht_flag & RX_VHT_FLAG_80MHZ)
- rinfo->bw = RATE_INFO_BW_80;
- else if (sta->last_rx_rate_vht_flag & RX_VHT_FLAG_160MHZ)
- rinfo->bw = RATE_INFO_BW_160;
- else
- rinfo->bw = RATE_INFO_BW_20;
-}
-
static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev,
int idx, u8 *mac, struct station_info *sinfo)
{
@@ -983,7 +941,7 @@ static int sta_apply_auth_flags(struct ieee80211_local *local,
* well. Some drivers require rate control initialized
* before drv_sta_state() is called.
*/
- if (test_sta_flag(sta, WLAN_STA_TDLS_PEER))
+ if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
rate_control_rate_init(sta);
ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
@@ -1021,6 +979,65 @@ static int sta_apply_auth_flags(struct ieee80211_local *local,
return 0;
}
+static void sta_apply_mesh_params(struct ieee80211_local *local,
+ struct sta_info *sta,
+ struct station_parameters *params)
+{
+#ifdef CONFIG_MAC80211_MESH
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u32 changed = 0;
+
+ if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) {
+ switch (params->plink_state) {
+ case NL80211_PLINK_ESTAB:
+ if (sta->mesh->plink_state != NL80211_PLINK_ESTAB)
+ changed = mesh_plink_inc_estab_count(sdata);
+ sta->mesh->plink_state = params->plink_state;
+
+ ieee80211_mps_sta_status_update(sta);
+ changed |= ieee80211_mps_set_sta_local_pm(sta,
+ sdata->u.mesh.mshcfg.power_mode);
+ break;
+ case NL80211_PLINK_LISTEN:
+ case NL80211_PLINK_BLOCKED:
+ case NL80211_PLINK_OPN_SNT:
+ case NL80211_PLINK_OPN_RCVD:
+ case NL80211_PLINK_CNF_RCVD:
+ case NL80211_PLINK_HOLDING:
+ if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
+ changed = mesh_plink_dec_estab_count(sdata);
+ sta->mesh->plink_state = params->plink_state;
+
+ ieee80211_mps_sta_status_update(sta);
+ changed |= ieee80211_mps_set_sta_local_pm(sta,
+ NL80211_MESH_POWER_UNKNOWN);
+ break;
+ default:
+ /* nothing */
+ break;
+ }
+ }
+
+ switch (params->plink_action) {
+ case NL80211_PLINK_ACTION_NO_ACTION:
+ /* nothing */
+ break;
+ case NL80211_PLINK_ACTION_OPEN:
+ changed |= mesh_plink_open(sta);
+ break;
+ case NL80211_PLINK_ACTION_BLOCK:
+ changed |= mesh_plink_block(sta);
+ break;
+ }
+
+ if (params->local_pm)
+ changed |= ieee80211_mps_set_sta_local_pm(sta,
+ params->local_pm);
+
+ ieee80211_mbss_info_change_notify(sdata, changed);
+#endif
+}
+
static int sta_apply_parameters(struct ieee80211_local *local,
struct sta_info *sta,
struct station_parameters *params)
@@ -1063,8 +1080,11 @@ static int sta_apply_parameters(struct ieee80211_local *local,
local->hw.queues >= IEEE80211_NUM_ACS)
sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME);
- /* auth flags will be set later for TDLS stations */
- if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+ /* auth flags will be set later for TDLS,
+ * and for unassociated stations that move to associated */
+ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
+ !((mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) &&
+ (set & BIT(NL80211_STA_FLAG_ASSOCIATED)))) {
ret = sta_apply_auth_flags(local, sta, mask, set);
if (ret)
return ret;
@@ -1099,6 +1119,13 @@ static int sta_apply_parameters(struct ieee80211_local *local,
params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH)
set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH);
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
+ !sdata->u.mgd.tdls_wider_bw_prohibited &&
+ ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) &&
+ params->ext_capab_len >= 8 &&
+ params->ext_capab[7] & WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED)
+ set_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW);
+
if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) {
sta->sta.uapsd_queues = params->uapsd_queues;
sta->sta.max_sp = params->max_sp;
@@ -1142,69 +1169,15 @@ static int sta_apply_parameters(struct ieee80211_local *local,
* rc isn't initialized here yet, so ignore it
*/
__ieee80211_vht_handle_opmode(sdata, sta,
- params->opmode_notif,
- band, false);
+ params->opmode_notif, band);
}
- if (ieee80211_vif_is_mesh(&sdata->vif)) {
-#ifdef CONFIG_MAC80211_MESH
- u32 changed = 0;
-
- if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) {
- switch (params->plink_state) {
- case NL80211_PLINK_ESTAB:
- if (sta->plink_state != NL80211_PLINK_ESTAB)
- changed = mesh_plink_inc_estab_count(
- sdata);
- sta->plink_state = params->plink_state;
-
- ieee80211_mps_sta_status_update(sta);
- changed |= ieee80211_mps_set_sta_local_pm(sta,
- sdata->u.mesh.mshcfg.power_mode);
- break;
- case NL80211_PLINK_LISTEN:
- case NL80211_PLINK_BLOCKED:
- case NL80211_PLINK_OPN_SNT:
- case NL80211_PLINK_OPN_RCVD:
- case NL80211_PLINK_CNF_RCVD:
- case NL80211_PLINK_HOLDING:
- if (sta->plink_state == NL80211_PLINK_ESTAB)
- changed = mesh_plink_dec_estab_count(
- sdata);
- sta->plink_state = params->plink_state;
-
- ieee80211_mps_sta_status_update(sta);
- changed |= ieee80211_mps_set_sta_local_pm(sta,
- NL80211_MESH_POWER_UNKNOWN);
- break;
- default:
- /* nothing */
- break;
- }
- }
-
- switch (params->plink_action) {
- case NL80211_PLINK_ACTION_NO_ACTION:
- /* nothing */
- break;
- case NL80211_PLINK_ACTION_OPEN:
- changed |= mesh_plink_open(sta);
- break;
- case NL80211_PLINK_ACTION_BLOCK:
- changed |= mesh_plink_block(sta);
- break;
- }
-
- if (params->local_pm)
- changed |=
- ieee80211_mps_set_sta_local_pm(sta,
- params->local_pm);
- ieee80211_mbss_info_change_notify(sdata, changed);
-#endif
- }
+ if (ieee80211_vif_is_mesh(&sdata->vif))
+ sta_apply_mesh_params(local, sta, params);
/* set the STA state after all sta info from usermode has been set */
- if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) ||
+ set & BIT(NL80211_STA_FLAG_ASSOCIATED)) {
ret = sta_apply_auth_flags(local, sta, mask, set);
if (ret)
return ret;
@@ -1246,12 +1219,14 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
* defaults -- if userspace wants something else we'll
* change it accordingly in sta_apply_parameters()
*/
- if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) {
+ if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) &&
+ !(params->sta_flags_set & (BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_ASSOCIATED)))) {
sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
- } else {
- sta->sta.tdls = true;
}
+ if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))
+ sta->sta.tdls = true;
err = sta_apply_parameters(local, sta, params);
if (err) {
@@ -1260,10 +1235,12 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
}
/*
- * for TDLS, rate control should be initialized only when
- * rates are known and station is marked authorized
+ * for TDLS and for unassociated station, rate control should be
+ * initialized only when rates are known and station is marked
+ * authorized/associated
*/
- if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER))
+ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
+ test_sta_flag(sta, WLAN_STA_ASSOC))
rate_control_rate_init(sta);
layer2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
@@ -1338,7 +1315,10 @@ static int ieee80211_change_station(struct wiphy *wiphy,
break;
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_AP_VLAN:
- statype = CFG80211_STA_AP_CLIENT;
+ if (test_sta_flag(sta, WLAN_STA_ASSOC))
+ statype = CFG80211_STA_AP_CLIENT;
+ else
+ statype = CFG80211_STA_AP_CLIENT_UNASSOC;
break;
default:
err = -EOPNOTSUPP;
@@ -1372,6 +1352,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
}
sta->sdata = vlansdata;
+ ieee80211_check_fast_xmit(sta);
if (sta->sta_state == IEEE80211_STA_AUTHORIZED &&
prev_4addr != new_4addr) {
@@ -1406,7 +1387,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
if (sdata->vif.type == NL80211_IFTYPE_STATION &&
params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
- ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps(local);
ieee80211_recalc_ps_vif(sdata);
}
@@ -1764,7 +1745,7 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy,
/* our RSSI threshold implementation is supported only for
* devices that report signal in dBm.
*/
- if (!(sdata->local->hw.flags & IEEE80211_HW_SIGNAL_DBM))
+ if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM))
return -ENOTSUPP;
conf->rssi_threshold = nconf->rssi_threshold;
}
@@ -2028,12 +2009,12 @@ ieee80211_sched_scan_start(struct wiphy *wiphy,
static int
ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev)
{
- struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = wiphy_priv(wiphy);
- if (!sdata->local->ops->sched_scan_stop)
+ if (!local->ops->sched_scan_stop)
return -EOPNOTSUPP;
- return ieee80211_request_sched_scan_stop(sdata);
+ return ieee80211_request_sched_scan_stop(local);
}
static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev,
@@ -2099,10 +2080,14 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
int err;
if (changed & WIPHY_PARAM_FRAG_THRESHOLD) {
+ ieee80211_check_fast_xmit_all(local);
+
err = drv_set_frag_threshold(local, wiphy->frag_threshold);
- if (err)
+ if (err) {
+ ieee80211_check_fast_xmit_all(local);
return err;
+ }
}
if ((changed & WIPHY_PARAM_COVERAGE_CLASS) ||
@@ -2355,6 +2340,8 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
const u8 *ap;
enum ieee80211_smps_mode old_req;
int err;
+ struct sta_info *sta;
+ bool tdls_peer_found = false;
lockdep_assert_held(&sdata->wdev.mtx);
@@ -2379,11 +2366,22 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
ap = sdata->u.mgd.associated->bssid;
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
+ if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded ||
+ !test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ continue;
+
+ tdls_peer_found = true;
+ break;
+ }
+ rcu_read_unlock();
+
if (smps_mode == IEEE80211_SMPS_AUTOMATIC) {
- if (sdata->u.mgd.powersave)
- smps_mode = IEEE80211_SMPS_DYNAMIC;
- else
+ if (tdls_peer_found || !sdata->u.mgd.powersave)
smps_mode = IEEE80211_SMPS_OFF;
+ else
+ smps_mode = IEEE80211_SMPS_DYNAMIC;
}
/* send SM PS frame to AP */
@@ -2391,6 +2389,8 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
ap, ap);
if (err)
sdata->u.mgd.req_smps = old_req;
+ else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found)
+ ieee80211_teardown_tdls_peers(sdata);
return err;
}
@@ -2404,7 +2404,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EOPNOTSUPP;
- if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS))
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS))
return -EOPNOTSUPP;
if (enabled == sdata->u.mgd.powersave &&
@@ -2419,10 +2419,10 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
__ieee80211_request_smps_mgd(sdata, sdata->u.mgd.req_smps);
sdata_unlock(sdata);
- if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)
+ if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
- ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps(local);
ieee80211_recalc_ps_vif(sdata);
return 0;
@@ -2440,8 +2440,13 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,
rssi_hyst == bss_conf->cqm_rssi_hyst)
return 0;
+ if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER &&
+ !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI))
+ return -EOPNOTSUPP;
+
bss_conf->cqm_rssi_thold = rssi_thold;
bss_conf->cqm_rssi_hyst = rssi_hyst;
+ sdata->u.mgd.last_cqm_event_signal = 0;
/* tell the driver upon association, unless already associated */
if (sdata->u.mgd.associated &&
@@ -2463,7 +2468,7 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
if (!ieee80211_sdata_running(sdata))
return -ENETDOWN;
- if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) {
+ if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
ret = drv_set_bitrate_mask(local, sdata, mask);
if (ret)
return ret;
@@ -2476,16 +2481,28 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
sdata->rc_rateidx_mask[i] = mask->control[i].legacy;
memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].ht_mcs,
sizeof(mask->control[i].ht_mcs));
+ memcpy(sdata->rc_rateidx_vht_mcs_mask[i],
+ mask->control[i].vht_mcs,
+ sizeof(mask->control[i].vht_mcs));
sdata->rc_has_mcs_mask[i] = false;
+ sdata->rc_has_vht_mcs_mask[i] = false;
if (!sband)
continue;
- for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++)
+ for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) {
if (~sdata->rc_rateidx_mcs_mask[i][j]) {
sdata->rc_has_mcs_mask[i] = true;
break;
}
+ }
+
+ for (j = 0; j < NL80211_VHT_NSS_MAX; j++) {
+ if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) {
+ sdata->rc_has_vht_mcs_mask[i] = true;
+ break;
+ }
+ }
}
return 0;
@@ -2514,6 +2531,19 @@ static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local,
return true;
}
+static u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local)
+{
+ lockdep_assert_held(&local->mtx);
+
+ local->roc_cookie_counter++;
+
+ /* wow, you wrapped 64 bits ... more likely a bug */
+ if (WARN_ON(local->roc_cookie_counter == 0))
+ local->roc_cookie_counter++;
+
+ return local->roc_cookie_counter;
+}
+
static int ieee80211_start_roc_work(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *channel,
@@ -2551,7 +2581,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local,
roc->req_duration = duration;
roc->frame = txskb;
roc->type = type;
- roc->mgmt_tx_cookie = (unsigned long)txskb;
roc->sdata = sdata;
INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work);
INIT_LIST_HEAD(&roc->dependents);
@@ -2561,17 +2590,10 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local,
* or the SKB (for mgmt TX)
*/
if (!txskb) {
- /* local->mtx protects this */
- local->roc_cookie_counter++;
- roc->cookie = local->roc_cookie_counter;
- /* wow, you wrapped 64 bits ... more likely a bug */
- if (WARN_ON(roc->cookie == 0)) {
- roc->cookie = 1;
- local->roc_cookie_counter++;
- }
+ roc->cookie = ieee80211_mgmt_tx_cookie(local);
*cookie = roc->cookie;
} else {
- *cookie = (unsigned long)txskb;
+ roc->mgmt_tx_cookie = *cookie;
}
/* if there's one pending or we're scanning, queue this one */
@@ -3244,13 +3266,43 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
return err;
}
+static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local,
+ struct sk_buff *skb, u64 *cookie,
+ gfp_t gfp)
+{
+ unsigned long spin_flags;
+ struct sk_buff *ack_skb;
+ int id;
+
+ ack_skb = skb_copy(skb, gfp);
+ if (!ack_skb)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_irqsave(&local->ack_status_lock, spin_flags);
+ id = idr_alloc(&local->ack_status_frames, ack_skb,
+ 1, 0x10000, GFP_ATOMIC);
+ spin_unlock_irqrestore(&local->ack_status_lock, spin_flags);
+
+ if (id < 0) {
+ kfree_skb(ack_skb);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ IEEE80211_SKB_CB(skb)->ack_frame_id = id;
+
+ *cookie = ieee80211_mgmt_tx_cookie(local);
+ IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie;
+
+ return ack_skb;
+}
+
static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
struct cfg80211_mgmt_tx_params *params,
u64 *cookie)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
struct ieee80211_local *local = sdata->local;
- struct sk_buff *skb;
+ struct sk_buff *skb, *ack_skb;
struct sta_info *sta;
const struct ieee80211_mgmt *mgmt = (void *)params->buf;
bool need_offchan = false;
@@ -3299,8 +3351,14 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
break;
case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_P2P_CLIENT:
- if (!sdata->u.mgd.associated)
+ sdata_lock(sdata);
+ if (!sdata->u.mgd.associated ||
+ (params->offchan && params->wait &&
+ local->ops->remain_on_channel &&
+ memcmp(sdata->u.mgd.associated->bssid,
+ mgmt->bssid, ETH_ALEN)))
need_offchan = true;
+ sdata_unlock(sdata);
break;
case NL80211_IFTYPE_P2P_DEVICE:
need_offchan = true;
@@ -3383,8 +3441,27 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
skb->dev = sdata->dev;
+ if (!params->dont_wait_for_ack) {
+ /* make a copy to preserve the frame contents
+ * in case of encryption.
+ */
+ ack_skb = ieee80211_make_ack_skb(local, skb, cookie,
+ GFP_KERNEL);
+ if (IS_ERR(ack_skb)) {
+ ret = PTR_ERR(ack_skb);
+ kfree_skb(skb);
+ goto out_unlock;
+ }
+ } else {
+ /* Assign a dummy non-zero cookie, it's not sent to
+ * userspace in this case but we rely on its value
+ * internally in the need_offchan case to distinguish
+ * mgmt-tx from remain-on-channel.
+ */
+ *cookie = 0xffffffff;
+ }
+
if (!need_offchan) {
- *cookie = (unsigned long) skb;
ieee80211_tx_skb(sdata, skb);
ret = 0;
goto out_unlock;
@@ -3392,7 +3469,7 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN |
IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
- if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
+ if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
IEEE80211_SKB_CB(skb)->hw_queue =
local->hw.offchannel_tx_hw_queue;
@@ -3421,18 +3498,32 @@ static void ieee80211_mgmt_frame_register(struct wiphy *wiphy,
u16 frame_type, bool reg)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
switch (frame_type) {
case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ:
- if (reg)
+ if (reg) {
local->probe_req_reg++;
- else
- local->probe_req_reg--;
+ sdata->vif.probe_req_reg++;
+ } else {
+ if (local->probe_req_reg)
+ local->probe_req_reg--;
+
+ if (sdata->vif.probe_req_reg)
+ sdata->vif.probe_req_reg--;
+ }
if (!local->open_count)
break;
- ieee80211_queue_work(&local->hw, &local->reconfig_filter);
+ if (sdata->vif.probe_req_reg == 1)
+ drv_config_iface_filter(local, sdata, FIF_PROBE_REQ,
+ FIF_PROBE_REQ);
+ else if (sdata->vif.probe_req_reg == 0)
+ drv_config_iface_filter(local, sdata, 0,
+ FIF_PROBE_REQ);
+
+ ieee80211_configure_filter(local);
break;
default:
break;
@@ -3477,7 +3568,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_local *local = sdata->local;
struct ieee80211_qos_hdr *nullfunc;
- struct sk_buff *skb;
+ struct sk_buff *skb, *ack_skb;
int size = sizeof(*nullfunc);
__le16 fc;
bool qos;
@@ -3485,20 +3576,24 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
struct sta_info *sta;
struct ieee80211_chanctx_conf *chanctx_conf;
enum ieee80211_band band;
+ int ret;
+
+ /* the lock is needed to assign the cookie later */
+ mutex_lock(&local->mtx);
rcu_read_lock();
chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
if (WARN_ON(!chanctx_conf)) {
- rcu_read_unlock();
- return -EINVAL;
+ ret = -EINVAL;
+ goto unlock;
}
band = chanctx_conf->def.chan->band;
sta = sta_info_get_bss(sdata, peer);
if (sta) {
qos = sta->sta.wme;
} else {
- rcu_read_unlock();
- return -ENOLINK;
+ ret = -ENOLINK;
+ goto unlock;
}
if (qos) {
@@ -3514,8 +3609,8 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
skb = dev_alloc_skb(local->hw.extra_tx_headroom + size);
if (!skb) {
- rcu_read_unlock();
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto unlock;
}
skb->dev = dev;
@@ -3541,13 +3636,23 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
if (qos)
nullfunc->qos_ctrl = cpu_to_le16(7);
+ ack_skb = ieee80211_make_ack_skb(local, skb, cookie, GFP_ATOMIC);
+ if (IS_ERR(ack_skb)) {
+ kfree_skb(skb);
+ ret = PTR_ERR(ack_skb);
+ goto unlock;
+ }
+
local_bh_disable();
ieee80211_xmit(sdata, sta, skb);
local_bh_enable();
+
+ ret = 0;
+unlock:
rcu_read_unlock();
+ mutex_unlock(&local->mtx);
- *cookie = (unsigned long) skb;
- return 0;
+ return ret;
}
static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
diff --git a/kernel/net/mac80211/cfg.h b/kernel/net/mac80211/cfg.h
deleted file mode 100644
index 2d51f62dc..000000000
--- a/kernel/net/mac80211/cfg.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * mac80211 configuration hooks for cfg80211
- */
-#ifndef __CFG_H
-#define __CFG_H
-
-extern const struct cfg80211_ops mac80211_config_ops;
-
-#endif /* __CFG_H */
diff --git a/kernel/net/mac80211/chan.c b/kernel/net/mac80211/chan.c
index 5bcd4e558..1d1b9b7bd 100644
--- a/kernel/net/mac80211/chan.c
+++ b/kernel/net/mac80211/chan.c
@@ -190,7 +190,7 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
return NULL;
}
-static enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta)
+enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta)
{
switch (sta->bandwidth) {
case IEEE80211_STA_RX_BW_20:
@@ -264,9 +264,17 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
case NL80211_IFTYPE_AP_VLAN:
width = ieee80211_get_max_required_bw(sdata);
break;
+ case NL80211_IFTYPE_STATION:
+ /*
+ * The ap's sta->bandwidth is not set yet at this
+ * point, so take the width from the chandef, but
+ * account also for TDLS peers
+ */
+ width = max(vif->bss_conf.chandef.width,
+ ieee80211_get_max_required_bw(sdata));
+ break;
case NL80211_IFTYPE_P2P_DEVICE:
continue;
- case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_ADHOC:
case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_MESH_POINT:
@@ -554,12 +562,13 @@ static void ieee80211_free_chanctx(struct ieee80211_local *local,
kfree_rcu(ctx, rcu_head);
}
-static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
- struct ieee80211_chanctx *ctx)
+void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
{
struct ieee80211_chanctx_conf *conf = &ctx->conf;
struct ieee80211_sub_if_data *sdata;
const struct cfg80211_chan_def *compat = NULL;
+ struct sta_info *sta;
lockdep_assert_held(&local->chanctx_mtx);
@@ -581,6 +590,20 @@ static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
if (WARN_ON_ONCE(!compat))
break;
}
+
+ /* TDLS peers can sometimes affect the chandef width */
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (!sta->uploaded ||
+ !test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) ||
+ !test_sta_flag(sta, WLAN_STA_AUTHORIZED) ||
+ !sta->tdls_chandef.chan)
+ continue;
+
+ compat = cfg80211_chandef_compatible(&sta->tdls_chandef,
+ compat);
+ if (WARN_ON_ONCE(!compat))
+ break;
+ }
rcu_read_unlock();
if (!compat)
@@ -664,6 +687,8 @@ out:
ieee80211_bss_info_change_notify(sdata,
BSS_CHANGED_IDLE);
+ ieee80211_check_fast_xmit_iface(sdata);
+
return ret;
}
@@ -1008,6 +1033,8 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
if (WARN_ON(!chandef))
return -EINVAL;
+ ieee80211_change_chanctx(local, new_ctx, chandef);
+
vif_chsw[0].vif = &sdata->vif;
vif_chsw[0].old_ctx = &old_ctx->conf;
vif_chsw[0].new_ctx = &new_ctx->conf;
@@ -1030,6 +1057,8 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
if (sdata->vif.type == NL80211_IFTYPE_AP)
__ieee80211_vif_copy_chanctx_to_vlans(sdata, false);
+ ieee80211_check_fast_xmit_iface(sdata);
+
if (ieee80211_chanctx_refcount(local, old_ctx) == 0)
ieee80211_free_chanctx(local, old_ctx);
@@ -1079,6 +1108,8 @@ ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata)
if (WARN_ON(!chandef))
return -EINVAL;
+ ieee80211_change_chanctx(local, new_ctx, chandef);
+
list_del(&sdata->reserved_chanctx_list);
sdata->reserved_chanctx = NULL;
@@ -1376,6 +1407,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
__ieee80211_vif_copy_chanctx_to_vlans(sdata,
false);
+ ieee80211_check_fast_xmit_iface(sdata);
+
sdata->radar_required = sdata->reserved_radar_required;
if (sdata->vif.bss_conf.chandef.width !=
diff --git a/kernel/net/mac80211/debugfs.c b/kernel/net/mac80211/debugfs.c
index 23813ebb3..4d2aaebd4 100644
--- a/kernel/net/mac80211/debugfs.c
+++ b/kernel/net/mac80211/debugfs.c
@@ -1,4 +1,3 @@
-
/*
* mac80211 debugfs for wireless PHYs
*
@@ -92,62 +91,69 @@ static const struct file_operations reset_ops = {
};
#endif
+static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
+#define FLAG(F) [IEEE80211_HW_##F] = #F
+ FLAG(HAS_RATE_CONTROL),
+ FLAG(RX_INCLUDES_FCS),
+ FLAG(HOST_BROADCAST_PS_BUFFERING),
+ FLAG(SIGNAL_UNSPEC),
+ FLAG(SIGNAL_DBM),
+ FLAG(NEED_DTIM_BEFORE_ASSOC),
+ FLAG(SPECTRUM_MGMT),
+ FLAG(AMPDU_AGGREGATION),
+ FLAG(SUPPORTS_PS),
+ FLAG(PS_NULLFUNC_STACK),
+ FLAG(SUPPORTS_DYNAMIC_PS),
+ FLAG(MFP_CAPABLE),
+ FLAG(WANT_MONITOR_VIF),
+ FLAG(NO_AUTO_VIF),
+ FLAG(SW_CRYPTO_CONTROL),
+ FLAG(SUPPORT_FAST_XMIT),
+ FLAG(REPORTS_TX_ACK_STATUS),
+ FLAG(CONNECTION_MONITOR),
+ FLAG(QUEUE_CONTROL),
+ FLAG(SUPPORTS_PER_STA_GTK),
+ FLAG(AP_LINK_PS),
+ FLAG(TX_AMPDU_SETUP_IN_HW),
+ FLAG(SUPPORTS_RC_TABLE),
+ FLAG(P2P_DEV_ADDR_FOR_INTF),
+ FLAG(TIMING_BEACON_ONLY),
+ FLAG(SUPPORTS_HT_CCK_RATES),
+ FLAG(CHANCTX_STA_CSA),
+ FLAG(SUPPORTS_CLONED_SKBS),
+ FLAG(SINGLE_SCAN_ON_ALL_BANDS),
+ FLAG(TDLS_WIDER_BW),
+ FLAG(SUPPORTS_AMSDU_IN_AMPDU),
+ FLAG(BEACON_TX_STATUS),
+
+ /* keep last for the build bug below */
+ (void *)0x1
+#undef FLAG
+};
+
static ssize_t hwflags_read(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
struct ieee80211_local *local = file->private_data;
- int mxln = 500;
+ size_t bufsz = 30 * NUM_IEEE80211_HW_FLAGS;
+ char *buf = kzalloc(bufsz, GFP_KERNEL);
+ char *pos = buf, *end = buf + bufsz - 1;
ssize_t rv;
- char *buf = kzalloc(mxln, GFP_KERNEL);
- int sf = 0; /* how many written so far */
+ int i;
if (!buf)
- return 0;
-
- sf += scnprintf(buf, mxln - sf, "0x%x\n", local->hw.flags);
- if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
- sf += scnprintf(buf + sf, mxln - sf, "HAS_RATE_CONTROL\n");
- if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)
- sf += scnprintf(buf + sf, mxln - sf, "RX_INCLUDES_FCS\n");
- if (local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING)
- sf += scnprintf(buf + sf, mxln - sf,
- "HOST_BCAST_PS_BUFFERING\n");
- if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)
- sf += scnprintf(buf + sf, mxln - sf,
- "2GHZ_SHORT_SLOT_INCAPABLE\n");
- if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)
- sf += scnprintf(buf + sf, mxln - sf,
- "2GHZ_SHORT_PREAMBLE_INCAPABLE\n");
- if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
- sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_UNSPEC\n");
- if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
- sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_DBM\n");
- if (local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC)
- sf += scnprintf(buf + sf, mxln - sf,
- "NEED_DTIM_BEFORE_ASSOC\n");
- if (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT)
- sf += scnprintf(buf + sf, mxln - sf, "SPECTRUM_MGMT\n");
- if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION)
- sf += scnprintf(buf + sf, mxln - sf, "AMPDU_AGGREGATION\n");
- if (local->hw.flags & IEEE80211_HW_SUPPORTS_PS)
- sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PS\n");
- if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)
- sf += scnprintf(buf + sf, mxln - sf, "PS_NULLFUNC_STACK\n");
- if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)
- sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_PS\n");
- if (local->hw.flags & IEEE80211_HW_MFP_CAPABLE)
- sf += scnprintf(buf + sf, mxln - sf, "MFP_CAPABLE\n");
- if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
- sf += scnprintf(buf + sf, mxln - sf,
- "REPORTS_TX_ACK_STATUS\n");
- if (local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
- sf += scnprintf(buf + sf, mxln - sf, "CONNECTION_MONITOR\n");
- if (local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK)
- sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PER_STA_GTK\n");
- if (local->hw.flags & IEEE80211_HW_AP_LINK_PS)
- sf += scnprintf(buf + sf, mxln - sf, "AP_LINK_PS\n");
- if (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)
- sf += scnprintf(buf + sf, mxln - sf, "TX_AMPDU_SETUP_IN_HW\n");
+ return -ENOMEM;
+
+ /* fail compilation if somebody adds or removes
+ * a flag without updating the name array above
+ */
+ BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1);
+
+ for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) {
+ if (test_bit(i, local->hw.flags))
+ pos += scnprintf(pos, end - pos, "%s\n",
+ hw_flag_names[i]);
+ }
rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
kfree(buf);
@@ -219,8 +225,8 @@ static const struct file_operations stats_ ##name## _ops = { \
.llseek = generic_file_llseek, \
};
-#define DEBUGFS_STATS_ADD(name, field) \
- debugfs_create_u32(#name, 0400, statsd, (u32 *) &field);
+#define DEBUGFS_STATS_ADD(name) \
+ debugfs_create_u32(#name, 0400, statsd, &local->name);
#define DEBUGFS_DEVSTATS_ADD(name) \
debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops);
@@ -255,53 +261,30 @@ void debugfs_hw_add(struct ieee80211_local *local)
if (!statsd)
return;
- DEBUGFS_STATS_ADD(transmitted_fragment_count,
- local->dot11TransmittedFragmentCount);
- DEBUGFS_STATS_ADD(multicast_transmitted_frame_count,
- local->dot11MulticastTransmittedFrameCount);
- DEBUGFS_STATS_ADD(failed_count, local->dot11FailedCount);
- DEBUGFS_STATS_ADD(retry_count, local->dot11RetryCount);
- DEBUGFS_STATS_ADD(multiple_retry_count,
- local->dot11MultipleRetryCount);
- DEBUGFS_STATS_ADD(frame_duplicate_count,
- local->dot11FrameDuplicateCount);
- DEBUGFS_STATS_ADD(received_fragment_count,
- local->dot11ReceivedFragmentCount);
- DEBUGFS_STATS_ADD(multicast_received_frame_count,
- local->dot11MulticastReceivedFrameCount);
- DEBUGFS_STATS_ADD(transmitted_frame_count,
- local->dot11TransmittedFrameCount);
#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
- DEBUGFS_STATS_ADD(tx_handlers_drop, local->tx_handlers_drop);
- DEBUGFS_STATS_ADD(tx_handlers_queued, local->tx_handlers_queued);
- DEBUGFS_STATS_ADD(tx_handlers_drop_fragment,
- local->tx_handlers_drop_fragment);
- DEBUGFS_STATS_ADD(tx_handlers_drop_wep,
- local->tx_handlers_drop_wep);
- DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc,
- local->tx_handlers_drop_not_assoc);
- DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port,
- local->tx_handlers_drop_unauth_port);
- DEBUGFS_STATS_ADD(rx_handlers_drop, local->rx_handlers_drop);
- DEBUGFS_STATS_ADD(rx_handlers_queued, local->rx_handlers_queued);
- DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc,
- local->rx_handlers_drop_nullfunc);
- DEBUGFS_STATS_ADD(rx_handlers_drop_defrag,
- local->rx_handlers_drop_defrag);
- DEBUGFS_STATS_ADD(rx_handlers_drop_short,
- local->rx_handlers_drop_short);
- DEBUGFS_STATS_ADD(tx_expand_skb_head,
- local->tx_expand_skb_head);
- DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned,
- local->tx_expand_skb_head_cloned);
- DEBUGFS_STATS_ADD(rx_expand_skb_head,
- local->rx_expand_skb_head);
- DEBUGFS_STATS_ADD(rx_expand_skb_head2,
- local->rx_expand_skb_head2);
- DEBUGFS_STATS_ADD(rx_handlers_fragments,
- local->rx_handlers_fragments);
- DEBUGFS_STATS_ADD(tx_status_drop,
- local->tx_status_drop);
+ DEBUGFS_STATS_ADD(dot11TransmittedFragmentCount);
+ DEBUGFS_STATS_ADD(dot11MulticastTransmittedFrameCount);
+ DEBUGFS_STATS_ADD(dot11FailedCount);
+ DEBUGFS_STATS_ADD(dot11RetryCount);
+ DEBUGFS_STATS_ADD(dot11MultipleRetryCount);
+ DEBUGFS_STATS_ADD(dot11FrameDuplicateCount);
+ DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount);
+ DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount);
+ DEBUGFS_STATS_ADD(dot11TransmittedFrameCount);
+ DEBUGFS_STATS_ADD(tx_handlers_drop);
+ DEBUGFS_STATS_ADD(tx_handlers_queued);
+ DEBUGFS_STATS_ADD(tx_handlers_drop_wep);
+ DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc);
+ DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port);
+ DEBUGFS_STATS_ADD(rx_handlers_drop);
+ DEBUGFS_STATS_ADD(rx_handlers_queued);
+ DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc);
+ DEBUGFS_STATS_ADD(rx_handlers_drop_defrag);
+ DEBUGFS_STATS_ADD(tx_expand_skb_head);
+ DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned);
+ DEBUGFS_STATS_ADD(rx_expand_skb_head_defrag);
+ DEBUGFS_STATS_ADD(rx_handlers_fragments);
+ DEBUGFS_STATS_ADD(tx_status_drop);
#endif
DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount);
DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount);
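
The hw_flag_names[] table above relies on a sentinel entry one slot past the last flag, so the BUILD_BUG_ON() in hwflags_read() fails the build whenever the flag enum grows without the name table being updated. A minimal sketch of the same pattern with invented names:

/* Sketch only: the sentinel + BUILD_BUG_ON pattern used for
 * hw_flag_names[] above, with an invented enum for illustration.
 */
#include <linux/bug.h>

enum example_flag {
	EXAMPLE_FLAG_FOO,
	EXAMPLE_FLAG_BAR,
	NUM_EXAMPLE_FLAGS
};

static const char *example_flag_names[NUM_EXAMPLE_FLAGS + 1] = {
#define FLAG(F) [EXAMPLE_FLAG_##F] = #F
	FLAG(FOO),
	FLAG(BAR),
	/* keep last for the build bug below */
	(void *)0x1
#undef FLAG
};

static const char *example_flag_name(unsigned int flag)
{
	/* fails to build if the enum grew but the table did not */
	BUILD_BUG_ON(example_flag_names[NUM_EXAMPLE_FLAGS] != (void *)0x1);

	return flag < NUM_EXAMPLE_FLAGS ? example_flag_names[flag] : NULL;
}
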
diff --git a/kernel/net/mac80211/debugfs_key.c b/kernel/net/mac80211/debugfs_key.c
index 71ac1b5f4..7961e7d0b 100644
--- a/kernel/net/mac80211/debugfs_key.c
+++ b/kernel/net/mac80211/debugfs_key.c
@@ -2,6 +2,7 @@
* Copyright 2003-2005 Devicescape Software, Inc.
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2015 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -34,6 +35,14 @@ static const struct file_operations key_ ##name## _ops = { \
.llseek = generic_file_llseek, \
}
+#define KEY_OPS_W(name) \
+static const struct file_operations key_ ##name## _ops = { \
+ .read = key_##name##_read, \
+ .write = key_##name##_write, \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+}
+
#define KEY_FILE(name, format) \
KEY_READ_##format(name) \
KEY_OPS(name)
@@ -57,7 +66,6 @@ KEY_CONF_FILE(keylen, D);
KEY_CONF_FILE(keyidx, D);
KEY_CONF_FILE(hw_key_idx, D);
KEY_FILE(flags, X);
-KEY_FILE(tx_rx_count, D);
KEY_READ(ifindex, sdata->name, "%s\n");
KEY_OPS(ifindex);
@@ -75,6 +83,41 @@ static ssize_t key_algorithm_read(struct file *file,
}
KEY_OPS(algorithm);
+static ssize_t key_tx_spec_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_key *key = file->private_data;
+ u64 pn;
+ int ret;
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ return -EINVAL;
+ case WLAN_CIPHER_SUITE_TKIP:
+ /* not supported yet */
+ return -EOPNOTSUPP;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ ret = kstrtou64_from_user(userbuf, count, 16, &pn);
+ if (ret)
+ return ret;
+ /* PN is a 48-bit counter */
+ if (pn >= (1ULL << 48))
+ return -ERANGE;
+ atomic64_set(&key->conf.tx_pn, pn);
+ return count;
+ default:
+ return 0;
+ }
+}
+
static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
@@ -95,28 +138,13 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
break;
case WLAN_CIPHER_SUITE_CCMP:
case WLAN_CIPHER_SUITE_CCMP_256:
- pn = atomic64_read(&key->u.ccmp.tx_pn);
- len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
- (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24),
- (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn);
- break;
case WLAN_CIPHER_SUITE_AES_CMAC:
case WLAN_CIPHER_SUITE_BIP_CMAC_256:
- pn = atomic64_read(&key->u.aes_cmac.tx_pn);
- len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
- (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24),
- (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn);
- break;
case WLAN_CIPHER_SUITE_BIP_GMAC_128:
case WLAN_CIPHER_SUITE_BIP_GMAC_256:
- pn = atomic64_read(&key->u.aes_gmac.tx_pn);
- len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
- (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24),
- (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn);
- break;
case WLAN_CIPHER_SUITE_GCMP:
case WLAN_CIPHER_SUITE_GCMP_256:
- pn = atomic64_read(&key->u.gcmp.tx_pn);
+ pn = atomic64_read(&key->conf.tx_pn);
len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
(u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24),
(u8)(pn >> 16), (u8)(pn >> 8), (u8)pn);
@@ -126,7 +154,7 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
}
return simple_read_from_buffer(userbuf, count, ppos, buf, len);
}
-KEY_OPS(tx_spec);
+KEY_OPS_W(tx_spec);
static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
@@ -294,6 +322,9 @@ KEY_OPS(key);
#define DEBUGFS_ADD(name) \
debugfs_create_file(#name, 0400, key->debugfs.dir, \
key, &key_##name##_ops);
+#define DEBUGFS_ADD_W(name) \
+ debugfs_create_file(#name, 0600, key->debugfs.dir, \
+ key, &key_##name##_ops);
void ieee80211_debugfs_key_add(struct ieee80211_key *key)
{
@@ -325,9 +356,8 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key)
DEBUGFS_ADD(flags);
DEBUGFS_ADD(keyidx);
DEBUGFS_ADD(hw_key_idx);
- DEBUGFS_ADD(tx_rx_count);
DEBUGFS_ADD(algorithm);
- DEBUGFS_ADD(tx_spec);
+ DEBUGFS_ADD_W(tx_spec);
DEBUGFS_ADD(rx_spec);
DEBUGFS_ADD(replays);
DEBUGFS_ADD(icverrors);
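
For context on the new tx_spec write handler, a self-contained userspace sketch of the same 48-bit PN handling: parse a hex value, range-check it, and print it byte-wise the way the read side does. strtoull() stands in for kstrtou64_from_user(); nothing below belongs to the patch itself.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_pn(const char *buf, uint64_t *pn)
{
	char *end;

	errno = 0;
	*pn = strtoull(buf, &end, 16);
	if (errno || end == buf)
		return -EINVAL;
	if (*pn >= (1ULL << 48))	/* PN is a 48-bit counter */
		return -ERANGE;
	return 0;
}

int main(void)
{
	uint64_t pn;

	if (parse_pn("0123456789ab", &pn))
		return 1;

	/* same byte-wise formatting as the tx_spec read handler */
	printf("%02x%02x%02x%02x%02x%02x\n",
	       (unsigned)(pn >> 40) & 0xff, (unsigned)(pn >> 32) & 0xff,
	       (unsigned)(pn >> 24) & 0xff, (unsigned)(pn >> 16) & 0xff,
	       (unsigned)(pn >> 8) & 0xff, (unsigned)pn & 0xff);
	return 0;
}
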
diff --git a/kernel/net/mac80211/debugfs_netdev.c b/kernel/net/mac80211/debugfs_netdev.c
index c09c0131b..37ea30e07 100644
--- a/kernel/net/mac80211/debugfs_netdev.c
+++ b/kernel/net/mac80211/debugfs_netdev.c
@@ -114,14 +114,6 @@ static ssize_t ieee80211_if_fmt_##name( \
return scnprintf(buf, buflen, "%pM\n", sdata->field); \
}
-#define IEEE80211_IF_FMT_DEC_DIV_16(name, field) \
-static ssize_t ieee80211_if_fmt_##name( \
- const struct ieee80211_sub_if_data *sdata, \
- char *buf, int buflen) \
-{ \
- return scnprintf(buf, buflen, "%d\n", sdata->field / 16); \
-}
-
#define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, field) \
static ssize_t ieee80211_if_fmt_##name( \
const struct ieee80211_sub_if_data *sdata, \
@@ -186,6 +178,38 @@ IEEE80211_IF_FILE(rc_rateidx_mcs_mask_2ghz,
IEEE80211_IF_FILE(rc_rateidx_mcs_mask_5ghz,
rc_rateidx_mcs_mask[IEEE80211_BAND_5GHZ], HEXARRAY);
+static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_2ghz(
+ const struct ieee80211_sub_if_data *sdata,
+ char *buf, int buflen)
+{
+ int i, len = 0;
+ const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_2GHZ];
+
+ for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
+ len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]);
+ len += scnprintf(buf + len, buflen - len, "\n");
+
+ return len;
+}
+
+IEEE80211_IF_FILE_R(rc_rateidx_vht_mcs_mask_2ghz);
+
+static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_5ghz(
+ const struct ieee80211_sub_if_data *sdata,
+ char *buf, int buflen)
+{
+ int i, len = 0;
+ const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_5GHZ];
+
+ for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
+ len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]);
+ len += scnprintf(buf + len, buflen - len, "\n");
+
+ return len;
+}
+
+IEEE80211_IF_FILE_R(rc_rateidx_vht_mcs_mask_5ghz);
+
IEEE80211_IF_FILE(flags, flags, HEX);
IEEE80211_IF_FILE(state, state, LHEX);
IEEE80211_IF_FILE(txpower, vif.bss_conf.txpower, DEC);
@@ -215,8 +239,6 @@ IEEE80211_IF_FILE_R(hw_queues);
/* STA attributes */
IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC);
IEEE80211_IF_FILE(aid, u.mgd.aid, DEC);
-IEEE80211_IF_FILE(last_beacon, u.mgd.last_beacon_signal, DEC);
-IEEE80211_IF_FILE(ave_beacon, u.mgd.ave_beacon_signal, DEC_DIV_16);
IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS);
static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
@@ -423,6 +445,34 @@ static ssize_t ieee80211_if_parse_uapsd_max_sp_len(
}
IEEE80211_IF_FILE_RW(uapsd_max_sp_len);
+static ssize_t ieee80211_if_fmt_tdls_wider_bw(
+ const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
+{
+ const struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ bool tdls_wider_bw;
+
+ tdls_wider_bw = ieee80211_hw_check(&sdata->local->hw, TDLS_WIDER_BW) &&
+ !ifmgd->tdls_wider_bw_prohibited;
+
+ return snprintf(buf, buflen, "%d\n", tdls_wider_bw);
+}
+
+static ssize_t ieee80211_if_parse_tdls_wider_bw(
+ struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u8 val;
+ int ret;
+
+ ret = kstrtou8(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ ifmgd->tdls_wider_bw_prohibited = !val;
+ return buflen;
+}
+IEEE80211_IF_FILE_RW(tdls_wider_bw);
+
/* AP attributes */
IEEE80211_IF_FILE(num_mcast_sta, u.ap.num_mcast_sta, ATOMIC);
IEEE80211_IF_FILE(num_sta_ps, u.ap.ps.num_sta_ps, ATOMIC);
@@ -565,6 +615,8 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD(rc_rateidx_mask_5ghz);
DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz);
DEBUGFS_ADD(rc_rateidx_mcs_mask_5ghz);
+ DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_2ghz);
+ DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz);
DEBUGFS_ADD(hw_queues);
}
@@ -572,14 +624,13 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata)
{
DEBUGFS_ADD(bssid);
DEBUGFS_ADD(aid);
- DEBUGFS_ADD(last_beacon);
- DEBUGFS_ADD(ave_beacon);
DEBUGFS_ADD(beacon_timeout);
DEBUGFS_ADD_MODE(smps, 0600);
DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
DEBUGFS_ADD_MODE(beacon_loss, 0200);
DEBUGFS_ADD_MODE(uapsd_queues, 0600);
DEBUGFS_ADD_MODE(uapsd_max_sp_len, 0600);
+ DEBUGFS_ADD_MODE(tdls_wider_bw, 0600);
}
static void add_ap_files(struct ieee80211_sub_if_data *sdata)
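
The new rc_rateidx_vht_mcs_mask files format one 16-bit mask per spatial stream. A standalone sketch of that formatting loop, with 8 streams assumed in place of NL80211_VHT_NSS_MAX and snprintf() standing in for scnprintf():

#include <stdio.h>

#define VHT_NSS_MAX 8	/* assumption standing in for NL80211_VHT_NSS_MAX */

static int format_vht_mask(char *buf, int buflen, const unsigned short *mask)
{
	int i, len = 0;

	for (i = 0; i < VHT_NSS_MAX; i++)
		len += snprintf(buf + len, buflen - len, "%04x ",
				(unsigned)mask[i]);
	len += snprintf(buf + len, buflen - len, "\n");

	return len;
}

int main(void)
{
	const unsigned short mask[VHT_NSS_MAX] = {
		0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff
	};
	char buf[64];

	format_vht_mask(buf, sizeof(buf), mask);
	fputs(buf, stdout);
	return 0;
}
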
diff --git a/kernel/net/mac80211/debugfs_sta.c b/kernel/net/mac80211/debugfs_sta.c
index 252859e90..a39512f09 100644
--- a/kernel/net/mac80211/debugfs_sta.c
+++ b/kernel/net/mac80211/debugfs_sta.c
@@ -29,8 +29,6 @@ static ssize_t sta_ ##name## _read(struct file *file, \
format_string, sta->field); \
}
#define STA_READ_D(name, field) STA_READ(name, field, "%d\n")
-#define STA_READ_U(name, field) STA_READ(name, field, "%u\n")
-#define STA_READ_S(name, field) STA_READ(name, field, "%s\n")
#define STA_OPS(name) \
static const struct file_operations sta_ ##name## _ops = { \
@@ -52,10 +50,6 @@ static const struct file_operations sta_ ##name## _ops = { \
STA_OPS(name)
STA_FILE(aid, sta.aid, D);
-STA_FILE(dev, sdata->name, S);
-STA_FILE(last_signal, last_signal, D);
-STA_FILE(last_ack_signal, last_ack_signal, D);
-STA_FILE(beacon_loss_count, beacon_loss_count, D);
static ssize_t sta_flags_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
@@ -101,40 +95,6 @@ static ssize_t sta_num_ps_buf_frames_read(struct file *file,
}
STA_OPS(num_ps_buf_frames);
-static ssize_t sta_inactive_ms_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
-{
- struct sta_info *sta = file->private_data;
- return mac80211_format_buffer(userbuf, count, ppos, "%d\n",
- jiffies_to_msecs(jiffies - sta->last_rx));
-}
-STA_OPS(inactive_ms);
-
-
-static ssize_t sta_connected_time_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
-{
- struct sta_info *sta = file->private_data;
- struct timespec uptime;
- struct tm result;
- long connected_time_secs;
- char buf[100];
- int res;
- ktime_get_ts(&uptime);
- connected_time_secs = uptime.tv_sec - sta->last_connected;
- time_to_tm(connected_time_secs, 0, &result);
- result.tm_year -= 70;
- result.tm_mday -= 1;
- res = scnprintf(buf, sizeof(buf),
- "years - %ld\nmonths - %d\ndays - %d\nclock - %d:%d:%d\n\n",
- result.tm_year, result.tm_mon, result.tm_mday,
- result.tm_hour, result.tm_min, result.tm_sec);
- return simple_read_from_buffer(userbuf, count, ppos, buf, res);
-}
-STA_OPS(connected_time);
-
-
-
static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
@@ -359,37 +319,6 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
}
STA_OPS(vht_capa);
-static ssize_t sta_current_tx_rate_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
-{
- struct sta_info *sta = file->private_data;
- struct rate_info rinfo;
- u16 rate;
- sta_set_rate_info_tx(sta, &sta->last_tx_rate, &rinfo);
- rate = cfg80211_calculate_bitrate(&rinfo);
-
- return mac80211_format_buffer(userbuf, count, ppos,
- "%d.%d MBit/s\n",
- rate/10, rate%10);
-}
-STA_OPS(current_tx_rate);
-
-static ssize_t sta_last_rx_rate_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
-{
- struct sta_info *sta = file->private_data;
- struct rate_info rinfo;
- u16 rate;
-
- sta_set_rate_info_rx(sta, &rinfo);
-
- rate = cfg80211_calculate_bitrate(&rinfo);
-
- return mac80211_format_buffer(userbuf, count, ppos,
- "%d.%d MBit/s\n",
- rate/10, rate%10);
-}
-STA_OPS(last_rx_rate);
#define DEBUGFS_ADD(name) \
debugfs_create_file(#name, 0400, \
@@ -432,30 +361,14 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
DEBUGFS_ADD(flags);
DEBUGFS_ADD(num_ps_buf_frames);
- DEBUGFS_ADD(inactive_ms);
- DEBUGFS_ADD(connected_time);
DEBUGFS_ADD(last_seq_ctrl);
DEBUGFS_ADD(agg_status);
- DEBUGFS_ADD(dev);
- DEBUGFS_ADD(last_signal);
- DEBUGFS_ADD(beacon_loss_count);
DEBUGFS_ADD(ht_capa);
DEBUGFS_ADD(vht_capa);
- DEBUGFS_ADD(last_ack_signal);
- DEBUGFS_ADD(current_tx_rate);
- DEBUGFS_ADD(last_rx_rate);
-
- DEBUGFS_ADD_COUNTER(rx_packets, rx_packets);
- DEBUGFS_ADD_COUNTER(tx_packets, tx_packets);
- DEBUGFS_ADD_COUNTER(rx_bytes, rx_bytes);
- DEBUGFS_ADD_COUNTER(tx_bytes, tx_bytes);
- DEBUGFS_ADD_COUNTER(rx_duplicates, num_duplicates);
- DEBUGFS_ADD_COUNTER(rx_fragments, rx_fragments);
- DEBUGFS_ADD_COUNTER(rx_dropped, rx_dropped);
- DEBUGFS_ADD_COUNTER(tx_fragments, tx_fragments);
- DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count);
- DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed);
- DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count);
+
+ DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates);
+ DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments);
+ DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered);
if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
debugfs_create_x32("driver_buffered_tids", 0400,
diff --git a/kernel/net/mac80211/driver-ops.c b/kernel/net/mac80211/driver-ops.c
new file mode 100644
index 000000000..ca1fe5576
--- /dev/null
+++ b/kernel/net/mac80211/driver-ops.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright 2015 Intel Deutschland GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "trace.h"
+#include "driver-ops.h"
+
+int drv_start(struct ieee80211_local *local)
+{
+ int ret;
+
+ might_sleep();
+
+ if (WARN_ON(local->started))
+ return -EALREADY;
+
+ trace_drv_start(local);
+ local->started = true;
+ /* allow rx frames */
+ smp_mb();
+ ret = local->ops->start(&local->hw);
+ trace_drv_return_int(local, ret);
+
+ if (ret)
+ local->started = false;
+
+ return ret;
+}
+
+void drv_stop(struct ieee80211_local *local)
+{
+ might_sleep();
+
+ if (WARN_ON(!local->started))
+ return;
+
+ trace_drv_stop(local);
+ local->ops->stop(&local->hw);
+ trace_drv_return_void(local);
+
+ /* sync away all work on the tasklet before clearing started */
+ tasklet_disable(&local->tasklet);
+ tasklet_enable(&local->tasklet);
+
+ barrier();
+
+ local->started = false;
+}
+
+int drv_add_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ int ret;
+
+ might_sleep();
+
+ if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
+ !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) &&
+ !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))))
+ return -EINVAL;
+
+ trace_drv_add_interface(local, sdata);
+ ret = local->ops->add_interface(&local->hw, &sdata->vif);
+ trace_drv_return_int(local, ret);
+
+ if (ret == 0)
+ sdata->flags |= IEEE80211_SDATA_IN_DRIVER;
+
+ return ret;
+}
+
+int drv_change_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type, bool p2p)
+{
+ int ret;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_change_interface(local, sdata, type, p2p);
+ ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+void drv_remove_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_remove_interface(local, sdata);
+ local->ops->remove_interface(&local->hw, &sdata->vif);
+ sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER;
+ trace_drv_return_void(local);
+}
+
+__must_check
+int drv_sta_state(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ enum ieee80211_sta_state old_state,
+ enum ieee80211_sta_state new_state)
+{
+ int ret = 0;
+
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state);
+ if (local->ops->sta_state) {
+ ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta,
+ old_state, new_state);
+ } else if (old_state == IEEE80211_STA_AUTH &&
+ new_state == IEEE80211_STA_ASSOC) {
+ ret = drv_sta_add(local, sdata, &sta->sta);
+ if (ret == 0)
+ sta->uploaded = true;
+ } else if (old_state == IEEE80211_STA_ASSOC &&
+ new_state == IEEE80211_STA_AUTH) {
+ drv_sta_remove(local, sdata, &sta->sta);
+ }
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+void drv_sta_rc_update(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, u32 changed)
+{
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED &&
+ (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT));
+
+ trace_drv_sta_rc_update(local, sdata, sta, changed);
+ if (local->ops->sta_rc_update)
+ local->ops->sta_rc_update(&local->hw, &sdata->vif,
+ sta, changed);
+
+ trace_drv_return_void(local);
+}
+
+int drv_conf_tx(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata, u16 ac,
+ const struct ieee80211_tx_queue_params *params)
+{
+ int ret = -EOPNOTSUPP;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ if (WARN_ONCE(params->cw_min == 0 ||
+ params->cw_min > params->cw_max,
+ "%s: invalid CW_min/CW_max: %d/%d\n",
+ sdata->name, params->cw_min, params->cw_max))
+ return -EINVAL;
+
+ trace_drv_conf_tx(local, sdata, ac, params);
+ if (local->ops->conf_tx)
+ ret = local->ops->conf_tx(&local->hw, &sdata->vif,
+ ac, params);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+u64 drv_get_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ u64 ret = -1ULL;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return ret;
+
+ trace_drv_get_tsf(local, sdata);
+ if (local->ops->get_tsf)
+ ret = local->ops->get_tsf(&local->hw, &sdata->vif);
+ trace_drv_return_u64(local, ret);
+ return ret;
+}
+
+void drv_set_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ u64 tsf)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_set_tsf(local, sdata, tsf);
+ if (local->ops->set_tsf)
+ local->ops->set_tsf(&local->hw, &sdata->vif, tsf);
+ trace_drv_return_void(local);
+}
+
+void drv_reset_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_reset_tsf(local, sdata);
+ if (local->ops->reset_tsf)
+ local->ops->reset_tsf(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+int drv_switch_vif_chanctx(struct ieee80211_local *local,
+ struct ieee80211_vif_chanctx_switch *vifs,
+ int n_vifs, enum ieee80211_chanctx_switch_mode mode)
+{
+ int ret = 0;
+ int i;
+
+ might_sleep();
+
+ if (!local->ops->switch_vif_chanctx)
+ return -EOPNOTSUPP;
+
+ for (i = 0; i < n_vifs; i++) {
+ struct ieee80211_chanctx *new_ctx =
+ container_of(vifs[i].new_ctx,
+ struct ieee80211_chanctx,
+ conf);
+ struct ieee80211_chanctx *old_ctx =
+ container_of(vifs[i].old_ctx,
+ struct ieee80211_chanctx,
+ conf);
+
+ WARN_ON_ONCE(!old_ctx->driver_present);
+ WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS &&
+ new_ctx->driver_present) ||
+ (mode == CHANCTX_SWMODE_REASSIGN_VIF &&
+ !new_ctx->driver_present));
+ }
+
+ trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode);
+ ret = local->ops->switch_vif_chanctx(&local->hw,
+ vifs, n_vifs, mode);
+ trace_drv_return_int(local, ret);
+
+ if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) {
+ for (i = 0; i < n_vifs; i++) {
+ struct ieee80211_chanctx *new_ctx =
+ container_of(vifs[i].new_ctx,
+ struct ieee80211_chanctx,
+ conf);
+ struct ieee80211_chanctx *old_ctx =
+ container_of(vifs[i].old_ctx,
+ struct ieee80211_chanctx,
+ conf);
+
+ new_ctx->driver_present = true;
+ old_ctx->driver_present = false;
+ }
+ }
+
+ return ret;
+}
+
+int drv_ampdu_action(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_ampdu_mlme_action action,
+ struct ieee80211_sta *sta, u16 tid,
+ u16 *ssn, u8 buf_size, bool amsdu)
+{
+ int ret = -EOPNOTSUPP;
+
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_ampdu_action(local, sdata, action, sta, tid,
+ ssn, buf_size, amsdu);
+
+ if (local->ops->ampdu_action)
+ ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action,
+ sta, tid, ssn, buf_size, amsdu);
+
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
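
Each helper in the new driver-ops.c follows the same shape: check that the interface is known to the driver, trace the call, invoke the driver callback only if it is implemented, and trace the return value. A toy userspace rendering of that pattern (all names below are invented; the printf calls merely stand in for the trace points):

#include <stdio.h>

#define EOPNOTSUPP 95
#define EIO 5

struct fake_ops {
	int (*conf_tx)(int ac);	/* optional driver callback */
};

struct fake_local {
	const struct fake_ops *ops;
	int in_driver;
};

static int drv_conf_tx_sketch(struct fake_local *local, int ac)
{
	int ret = -EOPNOTSUPP;	/* default when the driver lacks the hook */

	if (!local->in_driver)
		return -EIO;

	printf("trace: conf_tx ac=%d\n", ac);	/* trace_drv_conf_tx() stand-in */
	if (local->ops->conf_tx)
		ret = local->ops->conf_tx(ac);
	printf("trace: return %d\n", ret);	/* trace_drv_return_int() stand-in */

	return ret;
}

static int driver_conf_tx(int ac) { (void)ac; return 0; }

int main(void)
{
	const struct fake_ops ops = { .conf_tx = driver_conf_tx };
	struct fake_local local = { .ops = &ops, .in_driver = 1 };

	return drv_conf_tx_sketch(&local, 2);
}
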
diff --git a/kernel/net/mac80211/driver-ops.h b/kernel/net/mac80211/driver-ops.h
index 26e1ca8a4..154ce4b13 100644
--- a/kernel/net/mac80211/driver-ops.h
+++ b/kernel/net/mac80211/driver-ops.h
@@ -66,36 +66,8 @@ static inline int drv_get_et_sset_count(struct ieee80211_sub_if_data *sdata,
return rv;
}
-static inline int drv_start(struct ieee80211_local *local)
-{
- int ret;
-
- might_sleep();
-
- trace_drv_start(local);
- local->started = true;
- smp_mb();
- ret = local->ops->start(&local->hw);
- trace_drv_return_int(local, ret);
- return ret;
-}
-
-static inline void drv_stop(struct ieee80211_local *local)
-{
- might_sleep();
-
- trace_drv_stop(local);
- local->ops->stop(&local->hw);
- trace_drv_return_void(local);
-
- /* sync away all work on the tasklet before clearing started */
- tasklet_disable(&local->tasklet);
- tasklet_enable(&local->tasklet);
-
- barrier();
-
- local->started = false;
-}
+int drv_start(struct ieee80211_local *local);
+void drv_stop(struct ieee80211_local *local);
#ifdef CONFIG_PM
static inline int drv_suspend(struct ieee80211_local *local,
@@ -137,59 +109,15 @@ static inline void drv_set_wakeup(struct ieee80211_local *local,
}
#endif
-static inline int drv_add_interface(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata)
-{
- int ret;
-
- might_sleep();
-
- if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
- !(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF) &&
- !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))))
- return -EINVAL;
-
- trace_drv_add_interface(local, sdata);
- ret = local->ops->add_interface(&local->hw, &sdata->vif);
- trace_drv_return_int(local, ret);
-
- if (ret == 0)
- sdata->flags |= IEEE80211_SDATA_IN_DRIVER;
-
- return ret;
-}
-
-static inline int drv_change_interface(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- enum nl80211_iftype type, bool p2p)
-{
- int ret;
-
- might_sleep();
-
- if (!check_sdata_in_driver(sdata))
- return -EIO;
-
- trace_drv_change_interface(local, sdata, type, p2p);
- ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p);
- trace_drv_return_int(local, ret);
- return ret;
-}
-
-static inline void drv_remove_interface(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata)
-{
- might_sleep();
+int drv_add_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata);
- if (!check_sdata_in_driver(sdata))
- return;
+int drv_change_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type, bool p2p);
- trace_drv_remove_interface(local, sdata);
- local->ops->remove_interface(&local->hw, &sdata->vif);
- sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER;
- trace_drv_return_void(local);
-}
+void drv_remove_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata);
static inline int drv_config(struct ieee80211_local *local, u32 changed)
{
@@ -260,6 +188,22 @@ static inline void drv_configure_filter(struct ieee80211_local *local,
trace_drv_return_void(local);
}
+static inline void drv_config_iface_filter(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ unsigned int filter_flags,
+ unsigned int changed_flags)
+{
+ might_sleep();
+
+ trace_drv_config_iface_filter(local, sdata, filter_flags,
+ changed_flags);
+ if (local->ops->config_iface_filter)
+ local->ops->config_iface_filter(&local->hw, &sdata->vif,
+ filter_flags,
+ changed_flags);
+ trace_drv_return_void(local);
+}
+
static inline int drv_set_tim(struct ieee80211_local *local,
struct ieee80211_sta *sta, bool set)
{
@@ -417,12 +361,13 @@ static inline int drv_get_stats(struct ieee80211_local *local,
return ret;
}
-static inline void drv_get_tkip_seq(struct ieee80211_local *local,
- u8 hw_key_idx, u32 *iv32, u16 *iv16)
+static inline void drv_get_key_seq(struct ieee80211_local *local,
+ struct ieee80211_key *key,
+ struct ieee80211_key_seq *seq)
{
- if (local->ops->get_tkip_seq)
- local->ops->get_tkip_seq(&local->hw, hw_key_idx, iv32, iv16);
- trace_drv_get_tkip_seq(local, hw_key_idx, iv32, iv16);
+ if (local->ops->get_key_seq)
+ local->ops->get_key_seq(&local->hw, &key->conf, seq);
+ trace_drv_get_key_seq(local, &key->conf);
}
static inline int drv_set_frag_threshold(struct ieee80211_local *local,
@@ -572,57 +517,16 @@ static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local,
trace_drv_return_void(local);
}
-static inline __must_check
+__must_check
int drv_sta_state(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
struct sta_info *sta,
enum ieee80211_sta_state old_state,
- enum ieee80211_sta_state new_state)
-{
- int ret = 0;
-
- might_sleep();
+ enum ieee80211_sta_state new_state);
- sdata = get_bss_sdata(sdata);
- if (!check_sdata_in_driver(sdata))
- return -EIO;
-
- trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state);
- if (local->ops->sta_state) {
- ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta,
- old_state, new_state);
- } else if (old_state == IEEE80211_STA_AUTH &&
- new_state == IEEE80211_STA_ASSOC) {
- ret = drv_sta_add(local, sdata, &sta->sta);
- if (ret == 0)
- sta->uploaded = true;
- } else if (old_state == IEEE80211_STA_ASSOC &&
- new_state == IEEE80211_STA_AUTH) {
- drv_sta_remove(local, sdata, &sta->sta);
- }
- trace_drv_return_int(local, ret);
- return ret;
-}
-
-static inline void drv_sta_rc_update(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- struct ieee80211_sta *sta, u32 changed)
-{
- sdata = get_bss_sdata(sdata);
- if (!check_sdata_in_driver(sdata))
- return;
-
- WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED &&
- (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
- sdata->vif.type != NL80211_IFTYPE_MESH_POINT));
-
- trace_drv_sta_rc_update(local, sdata, sta, changed);
- if (local->ops->sta_rc_update)
- local->ops->sta_rc_update(&local->hw, &sdata->vif,
- sta, changed);
-
- trace_drv_return_void(local);
-}
+void drv_sta_rc_update(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, u32 changed);
static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
@@ -654,76 +558,17 @@ static inline void drv_sta_statistics(struct ieee80211_local *local,
trace_drv_return_void(local);
}
-static inline int drv_conf_tx(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata, u16 ac,
- const struct ieee80211_tx_queue_params *params)
-{
- int ret = -EOPNOTSUPP;
-
- might_sleep();
-
- if (!check_sdata_in_driver(sdata))
- return -EIO;
-
- if (WARN_ONCE(params->cw_min == 0 ||
- params->cw_min > params->cw_max,
- "%s: invalid CW_min/CW_max: %d/%d\n",
- sdata->name, params->cw_min, params->cw_max))
- return -EINVAL;
-
- trace_drv_conf_tx(local, sdata, ac, params);
- if (local->ops->conf_tx)
- ret = local->ops->conf_tx(&local->hw, &sdata->vif,
- ac, params);
- trace_drv_return_int(local, ret);
- return ret;
-}
-
-static inline u64 drv_get_tsf(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata)
-{
- u64 ret = -1ULL;
-
- might_sleep();
-
- if (!check_sdata_in_driver(sdata))
- return ret;
-
- trace_drv_get_tsf(local, sdata);
- if (local->ops->get_tsf)
- ret = local->ops->get_tsf(&local->hw, &sdata->vif);
- trace_drv_return_u64(local, ret);
- return ret;
-}
-
-static inline void drv_set_tsf(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- u64 tsf)
-{
- might_sleep();
-
- if (!check_sdata_in_driver(sdata))
- return;
-
- trace_drv_set_tsf(local, sdata, tsf);
- if (local->ops->set_tsf)
- local->ops->set_tsf(&local->hw, &sdata->vif, tsf);
- trace_drv_return_void(local);
-}
-
-static inline void drv_reset_tsf(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata)
-{
- might_sleep();
-
- if (!check_sdata_in_driver(sdata))
- return;
+int drv_conf_tx(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata, u16 ac,
+ const struct ieee80211_tx_queue_params *params);
- trace_drv_reset_tsf(local, sdata);
- if (local->ops->reset_tsf)
- local->ops->reset_tsf(&local->hw, &sdata->vif);
- trace_drv_return_void(local);
-}
+u64 drv_get_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata);
+void drv_set_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ u64 tsf);
+void drv_reset_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata);
static inline int drv_tx_last_beacon(struct ieee80211_local *local)
{
@@ -738,30 +583,11 @@ static inline int drv_tx_last_beacon(struct ieee80211_local *local)
return ret;
}
-static inline int drv_ampdu_action(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- enum ieee80211_ampdu_mlme_action action,
- struct ieee80211_sta *sta, u16 tid,
- u16 *ssn, u8 buf_size)
-{
- int ret = -EOPNOTSUPP;
-
- might_sleep();
-
- sdata = get_bss_sdata(sdata);
- if (!check_sdata_in_driver(sdata))
- return -EIO;
-
- trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn, buf_size);
-
- if (local->ops->ampdu_action)
- ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action,
- sta, tid, ssn, buf_size);
-
- trace_drv_return_int(local, ret);
-
- return ret;
-}
+int drv_ampdu_action(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_ampdu_mlme_action action,
+ struct ieee80211_sta *sta, u16 tid,
+ u16 *ssn, u8 buf_size, bool amsdu);
static inline int drv_get_survey(struct ieee80211_local *local, int idx,
struct survey_info *survey)
@@ -1017,6 +843,8 @@ static inline int drv_add_chanctx(struct ieee80211_local *local,
{
int ret = -EOPNOTSUPP;
+ might_sleep();
+
trace_drv_add_chanctx(local, ctx);
if (local->ops->add_chanctx)
ret = local->ops->add_chanctx(&local->hw, &ctx->conf);
@@ -1030,6 +858,8 @@ static inline int drv_add_chanctx(struct ieee80211_local *local,
static inline void drv_remove_chanctx(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx)
{
+ might_sleep();
+
if (WARN_ON(!ctx->driver_present))
return;
@@ -1044,6 +874,8 @@ static inline void drv_change_chanctx(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx,
u32 changed)
{
+ might_sleep();
+
trace_drv_change_chanctx(local, ctx, changed);
if (local->ops->change_chanctx) {
WARN_ON_ONCE(!ctx->driver_present);
@@ -1077,6 +909,8 @@ static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
struct ieee80211_chanctx *ctx)
{
+ might_sleep();
+
if (!check_sdata_in_driver(sdata))
return;
@@ -1090,64 +924,17 @@ static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local,
trace_drv_return_void(local);
}
-static inline int
-drv_switch_vif_chanctx(struct ieee80211_local *local,
- struct ieee80211_vif_chanctx_switch *vifs,
- int n_vifs,
- enum ieee80211_chanctx_switch_mode mode)
-{
- int ret = 0;
- int i;
-
- if (!local->ops->switch_vif_chanctx)
- return -EOPNOTSUPP;
-
- for (i = 0; i < n_vifs; i++) {
- struct ieee80211_chanctx *new_ctx =
- container_of(vifs[i].new_ctx,
- struct ieee80211_chanctx,
- conf);
- struct ieee80211_chanctx *old_ctx =
- container_of(vifs[i].old_ctx,
- struct ieee80211_chanctx,
- conf);
-
- WARN_ON_ONCE(!old_ctx->driver_present);
- WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS &&
- new_ctx->driver_present) ||
- (mode == CHANCTX_SWMODE_REASSIGN_VIF &&
- !new_ctx->driver_present));
- }
-
- trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode);
- ret = local->ops->switch_vif_chanctx(&local->hw,
- vifs, n_vifs, mode);
- trace_drv_return_int(local, ret);
-
- if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) {
- for (i = 0; i < n_vifs; i++) {
- struct ieee80211_chanctx *new_ctx =
- container_of(vifs[i].new_ctx,
- struct ieee80211_chanctx,
- conf);
- struct ieee80211_chanctx *old_ctx =
- container_of(vifs[i].old_ctx,
- struct ieee80211_chanctx,
- conf);
-
- new_ctx->driver_present = true;
- old_ctx->driver_present = false;
- }
- }
-
- return ret;
-}
+int drv_switch_vif_chanctx(struct ieee80211_local *local,
+ struct ieee80211_vif_chanctx_switch *vifs,
+ int n_vifs, enum ieee80211_chanctx_switch_mode mode);
static inline int drv_start_ap(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata)
{
int ret = 0;
+ might_sleep();
+
if (!check_sdata_in_driver(sdata))
return -EIO;
diff --git a/kernel/net/mac80211/ethtool.c b/kernel/net/mac80211/ethtool.c
index 52bcea6ad..9cc986ded 100644
--- a/kernel/net/mac80211/ethtool.c
+++ b/kernel/net/mac80211/ethtool.c
@@ -38,9 +38,9 @@ static void ieee80211_get_ringparam(struct net_device *dev,
static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = {
"rx_packets", "rx_bytes",
"rx_duplicates", "rx_fragments", "rx_dropped",
- "tx_packets", "tx_bytes", "tx_fragments",
+ "tx_packets", "tx_bytes",
"tx_filtered", "tx_retry_failed", "tx_retries",
- "beacon_loss", "sta_state", "txrate", "rxrate", "signal",
+ "sta_state", "txrate", "rxrate", "signal",
"channel", "noise", "ch_time", "ch_time_busy",
"ch_time_ext_busy", "ch_time_rx", "ch_time_tx"
};
@@ -77,21 +77,19 @@ static void ieee80211_get_stats(struct net_device *dev,
memset(data, 0, sizeof(u64) * STA_STATS_LEN);
-#define ADD_STA_STATS(sta) \
- do { \
- data[i++] += sta->rx_packets; \
- data[i++] += sta->rx_bytes; \
- data[i++] += sta->num_duplicates; \
- data[i++] += sta->rx_fragments; \
- data[i++] += sta->rx_dropped; \
- \
- data[i++] += sinfo.tx_packets; \
- data[i++] += sinfo.tx_bytes; \
- data[i++] += sta->tx_fragments; \
- data[i++] += sta->tx_filtered_count; \
- data[i++] += sta->tx_retry_failed; \
- data[i++] += sta->tx_retry_count; \
- data[i++] += sta->beacon_loss_count; \
+#define ADD_STA_STATS(sta) \
+ do { \
+ data[i++] += sta->rx_stats.packets; \
+ data[i++] += sta->rx_stats.bytes; \
+ data[i++] += sta->rx_stats.num_duplicates; \
+ data[i++] += sta->rx_stats.fragments; \
+ data[i++] += sta->rx_stats.dropped; \
+ \
+ data[i++] += sinfo.tx_packets; \
+ data[i++] += sinfo.tx_bytes; \
+ data[i++] += sta->status_stats.filtered; \
+ data[i++] += sta->status_stats.retry_failed; \
+ data[i++] += sta->status_stats.retry_count; \
} while (0)
/* For Managed stations, find the single station based on BSSID
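
The ethtool change keeps ieee80211_gstrings_sta_stats[] and ADD_STA_STATS() in lockstep, dropping tx_fragments and beacon_loss from both so later entries do not shift. A reduced, invented illustration of that ordering contract:

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

static const char *stat_names[] = {
	"rx_packets", "rx_bytes", "tx_packets", "tx_bytes", "tx_retries",
};
#define N_STATS (sizeof(stat_names) / sizeof(stat_names[0]))

struct fake_sta {
	uint64_t rx_packets, rx_bytes, tx_packets, tx_bytes, tx_retries;
};

static void fill_stats(uint64_t *data, const struct fake_sta *sta)
{
	int i = 0;

	/* order must match stat_names[] exactly */
	data[i++] = sta->rx_packets;
	data[i++] = sta->rx_bytes;
	data[i++] = sta->tx_packets;
	data[i++] = sta->tx_bytes;
	data[i++] = sta->tx_retries;
	assert(i == (int)N_STATS);
}

int main(void)
{
	struct fake_sta sta = { 10, 2048, 8, 1024, 1 };
	uint64_t data[N_STATS];
	unsigned int i;

	fill_stats(data, &sta);
	for (i = 0; i < N_STATS; i++)
		printf("%-12s %" PRIu64 "\n", stat_names[i], data[i]);
	return 0;
}
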
diff --git a/kernel/net/mac80211/event.c b/kernel/net/mac80211/event.c
deleted file mode 100644
index 01ae75951..000000000
--- a/kernel/net/mac80211/event.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * mac80211 - events
- */
-#include <net/cfg80211.h>
-#include "ieee80211_i.h"
-
-/*
- * Indicate a failed Michael MIC to userspace. If the caller knows the TSC of
- * the frame that generated the MIC failure (i.e., if it was provided by the
- * driver or is still in the frame), it should provide that information.
- */
-void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
- struct ieee80211_hdr *hdr, const u8 *tsc,
- gfp_t gfp)
-{
- cfg80211_michael_mic_failure(sdata->dev, hdr->addr2,
- (hdr->addr1[0] & 0x01) ?
- NL80211_KEYTYPE_GROUP :
- NL80211_KEYTYPE_PAIRWISE,
- keyidx, tsc, gfp);
-}
diff --git a/kernel/net/mac80211/ibss.c b/kernel/net/mac80211/ibss.c
index a9c9d961f..6a12b0f5c 100644
--- a/kernel/net/mac80211/ibss.c
+++ b/kernel/net/mac80211/ibss.c
@@ -188,7 +188,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata,
* keep them at 0
*/
pos = ieee80211_ie_build_ht_oper(pos, &sband->ht_cap,
- chandef, 0);
+ chandef, 0, false);
/* add VHT capability and information IEs */
if (chandef->width != NL80211_CHAN_WIDTH_20 &&
@@ -229,7 +229,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
struct cfg80211_chan_def chandef;
struct ieee80211_channel *chan;
struct beacon_data *presp;
- enum nl80211_bss_scan_width scan_width;
+ struct cfg80211_inform_bss bss_meta = {};
bool have_higher_than_11mbit;
bool radar_required;
int err;
@@ -356,7 +356,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
else
sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
- ieee80211_set_wmm_default(sdata, true);
+ ieee80211_set_wmm_default(sdata, true, false);
sdata->vif.bss_conf.ibss_joined = true;
sdata->vif.bss_conf.ibss_creator = creator;
@@ -383,10 +383,11 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
mod_timer(&ifibss->timer,
round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
- scan_width = cfg80211_chandef_to_scan_width(&chandef);
- bss = cfg80211_inform_bss_width_frame(local->hw.wiphy, chan,
- scan_width, mgmt,
- presp->head_len, 0, GFP_KERNEL);
+ bss_meta.chan = chan;
+ bss_meta.scan_width = cfg80211_chandef_to_scan_width(&chandef);
+ bss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt,
+ presp->head_len, GFP_KERNEL);
+
cfg80211_put_bss(local->hw.wiphy, bss);
netif_carrier_on(sdata->dev);
cfg80211_ibss_joined(sdata->dev, ifibss->bssid, chan, GFP_KERNEL);
@@ -646,7 +647,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
return NULL;
}
- sta->last_rx = jiffies;
+ sta->rx_stats.last_rx = jiffies;
/* make sure mandatory rates are always added */
sband = local->hw.wiphy->bands[band];
@@ -668,7 +669,8 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
list_for_each_entry_rcu(sta, &local->sta_list, list) {
if (sta->sdata == sdata &&
- time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL,
+ time_after(sta->rx_stats.last_rx +
+ IEEE80211_IBSS_MERGE_INTERVAL,
jiffies)) {
active++;
break;
@@ -1032,8 +1034,11 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
}
}
- if (sta && elems->wmm_info && local->hw.queues >= IEEE80211_NUM_ACS)
+ if (sta && !sta->sta.wme &&
+ elems->wmm_info && local->hw.queues >= IEEE80211_NUM_ACS) {
sta->sta.wme = true;
+ ieee80211_check_fast_xmit(sta);
+ }
if (sta && elems->ht_operation && elems->ht_cap_elem &&
sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT &&
@@ -1231,7 +1236,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
if (!sta)
return;
- sta->last_rx = jiffies;
+ sta->rx_stats.last_rx = jiffies;
/* make sure mandatory rates are always added */
sband = local->hw.wiphy->bands[band];
@@ -1249,7 +1254,7 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
struct ieee80211_local *local = sdata->local;
struct sta_info *sta, *tmp;
unsigned long exp_time = IEEE80211_IBSS_INACTIVITY_LIMIT;
- unsigned long exp_rsn_time = IEEE80211_IBSS_RSN_INACTIVITY_LIMIT;
+ unsigned long exp_rsn = IEEE80211_IBSS_RSN_INACTIVITY_LIMIT;
mutex_lock(&local->sta_mtx);
@@ -1257,8 +1262,8 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
if (sdata != sta->sdata)
continue;
- if (time_after(jiffies, sta->last_rx + exp_time) ||
- (time_after(jiffies, sta->last_rx + exp_rsn_time) &&
+ if (time_after(jiffies, sta->rx_stats.last_rx + exp_time) ||
+ (time_after(jiffies, sta->rx_stats.last_rx + exp_rsn) &&
sta->sta_state != IEEE80211_STA_AUTHORIZED)) {
sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n",
sta->sta_state != IEEE80211_STA_AUTHORIZED ?
@@ -1727,7 +1732,6 @@ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
continue;
sdata->u.ibss.last_scan_completed = jiffies;
- ieee80211_queue_work(&local->hw, &sdata->work);
}
mutex_unlock(&local->iflist_mtx);
}
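
The IBSS expiry path above compares sta->rx_stats.last_rx against jiffies with time_after(), which stays correct across counter wraparound. A standalone sketch of that comparison (fake_jiffies_t and the sample values are invented):

#include <stdio.h>

typedef unsigned long fake_jiffies_t;

/* true if a is after b, even across an unsigned wraparound */
static int time_after_sketch(fake_jiffies_t a, fake_jiffies_t b)
{
	return (long)(b - a) < 0;
}

int main(void)
{
	fake_jiffies_t last_rx = (fake_jiffies_t)-10;	/* just before wrap */
	fake_jiffies_t now = 5;				/* just after wrap */
	fake_jiffies_t limit = 10;			/* inactivity limit */

	/* 15 ticks have elapsed, limit is 10, so this prints "expired: 1" */
	printf("expired: %d\n", time_after_sketch(now, last_rx + limit));
	return 0;
}
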
diff --git a/kernel/net/mac80211/ieee80211_i.h b/kernel/net/mac80211/ieee80211_i.h
index c0a9187bc..6837a46ca 100644
--- a/kernel/net/mac80211/ieee80211_i.h
+++ b/kernel/net/mac80211/ieee80211_i.h
@@ -34,6 +34,8 @@
#include "sta_info.h"
#include "debug.h"
+extern const struct cfg80211_ops mac80211_config_ops;
+
struct ieee80211_local;
/* Maximum number of broadcast/multicast frames to buffer when some of the
@@ -84,13 +86,13 @@ struct ieee80211_local;
#define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */)
struct ieee80211_fragment_entry {
- unsigned long first_frag_time;
- unsigned int seq;
- unsigned int rx_queue;
- unsigned int last_frag;
- unsigned int extra_len;
struct sk_buff_head skb_list;
- int ccmp; /* Whether fragments were encrypted with CCMP */
+ unsigned long first_frag_time;
+ u16 seq;
+ u16 extra_len;
+ u16 last_frag;
+ u8 rx_queue;
+ bool check_sequential_pn; /* needed for CCMP/GCMP */
u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
};
@@ -181,9 +183,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result;
/**
* enum ieee80211_packet_rx_flags - packet RX flags
- * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed
- * (incl. multicast frames)
- * @IEEE80211_RX_FRAGMENTED: fragmented frame
* @IEEE80211_RX_AMSDU: a-MSDU packet
* @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed
* @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering
@@ -192,8 +191,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result;
* @rx_flags field of &struct ieee80211_rx_status.
*/
enum ieee80211_packet_rx_flags {
- IEEE80211_RX_RA_MATCH = BIT(1),
- IEEE80211_RX_FRAGMENTED = BIT(2),
IEEE80211_RX_AMSDU = BIT(3),
IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4),
IEEE80211_RX_DEFERRED_RELEASE = BIT(5),
@@ -205,8 +202,6 @@ enum ieee80211_packet_rx_flags {
* @IEEE80211_RX_CMNTR: received on cooked monitor already
* @IEEE80211_RX_BEACON_REPORTED: This frame was already reported
* to cfg80211_report_obss_beacon().
- * @IEEE80211_RX_REORDER_TIMER: this frame is released by the
- * reorder buffer timeout timer, not the normal RX path
*
* These flags are used across handling multiple interfaces
* for a single frame.
@@ -214,10 +209,10 @@ enum ieee80211_packet_rx_flags {
enum ieee80211_rx_flags {
IEEE80211_RX_CMNTR = BIT(0),
IEEE80211_RX_BEACON_REPORTED = BIT(1),
- IEEE80211_RX_REORDER_TIMER = BIT(2),
};
struct ieee80211_rx_data {
+ struct napi_struct *napi;
struct sk_buff *skb;
struct ieee80211_local *local;
struct ieee80211_sub_if_data *sdata;
@@ -426,6 +421,8 @@ struct ieee80211_sta_tx_tspec {
bool downgraded;
};
+DECLARE_EWMA(beacon_signal, 16, 4)
+
struct ieee80211_if_managed {
struct timer_list timer;
struct timer_list conn_mon_timer;
@@ -497,16 +494,7 @@ struct ieee80211_if_managed {
s16 p2p_noa_index;
- /* Signal strength from the last Beacon frame in the current BSS. */
- int last_beacon_signal;
-
- /*
- * Weighted average of the signal strength from Beacon frames in the
- * current BSS. This is in units of 1/16 of the signal unit to maintain
- * accuracy and to speed up calculations, i.e., the value need to be
- * divided by 16 to get the actual value.
- */
- int ave_beacon_signal;
+ struct ewma_beacon_signal ave_beacon_signal;
/*
* Number of Beacon frames used in ave_beacon_signal. This can be used
@@ -515,6 +503,9 @@ struct ieee80211_if_managed {
*/
unsigned int count_beacon_signal;
+ /* Number of times beacon loss was invoked. */
+ unsigned int beacon_loss_count;
+
/*
* Last Beacon frame signal strength average (ave_beacon_signal / 16)
* that triggered a cqm event. 0 indicates that no event has been
@@ -542,6 +533,7 @@ struct ieee80211_if_managed {
struct sk_buff *teardown_skb; /* A copy to send through the AP */
spinlock_t teardown_lock; /* To lock changing teardown_skb */
bool tdls_chan_switch_prohibited;
+ bool tdls_wider_bw_prohibited;
/* WMM-AC TSPEC support */
struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS];
@@ -722,21 +714,21 @@ struct ieee80211_if_mesh {
* enum ieee80211_sub_if_data_flags - virtual interface flags
*
* @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets
- * @IEEE80211_SDATA_PROMISC: interface is promisc
* @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode
* @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between
* associated stations and deliver multicast frames both
* back to wireless media and to the local net stack.
* @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume.
* @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver
+ * @IEEE80211_SDATA_MU_MIMO_OWNER: indicates interface owns MU-MIMO capability
*/
enum ieee80211_sub_if_data_flags {
IEEE80211_SDATA_ALLMULTI = BIT(0),
- IEEE80211_SDATA_PROMISC = BIT(1),
IEEE80211_SDATA_OPERATING_GMODE = BIT(2),
IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3),
IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4),
IEEE80211_SDATA_IN_DRIVER = BIT(5),
+ IEEE80211_SDATA_MU_MIMO_OWNER = BIT(6),
};
/**
@@ -908,6 +900,9 @@ struct ieee80211_sub_if_data {
bool rc_has_mcs_mask[IEEE80211_NUM_BANDS];
u8 rc_rateidx_mcs_mask[IEEE80211_NUM_BANDS][IEEE80211_HT_MCS_MASK_LEN];
+ bool rc_has_vht_mcs_mask[IEEE80211_NUM_BANDS];
+ u16 rc_rateidx_vht_mcs_mask[IEEE80211_NUM_BANDS][NL80211_VHT_NSS_MAX];
+
union {
struct ieee80211_if_ap ap;
struct ieee80211_if_wds wds;
@@ -1015,7 +1010,6 @@ enum sdata_queue_type {
IEEE80211_SDATA_QUEUE_AGG_STOP = 2,
IEEE80211_SDATA_QUEUE_RX_AGG_START = 3,
IEEE80211_SDATA_QUEUE_RX_AGG_STOP = 4,
- IEEE80211_SDATA_QUEUE_TDLS_CHSW = 5,
};
enum {
@@ -1040,7 +1034,6 @@ enum queue_stop_reason {
#ifdef CONFIG_MAC80211_LEDS
struct tpt_led_trigger {
- struct led_trigger trig;
char name[32];
const struct ieee80211_tpt_blink *blink_table;
unsigned int blink_table_len;
@@ -1208,8 +1201,8 @@ struct ieee80211_local {
atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES];
- /* number of interfaces with corresponding IFF_ flags */
- atomic_t iff_allmultis, iff_promiscs;
+ /* number of interfaces with allmulti RX */
+ atomic_t iff_allmultis;
struct rate_control_ref *rate_ctrl;
@@ -1261,6 +1254,15 @@ struct ieee80211_local {
struct list_head chanctx_list;
struct mutex chanctx_mtx;
+#ifdef CONFIG_MAC80211_LEDS
+ struct led_trigger tx_led, rx_led, assoc_led, radio_led;
+ struct led_trigger tpt_led;
+ atomic_t tx_led_active, rx_led_active, assoc_led_active;
+ atomic_t radio_led_active, tpt_led_active;
+ struct tpt_led_trigger *tpt_led_trigger;
+#endif
+
+#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
/* SNMP counters */
/* dot11CountersTable */
u32 dot11TransmittedFragmentCount;
@@ -1273,18 +1275,9 @@ struct ieee80211_local {
u32 dot11MulticastReceivedFrameCount;
u32 dot11TransmittedFrameCount;
-#ifdef CONFIG_MAC80211_LEDS
- struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led;
- struct tpt_led_trigger *tpt_led_trigger;
- char tx_led_name[32], rx_led_name[32],
- assoc_led_name[32], radio_led_name[32];
-#endif
-
-#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
/* TX/RX handler statistics */
unsigned int tx_handlers_drop;
unsigned int tx_handlers_queued;
- unsigned int tx_handlers_drop_fragment;
unsigned int tx_handlers_drop_wep;
unsigned int tx_handlers_drop_not_assoc;
unsigned int tx_handlers_drop_unauth_port;
@@ -1292,11 +1285,9 @@ struct ieee80211_local {
unsigned int rx_handlers_queued;
unsigned int rx_handlers_drop_nullfunc;
unsigned int rx_handlers_drop_defrag;
- unsigned int rx_handlers_drop_short;
unsigned int tx_expand_skb_head;
unsigned int tx_expand_skb_head_cloned;
- unsigned int rx_expand_skb_head;
- unsigned int rx_expand_skb_head2;
+ unsigned int rx_expand_skb_head_defrag;
unsigned int rx_handlers_fragments;
unsigned int tx_status_drop;
#define I802_DEBUG_INC(c) (c)++
@@ -1319,7 +1310,6 @@ struct ieee80211_local {
struct work_struct dynamic_ps_enable_work;
struct work_struct dynamic_ps_disable_work;
struct timer_list dynamic_ps_timer;
- struct notifier_block network_latency_notifier;
struct notifier_block ifa_notifier;
struct notifier_block ifa6_notifier;
@@ -1355,14 +1345,16 @@ struct ieee80211_local {
struct ieee80211_sub_if_data __rcu *p2p_sdata;
- struct napi_struct *napi;
-
/* virtual monitor interface */
struct ieee80211_sub_if_data __rcu *monitor_sdata;
struct cfg80211_chan_def monitor_chandef;
/* extended capabilities provided by mac80211 */
u8 ext_capa[8];
+
+ /* TDLS channel switch */
+ struct work_struct tdls_chsw_work;
+ struct sk_buff_head skb_queue_tdls_chsw;
};
static inline struct ieee80211_sub_if_data *
@@ -1503,10 +1495,8 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
struct cfg80211_disassoc_request *req);
void ieee80211_send_pspoll(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata);
-void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency);
+void ieee80211_recalc_ps(struct ieee80211_local *local);
void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata);
-int ieee80211_max_network_latency(struct notifier_block *nb,
- unsigned long data, void *dummy);
int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
@@ -1583,7 +1573,7 @@ __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
struct cfg80211_sched_scan_request *req);
int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
struct cfg80211_sched_scan_request *req);
-int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata);
+int ieee80211_request_sched_scan_stop(struct ieee80211_local *local);
void ieee80211_sched_scan_end(struct ieee80211_local *local);
void ieee80211_sched_scan_stopped_work(struct work_struct *work);
@@ -1647,6 +1637,14 @@ void ieee80211_purge_tx_queue(struct ieee80211_hw *hw,
struct sk_buff *
ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, u32 info_flags);
+void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
+ struct ieee80211_supported_band *sband,
+ int retry_count, int shift, bool send_to_cooked);
+
+void ieee80211_check_fast_xmit(struct sta_info *sta);
+void ieee80211_check_fast_xmit_all(struct ieee80211_local *local);
+void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata);
+void ieee80211_clear_fast_xmit(struct sta_info *sta);
/* HT */
void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
@@ -1711,12 +1709,14 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta);
void ieee80211_sta_set_rx_nss(struct sta_info *sta);
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only);
+ enum ieee80211_band band);
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only);
+ enum ieee80211_band band);
void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
struct ieee80211_sta_vht_cap *vht_cap);
+void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
+ u16 vht_mask[NL80211_VHT_NSS_MAX]);
/* Spectrum management */
void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
@@ -1765,16 +1765,11 @@ static inline int __ieee80211_resume(struct ieee80211_hw *hw)
/* utility functions/constants */
extern const void *const mac80211_wiphy_privid; /* for wiphy privid */
-u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
- enum nl80211_iftype type);
int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
int rate, int erp, int short_preamble,
int shift);
-void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
- struct ieee80211_hdr *hdr, const u8 *tsc,
- gfp_t gfp);
void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
- bool bss_notify);
+ bool bss_notify, bool enable_qos);
void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, struct sk_buff *skb);
@@ -1854,7 +1849,7 @@ void ieee80211_dynamic_ps_disable_work(struct work_struct *work);
void ieee80211_dynamic_ps_timer(unsigned long data);
void ieee80211_send_nullfunc(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
- int powersave);
+ bool powersave);
void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
struct ieee80211_hdr *hdr);
void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
@@ -1967,7 +1962,7 @@ u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
u16 cap);
u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
const struct cfg80211_chan_def *chandef,
- u16 prot_mode);
+ u16 prot_mode, bool rifs_mode);
u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
u32 cap);
u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
@@ -2044,6 +2039,9 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
enum ieee80211_chanctx_mode chanmode,
u8 radar_detect);
int ieee80211_max_num_channels(struct ieee80211_local *local);
+enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta);
+void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx);
/* TDLS */
int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev,
@@ -2060,8 +2058,8 @@ int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev,
void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy,
struct net_device *dev,
const u8 *addr);
-void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata,
- struct sk_buff *skb);
+void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata);
+void ieee80211_tdls_chsw_work(struct work_struct *wk);
extern const struct ethtool_ops ieee80211_ethtool_ops;
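
The manual 1/16-scaled ave_beacon_signal is replaced above by a DECLARE_EWMA()-generated helper. A generic fixed-point moving-average sketch in the same spirit, with the internal value scaled by 16 and new samples weighted 1/4; this only approximates, and does not reproduce, what the kernel macro generates:

#include <stdio.h>

struct ewma_sketch {
	long internal;	/* average scaled by 16 to keep fractional bits */
};

static void ewma_add(struct ewma_sketch *e, long val)
{
	long scaled = val * 16;

	if (!e->internal)
		e->internal = scaled;	/* seed with the first sample */
	else
		e->internal = (3 * e->internal + scaled) / 4;
}

static long ewma_read(const struct ewma_sketch *e)
{
	return e->internal / 16;	/* what the old code exposed as ave/16 */
}

int main(void)
{
	struct ewma_sketch ave = { 0 };
	const long samples[] = { -55, -60, -58, -70, -54 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		ewma_add(&ave, samples[i]);
		printf("sample %ld -> average %ld\n", samples[i], ewma_read(&ave));
	}
	return 0;
}
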
diff --git a/kernel/net/mac80211/iface.c b/kernel/net/mac80211/iface.c
index 84cef600c..c9e325d2e 100644
--- a/kernel/net/mac80211/iface.c
+++ b/kernel/net/mac80211/iface.c
@@ -76,7 +76,8 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
bool update_bss)
{
- if (__ieee80211_recalc_txpower(sdata) || update_bss)
+ if (__ieee80211_recalc_txpower(sdata) ||
+ (update_bss && ieee80211_sdata_running(sdata)))
ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_TXPOWER);
}
@@ -338,7 +339,7 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata,
if ((iftype != NL80211_IFTYPE_AP &&
iftype != NL80211_IFTYPE_P2P_GO &&
iftype != NL80211_IFTYPE_MESH_POINT) ||
- !(sdata->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) {
+ !ieee80211_hw_check(&sdata->local->hw, QUEUE_CONTROL)) {
sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
return 0;
}
@@ -378,7 +379,7 @@ static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata)
int i;
for (i = 0; i < IEEE80211_NUM_ACS; i++) {
- if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
+ if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE;
else if (local->hw.queues >= IEEE80211_NUM_ACS)
sdata->vif.hw_queue[i] = i;
@@ -393,7 +394,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
struct ieee80211_sub_if_data *sdata;
int ret;
- if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF))
+ if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
return 0;
ASSERT_RTNL();
@@ -454,7 +455,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata;
- if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF))
+ if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
return;
ASSERT_RTNL();
@@ -661,11 +662,13 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
}
/*
- * set default queue parameters so drivers don't
+ * Set default queue parameters so drivers don't
* need to initialise the hardware if the hardware
- * doesn't start up with sane defaults
+ * doesn't start up with sane defaults.
+ * Enable QoS for anything but station interfaces.
*/
- ieee80211_set_wmm_default(sdata, true);
+ ieee80211_set_wmm_default(sdata, true,
+ sdata->vif.type != NL80211_IFTYPE_STATION);
}
set_bit(SDATA_STATE_RUNNING, &sdata->state);
@@ -703,16 +706,13 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
atomic_inc(&local->iff_allmultis);
- if (sdata->flags & IEEE80211_SDATA_PROMISC)
- atomic_inc(&local->iff_promiscs);
-
if (coming_up)
local->open_count++;
if (hw_reconf_flags)
ieee80211_hw_config(local, hw_reconf_flags);
- ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps(local);
if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
@@ -835,13 +835,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) ||
(sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)));
- /* don't count this interface for promisc/allmulti while it is down */
+ /* don't count this interface for allmulti while it is down */
if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
atomic_dec(&local->iff_allmultis);
- if (sdata->flags & IEEE80211_SDATA_PROMISC)
- atomic_dec(&local->iff_promiscs);
-
if (sdata->vif.type == NL80211_IFTYPE_AP) {
local->fif_pspoll--;
local->fif_probe_req--;
@@ -1022,7 +1019,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
drv_remove_interface(local, sdata);
}
- ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps(local);
if (cancel_scan)
flush_delayed_work(&local->scan_work);
@@ -1055,12 +1052,10 @@ static void ieee80211_set_multicast_list(struct net_device *dev)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_local *local = sdata->local;
- int allmulti, promisc, sdata_allmulti, sdata_promisc;
+ int allmulti, sdata_allmulti;
allmulti = !!(dev->flags & IFF_ALLMULTI);
- promisc = !!(dev->flags & IFF_PROMISC);
sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI);
- sdata_promisc = !!(sdata->flags & IEEE80211_SDATA_PROMISC);
if (allmulti != sdata_allmulti) {
if (dev->flags & IFF_ALLMULTI)
@@ -1070,13 +1065,6 @@ static void ieee80211_set_multicast_list(struct net_device *dev)
sdata->flags ^= IEEE80211_SDATA_ALLMULTI;
}
- if (promisc != sdata_promisc) {
- if (dev->flags & IFF_PROMISC)
- atomic_inc(&local->iff_promiscs);
- else
- atomic_dec(&local->iff_promiscs);
- sdata->flags ^= IEEE80211_SDATA_PROMISC;
- }
spin_lock_bh(&local->filter_lock);
__hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len);
spin_unlock_bh(&local->filter_lock);
@@ -1117,6 +1105,35 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev,
return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
}
+static struct rtnl_link_stats64 *
+ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_sw_netstats *tstats;
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ tstats = per_cpu_ptr(dev->tstats, i);
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
+
+ stats->rx_packets += rx_packets;
+ stats->tx_packets += tx_packets;
+ stats->rx_bytes += rx_bytes;
+ stats->tx_bytes += tx_bytes;
+ }
+
+ return stats;
+}
+
static const struct net_device_ops ieee80211_dataif_ops = {
.ndo_open = ieee80211_open,
.ndo_stop = ieee80211_stop,
@@ -1126,6 +1143,7 @@ static const struct net_device_ops ieee80211_dataif_ops = {
.ndo_change_mtu = ieee80211_change_mtu,
.ndo_set_mac_address = ieee80211_change_mac,
.ndo_select_queue = ieee80211_netdev_select_queue,
+ .ndo_get_stats64 = ieee80211_get_stats64,
};
static u16 ieee80211_monitor_select_queue(struct net_device *dev,
@@ -1159,14 +1177,21 @@ static const struct net_device_ops ieee80211_monitorif_ops = {
.ndo_change_mtu = ieee80211_change_mtu,
.ndo_set_mac_address = ieee80211_change_mac,
.ndo_select_queue = ieee80211_monitor_select_queue,
+ .ndo_get_stats64 = ieee80211_get_stats64,
};
+static void ieee80211_if_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
static void ieee80211_if_setup(struct net_device *dev)
{
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->netdev_ops = &ieee80211_dataif_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ieee80211_if_free;
}
static void ieee80211_iface_work(struct work_struct *work)
@@ -1182,7 +1207,7 @@ static void ieee80211_iface_work(struct work_struct *work)
if (!ieee80211_sdata_running(sdata))
return;
- if (local->scanning)
+ if (test_bit(SCAN_SW_SCANNING, &local->scanning))
return;
if (!ieee80211_can_run_worker(local))
@@ -1220,8 +1245,6 @@ static void ieee80211_iface_work(struct work_struct *work)
WLAN_BACK_RECIPIENT, 0,
false);
mutex_unlock(&local->sta_mtx);
- } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_TDLS_CHSW) {
- ieee80211_process_tdls_channel_switch(sdata, skb);
} else if (ieee80211_is_action(mgmt->frame_control) &&
mgmt->u.action.category == WLAN_CATEGORY_BACK) {
int len = skb->len;
@@ -1564,7 +1587,7 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local,
break;
case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_P2P_GO:
- if (local->hw.flags & IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF) {
+ if (ieee80211_hw_check(&local->hw, P2P_DEV_ADDR_FOR_INTF)) {
list_for_each_entry(sdata, &local->interfaces, list) {
if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE)
continue;
@@ -1707,6 +1730,12 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
return -ENOMEM;
dev_net_set(ndev, wiphy_net(local->hw.wiphy));
+ ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!ndev->tstats) {
+ free_netdev(ndev);
+ return -ENOMEM;
+ }
+
ndev->needed_headroom = local->tx_headroom +
4*6 /* four MAC addresses */
+ 2 + 2 + 2 + 2 /* ctl, dur, seq, qos */
@@ -1762,13 +1791,23 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
sband = local->hw.wiphy->bands[i];
sdata->rc_rateidx_mask[i] =
sband ? (1 << sband->n_bitrates) - 1 : 0;
- if (sband)
+ if (sband) {
+ __le16 cap;
+ u16 *vht_rate_mask;
+
memcpy(sdata->rc_rateidx_mcs_mask[i],
sband->ht_cap.mcs.rx_mask,
sizeof(sdata->rc_rateidx_mcs_mask[i]));
- else
+
+ cap = sband->vht_cap.vht_mcs.rx_mcs_map;
+ vht_rate_mask = sdata->rc_rateidx_vht_mcs_mask[i];
+ ieee80211_get_vht_mask_from_cap(cap, vht_rate_mask);
+ } else {
memset(sdata->rc_rateidx_mcs_mask[i], 0,
sizeof(sdata->rc_rateidx_mcs_mask[i]));
+ memset(sdata->rc_rateidx_vht_mcs_mask[i], 0,
+ sizeof(sdata->rc_rateidx_vht_mcs_mask[i]));
+ }
}
ieee80211_set_default_queues(sdata);
@@ -1823,6 +1862,7 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata)
unregister_netdevice(sdata->dev);
} else {
cfg80211_unregister_wdev(&sdata->wdev);
+ ieee80211_teardown_sdata(sdata);
kfree(sdata);
}
}
@@ -1832,13 +1872,8 @@ void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata)
if (WARN_ON_ONCE(!test_bit(SDATA_STATE_RUNNING, &sdata->state)))
return;
ieee80211_do_stop(sdata, true);
- ieee80211_teardown_sdata(sdata);
}
-/*
- * Remove all interfaces, may only be called at hardware unregistration
- * time because it doesn't do RCU-safe list removals.
- */
void ieee80211_remove_interfaces(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata, *tmp;
@@ -1847,14 +1882,21 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local)
ASSERT_RTNL();
- /*
- * Close all AP_VLAN interfaces first, as otherwise they
- * might be closed while the AP interface they belong to
- * is closed, causing unregister_netdevice_many() to crash.
+ /* Before destroying the interfaces, make sure they're all stopped so
+ * that the hardware is stopped. Otherwise, the driver might still be
+ * iterating the interfaces during the shutdown, e.g. from a worker
+ * or from RX processing or similar, and if it does so (using atomic
+ * iteration) while we're manipulating the list, the iteration will
+ * crash.
+ *
+ * After this, the hardware should be stopped and the driver should
+ * have stopped all of its activities, so that we can do RCU-unaware
+ * manipulations of the interface list below.
*/
- list_for_each_entry(sdata, &local->interfaces, list)
- if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
- dev_close(sdata->dev);
+ cfg80211_shutdown_all_interfaces(local->hw.wiphy);
+
+ WARN(local->open_count, "%s: open count remains %d\n",
+ wiphy_name(local->hw.wiphy), local->open_count);
mutex_lock(&local->iflist_mtx);
list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) {
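For context on the ieee80211_get_stats64() hunk added above: it sums per-CPU counters using the u64_stats seqcount retry pattern, where a reader re-reads the snapshot until it sees an unchanged, even sequence value. Below is a minimal userspace C analogue of that pattern — illustrative only, not part of this patch; the pcpu_stats type, NR_CPUS_SIM and the function names are invented for the example.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS_SIM 4

struct pcpu_stats {
	atomic_uint seq;        /* even = stable, odd = write in progress */
	uint64_t rx_packets;
	uint64_t rx_bytes;
};

static struct pcpu_stats stats[NR_CPUS_SIM];

static void writer_update(struct pcpu_stats *s, uint64_t pkts, uint64_t bytes)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* odd */
	s->rx_packets += pkts;
	s->rx_bytes += bytes;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even again */
}

static void reader_sum(uint64_t *pkts, uint64_t *bytes)
{
	*pkts = *bytes = 0;
	for (int i = 0; i < NR_CPUS_SIM; i++) {
		unsigned int start;
		uint64_t p, b;

		/* retry until a consistent (even, unchanged) snapshot is seen */
		do {
			start = atomic_load_explicit(&stats[i].seq,
						     memory_order_acquire);
			p = stats[i].rx_packets;
			b = stats[i].rx_bytes;
		} while ((start & 1) ||
			 start != atomic_load_explicit(&stats[i].seq,
						       memory_order_acquire));
		*pkts += p;
		*bytes += b;
	}
}

int main(void)
{
	uint64_t pkts, bytes;

	writer_update(&stats[0], 3, 1500);
	writer_update(&stats[2], 1, 60);
	reader_sum(&pkts, &bytes);
	printf("rx_packets=%llu rx_bytes=%llu\n",
	       (unsigned long long)pkts, (unsigned long long)bytes);
	return 0;
}

In the kernel hunk the same roles are played by u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq() and per_cpu_ptr(dev->tstats, i); the sketch only shows the shape of the retry loop.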
diff --git a/kernel/net/mac80211/key.c b/kernel/net/mac80211/key.c
index 81e9785f3..44388d6a1 100644
--- a/kernel/net/mac80211/key.c
+++ b/kernel/net/mac80211/key.c
@@ -154,7 +154,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
* is supported; if not, return.
*/
if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) &&
- !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK))
+ !ieee80211_hw_check(&key->local->hw, SUPPORTS_PER_STA_GTK))
goto out_unsupported;
if (sta && !sta->uploaded)
@@ -208,7 +208,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
/* all of these we can do in software - if driver can */
if (ret == 1)
return 0;
- if (key->local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL)
+ if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL))
return -EINVAL;
return 0;
default:
@@ -263,6 +263,7 @@ static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
if (uni) {
rcu_assign_pointer(sdata->default_unicast_key, key);
+ ieee80211_check_fast_xmit_iface(sdata);
drv_set_default_unicast_key(sdata->local, sdata, idx);
}
@@ -332,9 +333,9 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
if (pairwise) {
rcu_assign_pointer(sta->ptk[idx], new);
sta->ptk_idx = idx;
+ ieee80211_check_fast_xmit(sta);
} else {
rcu_assign_pointer(sta->gtk[idx], new);
- sta->gtk_idx = idx;
}
} else {
defunikey = old &&
@@ -517,15 +518,17 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
break;
default:
if (cs) {
- size_t len = (seq_len > MAX_PN_LEN) ?
- MAX_PN_LEN : seq_len;
+ if (seq_len && seq_len != cs->pn_len) {
+ kfree(key);
+ return ERR_PTR(-EINVAL);
+ }
key->conf.iv_len = cs->hdr_len;
key->conf.icv_len = cs->mic_len;
for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++)
- for (j = 0; j < len; j++)
+ for (j = 0; j < seq_len; j++)
key->u.gen.rx_pn[i][j] =
- seq[len - j - 1];
+ seq[seq_len - j - 1];
key->flags |= KEY_FLAG_CIPHER_SCHEME;
}
}
@@ -899,27 +902,19 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
break;
case WLAN_CIPHER_SUITE_CCMP:
case WLAN_CIPHER_SUITE_CCMP_256:
- pn64 = atomic64_read(&key->u.ccmp.tx_pn);
- seq->ccmp.pn[5] = pn64;
- seq->ccmp.pn[4] = pn64 >> 8;
- seq->ccmp.pn[3] = pn64 >> 16;
- seq->ccmp.pn[2] = pn64 >> 24;
- seq->ccmp.pn[1] = pn64 >> 32;
- seq->ccmp.pn[0] = pn64 >> 40;
- break;
case WLAN_CIPHER_SUITE_AES_CMAC:
case WLAN_CIPHER_SUITE_BIP_CMAC_256:
- pn64 = atomic64_read(&key->u.aes_cmac.tx_pn);
- seq->ccmp.pn[5] = pn64;
- seq->ccmp.pn[4] = pn64 >> 8;
- seq->ccmp.pn[3] = pn64 >> 16;
- seq->ccmp.pn[2] = pn64 >> 24;
- seq->ccmp.pn[1] = pn64 >> 32;
- seq->ccmp.pn[0] = pn64 >> 40;
- break;
+ BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
+ offsetof(typeof(*seq), aes_cmac));
case WLAN_CIPHER_SUITE_BIP_GMAC_128:
case WLAN_CIPHER_SUITE_BIP_GMAC_256:
- pn64 = atomic64_read(&key->u.aes_gmac.tx_pn);
+ BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
+ offsetof(typeof(*seq), aes_gmac));
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
+ offsetof(typeof(*seq), gcmp));
+ pn64 = atomic64_read(&key->conf.tx_pn);
seq->ccmp.pn[5] = pn64;
seq->ccmp.pn[4] = pn64 >> 8;
seq->ccmp.pn[3] = pn64 >> 16;
@@ -927,16 +922,6 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
seq->ccmp.pn[1] = pn64 >> 32;
seq->ccmp.pn[0] = pn64 >> 40;
break;
- case WLAN_CIPHER_SUITE_GCMP:
- case WLAN_CIPHER_SUITE_GCMP_256:
- pn64 = atomic64_read(&key->u.gcmp.tx_pn);
- seq->gcmp.pn[5] = pn64;
- seq->gcmp.pn[4] = pn64 >> 8;
- seq->gcmp.pn[3] = pn64 >> 16;
- seq->gcmp.pn[2] = pn64 >> 24;
- seq->gcmp.pn[1] = pn64 >> 32;
- seq->gcmp.pn[0] = pn64 >> 40;
- break;
default:
WARN_ON(1);
}
@@ -1011,43 +996,25 @@ void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
break;
case WLAN_CIPHER_SUITE_CCMP:
case WLAN_CIPHER_SUITE_CCMP_256:
- pn64 = (u64)seq->ccmp.pn[5] |
- ((u64)seq->ccmp.pn[4] << 8) |
- ((u64)seq->ccmp.pn[3] << 16) |
- ((u64)seq->ccmp.pn[2] << 24) |
- ((u64)seq->ccmp.pn[1] << 32) |
- ((u64)seq->ccmp.pn[0] << 40);
- atomic64_set(&key->u.ccmp.tx_pn, pn64);
- break;
case WLAN_CIPHER_SUITE_AES_CMAC:
case WLAN_CIPHER_SUITE_BIP_CMAC_256:
- pn64 = (u64)seq->aes_cmac.pn[5] |
- ((u64)seq->aes_cmac.pn[4] << 8) |
- ((u64)seq->aes_cmac.pn[3] << 16) |
- ((u64)seq->aes_cmac.pn[2] << 24) |
- ((u64)seq->aes_cmac.pn[1] << 32) |
- ((u64)seq->aes_cmac.pn[0] << 40);
- atomic64_set(&key->u.aes_cmac.tx_pn, pn64);
- break;
+ BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
+ offsetof(typeof(*seq), aes_cmac));
case WLAN_CIPHER_SUITE_BIP_GMAC_128:
case WLAN_CIPHER_SUITE_BIP_GMAC_256:
- pn64 = (u64)seq->aes_gmac.pn[5] |
- ((u64)seq->aes_gmac.pn[4] << 8) |
- ((u64)seq->aes_gmac.pn[3] << 16) |
- ((u64)seq->aes_gmac.pn[2] << 24) |
- ((u64)seq->aes_gmac.pn[1] << 32) |
- ((u64)seq->aes_gmac.pn[0] << 40);
- atomic64_set(&key->u.aes_gmac.tx_pn, pn64);
- break;
+ BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
+ offsetof(typeof(*seq), aes_gmac));
case WLAN_CIPHER_SUITE_GCMP:
case WLAN_CIPHER_SUITE_GCMP_256:
- pn64 = (u64)seq->gcmp.pn[5] |
- ((u64)seq->gcmp.pn[4] << 8) |
- ((u64)seq->gcmp.pn[3] << 16) |
- ((u64)seq->gcmp.pn[2] << 24) |
- ((u64)seq->gcmp.pn[1] << 32) |
- ((u64)seq->gcmp.pn[0] << 40);
- atomic64_set(&key->u.gcmp.tx_pn, pn64);
+ BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
+ offsetof(typeof(*seq), gcmp));
+ pn64 = (u64)seq->ccmp.pn[5] |
+ ((u64)seq->ccmp.pn[4] << 8) |
+ ((u64)seq->ccmp.pn[3] << 16) |
+ ((u64)seq->ccmp.pn[2] << 24) |
+ ((u64)seq->ccmp.pn[1] << 32) |
+ ((u64)seq->ccmp.pn[0] << 40);
+ atomic64_set(&key->conf.tx_pn, pn64);
break;
default:
WARN_ON(1);
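The key.c hunks above collapse the per-cipher tx_pn counters into a single 64-bit field in key->conf and convert it to and from the 6-byte PN layout stored in seq->ccmp.pn[]. A standalone plain-C version of exactly that packing (illustrative only, not part of this patch):

#include <stdint.h>
#include <stdio.h>

/* split a 48-bit packet number into the pn[0..5] byte order used above */
static void pn64_to_bytes(uint64_t pn64, uint8_t pn[6])
{
	pn[5] = pn64;
	pn[4] = pn64 >> 8;
	pn[3] = pn64 >> 16;
	pn[2] = pn64 >> 24;
	pn[1] = pn64 >> 32;
	pn[0] = pn64 >> 40;
}

/* rebuild the 64-bit counter from the byte layout */
static uint64_t bytes_to_pn64(const uint8_t pn[6])
{
	return (uint64_t)pn[5] |
	       ((uint64_t)pn[4] << 8) |
	       ((uint64_t)pn[3] << 16) |
	       ((uint64_t)pn[2] << 24) |
	       ((uint64_t)pn[1] << 32) |
	       ((uint64_t)pn[0] << 40);
}

int main(void)
{
	uint8_t pn[6];
	uint64_t val = 0x010203040506ULL;	/* any 48-bit value */

	pn64_to_bytes(val, pn);
	printf("round-trip ok: %d\n", bytes_to_pn64(pn) == val);
	return 0;
}

The BUILD_BUG_ON(offsetof(...)) checks in the hunk are what make it safe to share one copy of this conversion across the CCMP, CMAC, GMAC and GCMP cases: they assert that all the pn[] arrays live at the same offset in the seq union.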
diff --git a/kernel/net/mac80211/key.h b/kernel/net/mac80211/key.h
index 96557dd1e..9951ef063 100644
--- a/kernel/net/mac80211/key.h
+++ b/kernel/net/mac80211/key.h
@@ -18,7 +18,6 @@
#define NUM_DEFAULT_KEYS 4
#define NUM_DEFAULT_MGMT_KEYS 2
-#define MAX_PN_LEN 16
struct ieee80211_local;
struct ieee80211_sub_if_data;
@@ -78,7 +77,6 @@ struct ieee80211_key {
u32 mic_failures;
} tkip;
struct {
- atomic64_t tx_pn;
/*
* Last received packet number. The first
* IEEE80211_NUM_TIDS counters are used with Data
@@ -90,21 +88,18 @@ struct ieee80211_key {
u32 replays; /* dot11RSNAStatsCCMPReplays */
} ccmp;
struct {
- atomic64_t tx_pn;
u8 rx_pn[IEEE80211_CMAC_PN_LEN];
struct crypto_cipher *tfm;
u32 replays; /* dot11RSNAStatsCMACReplays */
u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
} aes_cmac;
struct {
- atomic64_t tx_pn;
u8 rx_pn[IEEE80211_GMAC_PN_LEN];
struct crypto_aead *tfm;
u32 replays; /* dot11RSNAStatsCMACReplays */
u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
} aes_gmac;
struct {
- atomic64_t tx_pn;
/* Last received packet number. The first
* IEEE80211_NUM_TIDS counters are used with Data
* frames and the last counter is used with Robust
@@ -116,13 +111,10 @@ struct ieee80211_key {
} gcmp;
struct {
/* generic cipher scheme */
- u8 rx_pn[IEEE80211_NUM_TIDS + 1][MAX_PN_LEN];
+ u8 rx_pn[IEEE80211_NUM_TIDS + 1][IEEE80211_MAX_PN_LEN];
} gen;
} u;
- /* number of times this key has been used */
- int tx_rx_count;
-
#ifdef CONFIG_MAC80211_DEBUGFS
struct {
struct dentry *stalink;
diff --git a/kernel/net/mac80211/led.c b/kernel/net/mac80211/led.c
index e2b836446..0505845b7 100644
--- a/kernel/net/mac80211/led.c
+++ b/kernel/net/mac80211/led.c
@@ -12,96 +12,175 @@
#include <linux/export.h>
#include "led.h"
-#define MAC80211_BLINK_DELAY 50 /* ms */
-
-void ieee80211_led_rx(struct ieee80211_local *local)
-{
- unsigned long led_delay = MAC80211_BLINK_DELAY;
- if (unlikely(!local->rx_led))
- return;
- led_trigger_blink_oneshot(local->rx_led, &led_delay, &led_delay, 0);
-}
-
-void ieee80211_led_tx(struct ieee80211_local *local)
-{
- unsigned long led_delay = MAC80211_BLINK_DELAY;
- if (unlikely(!local->tx_led))
- return;
- led_trigger_blink_oneshot(local->tx_led, &led_delay, &led_delay, 0);
-}
-
void ieee80211_led_assoc(struct ieee80211_local *local, bool associated)
{
- if (unlikely(!local->assoc_led))
+ if (!atomic_read(&local->assoc_led_active))
return;
if (associated)
- led_trigger_event(local->assoc_led, LED_FULL);
+ led_trigger_event(&local->assoc_led, LED_FULL);
else
- led_trigger_event(local->assoc_led, LED_OFF);
+ led_trigger_event(&local->assoc_led, LED_OFF);
}
void ieee80211_led_radio(struct ieee80211_local *local, bool enabled)
{
- if (unlikely(!local->radio_led))
+ if (!atomic_read(&local->radio_led_active))
return;
if (enabled)
- led_trigger_event(local->radio_led, LED_FULL);
+ led_trigger_event(&local->radio_led, LED_FULL);
else
- led_trigger_event(local->radio_led, LED_OFF);
+ led_trigger_event(&local->radio_led, LED_OFF);
+}
+
+void ieee80211_alloc_led_names(struct ieee80211_local *local)
+{
+ local->rx_led.name = kasprintf(GFP_KERNEL, "%srx",
+ wiphy_name(local->hw.wiphy));
+ local->tx_led.name = kasprintf(GFP_KERNEL, "%stx",
+ wiphy_name(local->hw.wiphy));
+ local->assoc_led.name = kasprintf(GFP_KERNEL, "%sassoc",
+ wiphy_name(local->hw.wiphy));
+ local->radio_led.name = kasprintf(GFP_KERNEL, "%sradio",
+ wiphy_name(local->hw.wiphy));
+}
+
+void ieee80211_free_led_names(struct ieee80211_local *local)
+{
+ kfree(local->rx_led.name);
+ kfree(local->tx_led.name);
+ kfree(local->assoc_led.name);
+ kfree(local->radio_led.name);
+}
+
+static void ieee80211_tx_led_activate(struct led_classdev *led_cdev)
+{
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ tx_led);
+
+ atomic_inc(&local->tx_led_active);
+}
+
+static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev)
+{
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ tx_led);
+
+ atomic_dec(&local->tx_led_active);
+}
+
+static void ieee80211_rx_led_activate(struct led_classdev *led_cdev)
+{
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ rx_led);
+
+ atomic_inc(&local->rx_led_active);
+}
+
+static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev)
+{
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ rx_led);
+
+ atomic_dec(&local->rx_led_active);
+}
+
+static void ieee80211_assoc_led_activate(struct led_classdev *led_cdev)
+{
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ assoc_led);
+
+ atomic_inc(&local->assoc_led_active);
+}
+
+static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev)
+{
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ assoc_led);
+
+ atomic_dec(&local->assoc_led_active);
+}
+
+static void ieee80211_radio_led_activate(struct led_classdev *led_cdev)
+{
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ radio_led);
+
+ atomic_inc(&local->radio_led_active);
+}
+
+static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev)
+{
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ radio_led);
+
+ atomic_dec(&local->radio_led_active);
+}
+
+static void ieee80211_tpt_led_activate(struct led_classdev *led_cdev)
+{
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ tpt_led);
+
+ atomic_inc(&local->tpt_led_active);
}
-void ieee80211_led_names(struct ieee80211_local *local)
+static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev)
{
- snprintf(local->rx_led_name, sizeof(local->rx_led_name),
- "%srx", wiphy_name(local->hw.wiphy));
- snprintf(local->tx_led_name, sizeof(local->tx_led_name),
- "%stx", wiphy_name(local->hw.wiphy));
- snprintf(local->assoc_led_name, sizeof(local->assoc_led_name),
- "%sassoc", wiphy_name(local->hw.wiphy));
- snprintf(local->radio_led_name, sizeof(local->radio_led_name),
- "%sradio", wiphy_name(local->hw.wiphy));
+ struct ieee80211_local *local = container_of(led_cdev->trigger,
+ struct ieee80211_local,
+ tpt_led);
+
+ atomic_dec(&local->tpt_led_active);
}
void ieee80211_led_init(struct ieee80211_local *local)
{
- local->rx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
- if (local->rx_led) {
- local->rx_led->name = local->rx_led_name;
- if (led_trigger_register(local->rx_led)) {
- kfree(local->rx_led);
- local->rx_led = NULL;
- }
+ atomic_set(&local->rx_led_active, 0);
+ local->rx_led.activate = ieee80211_rx_led_activate;
+ local->rx_led.deactivate = ieee80211_rx_led_deactivate;
+ if (local->rx_led.name && led_trigger_register(&local->rx_led)) {
+ kfree(local->rx_led.name);
+ local->rx_led.name = NULL;
}
- local->tx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
- if (local->tx_led) {
- local->tx_led->name = local->tx_led_name;
- if (led_trigger_register(local->tx_led)) {
- kfree(local->tx_led);
- local->tx_led = NULL;
- }
+ atomic_set(&local->tx_led_active, 0);
+ local->tx_led.activate = ieee80211_tx_led_activate;
+ local->tx_led.deactivate = ieee80211_tx_led_deactivate;
+ if (local->tx_led.name && led_trigger_register(&local->tx_led)) {
+ kfree(local->tx_led.name);
+ local->tx_led.name = NULL;
}
- local->assoc_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
- if (local->assoc_led) {
- local->assoc_led->name = local->assoc_led_name;
- if (led_trigger_register(local->assoc_led)) {
- kfree(local->assoc_led);
- local->assoc_led = NULL;
- }
+ atomic_set(&local->assoc_led_active, 0);
+ local->assoc_led.activate = ieee80211_assoc_led_activate;
+ local->assoc_led.deactivate = ieee80211_assoc_led_deactivate;
+ if (local->assoc_led.name && led_trigger_register(&local->assoc_led)) {
+ kfree(local->assoc_led.name);
+ local->assoc_led.name = NULL;
}
- local->radio_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
- if (local->radio_led) {
- local->radio_led->name = local->radio_led_name;
- if (led_trigger_register(local->radio_led)) {
- kfree(local->radio_led);
- local->radio_led = NULL;
- }
+ atomic_set(&local->radio_led_active, 0);
+ local->radio_led.activate = ieee80211_radio_led_activate;
+ local->radio_led.deactivate = ieee80211_radio_led_deactivate;
+ if (local->radio_led.name && led_trigger_register(&local->radio_led)) {
+ kfree(local->radio_led.name);
+ local->radio_led.name = NULL;
}
+ atomic_set(&local->tpt_led_active, 0);
if (local->tpt_led_trigger) {
- if (led_trigger_register(&local->tpt_led_trigger->trig)) {
+ local->tpt_led.activate = ieee80211_tpt_led_activate;
+ local->tpt_led.deactivate = ieee80211_tpt_led_deactivate;
+ if (led_trigger_register(&local->tpt_led)) {
kfree(local->tpt_led_trigger);
local->tpt_led_trigger = NULL;
}
@@ -110,58 +189,50 @@ void ieee80211_led_init(struct ieee80211_local *local)
void ieee80211_led_exit(struct ieee80211_local *local)
{
- if (local->radio_led) {
- led_trigger_unregister(local->radio_led);
- kfree(local->radio_led);
- }
- if (local->assoc_led) {
- led_trigger_unregister(local->assoc_led);
- kfree(local->assoc_led);
- }
- if (local->tx_led) {
- led_trigger_unregister(local->tx_led);
- kfree(local->tx_led);
- }
- if (local->rx_led) {
- led_trigger_unregister(local->rx_led);
- kfree(local->rx_led);
- }
+ if (local->radio_led.name)
+ led_trigger_unregister(&local->radio_led);
+ if (local->assoc_led.name)
+ led_trigger_unregister(&local->assoc_led);
+ if (local->tx_led.name)
+ led_trigger_unregister(&local->tx_led);
+ if (local->rx_led.name)
+ led_trigger_unregister(&local->rx_led);
if (local->tpt_led_trigger) {
- led_trigger_unregister(&local->tpt_led_trigger->trig);
+ led_trigger_unregister(&local->tpt_led);
kfree(local->tpt_led_trigger);
}
}
-char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw)
+const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw)
{
struct ieee80211_local *local = hw_to_local(hw);
- return local->radio_led_name;
+ return local->radio_led.name;
}
EXPORT_SYMBOL(__ieee80211_get_radio_led_name);
-char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw)
+const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw)
{
struct ieee80211_local *local = hw_to_local(hw);
- return local->assoc_led_name;
+ return local->assoc_led.name;
}
EXPORT_SYMBOL(__ieee80211_get_assoc_led_name);
-char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw)
+const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw)
{
struct ieee80211_local *local = hw_to_local(hw);
- return local->tx_led_name;
+ return local->tx_led.name;
}
EXPORT_SYMBOL(__ieee80211_get_tx_led_name);
-char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw)
+const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw)
{
struct ieee80211_local *local = hw_to_local(hw);
- return local->rx_led_name;
+ return local->rx_led.name;
}
EXPORT_SYMBOL(__ieee80211_get_rx_led_name);
@@ -205,16 +276,17 @@ static void tpt_trig_timer(unsigned long data)
}
}
- read_lock(&tpt_trig->trig.leddev_list_lock);
- list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list)
+ read_lock(&local->tpt_led.leddev_list_lock);
+ list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list)
led_blink_set(led_cdev, &on, &off);
- read_unlock(&tpt_trig->trig.leddev_list_lock);
+ read_unlock(&local->tpt_led.leddev_list_lock);
}
-char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw,
- unsigned int flags,
- const struct ieee80211_tpt_blink *blink_table,
- unsigned int blink_table_len)
+const char *
+__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw,
+ unsigned int flags,
+ const struct ieee80211_tpt_blink *blink_table,
+ unsigned int blink_table_len)
{
struct ieee80211_local *local = hw_to_local(hw);
struct tpt_led_trigger *tpt_trig;
@@ -229,7 +301,7 @@ char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw,
snprintf(tpt_trig->name, sizeof(tpt_trig->name),
"%stpt", wiphy_name(local->hw.wiphy));
- tpt_trig->trig.name = tpt_trig->name;
+ local->tpt_led.name = tpt_trig->name;
tpt_trig->blink_table = blink_table;
tpt_trig->blink_table_len = blink_table_len;
@@ -269,10 +341,10 @@ static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local)
tpt_trig->running = false;
del_timer_sync(&tpt_trig->timer);
- read_lock(&tpt_trig->trig.leddev_list_lock);
- list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list)
+ read_lock(&local->tpt_led.leddev_list_lock);
+ list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list)
led_set_brightness(led_cdev, LED_OFF);
- read_unlock(&tpt_trig->trig.leddev_list_lock);
+ read_unlock(&local->tpt_led.leddev_list_lock);
}
void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
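The led.c rework above embeds the triggers in ieee80211_local and keeps an atomic "active" count per trigger, incremented in the activate() callback when an LED attaches and decremented in deactivate(), so the per-packet blink helpers can bail out after a single atomic read when nothing is listening. A minimal userspace analogue of that idea — illustrative only, not part of this patch; the trigger type and names are invented:

#include <stdatomic.h>
#include <stdio.h>

struct trigger {
	const char *name;
	atomic_int active;	/* number of LEDs currently attached */
};

static void led_attach(struct trigger *t)
{
	atomic_fetch_add(&t->active, 1);
}

static void led_detach(struct trigger *t)
{
	atomic_fetch_sub(&t->active, 1);
}

/* hot path: do nothing unless someone is actually listening */
static void blink_if_active(struct trigger *t)
{
	if (!atomic_load(&t->active))
		return;
	printf("blink %s\n", t->name);
}

int main(void)
{
	struct trigger rx = { .name = "phy0rx" };

	blink_if_active(&rx);	/* no LED attached: nothing happens */
	led_attach(&rx);
	blink_if_active(&rx);	/* prints "blink phy0rx" */
	led_detach(&rx);
	return 0;
}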
diff --git a/kernel/net/mac80211/led.h b/kernel/net/mac80211/led.h
index 89f4344f1..a7893a1ac 100644
--- a/kernel/net/mac80211/led.h
+++ b/kernel/net/mac80211/led.h
@@ -11,25 +11,42 @@
#include <linux/leds.h>
#include "ieee80211_i.h"
+#define MAC80211_BLINK_DELAY 50 /* ms */
+
+static inline void ieee80211_led_rx(struct ieee80211_local *local)
+{
+#ifdef CONFIG_MAC80211_LEDS
+ unsigned long led_delay = MAC80211_BLINK_DELAY;
+
+ if (!atomic_read(&local->rx_led_active))
+ return;
+ led_trigger_blink_oneshot(&local->rx_led, &led_delay, &led_delay, 0);
+#endif
+}
+
+static inline void ieee80211_led_tx(struct ieee80211_local *local)
+{
+#ifdef CONFIG_MAC80211_LEDS
+ unsigned long led_delay = MAC80211_BLINK_DELAY;
+
+ if (!atomic_read(&local->tx_led_active))
+ return;
+ led_trigger_blink_oneshot(&local->tx_led, &led_delay, &led_delay, 0);
+#endif
+}
+
#ifdef CONFIG_MAC80211_LEDS
-void ieee80211_led_rx(struct ieee80211_local *local);
-void ieee80211_led_tx(struct ieee80211_local *local);
void ieee80211_led_assoc(struct ieee80211_local *local,
bool associated);
void ieee80211_led_radio(struct ieee80211_local *local,
bool enabled);
-void ieee80211_led_names(struct ieee80211_local *local);
+void ieee80211_alloc_led_names(struct ieee80211_local *local);
+void ieee80211_free_led_names(struct ieee80211_local *local);
void ieee80211_led_init(struct ieee80211_local *local);
void ieee80211_led_exit(struct ieee80211_local *local);
void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
unsigned int types_on, unsigned int types_off);
#else
-static inline void ieee80211_led_rx(struct ieee80211_local *local)
-{
-}
-static inline void ieee80211_led_tx(struct ieee80211_local *local)
-{
-}
static inline void ieee80211_led_assoc(struct ieee80211_local *local,
bool associated)
{
@@ -38,7 +55,10 @@ static inline void ieee80211_led_radio(struct ieee80211_local *local,
bool enabled)
{
}
-static inline void ieee80211_led_names(struct ieee80211_local *local)
+static inline void ieee80211_alloc_led_names(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_free_led_names(struct ieee80211_local *local)
{
}
static inline void ieee80211_led_init(struct ieee80211_local *local)
@@ -58,7 +78,7 @@ static inline void
ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes)
{
#ifdef CONFIG_MAC80211_LEDS
- if (local->tpt_led_trigger && ieee80211_is_data(fc))
+ if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active))
local->tpt_led_trigger->tx_bytes += bytes;
#endif
}
@@ -67,7 +87,7 @@ static inline void
ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes)
{
#ifdef CONFIG_MAC80211_LEDS
- if (local->tpt_led_trigger && ieee80211_is_data(fc))
+ if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active))
local->tpt_led_trigger->rx_bytes += bytes;
#endif
}
diff --git a/kernel/net/mac80211/main.c b/kernel/net/mac80211/main.c
index e86daed83..175ffcf7f 100644
--- a/kernel/net/mac80211/main.c
+++ b/kernel/net/mac80211/main.c
@@ -20,7 +20,6 @@
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/bitmap.h>
-#include <linux/pm_qos.h>
#include <linux/inetdevice.h>
#include <net/net_namespace.h>
#include <net/cfg80211.h>
@@ -32,7 +31,6 @@
#include "mesh.h"
#include "wep.h"
#include "led.h"
-#include "cfg.h"
#include "debugfs.h"
void ieee80211_configure_filter(struct ieee80211_local *local)
@@ -41,9 +39,6 @@ void ieee80211_configure_filter(struct ieee80211_local *local)
unsigned int changed_flags;
unsigned int new_flags = 0;
- if (atomic_read(&local->iff_promiscs))
- new_flags |= FIF_PROMISC_IN_BSS;
-
if (atomic_read(&local->iff_allmultis))
new_flags |= FIF_ALLMULTI;
@@ -286,7 +281,7 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw)
local->in_reconfig = true;
barrier();
- schedule_work(&local->restart_work);
+ queue_work(system_freezable_wq, &local->restart_work);
}
EXPORT_SYMBOL(ieee80211_restart_hw);
@@ -632,6 +627,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
INIT_WORK(&local->sched_scan_stopped_work,
ieee80211_sched_scan_stopped_work);
+ INIT_WORK(&local->tdls_chsw_work, ieee80211_tdls_chsw_work);
+
spin_lock_init(&local->ack_status_lock);
idr_init(&local->ack_status_frames);
@@ -648,8 +645,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
skb_queue_head_init(&local->skb_queue);
skb_queue_head_init(&local->skb_queue_unreliable);
+ skb_queue_head_init(&local->skb_queue_tdls_chsw);
- ieee80211_led_names(local);
+ ieee80211_alloc_led_names(local);
ieee80211_roc_setup(local);
@@ -664,7 +662,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local)
{
bool have_wep = !(IS_ERR(local->wep_tx_tfm) ||
IS_ERR(local->wep_rx_tfm));
- bool have_mfp = local->hw.flags & IEEE80211_HW_MFP_CAPABLE;
+ bool have_mfp = ieee80211_hw_check(&local->hw, MFP_CAPABLE);
int n_suites = 0, r = 0, w = 0;
u32 *suites;
static const u32 cipher_suites[] = {
@@ -684,7 +682,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local)
WLAN_CIPHER_SUITE_BIP_GMAC_256,
};
- if (local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL ||
+ if (ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL) ||
local->hw.wiphy->cipher_suites) {
/* If the driver advertises, or doesn't support SW crypto,
* we only need to remove WEP if necessary.
@@ -774,8 +772,13 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local)
suites[w++] = WLAN_CIPHER_SUITE_BIP_GMAC_256;
}
- for (r = 0; r < local->hw.n_cipher_schemes; r++)
+ for (r = 0; r < local->hw.n_cipher_schemes; r++) {
suites[w++] = cs[r].cipher;
+ if (WARN_ON(cs[r].pn_len > IEEE80211_MAX_PN_LEN)) {
+ kfree(suites);
+ return -EINVAL;
+ }
+ }
}
local->hw.wiphy->cipher_suites = suites;
@@ -795,7 +798,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
netdev_features_t feature_whitelist;
struct cfg80211_chan_def dflt_chandef = {};
- if (hw->flags & IEEE80211_HW_QUEUE_CONTROL &&
+ if (ieee80211_hw_check(hw, QUEUE_CONTROL) &&
(local->hw.offchannel_tx_hw_queue == IEEE80211_INVAL_HW_QUEUE ||
local->hw.offchannel_tx_hw_queue >= local->hw.queues))
return -EINVAL;
@@ -843,7 +846,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
/* Only HW csum features are currently compatible with mac80211 */
feature_whitelist = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
- NETIF_F_HW_CSUM;
+ NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA |
+ NETIF_F_GSO_SOFTWARE;
if (WARN_ON(hw->netdev_features & ~feature_whitelist))
return -EINVAL;
@@ -942,9 +946,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
/* mac80211 supports control port protocol changing */
local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL;
- if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
+ if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) {
local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
- } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) {
+ } else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) {
local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC;
if (hw->max_signal <= 0) {
result = -EINVAL;
@@ -998,7 +1002,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
local->hw.wiphy->flags |= WIPHY_FLAG_TDLS_EXTERNAL_SETUP;
/* mac80211 supports eCSA, if the driver supports STA CSA at all */
- if (local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA)
+ if (ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA))
local->ext_capa[0] |= WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING;
local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM;
@@ -1066,7 +1070,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
/* add one default STA interface if supported */
if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) &&
- !(hw->flags & IEEE80211_HW_NO_AUTO_VIF)) {
+ !ieee80211_hw_check(hw, NO_AUTO_VIF)) {
result = ieee80211_if_add(local, "wlan%d", NET_NAME_ENUM, NULL,
NL80211_IFTYPE_STATION, NULL);
if (result)
@@ -1076,13 +1080,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
rtnl_unlock();
- local->network_latency_notifier.notifier_call =
- ieee80211_max_network_latency;
- result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY,
- &local->network_latency_notifier);
- if (result)
- goto fail_pm_qos;
-
#ifdef CONFIG_INET
local->ifa_notifier.notifier_call = ieee80211_ifa_changed;
result = register_inetaddr_notifier(&local->ifa_notifier);
@@ -1107,10 +1104,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
#endif
#if defined(CONFIG_INET) || defined(CONFIG_IPV6)
fail_ifa:
- pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY,
- &local->network_latency_notifier);
#endif
- fail_pm_qos:
rtnl_lock();
rate_control_deinitialize(local);
ieee80211_remove_interfaces(local);
@@ -1129,18 +1123,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
}
EXPORT_SYMBOL(ieee80211_register_hw);
-void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi,
- struct net_device *napi_dev,
- int (*poll)(struct napi_struct *, int),
- int weight)
-{
- struct ieee80211_local *local = hw_to_local(hw);
-
- netif_napi_add(napi_dev, napi, poll, weight);
- local->napi = napi;
-}
-EXPORT_SYMBOL_GPL(ieee80211_napi_add);
-
void ieee80211_unregister_hw(struct ieee80211_hw *hw)
{
struct ieee80211_local *local = hw_to_local(hw);
@@ -1148,8 +1130,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
tasklet_kill(&local->tx_pending_tasklet);
tasklet_kill(&local->tasklet);
- pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY,
- &local->network_latency_notifier);
#ifdef CONFIG_INET
unregister_inetaddr_notifier(&local->ifa_notifier);
#endif
@@ -1170,6 +1150,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
cancel_work_sync(&local->restart_work);
cancel_work_sync(&local->reconfig_filter);
+ cancel_work_sync(&local->tdls_chsw_work);
flush_work(&local->sched_scan_stopped_work);
ieee80211_clear_tx_pending(local);
@@ -1180,6 +1161,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
wiphy_warn(local->hw.wiphy, "skb_queue not empty\n");
skb_queue_purge(&local->skb_queue);
skb_queue_purge(&local->skb_queue_unreliable);
+ skb_queue_purge(&local->skb_queue_tdls_chsw);
destroy_workqueue(local->workqueue);
wiphy_unregister(local->hw.wiphy);
@@ -1212,6 +1194,8 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
sta_info_stop(local);
+ ieee80211_free_led_names(local);
+
wiphy_free(local->hw.wiphy);
}
EXPORT_SYMBOL(ieee80211_free_hw);
diff --git a/kernel/net/mac80211/mesh.c b/kernel/net/mac80211/mesh.c
index 817098add..6f85b6ab8 100644
--- a/kernel/net/mac80211/mesh.c
+++ b/kernel/net/mac80211/mesh.c
@@ -94,6 +94,9 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan,
ie->ht_operation, &sta_chan_def);
+ ieee80211_vht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan,
+ ie->vht_operation, &sta_chan_def);
+
if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef,
&sta_chan_def))
return false;
@@ -158,7 +161,7 @@ void mesh_sta_cleanup(struct sta_info *sta)
changed = mesh_accept_plinks_update(sdata);
if (!sdata->u.mesh.user_mpm) {
changed |= mesh_plink_deactivate(sta);
- del_timer_sync(&sta->plink_timer);
+ del_timer_sync(&sta->mesh->plink_timer);
}
if (changed)
@@ -436,8 +439,6 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
struct ieee80211_chanctx_conf *chanctx_conf;
struct ieee80211_channel *channel;
- enum nl80211_channel_type channel_type =
- cfg80211_get_chandef_type(&sdata->vif.bss_conf.chandef);
struct ieee80211_supported_band *sband;
struct ieee80211_sta_ht_cap *ht_cap;
u8 *pos;
@@ -454,7 +455,10 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
sband = local->hw.wiphy->bands[channel->band];
ht_cap = &sband->ht_cap;
- if (!ht_cap->ht_supported || channel_type == NL80211_CHAN_NO_HT)
+ if (!ht_cap->ht_supported ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
return 0;
if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_operation))
@@ -462,7 +466,70 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation));
ieee80211_ie_build_ht_oper(pos, ht_cap, &sdata->vif.bss_conf.chandef,
- sdata->vif.bss_conf.ht_operation_mode);
+ sdata->vif.bss_conf.ht_operation_mode,
+ false);
+
+ return 0;
+}
+
+int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ struct ieee80211_supported_band *sband;
+ u8 *pos;
+
+ sband = local->hw.wiphy->bands[band];
+ if (!sband->vht_cap.vht_supported ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+ return 0;
+
+ if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap))
+ return -ENOMEM;
+
+ pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_cap));
+ ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap);
+
+ return 0;
+}
+
+int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_channel *channel;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_sta_vht_cap *vht_cap;
+ u8 *pos;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ channel = chanctx_conf->def.chan;
+ rcu_read_unlock();
+
+ sband = local->hw.wiphy->bands[channel->band];
+ vht_cap = &sband->vht_cap;
+
+ if (!vht_cap->vht_supported ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+ return 0;
+
+ if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_operation))
+ return -ENOMEM;
+
+ pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation));
+ ieee80211_ie_build_vht_oper(pos, vht_cap,
+ &sdata->vif.bss_conf.chandef);
return 0;
}
@@ -540,9 +607,9 @@ int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc,
*
* Return the header length.
*/
-int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata,
- struct ieee80211s_hdr *meshhdr,
- const char *addr4or5, const char *addr6)
+unsigned int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211s_hdr *meshhdr,
+ const char *addr4or5, const char *addr6)
{
if (WARN_ON(!addr4or5 && addr6))
return 0;
@@ -637,6 +704,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
2 + ifmsh->mesh_id_len +
2 + sizeof(struct ieee80211_meshconf_ie) +
2 + sizeof(__le16) + /* awake window */
+ 2 + sizeof(struct ieee80211_vht_cap) +
+ 2 + sizeof(struct ieee80211_vht_operation) +
ifmsh->ie_len;
bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL);
@@ -718,6 +787,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
mesh_add_meshid_ie(sdata, skb) ||
mesh_add_meshconf_ie(sdata, skb) ||
mesh_add_awake_window_ie(sdata, skb) ||
+ mesh_add_vht_cap_ie(sdata, skb) ||
+ mesh_add_vht_oper_ie(sdata, skb) ||
mesh_add_vendor_ies(sdata, skb))
goto out_free;
@@ -1299,17 +1370,6 @@ out:
sdata_unlock(sdata);
}
-void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local)
-{
- struct ieee80211_sub_if_data *sdata;
-
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list)
- if (ieee80211_vif_is_mesh(&sdata->vif) &&
- ieee80211_sdata_running(sdata))
- ieee80211_queue_work(&local->hw, &sdata->work);
- rcu_read_unlock();
-}
void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
{
diff --git a/kernel/net/mac80211/mesh.h b/kernel/net/mac80211/mesh.h
index 50c8473cf..4a8019f79 100644
--- a/kernel/net/mac80211/mesh.h
+++ b/kernel/net/mac80211/mesh.h
@@ -207,9 +207,9 @@ struct mesh_rmc {
/* Various */
int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc,
const u8 *da, const u8 *sa);
-int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata,
- struct ieee80211s_hdr *meshhdr,
- const char *addr4or5, const char *addr6);
+unsigned int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211s_hdr *meshhdr,
+ const char *addr4or5, const char *addr6);
int mesh_rmc_check(struct ieee80211_sub_if_data *sdata,
const u8 *addr, struct ieee80211s_hdr *mesh_hdr);
bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
@@ -227,6 +227,10 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb);
int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb);
+int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
void mesh_rmc_free(struct ieee80211_sub_if_data *sdata);
int mesh_rmc_init(struct ieee80211_sub_if_data *sdata);
void ieee80211s_init(void);
@@ -358,14 +362,10 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
return sdata->u.mesh.mesh_pp_id == IEEE80211_PATH_PROTOCOL_HWMP;
}
-void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local);
-
void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata);
void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata);
void ieee80211s_stop(void);
#else
-static inline void
-ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {}
static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
{ return false; }
static inline void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
diff --git a/kernel/net/mac80211/mesh_hwmp.c b/kernel/net/mac80211/mesh_hwmp.c
index 214e63b84..c6be0b4f4 100644
--- a/kernel/net/mac80211/mesh_hwmp.c
+++ b/kernel/net/mac80211/mesh_hwmp.c
@@ -19,15 +19,6 @@
#define MAX_PREQ_QUEUE_LEN 64
-/* Destination only */
-#define MP_F_DO 0x1
-/* Reply and forward */
-#define MP_F_RF 0x2
-/* Unknown Sequence Number */
-#define MP_F_USN 0x01
-/* Reason code Present */
-#define MP_F_RCODE 0x02
-
static void mesh_queue_preq(struct mesh_path *, u8);
static inline u32 u32_field_get(const u8 *preq_elem, int offset, bool ae)
@@ -79,6 +70,12 @@ static inline u16 u16_field_get(const u8 *preq_elem, int offset, bool ae)
#define MSEC_TO_TU(x) (x*1000/1024)
#define SN_GT(x, y) ((s32)(y - x) < 0)
#define SN_LT(x, y) ((s32)(x - y) < 0)
+#define MAX_SANE_SN_DELTA 32
+
+static inline u32 SN_DELTA(u32 x, u32 y)
+{
+ return x >= y ? x - y : y - x;
+}
#define net_traversal_jiffies(s) \
msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime)
@@ -279,15 +276,10 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata,
*pos++ = ttl;
/* number of destinations */
*pos++ = 1;
- /*
- * flags bit, bit 1 is unset if we know the sequence number and
- * bit 2 is set if we have a reason code
+ /* Flags field has AE bit only as defined in
+ * sec 8.4.2.117 IEEE802.11-2012
*/
*pos = 0;
- if (!target_sn)
- *pos |= MP_F_USN;
- if (target_rcode)
- *pos |= MP_F_RCODE;
pos++;
memcpy(pos, target, ETH_ALEN);
pos += ETH_ALEN;
@@ -316,8 +308,9 @@ void ieee80211s_update_metric(struct ieee80211_local *local,
failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK);
/* moving average, scaled to 100 */
- sta->fail_avg = ((80 * sta->fail_avg + 5) / 100 + 20 * failed);
- if (sta->fail_avg > 95)
+ sta->mesh->fail_avg =
+ ((80 * sta->mesh->fail_avg + 5) / 100 + 20 * failed);
+ if (sta->mesh->fail_avg > 95)
mesh_plink_broken(sta);
}
@@ -333,15 +326,15 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local,
u32 tx_time, estimated_retx;
u64 result;
- if (sta->fail_avg >= 100)
+ if (sta->mesh->fail_avg >= 100)
return MAX_METRIC;
- sta_set_rate_info_tx(sta, &sta->last_tx_rate, &rinfo);
+ sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo);
rate = cfg80211_calculate_bitrate(&rinfo);
if (WARN_ON(!rate))
return MAX_METRIC;
- err = (sta->fail_avg << ARITH_SHIFT) / 100;
+ err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100;
/* bitrate is in units of 100 Kbps, while we need rate in units of
* 1Mbps. This will be corrected on tx_time computation.
@@ -441,6 +434,26 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
process = false;
fresh_info = false;
}
+ } else if (!(mpath->flags & MESH_PATH_ACTIVE)) {
+ bool have_sn, newer_sn, bounced;
+
+ have_sn = mpath->flags & MESH_PATH_SN_VALID;
+ newer_sn = have_sn && SN_GT(orig_sn, mpath->sn);
+ bounced = have_sn &&
+ (SN_DELTA(orig_sn, mpath->sn) >
+ MAX_SANE_SN_DELTA);
+
+ if (!have_sn || newer_sn) {
+ /* if SN is newer than what we had
+ * then we can take it */;
+ } else if (bounced) {
+ /* if SN is way different than what
+ * we had then assume the other side
+ * rebooted or restarted */;
+ } else {
+ process = false;
+ fresh_info = false;
+ }
}
} else {
mpath = mesh_path_add(sdata, orig_addr);
@@ -510,14 +523,14 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt,
- const u8 *preq_elem, u32 metric)
+ const u8 *preq_elem, u32 orig_metric)
{
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
struct mesh_path *mpath = NULL;
const u8 *target_addr, *orig_addr;
const u8 *da;
u8 target_flags, ttl, flags;
- u32 orig_sn, target_sn, lifetime, orig_metric;
+ u32 orig_sn, target_sn, lifetime, target_metric;
bool reply = false;
bool forward = true;
bool root_is_gate;
@@ -528,7 +541,6 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
target_sn = PREQ_IE_TARGET_SN(preq_elem);
orig_sn = PREQ_IE_ORIG_SN(preq_elem);
target_flags = PREQ_IE_TARGET_F(preq_elem);
- orig_metric = metric;
/* Proactive PREQ gate announcements */
flags = PREQ_IE_FLAGS(preq_elem);
root_is_gate = !!(flags & RANN_FLAG_IS_GATE);
@@ -539,7 +551,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
mhwmp_dbg(sdata, "PREQ is for us\n");
forward = false;
reply = true;
- metric = 0;
+ target_metric = 0;
if (time_after(jiffies, ifmsh->last_sn_update +
net_traversal_jiffies(sdata)) ||
time_before(jiffies, ifmsh->last_sn_update)) {
@@ -556,7 +568,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
reply = true;
target_addr = sdata->vif.addr;
target_sn = ++ifmsh->sn;
- metric = 0;
+ target_metric = 0;
ifmsh->last_sn_update = jiffies;
}
if (root_is_gate)
@@ -571,15 +583,13 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
SN_LT(mpath->sn, target_sn)) {
mpath->sn = target_sn;
mpath->flags |= MESH_PATH_SN_VALID;
- } else if ((!(target_flags & MP_F_DO)) &&
+ } else if ((!(target_flags & IEEE80211_PREQ_TO_FLAG)) &&
(mpath->flags & MESH_PATH_ACTIVE)) {
reply = true;
- metric = mpath->metric;
+ target_metric = mpath->metric;
target_sn = mpath->sn;
- if (target_flags & MP_F_RF)
- target_flags |= MP_F_DO;
- else
- forward = false;
+ /* Case E2 of sec 13.10.9.3 IEEE 802.11-2012*/
+ target_flags |= IEEE80211_PREQ_TO_FLAG;
}
}
rcu_read_unlock();
@@ -593,7 +603,8 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
mesh_path_sel_frame_tx(MPATH_PREP, 0, orig_addr,
orig_sn, 0, target_addr,
target_sn, mgmt->sa, 0, ttl,
- lifetime, metric, 0, sdata);
+ lifetime, target_metric, 0,
+ sdata);
} else {
ifmsh->mshstats.dropped_frames_ttl++;
}
@@ -619,13 +630,12 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
if (flags & IEEE80211_PREQ_PROACTIVE_PREP_FLAG) {
target_addr = PREQ_IE_TARGET_ADDR(preq_elem);
target_sn = PREQ_IE_TARGET_SN(preq_elem);
- metric = orig_metric;
}
mesh_path_sel_frame_tx(MPATH_PREQ, flags, orig_addr,
orig_sn, target_flags, target_addr,
target_sn, da, hopcount, ttl, lifetime,
- metric, preq_id, sdata);
+ orig_metric, preq_id, sdata);
if (!is_multicast_ether_addr(da))
ifmsh->mshstats.fwded_unicast++;
else
@@ -737,9 +747,12 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
if (mpath->flags & MESH_PATH_ACTIVE &&
ether_addr_equal(ta, sta->sta.addr) &&
(!(mpath->flags & MESH_PATH_SN_VALID) ||
- SN_GT(target_sn, mpath->sn))) {
+ SN_GT(target_sn, mpath->sn) || target_sn == 0)) {
mpath->flags &= ~MESH_PATH_ACTIVE;
- mpath->sn = target_sn;
+ if (target_sn != 0)
+ mpath->sn = target_sn;
+ else
+ mpath->sn += 1;
spin_unlock_bh(&mpath->state_lock);
if (!ifmsh->mshcfg.dot11MeshForwarding)
goto endperr;
@@ -854,7 +867,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
{
struct ieee802_11_elems elems;
size_t baselen;
- u32 last_hop_metric;
+ u32 path_metric;
struct sta_info *sta;
/* need action_code */
@@ -863,7 +876,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
rcu_read_lock();
sta = sta_info_get(sdata, mgmt->sa);
- if (!sta || sta->plink_state != NL80211_PLINK_ESTAB) {
+ if (!sta || sta->mesh->plink_state != NL80211_PLINK_ESTAB) {
rcu_read_unlock();
return;
}
@@ -877,21 +890,21 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
if (elems.preq_len != 37)
/* Right now we support just 1 destination and no AE */
return;
- last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.preq,
- MPATH_PREQ);
- if (last_hop_metric)
+ path_metric = hwmp_route_info_get(sdata, mgmt, elems.preq,
+ MPATH_PREQ);
+ if (path_metric)
hwmp_preq_frame_process(sdata, mgmt, elems.preq,
- last_hop_metric);
+ path_metric);
}
if (elems.prep) {
if (elems.prep_len != 31)
/* Right now we support no AE */
return;
- last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.prep,
- MPATH_PREP);
- if (last_hop_metric)
+ path_metric = hwmp_route_info_get(sdata, mgmt, elems.prep,
+ MPATH_PREP);
+ if (path_metric)
hwmp_prep_frame_process(sdata, mgmt, elems.prep,
- last_hop_metric);
+ path_metric);
}
if (elems.perr) {
if (elems.perr_len != 15)
@@ -975,7 +988,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
struct mesh_preq_queue *preq_node;
struct mesh_path *mpath;
- u8 ttl, target_flags;
+ u8 ttl, target_flags = 0;
const u8 *da;
u32 lifetime;
@@ -1034,9 +1047,9 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
}
if (preq_node->flags & PREQ_Q_F_REFRESH)
- target_flags = MP_F_DO;
+ target_flags |= IEEE80211_PREQ_TO_FLAG;
else
- target_flags = MP_F_RF;
+ target_flags &= ~IEEE80211_PREQ_TO_FLAG;
spin_unlock_bh(&mpath->state_lock);
da = (mpath->is_root) ? mpath->rann_snd_addr : broadcast_addr;
@@ -1177,7 +1190,9 @@ void mesh_path_timer(unsigned long data)
spin_unlock_bh(&mpath->state_lock);
mesh_queue_preq(mpath, 0);
} else {
- mpath->flags = 0;
+ mpath->flags &= ~(MESH_PATH_RESOLVING |
+ MESH_PATH_RESOLVED |
+ MESH_PATH_REQ_QUEUED);
mpath->exp_time = jiffies;
spin_unlock_bh(&mpath->state_lock);
if (!mpath->is_gate && mesh_gate_num(sdata) > 0) {
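The HWMP hunks above depend on serial-number arithmetic: SN_GT()/SN_LT() compare 32-bit sequence numbers through a signed difference so that values which have wrapped past zero still order correctly, and the new SN_DELTA()/MAX_SANE_SN_DELTA pair bounds how far apart two numbers may be before the peer is assumed to have rebooted. A standalone demonstration of both checks (illustrative only, not part of this patch):

#include <stdint.h>
#include <stdio.h>

#define SN_GT(x, y)		((int32_t)((y) - (x)) < 0)
#define MAX_SANE_SN_DELTA	32

static uint32_t sn_delta(uint32_t x, uint32_t y)
{
	return x >= y ? x - y : y - x;
}

int main(void)
{
	uint32_t old_sn = 0xfffffff0u;	/* close to wrapping */
	uint32_t new_sn = 0x00000005u;	/* already wrapped */

	/* 0x5 still counts as "newer" than 0xfffffff0 after wraparound */
	printf("SN_GT(new, old) = %d\n", SN_GT(new_sn, old_sn));

	/* large absolute distance is treated as a rebooted/restarted peer */
	printf("bounced = %d\n", sn_delta(1000, 2) > MAX_SANE_SN_DELTA);
	return 0;
}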
diff --git a/kernel/net/mac80211/mesh_pathtbl.c b/kernel/net/mac80211/mesh_pathtbl.c
index b890e225a..b3b44a5dd 100644
--- a/kernel/net/mac80211/mesh_pathtbl.c
+++ b/kernel/net/mac80211/mesh_pathtbl.c
@@ -779,10 +779,8 @@ void mesh_plink_broken(struct sta_info *sta)
static void mesh_path_node_reclaim(struct rcu_head *rp)
{
struct mpath_node *node = container_of(rp, struct mpath_node, rcu);
- struct ieee80211_sub_if_data *sdata = node->mpath->sdata;
del_timer_sync(&node->mpath->timer);
- atomic_dec(&sdata->u.mesh.mpaths);
kfree(node->mpath);
kfree(node);
}
@@ -790,8 +788,9 @@ static void mesh_path_node_reclaim(struct rcu_head *rp)
/* needs to be called with the corresponding hashwlock taken */
static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node)
{
- struct mesh_path *mpath;
- mpath = node->mpath;
+ struct mesh_path *mpath = node->mpath;
+ struct ieee80211_sub_if_data *sdata = node->mpath->sdata;
+
spin_lock(&mpath->state_lock);
mpath->flags |= MESH_PATH_RESOLVING;
if (mpath->is_gate)
@@ -799,6 +798,7 @@ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node)
hlist_del_rcu(&node->list);
call_rcu(&node->rcu, mesh_path_node_reclaim);
spin_unlock(&mpath->state_lock);
+ atomic_dec(&sdata->u.mesh.mpaths);
atomic_dec(&tbl->entries);
}
diff --git a/kernel/net/mac80211/mesh_plink.c b/kernel/net/mac80211/mesh_plink.c
index 60d737f14..bd3d55eb2 100644
--- a/kernel/net/mac80211/mesh_plink.c
+++ b/kernel/net/mac80211/mesh_plink.c
@@ -13,10 +13,11 @@
#include "rate.h"
#include "mesh.h"
+#define PLINK_CNF_AID(mgmt) ((mgmt)->u.action.u.self_prot.variable + 2)
#define PLINK_GET_LLID(p) (p + 2)
#define PLINK_GET_PLID(p) (p + 4)
-#define mod_plink_timer(s, t) (mod_timer(&s->plink_timer, \
+#define mod_plink_timer(s, t) (mod_timer(&s->mesh->plink_timer, \
jiffies + msecs_to_jiffies(t)))
enum plink_event {
@@ -53,18 +54,15 @@ static const char * const mplevents[] = {
[CLS_IGNR] = "CLS_IGNR"
};
-static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
- enum ieee80211_self_protected_actioncode action,
- u8 *da, u16 llid, u16 plid, u16 reason);
-
-
/* We only need a valid sta if user configured a minimum rssi_threshold. */
static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta)
{
s32 rssi_threshold = sdata->u.mesh.mshcfg.rssi_threshold;
return rssi_threshold == 0 ||
- (sta && (s8) -ewma_read(&sta->avg_signal) > rssi_threshold);
+ (sta &&
+ (s8)-ewma_signal_read(&sta->rx_stats.avg_signal) >
+ rssi_threshold);
}
/**
@@ -72,13 +70,14 @@ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata,
*
* @sta: mesh peer link to restart
*
- * Locking: this function must be called holding sta->lock
+ * Locking: this function must be called holding sta->mesh->plink_lock
*/
static inline void mesh_plink_fsm_restart(struct sta_info *sta)
{
- sta->plink_state = NL80211_PLINK_LISTEN;
- sta->llid = sta->plid = sta->reason = 0;
- sta->plink_retries = 0;
+ lockdep_assert_held(&sta->mesh->plink_lock);
+ sta->mesh->plink_state = NL80211_PLINK_LISTEN;
+ sta->mesh->llid = sta->mesh->plid = sta->mesh->reason = 0;
+ sta->mesh->plink_retries = 0;
}
/*
@@ -105,9 +104,7 @@ static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata)
/* (IEEE 802.11-2012 19.4.5) */
short_slot = true;
goto out;
- } else if (band != IEEE80211_BAND_2GHZ ||
- (band == IEEE80211_BAND_2GHZ &&
- local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
+ } else if (band != IEEE80211_BAND_2GHZ)
goto out;
for (i = 0; i < sband->n_bitrates; i++)
@@ -120,7 +117,7 @@ static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata)
rcu_read_lock();
list_for_each_entry_rcu(sta, &local->sta_list, list) {
if (sdata != sta->sdata ||
- sta->plink_state != NL80211_PLINK_ESTAB)
+ sta->mesh->plink_state != NL80211_PLINK_ESTAB)
continue;
short_slot = false;
@@ -170,7 +167,7 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata)
rcu_read_lock();
list_for_each_entry_rcu(sta, &local->sta_list, list) {
if (sdata != sta->sdata ||
- sta->plink_state != NL80211_PLINK_ESTAB)
+ sta->mesh->plink_state != NL80211_PLINK_ESTAB)
continue;
if (sta->sta.bandwidth > IEEE80211_STA_RX_BW_20)
@@ -205,57 +202,8 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata)
return BSS_CHANGED_HT;
}
-/**
- * __mesh_plink_deactivate - deactivate mesh peer link
- *
- * @sta: mesh peer link to deactivate
- *
- * All mesh paths with this peer as next hop will be flushed
- * Returns beacon changed flag if the beacon content changed.
- *
- * Locking: the caller must hold sta->lock
- */
-static u32 __mesh_plink_deactivate(struct sta_info *sta)
-{
- struct ieee80211_sub_if_data *sdata = sta->sdata;
- u32 changed = 0;
-
- if (sta->plink_state == NL80211_PLINK_ESTAB)
- changed = mesh_plink_dec_estab_count(sdata);
- sta->plink_state = NL80211_PLINK_BLOCKED;
- mesh_path_flush_by_nexthop(sta);
-
- ieee80211_mps_sta_status_update(sta);
- changed |= ieee80211_mps_set_sta_local_pm(sta,
- NL80211_MESH_POWER_UNKNOWN);
-
- return changed;
-}
-
-/**
- * mesh_plink_deactivate - deactivate mesh peer link
- *
- * @sta: mesh peer link to deactivate
- *
- * All mesh paths with this peer as next hop will be flushed
- */
-u32 mesh_plink_deactivate(struct sta_info *sta)
-{
- struct ieee80211_sub_if_data *sdata = sta->sdata;
- u32 changed;
-
- spin_lock_bh(&sta->lock);
- changed = __mesh_plink_deactivate(sta);
- sta->reason = WLAN_REASON_MESH_PEER_CANCELED;
- mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE,
- sta->sta.addr, sta->llid, sta->plid,
- sta->reason);
- spin_unlock_bh(&sta->lock);
-
- return changed;
-}
-
static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
enum ieee80211_self_protected_actioncode action,
u8 *da, u16 llid, u16 plid, u16 reason)
{
@@ -280,6 +228,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
2 + sizeof(struct ieee80211_meshconf_ie) +
2 + sizeof(struct ieee80211_ht_cap) +
2 + sizeof(struct ieee80211_ht_operation) +
+ 2 + sizeof(struct ieee80211_vht_cap) +
+ 2 + sizeof(struct ieee80211_vht_operation) +
2 + 8 + /* peering IE */
sdata->u.mesh.ie_len);
if (!skb)
@@ -305,7 +255,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
if (action == WLAN_SP_MESH_PEERING_CONFIRM) {
/* AID */
pos = skb_put(skb, 2);
- put_unaligned_le16(plid, pos + 2);
+ put_unaligned_le16(sta->sta.aid, pos);
}
if (ieee80211_add_srates_ie(sdata, skb, true, band) ||
ieee80211_add_ext_srates_ie(sdata, skb, true, band) ||
@@ -360,7 +310,9 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
if (action != WLAN_SP_MESH_PEERING_CLOSE) {
if (mesh_add_ht_cap_ie(sdata, skb) ||
- mesh_add_ht_oper_ie(sdata, skb))
+ mesh_add_ht_oper_ie(sdata, skb) ||
+ mesh_add_vht_cap_ie(sdata, skb) ||
+ mesh_add_vht_oper_ie(sdata, skb))
goto free;
}
@@ -374,6 +326,58 @@ free:
return err;
}
+/**
+ * __mesh_plink_deactivate - deactivate mesh peer link
+ *
+ * @sta: mesh peer link to deactivate
+ *
+ * All mesh paths with this peer as next hop will be flushed
+ * Returns beacon changed flag if the beacon content changed.
+ *
+ * Locking: the caller must hold sta->mesh->plink_lock
+ */
+static u32 __mesh_plink_deactivate(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u32 changed = 0;
+
+ lockdep_assert_held(&sta->mesh->plink_lock);
+
+ if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
+ changed = mesh_plink_dec_estab_count(sdata);
+ sta->mesh->plink_state = NL80211_PLINK_BLOCKED;
+ mesh_path_flush_by_nexthop(sta);
+
+ ieee80211_mps_sta_status_update(sta);
+ changed |= ieee80211_mps_set_sta_local_pm(sta,
+ NL80211_MESH_POWER_UNKNOWN);
+
+ return changed;
+}
+
+/**
+ * mesh_plink_deactivate - deactivate mesh peer link
+ *
+ * @sta: mesh peer link to deactivate
+ *
+ * All mesh paths with this peer as next hop will be flushed
+ */
+u32 mesh_plink_deactivate(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u32 changed;
+
+ spin_lock_bh(&sta->mesh->plink_lock);
+ changed = __mesh_plink_deactivate(sta);
+ sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED;
+ mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CLOSE,
+ sta->sta.addr, sta->mesh->llid, sta->mesh->plid,
+ sta->mesh->reason);
+ spin_unlock_bh(&sta->mesh->plink_lock);
+
+ return changed;
+}
+
static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta,
struct ieee802_11_elems *elems, bool insert)
@@ -387,12 +391,14 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
sband = local->hw.wiphy->bands[band];
rates = ieee80211_sta_get_rates(sdata, elems, band, &basic_rates);
- spin_lock_bh(&sta->lock);
- sta->last_rx = jiffies;
+ spin_lock_bh(&sta->mesh->plink_lock);
+ sta->rx_stats.last_rx = jiffies;
/* rates and capabilities don't change during peering */
- if (sta->plink_state == NL80211_PLINK_ESTAB)
+ if (sta->mesh->plink_state == NL80211_PLINK_ESTAB &&
+ sta->mesh->processed_beacon)
goto out;
+ sta->mesh->processed_beacon = true;
if (sta->sta.supp_rates[band] != rates)
changed |= IEEE80211_RC_SUPP_RATES_CHANGED;
@@ -402,6 +408,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
elems->ht_cap_elem, sta))
changed |= IEEE80211_RC_BW_CHANGED;
+ ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
+ elems->vht_cap_elem, sta);
+
if (bw != sta->sta.bandwidth)
changed |= IEEE80211_RC_BW_CHANGED;
@@ -419,23 +428,57 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
else
rate_control_rate_update(local, sband, sta, changed);
out:
- spin_unlock_bh(&sta->lock);
+ spin_unlock_bh(&sta->mesh->plink_lock);
+}
+
+static int mesh_allocate_aid(struct ieee80211_sub_if_data *sdata)
+{
+ struct sta_info *sta;
+ unsigned long *aid_map;
+ int aid;
+
+ aid_map = kcalloc(BITS_TO_LONGS(IEEE80211_MAX_AID + 1),
+ sizeof(*aid_map), GFP_KERNEL);
+ if (!aid_map)
+ return -ENOMEM;
+
+ /* reserve aid 0 for mcast indication */
+ __set_bit(0, aid_map);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &sdata->local->sta_list, list)
+ __set_bit(sta->sta.aid, aid_map);
+ rcu_read_unlock();
+
+ aid = find_first_zero_bit(aid_map, IEEE80211_MAX_AID + 1);
+ kfree(aid_map);
+
+ if (aid > IEEE80211_MAX_AID)
+ return -ENOBUFS;
+
+ return aid;
}
static struct sta_info *
__mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr)
{
struct sta_info *sta;
+ int aid;
if (sdata->local->num_sta >= MESH_MAX_PLINKS)
return NULL;
+ aid = mesh_allocate_aid(sdata);
+ if (aid < 0)
+ return NULL;
+
sta = sta_info_alloc(sdata, hw_addr, GFP_KERNEL);
if (!sta)
return NULL;
- sta->plink_state = NL80211_PLINK_LISTEN;
+ sta->mesh->plink_state = NL80211_PLINK_LISTEN;
sta->sta.wme = true;
+ sta->sta.aid = aid;
sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
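
The mesh_allocate_aid() helper added above builds a temporary bitmap of AIDs already taken by existing peers and picks the lowest free one, reserving AID 0 for the multicast indication. A minimal standalone sketch of the same scheme in plain C (the kernel uses kcalloc(), __set_bit() and find_first_zero_bit(); allocate_aid() and its array argument are invented for the example):

#include <stdio.h>
#include <string.h>

#define MAX_AID 2007            /* IEEE80211_MAX_AID */

/* pick the lowest AID in 1..MAX_AID that is not already in use */
static int allocate_aid(const int *used_aids, int n_used)
{
        unsigned char bitmap[MAX_AID + 1];
        int i;

        memset(bitmap, 0, sizeof(bitmap));
        bitmap[0] = 1;                          /* AID 0 is the mcast indication */

        for (i = 0; i < n_used; i++)
                if (used_aids[i] >= 0 && used_aids[i] <= MAX_AID)
                        bitmap[used_aids[i]] = 1;

        for (i = 1; i <= MAX_AID; i++)
                if (!bitmap[i])
                        return i;

        return -1;                              /* AID space exhausted */
}

int main(void)
{
        int used[] = { 1, 2, 4 };

        printf("next free AID: %d\n", allocate_aid(used, 3));   /* prints 3 */
        return 0;
}

Rebuilding the bitmap on each allocation keeps the common path simple; the walk over the station list above only needs RCU read access.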
@@ -522,7 +565,7 @@ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata,
goto out;
if (mesh_peer_accepts_plinks(elems) &&
- sta->plink_state == NL80211_PLINK_LISTEN &&
+ sta->mesh->plink_state == NL80211_PLINK_LISTEN &&
sdata->u.mesh.accepting_plinks &&
sdata->u.mesh.mshcfg.auto_open_plinks &&
rssi_threshold_check(sdata, sta))
@@ -552,52 +595,52 @@ static void mesh_plink_timer(unsigned long data)
if (sta->sdata->local->quiescing)
return;
- spin_lock_bh(&sta->lock);
+ spin_lock_bh(&sta->mesh->plink_lock);
/* If a timer fires just before a state transition on another CPU,
* we may have already extended the timeout and changed state by the
* time we've acquired the lock and arrived here. In that case,
* skip this timer and wait for the new one.
*/
- if (time_before(jiffies, sta->plink_timer.expires)) {
+ if (time_before(jiffies, sta->mesh->plink_timer.expires)) {
mpl_dbg(sta->sdata,
"Ignoring timer for %pM in state %s (timer adjusted)",
- sta->sta.addr, mplstates[sta->plink_state]);
- spin_unlock_bh(&sta->lock);
+ sta->sta.addr, mplstates[sta->mesh->plink_state]);
+ spin_unlock_bh(&sta->mesh->plink_lock);
return;
}
/* del_timer() and handler may race when entering these states */
- if (sta->plink_state == NL80211_PLINK_LISTEN ||
- sta->plink_state == NL80211_PLINK_ESTAB) {
+ if (sta->mesh->plink_state == NL80211_PLINK_LISTEN ||
+ sta->mesh->plink_state == NL80211_PLINK_ESTAB) {
mpl_dbg(sta->sdata,
"Ignoring timer for %pM in state %s (timer deleted)",
- sta->sta.addr, mplstates[sta->plink_state]);
- spin_unlock_bh(&sta->lock);
+ sta->sta.addr, mplstates[sta->mesh->plink_state]);
+ spin_unlock_bh(&sta->mesh->plink_lock);
return;
}
mpl_dbg(sta->sdata,
"Mesh plink timer for %pM fired on state %s\n",
- sta->sta.addr, mplstates[sta->plink_state]);
+ sta->sta.addr, mplstates[sta->mesh->plink_state]);
sdata = sta->sdata;
mshcfg = &sdata->u.mesh.mshcfg;
- switch (sta->plink_state) {
+ switch (sta->mesh->plink_state) {
case NL80211_PLINK_OPN_RCVD:
case NL80211_PLINK_OPN_SNT:
/* retry timer */
- if (sta->plink_retries < mshcfg->dot11MeshMaxRetries) {
+ if (sta->mesh->plink_retries < mshcfg->dot11MeshMaxRetries) {
u32 rand;
mpl_dbg(sta->sdata,
"Mesh plink for %pM (retry, timeout): %d %d\n",
- sta->sta.addr, sta->plink_retries,
- sta->plink_timeout);
+ sta->sta.addr, sta->mesh->plink_retries,
+ sta->mesh->plink_timeout);
get_random_bytes(&rand, sizeof(u32));
- sta->plink_timeout = sta->plink_timeout +
- rand % sta->plink_timeout;
- ++sta->plink_retries;
- mod_plink_timer(sta, sta->plink_timeout);
+ sta->mesh->plink_timeout = sta->mesh->plink_timeout +
+ rand % sta->mesh->plink_timeout;
+ ++sta->mesh->plink_retries;
+ mod_plink_timer(sta, sta->mesh->plink_timeout);
action = WLAN_SP_MESH_PEERING_OPEN;
break;
}
@@ -607,31 +650,31 @@ static void mesh_plink_timer(unsigned long data)
/* confirm timer */
if (!reason)
reason = WLAN_REASON_MESH_CONFIRM_TIMEOUT;
- sta->plink_state = NL80211_PLINK_HOLDING;
+ sta->mesh->plink_state = NL80211_PLINK_HOLDING;
mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout);
action = WLAN_SP_MESH_PEERING_CLOSE;
break;
case NL80211_PLINK_HOLDING:
/* holding timer */
- del_timer(&sta->plink_timer);
+ del_timer(&sta->mesh->plink_timer);
mesh_plink_fsm_restart(sta);
break;
default:
break;
}
- spin_unlock_bh(&sta->lock);
+ spin_unlock_bh(&sta->mesh->plink_lock);
if (action)
- mesh_plink_frame_tx(sdata, action, sta->sta.addr,
- sta->llid, sta->plid, reason);
+ mesh_plink_frame_tx(sdata, sta, action, sta->sta.addr,
+ sta->mesh->llid, sta->mesh->plid, reason);
}
static inline void mesh_plink_timer_set(struct sta_info *sta, u32 timeout)
{
- sta->plink_timer.expires = jiffies + msecs_to_jiffies(timeout);
- sta->plink_timer.data = (unsigned long) sta;
- sta->plink_timer.function = mesh_plink_timer;
- sta->plink_timeout = timeout;
- add_timer(&sta->plink_timer);
+ sta->mesh->plink_timer.expires = jiffies + msecs_to_jiffies(timeout);
+ sta->mesh->plink_timer.data = (unsigned long) sta;
+ sta->mesh->plink_timer.function = mesh_plink_timer;
+ sta->mesh->plink_timeout = timeout;
+ add_timer(&sta->mesh->plink_timer);
}
static bool llid_in_use(struct ieee80211_sub_if_data *sdata,
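
In the retry branch of mesh_plink_timer() the next timeout is the current one plus a random value below it, so repeated retries back off roughly exponentially with jitter. A standalone sketch of that backoff, with get_random_u32() standing in for the kernel's get_random_bytes():

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* stand-in for the kernel's get_random_bytes() */
static unsigned int get_random_u32(void)
{
        return (unsigned int)rand();
}

/* grow the plink timeout as the retry path above does: new = old + rand % old */
static unsigned int plink_backoff(unsigned int timeout_ms)
{
        return timeout_ms + get_random_u32() % timeout_ms;
}

int main(void)
{
        unsigned int t = 500;   /* dot11MeshRetryTimeout-style start value, ms */
        int retry;

        srand((unsigned int)time(NULL));
        for (retry = 1; retry <= 4; retry++) {
                t = plink_backoff(t);
                printf("retry %d: next timeout %u ms\n", retry, t);
        }
        return 0;
}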
@@ -643,7 +686,10 @@ static bool llid_in_use(struct ieee80211_sub_if_data *sdata,
rcu_read_lock();
list_for_each_entry_rcu(sta, &local->sta_list, list) {
- if (!memcmp(&sta->llid, &llid, sizeof(llid))) {
+ if (sdata != sta->sdata)
+ continue;
+
+ if (!memcmp(&sta->mesh->llid, &llid, sizeof(llid))) {
in_use = true;
break;
}
@@ -659,8 +705,6 @@ static u16 mesh_get_new_llid(struct ieee80211_sub_if_data *sdata)
do {
get_random_bytes(&llid, sizeof(llid));
- /* for mesh PS we still only have the AID range for TIM bits */
- llid = (llid % IEEE80211_MAX_AID) + 1;
} while (llid_in_use(sdata, llid));
return llid;
@@ -674,16 +718,16 @@ u32 mesh_plink_open(struct sta_info *sta)
if (!test_sta_flag(sta, WLAN_STA_AUTH))
return 0;
- spin_lock_bh(&sta->lock);
- sta->llid = mesh_get_new_llid(sdata);
- if (sta->plink_state != NL80211_PLINK_LISTEN &&
- sta->plink_state != NL80211_PLINK_BLOCKED) {
- spin_unlock_bh(&sta->lock);
+ spin_lock_bh(&sta->mesh->plink_lock);
+ sta->mesh->llid = mesh_get_new_llid(sdata);
+ if (sta->mesh->plink_state != NL80211_PLINK_LISTEN &&
+ sta->mesh->plink_state != NL80211_PLINK_BLOCKED) {
+ spin_unlock_bh(&sta->mesh->plink_lock);
return 0;
}
- sta->plink_state = NL80211_PLINK_OPN_SNT;
+ sta->mesh->plink_state = NL80211_PLINK_OPN_SNT;
mesh_plink_timer_set(sta, sdata->u.mesh.mshcfg.dot11MeshRetryTimeout);
- spin_unlock_bh(&sta->lock);
+ spin_unlock_bh(&sta->mesh->plink_lock);
mpl_dbg(sdata,
"Mesh plink: starting establishment with %pM\n",
sta->sta.addr);
@@ -691,8 +735,8 @@ u32 mesh_plink_open(struct sta_info *sta)
/* set the non-peer mode to active during peering */
changed = ieee80211_mps_local_status_update(sdata);
- mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_OPEN,
- sta->sta.addr, sta->llid, 0, 0);
+ mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_OPEN,
+ sta->sta.addr, sta->mesh->llid, 0, 0);
return changed;
}
@@ -700,10 +744,10 @@ u32 mesh_plink_block(struct sta_info *sta)
{
u32 changed;
- spin_lock_bh(&sta->lock);
+ spin_lock_bh(&sta->mesh->plink_lock);
changed = __mesh_plink_deactivate(sta);
- sta->plink_state = NL80211_PLINK_BLOCKED;
- spin_unlock_bh(&sta->lock);
+ sta->mesh->plink_state = NL80211_PLINK_BLOCKED;
+ spin_unlock_bh(&sta->mesh->plink_lock);
return changed;
}
@@ -713,12 +757,11 @@ static void mesh_plink_close(struct ieee80211_sub_if_data *sdata,
enum plink_event event)
{
struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg;
-
u16 reason = (event == CLS_ACPT) ?
WLAN_REASON_MESH_CLOSE : WLAN_REASON_MESH_CONFIG;
- sta->reason = reason;
- sta->plink_state = NL80211_PLINK_HOLDING;
+ sta->mesh->reason = reason;
+ sta->mesh->plink_state = NL80211_PLINK_HOLDING;
mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout);
}
@@ -728,8 +771,8 @@ static u32 mesh_plink_establish(struct ieee80211_sub_if_data *sdata,
struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg;
u32 changed = 0;
- del_timer(&sta->plink_timer);
- sta->plink_state = NL80211_PLINK_ESTAB;
+ del_timer(&sta->mesh->plink_timer);
+ sta->mesh->plink_state = NL80211_PLINK_ESTAB;
changed |= mesh_plink_inc_estab_count(sdata);
changed |= mesh_set_ht_prot_mode(sdata);
changed |= mesh_set_short_slot_time(sdata);
@@ -756,18 +799,18 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata,
u32 changed = 0;
mpl_dbg(sdata, "peer %pM in state %s got event %s\n", sta->sta.addr,
- mplstates[sta->plink_state], mplevents[event]);
+ mplstates[sta->mesh->plink_state], mplevents[event]);
- spin_lock_bh(&sta->lock);
- switch (sta->plink_state) {
+ spin_lock_bh(&sta->mesh->plink_lock);
+ switch (sta->mesh->plink_state) {
case NL80211_PLINK_LISTEN:
switch (event) {
case CLS_ACPT:
mesh_plink_fsm_restart(sta);
break;
case OPN_ACPT:
- sta->plink_state = NL80211_PLINK_OPN_RCVD;
- sta->llid = mesh_get_new_llid(sdata);
+ sta->mesh->plink_state = NL80211_PLINK_OPN_RCVD;
+ sta->mesh->llid = mesh_get_new_llid(sdata);
mesh_plink_timer_set(sta,
mshcfg->dot11MeshRetryTimeout);
@@ -789,11 +832,11 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata,
break;
case OPN_ACPT:
/* retry timer is left untouched */
- sta->plink_state = NL80211_PLINK_OPN_RCVD;
+ sta->mesh->plink_state = NL80211_PLINK_OPN_RCVD;
action = WLAN_SP_MESH_PEERING_CONFIRM;
break;
case CNF_ACPT:
- sta->plink_state = NL80211_PLINK_CNF_RCVD;
+ sta->mesh->plink_state = NL80211_PLINK_CNF_RCVD;
mod_plink_timer(sta, mshcfg->dot11MeshConfirmTimeout);
break;
default:
@@ -853,7 +896,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata,
case NL80211_PLINK_HOLDING:
switch (event) {
case CLS_ACPT:
- del_timer(&sta->plink_timer);
+ del_timer(&sta->mesh->plink_timer);
mesh_plink_fsm_restart(sta);
break;
case OPN_ACPT:
@@ -872,17 +915,18 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata,
*/
break;
}
- spin_unlock_bh(&sta->lock);
+ spin_unlock_bh(&sta->mesh->plink_lock);
if (action) {
- mesh_plink_frame_tx(sdata, action, sta->sta.addr,
- sta->llid, sta->plid, sta->reason);
+ mesh_plink_frame_tx(sdata, sta, action, sta->sta.addr,
+ sta->mesh->llid, sta->mesh->plid,
+ sta->mesh->reason);
/* also send confirm in open case */
if (action == WLAN_SP_MESH_PEERING_OPEN) {
- mesh_plink_frame_tx(sdata,
+ mesh_plink_frame_tx(sdata, sta,
WLAN_SP_MESH_PEERING_CONFIRM,
- sta->sta.addr, sta->llid,
- sta->plid, 0);
+ sta->sta.addr, sta->mesh->llid,
+ sta->mesh->plid, 0);
}
}
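
mesh_plink_fsm() above is a table of (state, event) transitions taken under plink_lock, followed by an optional peering frame transmission. A toy subset covering only the accepting path (LISTEN -> OPN_RCVD -> ESTAB) and the close event, with shortened enum names, shows the shape of the machine:

#include <stdio.h>

enum plink_state { PLINK_LISTEN, PLINK_OPN_RCVD, PLINK_ESTAB, PLINK_HOLDING };
enum plink_event { EV_OPN_ACPT, EV_CNF_ACPT, EV_CLS_ACPT };

/* happy-path subset of the peering FSM: open accepted, then confirm accepted */
static enum plink_state plink_step(enum plink_state s, enum plink_event ev)
{
        if (ev == EV_CLS_ACPT)
                return PLINK_HOLDING;           /* tear-down path */

        switch (s) {
        case PLINK_LISTEN:
                if (ev == EV_OPN_ACPT)
                        return PLINK_OPN_RCVD;  /* real code also sends CONFIRM */
                break;
        case PLINK_OPN_RCVD:
                if (ev == EV_CNF_ACPT)
                        return PLINK_ESTAB;     /* peer link established */
                break;
        default:
                break;
        }
        return s;
}

int main(void)
{
        enum plink_state s = PLINK_LISTEN;

        s = plink_step(s, EV_OPN_ACPT);
        s = plink_step(s, EV_CNF_ACPT);
        printf("final state: %d (2 == ESTAB)\n", s);
        return 0;
}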
@@ -937,7 +981,7 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
mpl_dbg(sdata, "Mesh plink: Action frame from non-authed peer\n");
goto out;
}
- if (sta->plink_state == NL80211_PLINK_BLOCKED)
+ if (sta->mesh->plink_state == NL80211_PLINK_BLOCKED)
goto out;
}
@@ -952,7 +996,7 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
if (!matches_local)
event = OPN_RJCT;
if (!mesh_plink_free_count(sdata) ||
- (sta->plid && sta->plid != plid))
+ (sta->mesh->plid && sta->mesh->plid != plid))
event = OPN_IGNR;
else
event = OPN_ACPT;
@@ -961,14 +1005,14 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
if (!matches_local)
event = CNF_RJCT;
if (!mesh_plink_free_count(sdata) ||
- sta->llid != llid ||
- (sta->plid && sta->plid != plid))
+ sta->mesh->llid != llid ||
+ (sta->mesh->plid && sta->mesh->plid != plid))
event = CNF_IGNR;
else
event = CNF_ACPT;
break;
case WLAN_SP_MESH_PEERING_CLOSE:
- if (sta->plink_state == NL80211_PLINK_ESTAB)
+ if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
/* Do not check for llid or plid. This does not
* follow the standard but since multiple plinks
* per sta are not supported, it is necessary in
@@ -979,9 +1023,9 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
* restarted.
*/
event = CLS_ACPT;
- else if (sta->plid != plid)
+ else if (sta->mesh->plid != plid)
event = CLS_IGNR;
- else if (ie_len == 8 && sta->llid != llid)
+ else if (ie_len == 8 && sta->mesh->llid != llid)
event = CLS_IGNR;
else
event = CLS_ACPT;
@@ -1068,9 +1112,9 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata,
mpl_dbg(sdata, "Mesh plink: failed to init peer!\n");
goto unlock_rcu;
}
- sta->plid = plid;
+ sta->mesh->plid = plid;
} else if (!sta && event == OPN_RJCT) {
- mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE,
+ mesh_plink_frame_tx(sdata, NULL, WLAN_SP_MESH_PEERING_CLOSE,
mgmt->sa, 0, plid,
WLAN_REASON_MESH_CONFIG);
goto unlock_rcu;
@@ -1079,9 +1123,13 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata,
goto unlock_rcu;
}
- /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */
- if (!sta->plid && event == CNF_ACPT)
- sta->plid = plid;
+ if (event == CNF_ACPT) {
+ /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */
+ if (!sta->mesh->plid)
+ sta->mesh->plid = plid;
+
+ sta->mesh->aid = get_unaligned_le16(PLINK_CNF_AID(mgmt));
+ }
changed |= mesh_plink_fsm(sdata, sta, event);
@@ -1120,6 +1168,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata,
WLAN_SP_MESH_PEERING_CONFIRM) {
baseaddr += 4;
baselen += 4;
+
+ if (baselen > len)
+ return;
}
ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems);
mesh_process_plink_frame(sdata, mgmt, &elems);
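
The final hunk adds a baselen > len check so that a peering CONFIRM frame too short to carry the AID field is dropped instead of being handed to ieee802_11_parse_elems() with a negative remaining length. A tiny sketch of the same guard, with made-up header sizes:

#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>

/* hypothetical fixed-header sizes, just to show the guard */
#define PLINK_FIXED_LEN  25     /* action header, capability, etc. */
#define CONFIRM_AID_LEN   4     /* extra bytes only CONFIRM frames carry */

/* return false instead of parsing IEs with a negative remaining length */
static bool plink_frame_len_ok(size_t len, bool is_confirm)
{
        size_t baselen = PLINK_FIXED_LEN;

        if (is_confirm)
                baselen += CONFIRM_AID_LEN;

        return baselen <= len;  /* only then parse IEs in len - baselen bytes */
}

int main(void)
{
        printf("len 26, confirm: %s\n",
               plink_frame_len_ok(26, true) ? "parse" : "drop");
        printf("len 40, confirm: %s\n",
               plink_frame_len_ok(40, true) ? "parse" : "drop");
        return 0;
}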
diff --git a/kernel/net/mac80211/mesh_ps.c b/kernel/net/mac80211/mesh_ps.c
index ad8b377b4..90a268abe 100644
--- a/kernel/net/mac80211/mesh_ps.c
+++ b/kernel/net/mac80211/mesh_ps.c
@@ -92,16 +92,16 @@ u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata)
if (sdata != sta->sdata)
continue;
- switch (sta->plink_state) {
+ switch (sta->mesh->plink_state) {
case NL80211_PLINK_OPN_SNT:
case NL80211_PLINK_OPN_RCVD:
case NL80211_PLINK_CNF_RCVD:
peering = true;
break;
case NL80211_PLINK_ESTAB:
- if (sta->local_pm == NL80211_MESH_POWER_LIGHT_SLEEP)
+ if (sta->mesh->local_pm == NL80211_MESH_POWER_LIGHT_SLEEP)
light_sleep_cnt++;
- else if (sta->local_pm == NL80211_MESH_POWER_DEEP_SLEEP)
+ else if (sta->mesh->local_pm == NL80211_MESH_POWER_DEEP_SLEEP)
deep_sleep_cnt++;
break;
default:
@@ -153,19 +153,19 @@ u32 ieee80211_mps_set_sta_local_pm(struct sta_info *sta,
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
- if (sta->local_pm == pm)
+ if (sta->mesh->local_pm == pm)
return 0;
mps_dbg(sdata, "local STA operates in mode %d with %pM\n",
pm, sta->sta.addr);
- sta->local_pm = pm;
+ sta->mesh->local_pm = pm;
/*
* announce peer-specific power mode transition
* (see IEEE802.11-2012 13.14.3.2 and 13.14.3.3)
*/
- if (sta->plink_state == NL80211_PLINK_ESTAB)
+ if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
mps_qos_null_tx(sta);
return ieee80211_mps_local_status_update(sdata);
@@ -197,8 +197,8 @@ void ieee80211_mps_set_frame_flags(struct ieee80211_sub_if_data *sdata,
if (is_unicast_ether_addr(hdr->addr1) &&
ieee80211_is_data_qos(hdr->frame_control) &&
- sta->plink_state == NL80211_PLINK_ESTAB)
- pm = sta->local_pm;
+ sta->mesh->plink_state == NL80211_PLINK_ESTAB)
+ pm = sta->mesh->local_pm;
else
pm = sdata->u.mesh.nonpeer_pm;
@@ -241,16 +241,16 @@ void ieee80211_mps_sta_status_update(struct sta_info *sta)
* use peer-specific power mode if peering is established and the
* peer's power mode is known
*/
- if (sta->plink_state == NL80211_PLINK_ESTAB &&
- sta->peer_pm != NL80211_MESH_POWER_UNKNOWN)
- pm = sta->peer_pm;
+ if (sta->mesh->plink_state == NL80211_PLINK_ESTAB &&
+ sta->mesh->peer_pm != NL80211_MESH_POWER_UNKNOWN)
+ pm = sta->mesh->peer_pm;
else
- pm = sta->nonpeer_pm;
+ pm = sta->mesh->nonpeer_pm;
do_buffer = (pm != NL80211_MESH_POWER_ACTIVE);
/* clear the MPSP flags for non-peers or active STA */
- if (sta->plink_state != NL80211_PLINK_ESTAB) {
+ if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) {
clear_sta_flag(sta, WLAN_STA_MPSP_OWNER);
clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT);
} else if (!do_buffer) {
@@ -296,13 +296,13 @@ static void mps_set_sta_peer_pm(struct sta_info *sta,
pm = NL80211_MESH_POWER_ACTIVE;
}
- if (sta->peer_pm == pm)
+ if (sta->mesh->peer_pm == pm)
return;
mps_dbg(sta->sdata, "STA %pM enters mode %d\n",
sta->sta.addr, pm);
- sta->peer_pm = pm;
+ sta->mesh->peer_pm = pm;
ieee80211_mps_sta_status_update(sta);
}
@@ -317,13 +317,13 @@ static void mps_set_sta_nonpeer_pm(struct sta_info *sta,
else
pm = NL80211_MESH_POWER_ACTIVE;
- if (sta->nonpeer_pm == pm)
+ if (sta->mesh->nonpeer_pm == pm)
return;
mps_dbg(sta->sdata, "STA %pM sets non-peer mode to %d\n",
sta->sta.addr, pm);
- sta->nonpeer_pm = pm;
+ sta->mesh->nonpeer_pm = pm;
ieee80211_mps_sta_status_update(sta);
}
@@ -552,7 +552,7 @@ void ieee80211_mpsp_trigger_process(u8 *qc, struct sta_info *sta,
} else {
if (eosp)
clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT);
- else if (sta->local_pm != NL80211_MESH_POWER_ACTIVE)
+ else if (sta->mesh->local_pm != NL80211_MESH_POWER_ACTIVE)
set_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT);
if (rspi && !test_and_set_sta_flag(sta, WLAN_STA_MPSP_OWNER))
@@ -577,9 +577,9 @@ void ieee80211_mps_frame_release(struct sta_info *sta,
int ac, buffer_local = 0;
bool has_buffered = false;
- if (sta->plink_state == NL80211_PLINK_ESTAB)
+ if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len,
- sta->llid);
+ sta->mesh->aid);
if (has_buffered)
mps_dbg(sta->sdata, "%pM indicates buffered frames\n",
@@ -598,7 +598,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta,
if (!has_buffered && !buffer_local)
return;
- if (sta->plink_state == NL80211_PLINK_ESTAB)
+ if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
mpsp_trigger_send(sta, has_buffered, !buffer_local);
else
mps_frame_deliver(sta, 1);
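
ieee80211_mps_sta_status_update() chooses which power mode governs buffering toward a peer: the peer-specific mode once the link is established and the peer's mode is known, the non-peer mode otherwise; anything other than ACTIVE means frames must be buffered. A minimal sketch of that selection (struct mesh_peer and mps_should_buffer() are invented names):

#include <stdio.h>
#include <stdbool.h>

enum mesh_pm { PM_UNKNOWN, PM_ACTIVE, PM_LIGHT_SLEEP, PM_DEEP_SLEEP };
enum plink_state { PLINK_LISTEN, PLINK_ESTAB };

struct mesh_peer {
        enum plink_state plink_state;
        enum mesh_pm peer_pm;           /* advertised by the peer */
        enum mesh_pm nonpeer_pm;        /* what applies toward non-peers */
};

/* true when frames toward this peer must be buffered (peer may be asleep) */
static bool mps_should_buffer(const struct mesh_peer *p)
{
        enum mesh_pm pm;

        if (p->plink_state == PLINK_ESTAB && p->peer_pm != PM_UNKNOWN)
                pm = p->peer_pm;
        else
                pm = p->nonpeer_pm;

        return pm != PM_ACTIVE;
}

int main(void)
{
        struct mesh_peer p = { PLINK_ESTAB, PM_LIGHT_SLEEP, PM_ACTIVE };

        printf("buffer: %d\n", mps_should_buffer(&p));  /* 1: peer sleeps */
        return 0;
}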
diff --git a/kernel/net/mac80211/mesh_sync.c b/kernel/net/mac80211/mesh_sync.c
index 09625d620..64bc22ad9 100644
--- a/kernel/net/mac80211/mesh_sync.c
+++ b/kernel/net/mac80211/mesh_sync.c
@@ -127,14 +127,14 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
/* Timing offset calculation (see 13.13.2.2.2) */
t_t = le64_to_cpu(mgmt->u.beacon.timestamp);
- sta->t_offset = t_t - t_r;
+ sta->mesh->t_offset = t_t - t_r;
if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) {
- s64 t_clockdrift = sta->t_offset_setpoint - sta->t_offset;
+ s64 t_clockdrift = sta->mesh->t_offset_setpoint - sta->mesh->t_offset;
msync_dbg(sdata,
- "STA %pM : sta->t_offset=%lld, sta->t_offset_setpoint=%lld, t_clockdrift=%lld\n",
- sta->sta.addr, (long long) sta->t_offset,
- (long long) sta->t_offset_setpoint,
+ "STA %pM : t_offset=%lld, t_offset_setpoint=%lld, t_clockdrift=%lld\n",
+ sta->sta.addr, (long long) sta->mesh->t_offset,
+ (long long) sta->mesh->t_offset_setpoint,
(long long) t_clockdrift);
if (t_clockdrift > TOFFSET_MAXIMUM_ADJUSTMENT ||
@@ -152,12 +152,12 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
ifmsh->sync_offset_clockdrift_max = t_clockdrift;
spin_unlock_bh(&ifmsh->sync_offset_lock);
} else {
- sta->t_offset_setpoint = sta->t_offset - TOFFSET_SET_MARGIN;
+ sta->mesh->t_offset_setpoint = sta->mesh->t_offset - TOFFSET_SET_MARGIN;
set_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN);
msync_dbg(sdata,
- "STA %pM : offset was invalid, sta->t_offset=%lld\n",
+ "STA %pM : offset was invalid, t_offset=%lld\n",
sta->sta.addr,
- (long long) sta->t_offset);
+ (long long) sta->mesh->t_offset);
}
no_sync:
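
The sync handler computes t_offset as the peer's beacon timestamp minus the local receive time, stores a setpoint one margin below the first valid offset, and treats the difference between the setpoint and later offsets as clock drift. The arithmetic, as a standalone snippet (the 20 ms margin is illustrative; see TOFFSET_SET_MARGIN in mesh_sync.c for the real constant):

#include <stdio.h>
#include <stdint.h>

#define SET_MARGIN 20000        /* usec, illustrative stand-in for TOFFSET_SET_MARGIN */

int main(void)
{
        /* first beacon: peer TSF vs. our local RX timestamp, both in usec */
        uint64_t t_t1 = 1000500, t_r1 = 1000000;
        /* a later beacon, roughly one second on */
        uint64_t t_t2 = 2000540, t_r2 = 2000000;

        int64_t t_offset  = (int64_t)(t_t1 - t_r1);             /* 500 */
        int64_t setpoint  = t_offset - SET_MARGIN;
        int64_t t_offset2 = (int64_t)(t_t2 - t_r2);             /* 540 */
        int64_t t_clockdrift = setpoint - t_offset2;

        printf("t_offset=%lld setpoint=%lld drift=%lld\n",
               (long long)t_offset, (long long)setpoint,
               (long long)t_clockdrift);
        return 0;
}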
diff --git a/kernel/net/mac80211/mlme.c b/kernel/net/mac80211/mlme.c
index 26053bf2f..83097c383 100644
--- a/kernel/net/mac80211/mlme.c
+++ b/kernel/net/mac80211/mlme.c
@@ -6,6 +6,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright (C) 2015 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -19,7 +20,6 @@
#include <linux/etherdevice.h>
#include <linux/moduleparam.h>
#include <linux/rtnetlink.h>
-#include <linux/pm_qos.h>
#include <linux/crc32.h>
#include <linux/slab.h>
#include <linux/export.h>
@@ -81,13 +81,6 @@ MODULE_PARM_DESC(probe_wait_ms,
" before disconnecting (reason 4).");
/*
- * Weight given to the latest Beacon frame when calculating average signal
- * strength for Beacon frames received in the current BSS. This must be
- * between 1 and 15.
- */
-#define IEEE80211_SIGNAL_AVE_WEIGHT 3
-
-/*
* How many Beacon frames need to have been used in average signal strength
* before starting to indicate signal change events.
*/
@@ -118,7 +111,7 @@ void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata)
if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)
return;
- if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+ if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR))
return;
mod_timer(&sdata->u.mgd.bcn_mon_timer,
@@ -134,7 +127,7 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata)
ifmgd->probe_send_count = 0;
- if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+ if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR))
return;
mod_timer(&sdata->u.mgd.conn_mon_timer,
@@ -538,11 +531,16 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata,
ieee80211_ie_build_ht_cap(pos, &ht_cap, cap);
}
+/* This function determines vht capability flags for the association
+ * and builds the IE.
+ * Note - the function may set the owner of the MU-MIMO capability
+ */
static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb,
struct ieee80211_supported_band *sband,
struct ieee80211_vht_cap *ap_vht_cap)
{
+ struct ieee80211_local *local = sdata->local;
u8 *pos;
u32 cap;
struct ieee80211_sta_vht_cap vht_cap;
@@ -576,7 +574,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
*/
if (!(ap_vht_cap->vht_cap_info &
cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE)))
- cap &= ~IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE;
+ cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE |
+ IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE);
+ else if (!(ap_vht_cap->vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE)))
+ cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE;
+
+ /*
+ * If some other vif is using the MU-MIMO capability we cannot associate
+ * using MU-MIMO - this will lead to contradictions in the group-id
+ * mechanism.
+ * Ownership is defined since association request, in order to avoid
+ * simultaneous associations with MU-MIMO.
+ */
+ if (cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) {
+ bool disable_mu_mimo = false;
+ struct ieee80211_sub_if_data *other;
+
+ list_for_each_entry_rcu(other, &local->interfaces, list) {
+ if (other->flags & IEEE80211_SDATA_MU_MIMO_OWNER) {
+ disable_mu_mimo = true;
+ break;
+ }
+ }
+ if (disable_mu_mimo)
+ cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE;
+ else
+ sdata->flags |= IEEE80211_SDATA_MU_MIMO_OWNER;
+ }
mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK;
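
The MU-MIMO ownership rule added above lets only one vif keep the MU beamformee capability bit: if any other interface already owns it, the bit is cleared from the association request, otherwise this vif claims ownership. A sketch of the scan-and-claim over a plain array (the kernel walks local->interfaces under RCU; the struct and flag names below are invented):

#include <stdio.h>
#include <stdbool.h>

#define FLAG_MU_MIMO_OWNER      0x1
#define CAP_MU_BEAMFORMEE       0x2

struct vif {
        unsigned int flags;
};

/* clear the MU beamformee cap if another vif already owns MU-MIMO,
 * otherwise claim ownership for this vif
 */
static unsigned int claim_mu_mimo(struct vif *self, struct vif *vifs,
                                  int n_vifs, unsigned int cap)
{
        bool owned_elsewhere = false;
        int i;

        if (!(cap & CAP_MU_BEAMFORMEE))
                return cap;

        for (i = 0; i < n_vifs; i++) {
                if (&vifs[i] != self && (vifs[i].flags & FLAG_MU_MIMO_OWNER)) {
                        owned_elsewhere = true;
                        break;
                }
        }

        if (owned_elsewhere)
                cap &= ~CAP_MU_BEAMFORMEE;
        else
                self->flags |= FLAG_MU_MIMO_OWNER;

        return cap;
}

int main(void)
{
        struct vif vifs[2] = { { 0 }, { FLAG_MU_MIMO_OWNER } };
        unsigned int cap = claim_mu_mimo(&vifs[0], vifs, 2, CAP_MU_BEAMFORMEE);

        printf("cap after claim: 0x%x (MU bit cleared)\n", cap);
        return 0;
}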
@@ -669,17 +694,15 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
capab = WLAN_CAPABILITY_ESS;
if (sband->band == IEEE80211_BAND_2GHZ) {
- if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
- capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME;
- if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE))
- capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
+ capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME;
+ capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
}
if (assoc_data->capability & WLAN_CAPABILITY_PRIVACY)
capab |= WLAN_CAPABILITY_PRIVACY;
if ((assoc_data->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) &&
- (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT))
+ ieee80211_hw_check(&local->hw, SPECTRUM_MGMT))
capab |= WLAN_CAPABILITY_SPECTRUM_MGMT;
if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM)
@@ -887,7 +910,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
drv_mgd_prepare_tx(local, sdata);
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
- if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS |
IEEE80211_TX_INTFL_MLME_CONN_TX;
ieee80211_tx_skb(sdata, skb);
@@ -912,7 +935,7 @@ void ieee80211_send_pspoll(struct ieee80211_local *local,
void ieee80211_send_nullfunc(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
- int powersave)
+ bool powersave)
{
struct sk_buff *skb;
struct ieee80211_hdr_3addr *nullfunc;
@@ -929,7 +952,7 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local,
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
- if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL)
@@ -1161,6 +1184,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
return;
}
+ /*
+ * Drop all TDLS peers - either we disconnect or move to a different
+ * channel from this point on. There's no telling what our peer will do.
+ * The TDLS WIDER_BW scenario is also problematic, as peers might now
+ * have an incompatible wider chandef.
+ */
+ ieee80211_teardown_tdls_peers(sdata);
+
mutex_lock(&local->mtx);
mutex_lock(&local->chanctx_mtx);
conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
@@ -1174,7 +1205,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
chanctx = container_of(conf, struct ieee80211_chanctx, conf);
if (local->use_chanctx &&
- !(local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA)) {
+ !ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) {
sdata_info(sdata,
"driver doesn't support chan-switch with channel contexts\n");
goto drop_connection;
@@ -1348,21 +1379,26 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata,
*/
if (has_80211h_pwr &&
(!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) {
+ new_ap_level = pwr_level_80211h;
+
+ if (sdata->ap_power_level == new_ap_level)
+ return 0;
+
sdata_dbg(sdata,
"Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n",
pwr_level_80211h, chan_pwr, pwr_reduction_80211h,
sdata->u.mgd.bssid);
- new_ap_level = pwr_level_80211h;
} else { /* has_cisco_pwr is always true here. */
+ new_ap_level = pwr_level_cisco;
+
+ if (sdata->ap_power_level == new_ap_level)
+ return 0;
+
sdata_dbg(sdata,
"Limiting TX power to %d dBm as advertised by %pM\n",
pwr_level_cisco, sdata->u.mgd.bssid);
- new_ap_level = pwr_level_cisco;
}
- if (sdata->ap_power_level == new_ap_level)
- return 0;
-
sdata->ap_power_level = new_ap_level;
if (__ieee80211_recalc_txpower(sdata))
return BSS_CHANGED_TXPOWER;
@@ -1383,15 +1419,15 @@ static void ieee80211_enable_ps(struct ieee80211_local *local,
return;
if (conf->dynamic_ps_timeout > 0 &&
- !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)) {
+ !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) {
mod_timer(&local->dynamic_ps_timer, jiffies +
msecs_to_jiffies(conf->dynamic_ps_timeout));
} else {
- if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)
- ieee80211_send_nullfunc(local, sdata, 1);
+ if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK))
+ ieee80211_send_nullfunc(local, sdata, true);
- if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) &&
- (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS))
+ if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) &&
+ ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
return;
conf->flags |= IEEE80211_CONF_PS;
@@ -1444,13 +1480,13 @@ static bool ieee80211_powersave_allowed(struct ieee80211_sub_if_data *sdata)
}
/* need to hold RTNL or interface lock */
-void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency)
+void ieee80211_recalc_ps(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata, *found = NULL;
int count = 0;
int timeout;
- if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) {
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) {
local->ps_sdata = NULL;
return;
}
@@ -1473,48 +1509,23 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency)
}
if (count == 1 && ieee80211_powersave_allowed(found)) {
+ u8 dtimper = found->u.mgd.dtim_period;
s32 beaconint_us;
- if (latency < 0)
- latency = pm_qos_request(PM_QOS_NETWORK_LATENCY);
-
beaconint_us = ieee80211_tu_to_usec(
found->vif.bss_conf.beacon_int);
timeout = local->dynamic_ps_forced_timeout;
- if (timeout < 0) {
- /*
- * Go to full PSM if the user configures a very low
- * latency requirement.
- * The 2000 second value is there for compatibility
- * until the PM_QOS_NETWORK_LATENCY is configured
- * with real values.
- */
- if (latency > (1900 * USEC_PER_MSEC) &&
- latency != (2000 * USEC_PER_SEC))
- timeout = 0;
- else
- timeout = 100;
- }
+ if (timeout < 0)
+ timeout = 100;
local->hw.conf.dynamic_ps_timeout = timeout;
- if (beaconint_us > latency) {
- local->ps_sdata = NULL;
- } else {
- int maxslp = 1;
- u8 dtimper = found->u.mgd.dtim_period;
-
- /* If the TIM IE is invalid, pretend the value is 1 */
- if (!dtimper)
- dtimper = 1;
- else if (dtimper > 1)
- maxslp = min_t(int, dtimper,
- latency / beaconint_us);
-
- local->hw.conf.max_sleep_period = maxslp;
- local->hw.conf.ps_dtim_period = dtimper;
- local->ps_sdata = found;
- }
+ /* If the TIM IE is invalid, pretend the value is 1 */
+ if (!dtimper)
+ dtimper = 1;
+
+ local->hw.conf.ps_dtim_period = dtimper;
+ local->ps_sdata = found;
} else {
local->ps_sdata = NULL;
}
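
With the PM-QoS network-latency input removed, ieee80211_recalc_ps() simply falls back to a 100 ms dynamic-PS timeout when none is forced and clamps an invalid DTIM period to 1. A small sketch of that simplified policy:

#include <stdio.h>

struct ps_conf {
        int dynamic_ps_timeout; /* ms */
        int ps_dtim_period;
};

/* forced_timeout < 0 means "not configured by the user" */
static void recalc_ps(struct ps_conf *conf, int forced_timeout, int dtim_period)
{
        conf->dynamic_ps_timeout = forced_timeout < 0 ? 100 : forced_timeout;

        /* if the TIM IE was invalid, pretend the DTIM period is 1 */
        conf->ps_dtim_period = dtim_period ? dtim_period : 1;
}

int main(void)
{
        struct ps_conf conf;

        recalc_ps(&conf, -1, 0);
        printf("timeout=%d ms, dtim=%d\n",
               conf.dynamic_ps_timeout, conf.ps_dtim_period);
        return 0;
}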
@@ -1596,21 +1607,21 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work)
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
}
- if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) &&
+ if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) &&
!(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) {
if (drv_tx_frames_pending(local)) {
mod_timer(&local->dynamic_ps_timer, jiffies +
msecs_to_jiffies(
local->hw.conf.dynamic_ps_timeout));
} else {
- ieee80211_send_nullfunc(local, sdata, 1);
+ ieee80211_send_nullfunc(local, sdata, true);
/* Flush to get the tx status of nullfunc frame */
ieee80211_flush_queues(local, sdata, false);
}
}
- if (!((local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) &&
- (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) ||
+ if (!(ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) &&
+ ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) ||
(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) {
ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
local->hw.conf.flags |= IEEE80211_CONF_PS;
@@ -1738,10 +1749,10 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
const u8 *wmm_param, size_t wmm_param_len)
{
- struct ieee80211_tx_queue_params params;
+ struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS];
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
size_t left;
- int count;
+ int count, ac;
const u8 *pos;
u8 uapsd_queues = 0;
@@ -1775,25 +1786,24 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
int aci = (pos[0] >> 5) & 0x03;
int acm = (pos[0] >> 4) & 0x01;
bool uapsd = false;
- int queue;
switch (aci) {
case 1: /* AC_BK */
- queue = 3;
+ ac = IEEE80211_AC_BK;
if (acm)
sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK)
uapsd = true;
break;
case 2: /* AC_VI */
- queue = 1;
+ ac = IEEE80211_AC_VI;
if (acm)
sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI)
uapsd = true;
break;
case 3: /* AC_VO */
- queue = 0;
+ ac = IEEE80211_AC_VO;
if (acm)
sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO)
@@ -1801,7 +1811,7 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
break;
case 0: /* AC_BE */
default:
- queue = 2;
+ ac = IEEE80211_AC_BE;
if (acm)
sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE)
@@ -1809,25 +1819,41 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
break;
}
- params.aifs = pos[0] & 0x0f;
- params.cw_max = ecw2cw((pos[1] & 0xf0) >> 4);
- params.cw_min = ecw2cw(pos[1] & 0x0f);
- params.txop = get_unaligned_le16(pos + 2);
- params.acm = acm;
- params.uapsd = uapsd;
+ params[ac].aifs = pos[0] & 0x0f;
+
+ if (params[ac].aifs < 2) {
+ sdata_info(sdata,
+ "AP has invalid WMM params (AIFSN=%d for ACI %d), will use 2\n",
+ params[ac].aifs, aci);
+ params[ac].aifs = 2;
+ }
+ params[ac].cw_max = ecw2cw((pos[1] & 0xf0) >> 4);
+ params[ac].cw_min = ecw2cw(pos[1] & 0x0f);
+ params[ac].txop = get_unaligned_le16(pos + 2);
+ params[ac].acm = acm;
+ params[ac].uapsd = uapsd;
+
+ if (params[ac].cw_min > params[ac].cw_max) {
+ sdata_info(sdata,
+ "AP has invalid WMM params (CWmin/max=%d/%d for ACI %d), using defaults\n",
+ params[ac].cw_min, params[ac].cw_max, aci);
+ return false;
+ }
+ }
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
mlme_dbg(sdata,
- "WMM queue=%d aci=%d acm=%d aifs=%d cWmin=%d cWmax=%d txop=%d uapsd=%d, downgraded=%d\n",
- queue, aci, acm,
- params.aifs, params.cw_min, params.cw_max,
- params.txop, params.uapsd,
- ifmgd->tx_tspec[queue].downgraded);
- sdata->tx_conf[queue] = params;
- if (!ifmgd->tx_tspec[queue].downgraded &&
- drv_conf_tx(local, sdata, queue, &params))
+ "WMM AC=%d acm=%d aifs=%d cWmin=%d cWmax=%d txop=%d uapsd=%d, downgraded=%d\n",
+ ac, params[ac].acm,
+ params[ac].aifs, params[ac].cw_min, params[ac].cw_max,
+ params[ac].txop, params[ac].uapsd,
+ ifmgd->tx_tspec[ac].downgraded);
+ sdata->tx_conf[ac] = params[ac];
+ if (!ifmgd->tx_tspec[ac].downgraded &&
+ drv_conf_tx(local, sdata, ac, &params[ac]))
sdata_err(sdata,
- "failed to set TX queue parameters for queue %d\n",
- queue);
+ "failed to set TX queue parameters for AC %d\n",
+ ac);
}
/* enable WMM or activate new settings */
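
The rewritten parser fills a per-AC array and validates each record in place: an AIFSN below 2 is clamped to 2, and CWmin greater than CWmax aborts and falls back to defaults. Contention windows are carried on air as exponents (ECW), so cw = 2^ecw - 1. A sketch of decoding one 4-byte AC record with that conversion:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct ac_params {
        int aifs, cw_min, cw_max;
};

static int ecw2cw(int ecw)
{
        return (1 << ecw) - 1;
}

/* decode one 4-byte WMM AC record; return false if CWmin > CWmax */
static bool decode_ac(const uint8_t rec[4], struct ac_params *p)
{
        p->aifs = rec[0] & 0x0f;
        if (p->aifs < 2)                /* invalid AIFSN, clamp as the patch does */
                p->aifs = 2;

        p->cw_max = ecw2cw((rec[1] & 0xf0) >> 4);
        p->cw_min = ecw2cw(rec[1] & 0x0f);

        return p->cw_min <= p->cw_max;
}

int main(void)
{
        /* AIFSN=3, ECWmin=4, ECWmax=10 -> cw_min=15, cw_max=1023 */
        uint8_t rec[4] = { 0x03, 0xa4, 0x00, 0x00 };
        struct ac_params p;

        if (decode_ac(rec, &p))
                printf("aifs=%d cw_min=%d cw_max=%d\n",
                       p.aifs, p.cw_min, p.cw_max);
        return 0;
}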
@@ -1965,7 +1991,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
ieee80211_bss_info_change_notify(sdata, bss_info_changed);
mutex_lock(&local->iflist_mtx);
- ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps(local);
mutex_unlock(&local->iflist_mtx);
ieee80211_recalc_smps(sdata);
@@ -2052,6 +2078,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask));
memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa));
memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask));
+ sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER;
sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
@@ -2070,7 +2097,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
ieee80211_bss_info_change_notify(sdata, changed);
/* disassociated - set to defaults now */
- ieee80211_set_wmm_default(sdata, false);
+ ieee80211_set_wmm_default(sdata, false, false);
del_timer_sync(&sdata->u.mgd.conn_mon_timer);
del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
@@ -2132,10 +2159,10 @@ static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata)
__ieee80211_stop_poll(sdata);
mutex_lock(&local->iflist_mtx);
- ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps(local);
mutex_unlock(&local->iflist_mtx);
- if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+ if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR))
goto out;
/*
@@ -2233,9 +2260,9 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
*/
ifmgd->probe_send_count++;
- if (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) {
ifmgd->nullfunc_failed = false;
- ieee80211_send_nullfunc(sdata->local, sdata, 0);
+ ieee80211_send_nullfunc(sdata->local, sdata, false);
} else {
int ssid_len;
@@ -2308,7 +2335,7 @@ static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata,
goto out;
mutex_lock(&sdata->local->iflist_mtx);
- ieee80211_recalc_ps(sdata->local, -1);
+ ieee80211_recalc_ps(sdata->local);
mutex_unlock(&sdata->local->iflist_mtx);
ifmgd->probe_send_count = 0;
@@ -2413,15 +2440,9 @@ static void ieee80211_beacon_connection_loss_work(struct work_struct *work)
container_of(work, struct ieee80211_sub_if_data,
u.mgd.beacon_connection_loss_work);
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- struct sta_info *sta;
- if (ifmgd->associated) {
- rcu_read_lock();
- sta = sta_info_get(sdata, ifmgd->bssid);
- if (sta)
- sta->beacon_loss_count++;
- rcu_read_unlock();
- }
+ if (ifmgd->associated)
+ ifmgd->beacon_loss_count++;
if (ifmgd->connection_loss) {
sdata_info(sdata, "Connection to AP %pM lost\n",
@@ -2495,6 +2516,35 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
sdata->u.mgd.auth_data = NULL;
}
+static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
+ bool assoc)
+{
+ struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
+
+ sdata_assert_lock(sdata);
+
+ if (!assoc) {
+ /*
+ * we are not associated yet, the only timer that could be
+ * running is the timeout for the association response which
+ * which is not relevant anymore.
+ */
+ del_timer_sync(&sdata->u.mgd.timer);
+ sta_info_destroy_addr(sdata, assoc_data->bss->bssid);
+
+ eth_zero_addr(sdata->u.mgd.bssid);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
+ sdata->u.mgd.flags = 0;
+ sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER;
+ mutex_lock(&sdata->local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&sdata->local->mtx);
+ }
+
+ kfree(assoc_data);
+ sdata->u.mgd.assoc_data = NULL;
+}
+
static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt, size_t len)
{
@@ -2510,7 +2560,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
return;
auth_data->expected_transaction = 4;
drv_mgd_prepare_tx(sdata->local, sdata);
- if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
IEEE80211_TX_INTFL_MLME_CONN_TX;
ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0,
@@ -2687,28 +2737,42 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt, size_t len)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- const u8 *bssid = NULL;
- u16 reason_code;
+ u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
sdata_assert_lock(sdata);
if (len < 24 + 2)
return;
- if (!ifmgd->associated ||
- !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid))
- return;
+ if (ifmgd->associated &&
+ ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) {
+ const u8 *bssid = ifmgd->associated->bssid;
- bssid = ifmgd->associated->bssid;
+ sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n",
+ bssid, reason_code,
+ ieee80211_get_reason_code_string(reason_code));
- reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
+ ieee80211_set_disassoc(sdata, 0, 0, false, NULL);
- sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n",
- bssid, reason_code, ieee80211_get_reason_code_string(reason_code));
+ ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false,
+ reason_code);
+ return;
+ }
- ieee80211_set_disassoc(sdata, 0, 0, false, NULL);
+ if (ifmgd->assoc_data &&
+ ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) {
+ const u8 *bssid = ifmgd->assoc_data->bss->bssid;
- ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code);
+ sdata_info(sdata,
+ "deauthenticated from %pM while associating (Reason: %u=%s)\n",
+ bssid, reason_code,
+ ieee80211_get_reason_code_string(reason_code));
+
+ ieee80211_destroy_assoc_data(sdata, false);
+
+ cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
+ return;
+ }
}
@@ -2788,34 +2852,6 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
}
}
-static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
- bool assoc)
-{
- struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
-
- sdata_assert_lock(sdata);
-
- if (!assoc) {
- /*
- * we are not associated yet, the only timer that could be
- * running is the timeout for the association response which
- * which is not relevant anymore.
- */
- del_timer_sync(&sdata->u.mgd.timer);
- sta_info_destroy_addr(sdata, assoc_data->bss->bssid);
-
- eth_zero_addr(sdata->u.mgd.bssid);
- ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
- sdata->u.mgd.flags = 0;
- mutex_lock(&sdata->local->mtx);
- ieee80211_vif_release_channel(sdata);
- mutex_unlock(&sdata->local->mtx);
- }
-
- kfree(assoc_data);
- sdata->u.mgd.assoc_data = NULL;
-}
-
static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss,
struct ieee80211_mgmt *mgmt, size_t len)
@@ -3028,11 +3064,21 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
*/
ifmgd->wmm_last_param_set = -1;
- if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) && elems.wmm_param)
- ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
- elems.wmm_param_len);
- else
- ieee80211_set_wmm_default(sdata, false);
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
+ ieee80211_set_wmm_default(sdata, false, false);
+ } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
+ elems.wmm_param_len)) {
+ /* still enable QoS since we might have HT/VHT */
+ ieee80211_set_wmm_default(sdata, false, true);
+ /* set the disable-WMM flag in this case to disable
+ * tracking WMM parameter changes in the beacon if
+ * the parameters weren't actually valid. Doing so
+ * avoids changing parameters very strangely when
+ * the AP is going back and forth between valid and
+ * invalid parameters.
+ */
+ ifmgd->flags |= IEEE80211_STA_DISABLE_WMM;
+ }
changed |= BSS_CHANGED_QOS;
/* set AID and assoc capability,
@@ -3211,16 +3257,6 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
if (ifmgd->associated &&
ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid))
ieee80211_reset_ap_probe(sdata);
-
- if (ifmgd->auth_data && !ifmgd->auth_data->bss->proberesp_ies &&
- ether_addr_equal(mgmt->bssid, ifmgd->auth_data->bss->bssid)) {
- /* got probe response, continue with auth */
- sdata_info(sdata, "direct probe responded\n");
- ifmgd->auth_data->tries = 0;
- ifmgd->auth_data->timeout = jiffies;
- ifmgd->auth_data->timeout_started = true;
- run_again(sdata, ifmgd->auth_data->timeout);
- }
}
/*
@@ -3299,7 +3335,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
}
ifmgd->have_beacon = true;
ifmgd->assoc_data->need_beacon = false;
- if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) {
+ if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) {
sdata->vif.bss_conf.sync_tsf =
le64_to_cpu(mgmt->u.beacon.timestamp);
sdata->vif.bss_conf.sync_device_ts =
@@ -3323,24 +3359,21 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
bssid = ifmgd->associated->bssid;
/* Track average RSSI from the Beacon frames of the current AP */
- ifmgd->last_beacon_signal = rx_status->signal;
if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) {
ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE;
- ifmgd->ave_beacon_signal = rx_status->signal * 16;
+ ewma_beacon_signal_init(&ifmgd->ave_beacon_signal);
ifmgd->last_cqm_event_signal = 0;
ifmgd->count_beacon_signal = 1;
ifmgd->last_ave_beacon_signal = 0;
} else {
- ifmgd->ave_beacon_signal =
- (IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 +
- (16 - IEEE80211_SIGNAL_AVE_WEIGHT) *
- ifmgd->ave_beacon_signal) / 16;
ifmgd->count_beacon_signal++;
}
+ ewma_beacon_signal_add(&ifmgd->ave_beacon_signal, -rx_status->signal);
+
if (ifmgd->rssi_min_thold != ifmgd->rssi_max_thold &&
ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) {
- int sig = ifmgd->ave_beacon_signal;
+ int sig = -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal);
int last_sig = ifmgd->last_ave_beacon_signal;
struct ieee80211_event event = {
.type = RSSI_EVENT,
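
The fixed 3/16 beacon-signal weighting is replaced by the generic ewma_beacon_signal helpers, fed with -signal so the unsigned averaging machinery can be used. A standalone fixed-point EWMA with illustrative parameters (scale 16, new-sample weight 1/8) shows the idea:

#include <stdio.h>

/*
 * Fixed-point EWMA: the internal value is scaled by 16 (4 bits of precision),
 * new samples get a weight of 1/8 -- parameters chosen for the example only.
 */
struct ewma {
        int loaded;
        long internal;          /* average * 16 */
};

static void ewma_add(struct ewma *e, long sample)
{
        long scaled = sample * 16;

        if (!e->loaded) {
                e->internal = scaled;
                e->loaded = 1;
                return;
        }
        e->internal += (scaled - e->internal) / 8;
}

static long ewma_read(const struct ewma *e)
{
        return e->internal / 16;
}

int main(void)
{
        /* beacon RSSI samples in dBm; the kernel stores -signal instead */
        long samples[] = { -42, -44, -47, -45, -60, -44 };
        struct ewma e = { 0, 0 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                ewma_add(&e, samples[i]);

        printf("smoothed signal: %ld dBm\n", ewma_read(&e));
        return 0;
}

A single -60 dBm outlier only moves the smoothed value by a couple of dB, which is the point of averaging before CQM/RSSI events are generated.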
@@ -3367,10 +3400,11 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (bss_conf->cqm_rssi_thold &&
ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT &&
!(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) {
- int sig = ifmgd->ave_beacon_signal / 16;
+ int sig = -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal);
int last_event = ifmgd->last_cqm_event_signal;
int thold = bss_conf->cqm_rssi_thold;
int hyst = bss_conf->cqm_rssi_hyst;
+
if (sig < thold &&
(last_event == 0 || sig < last_event - hyst)) {
ifmgd->last_cqm_event_signal = sig;
@@ -3405,31 +3439,27 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
len - baselen, false, &elems,
care_about_ies, ncrc);
- if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) {
- bool directed_tim = ieee80211_check_tim(elems.tim,
- elems.tim_len,
- ifmgd->aid);
- if (directed_tim) {
- if (local->hw.conf.dynamic_ps_timeout > 0) {
- if (local->hw.conf.flags & IEEE80211_CONF_PS) {
- local->hw.conf.flags &= ~IEEE80211_CONF_PS;
- ieee80211_hw_config(local,
- IEEE80211_CONF_CHANGE_PS);
- }
- ieee80211_send_nullfunc(local, sdata, 0);
- } else if (!local->pspolling && sdata->u.mgd.powersave) {
- local->pspolling = true;
-
- /*
- * Here is assumed that the driver will be
- * able to send ps-poll frame and receive a
- * response even though power save mode is
- * enabled, but some drivers might require
- * to disable power save here. This needs
- * to be investigated.
- */
- ieee80211_send_pspoll(local, sdata);
+ if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) &&
+ ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid)) {
+ if (local->hw.conf.dynamic_ps_timeout > 0) {
+ if (local->hw.conf.flags & IEEE80211_CONF_PS) {
+ local->hw.conf.flags &= ~IEEE80211_CONF_PS;
+ ieee80211_hw_config(local,
+ IEEE80211_CONF_CHANGE_PS);
}
+ ieee80211_send_nullfunc(local, sdata, false);
+ } else if (!local->pspolling && sdata->u.mgd.powersave) {
+ local->pspolling = true;
+
+ /*
+ * Here is assumed that the driver will be
+ * able to send ps-poll frame and receive a
+ * response even though power save mode is
+ * enabled, but some drivers might require
+ * to disable power save here. This needs
+ * to be investigated.
+ */
+ ieee80211_send_pspoll(local, sdata);
}
}
@@ -3473,7 +3503,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
* the driver will use them. The synchronized view is currently
* guaranteed only in certain callbacks.
*/
- if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) {
+ if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) {
sdata->vif.bss_conf.sync_tsf =
le64_to_cpu(mgmt->u.beacon.timestamp);
sdata->vif.bss_conf.sync_device_ts =
@@ -3516,7 +3546,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
ifmgd->have_beacon = true;
mutex_lock(&local->iflist_mtx);
- ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps(local);
mutex_unlock(&local->iflist_mtx);
ieee80211_recalc_ps_vif(sdata);
@@ -3550,7 +3580,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (sta && elems.opmode_notif)
ieee80211_vht_handle_opmode(sdata, sta, *elems.opmode_notif,
- rx_status->band, true);
+ rx_status->band);
mutex_unlock(&local->sta_mtx);
changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt,
@@ -3666,12 +3696,14 @@ static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata,
reason);
}
-static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata)
+static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_mgd_auth_data *auth_data = ifmgd->auth_data;
u32 tx_flags = 0;
+ u16 trans = 1;
+ u16 status = 0;
sdata_assert_lock(sdata);
@@ -3695,54 +3727,27 @@ static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata)
drv_mgd_prepare_tx(local, sdata);
- if (auth_data->bss->proberesp_ies) {
- u16 trans = 1;
- u16 status = 0;
-
- sdata_info(sdata, "send auth to %pM (try %d/%d)\n",
- auth_data->bss->bssid, auth_data->tries,
- IEEE80211_AUTH_MAX_TRIES);
+ sdata_info(sdata, "send auth to %pM (try %d/%d)\n",
+ auth_data->bss->bssid, auth_data->tries,
+ IEEE80211_AUTH_MAX_TRIES);
- auth_data->expected_transaction = 2;
+ auth_data->expected_transaction = 2;
- if (auth_data->algorithm == WLAN_AUTH_SAE) {
- trans = auth_data->sae_trans;
- status = auth_data->sae_status;
- auth_data->expected_transaction = trans;
- }
-
- if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
- tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
- IEEE80211_TX_INTFL_MLME_CONN_TX;
-
- ieee80211_send_auth(sdata, trans, auth_data->algorithm, status,
- auth_data->data, auth_data->data_len,
- auth_data->bss->bssid,
- auth_data->bss->bssid, NULL, 0, 0,
- tx_flags);
- } else {
- const u8 *ssidie;
+ if (auth_data->algorithm == WLAN_AUTH_SAE) {
+ trans = auth_data->sae_trans;
+ status = auth_data->sae_status;
+ auth_data->expected_transaction = trans;
+ }
- sdata_info(sdata, "direct probe to %pM (try %d/%i)\n",
- auth_data->bss->bssid, auth_data->tries,
- IEEE80211_AUTH_MAX_TRIES);
+ if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
+ tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
+ IEEE80211_TX_INTFL_MLME_CONN_TX;
- rcu_read_lock();
- ssidie = ieee80211_bss_get_ie(auth_data->bss, WLAN_EID_SSID);
- if (!ssidie) {
- rcu_read_unlock();
- return -EINVAL;
- }
- /*
- * Direct probe is sent to broadcast address as some APs
- * will not answer to direct packet in unassociated state.
- */
- ieee80211_send_probe_req(sdata, sdata->vif.addr, NULL,
- ssidie + 2, ssidie[1],
- NULL, 0, (u32) -1, true, 0,
- auth_data->bss->channel, false);
- rcu_read_unlock();
- }
+ ieee80211_send_auth(sdata, trans, auth_data->algorithm, status,
+ auth_data->data, auth_data->data_len,
+ auth_data->bss->bssid,
+ auth_data->bss->bssid, NULL, 0, 0,
+ tx_flags);
if (tx_flags == 0) {
auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT;
@@ -3784,7 +3789,7 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata)
IEEE80211_ASSOC_MAX_TRIES);
ieee80211_send_assoc(sdata);
- if (!(local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) {
+ if (!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT;
assoc_data->timeout_started = true;
run_again(sdata, assoc_data->timeout);
@@ -3823,8 +3828,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
bool status_acked = ifmgd->status_acked;
ifmgd->status_received = false;
- if (ifmgd->auth_data &&
- (ieee80211_is_probe_req(fc) || ieee80211_is_auth(fc))) {
+ if (ifmgd->auth_data && ieee80211_is_auth(fc)) {
if (status_acked) {
ifmgd->auth_data->timeout =
jiffies + IEEE80211_AUTH_TIMEOUT_SHORT;
@@ -3855,7 +3859,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
* so let's just kill the auth data
*/
ieee80211_destroy_auth_data(sdata, false);
- } else if (ieee80211_probe_auth(sdata)) {
+ } else if (ieee80211_auth(sdata)) {
u8 bssid[ETH_ALEN];
struct ieee80211_event event = {
.type = MLME_EVENT,
@@ -3898,7 +3902,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN);
- if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
max_tries = max_nullfunc_tries;
else
max_tries = max_probe_tries;
@@ -3923,7 +3927,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
}
} else if (time_is_after_jiffies(ifmgd->probe_timeout))
run_again(sdata, ifmgd->probe_timeout);
- else if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ else if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
mlme_dbg(sdata,
"Failed to send nullfunc to AP %pM after %dms, disconnecting\n",
bssid, probe_wait_ms);
@@ -3992,18 +3996,13 @@ static void ieee80211_sta_monitor_work(struct work_struct *work)
static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
{
- u32 flags;
-
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
__ieee80211_stop_poll(sdata);
/* let's probe the connection once */
- flags = sdata->local->hw.flags;
- if (!(flags & IEEE80211_HW_CONNECTION_MONITOR))
+ if (!ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR))
ieee80211_queue_work(&sdata->local->hw,
&sdata->u.mgd.monitor_work);
- /* and do all the other regular work too */
- ieee80211_queue_work(&sdata->local->hw, &sdata->work);
}
}
@@ -4149,21 +4148,6 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
rcu_read_unlock();
}
-int ieee80211_max_network_latency(struct notifier_block *nb,
- unsigned long data, void *dummy)
-{
- s32 latency_usec = (s32) data;
- struct ieee80211_local *local =
- container_of(nb, struct ieee80211_local,
- network_latency_notifier);
-
- mutex_lock(&local->iflist_mtx);
- ieee80211_recalc_ps(local, latency_usec);
- mutex_unlock(&local->iflist_mtx);
-
- return NOTIFY_OK;
-}
-
static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss)
{
@@ -4219,6 +4203,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
struct ieee80211_supported_band *sband;
struct cfg80211_chan_def chandef;
int ret;
+ u32 i;
+ bool have_80mhz;
sband = local->hw.wiphy->bands[cbss->channel->band];
@@ -4269,6 +4255,20 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
}
}
+ /* Allow VHT if at least one channel on the sband supports 80 MHz */
+ have_80mhz = false;
+ for (i = 0; i < sband->n_channels; i++) {
+ if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED |
+ IEEE80211_CHAN_NO_80MHZ))
+ continue;
+
+ have_80mhz = true;
+ break;
+ }
+
+ if (!have_80mhz)
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+
ifmgd->flags |= ieee80211_determine_chantype(sdata, sband,
cbss->channel,
ht_cap, ht_oper, vht_oper,
@@ -4307,15 +4307,15 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
}
static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
- struct cfg80211_bss *cbss, bool assoc)
+ struct cfg80211_bss *cbss, bool assoc,
+ bool override)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_bss *bss = (void *)cbss->priv;
struct sta_info *new_sta = NULL;
struct ieee80211_supported_band *sband;
- struct ieee80211_sta_ht_cap sta_ht_cap;
- bool have_sta = false, is_override = false;
+ bool have_sta = false;
int err;
sband = local->hw.wiphy->bands[cbss->channel->band];
@@ -4335,14 +4335,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
return -ENOMEM;
}
- memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap));
- ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap);
-
- is_override = (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) !=
- (sband->ht_cap.cap &
- IEEE80211_HT_CAP_SUP_WIDTH_20_40);
-
- if (new_sta || is_override) {
+ if (new_sta || override) {
err = ieee80211_prep_channel(sdata, cbss);
if (err) {
if (new_sta)
@@ -4419,8 +4412,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.sync_dtim_count = tim_ie[2];
else
sdata->vif.bss_conf.sync_dtim_count = 0;
- } else if (!(local->hw.flags &
- IEEE80211_HW_TIMING_BEACON_ONLY)) {
+ } else if (!ieee80211_hw_check(&sdata->local->hw,
+ TIMING_BEACON_ONLY)) {
ies = rcu_dereference(cbss->proberesp_ies);
/* must be non-NULL since beacon IEs were NULL */
sdata->vif.bss_conf.sync_tsf = ies->tsf;
@@ -4552,11 +4545,11 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid);
- err = ieee80211_prep_connection(sdata, req->bss, false);
+ err = ieee80211_prep_connection(sdata, req->bss, false, false);
if (err)
goto err_clear;
- err = ieee80211_probe_auth(sdata);
+ err = ieee80211_auth(sdata);
if (err) {
sta_info_destroy_addr(sdata, req->bss->bssid);
goto err_clear;
@@ -4570,49 +4563,14 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
eth_zero_addr(ifmgd->bssid);
ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
ifmgd->auth_data = NULL;
+ mutex_lock(&sdata->local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&sdata->local->mtx);
err_free:
kfree(auth_data);
return err;
}
-static bool ieee80211_usable_wmm_params(struct ieee80211_sub_if_data *sdata,
- const u8 *wmm_param, int len)
-{
- const u8 *pos;
- size_t left;
-
- if (len < 8)
- return false;
-
- if (wmm_param[5] != 1 /* version */)
- return false;
-
- pos = wmm_param + 8;
- left = len - 8;
-
- for (; left >= 4; left -= 4, pos += 4) {
- u8 aifsn = pos[0] & 0x0f;
- u8 ecwmin = pos[1] & 0x0f;
- u8 ecwmax = (pos[1] & 0xf0) >> 4;
- int aci = (pos[0] >> 5) & 0x03;
-
- if (aifsn < 2) {
- sdata_info(sdata,
- "AP has invalid WMM params (AIFSN=%d for ACI %d), disabling WMM\n",
- aifsn, aci);
- return false;
- }
- if (ecwmin > ecwmax) {
- sdata_info(sdata,
- "AP has invalid WMM params (ECWmin/max=%d/%d for ACI %d), disabling WMM\n",
- ecwmin, ecwmax, aci);
- return false;
- }
- }
-
- return true;
-}
-
int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
struct cfg80211_assoc_request *req)
{
@@ -4624,6 +4582,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
struct ieee80211_supported_band *sband;
const u8 *ssidie, *ht_ie, *vht_ie;
int i, err;
+ bool override = false;
assoc_data = kzalloc(sizeof(*assoc_data) + req->ie_len, GFP_KERNEL);
if (!assoc_data)
@@ -4676,39 +4635,6 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
assoc_data->wmm = bss->wmm_used &&
(local->hw.queues >= IEEE80211_NUM_ACS);
- if (assoc_data->wmm) {
- /* try to check validity of WMM params IE */
- const struct cfg80211_bss_ies *ies;
- const u8 *wp, *start, *end;
-
- rcu_read_lock();
- ies = rcu_dereference(req->bss->ies);
- start = ies->data;
- end = start + ies->len;
-
- while (true) {
- wp = cfg80211_find_vendor_ie(
- WLAN_OUI_MICROSOFT,
- WLAN_OUI_TYPE_MICROSOFT_WMM,
- start, end - start);
- if (!wp)
- break;
- start = wp + wp[1] + 2;
- /* if this IE is too short, try the next */
- if (wp[1] <= 4)
- continue;
- /* if this IE is WMM params, we found what we wanted */
- if (wp[6] == 1)
- break;
- }
-
- if (!wp || !ieee80211_usable_wmm_params(sdata, wp + 2,
- wp[1] - 2)) {
- assoc_data->wmm = false;
- ifmgd->flags |= IEEE80211_STA_DISABLE_WMM;
- }
- rcu_read_unlock();
- }
/*
* IEEE802.11n does not allow TKIP/WEP as pairwise ciphers in HT mode.
@@ -4728,14 +4654,6 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
}
}
- if (req->flags & ASSOC_REQ_DISABLE_HT) {
- ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
- ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
- }
-
- if (req->flags & ASSOC_REQ_DISABLE_VHT)
- ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
-
/* Also disable HT if we don't support it or the AP doesn't use WMM */
sband = local->hw.wiphy->bands[req->bss->channel->band];
if (!sband->ht_cap.ht_supported ||
@@ -4802,7 +4720,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
rcu_read_unlock();
if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) &&
- (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK),
+ ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK),
"U-APSD not supported with HW_PS_NULLFUNC_STACK\n"))
sdata->vif.driver_flags &= ~IEEE80211_VIF_SUPPORTS_UAPSD;
@@ -4847,14 +4765,43 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
ifmgd->dtim_period = 0;
ifmgd->have_beacon = false;
- err = ieee80211_prep_connection(sdata, req->bss, true);
+ /* override HT/VHT configuration only if the AP and we support it */
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) {
+ struct ieee80211_sta_ht_cap sta_ht_cap;
+
+ if (req->flags & ASSOC_REQ_DISABLE_HT)
+ override = true;
+
+ memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap));
+ ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap);
+
+ /* check for 40 MHz disable override */
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_40MHZ) &&
+ sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 &&
+ !(sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40))
+ override = true;
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) &&
+ req->flags & ASSOC_REQ_DISABLE_VHT)
+ override = true;
+ }
+
+ if (req->flags & ASSOC_REQ_DISABLE_HT) {
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ }
+
+ if (req->flags & ASSOC_REQ_DISABLE_VHT)
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+
+ err = ieee80211_prep_connection(sdata, req->bss, true, override);
if (err)
goto err_clear;
rcu_read_lock();
beacon_ies = rcu_dereference(req->bss->beacon_ies);
- if (sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC &&
+ if (ieee80211_hw_check(&sdata->local->hw, NEED_DTIM_BEFORE_ASSOC) &&
!beacon_ies) {
/*
* Wait up to one beacon interval ...
@@ -4881,7 +4828,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
assoc_data->timeout = jiffies;
assoc_data->timeout_started = true;
- if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) {
+ if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) {
sdata->vif.bss_conf.sync_tsf = beacon_ies->tsf;
sdata->vif.bss_conf.sync_device_ts =
bss->device_ts_beacon;
@@ -4946,6 +4893,25 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
return 0;
}
+ if (ifmgd->assoc_data &&
+ ether_addr_equal(ifmgd->assoc_data->bss->bssid, req->bssid)) {
+ sdata_info(sdata,
+ "aborting association with %pM by local choice (Reason: %u=%s)\n",
+ req->bssid, req->reason_code,
+ ieee80211_get_reason_code_string(req->reason_code));
+
+ drv_mgd_prepare_tx(sdata->local, sdata);
+ ieee80211_send_deauth_disassoc(sdata, req->bssid,
+ IEEE80211_STYPE_DEAUTH,
+ req->reason_code, tx,
+ frame_buf);
+ ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_report_disconnect(sdata, frame_buf,
+ sizeof(frame_buf), true,
+ req->reason_code);
+ return 0;
+ }
+
if (ifmgd->associated &&
ether_addr_equal(ifmgd->associated->bssid, req->bssid)) {
sdata_info(sdata,
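
/*
 * Aside (not part of the patch above, hypothetical demo_* names): the mlme.c
 * hunks repeatedly convert open-coded tests of "local->hw.flags &
 * IEEE80211_HW_FOO" into ieee80211_hw_check(&local->hw, FOO). The standalone
 * sketch below shows the general pattern such an accessor implements --
 * testing a single bit in a flags bitmap wide enough for all capability
 * flags -- without claiming to be the actual mac80211 definition.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

enum demo_hw_flags {
	DEMO_HW_REPORTS_TX_ACK_STATUS,
	DEMO_HW_CONNECTION_MONITOR,
	DEMO_HW_TIMING_BEACON_ONLY,
	NUM_DEMO_HW_FLAGS,
};

#define DEMO_BITS_PER_LONG	(CHAR_BIT * sizeof(unsigned long))
#define DEMO_FLAG_WORDS \
	((NUM_DEMO_HW_FLAGS + DEMO_BITS_PER_LONG - 1) / DEMO_BITS_PER_LONG)

struct demo_hw {
	unsigned long flags[DEMO_FLAG_WORDS];
};

/* demo analogue of checking a capability flag */
static bool demo_hw_check(const struct demo_hw *hw, enum demo_hw_flags flg)
{
	return (hw->flags[flg / DEMO_BITS_PER_LONG] >>
		(flg % DEMO_BITS_PER_LONG)) & 1UL;
}

/* demo analogue of a driver advertising a capability flag */
static void demo_hw_set(struct demo_hw *hw, enum demo_hw_flags flg)
{
	hw->flags[flg / DEMO_BITS_PER_LONG] |= 1UL << (flg % DEMO_BITS_PER_LONG);
}

int main(void)
{
	struct demo_hw hw = { { 0 } };

	demo_hw_set(&hw, DEMO_HW_CONNECTION_MONITOR);
	printf("connection monitor: %d, ack status: %d\n",
	       demo_hw_check(&hw, DEMO_HW_CONNECTION_MONITOR),
	       demo_hw_check(&hw, DEMO_HW_REPORTS_TX_ACK_STATUS));
	return 0;
}
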
diff --git a/kernel/net/mac80211/ocb.c b/kernel/net/mac80211/ocb.c
index 358d5f9d8..0be0aadfc 100644
--- a/kernel/net/mac80211/ocb.c
+++ b/kernel/net/mac80211/ocb.c
@@ -75,7 +75,7 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
if (!sta)
return;
- sta->last_rx = jiffies;
+ sta->rx_stats.last_rx = jiffies;
/* Add only mandatory rates for now */
sband = local->hw.wiphy->bands[band];
@@ -179,7 +179,7 @@ int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
- u32 changed = BSS_CHANGED_OCB;
+ u32 changed = BSS_CHANGED_OCB | BSS_CHANGED_BSSID;
int err;
if (ifocb->joined == true)
diff --git a/kernel/net/mac80211/offchannel.c b/kernel/net/mac80211/offchannel.c
index 683f0e3cb..044010371 100644
--- a/kernel/net/mac80211/offchannel.c
+++ b/kernel/net/mac80211/offchannel.c
@@ -46,7 +46,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
}
if (!local->offchannel_ps_enabled ||
- !(local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK))
+ !ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK))
/*
* If power save was enabled, no need to send a nullfunc
* frame because AP knows that we are sleeping. But if the
@@ -57,7 +57,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
* to send a new nullfunc frame to inform the AP that we
* are again sleeping.
*/
- ieee80211_send_nullfunc(local, sdata, 1);
+ ieee80211_send_nullfunc(local, sdata, true);
}
/* inform AP that we are awake again, unless power save is enabled */
@@ -66,7 +66,7 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata)
struct ieee80211_local *local = sdata->local;
if (!local->ps_sdata)
- ieee80211_send_nullfunc(local, sdata, 0);
+ ieee80211_send_nullfunc(local, sdata, false);
else if (local->offchannel_ps_enabled) {
/*
* In !IEEE80211_HW_PS_NULLFUNC_STACK case the hardware
@@ -93,7 +93,7 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata)
* restart the timer now and send a nullfunc frame to inform
* the AP that we are awake.
*/
- ieee80211_send_nullfunc(local, sdata, 0);
+ ieee80211_send_nullfunc(local, sdata, false);
mod_timer(&local->dynamic_ps_timer, jiffies +
msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
}
diff --git a/kernel/net/mac80211/pm.c b/kernel/net/mac80211/pm.c
index ac6ad6238..00a43a70e 100644
--- a/kernel/net/mac80211/pm.c
+++ b/kernel/net/mac80211/pm.c
@@ -6,6 +6,13 @@
#include "driver-ops.h"
#include "led.h"
+static void ieee80211_sched_scan_cancel(struct ieee80211_local *local)
+{
+ if (ieee80211_request_sched_scan_stop(local))
+ return;
+ cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy);
+}
+
int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
{
struct ieee80211_local *local = hw_to_local(hw);
@@ -23,7 +30,8 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
ieee80211_del_virtual_monitor(local);
- if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+ if (ieee80211_hw_check(hw, AMPDU_AGGREGATION) &&
+ !(wowlan && wowlan->any)) {
mutex_lock(&local->sta_mtx);
list_for_each_entry(sta, &local->sta_list, list) {
set_sta_flag(sta, WLAN_STA_BLOCK_BA);
@@ -33,6 +41,10 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
mutex_unlock(&local->sta_mtx);
}
+ /* keep sched_scan only in case of 'any' trigger */
+ if (!(wowlan && wowlan->any))
+ ieee80211_sched_scan_cancel(local);
+
ieee80211_stop_queues_by_reason(hw,
IEEE80211_MAX_QUEUE_MAP,
IEEE80211_QUEUE_STOP_REASON_SUSPEND,
@@ -76,13 +88,29 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
if (sdata->vif.type != NL80211_IFTYPE_STATION)
continue;
ieee80211_mgd_quiesce(sdata);
+ /* If suspended during TX in progress, and wowlan
+ * is enabled (connection will be active) there
+ * can be a race where the driver is put out
+ * of power-save due to TX and during suspend
+ * dynamic_ps_timer is cancelled and TX packet
+ * is flushed, leaving the driver in ACTIVE even
+ * after resuming until dynamic_ps_timer puts
+ * driver back in DOZE.
+ */
+ if (sdata->u.mgd.associated &&
+ sdata->u.mgd.powersave &&
+ !(local->hw.conf.flags & IEEE80211_CONF_PS)) {
+ local->hw.conf.flags |= IEEE80211_CONF_PS;
+ ieee80211_hw_config(local,
+ IEEE80211_CONF_CHANGE_PS);
+ }
}
err = drv_suspend(local, wowlan);
if (err < 0) {
local->quiescing = false;
local->wowlan = false;
- if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+ if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) {
mutex_lock(&local->sta_mtx);
list_for_each_entry(sta,
&local->sta_list, list) {
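
/*
 * Aside (illustrative only, hypothetical demo_* names): the pm.c hunk keeps
 * a scheduled scan alive only when the wowlan 'any' trigger is set, and for
 * an associated, powersave-enabled station it forces the power-save flag
 * before handing control to the driver, so a device left ACTIVE by in-flight
 * TX is not stuck out of power save across suspend. The sketch shows that
 * ordering in isolation.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_local {
	bool associated;
	bool powersave;
	bool conf_ps;			/* stands in for IEEE80211_CONF_PS */
	bool sched_scan_running;
};

static void demo_sched_scan_cancel(struct demo_local *local)
{
	local->sched_scan_running = false;	/* report "stopped" upwards */
}

static void demo_prepare_suspend(struct demo_local *local, bool wowlan_any)
{
	/* keep the scheduled scan only for the wowlan 'any' trigger */
	if (!wowlan_any)
		demo_sched_scan_cancel(local);

	/* make sure the device enters power save before the driver suspends */
	if (local->associated && local->powersave && !local->conf_ps)
		local->conf_ps = true;
}

int main(void)
{
	struct demo_local local = {
		.associated = true, .powersave = true,
		.conf_ps = false, .sched_scan_running = true,
	};

	demo_prepare_suspend(&local, false);
	printf("PS forced: %d, sched scan kept: %d\n",
	       local.conf_ps, local.sched_scan_running);
	return 0;
}
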
diff --git a/kernel/net/mac80211/rate.c b/kernel/net/mac80211/rate.c
index d53355b01..a4e2f4e67 100644
--- a/kernel/net/mac80211/rate.c
+++ b/kernel/net/mac80211/rate.c
@@ -29,6 +29,65 @@ module_param(ieee80211_default_rc_algo, charp, 0644);
MODULE_PARM_DESC(ieee80211_default_rc_algo,
"Default rate control algorithm for mac80211 to use");
+void rate_control_rate_init(struct sta_info *sta)
+{
+ struct ieee80211_local *local = sta->sdata->local;
+ struct rate_control_ref *ref = sta->rate_ctrl;
+ struct ieee80211_sta *ista = &sta->sta;
+ void *priv_sta = sta->rate_ctrl_priv;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ ieee80211_sta_set_rx_nss(sta);
+
+ if (!ref)
+ return;
+
+ rcu_read_lock();
+
+ chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band];
+
+ spin_lock_bh(&sta->rate_ctrl_lock);
+ ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista,
+ priv_sta);
+ spin_unlock_bh(&sta->rate_ctrl_lock);
+ rcu_read_unlock();
+ set_sta_flag(sta, WLAN_STA_RATE_CONTROL);
+}
+
+void rate_control_rate_update(struct ieee80211_local *local,
+ struct ieee80211_supported_band *sband,
+ struct sta_info *sta, u32 changed)
+{
+ struct rate_control_ref *ref = local->rate_ctrl;
+ struct ieee80211_sta *ista = &sta->sta;
+ void *priv_sta = sta->rate_ctrl_priv;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ if (ref && ref->ops->rate_update) {
+ rcu_read_lock();
+
+ chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ spin_lock_bh(&sta->rate_ctrl_lock);
+ ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def,
+ ista, priv_sta, changed);
+ spin_unlock_bh(&sta->rate_ctrl_lock);
+ rcu_read_unlock();
+ }
+ drv_sta_rc_update(local, sta->sdata, &sta->sta, changed);
+}
+
int ieee80211_rate_control_register(const struct rate_control_ops *ops)
{
struct rate_control_alg *alg;
@@ -103,7 +162,7 @@ ieee80211_rate_control_ops_get(const char *name)
const struct rate_control_ops *ops;
const char *alg_name;
- kparam_block_sysfs_write(ieee80211_default_rc_algo);
+ kernel_param_lock(THIS_MODULE);
if (!name)
alg_name = ieee80211_default_rc_algo;
else
@@ -117,7 +176,7 @@ ieee80211_rate_control_ops_get(const char *name)
/* try built-in one if specific alg requested but not found */
if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT))
ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT);
- kparam_unblock_sysfs_write(ieee80211_default_rc_algo);
+ kernel_param_unlock(THIS_MODULE);
return ops;
}
@@ -246,7 +305,10 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
info->control.rates[0].idx = i;
break;
}
- WARN_ON_ONCE(i == sband->n_bitrates);
+ WARN_ONCE(i == sband->n_bitrates,
+ "no supported rates (0x%x) in rate_mask 0x%x with flags 0x%x\n",
+ sta ? sta->supp_rates[sband->band] : -1,
+ rate_mask, rate_flags);
info->control.rates[0].count =
(info->flags & IEEE80211_TX_CTL_NO_ACK) ?
@@ -294,39 +356,37 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta,
}
EXPORT_SYMBOL(rate_control_send_low);
-static bool rate_idx_match_legacy_mask(struct ieee80211_tx_rate *rate,
- int n_bitrates, u32 mask)
+static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask)
{
int j;
/* See whether the selected rate or anything below it is allowed. */
- for (j = rate->idx; j >= 0; j--) {
+ for (j = *rate_idx; j >= 0; j--) {
if (mask & (1 << j)) {
/* Okay, found a suitable rate. Use it. */
- rate->idx = j;
+ *rate_idx = j;
return true;
}
}
/* Try to find a higher rate that would be allowed */
- for (j = rate->idx + 1; j < n_bitrates; j++) {
+ for (j = *rate_idx + 1; j < n_bitrates; j++) {
if (mask & (1 << j)) {
/* Okay, found a suitable rate. Use it. */
- rate->idx = j;
+ *rate_idx = j;
return true;
}
}
return false;
}
-static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate,
- u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN])
+static bool rate_idx_match_mcs_mask(s8 *rate_idx, u8 *mcs_mask)
{
int i, j;
int ridx, rbit;
- ridx = rate->idx / 8;
- rbit = rate->idx % 8;
+ ridx = *rate_idx / 8;
+ rbit = *rate_idx % 8;
/* sanity check */
if (ridx < 0 || ridx >= IEEE80211_HT_MCS_MASK_LEN)
@@ -336,20 +396,20 @@ static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate,
for (i = ridx; i >= 0; i--) {
for (j = rbit; j >= 0; j--)
if (mcs_mask[i] & BIT(j)) {
- rate->idx = i * 8 + j;
+ *rate_idx = i * 8 + j;
return true;
}
rbit = 7;
}
/* Try to find a higher rate that would be allowed */
- ridx = (rate->idx + 1) / 8;
- rbit = (rate->idx + 1) % 8;
+ ridx = (*rate_idx + 1) / 8;
+ rbit = (*rate_idx + 1) % 8;
for (i = ridx; i < IEEE80211_HT_MCS_MASK_LEN; i++) {
for (j = rbit; j < 8; j++)
if (mcs_mask[i] & BIT(j)) {
- rate->idx = i * 8 + j;
+ *rate_idx = i * 8 + j;
return true;
}
rbit = 0;
@@ -357,37 +417,93 @@ static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate,
return false;
}
+static bool rate_idx_match_vht_mcs_mask(s8 *rate_idx, u16 *vht_mask)
+{
+ int i, j;
+ int ridx, rbit;
+
+ ridx = *rate_idx >> 4;
+ rbit = *rate_idx & 0xf;
+
+ if (ridx < 0 || ridx >= NL80211_VHT_NSS_MAX)
+ return false;
+
+ /* See whether the selected rate or anything below it is allowed. */
+ for (i = ridx; i >= 0; i--) {
+ for (j = rbit; j >= 0; j--) {
+ if (vht_mask[i] & BIT(j)) {
+ *rate_idx = (i << 4) | j;
+ return true;
+ }
+ }
+ rbit = 15;
+ }
+
+ /* Try to find a higher rate that would be allowed */
+ ridx = (*rate_idx + 1) >> 4;
+ rbit = (*rate_idx + 1) & 0xf;
+ for (i = ridx; i < NL80211_VHT_NSS_MAX; i++) {
+ for (j = rbit; j < 16; j++) {
+ if (vht_mask[i] & BIT(j)) {
+ *rate_idx = (i << 4) | j;
+ return true;
+ }
+ }
+ rbit = 0;
+ }
+ return false;
+}
-static void rate_idx_match_mask(struct ieee80211_tx_rate *rate,
+static void rate_idx_match_mask(s8 *rate_idx, u16 *rate_flags,
struct ieee80211_supported_band *sband,
enum nl80211_chan_width chan_width,
u32 mask,
- u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN])
+ u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN],
+ u16 vht_mask[NL80211_VHT_NSS_MAX])
{
- struct ieee80211_tx_rate alt_rate;
+ if (*rate_flags & IEEE80211_TX_RC_VHT_MCS) {
+ /* handle VHT rates */
+ if (rate_idx_match_vht_mcs_mask(rate_idx, vht_mask))
+ return;
+
+ *rate_idx = 0;
+ /* keep protection flags */
+ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS |
+ IEEE80211_TX_RC_USE_CTS_PROTECT |
+ IEEE80211_TX_RC_USE_SHORT_PREAMBLE);
- /* handle HT rates */
- if (rate->flags & IEEE80211_TX_RC_MCS) {
- if (rate_idx_match_mcs_mask(rate, mcs_mask))
+ *rate_flags |= IEEE80211_TX_RC_MCS;
+ if (chan_width == NL80211_CHAN_WIDTH_40)
+ *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH;
+
+ if (rate_idx_match_mcs_mask(rate_idx, mcs_mask))
return;
/* also try the legacy rates. */
- alt_rate.idx = 0;
+ *rate_flags &= ~(IEEE80211_TX_RC_MCS |
+ IEEE80211_TX_RC_40_MHZ_WIDTH);
+ if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates,
+ mask))
+ return;
+ } else if (*rate_flags & IEEE80211_TX_RC_MCS) {
+ /* handle HT rates */
+ if (rate_idx_match_mcs_mask(rate_idx, mcs_mask))
+ return;
+
+ /* also try the legacy rates. */
+ *rate_idx = 0;
/* keep protection flags */
- alt_rate.flags = rate->flags &
- (IEEE80211_TX_RC_USE_RTS_CTS |
- IEEE80211_TX_RC_USE_CTS_PROTECT |
- IEEE80211_TX_RC_USE_SHORT_PREAMBLE);
- alt_rate.count = rate->count;
- if (rate_idx_match_legacy_mask(&alt_rate,
- sband->n_bitrates, mask)) {
- *rate = alt_rate;
+ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS |
+ IEEE80211_TX_RC_USE_CTS_PROTECT |
+ IEEE80211_TX_RC_USE_SHORT_PREAMBLE);
+ if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates,
+ mask))
return;
- }
- } else if (!(rate->flags & IEEE80211_TX_RC_VHT_MCS)) {
+ } else {
/* handle legacy rates */
- if (rate_idx_match_legacy_mask(rate, sband->n_bitrates, mask))
+ if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates,
+ mask))
return;
/* if HT BSS, and we handle a data frame, also try HT rates */
@@ -400,23 +516,19 @@ static void rate_idx_match_mask(struct ieee80211_tx_rate *rate,
break;
}
- alt_rate.idx = 0;
+ *rate_idx = 0;
/* keep protection flags */
- alt_rate.flags = rate->flags &
- (IEEE80211_TX_RC_USE_RTS_CTS |
- IEEE80211_TX_RC_USE_CTS_PROTECT |
- IEEE80211_TX_RC_USE_SHORT_PREAMBLE);
- alt_rate.count = rate->count;
+ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS |
+ IEEE80211_TX_RC_USE_CTS_PROTECT |
+ IEEE80211_TX_RC_USE_SHORT_PREAMBLE);
- alt_rate.flags |= IEEE80211_TX_RC_MCS;
+ *rate_flags |= IEEE80211_TX_RC_MCS;
if (chan_width == NL80211_CHAN_WIDTH_40)
- alt_rate.flags |= IEEE80211_TX_RC_40_MHZ_WIDTH;
+ *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH;
- if (rate_idx_match_mcs_mask(&alt_rate, mcs_mask)) {
- *rate = alt_rate;
+ if (rate_idx_match_mcs_mask(rate_idx, mcs_mask))
return;
- }
}
/*
@@ -569,18 +681,92 @@ static void rate_control_fill_sta_table(struct ieee80211_sta *sta,
}
}
+static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_sta *sta, u32 *mask,
+ u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN],
+ u16 vht_mask[NL80211_VHT_NSS_MAX])
+{
+ u32 i, flags;
+
+ *mask = sdata->rc_rateidx_mask[sband->band];
+ flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if ((flags & sband->bitrates[i].flags) != flags)
+ *mask &= ~BIT(i);
+ }
+
+ if (*mask == (1 << sband->n_bitrates) - 1 &&
+ !sdata->rc_has_mcs_mask[sband->band] &&
+ !sdata->rc_has_vht_mcs_mask[sband->band])
+ return false;
+
+ if (sdata->rc_has_mcs_mask[sband->band])
+ memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[sband->band],
+ IEEE80211_HT_MCS_MASK_LEN);
+ else
+ memset(mcs_mask, 0xff, IEEE80211_HT_MCS_MASK_LEN);
+
+ if (sdata->rc_has_vht_mcs_mask[sband->band])
+ memcpy(vht_mask, sdata->rc_rateidx_vht_mcs_mask[sband->band],
+ sizeof(u16) * NL80211_VHT_NSS_MAX);
+ else
+ memset(vht_mask, 0xff, sizeof(u16) * NL80211_VHT_NSS_MAX);
+
+ if (sta) {
+ __le16 sta_vht_cap;
+ u16 sta_vht_mask[NL80211_VHT_NSS_MAX];
+
+ /* Filter out rates that the STA does not support */
+ *mask &= sta->supp_rates[sband->band];
+ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
+ mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i];
+
+ sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map;
+ ieee80211_get_vht_mask_from_cap(sta_vht_cap, sta_vht_mask);
+ for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
+ vht_mask[i] &= sta_vht_mask[i];
+ }
+
+ return true;
+}
+
+static void
+rate_control_apply_mask_ratetbl(struct sta_info *sta,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_sta_rates *rates)
+{
+ int i;
+ u32 mask;
+ u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN];
+ u16 vht_mask[NL80211_VHT_NSS_MAX];
+ enum nl80211_chan_width chan_width;
+
+ if (!rate_control_cap_mask(sta->sdata, sband, &sta->sta, &mask,
+ mcs_mask, vht_mask))
+ return;
+
+ chan_width = sta->sdata->vif.bss_conf.chandef.width;
+ for (i = 0; i < IEEE80211_TX_RATE_TABLE_SIZE; i++) {
+ if (rates->rate[i].idx < 0)
+ break;
+
+ rate_idx_match_mask(&rates->rate[i].idx, &rates->rate[i].flags,
+ sband, chan_width, mask, mcs_mask,
+ vht_mask);
+ }
+}
+
static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata,
struct ieee80211_sta *sta,
struct ieee80211_supported_band *sband,
- struct ieee80211_tx_info *info,
struct ieee80211_tx_rate *rates,
int max_rates)
{
enum nl80211_chan_width chan_width;
u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN];
- bool has_mcs_mask;
u32 mask;
- u32 rate_flags;
+ u16 rate_flags, vht_mask[NL80211_VHT_NSS_MAX];
int i;
/*
@@ -588,30 +774,10 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata,
* default mask (allow all rates) is used to save some processing for
* the common case.
*/
- mask = sdata->rc_rateidx_mask[info->band];
- has_mcs_mask = sdata->rc_has_mcs_mask[info->band];
- rate_flags =
- ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
- for (i = 0; i < sband->n_bitrates; i++)
- if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
- mask &= ~BIT(i);
-
- if (mask == (1 << sband->n_bitrates) - 1 && !has_mcs_mask)
+ if (!rate_control_cap_mask(sdata, sband, sta, &mask, mcs_mask,
+ vht_mask))
return;
- if (has_mcs_mask)
- memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[info->band],
- sizeof(mcs_mask));
- else
- memset(mcs_mask, 0xff, sizeof(mcs_mask));
-
- if (sta) {
- /* Filter out rates that the STA does not support */
- mask &= sta->supp_rates[info->band];
- for (i = 0; i < sizeof(mcs_mask); i++)
- mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i];
- }
-
/*
* Make sure the rate index selected for each TX rate is
* included in the configured mask and change the rate indexes
@@ -623,8 +789,10 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata,
if (rates[i].idx < 0)
break;
- rate_idx_match_mask(&rates[i], sband, chan_width, mask,
- mcs_mask);
+ rate_flags = rates[i].flags;
+ rate_idx_match_mask(&rates[i].idx, &rate_flags, sband,
+ chan_width, mask, mcs_mask, vht_mask);
+ rates[i].flags = rate_flags;
}
}
@@ -648,7 +816,7 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif,
sband = sdata->local->hw.wiphy->bands[info->band];
if (ieee80211_is_data(hdr->frame_control))
- rate_control_apply_mask(sdata, sta, sband, info, dest, max_rates);
+ rate_control_apply_mask(sdata, sta, sband, dest, max_rates);
if (dest[0].idx < 0)
__rate_control_send_low(&sdata->local->hw, sband, sta, info,
@@ -680,12 +848,18 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
info->control.rates[i].count = 0;
}
- if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
+ if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL))
return;
- ref->ops->get_rate(ref->priv, ista, priv_sta, txrc);
+ if (ista) {
+ spin_lock_bh(&sta->rate_ctrl_lock);
+ ref->ops->get_rate(ref->priv, ista, priv_sta, txrc);
+ spin_unlock_bh(&sta->rate_ctrl_lock);
+ } else {
+ ref->ops->get_rate(ref->priv, NULL, NULL, txrc);
+ }
- if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_RC_TABLE)
+ if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE))
return;
ieee80211_get_tx_rates(&sdata->vif, ista, txrc->skb,
@@ -699,7 +873,10 @@ int rate_control_set_rates(struct ieee80211_hw *hw,
{
struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
struct ieee80211_sta_rates *old;
+ struct ieee80211_supported_band *sband;
+ sband = hw->wiphy->bands[ieee80211_get_sdata_band(sta->sdata)];
+ rate_control_apply_mask_ratetbl(sta, sband, rates);
/*
* mac80211 guarantees that this function will not be called
* concurrently, so the following RCU access is safe, even without
@@ -727,7 +904,7 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
if (local->open_count)
return -EBUSY;
- if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) {
+ if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
if (WARN_ON(!local->ops->set_rts_threshold))
return -EINVAL;
return 0;
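
/*
 * Aside (standalone sketch, not the mac80211 implementation): the rate.c
 * changes above make the rate-mask matching operate on a plain (index, flags)
 * pair and add a VHT variant where the index packs NSS into the high nibble
 * and MCS into the low nibble. The demo below, with hypothetical names,
 * reproduces that search order: keep the selected rate or the closest
 * allowed rate below it, otherwise fall forward to the closest allowed rate
 * above it.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_VHT_NSS_MAX 8

static bool demo_vht_idx_match_mask(int8_t *rate_idx,
				    const uint16_t mask[DEMO_VHT_NSS_MAX])
{
	int nss = *rate_idx >> 4;	/* spatial-stream index */
	int mcs = *rate_idx & 0xf;	/* MCS within that NSS */
	int i, j;

	if (nss < 0 || nss >= DEMO_VHT_NSS_MAX)
		return false;

	/* the selected rate, or anything below it */
	for (i = nss; i >= 0; i--) {
		for (j = mcs; j >= 0; j--) {
			if (mask[i] & (1u << j)) {
				*rate_idx = (int8_t)((i << 4) | j);
				return true;
			}
		}
		mcs = 15;
	}

	/* otherwise the closest allowed rate above it */
	nss = (*rate_idx + 1) >> 4;
	mcs = (*rate_idx + 1) & 0xf;
	for (i = nss; i < DEMO_VHT_NSS_MAX; i++) {
		for (j = mcs; j < 16; j++) {
			if (mask[i] & (1u << j)) {
				*rate_idx = (int8_t)((i << 4) | j);
				return true;
			}
		}
		mcs = 0;
	}
	return false;
}

int main(void)
{
	uint16_t mask[DEMO_VHT_NSS_MAX] = { 0 };
	int8_t idx = (1 << 4) | 5;	/* 2 streams, MCS 5 */

	mask[1] = 1u << 7;		/* only 2-stream MCS 7 allowed */
	if (demo_vht_idx_match_mask(&idx, mask))
		printf("adjusted to nss=%d mcs=%d\n", idx >> 4, idx & 0xf);
	return 0;
}
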
diff --git a/kernel/net/mac80211/rate.h b/kernel/net/mac80211/rate.h
index 38652f09f..624fe5b81 100644
--- a/kernel/net/mac80211/rate.h
+++ b/kernel/net/mac80211/rate.h
@@ -42,10 +42,12 @@ static inline void rate_control_tx_status(struct ieee80211_local *local,
if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
return;
+ spin_lock_bh(&sta->rate_ctrl_lock);
if (ref->ops->tx_status)
ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb);
else
ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info);
+ spin_unlock_bh(&sta->rate_ctrl_lock);
}
static inline void
@@ -64,69 +66,21 @@ rate_control_tx_status_noskb(struct ieee80211_local *local,
if (WARN_ON_ONCE(!ref->ops->tx_status_noskb))
return;
+ spin_lock_bh(&sta->rate_ctrl_lock);
ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info);
+ spin_unlock_bh(&sta->rate_ctrl_lock);
}
-static inline void rate_control_rate_init(struct sta_info *sta)
-{
- struct ieee80211_local *local = sta->sdata->local;
- struct rate_control_ref *ref = sta->rate_ctrl;
- struct ieee80211_sta *ista = &sta->sta;
- void *priv_sta = sta->rate_ctrl_priv;
- struct ieee80211_supported_band *sband;
- struct ieee80211_chanctx_conf *chanctx_conf;
-
- ieee80211_sta_set_rx_nss(sta);
-
- if (!ref)
- return;
-
- rcu_read_lock();
-
- chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf);
- if (WARN_ON(!chanctx_conf)) {
- rcu_read_unlock();
- return;
- }
-
- sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band];
-
- ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista,
- priv_sta);
- rcu_read_unlock();
- set_sta_flag(sta, WLAN_STA_RATE_CONTROL);
-}
-
-static inline void rate_control_rate_update(struct ieee80211_local *local,
+void rate_control_rate_init(struct sta_info *sta);
+void rate_control_rate_update(struct ieee80211_local *local,
struct ieee80211_supported_band *sband,
- struct sta_info *sta, u32 changed)
-{
- struct rate_control_ref *ref = local->rate_ctrl;
- struct ieee80211_sta *ista = &sta->sta;
- void *priv_sta = sta->rate_ctrl_priv;
- struct ieee80211_chanctx_conf *chanctx_conf;
-
- if (ref && ref->ops->rate_update) {
- rcu_read_lock();
-
- chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf);
- if (WARN_ON(!chanctx_conf)) {
- rcu_read_unlock();
- return;
- }
-
- ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def,
- ista, priv_sta, changed);
- rcu_read_unlock();
- }
- drv_sta_rc_update(local, sta->sdata, &sta->sta, changed);
-}
+ struct sta_info *sta, u32 changed);
static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
- struct ieee80211_sta *sta,
- gfp_t gfp)
+ struct sta_info *sta, gfp_t gfp)
{
- return ref->ops->alloc_sta(ref->priv, sta, gfp);
+ spin_lock_init(&sta->rate_ctrl_lock);
+ return ref->ops->alloc_sta(ref->priv, &sta->sta, gfp);
}
static inline void rate_control_free_sta(struct sta_info *sta)
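
/*
 * Aside (simplified pthread sketch, hypothetical demo_* names): the rate.h
 * and rate.c changes above move rate_control_rate_init()/rate_update() out
 * of line and wrap the rate-control callbacks in a new per-station
 * rate_ctrl_lock, initialised when the station's rate-control state is
 * allocated. The demo shows the same idea in user space: one lock per
 * station serialising updates to its rate state.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_sta {
	pthread_mutex_t rate_ctrl_lock;
	int best_rate_idx;		/* toy "rate table" */
	unsigned long tx_status_events;
};

static void demo_sta_init(struct demo_sta *sta)
{
	/* lock is set up together with the rate-control state */
	pthread_mutex_init(&sta->rate_ctrl_lock, NULL);
	sta->best_rate_idx = 0;
	sta->tx_status_events = 0;
}

static void demo_rate_control_tx_status(struct demo_sta *sta, bool acked)
{
	pthread_mutex_lock(&sta->rate_ctrl_lock);
	sta->tx_status_events++;
	if (!acked && sta->best_rate_idx > 0)
		sta->best_rate_idx--;	/* fall back on failures */
	pthread_mutex_unlock(&sta->rate_ctrl_lock);
}

static int demo_rate_control_get_rate(struct demo_sta *sta)
{
	int idx;

	pthread_mutex_lock(&sta->rate_ctrl_lock);
	idx = sta->best_rate_idx;
	pthread_mutex_unlock(&sta->rate_ctrl_lock);
	return idx;
}

int main(void)
{
	struct demo_sta sta;

	demo_sta_init(&sta);
	demo_rate_control_tx_status(&sta, false);
	printf("rate after failed frame: %d\n", demo_rate_control_get_rate(&sta));
	return 0;
}
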
diff --git a/kernel/net/mac80211/rc80211_minstrel.c b/kernel/net/mac80211/rc80211_minstrel.c
index 3ece7d103..b54f398cd 100644
--- a/kernel/net/mac80211/rc80211_minstrel.c
+++ b/kernel/net/mac80211/rc80211_minstrel.c
@@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta)
* computing cur_tp
*/
tmp_mrs = &mi->r[idx].stats;
- tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma);
+ tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10;
tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024;
return tmp_cur_tp;
diff --git a/kernel/net/mac80211/rc80211_minstrel_debugfs.c b/kernel/net/mac80211/rc80211_minstrel_debugfs.c
index 1db5f7c33..820b0abc9 100644
--- a/kernel/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/kernel/net/mac80211/rc80211_minstrel_debugfs.c
@@ -85,12 +85,10 @@ minstrel_stats_open(struct inode *inode, struct file *file)
file->private_data = ms;
p = ms->buf;
p += sprintf(p, "\n");
- p += sprintf(p, "best __________rate_________ ______"
- "statistics______ ________last_______ "
- "______sum-of________\n");
- p += sprintf(p, "rate [name idx airtime max_tp] [ ø(tp) ø(prob) "
- "sd(prob)] [prob.|retry|suc|att] "
- "[#success | #attempts]\n");
+ p += sprintf(p,
+ "best __________rate_________ ________statistics________ ________last_______ ______sum-of________\n");
+ p += sprintf(p,
+ "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n");
for (i = 0; i < mi->n_rates; i++) {
struct minstrel_rate *mr = &mi->r[i];
@@ -112,7 +110,7 @@ minstrel_stats_open(struct inode *inode, struct file *file)
prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
- p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
+ p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
" %3u.%1u %3u %3u %-3u "
"%9llu %-9llu\n",
tp_max / 10, tp_max % 10,
diff --git a/kernel/net/mac80211/rc80211_minstrel_ht.c b/kernel/net/mac80211/rc80211_minstrel_ht.c
index 7430a1df2..239ed6e92 100644
--- a/kernel/net/mac80211/rc80211_minstrel_ht.c
+++ b/kernel/net/mac80211/rc80211_minstrel_ht.c
@@ -691,7 +691,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
if (likely(sta->ampdu_mlme.tid_tx[tid]))
return;
- ieee80211_start_tx_ba_session(pubsta, tid, 5000);
+ ieee80211_start_tx_ba_session(pubsta, tid, 0);
}
static void
@@ -867,7 +867,13 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
else
idx = index % MCS_GROUP_RATES + (group->streams - 1) * 8;
- if (offset > 0) {
+ /* enable RTS/CTS if needed:
+ * - if station is in dynamic SMPS (and streams > 1)
+ * - for fallback rates, to increase chances of getting through
+ */
+ if (offset > 0 ||
+ (mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC &&
+ group->streams > 1)) {
ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts;
flags |= IEEE80211_TX_RC_USE_RTS_CTS;
}
@@ -1070,7 +1076,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
if (sband->band != IEEE80211_BAND_2GHZ)
return;
- if (!(mp->hw->flags & IEEE80211_HW_SUPPORTS_HT_CCK_RATES))
+ if (!ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES))
return;
mi->cck_supported = 0;
@@ -1328,7 +1334,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
prob = mi->groups[i].rates[j].prob_ewma;
/* convert tp_avg from pkt per second in kbps */
- tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024;
+ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10;
+ tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024;
return tp_avg;
}
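
/*
 * Aside (illustrative only): the minstrel_ht hunk above enables RTS/CTS not
 * just for fallback rates (offset > 0) but also whenever the peer is in
 * dynamic SMPS and the chosen rate uses more than one spatial stream, so the
 * peer can enable its extra receive chains before the data frame arrives.
 * A minimal predicate capturing that decision, with hypothetical names:
 */
#include <stdbool.h>
#include <stdio.h>

enum demo_smps_mode { DEMO_SMPS_OFF, DEMO_SMPS_STATIC, DEMO_SMPS_DYNAMIC };

static bool demo_rate_needs_rts_cts(int table_offset, enum demo_smps_mode smps,
				    int streams)
{
	/* fallback entries get RTS/CTS to improve their chances */
	if (table_offset > 0)
		return true;
	/* dynamic SMPS peers need RTS before multi-stream transmissions */
	return smps == DEMO_SMPS_DYNAMIC && streams > 1;
}

int main(void)
{
	printf("primary rate, dynamic SMPS, 2 streams -> RTS/CTS: %d\n",
	       demo_rate_needs_rts_cts(0, DEMO_SMPS_DYNAMIC, 2));
	printf("fallback rate, SMPS off, 1 stream    -> RTS/CTS: %d\n",
	       demo_rate_needs_rts_cts(1, DEMO_SMPS_OFF, 1));
	return 0;
}
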
diff --git a/kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c b/kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 6822ce0f9..5320e35ed 100644
--- a/kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/kernel/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -86,7 +86,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
- p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
+ p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
" %3u.%1u %3u %3u %-3u "
"%9llu %-9llu\n",
tp_max / 10, tp_max % 10,
@@ -129,12 +129,10 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
p = ms->buf;
p += sprintf(p, "\n");
- p += sprintf(p, " best ____________rate__________ "
- "______statistics______ ________last_______ "
- "______sum-of________\n");
- p += sprintf(p, "mode guard # rate [name idx airtime max_tp] "
- "[ ø(tp) ø(prob) sd(prob)] [prob.|retry|suc|att] [#success | "
- "#attempts]\n");
+ p += sprintf(p,
+ " best ____________rate__________ ________statistics________ ________last_______ ______sum-of________\n");
+ p += sprintf(p,
+ "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n");
p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p);
for (i = 0; i < MINSTREL_CCK_GROUP; i++)
diff --git a/kernel/net/mac80211/rx.c b/kernel/net/mac80211/rx.c
index f6f8d9880..0e9e264bc 100644
--- a/kernel/net/mac80211/rx.c
+++ b/kernel/net/mac80211/rx.c
@@ -32,6 +32,61 @@
#include "wme.h"
#include "rate.h"
+static inline void ieee80211_rx_stats(struct net_device *dev, u32 len)
+{
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += len;
+ u64_stats_update_end(&tstats->syncp);
+}
+
+static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
+ enum nl80211_iftype type)
+{
+ __le16 fc = hdr->frame_control;
+
+ if (ieee80211_is_data(fc)) {
+ if (len < 24) /* drop incorrect hdr len (data) */
+ return NULL;
+
+ if (ieee80211_has_a4(fc))
+ return NULL;
+ if (ieee80211_has_tods(fc))
+ return hdr->addr1;
+ if (ieee80211_has_fromds(fc))
+ return hdr->addr2;
+
+ return hdr->addr3;
+ }
+
+ if (ieee80211_is_mgmt(fc)) {
+ if (len < 24) /* drop incorrect hdr len (mgmt) */
+ return NULL;
+ return hdr->addr3;
+ }
+
+ if (ieee80211_is_ctl(fc)) {
+ if (ieee80211_is_pspoll(fc))
+ return hdr->addr1;
+
+ if (ieee80211_is_back_req(fc)) {
+ switch (type) {
+ case NL80211_IFTYPE_STATION:
+ return hdr->addr2;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ return hdr->addr1;
+ default:
+ break; /* fall through to the return */
+ }
+ }
+ }
+
+ return NULL;
+}
+
/*
* monitor mode reception
*
@@ -42,7 +97,7 @@ static struct sk_buff *remove_monitor_info(struct ieee80211_local *local,
struct sk_buff *skb,
unsigned int rtap_vendor_space)
{
- if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) {
+ if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) {
if (likely(skb->len > FCS_LEN))
__pskb_trim(skb, skb->len - FCS_LEN);
else {
@@ -67,8 +122,7 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
hdr = (void *)(skb->data + rtap_vendor_space);
if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
- RX_FLAG_FAILED_PLCP_CRC |
- RX_FLAG_AMPDU_IS_ZEROLEN))
+ RX_FLAG_FAILED_PLCP_CRC))
return true;
if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space))
@@ -100,7 +154,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
len = ALIGN(len, 8);
len += 8;
}
- if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ if (ieee80211_hw_check(&local->hw, SIGNAL_DBM))
len += 1;
/* antenna field, if we don't have per-chain info */
@@ -175,7 +229,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
}
mpdulen = skb->len;
- if (!(has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)))
+ if (!(has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)))
mpdulen += FCS_LEN;
rthdr = (struct ieee80211_radiotap_header *)skb_push(skb, rtap_len);
@@ -229,7 +283,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
}
/* IEEE80211_RADIOTAP_FLAGS */
- if (has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS))
+ if (has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS))
*pos |= IEEE80211_RADIOTAP_F_FCS;
if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC))
*pos |= IEEE80211_RADIOTAP_F_BADFCS;
@@ -279,7 +333,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
pos += 2;
/* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */
- if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM &&
+ if (ieee80211_hw_check(&local->hw, SIGNAL_DBM) &&
!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
*pos = status->signal;
rthdr->it_present |=
@@ -336,10 +390,6 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
cpu_to_le32(1 << IEEE80211_RADIOTAP_AMPDU_STATUS);
put_unaligned_le32(status->ampdu_reference, pos);
pos += 4;
- if (status->flag & RX_FLAG_AMPDU_REPORT_ZEROLEN)
- flags |= IEEE80211_RADIOTAP_AMPDU_REPORT_ZEROLEN;
- if (status->flag & RX_FLAG_AMPDU_IS_ZEROLEN)
- flags |= IEEE80211_RADIOTAP_AMPDU_IS_ZEROLEN;
if (status->flag & RX_FLAG_AMPDU_LAST_KNOWN)
flags |= IEEE80211_RADIOTAP_AMPDU_LAST_KNOWN;
if (status->flag & RX_FLAG_AMPDU_IS_LAST)
@@ -448,7 +498,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
* the SKB because it has a bad FCS/PLCP checksum.
*/
- if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)
+ if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS))
present_fcs_len = FCS_LEN;
/* ensure hdr->frame_control and vendor radiotap data are in skb head */
@@ -529,8 +579,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
}
prev_dev = sdata->dev;
- sdata->dev->stats.rx_packets++;
- sdata->dev->stats.rx_bytes += skb->len;
+ ieee80211_rx_stats(sdata->dev, skb->len);
}
if (prev_dev) {
@@ -981,7 +1030,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
struct sk_buff *skb = rx->skb;
struct ieee80211_local *local = rx->local;
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
- struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
struct sta_info *sta = rx->sta;
struct tid_ampdu_rx *tid_agg_rx;
u16 sc;
@@ -1016,10 +1064,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_NORMAL)
goto dont_reorder;
- /* not actually part of this BA session */
- if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
- goto dont_reorder;
-
/* new, potentially un-ordered, ampdu frame - process it */
/* reset session timer */
@@ -1069,18 +1113,16 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx)
is_multicast_ether_addr(hdr->addr1))
return RX_CONTINUE;
- if (rx->sta) {
- if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
- rx->sta->last_seq_ctrl[rx->seqno_idx] ==
- hdr->seq_ctrl)) {
- if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
- rx->local->dot11FrameDuplicateCount++;
- rx->sta->num_duplicates++;
- }
- return RX_DROP_UNUSABLE;
- } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) {
- rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl;
- }
+ if (!rx->sta)
+ return RX_CONTINUE;
+
+ if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
+ rx->sta->last_seq_ctrl[rx->seqno_idx] == hdr->seq_ctrl)) {
+ I802_DEBUG_INC(rx->local->dot11FrameDuplicateCount);
+ rx->sta->rx_stats.num_duplicates++;
+ return RX_DROP_UNUSABLE;
+ } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) {
+ rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl;
}
return RX_CONTINUE;
@@ -1091,11 +1133,6 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
- if (unlikely(rx->skb->len < 16)) {
- I802_DEBUG_INC(rx->local->rx_handlers_drop_short);
- return RX_DROP_MONITOR;
- }
-
/* Drop disallowed frame classes based on STA auth/assoc state;
* IEEE 802.11, Chap 5.5.
*
@@ -1195,11 +1232,13 @@ static void sta_ps_start(struct sta_info *sta)
atomic_inc(&ps->num_sta_ps);
set_sta_flag(sta, WLAN_STA_PS_STA);
- if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS))
+ if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
drv_sta_notify(local, sdata, STA_NOTIFY_SLEEP, &sta->sta);
ps_dbg(sdata, "STA %pM aid %d enters power save mode\n",
sta->sta.addr, sta->sta.aid);
+ ieee80211_clear_fast_xmit(sta);
+
if (!sta->sta.txq[0])
return;
@@ -1236,22 +1275,22 @@ static void sta_ps_end(struct sta_info *sta)
ieee80211_sta_ps_deliver_wakeup(sta);
}
-int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start)
+int ieee80211_sta_ps_transition(struct ieee80211_sta *pubsta, bool start)
{
- struct sta_info *sta_inf = container_of(sta, struct sta_info, sta);
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
bool in_ps;
- WARN_ON(!(sta_inf->local->hw.flags & IEEE80211_HW_AP_LINK_PS));
+ WARN_ON(!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS));
/* Don't let the same PS state be set twice */
- in_ps = test_sta_flag(sta_inf, WLAN_STA_PS_STA);
+ in_ps = test_sta_flag(sta, WLAN_STA_PS_STA);
if ((start && in_ps) || (!start && !in_ps))
return -EINVAL;
if (start)
- sta_ps_start(sta_inf);
+ sta_ps_start(sta);
else
- sta_ps_end(sta_inf);
+ sta_ps_end(sta);
return 0;
}
@@ -1265,7 +1304,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx)
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
int tid, ac;
- if (!rx->sta || !(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ if (!rx->sta)
return RX_CONTINUE;
if (sdata->vif.type != NL80211_IFTYPE_AP &&
@@ -1277,7 +1316,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx)
* uAPSD and PS-Poll frames (the latter shouldn't even come up from
* it to mac80211 since they're handled.)
*/
- if (sdata->local->hw.flags & IEEE80211_HW_AP_LINK_PS)
+ if (ieee80211_hw_check(&sdata->local->hw, AP_LINK_PS))
return RX_CONTINUE;
/*
@@ -1357,58 +1396,56 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
NL80211_IFTYPE_ADHOC);
if (ether_addr_equal(bssid, rx->sdata->u.ibss.bssid) &&
test_sta_flag(sta, WLAN_STA_AUTHORIZED)) {
- sta->last_rx = jiffies;
+ sta->rx_stats.last_rx = jiffies;
if (ieee80211_is_data(hdr->frame_control) &&
!is_multicast_ether_addr(hdr->addr1)) {
- sta->last_rx_rate_idx = status->rate_idx;
- sta->last_rx_rate_flag = status->flag;
- sta->last_rx_rate_vht_flag = status->vht_flag;
- sta->last_rx_rate_vht_nss = status->vht_nss;
+ sta->rx_stats.last_rate_idx =
+ status->rate_idx;
+ sta->rx_stats.last_rate_flag =
+ status->flag;
+ sta->rx_stats.last_rate_vht_flag =
+ status->vht_flag;
+ sta->rx_stats.last_rate_vht_nss =
+ status->vht_nss;
}
}
} else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) {
- u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
- NL80211_IFTYPE_OCB);
- /* OCB uses wild-card BSSID */
- if (is_broadcast_ether_addr(bssid))
- sta->last_rx = jiffies;
+ sta->rx_stats.last_rx = jiffies;
} else if (!is_multicast_ether_addr(hdr->addr1)) {
/*
* Mesh beacons will update last_rx when if they are found to
* match the current local configuration when processed.
*/
- sta->last_rx = jiffies;
+ sta->rx_stats.last_rx = jiffies;
if (ieee80211_is_data(hdr->frame_control)) {
- sta->last_rx_rate_idx = status->rate_idx;
- sta->last_rx_rate_flag = status->flag;
- sta->last_rx_rate_vht_flag = status->vht_flag;
- sta->last_rx_rate_vht_nss = status->vht_nss;
+ sta->rx_stats.last_rate_idx = status->rate_idx;
+ sta->rx_stats.last_rate_flag = status->flag;
+ sta->rx_stats.last_rate_vht_flag = status->vht_flag;
+ sta->rx_stats.last_rate_vht_nss = status->vht_nss;
}
}
- if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
- return RX_CONTINUE;
-
if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
ieee80211_sta_rx_notify(rx->sdata, hdr);
- sta->rx_fragments++;
- sta->rx_bytes += rx->skb->len;
+ sta->rx_stats.fragments++;
+ sta->rx_stats.bytes += rx->skb->len;
if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
- sta->last_signal = status->signal;
- ewma_add(&sta->avg_signal, -status->signal);
+ sta->rx_stats.last_signal = status->signal;
+ ewma_signal_add(&sta->rx_stats.avg_signal, -status->signal);
}
if (status->chains) {
- sta->chains = status->chains;
+ sta->rx_stats.chains = status->chains;
for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) {
int signal = status->chain_signal[i];
if (!(status->chains & BIT(i)))
continue;
- sta->chain_signal_last[i] = signal;
- ewma_add(&sta->chain_signal_avg[i], -signal);
+ sta->rx_stats.chain_signal_last[i] = signal;
+ ewma_signal_add(&sta->rx_stats.chain_signal_avg[i],
+ -signal);
}
}
@@ -1416,7 +1453,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
* Change STA power saving mode only at the end of a frame
* exchange sequence.
*/
- if (!(sta->local->hw.flags & IEEE80211_HW_AP_LINK_PS) &&
+ if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) &&
!ieee80211_has_morefrags(hdr->frame_control) &&
!(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
(rx->sdata->vif.type == NL80211_IFTYPE_AP ||
@@ -1468,7 +1505,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
* Update counter and free packet here to avoid
* counting this as a dropped packed.
*/
- sta->rx_packets++;
+ sta->rx_stats.packets++;
dev_kfree_skb(rx->skb);
return RX_QUEUED;
}
@@ -1517,13 +1554,6 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
* possible.
*/
- /*
- * No point in finding a key and decrypting if the frame is neither
- * addressed to us nor a multicast frame.
- */
- if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
- return RX_CONTINUE;
-
/* start without a key */
rx->key = NULL;
fc = hdr->frame_control;
@@ -1657,7 +1687,6 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
if (unlikely(rx->key->flags & KEY_FLAG_TAINTED))
return RX_DROP_MONITOR;
- rx->key->tx_rx_count++;
/* TODO: add threshold stuff again */
} else {
return RX_DROP_MONITOR;
@@ -1725,7 +1754,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
entry->seq = seq;
entry->rx_queue = rx_queue;
entry->last_frag = frag;
- entry->ccmp = 0;
+ entry->check_sequential_pn = false;
entry->extra_len = 0;
return entry;
@@ -1795,7 +1824,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
frag = sc & IEEE80211_SCTL_FRAG;
if (is_multicast_ether_addr(hdr->addr1)) {
- rx->local->dot11MulticastReceivedFrameCount++;
+ I802_DEBUG_INC(rx->local->dot11MulticastReceivedFrameCount);
goto out_no_led;
}
@@ -1821,15 +1850,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
rx->seqno_idx, &(rx->skb));
if (rx->key &&
(rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP ||
- rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) &&
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 ||
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP ||
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) &&
ieee80211_has_protected(fc)) {
int queue = rx->security_idx;
- /* Store CCMP PN so that we can verify that the next
- * fragment has a sequential PN value. */
- entry->ccmp = 1;
+
+ /* Store CCMP/GCMP PN so that we can verify that the
+ * next fragment has a sequential PN value.
+ */
+ entry->check_sequential_pn = true;
memcpy(entry->last_pn,
rx->key->u.ccmp.rx_pn[queue],
IEEE80211_CCMP_PN_LEN);
+ BUILD_BUG_ON(offsetof(struct ieee80211_key,
+ u.ccmp.rx_pn) !=
+ offsetof(struct ieee80211_key,
+ u.gcmp.rx_pn));
+ BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) !=
+ sizeof(rx->key->u.gcmp.rx_pn[queue]));
+ BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN !=
+ IEEE80211_GCMP_PN_LEN);
}
return RX_QUEUED;
}
@@ -1844,15 +1885,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
return RX_DROP_MONITOR;
}
- /* Verify that MPDUs within one MSDU have sequential PN values.
- * (IEEE 802.11i, 8.3.3.4.5) */
- if (entry->ccmp) {
+ /* "The receiver shall discard MSDUs and MMPDUs whose constituent
+ * MPDU PN values are not incrementing in steps of 1."
+ * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP)
+ * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP)
+ */
+ if (entry->check_sequential_pn) {
int i;
u8 pn[IEEE80211_CCMP_PN_LEN], *rpn;
int queue;
+
if (!rx->key ||
(rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP &&
- rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256))
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256))
return RX_DROP_UNUSABLE;
memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN);
for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) {
@@ -1878,7 +1925,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
rx->skb = __skb_dequeue(&entry->skb_list);
if (skb_tailroom(rx->skb) < entry->extra_len) {
- I802_DEBUG_INC(rx->local->rx_expand_skb_head2);
+ I802_DEBUG_INC(rx->local->rx_expand_skb_head_defrag);
if (unlikely(pskb_expand_head(rx->skb, 0, entry->extra_len,
GFP_ATOMIC))) {
I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
@@ -1893,13 +1940,12 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
/* Complete frame has been reassembled - process it now */
status = IEEE80211_SKB_RXCB(rx->skb);
- status->rx_flags |= IEEE80211_RX_FRAGMENTED;
out:
ieee80211_led_rx(rx->local);
out_no_led:
if (rx->sta)
- rx->sta->rx_packets++;
+ rx->sta->rx_stats.packets++;
return RX_CONTINUE;
}
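
/*
 * Aside (standalone sketch): the defragmentation changes above remember the
 * CCMP/GCMP packet number (PN) of the previous fragment and accept the next
 * one only if its PN is exactly one higher. The PN is a big-endian multi-byte
 * counter, so "one higher" means incrementing from the last byte with carry
 * propagation. The helper below shows that check in isolation.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_PN_LEN 6	/* CCMP and GCMP both use a 6-byte PN */

static bool demo_pn_is_sequential(const uint8_t last_pn[DEMO_PN_LEN],
				  const uint8_t rx_pn[DEMO_PN_LEN])
{
	uint8_t expected[DEMO_PN_LEN];
	int i;

	memcpy(expected, last_pn, DEMO_PN_LEN);
	/* increment the big-endian counter by one, propagating the carry */
	for (i = DEMO_PN_LEN - 1; i >= 0; i--) {
		++expected[i];
		if (expected[i])	/* no wrap -> carry stops here */
			break;
	}
	return memcmp(expected, rx_pn, DEMO_PN_LEN) == 0;
}

int main(void)
{
	uint8_t last[DEMO_PN_LEN] = { 0, 0, 0, 0, 0, 0xff };
	uint8_t next[DEMO_PN_LEN] = { 0, 0, 0, 0, 1, 0x00 };
	uint8_t bad[DEMO_PN_LEN]  = { 0, 0, 0, 0, 1, 0x01 };

	printf("carry case ok: %d, gap rejected: %d\n",
	       demo_pn_is_sequential(last, next),
	       !demo_pn_is_sequential(last, bad));
	return 0;
}
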
@@ -2054,18 +2100,15 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
struct sk_buff *skb, *xmit_skb;
struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
struct sta_info *dsta;
- struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
-
- dev->stats.rx_packets++;
- dev->stats.rx_bytes += rx->skb->len;
skb = rx->skb;
xmit_skb = NULL;
+ ieee80211_rx_stats(dev, skb->len);
+
if ((sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
!(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
- (status->rx_flags & IEEE80211_RX_RA_MATCH) &&
(sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
if (is_multicast_ether_addr(ehdr->h_dest)) {
/*
@@ -2121,9 +2164,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
/* deliver to local stack */
skb->protocol = eth_type_trans(skb, dev);
memset(skb->cb, 0, sizeof(skb->cb));
- if (!(rx->flags & IEEE80211_RX_REORDER_TIMER) &&
- rx->local->napi)
- napi_gro_receive(rx->local->napi, skb);
+ if (rx->napi)
+ napi_gro_receive(rx->napi, skb);
else
netif_receive_skb(skb);
}
@@ -2207,7 +2249,6 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
struct sk_buff *skb = rx->skb, *fwd_skb;
struct ieee80211_local *local = rx->local;
struct ieee80211_sub_if_data *sdata = rx->sdata;
- struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
u16 q, hdrlen;
@@ -2238,8 +2279,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
mesh_rmc_check(rx->sdata, hdr->addr3, mesh_hdr))
return RX_DROP_MONITOR;
- if (!ieee80211_is_data(hdr->frame_control) ||
- !(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ if (!ieee80211_is_data(hdr->frame_control))
return RX_CONTINUE;
if (!mesh_hdr->ttl)
@@ -2330,11 +2370,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames);
ieee80211_add_pending_skb(local, fwd_skb);
out:
- if (is_multicast_ether_addr(hdr->addr1) ||
- sdata->dev->flags & IFF_PROMISC)
+ if (is_multicast_ether_addr(hdr->addr1))
return RX_CONTINUE;
- else
- return RX_DROP_MONITOR;
+ return RX_DROP_MONITOR;
}
#endif
@@ -2361,7 +2399,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
* for non-QoS-data frames. Here we know it's a data
* frame, so count MSDUs.
*/
- rx->sta->rx_msdu[rx->seqno_idx]++;
+ rx->sta->rx_stats.msdu[rx->seqno_idx]++;
}
/*
@@ -2395,11 +2433,10 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
tf->category == WLAN_CATEGORY_TDLS &&
(tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_REQUEST ||
tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_RESPONSE)) {
- rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TDLS_CHSW;
- skb_queue_tail(&sdata->skb_queue, rx->skb);
- ieee80211_queue_work(&rx->local->hw, &sdata->work);
+ skb_queue_tail(&local->skb_queue_tdls_chsw, rx->skb);
+ schedule_work(&local->tdls_chsw_work);
if (rx->sta)
- rx->sta->rx_packets++;
+ rx->sta->rx_stats.packets++;
return RX_QUEUED;
}
@@ -2445,6 +2482,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
struct {
__le16 control, start_seq_num;
} __packed bar_data;
+ struct ieee80211_event event = {
+ .type = BAR_RX_EVENT,
+ };
if (!rx->sta)
return RX_DROP_MONITOR;
@@ -2460,6 +2500,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
return RX_DROP_MONITOR;
start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4;
+ event.u.ba.tid = tid;
+ event.u.ba.ssn = start_seq_num;
+ event.u.ba.sta = &rx->sta->sta;
/* reset session timer */
if (tid_agg_rx->timeout)
@@ -2472,6 +2515,8 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
start_seq_num, frames);
spin_unlock(&tid_agg_rx->reorder_lock);
+ drv_event_callback(rx->local, rx->sdata, &event);
+
kfree_skb(skb);
return RX_QUEUED;
}
@@ -2552,7 +2597,7 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
!(rx->flags & IEEE80211_RX_BEACON_REPORTED)) {
int sig = 0;
- if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM))
sig = status->signal;
cfg80211_report_obss_beacon(rx->local->hw.wiphy,
@@ -2561,9 +2606,6 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
rx->flags |= IEEE80211_RX_BEACON_REPORTED;
}
- if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
- return RX_DROP_MONITOR;
-
if (ieee80211_drop_unencrypted_mgmt(rx))
return RX_DROP_UNUSABLE;
@@ -2591,9 +2633,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
mgmt->u.action.category != WLAN_CATEGORY_SPECTRUM_MGMT)
return RX_DROP_UNUSABLE;
- if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
- return RX_DROP_UNUSABLE;
-
switch (mgmt->u.action.category) {
case WLAN_CATEGORY_HT:
/* reject HT action frames from stations not supporting HT */
@@ -2715,8 +2754,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
ieee80211_vht_handle_opmode(rx->sdata, rx->sta,
- opmode, status->band,
- false);
+ opmode, status->band);
goto handled;
}
default:
@@ -2859,7 +2897,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
handled:
if (rx->sta)
- rx->sta->rx_packets++;
+ rx->sta->rx_stats.packets++;
dev_kfree_skb(rx->skb);
return RX_QUEUED;
@@ -2868,7 +2906,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
skb_queue_tail(&sdata->skb_queue, rx->skb);
ieee80211_queue_work(&local->hw, &sdata->work);
if (rx->sta)
- rx->sta->rx_packets++;
+ rx->sta->rx_stats.packets++;
return RX_QUEUED;
}
@@ -2889,13 +2927,13 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
* it transmitted were processed or returned.
*/
- if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM))
sig = status->signal;
if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig,
rx->skb->data, rx->skb->len, 0)) {
if (rx->sta)
- rx->sta->rx_packets++;
+ rx->sta->rx_stats.packets++;
dev_kfree_skb(rx->skb);
return RX_QUEUED;
}
@@ -2954,7 +2992,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
info->flags = IEEE80211_TX_CTL_TX_OFFCHAN |
IEEE80211_TX_INTFL_OFFCHAN_TX_OK |
IEEE80211_TX_CTL_NO_CCK_RATE;
- if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
+ if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
info->hw_queue =
local->hw.offchannel_tx_hw_queue;
}
@@ -3014,12 +3052,11 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
skb_queue_tail(&sdata->skb_queue, rx->skb);
ieee80211_queue_work(&rx->local->hw, &sdata->work);
if (rx->sta)
- rx->sta->rx_packets++;
+ rx->sta->rx_stats.packets++;
return RX_QUEUED;
}
-/* TODO: use IEEE80211_RX_FRAGMENTED */
static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
struct ieee80211_rate *rate)
{
@@ -3077,8 +3114,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
}
prev_dev = sdata->dev;
- sdata->dev->stats.rx_packets++;
- sdata->dev->stats.rx_bytes += skb->len;
+ ieee80211_rx_stats(sdata->dev, skb->len);
}
if (prev_dev) {
@@ -3098,7 +3134,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
case RX_DROP_MONITOR:
I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
if (rx->sta)
- rx->sta->rx_dropped++;
+ rx->sta->rx_stats.dropped++;
/* fall through */
case RX_CONTINUE: {
struct ieee80211_rate *rate = NULL;
@@ -3118,7 +3154,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
case RX_DROP_UNUSABLE:
I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
if (rx->sta)
- rx->sta->rx_dropped++;
+ rx->sta->rx_stats.dropped++;
dev_kfree_skb(rx->skb);
break;
case RX_QUEUED:
@@ -3232,7 +3268,7 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
/* This is OK -- must be QoS data frame */
.security_idx = tid,
.seqno_idx = tid,
- .flags = IEEE80211_RX_REORDER_TIMER,
+ .napi = NULL, /* must be NULL to not have races */
};
struct tid_ampdu_rx *tid_agg_rx;
@@ -3246,16 +3282,25 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames);
spin_unlock(&tid_agg_rx->reorder_lock);
+ if (!skb_queue_empty(&frames)) {
+ struct ieee80211_event event = {
+ .type = BA_FRAME_TIMEOUT,
+ .u.ba.tid = tid,
+ .u.ba.sta = &sta->sta,
+ };
+ drv_event_callback(rx.local, rx.sdata, &event);
+ }
+
ieee80211_rx_handlers(&rx, &frames);
}
/* main receive path */
-static bool prepare_for_handlers(struct ieee80211_rx_data *rx,
- struct ieee80211_hdr *hdr)
+static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
{
struct ieee80211_sub_if_data *sdata = rx->sdata;
struct sk_buff *skb = rx->skb;
+ struct ieee80211_hdr *hdr = (void *)skb->data;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
int multicast = is_multicast_ether_addr(hdr->addr1);
@@ -3264,30 +3309,23 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx,
case NL80211_IFTYPE_STATION:
if (!bssid && !sdata->u.mgd.use_4addr)
return false;
- if (!multicast &&
- !ether_addr_equal(sdata->vif.addr, hdr->addr1)) {
- if (!(sdata->dev->flags & IFF_PROMISC) ||
- sdata->u.mgd.use_4addr)
- return false;
- status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
- }
- break;
+ if (multicast)
+ return true;
+ return ether_addr_equal(sdata->vif.addr, hdr->addr1);
case NL80211_IFTYPE_ADHOC:
if (!bssid)
return false;
if (ether_addr_equal(sdata->vif.addr, hdr->addr2) ||
ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2))
return false;
- if (ieee80211_is_beacon(hdr->frame_control)) {
+ if (ieee80211_is_beacon(hdr->frame_control))
return true;
- } else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
+ if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid))
return false;
- } else if (!multicast &&
- !ether_addr_equal(sdata->vif.addr, hdr->addr1)) {
- if (!(sdata->dev->flags & IFF_PROMISC))
- return false;
- status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
- } else if (!rx->sta) {
+ if (!multicast &&
+ !ether_addr_equal(sdata->vif.addr, hdr->addr1))
+ return false;
+ if (!rx->sta) {
int rate_idx;
if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT))
rate_idx = 0; /* TODO: HT/VHT rates */
@@ -3296,25 +3334,18 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx,
ieee80211_ibss_rx_no_sta(sdata, bssid, hdr->addr2,
BIT(rate_idx));
}
- break;
+ return true;
case NL80211_IFTYPE_OCB:
if (!bssid)
return false;
- if (ieee80211_is_beacon(hdr->frame_control)) {
+ if (!ieee80211_is_data_present(hdr->frame_control))
return false;
- } else if (!is_broadcast_ether_addr(bssid)) {
- ocb_dbg(sdata, "BSSID mismatch in OCB mode!\n");
+ if (!is_broadcast_ether_addr(bssid))
return false;
- } else if (!multicast &&
- !ether_addr_equal(sdata->dev->dev_addr,
- hdr->addr1)) {
- /* if we are in promisc mode we also accept
- * packets not destined for us
- */
- if (!(sdata->dev->flags & IFF_PROMISC))
- return false;
- rx->flags &= ~IEEE80211_RX_RA_MATCH;
- } else if (!rx->sta) {
+ if (!multicast &&
+ !ether_addr_equal(sdata->dev->dev_addr, hdr->addr1))
+ return false;
+ if (!rx->sta) {
int rate_idx;
if (status->flag & RX_FLAG_HT)
rate_idx = 0; /* TODO: HT rates */
@@ -3323,22 +3354,17 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx,
ieee80211_ocb_rx_no_sta(sdata, bssid, hdr->addr2,
BIT(rate_idx));
}
- break;
+ return true;
case NL80211_IFTYPE_MESH_POINT:
- if (!multicast &&
- !ether_addr_equal(sdata->vif.addr, hdr->addr1)) {
- if (!(sdata->dev->flags & IFF_PROMISC))
- return false;
-
- status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
- }
- break;
+ if (multicast)
+ return true;
+ return ether_addr_equal(sdata->vif.addr, hdr->addr1);
case NL80211_IFTYPE_AP_VLAN:
case NL80211_IFTYPE_AP:
- if (!bssid) {
- if (!ether_addr_equal(sdata->vif.addr, hdr->addr1))
- return false;
- } else if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) {
+ if (!bssid)
+ return ether_addr_equal(sdata->vif.addr, hdr->addr1);
+
+ if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) {
/*
* Accept public action frames even when the
* BSSID doesn't match, this is used for P2P
@@ -3350,42 +3376,35 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx,
return false;
if (ieee80211_is_public_action(hdr, skb->len))
return true;
- if (!ieee80211_is_beacon(hdr->frame_control))
- return false;
- status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
- } else if (!ieee80211_has_tods(hdr->frame_control)) {
+ return ieee80211_is_beacon(hdr->frame_control);
+ }
+
+ if (!ieee80211_has_tods(hdr->frame_control)) {
/* ignore data frames to TDLS-peers */
if (ieee80211_is_data(hdr->frame_control))
return false;
/* ignore action frames to TDLS-peers */
if (ieee80211_is_action(hdr->frame_control) &&
+ !is_broadcast_ether_addr(bssid) &&
!ether_addr_equal(bssid, hdr->addr1))
return false;
}
- break;
+ return true;
case NL80211_IFTYPE_WDS:
if (bssid || !ieee80211_is_data(hdr->frame_control))
return false;
- if (!ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2))
- return false;
- break;
+ return ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2);
case NL80211_IFTYPE_P2P_DEVICE:
- if (!ieee80211_is_public_action(hdr, skb->len) &&
- !ieee80211_is_probe_req(hdr->frame_control) &&
- !ieee80211_is_probe_resp(hdr->frame_control) &&
- !ieee80211_is_beacon(hdr->frame_control))
- return false;
- if (!ether_addr_equal(sdata->vif.addr, hdr->addr1) &&
- !multicast)
- status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
- break;
+ return ieee80211_is_public_action(hdr, skb->len) ||
+ ieee80211_is_probe_req(hdr->frame_control) ||
+ ieee80211_is_probe_resp(hdr->frame_control) ||
+ ieee80211_is_beacon(hdr->frame_control);
default:
- /* should never get here */
- WARN_ON_ONCE(1);
break;
}
- return true;
+ WARN_ON_ONCE(1);
+ return false;
}
/*
@@ -3399,13 +3418,10 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
{
struct ieee80211_local *local = rx->local;
struct ieee80211_sub_if_data *sdata = rx->sdata;
- struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
- struct ieee80211_hdr *hdr = (void *)skb->data;
rx->skb = skb;
- status->rx_flags |= IEEE80211_RX_RA_MATCH;
- if (!prepare_for_handlers(rx, hdr))
+ if (!ieee80211_accept_frame(rx))
return false;
if (!consume) {
@@ -3430,7 +3446,8 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
* be called with rcu_read_lock protection.
*/
static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ struct napi_struct *napi)
{
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_sub_if_data *sdata;
@@ -3446,9 +3463,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
memset(&rx, 0, sizeof(rx));
rx.skb = skb;
rx.local = local;
+ rx.napi = napi;
if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc))
- local->dot11ReceivedFragmentCount++;
+ I802_DEBUG_INC(local->dot11ReceivedFragmentCount);
if (ieee80211_is_mgmt(fc)) {
/* drop frame if too short for header */
@@ -3547,7 +3565,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
* This is the receive path handler. It is called by a low level driver when an
* 802.11 MPDU is received from the hardware.
*/
-void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
+void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
+ struct napi_struct *napi)
{
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_rate *rate = NULL;
@@ -3646,7 +3665,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
ieee80211_tpt_led_trig_rx(local,
((struct ieee80211_hdr *)skb->data)->frame_control,
skb->len);
- __ieee80211_rx_handle_packet(hw, skb);
+ __ieee80211_rx_handle_packet(hw, skb, napi);
rcu_read_unlock();
@@ -3654,7 +3673,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
drop:
kfree_skb(skb);
}
-EXPORT_SYMBOL(ieee80211_rx);
+EXPORT_SYMBOL(ieee80211_rx_napi);
/* This is a version of the rx handler that can be called from hard irq
* context. Post the skb on the queue and schedule the tasklet */
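The rx.c hunks above rename the exported RX entry point: ieee80211_rx() becomes ieee80211_rx_napi(), and a struct napi_struct pointer is threaded through __ieee80211_rx_handle_packet() so frames received inside a driver's NAPI poll can be handed to GRO. A minimal driver-side sketch of the new call, assuming ieee80211_rx() remains available as a NULL-napi wrapper in include/net/mac80211.h; the mydrv_* names are hypothetical:

#include <linux/netdevice.h>
#include <net/mac80211.h>

struct mydrv_priv {
	struct ieee80211_hw *hw;
	struct napi_struct napi;
};

/* hypothetical helper: pop the next received frame off the driver's RX ring */
static struct sk_buff *mydrv_dequeue_rx(struct mydrv_priv *priv);

static int mydrv_napi_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
	struct sk_buff *skb;
	int done = 0;

	while (done < budget && (skb = mydrv_dequeue_rx(priv)) != NULL) {
		/* pass the napi context so mac80211 can feed GRO on the way up */
		ieee80211_rx_napi(priv->hw, skb, napi);
		done++;
	}

	if (done < budget)
		napi_complete(napi);

	return done;
}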
diff --git a/kernel/net/mac80211/scan.c b/kernel/net/mac80211/scan.c
index 7bb6a9383..acbe182b7 100644
--- a/kernel/net/mac80211/scan.c
+++ b/kernel/net/mac80211/scan.c
@@ -6,7 +6,7 @@
* Copyright 2005, Devicescape Software, Inc.
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
- * Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2013-2015 Intel Mobile Communications GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -16,7 +16,6 @@
#include <linux/if_arp.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
-#include <linux/pm_qos.h>
#include <net/sch_generic.h>
#include <linux/slab.h>
#include <linux/export.h>
@@ -67,25 +66,30 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
struct cfg80211_bss *cbss;
struct ieee80211_bss *bss;
int clen, srlen;
- enum nl80211_bss_scan_width scan_width;
- s32 signal = 0;
+ struct cfg80211_inform_bss bss_meta = {};
+ bool signal_valid;
- if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
- signal = rx_status->signal * 100;
- else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
- signal = (rx_status->signal * 100) / local->hw.max_signal;
+ if (ieee80211_hw_check(&local->hw, SIGNAL_DBM))
+ bss_meta.signal = rx_status->signal * 100;
+ else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC))
+ bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal;
- scan_width = NL80211_BSS_CHAN_WIDTH_20;
+ bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_20;
if (rx_status->flag & RX_FLAG_5MHZ)
- scan_width = NL80211_BSS_CHAN_WIDTH_5;
+ bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_5;
if (rx_status->flag & RX_FLAG_10MHZ)
- scan_width = NL80211_BSS_CHAN_WIDTH_10;
+ bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10;
- cbss = cfg80211_inform_bss_width_frame(local->hw.wiphy, channel,
- scan_width, mgmt, len, signal,
- GFP_ATOMIC);
+ bss_meta.chan = channel;
+ cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta,
+ mgmt, len, GFP_ATOMIC);
if (!cbss)
return NULL;
+ /* In case the signal is invalid update the status */
+ signal_valid = abs(channel->center_freq - cbss->channel->center_freq)
+ <= local->hw.wiphy->max_adj_channel_rssi_comp;
+ if (!signal_valid)
+ rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL;
bss = (void *)cbss->priv;
@@ -257,7 +261,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
if (test_bit(SCAN_HW_CANCELLED, &local->scanning))
return false;
- if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) {
+ if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) {
for (i = 0; i < req->n_channels; i++) {
local->hw_scan_req->req.channels[i] = req->channels[i];
bands_used |= BIT(req->channels[i]->band);
@@ -310,6 +314,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
bool was_scanning = local->scanning;
struct cfg80211_scan_request *scan_req;
struct ieee80211_sub_if_data *scan_sdata;
+ struct ieee80211_sub_if_data *sdata;
lockdep_assert_held(&local->mtx);
@@ -326,7 +331,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
return;
if (hw_scan && !aborted &&
- !(local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) &&
+ !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) &&
ieee80211_prep_hw_scan(local)) {
int rc;
@@ -369,7 +374,16 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
ieee80211_mlme_notify_scan_completed(local);
ieee80211_ibss_notify_scan_completed(local);
- ieee80211_mesh_notify_scan_completed(local);
+
+ /* Requeue all the work that might have been ignored while
+ * the scan was in progress; if there was none this will
+ * just be a no-op for the particular interface.
+ */
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (ieee80211_sdata_running(sdata))
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+ }
+
if (was_scanning)
ieee80211_start_next_roc(local);
}
@@ -520,7 +534,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len;
- if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) {
+ if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) {
int i, n_bands = 0;
u8 bands_counted = 0;
@@ -593,8 +607,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
/* We need to ensure power level is at max for scanning. */
ieee80211_hw_config(local, 0);
- if ((req->channels[0]->flags &
- IEEE80211_CHAN_NO_IR) ||
+ if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR |
+ IEEE80211_CHAN_RADAR)) ||
!req->n_ssids) {
next_delay = IEEE80211_PASSIVE_CHANNEL_TIME;
} else {
@@ -641,7 +655,7 @@ ieee80211_scan_get_channel_time(struct ieee80211_channel *chan)
* TODO: channel switching also consumes quite some time,
* add that delay as well to get a better estimation
*/
- if (chan->flags & IEEE80211_CHAN_NO_IR)
+ if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR))
return IEEE80211_PASSIVE_CHANNEL_TIME;
return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME;
}
@@ -773,7 +787,8 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
*
* In any case, it is not necessary for a passive scan.
*/
- if (chan->flags & IEEE80211_CHAN_NO_IR || !scan_req->n_ssids) {
+ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) ||
+ !scan_req->n_ssids) {
*next_delay = IEEE80211_PASSIVE_CHANNEL_TIME;
local->next_scan_state = SCAN_DECISION;
return;
@@ -1136,10 +1151,10 @@ int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
return ret;
}
-int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata)
+int ieee80211_request_sched_scan_stop(struct ieee80211_local *local)
{
- struct ieee80211_local *local = sdata->local;
- int ret = 0;
+ struct ieee80211_sub_if_data *sched_scan_sdata;
+ int ret = -ENOENT;
mutex_lock(&local->mtx);
@@ -1151,8 +1166,10 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata)
/* We don't want to restart sched scan anymore. */
RCU_INIT_POINTER(local->sched_scan_req, NULL);
- if (rcu_access_pointer(local->sched_scan_sdata)) {
- ret = drv_sched_scan_stop(local, sdata);
+ sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata,
+ lockdep_is_held(&local->mtx));
+ if (sched_scan_sdata) {
+ ret = drv_sched_scan_stop(local, sched_scan_sdata);
if (!ret)
RCU_INIT_POINTER(local->sched_scan_sdata, NULL);
}
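Both rx.c and scan.c also convert open-coded tests of local->hw.flags against IEEE80211_HW_* constants into ieee80211_hw_check() calls; hardware capability flags now live in a bitmap rather than a u32 mask. A sketch of roughly how the accessor pair is defined in include/net/mac80211.h for this kernel series (paraphrased, not copied verbatim):

/* capability flags are bits in hw->flags, indexed by enum ieee80211_hw_flags */
static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw,
				       enum ieee80211_hw_flags flg)
{
	return test_bit(flg, hw->flags);
}
#define ieee80211_hw_check(hw, flg)	_ieee80211_hw_check(hw, IEEE80211_HW_##flg)

static inline void _ieee80211_hw_set(struct ieee80211_hw *hw,
				     enum ieee80211_hw_flags flg)
{
	__set_bit(flg, hw->flags);
}
#define ieee80211_hw_set(hw, flg)	_ieee80211_hw_set(hw, IEEE80211_HW_##flg)

Drivers correspondingly set capabilities with ieee80211_hw_set(hw, SIGNAL_DBM) instead of OR-ing IEEE80211_HW_SIGNAL_DBM into a flags word.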
diff --git a/kernel/net/mac80211/sta_info.c b/kernel/net/mac80211/sta_info.c
index 2880f2ae9..f91d18732 100644
--- a/kernel/net/mac80211/sta_info.c
+++ b/kernel/net/mac80211/sta_info.c
@@ -68,9 +68,10 @@ static const struct rhashtable_params sta_rht_params = {
.nelem_hint = 3, /* start small */
.automatic_shrinking = true,
.head_offset = offsetof(struct sta_info, hash_node),
- .key_offset = offsetof(struct sta_info, sta.addr),
+ .key_offset = offsetof(struct sta_info, addr),
.key_len = ETH_ALEN,
.hashfn = sta_addr_hash,
+ .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
};
/* Caller must hold local->sta_mtx */
@@ -248,6 +249,9 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
if (sta->sta.txq[0])
kfree(to_txq_info(sta->sta.txq[0]));
kfree(rcu_dereference_raw(sta->sta.rates));
+#ifdef CONFIG_MAC80211_MESH
+ kfree(sta->mesh);
+#endif
kfree(sta);
}
@@ -281,12 +285,12 @@ static void sta_deliver_ps_frames(struct work_struct *wk)
static int sta_prepare_rate_control(struct ieee80211_local *local,
struct sta_info *sta, gfp_t gfp)
{
- if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
+ if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
return 0;
sta->rate_ctrl = local->rate_ctrl;
sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl,
- &sta->sta, gfp);
+ sta, gfp);
if (!sta->rate_ctrl_priv)
return -ENOMEM;
@@ -299,7 +303,6 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
struct ieee80211_hw *hw = &local->hw;
struct sta_info *sta;
- struct timespec uptime;
int i;
sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp);
@@ -312,27 +315,33 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work);
mutex_init(&sta->ampdu_mlme.mtx);
#ifdef CONFIG_MAC80211_MESH
- if (ieee80211_vif_is_mesh(&sdata->vif) &&
- !sdata->u.mesh.user_mpm)
- init_timer(&sta->plink_timer);
- sta->nonpeer_pm = NL80211_MESH_POWER_ACTIVE;
+ if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ sta->mesh = kzalloc(sizeof(*sta->mesh), gfp);
+ if (!sta->mesh)
+ goto free;
+ spin_lock_init(&sta->mesh->plink_lock);
+ if (ieee80211_vif_is_mesh(&sdata->vif) &&
+ !sdata->u.mesh.user_mpm)
+ init_timer(&sta->mesh->plink_timer);
+ sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE;
+ }
#endif
+ memcpy(sta->addr, addr, ETH_ALEN);
memcpy(sta->sta.addr, addr, ETH_ALEN);
sta->local = local;
sta->sdata = sdata;
- sta->last_rx = jiffies;
+ sta->rx_stats.last_rx = jiffies;
sta->sta_state = IEEE80211_STA_NONE;
/* Mark TID as unreserved */
sta->reserved_tid = IEEE80211_TID_UNRESERVED;
- ktime_get_ts(&uptime);
- sta->last_connected = uptime.tv_sec;
- ewma_init(&sta->avg_signal, 1024, 8);
- for (i = 0; i < ARRAY_SIZE(sta->chain_signal_avg); i++)
- ewma_init(&sta->chain_signal_avg[i], 1024, 8);
+ sta->last_connected = ktime_get_seconds();
+ ewma_signal_init(&sta->rx_stats.avg_signal);
+ for (i = 0; i < ARRAY_SIZE(sta->rx_stats.chain_signal_avg); i++)
+ ewma_signal_init(&sta->rx_stats.chain_signal_avg[i]);
if (local->ops->wake_tx_queue) {
void *txq_data;
@@ -403,6 +412,9 @@ free_txq:
if (sta->sta.txq[0])
kfree(to_txq_info(sta->sta.txq[0]));
free:
+#ifdef CONFIG_MAC80211_MESH
+ kfree(sta->mesh);
+#endif
kfree(sta);
return NULL;
}
@@ -621,7 +633,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
bool indicate_tim = false;
u8 ignore_for_tim = sta->sta.uapsd_queues;
int ac;
- u16 id;
+ u16 id = sta->sta.aid;
if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
@@ -629,19 +641,16 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
return;
ps = &sta->sdata->bss->ps;
- id = sta->sta.aid;
#ifdef CONFIG_MAC80211_MESH
} else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) {
ps = &sta->sdata->u.mesh.ps;
- /* TIM map only for 1 <= PLID <= IEEE80211_MAX_AID */
- id = sta->plid % (IEEE80211_MAX_AID + 1);
#endif
} else {
return;
}
/* No need to do anything if the driver does all */
- if (local->hw.flags & IEEE80211_HW_AP_LINK_PS)
+ if (ieee80211_hw_check(&local->hw, AP_LINK_PS))
return;
if (sta->dead)
@@ -1057,7 +1066,7 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
if (sdata != sta->sdata)
continue;
- if (time_after(jiffies, sta->last_rx + exp_time)) {
+ if (time_after(jiffies, sta->rx_stats.last_rx + exp_time)) {
sta_dbg(sta->sdata, "expiring inactive STA %pM\n",
sta->sta.addr);
@@ -1146,7 +1155,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
sta->driver_buffered_tids = 0;
sta->txq_buffered_tids = 0;
- if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS))
+ if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
if (sta->sta.txq[0]) {
@@ -1217,6 +1226,8 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
ps_dbg(sdata,
"STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n",
sta->sta.addr, sta->sta.aid, filtered, buffered);
+
+ ieee80211_check_fast_xmit(sta);
}
static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata,
@@ -1615,6 +1626,7 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
if (block) {
set_sta_flag(sta, WLAN_STA_PS_DRIVER);
+ ieee80211_clear_fast_xmit(sta);
return;
}
@@ -1632,6 +1644,7 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
ieee80211_queue_work(hw, &sta->drv_deliver_wk);
} else {
clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
+ ieee80211_check_fast_xmit(sta);
}
}
EXPORT_SYMBOL(ieee80211_sta_block_awake);
@@ -1736,6 +1749,7 @@ int sta_info_move_state(struct sta_info *sta,
!sta->sdata->u.vlan.sta))
atomic_dec(&sta->sdata->bss->num_mcast_sta);
clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
+ ieee80211_clear_fast_xmit(sta);
}
break;
case IEEE80211_STA_AUTHORIZED:
@@ -1745,6 +1759,7 @@ int sta_info_move_state(struct sta_info *sta,
!sta->sdata->u.vlan.sta))
atomic_inc(&sta->sdata->bss->num_mcast_sta);
set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
+ ieee80211_check_fast_xmit(sta);
}
break;
default:
@@ -1791,12 +1806,50 @@ u8 sta_info_tx_streams(struct sta_info *sta)
>> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1;
}
+static void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
+{
+ rinfo->flags = 0;
+
+ if (sta->rx_stats.last_rate_flag & RX_FLAG_HT) {
+ rinfo->flags |= RATE_INFO_FLAGS_MCS;
+ rinfo->mcs = sta->rx_stats.last_rate_idx;
+ } else if (sta->rx_stats.last_rate_flag & RX_FLAG_VHT) {
+ rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS;
+ rinfo->nss = sta->rx_stats.last_rate_vht_nss;
+ rinfo->mcs = sta->rx_stats.last_rate_idx;
+ } else {
+ struct ieee80211_supported_band *sband;
+ int shift = ieee80211_vif_get_shift(&sta->sdata->vif);
+ u16 brate;
+
+ sband = sta->local->hw.wiphy->bands[
+ ieee80211_get_sdata_band(sta->sdata)];
+ brate = sband->bitrates[sta->rx_stats.last_rate_idx].bitrate;
+ rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
+ }
+
+ if (sta->rx_stats.last_rate_flag & RX_FLAG_SHORT_GI)
+ rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
+
+ if (sta->rx_stats.last_rate_flag & RX_FLAG_5MHZ)
+ rinfo->bw = RATE_INFO_BW_5;
+ else if (sta->rx_stats.last_rate_flag & RX_FLAG_10MHZ)
+ rinfo->bw = RATE_INFO_BW_10;
+ else if (sta->rx_stats.last_rate_flag & RX_FLAG_40MHZ)
+ rinfo->bw = RATE_INFO_BW_40;
+ else if (sta->rx_stats.last_rate_vht_flag & RX_VHT_FLAG_80MHZ)
+ rinfo->bw = RATE_INFO_BW_80;
+ else if (sta->rx_stats.last_rate_vht_flag & RX_VHT_FLAG_160MHZ)
+ rinfo->bw = RATE_INFO_BW_160;
+ else
+ rinfo->bw = RATE_INFO_BW_20;
+}
+
void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
struct rate_control_ref *ref = NULL;
- struct timespec uptime;
u32 thr = 0;
int i, ac;
@@ -1818,51 +1871,54 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
BIT(NL80211_STA_INFO_STA_FLAGS) |
BIT(NL80211_STA_INFO_BSS_PARAM) |
BIT(NL80211_STA_INFO_CONNECTED_TIME) |
- BIT(NL80211_STA_INFO_RX_DROP_MISC) |
- BIT(NL80211_STA_INFO_BEACON_LOSS);
+ BIT(NL80211_STA_INFO_RX_DROP_MISC);
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count;
+ sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_LOSS);
+ }
- ktime_get_ts(&uptime);
- sinfo->connected_time = uptime.tv_sec - sta->last_connected;
- sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
+ sinfo->connected_time = ktime_get_seconds() - sta->last_connected;
+ sinfo->inactive_time =
+ jiffies_to_msecs(jiffies - sta->rx_stats.last_rx);
if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) |
BIT(NL80211_STA_INFO_TX_BYTES)))) {
sinfo->tx_bytes = 0;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
- sinfo->tx_bytes += sta->tx_bytes[ac];
+ sinfo->tx_bytes += sta->tx_stats.bytes[ac];
sinfo->filled |= BIT(NL80211_STA_INFO_TX_BYTES64);
}
if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_PACKETS))) {
sinfo->tx_packets = 0;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
- sinfo->tx_packets += sta->tx_packets[ac];
+ sinfo->tx_packets += sta->tx_stats.packets[ac];
sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS);
}
if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) |
BIT(NL80211_STA_INFO_RX_BYTES)))) {
- sinfo->rx_bytes = sta->rx_bytes;
+ sinfo->rx_bytes = sta->rx_stats.bytes;
sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64);
}
if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) {
- sinfo->rx_packets = sta->rx_packets;
+ sinfo->rx_packets = sta->rx_stats.packets;
sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS);
}
if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_RETRIES))) {
- sinfo->tx_retries = sta->tx_retry_count;
+ sinfo->tx_retries = sta->status_stats.retry_count;
sinfo->filled |= BIT(NL80211_STA_INFO_TX_RETRIES);
}
if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_FAILED))) {
- sinfo->tx_failed = sta->tx_retry_failed;
+ sinfo->tx_failed = sta->status_stats.retry_failed;
sinfo->filled |= BIT(NL80211_STA_INFO_TX_FAILED);
}
- sinfo->rx_dropped_misc = sta->rx_dropped;
- sinfo->beacon_loss_count = sta->beacon_loss_count;
+ sinfo->rx_dropped_misc = sta->rx_stats.dropped;
if (sdata->vif.type == NL80211_IFTYPE_STATION &&
!(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) {
@@ -1871,35 +1927,38 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif);
}
- if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) ||
- (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) {
+ if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) ||
+ ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) {
if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) {
- sinfo->signal = (s8)sta->last_signal;
+ sinfo->signal = (s8)sta->rx_stats.last_signal;
sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL);
}
if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) {
- sinfo->signal_avg = (s8) -ewma_read(&sta->avg_signal);
+ sinfo->signal_avg =
+ -ewma_signal_read(&sta->rx_stats.avg_signal);
sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG);
}
}
- if (sta->chains &&
+ if (sta->rx_stats.chains &&
!(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) |
BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL) |
BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
- sinfo->chains = sta->chains;
+ sinfo->chains = sta->rx_stats.chains;
for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) {
- sinfo->chain_signal[i] = sta->chain_signal_last[i];
+ sinfo->chain_signal[i] =
+ sta->rx_stats.chain_signal_last[i];
sinfo->chain_signal_avg[i] =
- (s8) -ewma_read(&sta->chain_signal_avg[i]);
+ -ewma_signal_read(&sta->rx_stats.chain_signal_avg[i]);
}
}
if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) {
- sta_set_rate_info_tx(sta, &sta->last_tx_rate, &sinfo->txrate);
+ sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate,
+ &sinfo->txrate);
sinfo->filled |= BIT(NL80211_STA_INFO_TX_BITRATE);
}
@@ -1914,28 +1973,30 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) {
tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU);
- tidstats->rx_msdu = sta->rx_msdu[i];
+ tidstats->rx_msdu = sta->rx_stats.msdu[i];
}
if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) {
tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU);
- tidstats->tx_msdu = sta->tx_msdu[i];
+ tidstats->tx_msdu = sta->tx_stats.msdu[i];
}
if (!(tidstats->filled &
BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) &&
- local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
tidstats->filled |=
BIT(NL80211_TID_STATS_TX_MSDU_RETRIES);
- tidstats->tx_msdu_retries = sta->tx_msdu_retries[i];
+ tidstats->tx_msdu_retries =
+ sta->status_stats.msdu_retries[i];
}
if (!(tidstats->filled &
BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) &&
- local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
tidstats->filled |=
BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
- tidstats->tx_msdu_failed = sta->tx_msdu_failed[i];
+ tidstats->tx_msdu_failed =
+ sta->status_stats.msdu_failed[i];
}
}
@@ -1948,16 +2009,16 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
BIT(NL80211_STA_INFO_PEER_PM) |
BIT(NL80211_STA_INFO_NONPEER_PM);
- sinfo->llid = sta->llid;
- sinfo->plid = sta->plid;
- sinfo->plink_state = sta->plink_state;
+ sinfo->llid = sta->mesh->llid;
+ sinfo->plid = sta->mesh->plid;
+ sinfo->plink_state = sta->mesh->plink_state;
if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) {
sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET);
- sinfo->t_offset = sta->t_offset;
+ sinfo->t_offset = sta->mesh->t_offset;
}
- sinfo->local_pm = sta->local_pm;
- sinfo->peer_pm = sta->peer_pm;
- sinfo->nonpeer_pm = sta->nonpeer_pm;
+ sinfo->local_pm = sta->mesh->local_pm;
+ sinfo->peer_pm = sta->mesh->peer_pm;
+ sinfo->nonpeer_pm = sta->mesh->nonpeer_pm;
#endif
}
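sta_set_sinfo() above now reads the per-TID MSDU counters out of the regrouped statistics (rx_stats.msdu[], tx_stats.msdu[], status_stats.msdu_retries[] and msdu_failed[]). These arrays carry IEEE80211_NUM_TIDS + 1 entries, with the extra slot collecting non-QoS data frames. A small sketch of that indexing convention; the function name is hypothetical and mirrors the seqno_idx handling in rx.c:

/* illustration only: count an MSDU into the per-TID slot, or into the
 * trailing bucket if the frame carries no QoS control field
 */
static void example_count_rx_msdu(struct sta_info *sta, struct sk_buff *skb)
{
	struct ieee80211_hdr *hdr = (void *)skb->data;
	int tid = IEEE80211_NUM_TIDS;	/* non-QoS frames use the last slot */

	if (ieee80211_is_data_qos(hdr->frame_control))
		tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;

	sta->rx_stats.msdu[tid]++;
}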
diff --git a/kernel/net/mac80211/sta_info.h b/kernel/net/mac80211/sta_info.h
index 5c164fb3f..2cafb21b4 100644
--- a/kernel/net/mac80211/sta_info.h
+++ b/kernel/net/mac80211/sta_info.h
@@ -53,6 +53,8 @@
* @WLAN_STA_TDLS_CHAN_SWITCH: This TDLS peer supports TDLS channel-switching
* @WLAN_STA_TDLS_OFF_CHANNEL: The local STA is currently off-channel with this
* TDLS peer
+ * @WLAN_STA_TDLS_WIDER_BW: This TDLS peer supports working on a wider bw on
+ * the BSS base channel.
* @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was
* keeping station in power-save mode, reply when the driver
* unblocks the station.
@@ -84,6 +86,7 @@ enum ieee80211_sta_info_flags {
WLAN_STA_TDLS_INITIATOR,
WLAN_STA_TDLS_CHAN_SWITCH,
WLAN_STA_TDLS_OFF_CHANNEL,
+ WLAN_STA_TDLS_WIDER_BW,
WLAN_STA_UAPSD,
WLAN_STA_SP,
WLAN_STA_4ADDR_EVENT,
@@ -130,6 +133,7 @@ enum ieee80211_agg_stop_reason {
* @buf_size: reorder buffer size at receiver
* @failed_bar_ssn: ssn of the last failed BAR tx attempt
* @bar_pending: BAR needs to be re-sent
+ * @amsdu: support A-MSDU within A-MPDU
*
* This structure's lifetime is managed by RCU, assignments to
* the array holding it must hold the aggregation mutex.
@@ -155,6 +159,7 @@ struct tid_ampdu_tx {
u16 failed_bar_ssn;
bool bar_pending;
+ bool amsdu;
};
/**
@@ -241,6 +246,84 @@ struct sta_ampdu_mlme {
/* Value to indicate no TID reservation */
#define IEEE80211_TID_UNRESERVED 0xff
+#define IEEE80211_FAST_XMIT_MAX_IV 18
+
+/**
+ * struct ieee80211_fast_tx - TX fastpath information
+ * @key: key to use for hw crypto
+ * @hdr: the 802.11 header to put with the frame
+ * @hdr_len: actual 802.11 header length
+ * @sa_offs: offset of the SA
+ * @da_offs: offset of the DA
+ * @pn_offs: offset where to put PN for crypto (or 0 if not needed)
+ * @band: band this will be transmitted on, for tx_info
+ * @rcu_head: RCU head to free this struct
+ *
+ * This struct is small enough so that the common case (maximum crypto
+ * header length of 8 like for CCMP/GCMP) fits into a single 64-byte
+ * cache line.
+ */
+struct ieee80211_fast_tx {
+ struct ieee80211_key *key;
+ u8 hdr_len;
+ u8 sa_offs, da_offs, pn_offs;
+ u8 band;
+ u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV +
+ sizeof(rfc1042_header)];
+
+ struct rcu_head rcu_head;
+};
+
+/**
+ * struct mesh_sta - mesh STA information
+ * @plink_lock: serialize access to plink fields
+ * @llid: Local link ID
+ * @plid: Peer link ID
+ * @aid: local aid supplied by peer
+ * @reason: Cancel reason on PLINK_HOLDING state
+ * @plink_retries: Retries in establishment
+ * @plink_state: peer link state
+ * @plink_timeout: timeout of peer link
+ * @plink_timer: peer link watch timer
+ * @t_offset: timing offset relative to this host
+ * @t_offset_setpoint: reference timing offset of this sta to be used when
+ * calculating clockdrift
+ * @local_pm: local link-specific power save mode
+ * @peer_pm: peer-specific power save mode towards local STA
+ * @nonpeer_pm: STA power save mode towards non-peer neighbors
+ * @processed_beacon: set to true after peer rates and capabilities are
+ * processed
+ * @fail_avg: moving percentage of failed MSDUs
+ */
+struct mesh_sta {
+ struct timer_list plink_timer;
+
+ s64 t_offset;
+ s64 t_offset_setpoint;
+
+ spinlock_t plink_lock;
+ u16 llid;
+ u16 plid;
+ u16 aid;
+ u16 reason;
+ u8 plink_retries;
+
+ bool processed_beacon;
+
+ enum nl80211_plink_state plink_state;
+ u32 plink_timeout;
+
+ /* mesh power save */
+ enum nl80211_mesh_power_mode local_pm;
+ enum nl80211_mesh_power_mode peer_pm;
+ enum nl80211_mesh_power_mode nonpeer_pm;
+
+ /* moving percentage of failed MSDUs */
+ unsigned int fail_avg;
+};
+
+DECLARE_EWMA(signal, 1024, 8)
+
/**
* struct sta_info - STA information
*
@@ -250,20 +333,17 @@ struct sta_ampdu_mlme {
* @list: global linked list entry
* @free_list: list entry for keeping track of stations to free
* @hash_node: hash node for rhashtable
+ * @addr: station's MAC address - duplicated from public part to
+ * let the hash table work with just a single cacheline
* @local: pointer to the global information
* @sdata: virtual interface this station belongs to
* @ptk: peer keys negotiated with this station, if any
* @ptk_idx: last installed peer key index
* @gtk: group keys negotiated with this station, if any
- * @gtk_idx: last installed group key index
* @rate_ctrl: rate control algorithm reference
+ * @rate_ctrl_lock: spinlock used to protect rate control data
+ * (data inside the algorithm, so serializes calls there)
* @rate_ctrl_priv: rate control private per-STA pointer
- * @last_tx_rate: rate used for last transmit, to report to userspace as
- * "the" transmit rate
- * @last_rx_rate_idx: rx status rate index of the last data packet
- * @last_rx_rate_flag: rx status flag of the last data packet
- * @last_rx_rate_vht_flag: rx status vht flag of the last data packet
- * @last_rx_rate_vht_nss: rx status nss of last data packet
* @lock: used for locking all fields that require locking, see comments
* in the header file.
* @drv_deliver_wk: used for delivering frames after driver PS unblocking
@@ -278,82 +358,55 @@ struct sta_ampdu_mlme {
* the station when it leaves powersave or polls for frames
* @driver_buffered_tids: bitmap of TIDs the driver has data buffered on
* @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on
- * @rx_packets: Number of MSDUs received from this STA
- * @rx_bytes: Number of bytes received from this STA
- * @last_rx: time (in jiffies) when last frame was received from this STA
* @last_connected: time (in seconds) when a station got connected
- * @num_duplicates: number of duplicate frames received from this STA
- * @rx_fragments: number of received MPDUs
- * @rx_dropped: number of dropped MPDUs from this STA
- * @last_signal: signal of last received frame from this STA
- * @avg_signal: moving average of signal of received frames from this STA
- * @last_ack_signal: signal of last received Ack frame from this STA
- * @last_seq_ctrl: last received seq/frag number from this STA (per RX queue)
- * @tx_filtered_count: number of frames the hardware filtered for this STA
- * @tx_retry_failed: number of frames that failed retry
- * @tx_retry_count: total number of retries for frames to this STA
- * @fail_avg: moving percentage of failed MSDUs
- * @tx_packets: number of RX/TX MSDUs
- * @tx_bytes: number of bytes transmitted to this STA
- * @tx_fragments: number of transmitted MPDUs
+ * @last_seq_ctrl: last received seq/frag number from this STA (per TID
+ * plus one for non-QoS frames)
* @tid_seq: per-TID sequence numbers for sending to this STA
* @ampdu_mlme: A-MPDU state machine state
* @timer_to_tid: identity mapping to ID timers
- * @llid: Local link ID
- * @plid: Peer link ID
- * @reason: Cancel reason on PLINK_HOLDING state
- * @plink_retries: Retries in establishment
- * @plink_state: peer link state
- * @plink_timeout: timeout of peer link
- * @plink_timer: peer link watch timer
- * @t_offset: timing offset relative to this host
- * @t_offset_setpoint: reference timing offset of this sta to be used when
- * calculating clockdrift
- * @local_pm: local link-specific power save mode
- * @peer_pm: peer-specific power save mode towards local STA
- * @nonpeer_pm: STA power save mode towards non-peer neighbors
+ * @mesh: mesh STA information
* @debugfs: debug filesystem info
* @dead: set to true when sta is unlinked
* @uploaded: set to true when sta is uploaded to the driver
- * @lost_packets: number of consecutive lost packets
* @sta: station information we share with the driver
* @sta_state: duplicates information about station state (for debug)
* @beacon_loss_count: number of times beacon loss has triggered
* @rcu_head: RCU head used for freeing this station struct
* @cur_max_bandwidth: maximum bandwidth to use for TX to the station,
* taken from HT/VHT capabilities or VHT operating mode notification
- * @chains: chains ever used for RX from this station
- * @chain_signal_last: last signal (per chain)
- * @chain_signal_avg: signal average (per chain)
* @known_smps_mode: the smps_mode the client thinks we are in. Relevant for
* AP only.
* @cipher_scheme: optional cipher scheme for this station
- * @last_tdls_pkt_time: holds the time in jiffies of last TDLS pkt ACKed
* @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
- * @tx_msdu: MSDUs transmitted to this station, using IEEE80211_NUM_TID
- * entry for non-QoS frames
- * @tx_msdu_retries: MSDU retries for transmissions to to this station,
- * using IEEE80211_NUM_TID entry for non-QoS frames
- * @tx_msdu_failed: MSDU failures for transmissions to to this station,
- * using IEEE80211_NUM_TID entry for non-QoS frames
- * @rx_msdu: MSDUs received from this station, using IEEE80211_NUM_TID
- * entry for non-QoS frames
+ * @fast_tx: TX fastpath information
+ * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to
+ * the BSS one.
+ * @tx_stats: TX statistics
+ * @rx_stats: RX statistics
+ * @status_stats: TX status statistics
*/
struct sta_info {
/* General information, mostly static */
struct list_head list, free_list;
struct rcu_head rcu_head;
struct rhash_head hash_node;
+ u8 addr[ETH_ALEN];
struct ieee80211_local *local;
struct ieee80211_sub_if_data *sdata;
struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS];
- u8 gtk_idx;
u8 ptk_idx;
struct rate_control_ref *rate_ctrl;
void *rate_ctrl_priv;
+ spinlock_t rate_ctrl_lock;
spinlock_t lock;
+ struct ieee80211_fast_tx __rcu *fast_tx;
+
+#ifdef CONFIG_MAC80211_MESH
+ struct mesh_sta *mesh;
+#endif
+
struct work_struct drv_deliver_wk;
u16 listen_interval;
@@ -374,45 +427,49 @@ struct sta_info {
unsigned long driver_buffered_tids;
unsigned long txq_buffered_tids;
- /* Updated from RX path only, no locking requirements */
- unsigned long rx_packets;
- u64 rx_bytes;
- unsigned long last_rx;
long last_connected;
- unsigned long num_duplicates;
- unsigned long rx_fragments;
- unsigned long rx_dropped;
- int last_signal;
- struct ewma avg_signal;
- int last_ack_signal;
- u8 chains;
- s8 chain_signal_last[IEEE80211_MAX_CHAINS];
- struct ewma chain_signal_avg[IEEE80211_MAX_CHAINS];
+ /* Updated from RX path only, no locking requirements */
+ struct {
+ unsigned long packets;
+ u64 bytes;
+ unsigned long last_rx;
+ unsigned long num_duplicates;
+ unsigned long fragments;
+ unsigned long dropped;
+ int last_signal;
+ struct ewma_signal avg_signal;
+ u8 chains;
+ s8 chain_signal_last[IEEE80211_MAX_CHAINS];
+ struct ewma_signal chain_signal_avg[IEEE80211_MAX_CHAINS];
+ int last_rate_idx;
+ u32 last_rate_flag;
+ u32 last_rate_vht_flag;
+ u8 last_rate_vht_nss;
+ u64 msdu[IEEE80211_NUM_TIDS + 1];
+ } rx_stats;
/* Plus 1 for non-QoS frames */
__le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1];
/* Updated from TX status path only, no locking requirements */
- unsigned long tx_filtered_count;
- unsigned long tx_retry_failed, tx_retry_count;
- /* moving percentage of failed MSDUs */
- unsigned int fail_avg;
+ struct {
+ unsigned long filtered;
+ unsigned long retry_failed, retry_count;
+ unsigned int lost_packets;
+ unsigned long last_tdls_pkt_time;
+ u64 msdu_retries[IEEE80211_NUM_TIDS + 1];
+ u64 msdu_failed[IEEE80211_NUM_TIDS + 1];
+ } status_stats;
/* Updated from TX path only, no locking requirements */
- u32 tx_fragments;
- u64 tx_packets[IEEE80211_NUM_ACS];
- u64 tx_bytes[IEEE80211_NUM_ACS];
- struct ieee80211_tx_rate last_tx_rate;
- int last_rx_rate_idx;
- u32 last_rx_rate_flag;
- u32 last_rx_rate_vht_flag;
- u8 last_rx_rate_vht_nss;
+ struct {
+ u64 packets[IEEE80211_NUM_ACS];
+ u64 bytes[IEEE80211_NUM_ACS];
+ struct ieee80211_tx_rate last_rate;
+ u64 msdu[IEEE80211_NUM_TIDS + 1];
+ } tx_stats;
u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
- u64 tx_msdu[IEEE80211_NUM_TIDS + 1];
- u64 tx_msdu_retries[IEEE80211_NUM_TIDS + 1];
- u64 tx_msdu_failed[IEEE80211_NUM_TIDS + 1];
- u64 rx_msdu[IEEE80211_NUM_TIDS + 1];
/*
* Aggregation information, locked with lock.
@@ -420,26 +477,6 @@ struct sta_info {
struct sta_ampdu_mlme ampdu_mlme;
u8 timer_to_tid[IEEE80211_NUM_TIDS];
-#ifdef CONFIG_MAC80211_MESH
- /*
- * Mesh peer link attributes
- * TODO: move to a sub-structure that is referenced with pointer?
- */
- u16 llid;
- u16 plid;
- u16 reason;
- u8 plink_retries;
- enum nl80211_plink_state plink_state;
- u32 plink_timeout;
- struct timer_list plink_timer;
- s64 t_offset;
- s64 t_offset_setpoint;
- /* mesh power save */
- enum nl80211_mesh_power_mode local_pm;
- enum nl80211_mesh_power_mode peer_pm;
- enum nl80211_mesh_power_mode nonpeer_pm;
-#endif
-
#ifdef CONFIG_MAC80211_DEBUGFS
struct sta_info_debugfsdentries {
struct dentry *dir;
@@ -449,17 +486,13 @@ struct sta_info {
enum ieee80211_sta_rx_bandwidth cur_max_bandwidth;
- unsigned int lost_packets;
- unsigned int beacon_loss_count;
-
enum ieee80211_smps_mode known_smps_mode;
const struct ieee80211_cipher_scheme *cipher_scheme;
- /* TDLS timeout data */
- unsigned long last_tdls_pkt_time;
-
u8 reserved_tid;
+ struct cfg80211_chan_def tdls_chandef;
+
/* keep last! */
struct ieee80211_sta sta;
};
@@ -467,7 +500,7 @@ struct sta_info {
static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta)
{
#ifdef CONFIG_MAC80211_MESH
- return sta->plink_state;
+ return sta->mesh->plink_state;
#endif
return NL80211_PLINK_LISTEN;
}
@@ -570,7 +603,7 @@ u32 sta_addr_hash(const void *key, u32 length, u32 seed);
_sta_bucket_idx(tbl, _addr), \
hash_node) \
/* compare address and run code only if it matches */ \
- if (ether_addr_equal(_sta->sta.addr, (_addr)))
+ if (ether_addr_equal(_sta->addr, (_addr)))
/*
* Get STA info by index, BROKEN!
@@ -626,8 +659,6 @@ static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata)
void sta_set_rate_info_tx(struct sta_info *sta,
const struct ieee80211_tx_rate *rate,
struct rate_info *rinfo);
-void sta_set_rate_info_rx(struct sta_info *sta,
- struct rate_info *rinfo);
void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo);
void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
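The open-coded struct ewma signal averages are replaced by the generic DECLARE_EWMA(signal, 1024, 8) declaration added above, which generates a struct ewma_signal together with ewma_signal_init/add/read helpers. A rough sketch of what the macro expands to, paraphrasing <linux/average.h> (the real expansion differs in details such as ACCESS_ONCE annotations):

struct ewma_signal {
	unsigned long internal;
};

static inline void ewma_signal_init(struct ewma_signal *e)
{
	e->internal = 0;
}

static inline unsigned long ewma_signal_read(struct ewma_signal *e)
{
	return e->internal >> 10;	/* drop the 1024 scale factor */
}

static inline void ewma_signal_add(struct ewma_signal *e, unsigned long val)
{
	unsigned long internal = e->internal;

	/* weight 8: new = (old * 7 + sample) / 8, kept scaled by 1024 */
	e->internal = internal ?
		(((internal << 3) - internal) + (val << 10)) >> 3 :
		(val << 10);
}

Because the helpers work on unsigned values, the RX path presumably feeds in negated dBm samples, which is why sta_set_sinfo() negates ewma_signal_read() when filling signal_avg.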
diff --git a/kernel/net/mac80211/status.c b/kernel/net/mac80211/status.c
index 005fdbe39..5bad05e9a 100644
--- a/kernel/net/mac80211/status.c
+++ b/kernel/net/mac80211/status.c
@@ -67,7 +67,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
IEEE80211_TX_INTFL_RETRANSMISSION;
info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
- sta->tx_filtered_count++;
+ sta->status_stats.filtered++;
/*
* Clear more-data bit on filtered frames, it might be set
@@ -101,6 +101,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
* when it wakes up for the next time.
*/
set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT);
+ ieee80211_clear_fast_xmit(sta);
/*
* This code races in the following way:
@@ -181,8 +182,8 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
struct ieee80211_local *local = sta->local;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
- sta->last_rx = jiffies;
+ if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
+ sta->rx_stats.last_rx = jiffies;
if (ieee80211_is_data_qos(mgmt->frame_control)) {
struct ieee80211_hdr *hdr = (void *) skb->data;
@@ -414,8 +415,7 @@ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local,
if (is_teardown) {
/* This mechanism relies on being able to get ACKs */
- WARN_ON(!(local->hw.flags &
- IEEE80211_HW_REPORTS_TX_ACK_STATUS));
+ WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS));
/* Check if peer has ACKed */
if (flags & IEEE80211_TX_STAT_ACK) {
@@ -429,6 +429,74 @@ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local,
}
}
+static struct ieee80211_sub_if_data *
+ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ if (skb->dev) {
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (!sdata->dev)
+ continue;
+
+ if (skb->dev == sdata->dev)
+ return sdata;
+ }
+
+ return NULL;
+ }
+
+ return rcu_dereference(local->p2p_sdata);
+}
+
+static void ieee80211_report_ack_skb(struct ieee80211_local *local,
+ struct ieee80211_tx_info *info,
+ bool acked, bool dropped)
+{
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ spin_lock_irqsave(&local->ack_status_lock, flags);
+ skb = idr_find(&local->ack_status_frames, info->ack_frame_id);
+ if (skb)
+ idr_remove(&local->ack_status_frames, info->ack_frame_id);
+ spin_unlock_irqrestore(&local->ack_status_lock, flags);
+
+ if (!skb)
+ return;
+
+ if (dropped) {
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) {
+ u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+
+ rcu_read_lock();
+ sdata = ieee80211_sdata_from_skb(local, skb);
+ if (sdata) {
+ if (ieee80211_is_nullfunc(hdr->frame_control) ||
+ ieee80211_is_qos_nullfunc(hdr->frame_control))
+ cfg80211_probe_status(sdata->dev, hdr->addr1,
+ cookie, acked,
+ GFP_ATOMIC);
+ else
+ cfg80211_mgmt_tx_status(&sdata->wdev, cookie,
+ skb->data, skb->len,
+ acked, GFP_ATOMIC);
+ }
+ rcu_read_unlock();
+
+ dev_kfree_skb_any(skb);
+ } else {
+ /* consumes skb */
+ skb_complete_wifi_ack(skb, acked);
+ }
+}
+
static void ieee80211_report_used_skb(struct ieee80211_local *local,
struct sk_buff *skb, bool dropped)
{
@@ -439,32 +507,16 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
if (dropped)
acked = false;
- if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX |
- IEEE80211_TX_INTFL_MLME_CONN_TX)) {
- struct ieee80211_sub_if_data *sdata = NULL;
- struct ieee80211_sub_if_data *iter_sdata;
- u64 cookie = (unsigned long)skb;
+ if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) {
+ struct ieee80211_sub_if_data *sdata;
rcu_read_lock();
- if (skb->dev) {
- list_for_each_entry_rcu(iter_sdata, &local->interfaces,
- list) {
- if (!iter_sdata->dev)
- continue;
-
- if (skb->dev == iter_sdata->dev) {
- sdata = iter_sdata;
- break;
- }
- }
- } else {
- sdata = rcu_dereference(local->p2p_sdata);
- }
+ sdata = ieee80211_sdata_from_skb(local, skb);
if (!sdata) {
skb->dev = NULL;
- } else if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) {
+ } else {
unsigned int hdr_size =
ieee80211_hdrlen(hdr->frame_control);
@@ -478,38 +530,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
ieee80211_mgd_conn_tx_status(sdata,
hdr->frame_control,
acked);
- } else if (ieee80211_is_nullfunc(hdr->frame_control) ||
- ieee80211_is_qos_nullfunc(hdr->frame_control)) {
- cfg80211_probe_status(sdata->dev, hdr->addr1,
- cookie, acked, GFP_ATOMIC);
- } else {
- cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data,
- skb->len, acked, GFP_ATOMIC);
}
rcu_read_unlock();
- }
-
- if (unlikely(info->ack_frame_id)) {
- struct sk_buff *ack_skb;
- unsigned long flags;
-
- spin_lock_irqsave(&local->ack_status_lock, flags);
- ack_skb = idr_find(&local->ack_status_frames,
- info->ack_frame_id);
- if (ack_skb)
- idr_remove(&local->ack_status_frames,
- info->ack_frame_id);
- spin_unlock_irqrestore(&local->ack_status_lock, flags);
-
- if (ack_skb) {
- if (!dropped) {
- /* consumes ack_skb */
- skb_complete_wifi_ack(ack_skb, acked);
- } else {
- dev_kfree_skb_any(ack_skb);
- }
- }
+ } else if (info->ack_frame_id) {
+ ieee80211_report_ack_skb(local, info, acked, dropped);
}
}
@@ -532,8 +557,9 @@ static void ieee80211_lost_packet(struct sta_info *sta,
!(info->flags & IEEE80211_TX_STAT_AMPDU))
return;
- sta->lost_packets++;
- if (!sta->sta.tdls && sta->lost_packets < STA_LOST_PKT_THRESHOLD)
+ sta->status_stats.lost_packets++;
+ if (!sta->sta.tdls &&
+ sta->status_stats.lost_packets < STA_LOST_PKT_THRESHOLD)
return;
/*
@@ -543,14 +569,15 @@ static void ieee80211_lost_packet(struct sta_info *sta,
* mechanism.
*/
if (sta->sta.tdls &&
- (sta->lost_packets < STA_LOST_TDLS_PKT_THRESHOLD ||
+ (sta->status_stats.lost_packets < STA_LOST_TDLS_PKT_THRESHOLD ||
time_before(jiffies,
- sta->last_tdls_pkt_time + STA_LOST_TDLS_PKT_TIME)))
+ sta->status_stats.last_tdls_pkt_time +
+ STA_LOST_TDLS_PKT_TIME)))
return;
cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr,
- sta->lost_packets, GFP_ATOMIC);
- sta->lost_packets = 0;
+ sta->status_stats.lost_packets, GFP_ATOMIC);
+ sta->status_stats.lost_packets = 0;
}
static int ieee80211_tx_get_rates(struct ieee80211_hw *hw,
@@ -611,18 +638,18 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
sta = container_of(pubsta, struct sta_info, sta);
if (!acked)
- sta->tx_retry_failed++;
- sta->tx_retry_count += retry_count;
+ sta->status_stats.retry_failed++;
+ sta->status_stats.retry_count += retry_count;
if (acked) {
- sta->last_rx = jiffies;
+ sta->rx_stats.last_rx = jiffies;
- if (sta->lost_packets)
- sta->lost_packets = 0;
+ if (sta->status_stats.lost_packets)
+ sta->status_stats.lost_packets = 0;
/* Track when last TDLS packet was ACKed */
if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
- sta->last_tdls_pkt_time = jiffies;
+ sta->status_stats.last_tdls_pkt_time = jiffies;
} else {
ieee80211_lost_packet(sta, info);
}
@@ -631,29 +658,83 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
}
if (acked || noack_success) {
- local->dot11TransmittedFrameCount++;
- if (!pubsta)
- local->dot11MulticastTransmittedFrameCount++;
- if (retry_count > 0)
- local->dot11RetryCount++;
- if (retry_count > 1)
- local->dot11MultipleRetryCount++;
+ I802_DEBUG_INC(local->dot11TransmittedFrameCount);
+ if (!pubsta)
+ I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount);
+ if (retry_count > 0)
+ I802_DEBUG_INC(local->dot11RetryCount);
+ if (retry_count > 1)
+ I802_DEBUG_INC(local->dot11MultipleRetryCount);
} else {
- local->dot11FailedCount++;
+ I802_DEBUG_INC(local->dot11FailedCount);
}
}
EXPORT_SYMBOL(ieee80211_tx_status_noskb);
-void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
+void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
+ struct ieee80211_supported_band *sband,
+ int retry_count, int shift, bool send_to_cooked)
{
struct sk_buff *skb2;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_sub_if_data *sdata;
+ struct net_device *prev_dev = NULL;
+ int rtap_len;
+
+ /* send frame to monitor interfaces now */
+ rtap_len = ieee80211_tx_radiotap_len(info);
+ if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) {
+ pr_err("ieee80211_tx_status: headroom too small\n");
+ dev_kfree_skb(skb);
+ return;
+ }
+ ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count,
+ rtap_len, shift);
+
+ /* XXX: is this sufficient for BPF? */
+ skb_set_mac_header(skb, 0);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->pkt_type = PACKET_OTHERHOST;
+ skb->protocol = htons(ETH_P_802_2);
+ memset(skb->cb, 0, sizeof(skb->cb));
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) &&
+ !send_to_cooked)
+ continue;
+
+ if (prev_dev) {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2) {
+ skb2->dev = prev_dev;
+ netif_rx(skb2);
+ }
+ }
+
+ prev_dev = sdata->dev;
+ }
+ }
+ if (prev_dev) {
+ skb->dev = prev_dev;
+ netif_rx(skb);
+ skb = NULL;
+ }
+ rcu_read_unlock();
+ dev_kfree_skb(skb);
+}
+
+void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
__le16 fc;
struct ieee80211_supported_band *sband;
- struct ieee80211_sub_if_data *sdata;
- struct net_device *prev_dev = NULL;
struct sta_info *sta;
struct rhash_head *tmp;
int retry_count;
@@ -661,7 +742,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
bool send_to_cooked;
bool acked;
struct ieee80211_bar *bar;
- int rtap_len;
int shift = 0;
int tid = IEEE80211_NUM_TIDS;
const struct bucket_table *tbl;
@@ -703,10 +783,11 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
ieee80211_get_qos_ctl(hdr),
sta, true, acked);
- if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) &&
+ if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) &&
(ieee80211_is_data(hdr->frame_control)) &&
(rates_idx != -1))
- sta->last_tx_rate = info->status.rates[rates_idx];
+ sta->tx_stats.last_rate =
+ info->status.rates[rates_idx];
if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) &&
(ieee80211_is_data_qos(fc))) {
@@ -752,13 +833,15 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
return;
} else {
if (!acked)
- sta->tx_retry_failed++;
- sta->tx_retry_count += retry_count;
+ sta->status_stats.retry_failed++;
+ sta->status_stats.retry_count += retry_count;
if (ieee80211_is_data_present(fc)) {
if (!acked)
- sta->tx_msdu_failed[tid]++;
- sta->tx_msdu_retries[tid] += retry_count;
+ sta->status_stats.msdu_failed[tid]++;
+
+ sta->status_stats.msdu_retries[tid] +=
+ retry_count;
}
}
@@ -770,25 +853,23 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
ieee80211_frame_acked(sta, skb);
if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) &&
- (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS))
+ ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data,
acked, info->status.tx_time);
- if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
if (info->flags & IEEE80211_TX_STAT_ACK) {
- if (sta->lost_packets)
- sta->lost_packets = 0;
+ if (sta->status_stats.lost_packets)
+ sta->status_stats.lost_packets = 0;
/* Track when last TDLS packet was ACKed */
if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
- sta->last_tdls_pkt_time = jiffies;
+ sta->status_stats.last_tdls_pkt_time =
+ jiffies;
} else {
ieee80211_lost_packet(sta, info);
}
}
-
- if (acked)
- sta->last_ack_signal = info->status.ack_signal;
}
rcu_read_unlock();
@@ -802,13 +883,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
if ((info->flags & IEEE80211_TX_STAT_ACK) ||
(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) {
if (ieee80211_is_first_frag(hdr->seq_ctrl)) {
- local->dot11TransmittedFrameCount++;
+ I802_DEBUG_INC(local->dot11TransmittedFrameCount);
if (is_multicast_ether_addr(ieee80211_get_DA(hdr)))
- local->dot11MulticastTransmittedFrameCount++;
+ I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount);
if (retry_count > 0)
- local->dot11RetryCount++;
+ I802_DEBUG_INC(local->dot11RetryCount);
if (retry_count > 1)
- local->dot11MultipleRetryCount++;
+ I802_DEBUG_INC(local->dot11MultipleRetryCount);
}
/* This counter shall be incremented for an acknowledged MPDU
@@ -818,14 +899,14 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
if (!is_multicast_ether_addr(hdr->addr1) ||
ieee80211_is_data(fc) ||
ieee80211_is_mgmt(fc))
- local->dot11TransmittedFragmentCount++;
+ I802_DEBUG_INC(local->dot11TransmittedFragmentCount);
} else {
if (ieee80211_is_first_frag(hdr->seq_ctrl))
- local->dot11FailedCount++;
+ I802_DEBUG_INC(local->dot11FailedCount);
}
if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) &&
- (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) &&
+ ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) &&
!(info->flags & IEEE80211_TX_CTL_INJECTED) &&
local->ps_sdata && !(local->scanning)) {
if (info->flags & IEEE80211_TX_STAT_ACK) {
@@ -854,51 +935,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
return;
}
- /* send frame to monitor interfaces now */
- rtap_len = ieee80211_tx_radiotap_len(info);
- if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) {
- pr_err("ieee80211_tx_status: headroom too small\n");
- dev_kfree_skb(skb);
- return;
- }
- ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count,
- rtap_len, shift);
-
- /* XXX: is this sufficient for BPF? */
- skb_set_mac_header(skb, 0);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- skb->pkt_type = PACKET_OTHERHOST;
- skb->protocol = htons(ETH_P_802_2);
- memset(skb->cb, 0, sizeof(skb->cb));
-
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
- if (!ieee80211_sdata_running(sdata))
- continue;
-
- if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) &&
- !send_to_cooked)
- continue;
-
- if (prev_dev) {
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2) {
- skb2->dev = prev_dev;
- netif_rx(skb2);
- }
- }
-
- prev_dev = sdata->dev;
- }
- }
- if (prev_dev) {
- skb->dev = prev_dev;
- netif_rx(skb);
- skb = NULL;
- }
- rcu_read_unlock();
- dev_kfree_skb(skb);
+ /* send to monitor interfaces */
+ ieee80211_tx_monitor(local, skb, sband, retry_count, shift, send_to_cooked);
}
EXPORT_SYMBOL(ieee80211_tx_status);
diff --git a/kernel/net/mac80211/tdls.c b/kernel/net/mac80211/tdls.c
index fff0d864a..c9eeb3f12 100644
--- a/kernel/net/mac80211/tdls.c
+++ b/kernel/net/mac80211/tdls.c
@@ -4,6 +4,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2014, Intel Corporation
* Copyright 2014 Intel Mobile Communications GmbH
+ * Copyright 2015 Intel Deutschland GmbH
*
* This file is GPLv2 as found in COPYING.
*/
@@ -11,6 +12,7 @@
#include <linux/ieee80211.h>
#include <linux/log2.h>
#include <net/cfg80211.h>
+#include <linux/rtnetlink.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
@@ -35,20 +37,30 @@ void ieee80211_tdls_peer_del_work(struct work_struct *wk)
mutex_unlock(&local->mtx);
}
-static void ieee80211_tdls_add_ext_capab(struct ieee80211_local *local,
+static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb)
{
- u8 *pos = (void *)skb_put(skb, 7);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
bool chan_switch = local->hw.wiphy->features &
NL80211_FEATURE_TDLS_CHANNEL_SWITCH;
+ bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) &&
+ !ifmgd->tdls_wider_bw_prohibited;
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
+ bool vht = sband && sband->vht_cap.vht_supported;
+ u8 *pos = (void *)skb_put(skb, 10);
*pos++ = WLAN_EID_EXT_CAPABILITY;
- *pos++ = 5; /* len */
+ *pos++ = 8; /* len */
*pos++ = 0x0;
*pos++ = 0x0;
*pos++ = 0x0;
*pos++ = chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0;
*pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED;
+ *pos++ = 0;
+ *pos++ = 0;
+ *pos++ = (vht && wider_band) ? WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED : 0;
}
static u8
@@ -60,6 +72,7 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *ch;
struct cfg80211_chan_def chandef;
int i, subband_start;
+ struct wiphy *wiphy = sdata->local->hw.wiphy;
for (i = start; i <= end; i += spacing) {
if (!ch_cnt)
@@ -70,9 +83,8 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata,
/* we will be active on the channel */
cfg80211_chandef_create(&chandef, ch,
NL80211_CHAN_NO_HT);
- if (cfg80211_reg_can_beacon(sdata->local->hw.wiphy,
- &chandef,
- sdata->wdev.iftype)) {
+ if (cfg80211_reg_can_beacon_relax(wiphy, &chandef,
+ sdata->wdev.iftype)) {
ch_cnt++;
/*
* check if the next channel is also part of
@@ -167,23 +179,16 @@ static void ieee80211_tdls_add_bss_coex_ie(struct sk_buff *skb)
static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata,
u16 status_code)
{
- struct ieee80211_local *local = sdata->local;
- u16 capab;
-
/* The capability will be 0 when sending a failure code */
if (status_code != 0)
return 0;
- capab = 0;
- if (ieee80211_get_sdata_band(sdata) != IEEE80211_BAND_2GHZ)
- return capab;
-
- if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
- capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME;
- if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE))
- capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
+ if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_2GHZ) {
+ return WLAN_CAPABILITY_SHORT_SLOT_TIME |
+ WLAN_CAPABILITY_SHORT_PREAMBLE;
+ }
- return capab;
+ return 0;
}
static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata,
@@ -291,6 +296,60 @@ static void ieee80211_tdls_add_wmm_param_ie(struct ieee80211_sub_if_data *sdata,
}
static void
+ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta)
+{
+ /* IEEE802.11ac-2013 Table E-4 */
+ u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 };
+ struct cfg80211_chan_def uc = sta->tdls_chandef;
+ enum nl80211_chan_width max_width = ieee80211_get_sta_bw(&sta->sta);
+ int i;
+
+	/* only support upgrading non-narrow channels up to 80MHz */
+ if (max_width == NL80211_CHAN_WIDTH_5 ||
+ max_width == NL80211_CHAN_WIDTH_10)
+ return;
+
+ if (max_width > NL80211_CHAN_WIDTH_80)
+ max_width = NL80211_CHAN_WIDTH_80;
+
+ if (uc.width == max_width)
+ return;
+ /*
+	 * Channel usage constraints in the IEEE802.11ac-2013 specification only
+ * allow expanding a 20MHz channel to 80MHz in a single way. In
+ * addition, there are no 40MHz allowed channels that are not part of
+ * the allowed 80MHz range in the 5GHz spectrum (the relevant one here).
+ */
+ for (i = 0; i < ARRAY_SIZE(centers_80mhz); i++)
+ if (abs(uc.chan->center_freq - centers_80mhz[i]) <= 30) {
+ uc.center_freq1 = centers_80mhz[i];
+ uc.width = NL80211_CHAN_WIDTH_80;
+ break;
+ }
+
+ if (!uc.center_freq1)
+ return;
+
+ /* proceed to downgrade the chandef until usable or the same */
+ while (uc.width > max_width &&
+ !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc,
+ sdata->wdev.iftype))
+ ieee80211_chandef_downgrade(&uc);
+
+ if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) {
+ tdls_dbg(sdata, "TDLS ch width upgraded %d -> %d\n",
+ sta->tdls_chandef.width, uc.width);
+
+ /*
+		 * the station is not yet authorized when the BW upgrade is done,
+		 * so locking is not required
+ */
+ sta->tdls_chandef = uc;
+ }
+}
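
As an aside, the abs(center_freq - center) <= 30 MHz test above simply picks the
IEEE 802.11ac Table E-4 segment that contains the 20 MHz control channel. A
minimal userspace sketch of that lookup (not part of the patch; the helper name
is illustrative, the center frequencies are copied from the table used above):

	#include <stdio.h>
	#include <stdlib.h>

	static const int centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 };

	/* return the containing 80 MHz segment center, or 0 if there is none */
	static int pick_80mhz_center(int control_freq)
	{
		size_t i;

		for (i = 0; i < sizeof(centers_80mhz) / sizeof(centers_80mhz[0]); i++)
			if (abs(control_freq - centers_80mhz[i]) <= 30)
				return centers_80mhz[i];
		return 0;
	}

	int main(void)
	{
		/* channel 36 (5180 MHz) lies in the block centered at 5210 MHz */
		printf("%d\n", pick_80mhz_center(5180));
		return 0;
	}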
+
+static void
ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, const u8 *peer,
u8 action_code, bool initiator,
@@ -327,7 +386,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
offset = noffset;
}
- ieee80211_tdls_add_ext_capab(local, skb);
+ ieee80211_tdls_add_ext_capab(sdata, skb);
/* add the QoS element if we support it */
if (local->hw.queues >= IEEE80211_NUM_ACS &&
@@ -357,15 +416,17 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
offset = noffset;
}
- rcu_read_lock();
+ mutex_lock(&local->sta_mtx);
/* we should have the peer STA if we're already responding */
if (action_code == WLAN_TDLS_SETUP_RESPONSE) {
sta = sta_info_get(sdata, peer);
if (WARN_ON_ONCE(!sta)) {
- rcu_read_unlock();
+ mutex_unlock(&local->sta_mtx);
return;
}
+
+ sta->tdls_chandef = sdata->vif.bss_conf.chandef;
}
ieee80211_tdls_add_oper_classes(sdata, skb);
@@ -391,10 +452,6 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap);
} else if (action_code == WLAN_TDLS_SETUP_RESPONSE &&
ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) {
- /* disable SMPS in TDLS responder */
- sta->sta.ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED
- << IEEE80211_HT_CAP_SM_PS_SHIFT;
-
/* the peer caps are already intersected with our own */
memcpy(&ht_cap, &sta->sta.ht_cap, sizeof(ht_cap));
@@ -455,9 +512,16 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2);
ieee80211_ie_build_vht_cap(pos, &vht_cap, vht_cap.cap);
+
+ /*
+ * if both peers support WIDER_BW, we can expand the chandef to
+ * a wider compatible one, up to 80MHz
+ */
+ if (test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW))
+ ieee80211_tdls_chandef_vht_upgrade(sdata, sta);
}
- rcu_read_unlock();
+ mutex_unlock(&local->sta_mtx);
/* add any remaining IEs */
if (extra_ies_len) {
@@ -481,15 +545,17 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata,
enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
u8 *pos;
- rcu_read_lock();
+ mutex_lock(&local->sta_mtx);
sta = sta_info_get(sdata, peer);
ap_sta = sta_info_get(sdata, ifmgd->bssid);
if (WARN_ON_ONCE(!sta || !ap_sta)) {
- rcu_read_unlock();
+ mutex_unlock(&local->sta_mtx);
return;
}
+ sta->tdls_chandef = sdata->vif.bss_conf.chandef;
+
/* add any custom IEs that go before the QoS IE */
if (extra_ies_len) {
static const u8 before_qos[] = {
@@ -525,35 +591,38 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata,
offset = noffset;
}
- /* if HT support is only added in TDLS, we need an HT-operation IE */
+ /*
+ * if HT support is only added in TDLS, we need an HT-operation IE.
+ * add the IE as required by IEEE802.11-2012 9.23.3.2.
+ */
if (!ap_sta->sta.ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) {
- struct ieee80211_chanctx_conf *chanctx_conf =
- rcu_dereference(sdata->vif.chanctx_conf);
- if (!WARN_ON(!chanctx_conf)) {
- pos = skb_put(skb, 2 +
- sizeof(struct ieee80211_ht_operation));
- /* send an empty HT operation IE */
- ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap,
- &chanctx_conf->def, 0);
- }
+ u16 prot = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED |
+ IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT |
+ IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT;
+
+ pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation));
+ ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap,
+ &sdata->vif.bss_conf.chandef, prot,
+ true);
}
ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator);
/* only include VHT-operation if not on the 2.4GHz band */
- if (band != IEEE80211_BAND_2GHZ && !ap_sta->sta.vht_cap.vht_supported &&
- sta->sta.vht_cap.vht_supported) {
- struct ieee80211_chanctx_conf *chanctx_conf =
- rcu_dereference(sdata->vif.chanctx_conf);
- if (!WARN_ON(!chanctx_conf)) {
- pos = skb_put(skb, 2 +
- sizeof(struct ieee80211_vht_operation));
- ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap,
- &chanctx_conf->def);
- }
+ if (band != IEEE80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) {
+ /*
+ * if both peers support WIDER_BW, we can expand the chandef to
+ * a wider compatible one, up to 80MHz
+ */
+ if (test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW))
+ ieee80211_tdls_chandef_vht_upgrade(sdata, sta);
+
+ pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation));
+ ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap,
+ &sta->tdls_chandef);
}
- rcu_read_unlock();
+ mutex_unlock(&local->sta_mtx);
/* add any remaining IEs */
if (extra_ies_len) {
@@ -802,7 +871,7 @@ ieee80211_tdls_build_mgmt_packet_data(struct ieee80211_sub_if_data *sdata,
max(sizeof(struct ieee80211_mgmt),
sizeof(struct ieee80211_tdls_data)) +
50 + /* supported rates */
- 7 + /* ext capab */
+ 10 + /* ext capab */
26 + /* max(WMM-info, WMM-param) */
2 + max(sizeof(struct ieee80211_ht_cap),
sizeof(struct ieee80211_ht_operation)) +
@@ -953,7 +1022,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev,
* packet through the AP.
*/
if ((action_code == WLAN_TDLS_TEARDOWN) &&
- (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) {
+ ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) {
bool try_resend; /* Should we keep skb for possible resend */
/* If not sending directly to peer - no point in keeping skb */
@@ -1001,8 +1070,17 @@ ieee80211_tdls_mgmt_setup(struct wiphy *wiphy, struct net_device *dev,
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_local *local = sdata->local;
+ enum ieee80211_smps_mode smps_mode = sdata->u.mgd.driver_smps_mode;
int ret;
+ /* don't support setup with forced SMPS mode that's not off */
+ if (smps_mode != IEEE80211_SMPS_AUTOMATIC &&
+ smps_mode != IEEE80211_SMPS_OFF) {
+ tdls_dbg(sdata, "Aborting TDLS setup due to SMPS mode %d\n",
+ smps_mode);
+ return -ENOTSUPP;
+ }
+
mutex_lock(&local->mtx);
/* we don't support concurrent TDLS peer setups */
@@ -1164,6 +1242,74 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev,
return ret;
}
+static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *conf;
+ struct ieee80211_chanctx *ctx;
+
+ mutex_lock(&local->chanctx_mtx);
+ conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+ if (conf) {
+ ctx = container_of(conf, struct ieee80211_chanctx, conf);
+ ieee80211_recalc_chanctx_chantype(local, ctx);
+ }
+ mutex_unlock(&local->chanctx_mtx);
+}
+
+static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata)
+{
+ struct sta_info *sta;
+ bool result = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
+ if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded ||
+ !test_sta_flag(sta, WLAN_STA_AUTHORIZED) ||
+ !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) ||
+ !sta->sta.ht_cap.ht_supported)
+ continue;
+ result = true;
+ break;
+ }
+ rcu_read_unlock();
+
+ return result;
+}
+
+static void
+iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ bool tdls_ht;
+ u16 protection = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED |
+ IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT |
+ IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT;
+ u16 opmode;
+
+ /* Nothing to do if the BSS connection uses HT */
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
+ return;
+
+ tdls_ht = (sta && sta->sta.ht_cap.ht_supported) ||
+ iee80211_tdls_have_ht_peers(sdata);
+
+ opmode = sdata->vif.bss_conf.ht_operation_mode;
+
+ if (tdls_ht)
+ opmode |= protection;
+ else
+ opmode &= ~protection;
+
+ if (opmode == sdata->vif.bss_conf.ht_operation_mode)
+ return;
+
+ sdata->vif.bss_conf.ht_operation_mode = opmode;
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT);
+}
+
int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
const u8 *peer, enum nl80211_tdls_operation oper)
{
@@ -1189,21 +1335,35 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
return -ENOTSUPP;
}
+ /* protect possible bss_conf changes and avoid concurrency in
+ * ieee80211_bss_info_change_notify()
+ */
+ sdata_lock(sdata);
mutex_lock(&local->mtx);
tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer);
switch (oper) {
case NL80211_TDLS_ENABLE_LINK:
- rcu_read_lock();
+ if (sdata->vif.csa_active) {
+ tdls_dbg(sdata, "TDLS: disallow link during CSA\n");
+ ret = -EBUSY;
+ break;
+ }
+
+ iee80211_tdls_recalc_chanctx(sdata);
+
+ mutex_lock(&local->sta_mtx);
sta = sta_info_get(sdata, peer);
if (!sta) {
- rcu_read_unlock();
+ mutex_unlock(&local->sta_mtx);
ret = -ENOLINK;
break;
}
+ iee80211_tdls_recalc_ht_protection(sdata, sta);
+
set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH);
- rcu_read_unlock();
+ mutex_unlock(&local->sta_mtx);
WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) ||
!ether_addr_equal(sdata->u.mgd.tdls_peer, peer));
@@ -1225,6 +1385,12 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
ieee80211_flush_queues(local, sdata, false);
ret = sta_info_destroy_addr(sdata, peer);
+
+ mutex_lock(&local->sta_mtx);
+ iee80211_tdls_recalc_ht_protection(sdata, NULL);
+ mutex_unlock(&local->sta_mtx);
+
+ iee80211_tdls_recalc_chanctx(sdata);
break;
default:
ret = -ENOTSUPP;
@@ -1236,7 +1402,12 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
eth_zero_addr(sdata->u.mgd.tdls_peer);
}
+ if (ret == 0)
+ ieee80211_queue_work(&sdata->local->hw,
+ &sdata->u.mgd.request_smps_work);
+
mutex_unlock(&local->mtx);
+ sdata_unlock(sdata);
return ret;
}
@@ -1639,6 +1810,31 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata,
return -EINVAL;
}
+ if (!elems.sec_chan_offs) {
+ chan_type = NL80211_CHAN_HT20;
+ } else {
+ switch (elems.sec_chan_offs->sec_chan_offs) {
+ case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
+ chan_type = NL80211_CHAN_HT40PLUS;
+ break;
+ case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
+ chan_type = NL80211_CHAN_HT40MINUS;
+ break;
+ default:
+ chan_type = NL80211_CHAN_HT20;
+ break;
+ }
+ }
+
+ cfg80211_chandef_create(&chandef, chan, chan_type);
+
+ /* we will be active on the TDLS link */
+ if (!cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &chandef,
+ sdata->wdev.iftype)) {
+ tdls_dbg(sdata, "TDLS chan switch to forbidden channel\n");
+ return -EINVAL;
+ }
+
mutex_lock(&local->sta_mtx);
sta = sta_info_get(sdata, tf->sa);
if (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) {
@@ -1659,27 +1855,15 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata,
goto out;
}
- if (!sta->sta.ht_cap.ht_supported) {
- chan_type = NL80211_CHAN_NO_HT;
- } else if (!elems.sec_chan_offs) {
- chan_type = NL80211_CHAN_HT20;
- } else {
- switch (elems.sec_chan_offs->sec_chan_offs) {
- case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
- chan_type = NL80211_CHAN_HT40PLUS;
- break;
- case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
- chan_type = NL80211_CHAN_HT40MINUS;
- break;
- default:
- chan_type = NL80211_CHAN_HT20;
- break;
- }
+ /* peer should have known better */
+ if (!sta->sta.ht_cap.ht_supported && elems.sec_chan_offs &&
+ elems.sec_chan_offs->sec_chan_offs) {
+ tdls_dbg(sdata, "TDLS chan switch - wide chan unsupported\n");
+ ret = -ENOTSUPP;
+ goto out;
}
- cfg80211_chandef_create(&chandef, chan, chan_type);
params.chandef = &chandef;
-
params.switch_time = le16_to_cpu(elems.ch_sw_timing->switch_time);
params.switch_timeout = le16_to_cpu(elems.ch_sw_timing->switch_timeout);
@@ -1703,12 +1887,15 @@ out:
return ret;
}
-void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata,
- struct sk_buff *skb)
+static void
+ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
{
struct ieee80211_tdls_data *tf = (void *)skb->data;
struct wiphy *wiphy = sdata->local->hw.wiphy;
+ ASSERT_RTNL();
+
/* make sure the driver supports it */
if (!(wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH))
return;
@@ -1732,3 +1919,47 @@ void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata,
return;
}
}
+
+void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata)
+{
+ struct sta_info *sta;
+ u16 reason = WLAN_REASON_TDLS_TEARDOWN_UNSPECIFIED;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
+ if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded ||
+ !test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ continue;
+
+ ieee80211_tdls_oper_request(&sdata->vif, sta->sta.addr,
+ NL80211_TDLS_TEARDOWN, reason,
+ GFP_ATOMIC);
+ }
+ rcu_read_unlock();
+}
+
+void ieee80211_tdls_chsw_work(struct work_struct *wk)
+{
+ struct ieee80211_local *local =
+ container_of(wk, struct ieee80211_local, tdls_chsw_work);
+ struct ieee80211_sub_if_data *sdata;
+ struct sk_buff *skb;
+ struct ieee80211_tdls_data *tf;
+
+ rtnl_lock();
+ while ((skb = skb_dequeue(&local->skb_queue_tdls_chsw))) {
+ tf = (struct ieee80211_tdls_data *)skb->data;
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata) ||
+ sdata->vif.type != NL80211_IFTYPE_STATION ||
+ !ether_addr_equal(tf->da, sdata->vif.addr))
+ continue;
+
+ ieee80211_process_tdls_channel_switch(sdata, skb);
+ break;
+ }
+
+ kfree_skb(skb);
+ }
+ rtnl_unlock();
+}
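
For reference, the secondary-channel-offset handling parsed in
ieee80211_process_tdls_channel_switch_req() above maps the 2-bit field from the
HT information element to a channel type. A small sketch of that mapping (not
part of the patch; the field values 1 = above and 3 = below follow IEEE 802.11,
the enum and function names are illustrative):

	#include <stdio.h>

	enum chan_type { CHAN_HT20, CHAN_HT40PLUS, CHAN_HT40MINUS };

	/* secondary channel offset: 1 = above the primary, 3 = below, else HT20 */
	static enum chan_type sec_offs_to_chan_type(unsigned int sec_chan_offs)
	{
		switch (sec_chan_offs) {
		case 1:
			return CHAN_HT40PLUS;
		case 3:
			return CHAN_HT40MINUS;
		default:
			return CHAN_HT20;
		}
	}

	int main(void)
	{
		printf("%d %d %d\n", sec_offs_to_chan_type(1),
		       sec_offs_to_chan_type(3), sec_offs_to_chan_type(0));
		return 0;
	}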
diff --git a/kernel/net/mac80211/trace.h b/kernel/net/mac80211/trace.h
index 4c2e76902..56c6d6cfa 100644
--- a/kernel/net/mac80211/trace.h
+++ b/kernel/net/mac80211/trace.h
@@ -33,11 +33,11 @@
__field(u32, chan_width) \
__field(u32, center_freq1) \
__field(u32, center_freq2)
-#define CHANDEF_ASSIGN(c) \
- __entry->control_freq = (c)->chan ? (c)->chan->center_freq : 0; \
- __entry->chan_width = (c)->width; \
- __entry->center_freq1 = (c)->center_freq1; \
- __entry->center_freq2 = (c)->center_freq2;
+#define CHANDEF_ASSIGN(c) \
+ __entry->control_freq = (c) ? ((c)->chan ? (c)->chan->center_freq : 0) : 0; \
+ __entry->chan_width = (c) ? (c)->width : 0; \
+ __entry->center_freq1 = (c) ? (c)->center_freq1 : 0; \
+ __entry->center_freq2 = (c) ? (c)->center_freq2 : 0;
#define CHANDEF_PR_FMT " control:%d MHz width:%d center: %d/%d MHz"
#define CHANDEF_PR_ARG __entry->control_freq, __entry->chan_width, \
__entry->center_freq1, __entry->center_freq2
@@ -69,6 +69,17 @@
#define CHANCTX_PR_ARG CHANDEF_PR_ARG, MIN_CHANDEF_PR_ARG, \
__entry->rx_chains_static, __entry->rx_chains_dynamic
+#define KEY_ENTRY __field(u32, cipher) \
+ __field(u8, hw_key_idx) \
+ __field(u8, flags) \
+ __field(s8, keyidx)
+#define KEY_ASSIGN(k) __entry->cipher = (k)->cipher; \
+ __entry->flags = (k)->flags; \
+ __entry->keyidx = (k)->keyidx; \
+ __entry->hw_key_idx = (k)->hw_key_idx;
+#define KEY_PR_FMT " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d"
+#define KEY_PR_ARG __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx
+
/*
@@ -314,7 +325,6 @@ TRACE_EVENT(drv_config,
__field(u32, flags)
__field(int, power_level)
__field(int, dynamic_ps_timeout)
- __field(int, max_sleep_period)
__field(u16, listen_interval)
__field(u8, long_frame_max_tx_count)
__field(u8, short_frame_max_tx_count)
@@ -328,7 +338,6 @@ TRACE_EVENT(drv_config,
__entry->flags = local->hw.conf.flags;
__entry->power_level = local->hw.conf.power_level;
__entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout;
- __entry->max_sleep_period = local->hw.conf.max_sleep_period;
__entry->listen_interval = local->hw.conf.listen_interval;
__entry->long_frame_max_tx_count =
local->hw.conf.long_frame_max_tx_count;
@@ -486,6 +495,36 @@ TRACE_EVENT(drv_configure_filter,
)
);
+TRACE_EVENT(drv_config_iface_filter,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ unsigned int filter_flags,
+ unsigned int changed_flags),
+
+ TP_ARGS(local, sdata, filter_flags, changed_flags),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(unsigned int, filter_flags)
+ __field(unsigned int, changed_flags)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->filter_flags = filter_flags;
+ __entry->changed_flags = changed_flags;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT
+ " filter_flags: %#x changed_flags: %#x",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->filter_flags,
+ __entry->changed_flags
+ )
+);
+
TRACE_EVENT(drv_set_tim,
TP_PROTO(struct ieee80211_local *local,
struct ieee80211_sta *sta, bool set),
@@ -522,25 +561,19 @@ TRACE_EVENT(drv_set_key,
LOCAL_ENTRY
VIF_ENTRY
STA_ENTRY
- __field(u32, cipher)
- __field(u8, hw_key_idx)
- __field(u8, flags)
- __field(s8, keyidx)
+ KEY_ENTRY
),
TP_fast_assign(
LOCAL_ASSIGN;
VIF_ASSIGN;
STA_ASSIGN;
- __entry->cipher = key->cipher;
- __entry->flags = key->flags;
- __entry->keyidx = key->keyidx;
- __entry->hw_key_idx = key->hw_key_idx;
+ KEY_ASSIGN(key);
),
TP_printk(
- LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT,
- LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT KEY_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, KEY_PR_ARG
)
);
@@ -656,28 +689,25 @@ TRACE_EVENT(drv_get_stats,
)
);
-TRACE_EVENT(drv_get_tkip_seq,
+TRACE_EVENT(drv_get_key_seq,
TP_PROTO(struct ieee80211_local *local,
- u8 hw_key_idx, u32 *iv32, u16 *iv16),
+ struct ieee80211_key_conf *key),
- TP_ARGS(local, hw_key_idx, iv32, iv16),
+ TP_ARGS(local, key),
TP_STRUCT__entry(
LOCAL_ENTRY
- __field(u8, hw_key_idx)
- __field(u32, iv32)
- __field(u16, iv16)
+ KEY_ENTRY
),
TP_fast_assign(
LOCAL_ASSIGN;
- __entry->hw_key_idx = hw_key_idx;
- __entry->iv32 = *iv32;
- __entry->iv16 = *iv16;
+ KEY_ASSIGN(key);
),
TP_printk(
- LOCAL_PR_FMT, LOCAL_PR_ARG
+ LOCAL_PR_FMT KEY_PR_FMT,
+ LOCAL_PR_ARG, KEY_PR_ARG
)
);
@@ -942,9 +972,9 @@ TRACE_EVENT(drv_ampdu_action,
struct ieee80211_sub_if_data *sdata,
enum ieee80211_ampdu_mlme_action action,
struct ieee80211_sta *sta, u16 tid,
- u16 *ssn, u8 buf_size),
+ u16 *ssn, u8 buf_size, bool amsdu),
- TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size),
+ TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size, amsdu),
TP_STRUCT__entry(
LOCAL_ENTRY
@@ -953,6 +983,7 @@ TRACE_EVENT(drv_ampdu_action,
__field(u16, tid)
__field(u16, ssn)
__field(u8, buf_size)
+ __field(bool, amsdu)
VIF_ENTRY
),
@@ -964,12 +995,13 @@ TRACE_EVENT(drv_ampdu_action,
__entry->tid = tid;
__entry->ssn = ssn ? *ssn : 0;
__entry->buf_size = buf_size;
+ __entry->amsdu = amsdu;
),
TP_printk(
- LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d",
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d amsdu:%d",
LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action,
- __entry->tid, __entry->buf_size
+ __entry->tid, __entry->buf_size, __entry->amsdu
)
);
diff --git a/kernel/net/mac80211/tx.c b/kernel/net/mac80211/tx.c
index 5787f15a3..bdc224d50 100644
--- a/kernel/net/mac80211/tx.c
+++ b/kernel/net/mac80211/tx.c
@@ -37,6 +37,16 @@
/* misc utils */
+static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
+{
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_packets++;
+ tstats->tx_bytes += len;
+ u64_stats_update_end(&tstats->syncp);
+}
+
static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
struct sk_buff *skb, int group_addr,
int next_frag_len)
@@ -201,11 +211,11 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx)
struct ieee80211_if_managed *ifmgd;
/* driver doesn't support power save */
- if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS))
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS))
return TX_CONTINUE;
/* hardware does dynamic power save */
- if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)
+ if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
return TX_CONTINUE;
/* dynamic power save disabled */
@@ -418,7 +428,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx)
if (ieee80211_is_probe_req(hdr->frame_control))
return TX_CONTINUE;
- if (tx->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
+ if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL))
info->hw_queue = tx->sdata->vif.cab_queue;
/* no stations in PS mode */
@@ -428,7 +438,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx)
info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM;
/* device releases frame after DTIM beacon */
- if (!(tx->local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING))
+ if (!ieee80211_hw_check(&tx->local->hw, HOST_BROADCAST_PS_BUFFERING))
return TX_CONTINUE;
/* buffered in mac80211 */
@@ -597,7 +607,6 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
if (tx->key) {
bool skip_hw = false;
- tx->key->tx_rx_count++;
/* TODO: add threshold stuff again */
switch (tx->key->conf.cipher) {
@@ -677,7 +686,8 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP ||
tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT ||
- tx->sdata->vif.type == NL80211_IFTYPE_ADHOC);
+ tx->sdata->vif.type == NL80211_IFTYPE_ADHOC ||
+ tx->sdata->vif.type == NL80211_IFTYPE_OCB);
/* set up RTS protection if desired */
if (len > tx->local->hw.wiphy->rts_threshold) {
@@ -747,9 +757,9 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
if (txrc.reported_rate.idx < 0) {
txrc.reported_rate = tx->rate;
if (tx->sta && ieee80211_is_data(hdr->frame_control))
- tx->sta->last_tx_rate = txrc.reported_rate;
+ tx->sta->tx_stats.last_rate = txrc.reported_rate;
} else if (tx->sta)
- tx->sta->last_tx_rate = txrc.reported_rate;
+ tx->sta->tx_stats.last_rate = txrc.reported_rate;
if (ratetbl)
return TX_CONTINUE;
@@ -814,7 +824,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
tx->sdata->sequence_number += 0x10;
if (tx->sta)
- tx->sta->tx_msdu[IEEE80211_NUM_TIDS]++;
+ tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
return TX_CONTINUE;
}
@@ -830,7 +840,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
qc = ieee80211_get_qos_ctl(hdr);
tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
- tx->sta->tx_msdu[tid]++;
+ tx->sta->tx_stats.msdu[tid]++;
if (!tx->sta->sta.txq[0])
hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
@@ -984,11 +994,10 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx)
skb_queue_walk(&tx->skbs, skb) {
ac = skb_get_queue_mapping(skb);
- tx->sta->tx_fragments++;
- tx->sta->tx_bytes[ac] += skb->len;
+ tx->sta->tx_stats.bytes[ac] += skb->len;
}
if (ac >= 0)
- tx->sta->tx_packets[ac]++;
+ tx->sta->tx_stats.packets[ac]++;
return TX_CONTINUE;
}
@@ -1105,7 +1114,9 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
queued = true;
info->control.vif = &tx->sdata->vif;
info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
- info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
+ info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS |
+ IEEE80211_TX_CTL_NO_PS_BUFFER |
+ IEEE80211_TX_STATUS_EOSP;
__skb_queue_tail(&tid_tx->pending, skb);
if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER)
purge_skb = __skb_dequeue(&tid_tx->pending);
@@ -1173,8 +1184,8 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) &&
!ieee80211_is_qos_nullfunc(hdr->frame_control) &&
- (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) &&
- !(local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)) {
+ ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) &&
+ !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) {
struct tid_ampdu_tx *tid_tx;
qc = ieee80211_get_qos_ctl(hdr);
@@ -1207,8 +1218,10 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
if (!tx->sta)
info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT;
- else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT))
+ else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) {
info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT;
+ ieee80211_check_fast_xmit(tx->sta);
+ }
info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT;
@@ -1417,7 +1430,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
vif = &sdata->vif;
info->hw_queue =
vif->hw_queue[skb_get_queue_mapping(skb)];
- } else if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) {
+ } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) {
dev_kfree_skb(skb);
return true;
} else
@@ -1463,7 +1476,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
CALL_TXH(ieee80211_tx_h_ps_buf);
CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
CALL_TXH(ieee80211_tx_h_select_key);
- if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL))
+ if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
CALL_TXH(ieee80211_tx_h_rate_ctrl);
if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
@@ -1478,7 +1491,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
/* handlers after fragment must be aware of tx info fragmentation! */
CALL_TXH(ieee80211_tx_h_stats);
CALL_TXH(ieee80211_tx_h_encrypt);
- if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL))
+ if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
CALL_TXH(ieee80211_tx_h_calculate_duration);
#undef CALL_TXH
@@ -1568,7 +1581,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
/* set up hw_queue value early */
if (!(info->flags & IEEE80211_TX_CTL_TX_OFFCHAN) ||
- !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL))
+ !ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
info->hw_queue =
sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
@@ -1595,9 +1608,9 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata,
}
if (skb_cloned(skb) &&
- (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) ||
+ (!ieee80211_hw_check(&local->hw, SUPPORTS_CLONED_SKBS) ||
!skb_clone_writable(skb, ETH_HLEN) ||
- sdata->crypto_tx_tailroom_needed_cnt))
+ (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt)))
I802_DEBUG_INC(local->tx_expand_skb_head_cloned);
else if (head_need || tail_need)
I802_DEBUG_INC(local->tx_expand_skb_head);
@@ -2384,12 +2397,461 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
return ERR_PTR(ret);
}
+/*
+ * fast-xmit overview
+ *
+ * The core idea of this fast-xmit is to remove per-packet checks by checking
+ * them out of band. ieee80211_check_fast_xmit() implements the out-of-band
+ * checks that are needed to get the sta->fast_tx pointer assigned, after which
+ * much less work can be done per packet. For example, fragmentation must be
+ * disabled or the fast_tx pointer will not be set. All the conditions are seen
+ * in the code here.
+ *
+ * Once assigned, the fast_tx data structure also caches the per-packet 802.11
+ * header and other data to aid packet processing in ieee80211_xmit_fast().
+ *
+ * The most difficult part of this is that when any of these assumptions
+ * change, an external trigger (i.e. a call to ieee80211_clear_fast_xmit(),
+ * ieee80211_check_fast_xmit() or friends) is required to reset the data,
+ * since the per-packet code no longer checks the conditions. This is reflected
+ * by the calls to these functions throughout the rest of the code, and must be
+ * maintained if any of the TX path checks change.
+ */
+
+void ieee80211_check_fast_xmit(struct sta_info *sta)
+{
+ struct ieee80211_fast_tx build = {}, *fast_tx = NULL, *old;
+ struct ieee80211_local *local = sta->local;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_hdr *hdr = (void *)build.hdr;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ __le16 fc;
+
+ if (!ieee80211_hw_check(&local->hw, SUPPORT_FAST_XMIT))
+ return;
+
+ /* Locking here protects both the pointer itself, and against concurrent
+ * invocations winning data access races to, e.g., the key pointer that
+ * is used.
+ * Without it, the invocation of this function right after the key
+ * pointer changes wouldn't be sufficient, as another CPU could access
+ * the pointer, then stall, and then do the cache update after the CPU
+ * that invalidated the key.
+ * With the locking, such scenarios cannot happen as the check for the
+ * key and the fast-tx assignment are done atomically, so the CPU that
+ * modifies the key will either wait or other one will see the key
+ * cleared/changed already.
+ */
+ spin_lock_bh(&sta->lock);
+ if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) &&
+ !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) &&
+ sdata->vif.type == NL80211_IFTYPE_STATION)
+ goto out;
+
+ if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ goto out;
+
+ if (test_sta_flag(sta, WLAN_STA_PS_STA) ||
+ test_sta_flag(sta, WLAN_STA_PS_DRIVER) ||
+ test_sta_flag(sta, WLAN_STA_PS_DELIVER) ||
+ test_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT))
+ goto out;
+
+ if (sdata->noack_map)
+ goto out;
+
+ /* fast-xmit doesn't handle fragmentation at all */
+ if (local->hw.wiphy->frag_threshold != (u32)-1 &&
+ !local->ops->set_frag_threshold)
+ goto out;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ rcu_read_unlock();
+ goto out;
+ }
+ build.band = chanctx_conf->def.chan->band;
+ rcu_read_unlock();
+
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_ADHOC:
+ /* DA SA BSSID */
+ build.da_offs = offsetof(struct ieee80211_hdr, addr1);
+ build.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+ memcpy(hdr->addr3, sdata->u.ibss.bssid, ETH_ALEN);
+ build.hdr_len = 24;
+ break;
+ case NL80211_IFTYPE_STATION:
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+ /* DA SA BSSID */
+ build.da_offs = offsetof(struct ieee80211_hdr, addr1);
+ build.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+ memcpy(hdr->addr3, sdata->u.mgd.bssid, ETH_ALEN);
+ build.hdr_len = 24;
+ break;
+ }
+
+ if (sdata->u.mgd.use_4addr) {
+ /* non-regular ethertype cannot use the fastpath */
+ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS |
+ IEEE80211_FCTL_TODS);
+ /* RA TA DA SA */
+ memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN);
+ memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+ build.da_offs = offsetof(struct ieee80211_hdr, addr3);
+ build.sa_offs = offsetof(struct ieee80211_hdr, addr4);
+ build.hdr_len = 30;
+ break;
+ }
+ fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
+ /* BSSID SA DA */
+ memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN);
+ build.da_offs = offsetof(struct ieee80211_hdr, addr3);
+ build.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+ build.hdr_len = 24;
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ if (sdata->wdev.use_4addr) {
+ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS |
+ IEEE80211_FCTL_TODS);
+ /* RA TA DA SA */
+ memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN);
+ memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+ build.da_offs = offsetof(struct ieee80211_hdr, addr3);
+ build.sa_offs = offsetof(struct ieee80211_hdr, addr4);
+ build.hdr_len = 30;
+ break;
+ }
+ /* fall through */
+ case NL80211_IFTYPE_AP:
+ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+ /* DA BSSID SA */
+ build.da_offs = offsetof(struct ieee80211_hdr, addr1);
+ memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+ build.sa_offs = offsetof(struct ieee80211_hdr, addr3);
+ build.hdr_len = 24;
+ break;
+ default:
+ /* not handled on fast-xmit */
+ goto out;
+ }
+
+ if (sta->sta.wme) {
+ build.hdr_len += 2;
+ fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA);
+ }
+
+ /* We store the key here so there's no point in using rcu_dereference()
+ * but that's fine because the code that changes the pointers will call
+ * this function after doing so. For a single CPU that would be enough,
+ * for multiple see the comment above.
+ */
+ build.key = rcu_access_pointer(sta->ptk[sta->ptk_idx]);
+ if (!build.key)
+ build.key = rcu_access_pointer(sdata->default_unicast_key);
+ if (build.key) {
+ bool gen_iv, iv_spc, mmic;
+
+ gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV;
+ iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE;
+ mmic = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC;
+
+ /* don't handle software crypto */
+ if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+ goto out;
+
+ switch (build.key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ /* add fixed key ID */
+ if (gen_iv) {
+ (build.hdr + build.hdr_len)[3] =
+ 0x20 | (build.key->conf.keyidx << 6);
+ build.pn_offs = build.hdr_len;
+ }
+ if (gen_iv || iv_spc)
+ build.hdr_len += IEEE80211_CCMP_HDR_LEN;
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ /* add fixed key ID */
+ if (gen_iv) {
+ (build.hdr + build.hdr_len)[3] =
+ 0x20 | (build.key->conf.keyidx << 6);
+ build.pn_offs = build.hdr_len;
+ }
+ if (gen_iv || iv_spc)
+ build.hdr_len += IEEE80211_GCMP_HDR_LEN;
+ break;
+ case WLAN_CIPHER_SUITE_TKIP:
+ /* cannot handle MMIC or IV generation in xmit-fast */
+ if (mmic || gen_iv)
+ goto out;
+ if (iv_spc)
+ build.hdr_len += IEEE80211_TKIP_IV_LEN;
+ break;
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ /* cannot handle IV generation in fast-xmit */
+ if (gen_iv)
+ goto out;
+ if (iv_spc)
+ build.hdr_len += IEEE80211_WEP_IV_LEN;
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ WARN(1,
+ "management cipher suite 0x%x enabled for data\n",
+ build.key->conf.cipher);
+ goto out;
+ default:
+ /* we don't know how to generate IVs for this at all */
+ if (WARN_ON(gen_iv))
+ goto out;
+ /* pure hardware keys are OK, of course */
+ if (!(build.key->flags & KEY_FLAG_CIPHER_SCHEME))
+ break;
+ /* cipher scheme might require space allocation */
+ if (iv_spc &&
+ build.key->conf.iv_len > IEEE80211_FAST_XMIT_MAX_IV)
+ goto out;
+ if (iv_spc)
+ build.hdr_len += build.key->conf.iv_len;
+ }
+
+ fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+ }
+
+ hdr->frame_control = fc;
+
+ memcpy(build.hdr + build.hdr_len,
+ rfc1042_header, sizeof(rfc1042_header));
+ build.hdr_len += sizeof(rfc1042_header);
+
+ fast_tx = kmemdup(&build, sizeof(build), GFP_ATOMIC);
+ /* if the kmemdup fails, continue w/o fast_tx */
+ if (!fast_tx)
+ goto out;
+
+ out:
+ /* we might have raced against another call to this function */
+ old = rcu_dereference_protected(sta->fast_tx,
+ lockdep_is_held(&sta->lock));
+ rcu_assign_pointer(sta->fast_tx, fast_tx);
+ if (old)
+ kfree_rcu(old, rcu_head);
+ spin_unlock_bh(&sta->lock);
+}
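
The overview comment above boils down to a cache-and-invalidate pattern:
validate the slow-path conditions once, keep a prebuilt header around, and
rebuild or drop it whenever one of those conditions changes. A simplified
userspace sketch of that pattern (not part of the patch; a plain mutex stands
in for the kernel's RCU plus sta->lock, and all names are illustrative):

	#include <pthread.h>
	#include <stdlib.h>
	#include <string.h>

	struct fast_tx {
		unsigned char hdr[32];
		size_t hdr_len;
	};

	struct peer {
		pthread_mutex_t lock;
		struct fast_tx *fast_tx;	/* NULL while the fast path is disallowed */
		int authorized;
		int fragmentation_enabled;
	};

	static void peer_check_fast_tx(struct peer *p)
	{
		struct fast_tx *new_ftx = NULL, *old;

		/* out-of-band checks: only cache when the cheap path is safe */
		if (p->authorized && !p->fragmentation_enabled) {
			new_ftx = calloc(1, sizeof(*new_ftx));
			if (new_ftx) {
				memset(new_ftx->hdr, 0xaa, 24); /* prebuilt header stand-in */
				new_ftx->hdr_len = 24;
			}
		}

		pthread_mutex_lock(&p->lock);
		old = p->fast_tx;
		p->fast_tx = new_ftx;
		pthread_mutex_unlock(&p->lock);
		free(old);
	}

	int main(void)
	{
		struct peer p = { .lock = PTHREAD_MUTEX_INITIALIZER, .authorized = 1 };

		peer_check_fast_tx(&p);		/* caches a header template */
		p.fragmentation_enabled = 1;
		peer_check_fast_tx(&p);		/* condition changed: cache is dropped */
		return 0;
	}

In the patch itself the invalidation triggers are the ieee80211_check_fast_xmit*()
and ieee80211_clear_fast_xmit() calls spread across the TX, key and powersave paths.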
+
+void ieee80211_check_fast_xmit_all(struct ieee80211_local *local)
+{
+ struct sta_info *sta;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &local->sta_list, list)
+ ieee80211_check_fast_xmit(sta);
+ rcu_read_unlock();
+}
+
+void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sdata != sta->sdata &&
+ (!sta->sdata->bss || sta->sdata->bss != sdata->bss))
+ continue;
+ ieee80211_check_fast_xmit(sta);
+ }
+
+ rcu_read_unlock();
+}
+
+void ieee80211_clear_fast_xmit(struct sta_info *sta)
+{
+ struct ieee80211_fast_tx *fast_tx;
+
+ spin_lock_bh(&sta->lock);
+ fast_tx = rcu_dereference_protected(sta->fast_tx,
+ lockdep_is_held(&sta->lock));
+ RCU_INIT_POINTER(sta->fast_tx, NULL);
+ spin_unlock_bh(&sta->lock);
+
+ if (fast_tx)
+ kfree_rcu(fast_tx, rcu_head);
+}
+
+static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
+ struct net_device *dev, struct sta_info *sta,
+ struct ieee80211_fast_tx *fast_tx,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ u16 ethertype = (skb->data[12] << 8) | skb->data[13];
+ int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2);
+ int hw_headroom = sdata->local->hw.extra_tx_headroom;
+ struct ethhdr eth;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
+ struct ieee80211_tx_data tx;
+ ieee80211_tx_result r;
+ struct tid_ampdu_tx *tid_tx = NULL;
+ u8 tid = IEEE80211_NUM_TIDS;
+
+ /* control port protocol needs a lot of special handling */
+ if (cpu_to_be16(ethertype) == sdata->control_port_protocol)
+ return false;
+
+ /* only RFC 1042 SNAP */
+ if (ethertype < ETH_P_802_3_MIN)
+ return false;
+
+ /* don't handle TX status request here either */
+ if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)
+ return false;
+
+ if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+ tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+ if (tid_tx) {
+ if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state))
+ return false;
+ if (tid_tx->timeout)
+ tid_tx->last_tx = jiffies;
+ }
+ }
+
+ /* after this point (skb is modified) we cannot return false */
+
+ if (skb_shared(skb)) {
+ struct sk_buff *tmp_skb = skb;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ kfree_skb(tmp_skb);
+
+ if (!skb)
+ return true;
+ }
+
+ ieee80211_tx_stats(dev, skb->len + extra_head);
+
+ /* will not be crypto-handled beyond what we do here, so use false
+ * as the may-encrypt argument for the resize to not account for
+ * more room than we already have in 'extra_head'
+ */
+ if (unlikely(ieee80211_skb_resize(sdata, skb,
+ max_t(int, extra_head + hw_headroom -
+ skb_headroom(skb), 0),
+ false))) {
+ kfree_skb(skb);
+ return true;
+ }
+
+ memcpy(&eth, skb->data, ETH_HLEN - 2);
+ hdr = (void *)skb_push(skb, extra_head);
+ memcpy(skb->data, fast_tx->hdr, fast_tx->hdr_len);
+ memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN);
+ memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN);
+
+ memset(info, 0, sizeof(*info));
+ info->band = fast_tx->band;
+ info->control.vif = &sdata->vif;
+ info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
+ IEEE80211_TX_CTL_DONTFRAG |
+ (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+
+ if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+ *ieee80211_get_qos_ctl(hdr) = tid;
+ if (!sta->sta.txq[0])
+ hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+ } else {
+ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+ hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
+ sdata->sequence_number += 0x10;
+ }
+
+ if (skb_shinfo(skb)->gso_size)
+ sta->tx_stats.msdu[tid] +=
+ DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+ else
+ sta->tx_stats.msdu[tid]++;
+
+ info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
+ __skb_queue_head_init(&tx.skbs);
+
+ tx.flags = IEEE80211_TX_UNICAST;
+ tx.local = local;
+ tx.sdata = sdata;
+ tx.sta = sta;
+ tx.key = fast_tx->key;
+
+ if (fast_tx->key)
+ info->control.hw_key = &fast_tx->key->conf;
+
+ if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
+ tx.skb = skb;
+ r = ieee80211_tx_h_rate_ctrl(&tx);
+ skb = tx.skb;
+ tx.skb = NULL;
+
+ if (r != TX_CONTINUE) {
+ if (r != TX_QUEUED)
+ kfree_skb(skb);
+ return true;
+ }
+ }
+
+ /* statistics normally done by ieee80211_tx_h_stats (but that
+ * has to consider fragmentation, so is more complex)
+ */
+ sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
+ sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
+
+ if (fast_tx->pn_offs) {
+ u64 pn;
+ u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
+
+ switch (fast_tx->key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn);
+ crypto_hdr[0] = pn;
+ crypto_hdr[1] = pn >> 8;
+ crypto_hdr[4] = pn >> 16;
+ crypto_hdr[5] = pn >> 24;
+ crypto_hdr[6] = pn >> 32;
+ crypto_hdr[7] = pn >> 40;
+ break;
+ }
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ sdata = container_of(sdata->bss,
+ struct ieee80211_sub_if_data, u.ap);
+
+ __skb_queue_tail(&tx.skbs, skb);
+ ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+ return true;
+}
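
For clarity, the pn_offs block above fills in the CCMP/GCMP header in place: the
48-bit packet number occupies bytes 0, 1 and 4-7, byte 2 is reserved, and byte 3
carries the Ext IV bit together with the key index (the 0x20 | keyidx << 6 value
cached at key-setup time). A standalone sketch of that layout (not part of the
patch; the helper name is illustrative):

	#include <stdint.h>
	#include <stdio.h>

	static void build_ccmp_hdr(uint8_t hdr[8], uint64_t pn, unsigned int keyidx)
	{
		hdr[0] = pn;
		hdr[1] = pn >> 8;
		hdr[2] = 0;				/* reserved */
		hdr[3] = 0x20 | (keyidx << 6);		/* Ext IV | key ID */
		hdr[4] = pn >> 16;
		hdr[5] = pn >> 24;
		hdr[6] = pn >> 32;
		hdr[7] = pn >> 40;
	}

	int main(void)
	{
		uint8_t hdr[8];
		int i;

		build_ccmp_hdr(hdr, 0x010203040506ULL, 0);
		for (i = 0; i < 8; i++)
			printf("%02x ", hdr[i]);
		printf("\n");
		return 0;
	}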
+
void __ieee80211_subif_start_xmit(struct sk_buff *skb,
struct net_device *dev,
u32 info_flags)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct sta_info *sta;
+ struct sk_buff *next;
if (unlikely(skb->len < ETH_HLEN)) {
kfree_skb(skb);
@@ -2398,20 +2860,67 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
rcu_read_lock();
- if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) {
- kfree_skb(skb);
- goto out;
+ if (ieee80211_lookup_ra_sta(sdata, skb, &sta))
+ goto out_free;
+
+ if (!IS_ERR_OR_NULL(sta)) {
+ struct ieee80211_fast_tx *fast_tx;
+
+ fast_tx = rcu_dereference(sta->fast_tx);
+
+ if (fast_tx &&
+ ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+ goto out;
}
- skb = ieee80211_build_hdr(sdata, skb, info_flags, sta);
- if (IS_ERR(skb))
- goto out;
+ if (skb_is_gso(skb)) {
+ struct sk_buff *segs;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
- dev->trans_start = jiffies;
+ segs = skb_gso_segment(skb, 0);
+ if (IS_ERR(segs)) {
+ goto out_free;
+ } else if (segs) {
+ consume_skb(skb);
+ skb = segs;
+ }
+ } else {
+ /* we cannot process non-linear frames on this path */
+ if (skb_linearize(skb)) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ /* the frame could be fragmented, software-encrypted, and other
+ * things so we cannot really handle checksum offload with it -
+ * fix it up in software before we handle anything else.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ skb_set_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ if (skb_checksum_help(skb))
+ goto out_free;
+ }
+ }
+
+ next = skb;
+ while (next) {
+ skb = next;
+ next = skb->next;
+
+ skb->prev = NULL;
+ skb->next = NULL;
- ieee80211_xmit(sdata, sta, skb);
+ skb = ieee80211_build_hdr(sdata, skb, info_flags, sta);
+ if (IS_ERR(skb))
+ goto out;
+
+ ieee80211_tx_stats(dev, skb->len);
+
+ ieee80211_xmit(sdata, sta, skb);
+ }
+ goto out;
+ out_free:
+ kfree_skb(skb);
out:
rcu_read_unlock();
}
@@ -2709,6 +3218,16 @@ static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata,
rcu_read_unlock();
}
+static u8 __ieee80211_csa_update_counter(struct beacon_data *beacon)
+{
+ beacon->csa_current_counter--;
+
+ /* the counter should never reach 0 */
+ WARN_ON_ONCE(!beacon->csa_current_counter);
+
+ return beacon->csa_current_counter;
+}
+
u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
@@ -2727,11 +3246,7 @@ u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif)
if (!beacon)
goto unlock;
- beacon->csa_current_counter--;
-
- /* the counter should never reach 0 */
- WARN_ON_ONCE(!beacon->csa_current_counter);
- count = beacon->csa_current_counter;
+ count = __ieee80211_csa_update_counter(beacon);
unlock:
rcu_read_unlock();
@@ -2831,7 +3346,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
if (beacon) {
if (beacon->csa_counter_offsets[0]) {
if (!is_template)
- ieee80211_csa_update_counter(vif);
+ __ieee80211_csa_update_counter(beacon);
ieee80211_set_csa(sdata, beacon);
}
@@ -2877,7 +3392,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
if (beacon->csa_counter_offsets[0]) {
if (!is_template)
- ieee80211_csa_update_counter(vif);
+ __ieee80211_csa_update_counter(beacon);
ieee80211_set_csa(sdata, beacon);
}
@@ -2907,7 +3422,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
* for now we leave it consistent with overall
* mac80211's behavior.
*/
- ieee80211_csa_update_counter(vif);
+ __ieee80211_csa_update_counter(beacon);
ieee80211_set_csa(sdata, beacon);
}
@@ -3001,6 +3516,12 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
{
struct ieee80211_mutable_offsets offs = {};
struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false);
+ struct sk_buff *copy;
+ struct ieee80211_supported_band *sband;
+ int shift;
+
+ if (!bcn)
+ return bcn;
if (tim_offset)
*tim_offset = offs.tim_offset;
@@ -3008,6 +3529,19 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
if (tim_length)
*tim_length = offs.tim_length;
+ if (ieee80211_hw_check(hw, BEACON_TX_STATUS) ||
+ !hw_to_local(hw)->monitors)
+ return bcn;
+
+ /* send a copy to monitor interfaces */
+ copy = skb_copy(bcn, GFP_ATOMIC);
+ if (!copy)
+ return bcn;
+
+ shift = ieee80211_vif_get_shift(vif);
+ sband = hw->wiphy->bands[ieee80211_get_sdata_band(vif_to_sdata(vif))];
+ ieee80211_tx_monitor(hw_to_local(hw), copy, sband, 1, shift, false);
+
return bcn;
}
EXPORT_SYMBOL(ieee80211_beacon_get_tim);
@@ -3305,7 +3839,7 @@ int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid)
synchronize_net();
/* Tear down BA sessions so we stop aggregating on this TID */
- if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) {
set_sta_flag(sta, WLAN_STA_BLOCK_BA);
__ieee80211_stop_tx_ba_session(sta, tid,
AGG_STOP_LOCAL_REQUEST);
@@ -3319,7 +3853,7 @@ int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid)
ieee80211_wake_vif_queues(local, sdata,
IEEE80211_QUEUE_STOP_REASON_RESERVE_TID);
- if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION)
+ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION))
clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
ret = 0;
diff --git a/kernel/net/mac80211/util.c b/kernel/net/mac80211/util.c
index b864ebc6a..33344f5a6 100644
--- a/kernel/net/mac80211/util.c
+++ b/kernel/net/mac80211/util.c
@@ -4,6 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright (C) 2015 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -47,55 +48,6 @@ struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy)
}
EXPORT_SYMBOL(wiphy_to_ieee80211_hw);
-u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
- enum nl80211_iftype type)
-{
- __le16 fc = hdr->frame_control;
-
- /* drop ACK/CTS frames and incorrect hdr len (ctrl) */
- if (len < 16)
- return NULL;
-
- if (ieee80211_is_data(fc)) {
- if (len < 24) /* drop incorrect hdr len (data) */
- return NULL;
-
- if (ieee80211_has_a4(fc))
- return NULL;
- if (ieee80211_has_tods(fc))
- return hdr->addr1;
- if (ieee80211_has_fromds(fc))
- return hdr->addr2;
-
- return hdr->addr3;
- }
-
- if (ieee80211_is_mgmt(fc)) {
- if (len < 24) /* drop incorrect hdr len (mgmt) */
- return NULL;
- return hdr->addr3;
- }
-
- if (ieee80211_is_ctl(fc)) {
- if (ieee80211_is_pspoll(fc))
- return hdr->addr1;
-
- if (ieee80211_is_back_req(fc)) {
- switch (type) {
- case NL80211_IFTYPE_STATION:
- return hdr->addr2;
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_AP_VLAN:
- return hdr->addr1;
- default:
- break; /* fall through to the return */
- }
- }
- }
-
- return NULL;
-}
-
void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx)
{
struct sk_buff *skb;
@@ -564,7 +516,7 @@ ieee80211_get_vif_queues(struct ieee80211_local *local,
{
unsigned int queues;
- if (sdata && local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) {
+ if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) {
int ac;
queues = 0;
@@ -592,7 +544,7 @@ void __ieee80211_flush_queues(struct ieee80211_local *local,
* If no queue was set, or if the HW doesn't support
* IEEE80211_HW_QUEUE_CONTROL - flush all queues
*/
- if (!queues || !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL))
+ if (!queues || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
queues = ieee80211_get_vif_queues(local, sdata);
ieee80211_stop_queues_by_reason(&local->hw, queues,
@@ -752,7 +704,12 @@ EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif);
struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif)
{
- struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_sub_if_data *sdata;
+
+ if (!vif)
+ return NULL;
+
+ sdata = vif_to_sdata(vif);
if (!ieee80211_sdata_running(sdata) ||
!(sdata->flags & IEEE80211_SDATA_IN_DRIVER))
@@ -1148,13 +1105,13 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
}
void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
- bool bss_notify)
+ bool bss_notify, bool enable_qos)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_tx_queue_params qparam;
struct ieee80211_chanctx_conf *chanctx_conf;
int ac;
- bool use_11b, enable_qos;
+ bool use_11b;
bool is_ocb; /* Use another EDCA parameters if dot11OCBActivated=true */
int aCWmin, aCWmax;
@@ -1173,13 +1130,6 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
!(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE);
rcu_read_unlock();
- /*
- * By default disable QoS in STA mode for old access points, which do
- * not support 802.11e. New APs will provide proper queue parameters,
- * that we will configure later.
- */
- enable_qos = (sdata->vif.type != NL80211_IFTYPE_STATION);
-
is_ocb = (sdata->vif.type == NL80211_IFTYPE_OCB);
/* Set defaults according to 802.11-2007 Table 7-37 */
@@ -1691,6 +1641,29 @@ void ieee80211_stop_device(struct ieee80211_local *local)
drv_stop(local);
}
+static void ieee80211_flush_completed_scan(struct ieee80211_local *local,
+ bool aborted)
+{
+ /* It's possible that we don't handle the scan completion in
+ * time during suspend, so if it's still marked as completed
+ * here, queue the work and flush it to clean things up.
+ * Instead of calling the worker function directly here, we
+ * really queue it to avoid potential races with other flows
+ * scheduling the same work.
+ */
+ if (test_bit(SCAN_COMPLETED, &local->scanning)) {
+ /* If coming from reconfiguration failure, abort the scan so
+ * we don't attempt to continue a partial HW scan - which is
+ * possible otherwise if (e.g.) the 2.4 GHz portion was the
+ * completed scan, and a 5 GHz portion is still pending.
+ */
+ if (aborted)
+ set_bit(SCAN_ABORTED, &local->scanning);
+ ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
+ flush_delayed_work(&local->scan_work);
+ }
+}
+
static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata;
@@ -1708,7 +1681,9 @@ static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local)
local->resuming = false;
local->suspended = false;
- local->started = false;
+ local->in_reconfig = false;
+
+ ieee80211_flush_completed_scan(local, true);
/* scheduled scan clearly can't be running any more, but tell
* cfg80211 and clear local state
@@ -1748,6 +1723,27 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local,
mutex_unlock(&local->chanctx_mtx);
}
+static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+
+ /* add STAs back */
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry(sta, &local->sta_list, list) {
+ enum ieee80211_sta_state state;
+
+ if (!sta->uploaded || sta->sdata != sdata)
+ continue;
+
+ for (state = IEEE80211_STA_NOTEXIST;
+ state < sta->sta_state; state++)
+ WARN_ON(drv_sta_state(local, sta->sdata, sta, state,
+ state + 1));
+ }
+ mutex_unlock(&local->sta_mtx);
+}
+
int ieee80211_reconfig(struct ieee80211_local *local)
{
struct ieee80211_hw *hw = &local->hw;
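
The new ieee80211_reconfig_stations() helper above re-adds one interface's uploaded stations by replaying the driver state machine a single transition at a time, from IEEE80211_STA_NOTEXIST up to the state recorded before the restart. The replay loop, reduced to a standalone sketch with an invented driver callback:

#include <stdio.h>

enum sta_state {
    STA_NOTEXIST,
    STA_NONE,
    STA_AUTH,
    STA_ASSOC,
    STA_AUTHORIZED,
};

/* stand-in for the driver callback that moves a station one state forward */
static int drv_sta_state(const char *sta, enum sta_state old, enum sta_state new)
{
    printf("%s: %d -> %d\n", sta, old, new);
    return 0;   /* 0 on success, like the real callback */
}

int main(void)
{
    enum sta_state cur = STA_AUTHORIZED;    /* state recorded before restart */
    enum sta_state state;

    /* replay every transition so the driver ends up in sync again */
    for (state = STA_NOTEXIST; state < cur; state++)
        if (drv_sta_state("aa:bb:cc:dd:ee:ff", state, state + 1))
            break;
    return 0;
}
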
@@ -1759,16 +1755,24 @@ int ieee80211_reconfig(struct ieee80211_local *local)
struct ieee80211_sub_if_data *sched_scan_sdata;
struct cfg80211_sched_scan_request *sched_scan_req;
bool sched_scan_stopped = false;
+ bool suspended = local->suspended;
/* nothing to do if HW shouldn't run */
if (!local->open_count)
goto wake_up;
#ifdef CONFIG_PM
- if (local->suspended)
+ if (suspended)
local->resuming = true;
if (local->wowlan) {
+ /*
+ * In the wowlan case, both mac80211 and the device
+ * are functional when the resume op is called, so
+ * clear local->suspended so the device can operate
+ * normally (e.g. pass rx frames).
+ */
+ local->suspended = false;
res = drv_resume(local);
local->wowlan = false;
if (res < 0) {
@@ -1781,12 +1785,26 @@ int ieee80211_reconfig(struct ieee80211_local *local)
/*
* res is 1, which means the driver requested
* to go through a regular reset on wakeup.
+ * restore local->suspended in this case.
*/
reconfig_due_to_wowlan = true;
+ local->suspended = true;
}
#endif
/*
+ * In case of hw_restart during suspend (without wowlan),
+ * cancel restart work, as we are reconfiguring the device
+ * anyway.
+ * Note that restart_work is scheduled on a frozen workqueue,
+ * so we can't deadlock in this case.
+ */
+ if (suspended && local->in_reconfig && !reconfig_due_to_wowlan)
+ cancel_work_sync(&local->restart_work);
+
+ local->started = false;
+
+ /*
* Upon resume hardware can sometimes be goofy due to
* various platform / driver / bus issues, so restarting
* the device may at times not work immediately. Propagate
@@ -1794,7 +1812,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
*/
res = drv_start(local);
if (res) {
- if (local->suspended)
+ if (suspended)
WARN(1, "Hardware became unavailable upon resume. This could be a software issue prior to suspend or a hardware issue.\n");
else
WARN(1, "Hardware became unavailable during restart.\n");
@@ -1861,50 +1879,11 @@ int ieee80211_reconfig(struct ieee80211_local *local)
WARN_ON(drv_add_chanctx(local, ctx));
mutex_unlock(&local->chanctx_mtx);
- list_for_each_entry(sdata, &local->interfaces, list) {
- if (!ieee80211_sdata_running(sdata))
- continue;
- ieee80211_assign_chanctx(local, sdata);
- }
-
sdata = rtnl_dereference(local->monitor_sdata);
if (sdata && ieee80211_sdata_running(sdata))
ieee80211_assign_chanctx(local, sdata);
}
- /* add STAs back */
- mutex_lock(&local->sta_mtx);
- list_for_each_entry(sta, &local->sta_list, list) {
- enum ieee80211_sta_state state;
-
- if (!sta->uploaded)
- continue;
-
- /* AP-mode stations will be added later */
- if (sta->sdata->vif.type == NL80211_IFTYPE_AP)
- continue;
-
- for (state = IEEE80211_STA_NOTEXIST;
- state < sta->sta_state; state++)
- WARN_ON(drv_sta_state(local, sta->sdata, sta, state,
- state + 1));
- }
- mutex_unlock(&local->sta_mtx);
-
- /* reconfigure tx conf */
- if (hw->queues >= IEEE80211_NUM_ACS) {
- list_for_each_entry(sdata, &local->interfaces, list) {
- if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- sdata->vif.type == NL80211_IFTYPE_MONITOR ||
- !ieee80211_sdata_running(sdata))
- continue;
-
- for (i = 0; i < IEEE80211_NUM_ACS; i++)
- drv_conf_tx(local, sdata, i,
- &sdata->tx_conf[i]);
- }
- }
-
/* reconfigure hardware */
ieee80211_hw_config(local, ~0);
@@ -1917,6 +1896,22 @@ int ieee80211_reconfig(struct ieee80211_local *local)
if (!ieee80211_sdata_running(sdata))
continue;
+ ieee80211_assign_chanctx(local, sdata);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_MONITOR:
+ break;
+ default:
+ ieee80211_reconfig_stations(sdata);
+ /* fall through */
+ case NL80211_IFTYPE_AP: /* AP stations are handled later */
+ for (i = 0; i < IEEE80211_NUM_ACS; i++)
+ drv_conf_tx(local, sdata, i,
+ &sdata->tx_conf[i]);
+ break;
+ }
+
/* common change flags for all interface types */
changed = BSS_CHANGED_ERP_CTS_PROT |
BSS_CHANGED_ERP_PREAMBLE |
@@ -1984,7 +1979,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
}
}
- ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps(local);
/*
* The sta might be in psm against the ap (e.g. because
@@ -1999,7 +1994,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
if (!sdata->u.mgd.associated)
continue;
- ieee80211_send_nullfunc(local, sdata, 0);
+ ieee80211_send_nullfunc(local, sdata, false);
}
}
@@ -2029,6 +2024,29 @@ int ieee80211_reconfig(struct ieee80211_local *local)
if (ieee80211_sdata_running(sdata))
ieee80211_enable_keys(sdata);
+ /* Reconfigure sched scan if it was interrupted by FW restart */
+ mutex_lock(&local->mtx);
+ sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata,
+ lockdep_is_held(&local->mtx));
+ sched_scan_req = rcu_dereference_protected(local->sched_scan_req,
+ lockdep_is_held(&local->mtx));
+ if (sched_scan_sdata && sched_scan_req)
+ /*
+ * Sched scan stopped, but we don't want to report it. Instead,
+ * we're trying to reschedule. However, if more than one scan
+ * plan was set, we cannot reschedule since we don't know which
+ * scan plan was currently running (and some scan plans may have
+ * already finished).
+ */
+ if (sched_scan_req->n_scan_plans > 1 ||
+ __ieee80211_request_sched_scan_start(sched_scan_sdata,
+ sched_scan_req))
+ sched_scan_stopped = true;
+ mutex_unlock(&local->mtx);
+
+ if (sched_scan_stopped)
+ cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy);
+
wake_up:
local->in_reconfig = false;
barrier();
@@ -2046,12 +2064,13 @@ int ieee80211_reconfig(struct ieee80211_local *local)
* about the sessions, but we and the AP still think they
* are active. This is really a workaround though.
*/
- if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+ if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) {
mutex_lock(&local->sta_mtx);
list_for_each_entry(sta, &local->sta_list, list) {
- ieee80211_sta_tear_down_BA_sessions(
- sta, AGG_STOP_LOCAL_REQUEST);
+ if (!local->resuming)
+ ieee80211_sta_tear_down_BA_sessions(
+ sta, AGG_STOP_LOCAL_REQUEST);
clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
}
@@ -2063,35 +2082,13 @@ int ieee80211_reconfig(struct ieee80211_local *local)
false);
/*
- * Reconfigure sched scan if it was interrupted by FW restart or
- * suspend.
- */
- mutex_lock(&local->mtx);
- sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata,
- lockdep_is_held(&local->mtx));
- sched_scan_req = rcu_dereference_protected(local->sched_scan_req,
- lockdep_is_held(&local->mtx));
- if (sched_scan_sdata && sched_scan_req)
- /*
- * Sched scan stopped, but we don't want to report it. Instead,
- * we're trying to reschedule.
- */
- if (__ieee80211_request_sched_scan_start(sched_scan_sdata,
- sched_scan_req))
- sched_scan_stopped = true;
- mutex_unlock(&local->mtx);
-
- if (sched_scan_stopped)
- cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy);
-
- /*
* If this is for hw restart things are still running.
* We may want to change that later, however.
*/
- if (local->open_count && (!local->suspended || reconfig_due_to_wowlan))
+ if (local->open_count && (!suspended || reconfig_due_to_wowlan))
drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART);
- if (!local->suspended)
+ if (!suspended)
return 0;
#ifdef CONFIG_PM
@@ -2100,17 +2097,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
mb();
local->resuming = false;
- /* It's possible that we don't handle the scan completion in
- * time during suspend, so if it's still marked as completed
- * here, queue the work and flush it to clean things up.
- * Instead of calling the worker function directly here, we
- * really queue it to avoid potential races with other flows
- * scheduling the same work.
- */
- if (test_bit(SCAN_COMPLETED, &local->scanning)) {
- ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
- flush_delayed_work(&local->scan_work);
- }
+ ieee80211_flush_completed_scan(local, false);
if (local->open_count && !reconfig_due_to_wowlan)
drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND);
@@ -2168,7 +2155,13 @@ void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata)
chanctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
lockdep_is_held(&local->chanctx_mtx));
- if (WARN_ON_ONCE(!chanctx_conf))
+ /*
+ * This function can be called from a work item, so it is possible
+ * that the chanctx_conf has already been removed (due to a
+ * disconnection, for example).
+ * Nothing should be done in that case.
+ */
+ if (!chanctx_conf)
goto unlock;
chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf);
@@ -2305,7 +2298,7 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
const struct cfg80211_chan_def *chandef,
- u16 prot_mode)
+ u16 prot_mode, bool rifs_mode)
{
struct ieee80211_ht_operation *ht_oper;
/* Build HT Information */
@@ -2333,6 +2326,9 @@ u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
chandef->width != NL80211_CHAN_WIDTH_20)
ht_oper->ht_param |= IEEE80211_HT_PARAM_CHAN_WIDTH_ANY;
+ if (rifs_mode)
+ ht_oper->ht_param |= IEEE80211_HT_PARAM_RIFS_MODE;
+
ht_oper->operation_mode = cpu_to_le16(prot_mode);
ht_oper->stbc_param = 0x0000;
@@ -2357,6 +2353,8 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
if (chandef->center_freq2)
vht_oper->center_freq_seg2_idx =
ieee80211_frequency_to_channel(chandef->center_freq2);
+ else
+ vht_oper->center_freq_seg2_idx = 0x00;
switch (chandef->width) {
case NL80211_CHAN_WIDTH_160:
@@ -2574,7 +2572,7 @@ int ieee80211_ave_rssi(struct ieee80211_vif *vif)
/* non-managed type interfaces */
return 0;
}
- return ifmgd->ave_beacon_signal / 16;
+ return -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal);
}
EXPORT_SYMBOL_GPL(ieee80211_ave_rssi);
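
ieee80211_ave_rssi() now reads an exponentially weighted moving average (stored as a positive magnitude and negated on read, as the hunk shows) instead of dividing a hand-maintained sum by 16. This is not the kernel's DECLARE_EWMA machinery, but a fixed-point EWMA of the same flavour can be sketched as follows, with invented precision and weight values:

#include <stdio.h>

/* fixed-point EWMA: average kept scaled by 2^prec, weight 1/2^weight */
struct ewma {
    unsigned long internal;
    unsigned int prec;      /* precision shift */
    unsigned int weight;    /* weight shift */
};

static void ewma_add(struct ewma *e, unsigned long val)
{
    unsigned long scaled = val << e->prec;

    if (!e->internal)
        e->internal = scaled;
    else
        e->internal = (((e->internal << e->weight) - e->internal) +
                       scaled) >> e->weight;
}

static unsigned long ewma_read(const struct ewma *e)
{
    return e->internal >> e->prec;
}

int main(void)
{
    /* beacon signal is negative dBm; store the magnitude, negate on read */
    struct ewma avg = { 0, 10, 3 };
    unsigned long samples[] = { 40, 42, 45, 44 };   /* -40 dBm, -42 dBm, ... */
    unsigned int i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        ewma_add(&avg, samples[i]);

    printf("average RSSI: -%lu dBm\n", ewma_read(&avg));
    return 0;
}
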
@@ -2984,6 +2982,13 @@ ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i)
if (end > 0)
return false;
+ /* One shot NOA */
+ if (data->count[i] == 1)
+ return false;
+
+ if (data->desc[i].interval == 0)
+ return false;
+
/* End time is in the past, check for repetitions */
skip = DIV_ROUND_UP(-end, data->desc[i].interval);
if (data->count[i] < 255) {
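
The two new guards above make the repetition math that follows safe: a one-shot NoA descriptor cannot be extended, and a zero interval would divide by zero inside DIV_ROUND_UP. The skip computation itself, as a worked standalone example with made-up numbers:

#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    /* example values only: the descriptor ended 2.5 intervals ago */
    int32_t end = -2500;        /* microseconds relative to now */
    uint32_t interval = 1000;   /* NoA interval in microseconds */
    uint32_t skip;

    if (interval == 0) {        /* would divide by zero below */
        puts("no repetitions possible");
        return 0;
    }

    skip = DIV_ROUND_UP(-end, interval);
    printf("skip %u repetitions to reach the next start\n", skip);
    return 0;
}
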
@@ -3331,9 +3336,11 @@ void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
if (sta) {
txqi->txq.sta = &sta->sta;
sta->sta.txq[tid] = &txqi->txq;
+ txqi->txq.tid = tid;
txqi->txq.ac = ieee802_1d_to_ac[tid & 7];
} else {
sdata->vif.txq = &txqi->txq;
+ txqi->txq.tid = 0;
txqi->txq.ac = IEEE80211_AC_BE;
}
}
diff --git a/kernel/net/mac80211/vht.c b/kernel/net/mac80211/vht.c
index 80694d55d..c38b2f07a 100644
--- a/kernel/net/mac80211/vht.c
+++ b/kernel/net/mac80211/vht.c
@@ -120,6 +120,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap;
struct ieee80211_sta_vht_cap own_cap;
u32 cap_info, i;
+ bool have_80mhz;
memset(vht_cap, 0, sizeof(*vht_cap));
@@ -129,6 +130,20 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
if (!vht_cap_ie || !sband->vht_cap.vht_supported)
return;
+ /* Allow VHT if at least one channel on the sband supports 80 MHz */
+ have_80mhz = false;
+ for (i = 0; i < sband->n_channels; i++) {
+ if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED |
+ IEEE80211_CHAN_NO_80MHZ))
+ continue;
+
+ have_80mhz = true;
+ break;
+ }
+
+ if (!have_80mhz)
+ return;
+
/*
* A VHT STA must support 40 MHz, but if we verify that here
* then we break a few things - some APs (e.g. Netgear R6300v2
@@ -308,11 +323,15 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
enum ieee80211_sta_rx_bandwidth bw;
+ enum nl80211_chan_width bss_width = sdata->vif.bss_conf.chandef.width;
- bw = ieee80211_chan_width_to_rx_bw(sdata->vif.bss_conf.chandef.width);
- bw = min(bw, ieee80211_sta_cap_rx_bw(sta));
+ bw = ieee80211_sta_cap_rx_bw(sta);
bw = min(bw, sta->cur_max_bandwidth);
+ /* do not cap the BW of TDLS WIDER_BW peers by the bss */
+ if (!test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW))
+ bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width));
+
return bw;
}
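
The rewritten ieee80211_sta_cur_vht_bw() first limits the bandwidth by the peer's own capability and its operating-mode notification, and only caps it by the BSS width when the peer is not a TDLS wider-bandwidth peer. Reduced to a standalone decision function (enum values invented, ordered so that taking the minimum is meaningful):

#include <stdbool.h>
#include <stdio.h>

enum rx_bw { BW_20, BW_40, BW_80, BW_160 };   /* ordered narrow to wide */

static enum rx_bw min_bw(enum rx_bw a, enum rx_bw b)
{
    return a < b ? a : b;
}

static enum rx_bw cur_bw(enum rx_bw sta_cap, enum rx_bw opmode_max,
                         enum rx_bw bss_width, bool tdls_wider_bw)
{
    enum rx_bw bw = min_bw(sta_cap, opmode_max);

    /* do not cap TDLS wider-bandwidth peers by the BSS width */
    if (!tdls_wider_bw)
        bw = min_bw(bw, bss_width);
    return bw;
}

int main(void)
{
    printf("normal peer: %d\n", cur_bw(BW_160, BW_80, BW_40, false)); /* 1 == BW_40 */
    printf("TDLS peer:   %d\n", cur_bw(BW_160, BW_80, BW_40, true));  /* 2 == BW_80 */
    return 0;
}
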
@@ -359,7 +378,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta)
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only)
+ enum ieee80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
@@ -382,9 +401,6 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
changed |= IEEE80211_RC_NSS_CHANGED;
}
- if (nss_only)
- return changed;
-
switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) {
case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ:
sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20;
@@ -411,14 +427,39 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only)
+ enum ieee80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
- u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode,
- band, nss_only);
+ u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band);
if (changed > 0)
rate_control_rate_update(local, sband, sta, changed);
}
+
+void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
+ u16 vht_mask[NL80211_VHT_NSS_MAX])
+{
+ int i;
+ u16 mask, cap = le16_to_cpu(vht_cap);
+
+ for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
+ mask = (cap >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+ switch (mask) {
+ case IEEE80211_VHT_MCS_SUPPORT_0_7:
+ vht_mask[i] = 0x00FF;
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_8:
+ vht_mask[i] = 0x01FF;
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_9:
+ vht_mask[i] = 0x03FF;
+ break;
+ case IEEE80211_VHT_MCS_NOT_SUPPORTED:
+ default:
+ vht_mask[i] = 0;
+ break;
+ }
+ }
+}
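
The ieee80211_get_vht_mask_from_cap() helper added above expands the 16-bit VHT MCS map, two bits per spatial stream, into one MCS bitmask per stream. The same decoding as a self-contained program, with the constants renamed for the sketch:

#include <stdint.h>
#include <stdio.h>

#define NSS_MAX             8
#define MCS_SUPPORT_0_7     0
#define MCS_SUPPORT_0_8     1
#define MCS_SUPPORT_0_9     2
#define MCS_NOT_SUPPORTED   3

static void vht_mask_from_cap(uint16_t cap, uint16_t mask[NSS_MAX])
{
    int i;

    for (i = 0; i < NSS_MAX; i++) {
        /* two bits per stream, least significant pair is stream 1 */
        switch ((cap >> (i * 2)) & 0x3) {
        case MCS_SUPPORT_0_7:
            mask[i] = 0x00FF;
            break;
        case MCS_SUPPORT_0_8:
            mask[i] = 0x01FF;
            break;
        case MCS_SUPPORT_0_9:
            mask[i] = 0x03FF;
            break;
        default:            /* MCS_NOT_SUPPORTED */
            mask[i] = 0;
            break;
        }
    }
}

int main(void)
{
    /* streams 1-2 support MCS 0-9, streams 3-8 unsupported */
    uint16_t cap = 0xFFFA;
    uint16_t mask[NSS_MAX];
    int i;

    vht_mask_from_cap(cap, mask);
    for (i = 0; i < NSS_MAX; i++)
        printf("NSS %d: 0x%04X\n", i + 1, mask[i]);
    return 0;
}
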
diff --git a/kernel/net/mac80211/wpa.c b/kernel/net/mac80211/wpa.c
index 9d63d93c8..d824c3897 100644
--- a/kernel/net/mac80211/wpa.c
+++ b/kernel/net/mac80211/wpa.c
@@ -174,9 +174,12 @@ mic_fail_no_key:
* a driver that supports HW encryption. Send up the key idx only if
* the key is set.
*/
- mac80211_ev_michael_mic_failure(rx->sdata,
- rx->key ? rx->key->conf.keyidx : -1,
- (void *) skb->data, NULL, GFP_ATOMIC);
+ cfg80211_michael_mic_failure(rx->sdata->dev, hdr->addr2,
+ is_multicast_ether_addr(hdr->addr1) ?
+ NL80211_KEYTYPE_GROUP :
+ NL80211_KEYTYPE_PAIRWISE,
+ rx->key ? rx->key->conf.keyidx : -1,
+ NULL, GFP_ATOMIC);
return RX_DROP_UNUSABLE;
}
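
The hunk above reports TKIP Michael MIC failures straight to cfg80211 and classifies the key from the frame's destination address: a multicast or broadcast addr1 implies the group key was in use. A small sketch of that classification (the enum here is invented, not the nl80211 one):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum keytype { KEY_PAIRWISE, KEY_GROUP };

/* an 802 address is multicast if the least significant bit of octet 0 is set */
static bool is_multicast_ether_addr(const uint8_t *addr)
{
    return addr[0] & 0x01;
}

static enum keytype mic_failure_keytype(const uint8_t addr1[6])
{
    return is_multicast_ether_addr(addr1) ? KEY_GROUP : KEY_PAIRWISE;
}

int main(void)
{
    uint8_t ucast[6] = { 0x02, 0x11, 0x22, 0x33, 0x44, 0x55 };
    uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

    printf("unicast dest   -> %s key\n",
           mic_failure_keytype(ucast) == KEY_GROUP ? "group" : "pairwise");
    printf("broadcast dest -> %s key\n",
           mic_failure_keytype(bcast) == KEY_GROUP ? "group" : "pairwise");
    return 0;
}
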
@@ -444,7 +447,7 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb,
hdr = (struct ieee80211_hdr *) pos;
pos += hdrlen;
- pn64 = atomic64_inc_return(&key->u.ccmp.tx_pn);
+ pn64 = atomic64_inc_return(&key->conf.tx_pn);
pn[5] = pn64;
pn[4] = pn64 >> 8;
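
The CCMP encrypt path (and, further down, the GCMP and BIP paths) now draws the packet number from a single per-key counter, key->conf.tx_pn, and scatters its low 48 bits into the six PN bytes exactly as shown, with pn[5] holding the least significant byte. That packing as a standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* pack the low 48 bits of a counter into the CCMP/GCMP PN byte order */
static void pack_pn48(uint64_t pn64, uint8_t pn[6])
{
    pn[5] = (uint8_t)(pn64);
    pn[4] = (uint8_t)(pn64 >> 8);
    pn[3] = (uint8_t)(pn64 >> 16);
    pn[2] = (uint8_t)(pn64 >> 24);
    pn[1] = (uint8_t)(pn64 >> 32);
    pn[0] = (uint8_t)(pn64 >> 40);
}

int main(void)
{
    uint8_t pn[6];
    int i;

    pack_pn48(0x0000010203040506ULL, pn);
    for (i = 0; i < 6; i++)
        printf("pn[%d] = 0x%02x\n", i, pn[i]);
    return 0;
}
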
@@ -516,31 +519,34 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
return RX_DROP_UNUSABLE;
}
- ccmp_hdr2pn(pn, skb->data + hdrlen);
+ if (!(status->flag & RX_FLAG_PN_VALIDATED)) {
+ ccmp_hdr2pn(pn, skb->data + hdrlen);
- queue = rx->security_idx;
+ queue = rx->security_idx;
- if (memcmp(pn, key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN) <= 0) {
- key->u.ccmp.replays++;
- return RX_DROP_UNUSABLE;
- }
+ if (memcmp(pn, key->u.ccmp.rx_pn[queue],
+ IEEE80211_CCMP_PN_LEN) <= 0) {
+ key->u.ccmp.replays++;
+ return RX_DROP_UNUSABLE;
+ }
- if (!(status->flag & RX_FLAG_DECRYPTED)) {
- u8 aad[2 * AES_BLOCK_SIZE];
- u8 b_0[AES_BLOCK_SIZE];
- /* hardware didn't decrypt/verify MIC */
- ccmp_special_blocks(skb, pn, b_0, aad);
+ if (!(status->flag & RX_FLAG_DECRYPTED)) {
+ u8 aad[2 * AES_BLOCK_SIZE];
+ u8 b_0[AES_BLOCK_SIZE];
+ /* hardware didn't decrypt/verify MIC */
+ ccmp_special_blocks(skb, pn, b_0, aad);
+
+ if (ieee80211_aes_ccm_decrypt(
+ key->u.ccmp.tfm, b_0, aad,
+ skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN,
+ data_len,
+ skb->data + skb->len - mic_len, mic_len))
+ return RX_DROP_UNUSABLE;
+ }
- if (ieee80211_aes_ccm_decrypt(
- key->u.ccmp.tfm, b_0, aad,
- skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN,
- data_len,
- skb->data + skb->len - mic_len, mic_len))
- return RX_DROP_UNUSABLE;
+ memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN);
}
- memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN);
-
/* Remove CCMP header and MIC */
if (pskb_trim(skb, skb->len - mic_len))
return RX_DROP_UNUSABLE;
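
The reshaped decrypt path above skips the whole PN block when the driver has already validated it (RX_FLAG_PN_VALIDATED); otherwise the received PN must be strictly greater than the last accepted one for that queue, compared as a 6-byte big-endian value. A minimal sketch of that replay check:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PN_LEN 6

/*
 * The PN is stored most-significant byte first, so memcmp() gives a
 * numeric comparison: anything <= the last accepted PN is a replay.
 */
static bool pn_is_replay(const uint8_t rx_pn[PN_LEN],
                         const uint8_t last_pn[PN_LEN])
{
    return memcmp(rx_pn, last_pn, PN_LEN) <= 0;
}

int main(void)
{
    uint8_t last[PN_LEN]     = { 0, 0, 0, 0, 0, 0x10 };
    uint8_t replayed[PN_LEN] = { 0, 0, 0, 0, 0, 0x10 };
    uint8_t fresh[PN_LEN]    = { 0, 0, 0, 0, 0, 0x11 };

    printf("replayed frame dropped: %d\n", pn_is_replay(replayed, last));
    printf("fresh frame dropped:    %d\n", pn_is_replay(fresh, last));

    if (!pn_is_replay(fresh, last))
        memcpy(last, fresh, PN_LEN);   /* accept and remember */
    return 0;
}
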
@@ -670,7 +676,7 @@ static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
hdr = (struct ieee80211_hdr *)pos;
pos += hdrlen;
- pn64 = atomic64_inc_return(&key->u.gcmp.tx_pn);
+ pn64 = atomic64_inc_return(&key->conf.tx_pn);
pn[5] = pn64;
pn[4] = pn64 >> 8;
@@ -739,31 +745,35 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
return RX_DROP_UNUSABLE;
}
- gcmp_hdr2pn(pn, skb->data + hdrlen);
+ if (!(status->flag & RX_FLAG_PN_VALIDATED)) {
+ gcmp_hdr2pn(pn, skb->data + hdrlen);
- queue = rx->security_idx;
+ queue = rx->security_idx;
- if (memcmp(pn, key->u.gcmp.rx_pn[queue], IEEE80211_GCMP_PN_LEN) <= 0) {
- key->u.gcmp.replays++;
- return RX_DROP_UNUSABLE;
- }
+ if (memcmp(pn, key->u.gcmp.rx_pn[queue],
+ IEEE80211_GCMP_PN_LEN) <= 0) {
+ key->u.gcmp.replays++;
+ return RX_DROP_UNUSABLE;
+ }
- if (!(status->flag & RX_FLAG_DECRYPTED)) {
- u8 aad[2 * AES_BLOCK_SIZE];
- u8 j_0[AES_BLOCK_SIZE];
- /* hardware didn't decrypt/verify MIC */
- gcmp_special_blocks(skb, pn, j_0, aad);
+ if (!(status->flag & RX_FLAG_DECRYPTED)) {
+ u8 aad[2 * AES_BLOCK_SIZE];
+ u8 j_0[AES_BLOCK_SIZE];
+ /* hardware didn't decrypt/verify MIC */
+ gcmp_special_blocks(skb, pn, j_0, aad);
+
+ if (ieee80211_aes_gcm_decrypt(
+ key->u.gcmp.tfm, j_0, aad,
+ skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN,
+ data_len,
+ skb->data + skb->len -
+ IEEE80211_GCMP_MIC_LEN))
+ return RX_DROP_UNUSABLE;
+ }
- if (ieee80211_aes_gcm_decrypt(
- key->u.gcmp.tfm, j_0, aad,
- skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN,
- data_len,
- skb->data + skb->len - IEEE80211_GCMP_MIC_LEN))
- return RX_DROP_UNUSABLE;
+ memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN);
}
- memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN);
-
/* Remove GCMP header and MIC */
if (pskb_trim(skb, skb->len - IEEE80211_GCMP_MIC_LEN))
return RX_DROP_UNUSABLE;
@@ -940,7 +950,7 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx)
mmie->key_id = cpu_to_le16(key->conf.keyidx);
/* PN = PN + 1 */
- pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn);
+ pn64 = atomic64_inc_return(&key->conf.tx_pn);
bip_ipn_set64(mmie->sequence_number, pn64);
@@ -984,7 +994,7 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx)
mmie->key_id = cpu_to_le16(key->conf.keyidx);
/* PN = PN + 1 */
- pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn);
+ pn64 = atomic64_inc_return(&key->conf.tx_pn);
bip_ipn_set64(mmie->sequence_number, pn64);
@@ -1129,7 +1139,7 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx)
mmie->key_id = cpu_to_le16(key->conf.keyidx);
/* PN = PN + 1 */
- pn64 = atomic64_inc_return(&key->u.aes_gmac.tx_pn);
+ pn64 = atomic64_inc_return(&key->conf.tx_pn);
bip_ipn_set64(mmie->sequence_number, pn64);
diff --git a/kernel/net/mac802154/Kconfig b/kernel/net/mac802154/Kconfig
index aa462b480..fb45287eb 100644
--- a/kernel/net/mac802154/Kconfig
+++ b/kernel/net/mac802154/Kconfig
@@ -2,6 +2,7 @@ config MAC802154
tristate "Generic IEEE 802.15.4 Soft Networking Stack (mac802154)"
depends on IEEE802154
select CRC_CCITT
+ select CRYPTO
select CRYPTO_AUTHENC
select CRYPTO_CCM
select CRYPTO_CTR
diff --git a/kernel/net/mac802154/Makefile b/kernel/net/mac802154/Makefile
index 702d8b466..17a51e838 100644
--- a/kernel/net/mac802154/Makefile
+++ b/kernel/net/mac802154/Makefile
@@ -1,5 +1,7 @@
obj-$(CONFIG_MAC802154) += mac802154.o
mac802154-objs := main.o rx.o tx.o mac_cmd.o mib.o \
- iface.o llsec.o util.o cfg.o
+ iface.o llsec.o util.o cfg.o trace.o
+
+CFLAGS_trace.o := -I$(src)
ccflags-y += -D__CHECK_ENDIAN__
diff --git a/kernel/net/mac802154/cfg.c b/kernel/net/mac802154/cfg.c
index 70be9c799..57b5e9447 100644
--- a/kernel/net/mac802154/cfg.c
+++ b/kernel/net/mac802154/cfg.c
@@ -44,6 +44,49 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy,
ieee802154_if_remove(sdata);
}
+#ifdef CONFIG_PM
+static int ieee802154_suspend(struct wpan_phy *wpan_phy)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+
+ if (!local->open_count)
+ goto suspend;
+
+ ieee802154_stop_queue(&local->hw);
+ synchronize_net();
+
+ /* stop hardware - this must stop RX */
+ ieee802154_stop_device(local);
+
+suspend:
+ local->suspended = true;
+ return 0;
+}
+
+static int ieee802154_resume(struct wpan_phy *wpan_phy)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+ int ret;
+
+ /* nothing to do if HW shouldn't run */
+ if (!local->open_count)
+ goto wake_up;
+
+ /* restart hardware */
+ ret = drv_start(local);
+ if (ret)
+ return ret;
+
+wake_up:
+ ieee802154_wake_queue(&local->hw);
+ local->suspended = false;
+ return 0;
+}
+#else
+#define ieee802154_suspend NULL
+#define ieee802154_resume NULL
+#endif
+
static int
ieee802154_add_iface(struct wpan_phy *phy, const char *name,
unsigned char name_assign_type,
@@ -73,9 +116,9 @@ ieee802154_set_channel(struct wpan_phy *wpan_phy, u8 page, u8 channel)
ASSERT_RTNL();
- /* check if phy support this setting */
- if (!(wpan_phy->channels_supported[page] & BIT(channel)))
- return -EINVAL;
+ if (wpan_phy->current_page == page &&
+ wpan_phy->current_channel == channel)
+ return 0;
ret = drv_set_channel(local, page, channel);
if (!ret) {
@@ -95,9 +138,8 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy,
ASSERT_RTNL();
- /* check if phy support this setting */
- if (!(local->hw.flags & IEEE802154_HW_CCA_MODE))
- return -EOPNOTSUPP;
+ if (wpan_phy_cca_cmp(&wpan_phy->cca, cca))
+ return 0;
ret = drv_set_cca_mode(local, cca);
if (!ret)
@@ -107,23 +149,57 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy,
}
static int
+ieee802154_set_cca_ed_level(struct wpan_phy *wpan_phy, s32 ed_level)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+ int ret;
+
+ ASSERT_RTNL();
+
+ if (wpan_phy->cca_ed_level == ed_level)
+ return 0;
+
+ ret = drv_set_cca_ed_level(local, ed_level);
+ if (!ret)
+ wpan_phy->cca_ed_level = ed_level;
+
+ return ret;
+}
+
+static int
+ieee802154_set_tx_power(struct wpan_phy *wpan_phy, s32 power)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+ int ret;
+
+ ASSERT_RTNL();
+
+ if (wpan_phy->transmit_power == power)
+ return 0;
+
+ ret = drv_set_tx_power(local, power);
+ if (!ret)
+ wpan_phy->transmit_power = power;
+
+ return ret;
+}
+
+static int
ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
__le16 pan_id)
{
+ int ret;
+
ASSERT_RTNL();
- /* TODO
- * I am not sure about to check here on broadcast pan_id.
- * Broadcast is a valid setting, comment from 802.15.4:
- * If this value is 0xffff, the device is not associated.
- *
- * This could useful to simple deassociate an device.
- */
- if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST))
- return -EINVAL;
-
- wpan_dev->pan_id = pan_id;
- return 0;
+ if (wpan_dev->pan_id == pan_id)
+ return 0;
+
+ ret = mac802154_wpan_update_llsec(wpan_dev->netdev);
+ if (!ret)
+ wpan_dev->pan_id = pan_id;
+
+ return ret;
}
static int
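
The reworked cfg802154 setters above (channel, CCA mode, CCA ED level, TX power, PAN id) share one pattern: return early when the requested value already matches the cached one, call into the driver, and update the cache only if the driver reported success. A generic sketch of that pattern with an invented driver hook:

#include <stdio.h>

/* hypothetical driver hook; returns 0 on success */
static int drv_set_ed_level(int mbm)
{
    printf("driver programs CCA ED level %d mbm\n", mbm);
    return 0;
}

struct phy {
    int cca_ed_level;   /* cached value, in mbm */
};

static int set_cca_ed_level(struct phy *phy, int mbm)
{
    int ret;

    if (phy->cca_ed_level == mbm)   /* nothing to do */
        return 0;

    ret = drv_set_ed_level(mbm);
    if (!ret)
        phy->cca_ed_level = mbm;    /* cache only on success */

    return ret;
}

int main(void)
{
    struct phy phy = { .cca_ed_level = -3200 };

    set_cca_ed_level(&phy, -3200);  /* no-op, driver not called */
    set_cca_ed_level(&phy, -7700);  /* programs and caches -7700 */
    printf("cached level: %d mbm\n", phy.cca_ed_level);
    return 0;
}
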
@@ -131,13 +207,8 @@ ieee802154_set_backoff_exponent(struct wpan_phy *wpan_phy,
struct wpan_dev *wpan_dev,
u8 min_be, u8 max_be)
{
- struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
-
ASSERT_RTNL();
- if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS))
- return -EOPNOTSUPP;
-
wpan_dev->min_be = min_be;
wpan_dev->max_be = max_be;
return 0;
@@ -149,21 +220,6 @@ ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
{
ASSERT_RTNL();
- /* TODO
- * I am not sure about to check here on broadcast short_addr.
- * Broadcast is a valid setting, comment from 802.15.4:
- * A value of 0xfffe indicates that the device has
- * associated but has not been allocated an address. A
- * value of 0xffff indicates that the device does not
- * have a short address.
- *
- * I think we should allow to set these settings but
- * don't allow to allow socket communication with it.
- */
- if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) ||
- short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST))
- return -EINVAL;
-
wpan_dev->short_addr = short_addr;
return 0;
}
@@ -173,13 +229,8 @@ ieee802154_set_max_csma_backoffs(struct wpan_phy *wpan_phy,
struct wpan_dev *wpan_dev,
u8 max_csma_backoffs)
{
- struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
-
ASSERT_RTNL();
- if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS))
- return -EOPNOTSUPP;
-
wpan_dev->csma_retries = max_csma_backoffs;
return 0;
}
@@ -189,13 +240,8 @@ ieee802154_set_max_frame_retries(struct wpan_phy *wpan_phy,
struct wpan_dev *wpan_dev,
s8 max_frame_retries)
{
- struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
-
ASSERT_RTNL();
- if (!(local->hw.flags & IEEE802154_HW_FRAME_RETRIES))
- return -EOPNOTSUPP;
-
wpan_dev->frame_retries = max_frame_retries;
return 0;
}
@@ -204,28 +250,243 @@ static int
ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
bool mode)
{
- struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
-
ASSERT_RTNL();
- if (!(local->hw.flags & IEEE802154_HW_LBT))
- return -EOPNOTSUPP;
-
wpan_dev->lbt = mode;
return 0;
}
+static int
+ieee802154_set_ackreq_default(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev, bool ackreq)
+{
+ ASSERT_RTNL();
+
+ wpan_dev->ackreq = ackreq;
+ return 0;
+}
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+static void
+ieee802154_get_llsec_table(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_llsec_table **table)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ *table = &sdata->sec.table;
+}
+
+static void
+ieee802154_lock_llsec_table(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ mutex_lock(&sdata->sec_mtx);
+}
+
+static void
+ieee802154_unlock_llsec_table(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ mutex_unlock(&sdata->sec_mtx);
+}
+
+static int
+ieee802154_set_llsec_params(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_params *params,
+ int changed)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_set_params(&sdata->sec, params, changed);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+static int
+ieee802154_get_llsec_params(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_llsec_params *params)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_get_params(&sdata->sec, params);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+static int
+ieee802154_add_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_key_id *id,
+ const struct ieee802154_llsec_key *key)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_key_add(&sdata->sec, id, key);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+static int
+ieee802154_del_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_key_id *id)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_key_del(&sdata->sec, id);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+static int
+ieee802154_add_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_seclevel_add(&sdata->sec, sl);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+static int
+ieee802154_del_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_seclevel_del(&sdata->sec, sl);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+static int
+ieee802154_add_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_device *dev_desc)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_dev_add(&sdata->sec, dev_desc);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+static int
+ieee802154_del_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le64 extended_addr)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_dev_del(&sdata->sec, extended_addr);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+static int
+ieee802154_add_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le64 extended_addr,
+ const struct ieee802154_llsec_device_key *key)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_devkey_add(&sdata->sec, extended_addr, key);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+static int
+ieee802154_del_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le64 extended_addr,
+ const struct ieee802154_llsec_device_key *key)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_devkey_del(&sdata->sec, extended_addr, key);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
const struct cfg802154_ops mac802154_config_ops = {
.add_virtual_intf_deprecated = ieee802154_add_iface_deprecated,
.del_virtual_intf_deprecated = ieee802154_del_iface_deprecated,
+ .suspend = ieee802154_suspend,
+ .resume = ieee802154_resume,
.add_virtual_intf = ieee802154_add_iface,
.del_virtual_intf = ieee802154_del_iface,
.set_channel = ieee802154_set_channel,
.set_cca_mode = ieee802154_set_cca_mode,
+ .set_cca_ed_level = ieee802154_set_cca_ed_level,
+ .set_tx_power = ieee802154_set_tx_power,
.set_pan_id = ieee802154_set_pan_id,
.set_short_addr = ieee802154_set_short_addr,
.set_backoff_exponent = ieee802154_set_backoff_exponent,
.set_max_csma_backoffs = ieee802154_set_max_csma_backoffs,
.set_max_frame_retries = ieee802154_set_max_frame_retries,
.set_lbt_mode = ieee802154_set_lbt_mode,
+ .set_ackreq_default = ieee802154_set_ackreq_default,
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+ .get_llsec_table = ieee802154_get_llsec_table,
+ .lock_llsec_table = ieee802154_lock_llsec_table,
+ .unlock_llsec_table = ieee802154_unlock_llsec_table,
+ /* TODO above */
+ .set_llsec_params = ieee802154_set_llsec_params,
+ .get_llsec_params = ieee802154_get_llsec_params,
+ .add_llsec_key = ieee802154_add_llsec_key,
+ .del_llsec_key = ieee802154_del_llsec_key,
+ .add_seclevel = ieee802154_add_seclevel,
+ .del_seclevel = ieee802154_del_seclevel,
+ .add_device = ieee802154_add_device,
+ .del_device = ieee802154_del_device,
+ .add_devkey = ieee802154_add_devkey,
+ .del_devkey = ieee802154_del_devkey,
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
};
diff --git a/kernel/net/mac802154/driver-ops.h b/kernel/net/mac802154/driver-ops.h
index a0533357b..0550f3365 100644
--- a/kernel/net/mac802154/driver-ops.h
+++ b/kernel/net/mac802154/driver-ops.h
@@ -7,6 +7,7 @@
#include <net/mac802154.h>
#include "ieee802154_i.h"
+#include "trace.h"
static inline int
drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb)
@@ -27,19 +28,25 @@ drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb)
static inline int drv_start(struct ieee802154_local *local)
{
+ int ret;
+
might_sleep();
+ trace_802154_drv_start(local);
local->started = true;
smp_mb();
-
- return local->ops->start(&local->hw);
+ ret = local->ops->start(&local->hw);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline void drv_stop(struct ieee802154_local *local)
{
might_sleep();
+ trace_802154_drv_stop(local);
local->ops->stop(&local->hw);
+ trace_802154_drv_return_void(local);
/* sync away all work on the tasklet before clearing started */
tasklet_disable(&local->tasklet);
@@ -53,13 +60,20 @@ static inline void drv_stop(struct ieee802154_local *local)
static inline int
drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel)
{
+ int ret;
+
might_sleep();
- return local->ops->set_channel(&local->hw, page, channel);
+ trace_802154_drv_set_channel(local, page, channel);
+ ret = local->ops->set_channel(&local->hw, page, channel);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
-static inline int drv_set_tx_power(struct ieee802154_local *local, s8 dbm)
+static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm)
{
+ int ret;
+
might_sleep();
if (!local->ops->set_txpower) {
@@ -67,12 +81,17 @@ static inline int drv_set_tx_power(struct ieee802154_local *local, s8 dbm)
return -EOPNOTSUPP;
}
- return local->ops->set_txpower(&local->hw, dbm);
+ trace_802154_drv_set_tx_power(local, mbm);
+ ret = local->ops->set_txpower(&local->hw, mbm);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int drv_set_cca_mode(struct ieee802154_local *local,
const struct wpan_phy_cca *cca)
{
+ int ret;
+
might_sleep();
if (!local->ops->set_cca_mode) {
@@ -80,11 +99,16 @@ static inline int drv_set_cca_mode(struct ieee802154_local *local,
return -EOPNOTSUPP;
}
- return local->ops->set_cca_mode(&local->hw, cca);
+ trace_802154_drv_set_cca_mode(local, cca);
+ ret = local->ops->set_cca_mode(&local->hw, cca);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode)
{
+ int ret;
+
might_sleep();
if (!local->ops->set_lbt) {
@@ -92,12 +116,17 @@ static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode)
return -EOPNOTSUPP;
}
- return local->ops->set_lbt(&local->hw, mode);
+ trace_802154_drv_set_lbt_mode(local, mode);
+ ret = local->ops->set_lbt(&local->hw, mode);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int
-drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level)
+drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm)
{
+ int ret;
+
might_sleep();
if (!local->ops->set_cca_ed_level) {
@@ -105,12 +134,16 @@ drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level)
return -EOPNOTSUPP;
}
- return local->ops->set_cca_ed_level(&local->hw, ed_level);
+ trace_802154_drv_set_cca_ed_level(local, mbm);
+ ret = local->ops->set_cca_ed_level(&local->hw, mbm);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id)
{
struct ieee802154_hw_addr_filt filt;
+ int ret;
might_sleep();
@@ -121,14 +154,18 @@ static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id)
filt.pan_id = pan_id;
- return local->ops->set_hw_addr_filt(&local->hw, &filt,
+ trace_802154_drv_set_pan_id(local, pan_id);
+ ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
IEEE802154_AFILT_PANID_CHANGED);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int
drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr)
{
struct ieee802154_hw_addr_filt filt;
+ int ret;
might_sleep();
@@ -139,14 +176,18 @@ drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr)
filt.ieee_addr = extended_addr;
- return local->ops->set_hw_addr_filt(&local->hw, &filt,
+ trace_802154_drv_set_extended_addr(local, extended_addr);
+ ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
IEEE802154_AFILT_IEEEADDR_CHANGED);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int
drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr)
{
struct ieee802154_hw_addr_filt filt;
+ int ret;
might_sleep();
@@ -157,14 +198,18 @@ drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr)
filt.short_addr = short_addr;
- return local->ops->set_hw_addr_filt(&local->hw, &filt,
+ trace_802154_drv_set_short_addr(local, short_addr);
+ ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
IEEE802154_AFILT_SADDR_CHANGED);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int
drv_set_pan_coord(struct ieee802154_local *local, bool is_coord)
{
struct ieee802154_hw_addr_filt filt;
+ int ret;
might_sleep();
@@ -175,14 +220,19 @@ drv_set_pan_coord(struct ieee802154_local *local, bool is_coord)
filt.pan_coord = is_coord;
- return local->ops->set_hw_addr_filt(&local->hw, &filt,
+ trace_802154_drv_set_pan_coord(local, is_coord);
+ ret = local->ops->set_hw_addr_filt(&local->hw, &filt,
IEEE802154_AFILT_PANC_CHANGED);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int
drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be,
u8 max_csma_backoffs)
{
+ int ret;
+
might_sleep();
if (!local->ops->set_csma_params) {
@@ -190,13 +240,19 @@ drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be,
return -EOPNOTSUPP;
}
- return local->ops->set_csma_params(&local->hw, min_be, max_be,
+ trace_802154_drv_set_csma_params(local, min_be, max_be,
+ max_csma_backoffs);
+ ret = local->ops->set_csma_params(&local->hw, min_be, max_be,
max_csma_backoffs);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int
drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries)
{
+ int ret;
+
might_sleep();
if (!local->ops->set_frame_retries) {
@@ -204,12 +260,17 @@ drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries)
return -EOPNOTSUPP;
}
- return local->ops->set_frame_retries(&local->hw, max_frame_retries);
+ trace_802154_drv_set_max_frame_retries(local, max_frame_retries);
+ ret = local->ops->set_frame_retries(&local->hw, max_frame_retries);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
static inline int
drv_set_promiscuous_mode(struct ieee802154_local *local, bool on)
{
+ int ret;
+
might_sleep();
if (!local->ops->set_promiscuous_mode) {
@@ -217,7 +278,10 @@ drv_set_promiscuous_mode(struct ieee802154_local *local, bool on)
return -EOPNOTSUPP;
}
- return local->ops->set_promiscuous_mode(&local->hw, on);
+ trace_802154_drv_set_promiscuous_mode(local, on);
+ ret = local->ops->set_promiscuous_mode(&local->hw, on);
+ trace_802154_drv_return_int(local, ret);
+ return ret;
}
#endif /* __MAC802154_DRIVER_OPS */
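
Each drv_* wrapper in driver-ops.h now brackets the hardware callback with an entry tracepoint and a return-value tracepoint instead of returning the callback's result directly. The shape of that wrapping, sketched with printf standing in for the generated trace_802154_drv_* events (all names here are invented for the sketch):

#include <stdio.h>

/* stand-ins for the generated tracepoints */
#define trace_entry(name, fmt, ...) \
    printf("drv_%s(" fmt ")\n", name, __VA_ARGS__)
#define trace_return_int(ret) \
    printf(" -> %d\n", ret)

/* hypothetical hardware op */
static int hw_set_channel(unsigned page, unsigned channel)
{
    return (page == 0 && channel <= 26) ? 0 : -1;
}

static int drv_set_channel(unsigned page, unsigned channel)
{
    int ret;

    trace_entry("set_channel", "page %u, channel %u", page, channel);
    ret = hw_set_channel(page, channel);
    trace_return_int(ret);
    return ret;
}

int main(void)
{
    drv_set_channel(0, 11);
    drv_set_channel(0, 42);
    return 0;
}
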
diff --git a/kernel/net/mac802154/ieee802154_i.h b/kernel/net/mac802154/ieee802154_i.h
index 127ba1838..56ccffa3f 100644
--- a/kernel/net/mac802154/ieee802154_i.h
+++ b/kernel/net/mac802154/ieee802154_i.h
@@ -56,9 +56,13 @@ struct ieee802154_local {
struct hrtimer ifs_timer;
bool started;
+ bool suspended;
struct tasklet_struct tasklet;
struct sk_buff_head skb_queue;
+
+ struct sk_buff *tx_skb;
+ struct work_struct tx_work;
};
enum {
@@ -86,20 +90,14 @@ struct ieee802154_sub_if_data {
unsigned long state;
char name[IFNAMSIZ];
- spinlock_t mib_lock;
-
/* protects sec from concurrent access by netlink. access by
* encrypt/decrypt/header_create safe without additional protection.
*/
struct mutex sec_mtx;
struct mac802154_llsec sec;
- /* must be last, dynamically sized area in this! */
- struct ieee802154_vif vif;
};
-#define MAC802154_CHAN_NONE 0xff /* No channel is assigned */
-
/* utility functions/constants */
extern const void *const mac802154_wpan_phy_privid; /* for wpan_phy privid */
@@ -129,6 +127,8 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata)
extern struct ieee802154_mlme_ops mac802154_mlme_wpan;
+void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb);
+void ieee802154_xmit_worker(struct work_struct *work);
netdev_tx_t
ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev);
netdev_tx_t
@@ -136,12 +136,7 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev);
enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer);
/* MIB callbacks */
-void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val);
-__le16 mac802154_dev_get_short_addr(const struct net_device *dev);
-__le16 mac802154_dev_get_pan_id(const struct net_device *dev);
-void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val);
void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan);
-u8 mac802154_dev_get_dsn(const struct net_device *dev);
int mac802154_get_params(struct net_device *dev,
struct ieee802154_llsec_params *params);
@@ -176,6 +171,8 @@ void mac802154_get_table(struct net_device *dev,
struct ieee802154_llsec_table **t);
void mac802154_unlock_table(struct net_device *dev);
+int mac802154_wpan_update_llsec(struct net_device *dev);
+
/* interface handling */
int ieee802154_iface_init(void);
void ieee802154_iface_exit(void);
@@ -185,5 +182,6 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
unsigned char name_assign_type, enum nl802154_iftype type,
__le64 extended_addr);
void ieee802154_remove_interfaces(struct ieee802154_local *local);
+void ieee802154_stop_device(struct ieee802154_local *local);
#endif /* __IEEE802154_I_H */
diff --git a/kernel/net/mac802154/iface.c b/kernel/net/mac802154/iface.c
index 91b75abbd..7079cd32a 100644
--- a/kernel/net/mac802154/iface.c
+++ b/kernel/net/mac802154/iface.c
@@ -30,7 +30,7 @@
#include "ieee802154_i.h"
#include "driver-ops.h"
-static int mac802154_wpan_update_llsec(struct net_device *dev)
+int mac802154_wpan_update_llsec(struct net_device *dev)
{
struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
@@ -62,9 +62,10 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
(struct sockaddr_ieee802154 *)&ifr->ifr_addr;
int err = -ENOIOCTLCMD;
- ASSERT_RTNL();
+ if (cmd != SIOCGIFADDR && cmd != SIOCSIFADDR)
+ return err;
- spin_lock_bh(&sdata->mib_lock);
+ rtnl_lock();
switch (cmd) {
case SIOCGIFADDR:
@@ -89,7 +90,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
}
case SIOCSIFADDR:
if (netif_running(dev)) {
- spin_unlock_bh(&sdata->mib_lock);
+ rtnl_unlock();
return -EBUSY;
}
@@ -111,7 +112,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
break;
}
- spin_unlock_bh(&sdata->mib_lock);
+ rtnl_unlock();
return err;
}
@@ -124,29 +125,97 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p)
if (netif_running(dev))
return -EBUSY;
+ /* the lowpan interface needs to be down so that its
+ * SLAAC address is updated on the next ifup
+ */
+ if (sdata->wpan_dev.lowpan_dev) {
+ if (netif_running(sdata->wpan_dev.lowpan_dev))
+ return -EBUSY;
+ }
+
ieee802154_be64_to_le64(&extended_addr, addr->sa_data);
- if (!ieee802154_is_valid_extended_addr(extended_addr))
+ if (!ieee802154_is_valid_extended_unicast_addr(extended_addr))
return -EINVAL;
memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
sdata->wpan_dev.extended_addr = extended_addr;
+ /* update lowpan interface mac address when
+ * wpan mac has been changed
+ */
+ if (sdata->wpan_dev.lowpan_dev)
+ memcpy(sdata->wpan_dev.lowpan_dev->dev_addr, dev->dev_addr,
+ dev->addr_len);
+
return mac802154_wpan_update_llsec(dev);
}
+static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata)
+{
+ struct ieee802154_local *local = sdata->local;
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ int ret;
+
+ if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) {
+ ret = drv_set_promiscuous_mode(local,
+ wpan_dev->promiscuous_mode);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_AFILT) {
+ ret = drv_set_pan_id(local, wpan_dev->pan_id);
+ if (ret < 0)
+ return ret;
+
+ ret = drv_set_extended_addr(local, wpan_dev->extended_addr);
+ if (ret < 0)
+ return ret;
+
+ ret = drv_set_short_addr(local, wpan_dev->short_addr);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_LBT) {
+ ret = drv_set_lbt_mode(local, wpan_dev->lbt);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) {
+ ret = drv_set_csma_params(local, wpan_dev->min_be,
+ wpan_dev->max_be,
+ wpan_dev->csma_retries);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) {
+ ret = drv_set_max_frame_retries(local, wpan_dev->frame_retries);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
static int mac802154_slave_open(struct net_device *dev)
{
struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
struct ieee802154_local *local = sdata->local;
- int res = 0;
+ int res;
ASSERT_RTNL();
set_bit(SDATA_STATE_RUNNING, &sdata->state);
if (!local->open_count) {
+ res = ieee802154_setup_hw(sdata);
+ if (res)
+ goto err;
+
res = drv_start(local);
- WARN_ON(res);
if (res)
goto err;
}
@@ -218,8 +287,8 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata,
* exist really an use case if we need to support
* multiple node types at the same time.
*/
- if (sdata->vif.type == NL802154_IFTYPE_NODE &&
- nsdata->vif.type == NL802154_IFTYPE_NODE)
+ if (wpan_dev->iftype == NL802154_IFTYPE_NODE &&
+ nsdata->wpan_dev.iftype == NL802154_IFTYPE_NODE)
return -EBUSY;
/* check all phy mac sublayer settings are the same.
@@ -239,67 +308,13 @@ static int mac802154_wpan_open(struct net_device *dev)
{
int rc;
struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
- struct ieee802154_local *local = sdata->local;
struct wpan_dev *wpan_dev = &sdata->wpan_dev;
- struct wpan_phy *phy = sdata->local->phy;
-
- rc = ieee802154_check_concurrent_iface(sdata, sdata->vif.type);
- if (rc < 0)
- return rc;
- rc = mac802154_slave_open(dev);
+ rc = ieee802154_check_concurrent_iface(sdata, wpan_dev->iftype);
if (rc < 0)
return rc;
- mutex_lock(&phy->pib_lock);
-
- if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) {
- rc = drv_set_promiscuous_mode(local,
- wpan_dev->promiscuous_mode);
- if (rc < 0)
- goto out;
- }
-
- if (local->hw.flags & IEEE802154_HW_AFILT) {
- rc = drv_set_pan_id(local, wpan_dev->pan_id);
- if (rc < 0)
- goto out;
-
- rc = drv_set_extended_addr(local, wpan_dev->extended_addr);
- if (rc < 0)
- goto out;
-
- rc = drv_set_short_addr(local, wpan_dev->short_addr);
- if (rc < 0)
- goto out;
- }
-
- if (local->hw.flags & IEEE802154_HW_LBT) {
- rc = drv_set_lbt_mode(local, wpan_dev->lbt);
- if (rc < 0)
- goto out;
- }
-
- if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) {
- rc = drv_set_csma_params(local, wpan_dev->min_be,
- wpan_dev->max_be,
- wpan_dev->csma_retries);
- if (rc < 0)
- goto out;
- }
-
- if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) {
- rc = drv_set_max_frame_retries(local, wpan_dev->frame_retries);
- if (rc < 0)
- goto out;
- }
-
- mutex_unlock(&phy->pib_lock);
- return 0;
-
-out:
- mutex_unlock(&phy->pib_lock);
- return rc;
+ return mac802154_slave_open(dev);
}
static int mac802154_slave_close(struct net_device *dev)
@@ -309,15 +324,13 @@ static int mac802154_slave_close(struct net_device *dev)
ASSERT_RTNL();
- hrtimer_cancel(&local->ifs_timer);
-
netif_stop_queue(dev);
local->open_count--;
clear_bit(SDATA_STATE_RUNNING, &sdata->state);
if (!local->open_count)
- drv_stop(local);
+ ieee802154_stop_device(local);
return 0;
}
@@ -354,12 +367,11 @@ static int mac802154_set_header_security(struct ieee802154_sub_if_data *sdata,
return 0;
}
-static int mac802154_header_create(struct sk_buff *skb,
- struct net_device *dev,
- unsigned short type,
- const void *daddr,
- const void *saddr,
- unsigned len)
+static int ieee802154_header_create(struct sk_buff *skb,
+ struct net_device *dev,
+ const struct ieee802154_addr *daddr,
+ const struct ieee802154_addr *saddr,
+ unsigned len)
{
struct ieee802154_hdr hdr;
struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
@@ -374,14 +386,12 @@ static int mac802154_header_create(struct sk_buff *skb,
hdr.fc.type = cb->type;
hdr.fc.security_enabled = cb->secen;
hdr.fc.ack_request = cb->ackreq;
- hdr.seq = ieee802154_mlme_ops(dev)->get_dsn(dev);
+ hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF;
if (mac802154_set_header_security(sdata, &hdr, cb) < 0)
return -EINVAL;
if (!saddr) {
- spin_lock_bh(&sdata->mib_lock);
-
if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) ||
wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) ||
wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) {
@@ -393,8 +403,6 @@ static int mac802154_header_create(struct sk_buff *skb,
}
hdr.source.pan_id = wpan_dev->pan_id;
-
- spin_unlock_bh(&sdata->mib_lock);
} else {
hdr.source = *(const struct ieee802154_addr *)saddr;
}
@@ -414,24 +422,89 @@ static int mac802154_header_create(struct sk_buff *skb,
return hlen;
}
+static const struct wpan_dev_header_ops ieee802154_header_ops = {
+ .create = ieee802154_header_create,
+};
+
+/* This header create functionality assumes an 8 byte array for the
+ * source and destination pointers at most. To adapt this to the
+ * 802.15.4 data frame header we use extended addressing and
+ * intra-PAN connections only here; the fc fields are mostly fallback
+ * handling. This provides dev_hard_header for dgram sockets.
+ */
+static int mac802154_header_create(struct sk_buff *skb,
+ struct net_device *dev,
+ unsigned short type,
+ const void *daddr,
+ const void *saddr,
+ unsigned len)
+{
+ struct ieee802154_hdr hdr;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ struct ieee802154_mac_cb cb = { };
+ int hlen;
+
+ if (!daddr)
+ return -EINVAL;
+
+ memset(&hdr.fc, 0, sizeof(hdr.fc));
+ hdr.fc.type = IEEE802154_FC_TYPE_DATA;
+ hdr.fc.ack_request = wpan_dev->ackreq;
+ hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF;
+
+ /* TODO currently a workaround: pass a zeroed cb block so that the
+ * security parameters default according to the MIB.
+ */
+ if (mac802154_set_header_security(sdata, &hdr, &cb) < 0)
+ return -EINVAL;
+
+ hdr.dest.pan_id = wpan_dev->pan_id;
+ hdr.dest.mode = IEEE802154_ADDR_LONG;
+ ieee802154_be64_to_le64(&hdr.dest.extended_addr, daddr);
+
+ hdr.source.pan_id = hdr.dest.pan_id;
+ hdr.source.mode = IEEE802154_ADDR_LONG;
+
+ if (!saddr)
+ hdr.source.extended_addr = wpan_dev->extended_addr;
+ else
+ ieee802154_be64_to_le64(&hdr.source.extended_addr, saddr);
+
+ hlen = ieee802154_hdr_push(skb, &hdr);
+ if (hlen < 0)
+ return -EINVAL;
+
+ skb_reset_mac_header(skb);
+ skb->mac_len = hlen;
+
+ if (len > ieee802154_max_payload(&hdr))
+ return -EMSGSIZE;
+
+ return hlen;
+}
+
static int
mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
struct ieee802154_hdr hdr;
- struct ieee802154_addr *addr = (struct ieee802154_addr *)haddr;
if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) {
pr_debug("malformed packet\n");
return 0;
}
- *addr = hdr.source;
- return sizeof(*addr);
+ if (hdr.source.mode == IEEE802154_ADDR_LONG) {
+ ieee802154_le64_to_be64(haddr, &hdr.source.extended_addr);
+ return IEEE802154_EXTENDED_ADDR_LEN;
+ }
+
+ return 0;
}
-static struct header_ops mac802154_header_ops = {
- .create = mac802154_header_create,
- .parse = mac802154_header_parse,
+static const struct header_ops mac802154_header_ops = {
+ .create = mac802154_header_create,
+ .parse = mac802154_header_parse,
};
static const struct net_device_ops mac802154_wpan_ops = {
@@ -462,9 +535,29 @@ static void ieee802154_if_setup(struct net_device *dev)
dev->addr_len = IEEE802154_EXTENDED_ADDR_LEN;
memset(dev->broadcast, 0xff, IEEE802154_EXTENDED_ADDR_LEN);
- dev->hard_header_len = MAC802154_FRAME_HARD_HEADER_LEN;
- dev->needed_tailroom = 2 + 16; /* FCS + MIC */
- dev->mtu = IEEE802154_MTU;
+ /* Set hard_header_len to IEEE802154_MIN_HEADER_LEN - 1. AF_PACKET
+ * will not send frames without any payload, but ack frames have no
+ * payload, so subtract one so that a 3 byte frame can still be sent.
+ * The xmit callback assumes at least a hard header where the two
+ * byte fc and the sequence field are set.
+ */
+ dev->hard_header_len = IEEE802154_MIN_HEADER_LEN - 1;
+ /* The auth_tag is for security and is placed in the private payload
+ * room of the mac frame, which sits between the payload and the FCS.
+ */
+ dev->needed_tailroom = IEEE802154_MAX_AUTH_TAG_LEN +
+ IEEE802154_FCS_LEN;
+ /* Here the mtu is the payload without the mac header. The header
+ * has a dynamic length with a minimum of hard_header_len bytes, so
+ * set the mtu to the maximum payload size, which is IEEE802154_MTU -
+ * IEEE802154_FCS_LEN - hard_header_len. The FCS is added by hardware
+ * or by ndo_start_xmit, and the minimum mac header can be accounted
+ * for inside the driver layer. Any mac header bytes beyond
+ * hard_header_len count as part of the payload.
+ */
+ dev->mtu = IEEE802154_MTU - IEEE802154_FCS_LEN -
+ dev->hard_header_len;
dev->tx_queue_len = 300;
dev->flags = IFF_NOARP | IFF_BROADCAST;
}
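
The new header/tailroom/mtu setup above is pure arithmetic over the 802.15.4 frame layout. Assuming the usual constants (127 byte PSDU, 2 byte FCS, 3 byte minimum MAC header, 16 byte maximum auth tag; treat these as illustrative values rather than the header definitions), the numbers work out as in this sketch:

#include <stdio.h>

int main(void)
{
    /* illustrative 802.15.4 constants */
    int mtu_phy = 127;          /* maximum PSDU size */
    int fcs_len = 2;            /* frame check sequence */
    int min_header_len = 3;     /* 2 byte fc + 1 byte sequence */
    int max_auth_tag_len = 16;  /* largest MIC the llsec code may add */

    /* minus one so AF_PACKET can still send a 3 byte (ack sized) frame */
    int hard_header_len = min_header_len - 1;
    int needed_tailroom = max_auth_tag_len + fcs_len;
    int mtu = mtu_phy - fcs_len - hard_header_len;

    printf("hard_header_len = %d\n", hard_header_len);   /* 2 */
    printf("needed_tailroom = %d\n", needed_tailroom);   /* 18 */
    printf("mtu             = %d\n", mtu);               /* 123 */
    return 0;
}
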
@@ -474,20 +567,22 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
enum nl802154_iftype type)
{
struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ int ret;
+ u8 tmp;
/* set some type-dependent values */
- sdata->vif.type = type;
sdata->wpan_dev.iftype = type;
- get_random_bytes(&wpan_dev->bsn, 1);
- get_random_bytes(&wpan_dev->dsn, 1);
+ get_random_bytes(&tmp, sizeof(tmp));
+ atomic_set(&wpan_dev->bsn, tmp);
+ get_random_bytes(&tmp, sizeof(tmp));
+ atomic_set(&wpan_dev->dsn, tmp);
/* defaults per 802.15.4-2011 */
wpan_dev->min_be = 3;
wpan_dev->max_be = 5;
wpan_dev->csma_retries = 4;
- /* for compatibility, actual default is 3 */
- wpan_dev->frame_retries = -1;
+ wpan_dev->frame_retries = 3;
wpan_dev->pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST);
wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
@@ -502,11 +597,15 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
sdata->dev->netdev_ops = &mac802154_wpan_ops;
sdata->dev->ml_priv = &mac802154_mlme_wpan;
wpan_dev->promiscuous_mode = false;
+ wpan_dev->header_ops = &ieee802154_header_ops;
- spin_lock_init(&sdata->mib_lock);
mutex_init(&sdata->sec_mtx);
mac802154_llsec_init(&sdata->sec);
+ ret = mac802154_wpan_update_llsec(sdata->dev);
+ if (ret < 0)
+ return ret;
+
break;
case NL802154_IFTYPE_MONITOR:
sdata->dev->destructor = free_netdev;
@@ -531,12 +630,13 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
ASSERT_RTNL();
- ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name,
+ ndev = alloc_netdev(sizeof(*sdata), name,
name_assign_type, ieee802154_if_setup);
if (!ndev)
return ERR_PTR(-ENOMEM);
- ndev->needed_headroom = local->hw.extra_tx_headroom;
+ ndev->needed_headroom = local->hw.extra_tx_headroom +
+ IEEE802154_MAX_HEADER_LEN;
ret = dev_alloc_name(ndev, ndev->name);
if (ret < 0)
@@ -547,7 +647,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
switch (type) {
case NL802154_IFTYPE_NODE:
ndev->type = ARPHRD_IEEE802154;
- if (ieee802154_is_valid_extended_addr(extended_addr))
+ if (ieee802154_is_valid_extended_unicast_addr(extended_addr))
ieee802154_le64_to_be64(ndev->dev_addr, &extended_addr);
else
memcpy(ndev->dev_addr, ndev->perm_addr,
diff --git a/kernel/net/mac802154/llsec.c b/kernel/net/mac802154/llsec.c
index 5b2be1283..a13d02b7c 100644
--- a/kernel/net/mac802154/llsec.c
+++ b/kernel/net/mac802154/llsec.c
@@ -17,8 +17,9 @@
#include <linux/err.h>
#include <linux/bug.h>
#include <linux/completion.h>
+#include <linux/crypto.h>
#include <linux/ieee802154.h>
-#include <crypto/algapi.h>
+#include <crypto/aead.h>
#include "ieee802154_i.h"
#include "llsec.h"
@@ -54,7 +55,7 @@ void mac802154_llsec_destroy(struct mac802154_llsec *sec)
msl = container_of(sl, struct mac802154_llsec_seclevel, level);
list_del(&sl->list);
- kfree(msl);
+ kzfree(msl);
}
list_for_each_entry_safe(dev, dn, &sec->table.devices, list) {
@@ -71,7 +72,7 @@ void mac802154_llsec_destroy(struct mac802154_llsec *sec)
mkey = container_of(key->key, struct mac802154_llsec_key, key);
list_del(&key->list);
llsec_key_put(mkey);
- kfree(key);
+ kzfree(key);
}
}
@@ -160,7 +161,7 @@ err_tfm:
if (key->tfm[i])
crypto_free_aead(key->tfm[i]);
- kfree(key);
+ kzfree(key);
return NULL;
}
@@ -175,7 +176,7 @@ static void llsec_key_release(struct kref *ref)
crypto_free_aead(key->tfm[i]);
crypto_free_blkcipher(key->tfm0);
- kfree(key);
+ kzfree(key);
}
static struct mac802154_llsec_key*
@@ -266,7 +267,7 @@ int mac802154_llsec_key_add(struct mac802154_llsec *sec,
return 0;
fail:
- kfree(new);
+ kzfree(new);
return -ENOMEM;
}
@@ -346,10 +347,10 @@ static void llsec_dev_free(struct mac802154_llsec_device *dev)
devkey);
list_del(&pos->list);
- kfree(devkey);
+ kzfree(devkey);
}
- kfree(dev);
+ kzfree(dev);
}
int mac802154_llsec_dev_add(struct mac802154_llsec *sec,
@@ -400,6 +401,7 @@ int mac802154_llsec_dev_del(struct mac802154_llsec *sec, __le64 device_addr)
hash_del_rcu(&pos->bucket_s);
hash_del_rcu(&pos->bucket_hw);
+ list_del_rcu(&pos->dev.list);
call_rcu(&pos->rcu, llsec_dev_free_rcu);
return 0;
@@ -649,7 +651,7 @@ llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
u8 iv[16];
unsigned char *data;
int authlen, assoclen, datalen, rc;
- struct scatterlist src, assoc[2], dst[2];
+ struct scatterlist sg;
struct aead_request *req;
authlen = ieee802154_sechdr_authtag_len(&hdr->sec);
@@ -659,34 +661,27 @@ llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
if (!req)
return -ENOMEM;
- sg_init_table(assoc, 2);
- sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len);
assoclen = skb->mac_len;
data = skb_mac_header(skb) + skb->mac_len;
datalen = skb_tail_pointer(skb) - data;
- if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) {
- sg_set_buf(&assoc[1], data, 0);
- } else {
- sg_set_buf(&assoc[1], data, datalen);
+ skb_put(skb, authlen);
+
+ sg_init_one(&sg, skb_mac_header(skb), assoclen + datalen + authlen);
+
+ if (!(hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC)) {
assoclen += datalen;
datalen = 0;
}
- sg_init_one(&src, data, datalen);
-
- sg_init_table(dst, 2);
- sg_set_buf(&dst[0], data, datalen);
- sg_set_buf(&dst[1], skb_put(skb, authlen), authlen);
-
aead_request_set_callback(req, 0, NULL, NULL);
- aead_request_set_assoc(req, assoc, assoclen);
- aead_request_set_crypt(req, &src, dst, datalen, iv);
+ aead_request_set_crypt(req, &sg, &sg, datalen, iv);
+ aead_request_set_ad(req, assoclen);
rc = crypto_aead_encrypt(req);
- kfree(req);
+ kzfree(req);
return rc;
}
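The hunk above moves llsec from the old aead_request_set_assoc() interface to the newer AEAD layout, where associated data and payload share a single scatterlist and the AD length is declared via aead_request_set_ad(). A minimal sketch of that call pattern for a flat buffer, illustrative only (assumes <crypto/aead.h> and <linux/scatterlist.h> and a caller-provided crypto_aead transform; this is not mac802154 code, and unlike the patch it frees the request with aead_request_free() rather than kzfree()):

static int aead_encrypt_flat(struct crypto_aead *tfm, u8 *buf,
			     unsigned int assoclen, unsigned int datalen,
			     u8 *iv)
{
	struct scatterlist sg;
	struct aead_request *req;
	int rc;

	req = aead_request_alloc(tfm, GFP_ATOMIC);
	if (!req)
		return -ENOMEM;

	/* one scatterlist covers AD + plaintext + room for the auth tag */
	sg_init_one(&sg, buf, assoclen + datalen + crypto_aead_authsize(tfm));

	aead_request_set_callback(req, 0, NULL, NULL);
	/* only datalen bytes are encrypted; the first assoclen bytes are AD */
	aead_request_set_crypt(req, &sg, &sg, datalen, iv);
	aead_request_set_ad(req, assoclen);

	rc = crypto_aead_encrypt(req);
	aead_request_free(req);
	return rc;
}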
@@ -858,7 +853,7 @@ llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
u8 iv[16];
unsigned char *data;
int authlen, datalen, assoclen, rc;
- struct scatterlist src, assoc[2];
+ struct scatterlist sg;
struct aead_request *req;
authlen = ieee802154_sechdr_authtag_len(&hdr->sec);
@@ -868,31 +863,25 @@ llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
if (!req)
return -ENOMEM;
- sg_init_table(assoc, 2);
- sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len);
assoclen = skb->mac_len;
data = skb_mac_header(skb) + skb->mac_len;
datalen = skb_tail_pointer(skb) - data;
- if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) {
- sg_set_buf(&assoc[1], data, 0);
- } else {
- sg_set_buf(&assoc[1], data, datalen - authlen);
+ sg_init_one(&sg, skb_mac_header(skb), assoclen + datalen);
+
+ if (!(hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC)) {
assoclen += datalen - authlen;
- data += datalen - authlen;
datalen = authlen;
}
- sg_init_one(&src, data, datalen);
-
aead_request_set_callback(req, 0, NULL, NULL);
- aead_request_set_assoc(req, assoc, assoclen);
- aead_request_set_crypt(req, &src, &src, datalen, iv);
+ aead_request_set_crypt(req, &sg, &sg, datalen, iv);
+ aead_request_set_ad(req, assoclen);
rc = crypto_aead_decrypt(req);
- kfree(req);
+ kzfree(req);
skb_trim(skb, skb->len - authlen);
return rc;
@@ -932,7 +921,7 @@ llsec_update_devkey_record(struct mac802154_llsec_device *dev,
if (!devkey)
list_add_rcu(&next->devkey.list, &dev->dev.keys);
else
- kfree(next);
+ kzfree(next);
spin_unlock_bh(&dev->lock);
}
diff --git a/kernel/net/mac802154/mac_cmd.c b/kernel/net/mac802154/mac_cmd.c
index bdccb4ecd..8606da459 100644
--- a/kernel/net/mac802154/mac_cmd.c
+++ b/kernel/net/mac802154/mac_cmd.c
@@ -36,37 +36,30 @@ static int mac802154_mlme_start_req(struct net_device *dev,
u8 pan_coord, u8 blx,
u8 coord_realign)
{
- struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
- int rc = 0;
+ struct ieee802154_llsec_params params;
+ int changed = 0;
ASSERT_RTNL();
BUG_ON(addr->mode != IEEE802154_ADDR_SHORT);
- mac802154_dev_set_pan_id(dev, addr->pan_id);
- mac802154_dev_set_short_addr(dev, addr->short_addr);
+ dev->ieee802154_ptr->pan_id = addr->pan_id;
+ dev->ieee802154_ptr->short_addr = addr->short_addr;
mac802154_dev_set_page_channel(dev, page, channel);
- if (ops->llsec) {
- struct ieee802154_llsec_params params;
- int changed = 0;
+ params.pan_id = addr->pan_id;
+ changed |= IEEE802154_LLSEC_PARAM_PAN_ID;
- params.coord_shortaddr = addr->short_addr;
- changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR;
+ params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr);
+ changed |= IEEE802154_LLSEC_PARAM_HWADDR;
- params.pan_id = addr->pan_id;
- changed |= IEEE802154_LLSEC_PARAM_PAN_ID;
+ params.coord_hwaddr = params.hwaddr;
+ changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR;
- params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr);
- changed |= IEEE802154_LLSEC_PARAM_HWADDR;
+ params.coord_shortaddr = addr->short_addr;
+ changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR;
- params.coord_hwaddr = params.hwaddr;
- changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR;
-
- rc = ops->llsec->set_params(dev, &params, changed);
- }
-
- return rc;
+ return mac802154_set_params(dev, &params, changed);
}
static int mac802154_set_mac_params(struct net_device *dev,
@@ -91,19 +84,19 @@ static int mac802154_set_mac_params(struct net_device *dev,
wpan_dev->frame_retries = params->frame_retries;
wpan_dev->lbt = params->lbt;
- if (local->hw.flags & IEEE802154_HW_TXPOWER) {
+ if (local->hw.phy->flags & WPAN_PHY_FLAG_TXPOWER) {
ret = drv_set_tx_power(local, params->transmit_power);
if (ret < 0)
return ret;
}
- if (local->hw.flags & IEEE802154_HW_CCA_MODE) {
+ if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_MODE) {
ret = drv_set_cca_mode(local, &params->cca);
if (ret < 0)
return ret;
}
- if (local->hw.flags & IEEE802154_HW_CCA_ED_LEVEL) {
+ if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) {
ret = drv_set_cca_ed_level(local, params->cca_ed_level);
if (ret < 0)
return ret;
@@ -151,9 +144,6 @@ static struct ieee802154_llsec_ops mac802154_llsec_ops = {
struct ieee802154_mlme_ops mac802154_mlme_wpan = {
.start_req = mac802154_mlme_start_req,
- .get_pan_id = mac802154_dev_get_pan_id,
- .get_short_addr = mac802154_dev_get_short_addr,
- .get_dsn = mac802154_dev_get_dsn,
.llsec = &mac802154_llsec_ops,
diff --git a/kernel/net/mac802154/main.c b/kernel/net/mac802154/main.c
index 08cb32dc8..e8cab5bb8 100644
--- a/kernel/net/mac802154/main.c
+++ b/kernel/net/mac802154/main.c
@@ -40,7 +40,7 @@ static void ieee802154_tasklet_handler(unsigned long data)
* netstack.
*/
skb->pkt_type = 0;
- ieee802154_rx(&local->hw, skb);
+ ieee802154_rx(local, skb);
break;
default:
WARN(1, "mac802154: Packet is of unknown type %d\n",
@@ -58,11 +58,9 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
struct ieee802154_local *local;
size_t priv_size;
- if (!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed ||
- !ops->start || !ops->stop || !ops->set_channel) {
- pr_err("undefined IEEE802.15.4 device operations\n");
+ if (WARN_ON(!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed ||
+ !ops->start || !ops->stop || !ops->set_channel))
return NULL;
- }
/* Ensure 32-byte alignment of our private data and hw private data.
* We use the wpan_phy priv data for both our ieee802154_local and for
@@ -107,6 +105,20 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
skb_queue_head_init(&local->skb_queue);
+ INIT_WORK(&local->tx_work, ieee802154_xmit_worker);
+
+ /* init supported flags with 802.15.4 default ranges */
+ phy->supported.max_minbe = 8;
+ phy->supported.min_maxbe = 3;
+ phy->supported.max_maxbe = 8;
+ phy->supported.min_frame_retries = 0;
+ phy->supported.max_frame_retries = 7;
+ phy->supported.max_csma_backoffs = 5;
+ phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE;
+
+ /* always supported */
+ phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE);
+
return &local->hw;
}
EXPORT_SYMBOL(ieee802154_alloc_hw);
@@ -155,6 +167,23 @@ int ieee802154_register_hw(struct ieee802154_hw *hw)
ieee802154_setup_wpan_phy_pib(local->phy);
+ if (!(hw->flags & IEEE802154_HW_CSMA_PARAMS)) {
+ local->phy->supported.min_csma_backoffs = 4;
+ local->phy->supported.max_csma_backoffs = 4;
+ local->phy->supported.min_maxbe = 5;
+ local->phy->supported.max_maxbe = 5;
+ local->phy->supported.min_minbe = 3;
+ local->phy->supported.max_minbe = 3;
+ }
+
+ if (!(hw->flags & IEEE802154_HW_FRAME_RETRIES)) {
+ local->phy->supported.min_frame_retries = 3;
+ local->phy->supported.max_frame_retries = 3;
+ }
+
+ if (hw->flags & IEEE802154_HW_PROMISCUOUS)
+ local->phy->supported.iftypes |= BIT(NL802154_IFTYPE_MONITOR);
+
rc = wpan_phy_register(local->phy);
if (rc < 0)
goto out_wq;
diff --git a/kernel/net/mac802154/mib.c b/kernel/net/mac802154/mib.c
index 5cf019a57..73f94fbf8 100644
--- a/kernel/net/mac802154/mib.c
+++ b/kernel/net/mac802154/mib.c
@@ -26,81 +26,22 @@
#include "ieee802154_i.h"
#include "driver-ops.h"
-void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val)
-{
- struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
-
- BUG_ON(dev->type != ARPHRD_IEEE802154);
-
- spin_lock_bh(&sdata->mib_lock);
- sdata->wpan_dev.short_addr = val;
- spin_unlock_bh(&sdata->mib_lock);
-}
-
-__le16 mac802154_dev_get_short_addr(const struct net_device *dev)
-{
- struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
- __le16 ret;
-
- BUG_ON(dev->type != ARPHRD_IEEE802154);
-
- spin_lock_bh(&sdata->mib_lock);
- ret = sdata->wpan_dev.short_addr;
- spin_unlock_bh(&sdata->mib_lock);
-
- return ret;
-}
-
-__le16 mac802154_dev_get_pan_id(const struct net_device *dev)
-{
- struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
- __le16 ret;
-
- BUG_ON(dev->type != ARPHRD_IEEE802154);
-
- spin_lock_bh(&sdata->mib_lock);
- ret = sdata->wpan_dev.pan_id;
- spin_unlock_bh(&sdata->mib_lock);
-
- return ret;
-}
-
-void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val)
-{
- struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
-
- BUG_ON(dev->type != ARPHRD_IEEE802154);
-
- spin_lock_bh(&sdata->mib_lock);
- sdata->wpan_dev.pan_id = val;
- spin_unlock_bh(&sdata->mib_lock);
-}
-
-u8 mac802154_dev_get_dsn(const struct net_device *dev)
-{
- struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
-
- BUG_ON(dev->type != ARPHRD_IEEE802154);
-
- return sdata->wpan_dev.dsn++;
-}
-
void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan)
{
struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
struct ieee802154_local *local = sdata->local;
int res;
+ ASSERT_RTNL();
+
BUG_ON(dev->type != ARPHRD_IEEE802154);
res = drv_set_channel(local, page, chan);
if (res) {
pr_debug("set_channel failed\n");
} else {
- mutex_lock(&local->phy->pib_lock);
local->phy->current_channel = chan;
local->phy->current_page = page;
- mutex_unlock(&local->phy->pib_lock);
}
}
diff --git a/kernel/net/mac802154/rx.c b/kernel/net/mac802154/rx.c
index c0d67b2b4..42e96729d 100644
--- a/kernel/net/mac802154/rx.c
+++ b/kernel/net/mac802154/rx.c
@@ -47,8 +47,6 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
pr_debug("getting packet via slave interface %s\n", sdata->dev->name);
- spin_lock_bh(&sdata->mib_lock);
-
span = wpan_dev->pan_id;
sshort = wpan_dev->short_addr;
@@ -83,15 +81,16 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
skb->pkt_type = PACKET_OTHERHOST;
break;
default:
- spin_unlock_bh(&sdata->mib_lock);
pr_debug("invalid dest mode\n");
goto fail;
}
- spin_unlock_bh(&sdata->mib_lock);
-
skb->dev = sdata->dev;
+ /* TODO this should be moved after the netif_receive_skb call, otherwise
+ * wireshark will show a mac header with security fields although the
+ * payload is already decrypted.
+ */
rc = mac802154_llsec_decrypt(&sdata->sec, skb);
if (rc) {
pr_debug("decryption failed: %i\n", rc);
@@ -207,8 +206,10 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local,
}
list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- if (sdata->vif.type != NL802154_IFTYPE_NODE ||
- !netif_running(sdata->dev))
+ if (sdata->wpan_dev.iftype != NL802154_IFTYPE_NODE)
+ continue;
+
+ if (!ieee802154_sdata_running(sdata))
continue;
ieee802154_subif_frame(sdata, skb, &hdr);
@@ -232,7 +233,7 @@ ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb)
skb->protocol = htons(ETH_P_IEEE802154);
list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- if (sdata->vif.type != NL802154_IFTYPE_MONITOR)
+ if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR)
continue;
if (!ieee802154_sdata_running(sdata))
@@ -249,13 +250,15 @@ ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb)
}
}
-void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb)
+void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb)
{
- struct ieee802154_local *local = hw_to_local(hw);
u16 crc;
WARN_ON_ONCE(softirq_count() == 0);
+ if (local->suspended)
+ goto drop;
+
/* TODO: When a transceiver omits the checksum here, we
* add an own calculated one. This is currently an ugly
* solution because the monitor needs a crc here.
@@ -276,8 +279,7 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb)
crc = crc_ccitt(0, skb->data, skb->len);
if (crc) {
rcu_read_unlock();
- kfree_skb(skb);
- return;
+ goto drop;
}
}
/* remove crc */
@@ -286,8 +288,11 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb)
__ieee802154_rx_handle_packet(local, skb);
rcu_read_unlock();
+
+ return;
+drop:
+ kfree_skb(skb);
}
-EXPORT_SYMBOL(ieee802154_rx);
void
ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi)
diff --git a/kernel/net/mac802154/trace.c b/kernel/net/mac802154/trace.c
new file mode 100644
index 000000000..863e5e6b9
--- /dev/null
+++ b/kernel/net/mac802154/trace.c
@@ -0,0 +1,9 @@
+#include <linux/module.h>
+
+#ifndef __CHECKER__
+#include <net/cfg802154.h>
+#include "driver-ops.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#endif
diff --git a/kernel/net/mac802154/trace.h b/kernel/net/mac802154/trace.h
new file mode 100644
index 000000000..6f30e0c93
--- /dev/null
+++ b/kernel/net/mac802154/trace.h
@@ -0,0 +1,272 @@
+/* Based on net/mac80211/trace.h */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mac802154
+
+#if !defined(__MAC802154_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
+#define __MAC802154_DRIVER_TRACE
+
+#include <linux/tracepoint.h>
+
+#include <net/mac802154.h>
+#include "ieee802154_i.h"
+
+#define MAXNAME 32
+#define LOCAL_ENTRY __array(char, wpan_phy_name, MAXNAME)
+#define LOCAL_ASSIGN strlcpy(__entry->wpan_phy_name, \
+ wpan_phy_name(local->hw.phy), MAXNAME)
+#define LOCAL_PR_FMT "%s"
+#define LOCAL_PR_ARG __entry->wpan_phy_name
+
+#define CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \
+ __field(enum nl802154_cca_opts, cca_opt)
+#define CCA_ASSIGN \
+ do { \
+ (__entry->cca_mode) = cca->mode; \
+ (__entry->cca_opt) = cca->opt; \
+ } while (0)
+#define CCA_PR_FMT "cca_mode: %d, cca_opt: %d"
+#define CCA_PR_ARG __entry->cca_mode, __entry->cca_opt
+
+#define BOOL_TO_STR(bo) (bo) ? "true" : "false"
+
+/* Tracing for driver callbacks */
+
+DECLARE_EVENT_CLASS(local_only_evt,
+ TP_PROTO(struct ieee802154_local *local),
+ TP_ARGS(local),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ ),
+ TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG)
+);
+
+DEFINE_EVENT(local_only_evt, 802154_drv_return_void,
+ TP_PROTO(struct ieee802154_local *local),
+ TP_ARGS(local)
+);
+
+TRACE_EVENT(802154_drv_return_int,
+ TP_PROTO(struct ieee802154_local *local, int ret),
+ TP_ARGS(local, ret),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(LOCAL_PR_FMT ", returned: %d", LOCAL_PR_ARG,
+ __entry->ret)
+);
+
+DEFINE_EVENT(local_only_evt, 802154_drv_start,
+ TP_PROTO(struct ieee802154_local *local),
+ TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, 802154_drv_stop,
+ TP_PROTO(struct ieee802154_local *local),
+ TP_ARGS(local)
+);
+
+TRACE_EVENT(802154_drv_set_channel,
+ TP_PROTO(struct ieee802154_local *local, u8 page, u8 channel),
+ TP_ARGS(local, page, channel),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u8, page)
+ __field(u8, channel)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->page = page;
+ __entry->channel = channel;
+ ),
+ TP_printk(LOCAL_PR_FMT ", page: %d, channel: %d", LOCAL_PR_ARG,
+ __entry->page, __entry->channel)
+);
+
+TRACE_EVENT(802154_drv_set_cca_mode,
+ TP_PROTO(struct ieee802154_local *local,
+ const struct wpan_phy_cca *cca),
+ TP_ARGS(local, cca),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ CCA_ENTRY
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ CCA_ASSIGN;
+ ),
+ TP_printk(LOCAL_PR_FMT ", " CCA_PR_FMT, LOCAL_PR_ARG,
+ CCA_PR_ARG)
+);
+
+TRACE_EVENT(802154_drv_set_cca_ed_level,
+ TP_PROTO(struct ieee802154_local *local, s32 mbm),
+ TP_ARGS(local, mbm),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(s32, mbm)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->mbm = mbm;
+ ),
+ TP_printk(LOCAL_PR_FMT ", ed level: %d", LOCAL_PR_ARG,
+ __entry->mbm)
+);
+
+TRACE_EVENT(802154_drv_set_tx_power,
+ TP_PROTO(struct ieee802154_local *local, s32 power),
+ TP_ARGS(local, power),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(s32, power)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->power = power;
+ ),
+ TP_printk(LOCAL_PR_FMT ", mbm: %d", LOCAL_PR_ARG,
+ __entry->power)
+);
+
+TRACE_EVENT(802154_drv_set_lbt_mode,
+ TP_PROTO(struct ieee802154_local *local, bool mode),
+ TP_ARGS(local, mode),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(bool, mode)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->mode = mode;
+ ),
+ TP_printk(LOCAL_PR_FMT ", lbt mode: %s", LOCAL_PR_ARG,
+ BOOL_TO_STR(__entry->mode))
+);
+
+TRACE_EVENT(802154_drv_set_short_addr,
+ TP_PROTO(struct ieee802154_local *local, __le16 short_addr),
+ TP_ARGS(local, short_addr),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(__le16, short_addr)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->short_addr = short_addr;
+ ),
+ TP_printk(LOCAL_PR_FMT ", short addr: 0x%04x", LOCAL_PR_ARG,
+ le16_to_cpu(__entry->short_addr))
+);
+
+TRACE_EVENT(802154_drv_set_pan_id,
+ TP_PROTO(struct ieee802154_local *local, __le16 pan_id),
+ TP_ARGS(local, pan_id),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(__le16, pan_id)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->pan_id = pan_id;
+ ),
+ TP_printk(LOCAL_PR_FMT ", pan id: 0x%04x", LOCAL_PR_ARG,
+ le16_to_cpu(__entry->pan_id))
+);
+
+TRACE_EVENT(802154_drv_set_extended_addr,
+ TP_PROTO(struct ieee802154_local *local, __le64 extended_addr),
+ TP_ARGS(local, extended_addr),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(__le64, extended_addr)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->extended_addr = extended_addr;
+ ),
+ TP_printk(LOCAL_PR_FMT ", extended addr: 0x%llx", LOCAL_PR_ARG,
+ le64_to_cpu(__entry->extended_addr))
+);
+
+TRACE_EVENT(802154_drv_set_pan_coord,
+ TP_PROTO(struct ieee802154_local *local, bool is_coord),
+ TP_ARGS(local, is_coord),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(bool, is_coord)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->is_coord = is_coord;
+ ),
+ TP_printk(LOCAL_PR_FMT ", is_coord: %s", LOCAL_PR_ARG,
+ BOOL_TO_STR(__entry->is_coord))
+);
+
+TRACE_EVENT(802154_drv_set_csma_params,
+ TP_PROTO(struct ieee802154_local *local, u8 min_be, u8 max_be,
+ u8 max_csma_backoffs),
+ TP_ARGS(local, min_be, max_be, max_csma_backoffs),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u8, min_be)
+ __field(u8, max_be)
+ __field(u8, max_csma_backoffs)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN,
+ __entry->min_be = min_be;
+ __entry->max_be = max_be;
+ __entry->max_csma_backoffs = max_csma_backoffs;
+ ),
+ TP_printk(LOCAL_PR_FMT ", min be: %d, max be: %d, max csma backoffs: %d",
+ LOCAL_PR_ARG, __entry->min_be, __entry->max_be,
+ __entry->max_csma_backoffs)
+);
+
+TRACE_EVENT(802154_drv_set_max_frame_retries,
+ TP_PROTO(struct ieee802154_local *local, s8 max_frame_retries),
+ TP_ARGS(local, max_frame_retries),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(s8, max_frame_retries)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->max_frame_retries = max_frame_retries;
+ ),
+ TP_printk(LOCAL_PR_FMT ", max frame retries: %d", LOCAL_PR_ARG,
+ __entry->max_frame_retries)
+);
+
+TRACE_EVENT(802154_drv_set_promiscuous_mode,
+ TP_PROTO(struct ieee802154_local *local, bool on),
+ TP_ARGS(local, on),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(bool, on)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->on = on;
+ ),
+ TP_printk(LOCAL_PR_FMT ", promiscuous mode: %s", LOCAL_PR_ARG,
+ BOOL_TO_STR(__entry->on))
+);
+
+#endif /* !__MAC802154_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/kernel/net/mac802154/tx.c b/kernel/net/mac802154/tx.c
index c62e95695..3827f359b 100644
--- a/kernel/net/mac802154/tx.c
+++ b/kernel/net/mac802154/tx.c
@@ -30,23 +30,11 @@
#include "ieee802154_i.h"
#include "driver-ops.h"
-/* IEEE 802.15.4 transceivers can sleep during the xmit session, so process
- * packets through the workqueue.
- */
-struct ieee802154_xmit_cb {
- struct sk_buff *skb;
- struct work_struct work;
- struct ieee802154_local *local;
-};
-
-static struct ieee802154_xmit_cb ieee802154_xmit_cb;
-
-static void ieee802154_xmit_worker(struct work_struct *work)
+void ieee802154_xmit_worker(struct work_struct *work)
{
- struct ieee802154_xmit_cb *cb =
- container_of(work, struct ieee802154_xmit_cb, work);
- struct ieee802154_local *local = cb->local;
- struct sk_buff *skb = cb->skb;
+ struct ieee802154_local *local =
+ container_of(work, struct ieee802154_local, tx_work);
+ struct sk_buff *skb = local->tx_skb;
struct net_device *dev = skb->dev;
int res;
@@ -89,9 +77,6 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
put_unaligned_le16(crc, skb_put(skb, 2));
}
- if (skb_cow_head(skb, local->hw.extra_tx_headroom))
- goto err_tx;
-
/* Stop the netif queue on each sub_if_data object. */
ieee802154_stop_queue(&local->hw);
@@ -106,11 +91,8 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
dev->stats.tx_packets++;
dev->stats.tx_bytes += skb->len;
} else {
- INIT_WORK(&ieee802154_xmit_cb.work, ieee802154_xmit_worker);
- ieee802154_xmit_cb.skb = skb;
- ieee802154_xmit_cb.local = local;
-
- queue_work(local->workqueue, &ieee802154_xmit_cb.work);
+ local->tx_skb = skb;
+ queue_work(local->workqueue, &local->tx_work);
}
return NETDEV_TX_OK;
@@ -136,6 +118,10 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev)
struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
int rc;
+ /* TODO we should move this to the wpan_dev_hard_header and
+ * dev_hard_header functions. The reason is that wireshark will show a
+ * mac header with security fields while the payload is not yet encrypted.
+ */
rc = mac802154_llsec_encrypt(&sdata->sec, skb);
if (rc) {
netdev_warn(dev, "encryption failed: %i\n", rc);
diff --git a/kernel/net/mac802154/util.c b/kernel/net/mac802154/util.c
index 150bf807e..f9fd0957a 100644
--- a/kernel/net/mac802154/util.c
+++ b/kernel/net/mac802154/util.c
@@ -14,6 +14,7 @@
*/
#include "ieee802154_i.h"
+#include "driver-ops.h"
/* privid for wpan_phys to determine whether they belong to us or not */
const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid;
@@ -85,11 +86,17 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
hrtimer_start(&local->ifs_timer,
ktime_set(0, hw->phy->sifs_period * NSEC_PER_USEC),
HRTIMER_MODE_REL);
-
- consume_skb(skb);
} else {
ieee802154_wake_queue(hw);
- consume_skb(skb);
}
+
+ dev_consume_skb_any(skb);
}
EXPORT_SYMBOL(ieee802154_xmit_complete);
+
+void ieee802154_stop_device(struct ieee802154_local *local)
+{
+ flush_workqueue(local->workqueue);
+ hrtimer_cancel(&local->ifs_timer);
+ drv_stop(local);
+}
diff --git a/kernel/net/mpls/Kconfig b/kernel/net/mpls/Kconfig
index 17bde799c..5c467ef97 100644
--- a/kernel/net/mpls/Kconfig
+++ b/kernel/net/mpls/Kconfig
@@ -24,7 +24,13 @@ config NET_MPLS_GSO
config MPLS_ROUTING
tristate "MPLS: routing support"
- help
+ ---help---
Add support for forwarding of mpls packets.
+config MPLS_IPTUNNEL
+ tristate "MPLS: IP over MPLS tunnel support"
+ depends on LWTUNNEL && MPLS_ROUTING
+ ---help---
+ MPLS IP tunnel support.
+
endif # MPLS
diff --git a/kernel/net/mpls/Makefile b/kernel/net/mpls/Makefile
index 65bbe68c7..9ca923625 100644
--- a/kernel/net/mpls/Makefile
+++ b/kernel/net/mpls/Makefile
@@ -3,5 +3,6 @@
#
obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o
+obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o
mpls_router-y := af_mpls.o
diff --git a/kernel/net/mpls/af_mpls.c b/kernel/net/mpls/af_mpls.c
index 1f93a5978..c32fc411a 100644
--- a/kernel/net/mpls/af_mpls.c
+++ b/kernel/net/mpls/af_mpls.c
@@ -15,24 +15,19 @@
#include <net/ip_fib.h>
#include <net/netevent.h>
#include <net/netns/generic.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#endif
+#include <net/nexthop.h>
#include "internal.h"
-#define LABEL_NOT_SPECIFIED (1<<20)
-#define MAX_NEW_LABELS 2
-
-/* This maximum ha length copied from the definition of struct neighbour */
-#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
-
-struct mpls_route { /* next hop label forwarding entry */
- struct net_device __rcu *rt_dev;
- struct rcu_head rt_rcu;
- u32 rt_label[MAX_NEW_LABELS];
- u8 rt_protocol; /* routing protocol that set this entry */
- u8 rt_labels;
- u8 rt_via_alen;
- u8 rt_via_table;
- u8 rt_via[0];
-};
+/* Maximum number of labels to look ahead at when selecting a path of
+ * a multipath route
+ */
+#define MAX_MP_SELECT_LABELS 4
+
+#define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1)
static int zero = 0;
static int label_limit = (1 << 20) - 1;
@@ -58,24 +53,40 @@ static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
return rcu_dereference_rtnl(dev->mpls_ptr);
}
-static bool mpls_output_possible(const struct net_device *dev)
+bool mpls_output_possible(const struct net_device *dev)
{
return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
}
+EXPORT_SYMBOL_GPL(mpls_output_possible);
+
+static u8 *__mpls_nh_via(struct mpls_route *rt, struct mpls_nh *nh)
+{
+ u8 *nh0_via = PTR_ALIGN((u8 *)&rt->rt_nh[rt->rt_nhn], VIA_ALEN_ALIGN);
+ int nh_index = nh - rt->rt_nh;
+
+ return nh0_via + rt->rt_max_alen * nh_index;
+}
+
+static const u8 *mpls_nh_via(const struct mpls_route *rt,
+ const struct mpls_nh *nh)
+{
+ return __mpls_nh_via((struct mpls_route *)rt, (struct mpls_nh *)nh);
+}
-static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
+static unsigned int mpls_nh_header_size(const struct mpls_nh *nh)
{
/* The size of the layer 2.5 labels to be added for this route */
- return rt->rt_labels * sizeof(struct mpls_shim_hdr);
+ return nh->nh_labels * sizeof(struct mpls_shim_hdr);
}
-static unsigned int mpls_dev_mtu(const struct net_device *dev)
+unsigned int mpls_dev_mtu(const struct net_device *dev)
{
/* The amount of data the layer 2 frame can hold */
return dev->mtu;
}
+EXPORT_SYMBOL_GPL(mpls_dev_mtu);
-static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
return false;
@@ -85,20 +96,87 @@ static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
return true;
}
+EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
+
+static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
+ struct sk_buff *skb, bool bos)
+{
+ struct mpls_entry_decoded dec;
+ struct mpls_shim_hdr *hdr;
+ bool eli_seen = false;
+ int label_index;
+ int nh_index = 0;
+ u32 hash = 0;
+
+ /* No need to look further into packet if there's only
+ * one path
+ */
+ if (rt->rt_nhn == 1)
+ goto out;
+
+ for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
+ label_index++) {
+ if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
+ break;
+
+ /* Read and decode the current label */
+ hdr = mpls_hdr(skb) + label_index;
+ dec = mpls_entry_decode(hdr);
+
+ /* RFC6790 - reserved labels MUST NOT be used as keys
+ * for the load-balancing function
+ */
+ if (likely(dec.label >= MPLS_LABEL_FIRST_UNRESERVED)) {
+ hash = jhash_1word(dec.label, hash);
+
+ /* The entropy label follows the entropy label
+ * indicator, so this means that the entropy
+ * label was just added to the hash - no need to
+ * go any deeper either in the label stack or in the
+ * payload
+ */
+ if (eli_seen)
+ break;
+ } else if (dec.label == MPLS_LABEL_ENTROPY) {
+ eli_seen = true;
+ }
+
+ bos = dec.bos;
+ if (bos && pskb_may_pull(skb, sizeof(*hdr) * label_index +
+ sizeof(struct iphdr))) {
+ const struct iphdr *v4hdr;
+
+ v4hdr = (const struct iphdr *)(mpls_hdr(skb) +
+ label_index);
+ if (v4hdr->version == 4) {
+ hash = jhash_3words(ntohl(v4hdr->saddr),
+ ntohl(v4hdr->daddr),
+ v4hdr->protocol, hash);
+ } else if (v4hdr->version == 6 &&
+ pskb_may_pull(skb, sizeof(*hdr) * label_index +
+ sizeof(struct ipv6hdr))) {
+ const struct ipv6hdr *v6hdr;
+
+ v6hdr = (const struct ipv6hdr *)(mpls_hdr(skb) +
+ label_index);
+
+ hash = __ipv6_addr_jhash(&v6hdr->saddr, hash);
+ hash = __ipv6_addr_jhash(&v6hdr->daddr, hash);
+ hash = jhash_1word(v6hdr->nexthdr, hash);
+ }
+ }
+ }
+
+ nh_index = hash % rt->rt_nhn;
+out:
+ return &rt->rt_nh[nh_index];
+}
static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
struct mpls_entry_decoded dec)
{
- /* RFC4385 and RFC5586 encode other packets in mpls such that
- * they don't conflict with the ip version number, making
- * decoding by examining the ip version correct in everything
- * except for the strangest cases.
- *
- * The strange cases if we choose to support them will require
- * manual configuration.
- */
- struct iphdr *hdr4;
- bool success = true;
+ enum mpls_payload_type payload_type;
+ bool success = false;
/* The IPv4 code below accesses through the IPv4 header
* checksum, which is 12 bytes into the packet.
@@ -113,23 +191,32 @@ static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
if (!pskb_may_pull(skb, 12))
return false;
- /* Use ip_hdr to find the ip protocol version */
- hdr4 = ip_hdr(skb);
- if (hdr4->version == 4) {
+ payload_type = rt->rt_payload_type;
+ if (payload_type == MPT_UNSPEC)
+ payload_type = ip_hdr(skb)->version;
+
+ switch (payload_type) {
+ case MPT_IPV4: {
+ struct iphdr *hdr4 = ip_hdr(skb);
skb->protocol = htons(ETH_P_IP);
csum_replace2(&hdr4->check,
htons(hdr4->ttl << 8),
htons(dec.ttl << 8));
hdr4->ttl = dec.ttl;
+ success = true;
+ break;
}
- else if (hdr4->version == 6) {
+ case MPT_IPV6: {
struct ipv6hdr *hdr6 = ipv6_hdr(skb);
skb->protocol = htons(ETH_P_IPV6);
hdr6->hop_limit = dec.ttl;
+ success = true;
+ break;
}
- else
- /* version 0 and version 1 are used by pseudo wires */
- success = false;
+ case MPT_UNSPEC:
+ break;
+ }
+
return success;
}
@@ -139,6 +226,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
struct net *net = dev_net(dev);
struct mpls_shim_hdr *hdr;
struct mpls_route *rt;
+ struct mpls_nh *nh;
struct mpls_entry_decoded dec;
struct net_device *out_dev;
struct mpls_dev *mdev;
@@ -176,8 +264,12 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
if (!rt)
goto drop;
+ nh = mpls_select_multipath(rt, skb, dec.bos);
+ if (!nh)
+ goto drop;
+
/* Find the output device */
- out_dev = rcu_dereference(rt->rt_dev);
+ out_dev = rcu_dereference(nh->nh_dev);
if (!mpls_output_possible(out_dev))
goto drop;
@@ -192,7 +284,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
dec.ttl -= 1;
/* Verify the destination can hold the packet */
- new_header_size = mpls_rt_header_size(rt);
+ new_header_size = mpls_nh_header_size(nh);
mtu = mpls_dev_mtu(out_dev);
if (mpls_pkt_too_big(skb, mtu - new_header_size))
goto drop;
@@ -220,13 +312,20 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
/* Push the new labels */
hdr = mpls_hdr(skb);
bos = dec.bos;
- for (i = rt->rt_labels - 1; i >= 0; i--) {
- hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos);
+ for (i = nh->nh_labels - 1; i >= 0; i--) {
+ hdr[i] = mpls_entry_encode(nh->nh_label[i],
+ dec.ttl, 0, bos);
bos = false;
}
}
- err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb);
+ /* If via wasn't specified then send out using device address */
+ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
+ err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
+ out_dev->dev_addr, skb);
+ else
+ err = neigh_xmit(nh->nh_via_table, out_dev,
+ mpls_nh_via(rt, nh), skb);
if (err)
net_dbg_ratelimited("%s: packet transmission failed: %d\n",
__func__, err);
@@ -248,25 +347,35 @@ static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = {
};
struct mpls_route_config {
- u32 rc_protocol;
- u32 rc_ifindex;
- u16 rc_via_table;
- u16 rc_via_alen;
- u8 rc_via[MAX_VIA_ALEN];
- u32 rc_label;
- u32 rc_output_labels;
- u32 rc_output_label[MAX_NEW_LABELS];
- u32 rc_nlflags;
- struct nl_info rc_nlinfo;
+ u32 rc_protocol;
+ u32 rc_ifindex;
+ u8 rc_via_table;
+ u8 rc_via_alen;
+ u8 rc_via[MAX_VIA_ALEN];
+ u32 rc_label;
+ u8 rc_output_labels;
+ u32 rc_output_label[MAX_NEW_LABELS];
+ u32 rc_nlflags;
+ enum mpls_payload_type rc_payload_type;
+ struct nl_info rc_nlinfo;
+ struct rtnexthop *rc_mp;
+ int rc_mp_len;
};
-static struct mpls_route *mpls_rt_alloc(size_t alen)
+static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
{
+ u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN);
struct mpls_route *rt;
- rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL);
- if (rt)
- rt->rt_via_alen = alen;
+ rt = kzalloc(ALIGN(sizeof(*rt) + num_nh * sizeof(*rt->rt_nh),
+ VIA_ALEN_ALIGN) +
+ num_nh * max_alen_aligned,
+ GFP_KERNEL);
+ if (rt) {
+ rt->rt_nhn = num_nh;
+ rt->rt_max_alen = max_alen_aligned;
+ }
+
return rt;
}
@@ -286,30 +395,27 @@ static void mpls_notify_route(struct net *net, unsigned index,
struct mpls_route *rt = new ? new : old;
unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0;
/* Ignore reserved labels for now */
- if (rt && (index >= 16))
+ if (rt && (index >= MPLS_LABEL_FIRST_UNRESERVED))
rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags);
}
static void mpls_route_update(struct net *net, unsigned index,
- struct net_device *dev, struct mpls_route *new,
+ struct mpls_route *new,
const struct nl_info *info)
{
struct mpls_route __rcu **platform_label;
- struct mpls_route *rt, *old = NULL;
+ struct mpls_route *rt;
ASSERT_RTNL();
platform_label = rtnl_dereference(net->mpls.platform_label);
rt = rtnl_dereference(platform_label[index]);
- if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) {
- rcu_assign_pointer(platform_label[index], new);
- old = rt;
- }
+ rcu_assign_pointer(platform_label[index], new);
- mpls_notify_route(net, index, old, new, info);
+ mpls_notify_route(net, index, rt, new, info);
/* If we removed a route free it now */
- mpls_rt_free(old);
+ mpls_rt_free(rt);
}
static unsigned find_free_label(struct net *net)
@@ -320,22 +426,300 @@ static unsigned find_free_label(struct net *net)
platform_label = rtnl_dereference(net->mpls.platform_label);
platform_labels = net->mpls.platform_labels;
- for (index = 16; index < platform_labels; index++) {
+ for (index = MPLS_LABEL_FIRST_UNRESERVED; index < platform_labels;
+ index++) {
if (!rtnl_dereference(platform_label[index]))
return index;
}
return LABEL_NOT_SPECIFIED;
}
+#if IS_ENABLED(CONFIG_INET)
+static struct net_device *inet_fib_lookup_dev(struct net *net,
+ const void *addr)
+{
+ struct net_device *dev;
+ struct rtable *rt;
+ struct in_addr daddr;
+
+ memcpy(&daddr, addr, sizeof(struct in_addr));
+ rt = ip_route_output(net, daddr.s_addr, 0, 0, 0);
+ if (IS_ERR(rt))
+ return ERR_CAST(rt);
+
+ dev = rt->dst.dev;
+ dev_hold(dev);
+
+ ip_rt_put(rt);
+
+ return dev;
+}
+#else
+static struct net_device *inet_fib_lookup_dev(struct net *net,
+ const void *addr)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct net_device *inet6_fib_lookup_dev(struct net *net,
+ const void *addr)
+{
+ struct net_device *dev;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ int err;
+
+ if (!ipv6_stub)
+ return ERR_PTR(-EAFNOSUPPORT);
+
+ memset(&fl6, 0, sizeof(fl6));
+ memcpy(&fl6.daddr, addr, sizeof(struct in6_addr));
+ err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6);
+ if (err)
+ return ERR_PTR(err);
+
+ dev = dst->dev;
+ dev_hold(dev);
+ dst_release(dst);
+
+ return dev;
+}
+#else
+static struct net_device *inet6_fib_lookup_dev(struct net *net,
+ const void *addr)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+#endif
+
+static struct net_device *find_outdev(struct net *net,
+ struct mpls_route *rt,
+ struct mpls_nh *nh, int oif)
+{
+ struct net_device *dev = NULL;
+
+ if (!oif) {
+ switch (nh->nh_via_table) {
+ case NEIGH_ARP_TABLE:
+ dev = inet_fib_lookup_dev(net, mpls_nh_via(rt, nh));
+ break;
+ case NEIGH_ND_TABLE:
+ dev = inet6_fib_lookup_dev(net, mpls_nh_via(rt, nh));
+ break;
+ case NEIGH_LINK_TABLE:
+ break;
+ }
+ } else {
+ dev = dev_get_by_index(net, oif);
+ }
+
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ /* The caller is holding rtnl anyway, so release the dev reference */
+ dev_put(dev);
+
+ return dev;
+}
+
+static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt,
+ struct mpls_nh *nh, int oif)
+{
+ struct net_device *dev = NULL;
+ int err = -ENODEV;
+
+ dev = find_outdev(net, rt, nh, oif);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ dev = NULL;
+ goto errout;
+ }
+
+ /* Ensure this is a supported device */
+ err = -EINVAL;
+ if (!mpls_dev_get(dev))
+ goto errout;
+
+ if ((nh->nh_via_table == NEIGH_LINK_TABLE) &&
+ (dev->addr_len != nh->nh_via_alen))
+ goto errout;
+
+ RCU_INIT_POINTER(nh->nh_dev, dev);
+
+ return 0;
+
+errout:
+ return err;
+}
+
+static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg,
+ struct mpls_route *rt)
+{
+ struct net *net = cfg->rc_nlinfo.nl_net;
+ struct mpls_nh *nh = rt->rt_nh;
+ int err;
+ int i;
+
+ if (!nh)
+ return -ENOMEM;
+
+ err = -EINVAL;
+ /* Ensure only a supported number of labels are present */
+ if (cfg->rc_output_labels > MAX_NEW_LABELS)
+ goto errout;
+
+ nh->nh_labels = cfg->rc_output_labels;
+ for (i = 0; i < nh->nh_labels; i++)
+ nh->nh_label[i] = cfg->rc_output_label[i];
+
+ nh->nh_via_table = cfg->rc_via_table;
+ memcpy(__mpls_nh_via(rt, nh), cfg->rc_via, cfg->rc_via_alen);
+ nh->nh_via_alen = cfg->rc_via_alen;
+
+ err = mpls_nh_assign_dev(net, rt, nh, cfg->rc_ifindex);
+ if (err)
+ goto errout;
+
+ return 0;
+
+errout:
+ return err;
+}
+
+static int mpls_nh_build(struct net *net, struct mpls_route *rt,
+ struct mpls_nh *nh, int oif,
+ struct nlattr *via, struct nlattr *newdst)
+{
+ int err = -ENOMEM;
+
+ if (!nh)
+ goto errout;
+
+ if (newdst) {
+ err = nla_get_labels(newdst, MAX_NEW_LABELS,
+ &nh->nh_labels, nh->nh_label);
+ if (err)
+ goto errout;
+ }
+
+ if (via) {
+ err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table,
+ __mpls_nh_via(rt, nh));
+ if (err)
+ goto errout;
+ } else {
+ nh->nh_via_table = MPLS_NEIGH_TABLE_UNSPEC;
+ }
+
+ err = mpls_nh_assign_dev(net, rt, nh, oif);
+ if (err)
+ goto errout;
+
+ return 0;
+
+errout:
+ return err;
+}
+
+static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
+ u8 cfg_via_alen, u8 *max_via_alen)
+{
+ int nhs = 0;
+ int remaining = len;
+
+ if (!rtnh) {
+ *max_via_alen = cfg_via_alen;
+ return 1;
+ }
+
+ *max_via_alen = 0;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+ int attrlen;
+
+ attrlen = rtnh_attrlen(rtnh);
+ nla = nla_find(attrs, attrlen, RTA_VIA);
+ if (nla && nla_len(nla) >=
+ offsetof(struct rtvia, rtvia_addr)) {
+ int via_alen = nla_len(nla) -
+ offsetof(struct rtvia, rtvia_addr);
+
+ if (via_alen <= MAX_VIA_ALEN)
+ *max_via_alen = max_t(u16, *max_via_alen,
+ via_alen);
+ }
+
+ nhs++;
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ /* leftover implies invalid nexthop configuration, discard it */
+ return remaining > 0 ? 0 : nhs;
+}
+
+static int mpls_nh_build_multi(struct mpls_route_config *cfg,
+ struct mpls_route *rt)
+{
+ struct rtnexthop *rtnh = cfg->rc_mp;
+ struct nlattr *nla_via, *nla_newdst;
+ int remaining = cfg->rc_mp_len;
+ int nhs = 0;
+ int err = 0;
+
+ change_nexthops(rt) {
+ int attrlen;
+
+ nla_via = NULL;
+ nla_newdst = NULL;
+
+ err = -EINVAL;
+ if (!rtnh_ok(rtnh, remaining))
+ goto errout;
+
+ /* neither weighted multipath nor any flags
+ * are supported
+ */
+ if (rtnh->rtnh_hops || rtnh->rtnh_flags)
+ goto errout;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *attrs = rtnh_attrs(rtnh);
+
+ nla_via = nla_find(attrs, attrlen, RTA_VIA);
+ nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST);
+ }
+
+ err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
+ rtnh->rtnh_ifindex, nla_via,
+ nla_newdst);
+ if (err)
+ goto errout;
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ nhs++;
+ } endfor_nexthops(rt);
+
+ rt->rt_nhn = nhs;
+
+ return 0;
+
+errout:
+ return err;
+}
+
static int mpls_route_add(struct mpls_route_config *cfg)
{
struct mpls_route __rcu **platform_label;
struct net *net = cfg->rc_nlinfo.nl_net;
- struct net_device *dev = NULL;
struct mpls_route *rt, *old;
- unsigned index;
- int i;
int err = -EINVAL;
+ u8 max_via_alen;
+ unsigned index;
+ int nhs;
index = cfg->rc_label;
@@ -345,33 +729,14 @@ static int mpls_route_add(struct mpls_route_config *cfg)
index = find_free_label(net);
}
- /* The first 16 labels are reserved, and may not be set */
- if (index < 16)
+ /* Reserved labels may not be set */
+ if (index < MPLS_LABEL_FIRST_UNRESERVED)
goto errout;
/* The full 20 bit range may not be supported. */
if (index >= net->mpls.platform_labels)
goto errout;
- /* Ensure only a supported number of labels are present */
- if (cfg->rc_output_labels > MAX_NEW_LABELS)
- goto errout;
-
- err = -ENODEV;
- dev = dev_get_by_index(net, cfg->rc_ifindex);
- if (!dev)
- goto errout;
-
- /* Ensure this is a supported device */
- err = -EINVAL;
- if (!mpls_dev_get(dev))
- goto errout;
-
- err = -EINVAL;
- if ((cfg->rc_via_table == NEIGH_LINK_TABLE) &&
- (dev->addr_len != cfg->rc_via_alen))
- goto errout;
-
/* Append makes no sense with mpls */
err = -EOPNOTSUPP;
if (cfg->rc_nlflags & NLM_F_APPEND)
@@ -391,27 +756,34 @@ static int mpls_route_add(struct mpls_route_config *cfg)
if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old)
goto errout;
+ err = -EINVAL;
+ nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len,
+ cfg->rc_via_alen, &max_via_alen);
+ if (nhs == 0)
+ goto errout;
+
err = -ENOMEM;
- rt = mpls_rt_alloc(cfg->rc_via_alen);
+ rt = mpls_rt_alloc(nhs, max_via_alen);
if (!rt)
goto errout;
- rt->rt_labels = cfg->rc_output_labels;
- for (i = 0; i < rt->rt_labels; i++)
- rt->rt_label[i] = cfg->rc_output_label[i];
rt->rt_protocol = cfg->rc_protocol;
- RCU_INIT_POINTER(rt->rt_dev, dev);
- rt->rt_via_table = cfg->rc_via_table;
- memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen);
+ rt->rt_payload_type = cfg->rc_payload_type;
- mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo);
+ if (cfg->rc_mp)
+ err = mpls_nh_build_multi(cfg, rt);
+ else
+ err = mpls_nh_build_from_cfg(cfg, rt);
+ if (err)
+ goto freert;
+
+ mpls_route_update(net, index, rt, &cfg->rc_nlinfo);
- dev_put(dev);
return 0;
+freert:
+ mpls_rt_free(rt);
errout:
- if (dev)
- dev_put(dev);
return err;
}
@@ -423,15 +795,15 @@ static int mpls_route_del(struct mpls_route_config *cfg)
index = cfg->rc_label;
- /* The first 16 labels are reserved, and may not be removed */
- if (index < 16)
+ /* Reserved labels may not be removed */
+ if (index < MPLS_LABEL_FIRST_UNRESERVED)
goto errout;
/* The full 20 bit range may not be supported */
if (index >= net->mpls.platform_labels)
goto errout;
- mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo);
+ mpls_route_update(net, index, NULL, &cfg->rc_nlinfo);
err = 0;
errout:
@@ -528,9 +900,11 @@ static void mpls_ifdown(struct net_device *dev)
struct mpls_route *rt = rtnl_dereference(platform_label[index]);
if (!rt)
continue;
- if (rtnl_dereference(rt->rt_dev) != dev)
- continue;
- rt->rt_dev = NULL;
+ for_nexthops(rt) {
+ if (rtnl_dereference(nh->nh_dev) != dev)
+ continue;
+ nh->nh_dev = NULL;
+ } endfor_nexthops(rt);
}
mdev = mpls_dev_get(dev);
@@ -626,9 +1000,10 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
return 0;
}
+EXPORT_SYMBOL_GPL(nla_put_labels);
int nla_get_labels(const struct nlattr *nla,
- u32 max_labels, u32 *labels, u32 label[])
+ u32 max_labels, u8 *labels, u32 label[])
{
unsigned len = nla_len(nla);
unsigned nla_labels;
@@ -671,6 +1046,49 @@ int nla_get_labels(const struct nlattr *nla,
*labels = nla_labels;
return 0;
}
+EXPORT_SYMBOL_GPL(nla_get_labels);
+
+int nla_get_via(const struct nlattr *nla, u8 *via_alen,
+ u8 *via_table, u8 via_addr[])
+{
+ struct rtvia *via = nla_data(nla);
+ int err = -EINVAL;
+ int alen;
+
+ if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
+ goto errout;
+ alen = nla_len(nla) -
+ offsetof(struct rtvia, rtvia_addr);
+ if (alen > MAX_VIA_ALEN)
+ goto errout;
+
+ /* Validate the address family */
+ switch (via->rtvia_family) {
+ case AF_PACKET:
+ *via_table = NEIGH_LINK_TABLE;
+ break;
+ case AF_INET:
+ *via_table = NEIGH_ARP_TABLE;
+ if (alen != 4)
+ goto errout;
+ break;
+ case AF_INET6:
+ *via_table = NEIGH_ND_TABLE;
+ if (alen != 16)
+ goto errout;
+ break;
+ default:
+ /* Unsupported address family */
+ goto errout;
+ }
+
+ memcpy(via_addr, via->rtvia_addr, alen);
+ *via_alen = alen;
+ err = 0;
+
+errout:
+ return err;
+}
static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
struct mpls_route_config *cfg)
@@ -713,6 +1131,7 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->rc_label = LABEL_NOT_SPECIFIED;
cfg->rc_protocol = rtm->rtm_protocol;
+ cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC;
cfg->rc_nlflags = nlh->nlmsg_flags;
cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid;
cfg->rc_nlinfo.nlh = nlh;
@@ -735,48 +1154,28 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
break;
case RTA_DST:
{
- u32 label_count;
+ u8 label_count;
if (nla_get_labels(nla, 1, &label_count,
&cfg->rc_label))
goto errout;
- /* The first 16 labels are reserved, and may not be set */
- if (cfg->rc_label < 16)
+ /* Reserved labels may not be set */
+ if (cfg->rc_label < MPLS_LABEL_FIRST_UNRESERVED)
goto errout;
break;
}
case RTA_VIA:
{
- struct rtvia *via = nla_data(nla);
- if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
- goto errout;
- cfg->rc_via_alen = nla_len(nla) -
- offsetof(struct rtvia, rtvia_addr);
- if (cfg->rc_via_alen > MAX_VIA_ALEN)
- goto errout;
-
- /* Validate the address family */
- switch(via->rtvia_family) {
- case AF_PACKET:
- cfg->rc_via_table = NEIGH_LINK_TABLE;
- break;
- case AF_INET:
- cfg->rc_via_table = NEIGH_ARP_TABLE;
- if (cfg->rc_via_alen != 4)
- goto errout;
- break;
- case AF_INET6:
- cfg->rc_via_table = NEIGH_ND_TABLE;
- if (cfg->rc_via_alen != 16)
- goto errout;
- break;
- default:
- /* Unsupported address family */
+ if (nla_get_via(nla, &cfg->rc_via_alen,
+ &cfg->rc_via_table, cfg->rc_via))
goto errout;
- }
-
- memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen);
+ break;
+ }
+ case RTA_MULTIPATH:
+ {
+ cfg->rc_mp = nla_data(nla);
+ cfg->rc_mp_len = nla_len(nla);
break;
}
default:
@@ -837,16 +1236,54 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
rtm->rtm_type = RTN_UNICAST;
rtm->rtm_flags = 0;
- if (rt->rt_labels &&
- nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label))
- goto nla_put_failure;
- if (nla_put_via(skb, rt->rt_via_table, rt->rt_via, rt->rt_via_alen))
- goto nla_put_failure;
- dev = rtnl_dereference(rt->rt_dev);
- if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
- goto nla_put_failure;
if (nla_put_labels(skb, RTA_DST, 1, &label))
goto nla_put_failure;
+ if (rt->rt_nhn == 1) {
+ const struct mpls_nh *nh = rt->rt_nh;
+
+ if (nh->nh_labels &&
+ nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
+ nh->nh_label))
+ goto nla_put_failure;
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC &&
+ nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh),
+ nh->nh_via_alen))
+ goto nla_put_failure;
+ dev = rtnl_dereference(nh->nh_dev);
+ if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
+ goto nla_put_failure;
+ } else {
+ struct rtnexthop *rtnh;
+ struct nlattr *mp;
+
+ mp = nla_nest_start(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ for_nexthops(rt) {
+ rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+ if (!rtnh)
+ goto nla_put_failure;
+
+ dev = rtnl_dereference(nh->nh_dev);
+ if (dev)
+ rtnh->rtnh_ifindex = dev->ifindex;
+ if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST,
+ nh->nh_labels,
+ nh->nh_label))
+ goto nla_put_failure;
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC &&
+ nla_put_via(skb, nh->nh_via_table,
+ mpls_nh_via(rt, nh),
+ nh->nh_via_alen))
+ goto nla_put_failure;
+
+ /* length of rtnetlink header + attributes */
+ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+ } endfor_nexthops(rt);
+
+ nla_nest_end(skb, mp);
+ }
nlmsg_end(skb, nlh);
return 0;
@@ -866,8 +1303,8 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
ASSERT_RTNL();
index = cb->args[0];
- if (index < 16)
- index = 16;
+ if (index < MPLS_LABEL_FIRST_UNRESERVED)
+ index = MPLS_LABEL_FIRST_UNRESERVED;
platform_label = rtnl_dereference(net->mpls.platform_label);
platform_labels = net->mpls.platform_labels;
@@ -891,12 +1328,33 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt)
{
size_t payload =
NLMSG_ALIGN(sizeof(struct rtmsg))
- + nla_total_size(2 + rt->rt_via_alen) /* RTA_VIA */
+ nla_total_size(4); /* RTA_DST */
- if (rt->rt_labels) /* RTA_NEWDST */
- payload += nla_total_size(rt->rt_labels * 4);
- if (rt->rt_dev) /* RTA_OIF */
- payload += nla_total_size(4);
+
+ if (rt->rt_nhn == 1) {
+ struct mpls_nh *nh = rt->rt_nh;
+
+ if (nh->nh_dev)
+ payload += nla_total_size(4); /* RTA_OIF */
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) /* RTA_VIA */
+ payload += nla_total_size(2 + nh->nh_via_alen);
+ if (nh->nh_labels) /* RTA_NEWDST */
+ payload += nla_total_size(nh->nh_labels * 4);
+ } else {
+ /* each nexthop is packed in an attribute */
+ size_t nhsize = 0;
+
+ for_nexthops(rt) {
+ nhsize += nla_total_size(sizeof(struct rtnexthop));
+ /* RTA_VIA */
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC)
+ nhsize += nla_total_size(2 + nh->nh_via_alen);
+ if (nh->nh_labels)
+ nhsize += nla_total_size(nh->nh_labels * 4);
+ } endfor_nexthops(rt);
+ /* nested attribute */
+ payload += nla_total_size(nhsize);
+ }
+
return payload;
}
@@ -948,23 +1406,29 @@ static int resize_platform_label_table(struct net *net, size_t limit)
/* In case the predefined labels need to be populated */
if (limit > MPLS_LABEL_IPV4NULL) {
struct net_device *lo = net->loopback_dev;
- rt0 = mpls_rt_alloc(lo->addr_len);
+ rt0 = mpls_rt_alloc(1, lo->addr_len);
if (!rt0)
goto nort0;
- RCU_INIT_POINTER(rt0->rt_dev, lo);
+ RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo);
rt0->rt_protocol = RTPROT_KERNEL;
- rt0->rt_via_table = NEIGH_LINK_TABLE;
- memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
+ rt0->rt_payload_type = MPT_IPV4;
+ rt0->rt_nh->nh_via_table = NEIGH_LINK_TABLE;
+ rt0->rt_nh->nh_via_alen = lo->addr_len;
+ memcpy(__mpls_nh_via(rt0, rt0->rt_nh), lo->dev_addr,
+ lo->addr_len);
}
if (limit > MPLS_LABEL_IPV6NULL) {
struct net_device *lo = net->loopback_dev;
- rt2 = mpls_rt_alloc(lo->addr_len);
+ rt2 = mpls_rt_alloc(1, lo->addr_len);
if (!rt2)
goto nort2;
- RCU_INIT_POINTER(rt2->rt_dev, lo);
+ RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo);
rt2->rt_protocol = RTPROT_KERNEL;
- rt2->rt_via_table = NEIGH_LINK_TABLE;
- memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len);
+ rt2->rt_payload_type = MPT_IPV6;
+ rt2->rt_nh->nh_via_table = NEIGH_LINK_TABLE;
+ rt2->rt_nh->nh_via_alen = lo->addr_len;
+ memcpy(__mpls_nh_via(rt2, rt2->rt_nh), lo->dev_addr,
+ lo->addr_len);
}
rtnl_lock();
@@ -974,7 +1438,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
/* Free any labels beyond the new table */
for (index = limit; index < old_limit; index++)
- mpls_route_update(net, index, NULL, NULL, NULL);
+ mpls_route_update(net, index, NULL, NULL);
/* Copy over the old labels */
cp_size = size;
@@ -1066,8 +1530,10 @@ static int mpls_net_init(struct net *net)
table[0].data = net;
net->mpls.ctl = register_net_sysctl(net, "net/mpls", table);
- if (net->mpls.ctl == NULL)
+ if (net->mpls.ctl == NULL) {
+ kfree(table);
return -ENOMEM;
+ }
return 0;
}
diff --git a/kernel/net/mpls/internal.h b/kernel/net/mpls/internal.h
index 8cabeb5a1..bde52ce88 100644
--- a/kernel/net/mpls/internal.h
+++ b/kernel/net/mpls/internal.h
@@ -21,6 +21,76 @@ struct mpls_dev {
struct sk_buff;
+#define LABEL_NOT_SPECIFIED (1 << 20)
+#define MAX_NEW_LABELS 2
+
+/* This maximum ha length is copied from the definition of struct neighbour */
+#define VIA_ALEN_ALIGN sizeof(unsigned long)
+#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, VIA_ALEN_ALIGN))
+
+enum mpls_payload_type {
+ MPT_UNSPEC, /* IPv4 or IPv6 */
+ MPT_IPV4 = 4,
+ MPT_IPV6 = 6,
+
+ /* Other types not implemented:
+ * - Pseudo-wire with or without control word (RFC4385)
+ * - GAL (RFC5586)
+ */
+};
+
+struct mpls_nh { /* next hop label forwarding entry */
+ struct net_device __rcu *nh_dev;
+ u32 nh_label[MAX_NEW_LABELS];
+ u8 nh_labels;
+ u8 nh_via_alen;
+ u8 nh_via_table;
+};
+
+/* The route, nexthops and vias are stored together in the same memory
+ * block:
+ *
+ * +----------------------+
+ * | mpls_route |
+ * +----------------------+
+ * | mpls_nh 0 |
+ * +----------------------+
+ * | ... |
+ * +----------------------+
+ * | mpls_nh n-1 |
+ * +----------------------+
+ * | alignment padding |
+ * +----------------------+
+ * | via[rt_max_alen] 0 |
+ * +----------------------+
+ * | ... |
+ * +----------------------+
+ * | via[rt_max_alen] n-1 |
+ * +----------------------+
+ */
+struct mpls_route { /* next hop label forwarding entry */
+ struct rcu_head rt_rcu;
+ u8 rt_protocol;
+ u8 rt_payload_type;
+ u8 rt_max_alen;
+ unsigned int rt_nhn;
+ struct mpls_nh rt_nh[0];
+};
+
+#define for_nexthops(rt) { \
+ int nhsel; struct mpls_nh *nh; \
+ for (nhsel = 0, nh = (rt)->rt_nh; \
+ nhsel < (rt)->rt_nhn; \
+ nh++, nhsel++)
+
+#define change_nexthops(rt) { \
+ int nhsel; struct mpls_nh *nh; \
+ for (nhsel = 0, nh = (struct mpls_nh *)((rt)->rt_nh); \
+ nhsel < (rt)->rt_nhn; \
+ nh++, nhsel++)
+
+#define endfor_nexthops(rt) }
+
static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
{
return (struct mpls_shim_hdr *)skb_network_header(skb);
@@ -50,7 +120,14 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
return result;
}
-int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]);
-int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]);
+int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels,
+ const u32 label[]);
+int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels,
+ u32 label[]);
+int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
+ u8 via[]);
+bool mpls_output_possible(const struct net_device *dev);
+unsigned int mpls_dev_mtu(const struct net_device *dev);
+bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
#endif /* MPLS_INTERNAL_H */
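The internal.h hunk above packs the route, its nexthop array and the per-nexthop via addresses into a single allocation, with the via area starting on a VIA_ALEN_ALIGN boundary. A minimal sketch of the pointer arithmetic that layout implies, assuming rt_max_alen has already been padded to VIA_ALEN_ALIGN (the helper name is illustrative, not the in-tree __mpls_nh_via):

/* Sketch: locate the via storage of one nexthop inside the single block
 * [mpls_route][rt_nh[0..rt_nhn-1]][padding][via 0]...[via rt_nhn-1]
 */
static inline u8 *sketch_nh_via(struct mpls_route *rt, struct mpls_nh *nh)
{
	/* first via slot sits after the nexthop array, aligned up */
	u8 *vias = PTR_ALIGN((u8 *)&rt->rt_nh[rt->rt_nhn], VIA_ALEN_ALIGN);

	return vias + (nh - rt->rt_nh) * rt->rt_max_alen;
}

The for_nexthops()/endfor_nexthops() pair is used in the usual fib style, with the opening brace hidden in the macro: for_nexthops(rt) { /* use nh and nhsel */ } endfor_nexthops(rt);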
diff --git a/kernel/net/mpls/mpls_gso.c b/kernel/net/mpls/mpls_gso.c
index 809df534a..0183b32da 100644
--- a/kernel/net/mpls/mpls_gso.c
+++ b/kernel/net/mpls/mpls_gso.c
@@ -62,6 +62,7 @@ out:
static struct packet_offload mpls_mc_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_MPLS_MC),
+ .priority = 15,
.callbacks = {
.gso_segment = mpls_gso_segment,
},
@@ -69,6 +70,7 @@ static struct packet_offload mpls_mc_offload __read_mostly = {
static struct packet_offload mpls_uc_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_MPLS_UC),
+ .priority = 15,
.callbacks = {
.gso_segment = mpls_gso_segment,
},
diff --git a/kernel/net/mpls/mpls_iptunnel.c b/kernel/net/mpls/mpls_iptunnel.c
new file mode 100644
index 000000000..64afd3d0b
--- /dev/null
+++ b/kernel/net/mpls/mpls_iptunnel.c
@@ -0,0 +1,231 @@
+/*
+ * mpls tunnels: an implementation of MPLS tunnels using the lightweight
+ * tunnel infrastructure
+ *
+ * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/dst.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/mpls_iptunnel.h>
+#include <linux/mpls_iptunnel.h>
+#include "internal.h"
+
+static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = {
+ [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 },
+};
+
+static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
+{
+ /* The size of the layer 2.5 labels to be added for this route */
+ return en->labels * sizeof(struct mpls_shim_hdr);
+}
+
+int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+ struct mpls_shim_hdr *hdr;
+ struct net_device *out_dev;
+ unsigned int hh_len;
+ unsigned int new_header_size;
+ unsigned int mtu;
+ struct dst_entry *dst = skb_dst(skb);
+ struct rtable *rt = NULL;
+ struct rt6_info *rt6 = NULL;
+ int err = 0;
+ bool bos;
+ int i;
+ unsigned int ttl;
+
+ /* Obtain the ttl */
+ if (dst->ops->family == AF_INET) {
+ ttl = ip_hdr(skb)->ttl;
+ rt = (struct rtable *)dst;
+ } else if (dst->ops->family == AF_INET6) {
+ ttl = ipv6_hdr(skb)->hop_limit;
+ rt6 = (struct rt6_info *)dst;
+ } else {
+ goto drop;
+ }
+
+ skb_orphan(skb);
+
+ /* Find the output device */
+ out_dev = dst->dev;
+ if (!mpls_output_possible(out_dev) ||
+ !dst->lwtstate || skb_warn_if_lro(skb))
+ goto drop;
+
+ skb_forward_csum(skb);
+
+ tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate);
+
+ /* Verify the destination can hold the packet */
+ new_header_size = mpls_encap_size(tun_encap_info);
+ mtu = mpls_dev_mtu(out_dev);
+ if (mpls_pkt_too_big(skb, mtu - new_header_size))
+ goto drop;
+
+ hh_len = LL_RESERVED_SPACE(out_dev);
+ if (!out_dev->header_ops)
+ hh_len = 0;
+
+ /* Ensure there is enough space for the headers in the skb */
+ if (skb_cow(skb, hh_len + new_header_size))
+ goto drop;
+
+ skb_push(skb, new_header_size);
+ skb_reset_network_header(skb);
+
+ skb->dev = out_dev;
+ skb->protocol = htons(ETH_P_MPLS_UC);
+
+ /* Push the new labels */
+ hdr = mpls_hdr(skb);
+ bos = true;
+ for (i = tun_encap_info->labels - 1; i >= 0; i--) {
+ hdr[i] = mpls_entry_encode(tun_encap_info->label[i],
+ ttl, 0, bos);
+ bos = false;
+ }
+
+ if (rt)
+ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
+ skb);
+ else if (rt6)
+ err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway,
+ skb);
+ if (err)
+ net_dbg_ratelimited("%s: packet transmission failed: %d\n",
+ __func__, err);
+
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+ struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
+ struct lwtunnel_state *newts;
+ int tun_encap_info_len;
+ int ret;
+
+ ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
+ mpls_iptunnel_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[MPLS_IPTUNNEL_DST])
+ return -EINVAL;
+
+ tun_encap_info_len = sizeof(*tun_encap_info);
+
+ newts = lwtunnel_state_alloc(tun_encap_info_len);
+ if (!newts)
+ return -ENOMEM;
+
+ newts->len = tun_encap_info_len;
+ tun_encap_info = mpls_lwtunnel_encap(newts);
+ ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
+ &tun_encap_info->labels, tun_encap_info->label);
+ if (ret)
+ goto errout;
+ newts->type = LWTUNNEL_ENCAP_MPLS;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = newts;
+
+ return 0;
+
+errout:
+ kfree(newts);
+ *ts = NULL;
+
+ return ret;
+}
+
+static int mpls_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+
+ tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+ if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
+ tun_encap_info->label))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+
+ tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+ return nla_total_size(tun_encap_info->labels * 4);
+}
+
+static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a);
+ struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b);
+ int l;
+
+ if (a_hdr->labels != b_hdr->labels)
+ return 1;
+
+ for (l = 0; l < MAX_NEW_LABELS; l++)
+ if (a_hdr->label[l] != b_hdr->label[l])
+ return 1;
+ return 0;
+}
+
+static const struct lwtunnel_encap_ops mpls_iptun_ops = {
+ .build_state = mpls_build_state,
+ .output = mpls_output,
+ .fill_encap = mpls_fill_encap_info,
+ .get_encap_size = mpls_encap_nlsize,
+ .cmp_encap = mpls_encap_cmp,
+};
+
+static int __init mpls_iptunnel_init(void)
+{
+ return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_init(mpls_iptunnel_init);
+
+static void __exit mpls_iptunnel_exit(void)
+{
+ lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_exit(mpls_iptunnel_exit);
+
+MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/net/netfilter/Kconfig b/kernel/net/netfilter/Kconfig
index a0f3e6a3c..4692782b5 100644
--- a/kernel/net/netfilter/Kconfig
+++ b/kernel/net/netfilter/Kconfig
@@ -1,6 +1,14 @@
menu "Core Netfilter Configuration"
depends on NET && INET && NETFILTER
+config NETFILTER_INGRESS
+ bool "Netfilter ingress support"
+ default y
+ select NET_INGRESS
+ help
+ This allows you to classify packets at the ingress hook using the
+ Netfilter infrastructure.
+
config NETFILTER_NETLINK
tristate
@@ -198,7 +206,7 @@ config NF_CONNTRACK_FTP
config NF_CONNTRACK_H323
tristate "H.323 protocol support"
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
depends on NETFILTER_ADVANCED
help
H.323 is a VoIP signalling protocol from ITU-T. As one of the most
@@ -346,7 +354,7 @@ config NF_CT_NETLINK_HELPER
select NETFILTER_NETLINK
depends on NF_CT_NETLINK
depends on NETFILTER_NETLINK_QUEUE
- depends on NETFILTER_NETLINK_QUEUE_CT
+ depends on NETFILTER_NETLINK_GLUE_CT
depends on NETFILTER_ADVANCED
help
This option enables the user-space connection tracking helpers
@@ -354,13 +362,14 @@ config NF_CT_NETLINK_HELPER
If unsure, say `N'.
-config NETFILTER_NETLINK_QUEUE_CT
- bool "NFQUEUE integration with Connection Tracking"
- default n
- depends on NETFILTER_NETLINK_QUEUE
+config NETFILTER_NETLINK_GLUE_CT
+ bool "NFQUEUE and NFLOG integration with Connection Tracking"
+ default n
+ depends on (NETFILTER_NETLINK_QUEUE || NETFILTER_NETLINK_LOG) && NF_CT_NETLINK
help
- If this option is enabled, NFQUEUE can include Connection Tracking
- information together with the packet is the enqueued via NFNETLINK.
+ If this option is enabled, NFQUEUE and NFLOG can include
+ Connection Tracking information together with the packet when
+ it is enqueued via NFNETLINK.
config NF_NAT
tristate
@@ -448,6 +457,11 @@ config NF_TABLES_INET
help
This option enables support for a mixed IPv4/IPv6 "inet" table.
+config NF_TABLES_NETDEV
+ tristate "Netfilter nf_tables netdev tables support"
+ help
+ This option enables support for the "netdev" table.
+
config NFT_EXTHDR
tristate "Netfilter nf_tables IPv6 exthdr module"
help
@@ -710,7 +724,7 @@ config NETFILTER_XT_TARGET_HL
config NETFILTER_XT_TARGET_HMARK
tristate '"HMARK" target support'
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on NETFILTER_ADVANCED
---help---
This option adds the "HMARK" target.
@@ -852,8 +866,10 @@ config NETFILTER_XT_TARGET_REDIRECT
config NETFILTER_XT_TARGET_TEE
tristate '"TEE" - packet cloning to alternate destination'
depends on NETFILTER_ADVANCED
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
depends on !NF_CONNTRACK || NF_CONNTRACK
+ select NF_DUP_IPV4
+ select NF_DUP_IPV6 if IP6_NF_IPTABLES != n
---help---
This option adds a "TEE" target with which a packet can be cloned and
this clone be rerouted to another nexthop.
@@ -862,11 +878,11 @@ config NETFILTER_XT_TARGET_TPROXY
tristate '"TPROXY" target transparent proxying support'
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
- depends on (IPV6 || IPV6=n)
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IPV6 || IPV6=n
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on IP_NF_MANGLE
select NF_DEFRAG_IPV4
- select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
This option adds a `TPROXY' target, which is somewhat similar to
REDIRECT. It can only be used in the mangle table and is useful
@@ -902,7 +918,7 @@ config NETFILTER_XT_TARGET_SECMARK
config NETFILTER_XT_TARGET_TCPMSS
tristate '"TCPMSS" target support'
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
default m if NETFILTER_ADVANCED=n
---help---
This option adds a `TCPMSS' target, which allows you to alter the
@@ -1114,7 +1130,7 @@ config NETFILTER_XT_MATCH_ESP
config NETFILTER_XT_MATCH_HASHLIMIT
tristate '"hashlimit" match support'
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on NETFILTER_ADVANCED
help
This option adds a `hashlimit' match.
@@ -1356,10 +1372,10 @@ config NETFILTER_XT_MATCH_SOCKET
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
depends on !NF_CONNTRACK || NF_CONNTRACK
- depends on (IPV6 || IPV6=n)
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IPV6 || IPV6=n
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
select NF_DEFRAG_IPV4
- select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
This option adds a `socket' match, which can be used to match
packets for which a TCP or UDP socket lookup finds a valid socket.
diff --git a/kernel/net/netfilter/Makefile b/kernel/net/netfilter/Makefile
index a87d8b8ec..7638c36b4 100644
--- a/kernel/net/netfilter/Makefile
+++ b/kernel/net/netfilter/Makefile
@@ -10,8 +10,6 @@ obj-$(CONFIG_NETFILTER) = netfilter.o
obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
-nfnetlink_queue-y := nfnetlink_queue_core.o
-nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
@@ -75,6 +73,7 @@ nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o
+obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
obj-$(CONFIG_NFT_META) += nft_meta.o
diff --git a/kernel/net/netfilter/core.c b/kernel/net/netfilter/core.c
index f0adf700b..10880c89d 100644
--- a/kernel/net/netfilter/core.c
+++ b/kernel/net/netfilter/core.c
@@ -40,6 +40,9 @@ EXPORT_SYMBOL(nf_afinfo);
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
+DEFINE_PER_CPU(bool, nf_skb_duplicated);
+EXPORT_SYMBOL_GPL(nf_skb_duplicated);
+
int nf_register_afinfo(const struct nf_afinfo *afinfo)
{
mutex_lock(&afinfo_mutex);
@@ -58,9 +61,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
}
EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
-struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
-EXPORT_SYMBOL(nf_hooks);
-
#ifdef HAVE_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
@@ -68,33 +68,168 @@ EXPORT_SYMBOL(nf_hooks_needed);
static DEFINE_MUTEX(nf_hook_mutex);
-int nf_register_hook(struct nf_hook_ops *reg)
+static struct list_head *nf_find_hook_list(struct net *net,
+ const struct nf_hook_ops *reg)
+{
+ struct list_head *hook_list = NULL;
+
+ if (reg->pf != NFPROTO_NETDEV)
+ hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
+ else if (reg->hooknum == NF_NETDEV_INGRESS) {
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->dev && dev_net(reg->dev) == net)
+ hook_list = &reg->dev->nf_hooks_ingress;
+#endif
+ }
+ return hook_list;
+}
+
+struct nf_hook_entry {
+ const struct nf_hook_ops *orig_ops;
+ struct nf_hook_ops ops;
+};
+
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
+ struct list_head *hook_list;
+ struct nf_hook_entry *entry;
struct nf_hook_ops *elem;
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->orig_ops = reg;
+ entry->ops = *reg;
+
+ hook_list = nf_find_hook_list(net, reg);
+ if (!hook_list) {
+ kfree(entry);
+ return -ENOENT;
+ }
+
mutex_lock(&nf_hook_mutex);
- list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
+ list_for_each_entry(elem, hook_list, list) {
if (reg->priority < elem->priority)
break;
}
- list_add_rcu(&reg->list, elem->list.prev);
+ list_add_rcu(&entry->ops.list, elem->list.prev);
mutex_unlock(&nf_hook_mutex);
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ net_inc_ingress_queue();
+#endif
#ifdef HAVE_JUMP_LABEL
static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
return 0;
}
-EXPORT_SYMBOL(nf_register_hook);
+EXPORT_SYMBOL(nf_register_net_hook);
-void nf_unregister_hook(struct nf_hook_ops *reg)
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
+ struct list_head *hook_list;
+ struct nf_hook_entry *entry;
+ struct nf_hook_ops *elem;
+
+ hook_list = nf_find_hook_list(net, reg);
+ if (!hook_list)
+ return;
+
mutex_lock(&nf_hook_mutex);
- list_del_rcu(&reg->list);
+ list_for_each_entry(elem, hook_list, list) {
+ entry = container_of(elem, struct nf_hook_entry, ops);
+ if (entry->orig_ops == reg) {
+ list_del_rcu(&entry->ops.list);
+ break;
+ }
+ }
mutex_unlock(&nf_hook_mutex);
+ if (&elem->list == hook_list) {
+ WARN(1, "nf_unregister_net_hook: hook not found!\n");
+ return;
+ }
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ net_dec_ingress_queue();
+#endif
#ifdef HAVE_JUMP_LABEL
static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
synchronize_net();
+ nf_queue_nf_hook_drop(net, &entry->ops);
+ /* other CPUs might still process an nfqueue verdict that used reg */
+ synchronize_net();
+ kfree(entry);
+}
+EXPORT_SYMBOL(nf_unregister_net_hook);
+
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+ unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = nf_register_net_hook(net, &reg[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ nf_unregister_net_hooks(net, reg, i);
+ return err;
+}
+EXPORT_SYMBOL(nf_register_net_hooks);
+
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+ unsigned int n)
+{
+ while (n-- > 0)
+ nf_unregister_net_hook(net, &reg[n]);
+}
+EXPORT_SYMBOL(nf_unregister_net_hooks);
+
+static LIST_HEAD(nf_hook_list);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+ struct net *net, *last;
+ int ret;
+
+ rtnl_lock();
+ for_each_net(net) {
+ ret = nf_register_net_hook(net, reg);
+ if (ret && ret != -ENOENT)
+ goto rollback;
+ }
+ list_add_tail(&reg->list, &nf_hook_list);
+ rtnl_unlock();
+
+ return 0;
+rollback:
+ last = net;
+ for_each_net(net) {
+ if (net == last)
+ break;
+ nf_unregister_net_hook(net, reg);
+ }
+ rtnl_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_del(&reg->list);
+ for_each_net(net)
+ nf_unregister_net_hook(net, reg);
+ rtnl_unlock();
}
EXPORT_SYMBOL(nf_unregister_hook);
@@ -142,7 +277,7 @@ unsigned int nf_iterate(struct list_head *head,
/* Optimization: we don't need to hold module
reference here, since function can't sleep. --RR */
repeat:
- verdict = (*elemp)->hook(*elemp, skb, state);
+ verdict = (*elemp)->hook((*elemp)->priv, skb, state);
if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
if (unlikely((verdict & NF_VERDICT_MASK)
@@ -172,11 +307,9 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
/* We may already have this, but read-locks nest anyway */
rcu_read_lock();
- elem = list_entry_rcu(&nf_hooks[state->pf][state->hook],
- struct nf_hook_ops, list);
+ elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
next_hook:
- verdict = nf_iterate(&nf_hooks[state->pf][state->hook], skb, state,
- &elem);
+ verdict = nf_iterate(state->hook_list, skb, state, &elem);
if (verdict == NF_ACCEPT || verdict == NF_STOP) {
ret = 1;
} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
@@ -188,8 +321,6 @@ next_hook:
int err = nf_queue(skb, elem, state,
verdict >> NF_VERDICT_QBITS);
if (err < 0) {
- if (err == -ECANCELED)
- goto next_hook;
if (err == -ESRCH &&
(verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
goto next_hook;
@@ -223,6 +354,12 @@ int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
}
EXPORT_SYMBOL(skb_make_writable);
+/* This needs to be compiled in any case to avoid dependencies between the
+ * nfnetlink_queue code and nf_conntrack.
+ */
+struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfnl_ct_hook);
+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* This does not belong here, but locally generated errors need it if connection
tracking in use: without this, connection may not be in hash table, and hence
@@ -260,12 +397,12 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
}
EXPORT_SYMBOL(nf_conntrack_destroy);
-struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly;
-EXPORT_SYMBOL_GPL(nfq_ct_hook);
-
-struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly;
-EXPORT_SYMBOL_GPL(nfq_ct_nat_hook);
-
+/* Built-in default zone used e.g. by modules. */
+const struct nf_conntrack_zone nf_ct_zone_dflt = {
+ .id = NF_CT_DEFAULT_ZONE_ID,
+ .dir = NF_CT_DEFAULT_ZONE_DIR,
+};
+EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
#endif /* CONFIG_NF_CONNTRACK */
#ifdef CONFIG_NF_NAT_NEEDED
@@ -273,8 +410,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
EXPORT_SYMBOL(nf_nat_decode_session_hook);
#endif
+static int nf_register_hook_list(struct net *net)
+{
+ struct nf_hook_ops *elem;
+ int ret;
+
+ rtnl_lock();
+ list_for_each_entry(elem, &nf_hook_list, list) {
+ ret = nf_register_net_hook(net, elem);
+ if (ret && ret != -ENOENT)
+ goto out_undo;
+ }
+ rtnl_unlock();
+ return 0;
+
+out_undo:
+ list_for_each_entry_continue_reverse(elem, &nf_hook_list, list)
+ nf_unregister_net_hook(net, elem);
+ rtnl_unlock();
+ return ret;
+}
+
+static void nf_unregister_hook_list(struct net *net)
+{
+ struct nf_hook_ops *elem;
+
+ rtnl_lock();
+ list_for_each_entry(elem, &nf_hook_list, list)
+ nf_unregister_net_hook(net, elem);
+ rtnl_unlock();
+}
+
static int __net_init netfilter_net_init(struct net *net)
{
+ int i, h, ret;
+
+ for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
+ for (h = 0; h < NF_MAX_HOOKS; h++)
+ INIT_LIST_HEAD(&net->nf.hooks[i][h]);
+ }
+
#ifdef CONFIG_PROC_FS
net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
net->proc_net);
@@ -285,11 +460,16 @@ static int __net_init netfilter_net_init(struct net *net)
return -ENOMEM;
}
#endif
- return 0;
+ ret = nf_register_hook_list(net);
+ if (ret)
+ remove_proc_entry("netfilter", net->proc_net);
+
+ return ret;
}
static void __net_exit netfilter_net_exit(struct net *net)
{
+ nf_unregister_hook_list(net);
remove_proc_entry("netfilter", net->proc_net);
}
@@ -300,12 +480,7 @@ static struct pernet_operations netfilter_net_ops = {
int __init netfilter_init(void)
{
- int i, h, ret;
-
- for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
- for (h = 0; h < NF_MAX_HOOKS; h++)
- INIT_LIST_HEAD(&nf_hooks[i][h]);
- }
+ int ret;
ret = register_pernet_subsys(&netfilter_net_ops);
if (ret < 0)
diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h b/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h
index 6f024a8a1..b0bc475f6 100644
--- a/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -33,7 +33,7 @@
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype MTYPE
-#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id))
+#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
static void
mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
@@ -41,7 +41,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
struct mtype *map = set->data;
init_timer(&map->gc);
- map->gc.data = (unsigned long) set;
+ map->gc.data = (unsigned long)set;
map->gc.function = gc;
map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&map->gc);
@@ -67,12 +67,9 @@ mtype_destroy(struct ip_set *set)
del_timer_sync(&map->gc);
ip_set_free(map->members);
- if (set->dsize) {
- if (set->extensions & IPSET_EXT_DESTROY)
- mtype_ext_cleanup(set);
- ip_set_free(map->extensions);
- }
- kfree(map);
+ if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set);
+ ip_set_free(map);
set->data = NULL;
}
@@ -92,16 +89,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
{
const struct mtype *map = set->data;
struct nlattr *nested;
+ size_t memsize = sizeof(*map) + map->memsize;
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (mtype_do_head(skb, map) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) +
- map->memsize +
- set->dsize * map->elements)))
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -144,10 +139,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (ret == IPSET_ADD_FAILED) {
if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(x, set)))
+ ip_set_timeout_expired(ext_timeout(x, set))) {
ret = 0;
- else if (!(flags & IPSET_FLAG_EXIST))
+ } else if (!(flags & IPSET_FLAG_EXIST)) {
+ set_bit(e->id, map->members);
return -IPSET_ERR_EXIST;
+ }
/* Element is re-added, cleanup extensions */
ip_set_ext_destroy(set, x);
}
@@ -165,6 +162,10 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
ip_set_init_comment(ext_comment(x, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(x, set), ext);
+
+ /* Activate element */
+ set_bit(e->id, map->members);
+
return 0;
}
@@ -203,10 +204,13 @@ mtype_list(const struct ip_set *set,
struct nlattr *adt, *nested;
void *x;
u32 id, first = cb->args[IPSET_CB_ARG0];
+ int ret = 0;
adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!adt)
return -EMSGSIZE;
+ /* Extensions may be replaced */
+ rcu_read_lock();
for (; cb->args[IPSET_CB_ARG0] < map->elements;
cb->args[IPSET_CB_ARG0]++) {
id = cb->args[IPSET_CB_ARG0];
@@ -214,7 +218,7 @@ mtype_list(const struct ip_set *set,
if (!test_bit(id, map->members) ||
(SET_WITH_TIMEOUT(set) &&
#ifdef IP_SET_BITMAP_STORED_TIMEOUT
- mtype_is_filled((const struct mtype_elem *) x) &&
+ mtype_is_filled((const struct mtype_elem *)x) &&
#endif
ip_set_timeout_expired(ext_timeout(x, set))))
continue;
@@ -222,14 +226,16 @@ mtype_list(const struct ip_set *set,
if (!nested) {
if (id == first) {
nla_nest_cancel(skb, adt);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ goto nla_put_failure;
}
if (mtype_do_list(skb, map, id, set->dsize))
goto nla_put_failure;
if (ip_set_put_extensions(skb, set, x,
- mtype_is_filled((const struct mtype_elem *) x)))
+ mtype_is_filled((const struct mtype_elem *)x)))
goto nla_put_failure;
ipset_nest_end(skb, nested);
}
@@ -238,29 +244,32 @@ mtype_list(const struct ip_set *set,
/* Set listing finished */
cb->args[IPSET_CB_ARG0] = 0;
- return 0;
+ goto out;
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(id == first)) {
cb->args[IPSET_CB_ARG0] = 0;
- return -EMSGSIZE;
+ ret = -EMSGSIZE;
}
ipset_nest_end(skb, adt);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static void
mtype_gc(unsigned long ul_set)
{
- struct ip_set *set = (struct ip_set *) ul_set;
+ struct ip_set *set = (struct ip_set *)ul_set;
struct mtype *map = set->data;
void *x;
u32 id;
/* We run parallel with other readers (test element)
- * but adding/deleting new entries is locked out */
- read_lock_bh(&set->lock);
+ * but adding/deleting new entries is locked out
+ */
+ spin_lock_bh(&set->lock);
for (id = 0; id < map->elements; id++)
if (mtype_gc_test(id, map, set->dsize)) {
x = get_ext(set, map, id);
@@ -269,7 +278,7 @@ mtype_gc(unsigned long ul_set)
ip_set_ext_destroy(set, x);
}
}
- read_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&map->gc);
diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c b/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c
index 55b083ec5..4783efff0 100644
--- a/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -36,11 +36,11 @@ IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip");
#define MTYPE bitmap_ip
+#define HOST_MASK 32
/* Type structure */
struct bitmap_ip {
void *members; /* the set members */
- void *extensions; /* data extensions */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
@@ -48,6 +48,8 @@ struct bitmap_ip {
size_t memsize; /* members size */
u8 netmask; /* subnet netmask */
struct timer_list gc; /* garbage collection */
+ unsigned char extensions[0] /* data extensions */
+ __aligned(__alignof__(u64));
};
/* ADT structure for generic function args */
@@ -58,7 +60,7 @@ struct bitmap_ip_adt_elem {
static inline u32
ip_to_id(const struct bitmap_ip *m, u32 ip)
{
- return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts;
+ return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts;
}
/* Common functions */
@@ -80,7 +82,7 @@ static inline int
bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map,
u32 flags, size_t dsize)
{
- return !!test_and_set_bit(e->id, map->members);
+ return !!test_bit(e->id, map->members);
}
static inline int
@@ -137,20 +139,17 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret = 0;
- if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ if (unlikely(!tb[IPSET_ATTR_IP]))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -174,11 +173,12 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
- } else
+ } else {
ip_to = ip;
+ }
if (ip_to > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
@@ -189,8 +189,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -225,13 +225,6 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
- if (set->dsize) {
- map->extensions = ip_set_alloc(set->dsize * elements);
- if (!map->extensions) {
- kfree(map->members);
- return false;
- }
- }
map->first_ip = first_ip;
map->last_ip = last_ip;
map->elements = elements;
@@ -277,16 +270,17 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (cidr >= 32)
+ if (cidr >= HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(first_ip, last_ip, cidr);
- } else
+ } else {
return -IPSET_ERR_PROTOCOL;
+ }
if (tb[IPSET_ATTR_NETMASK]) {
netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
- if (netmask > 32)
+ if (netmask > HOST_MASK)
return -IPSET_ERR_INVALID_NETMASK;
first_ip &= ip_set_hostmask(netmask);
@@ -316,13 +310,13 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
pr_debug("hosts %u, elements %llu\n",
hosts, (unsigned long long)elements);
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ set->dsize = ip_set_elem_len(set, tb, 0, 0);
+ map = ip_set_alloc(sizeof(*map) + elements * set->dsize);
if (!map)
return -ENOMEM;
map->memsize = bitmap_bytes(0, elements - 1);
set->variant = &bitmap_ip;
- set->dsize = ip_set_elem_len(set, tb, 0);
if (!init_map_ip(set, map, first_ip, last_ip,
elements, hosts, netmask)) {
kfree(map);
@@ -360,7 +354,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -377,6 +372,7 @@ bitmap_ip_init(void)
static void __exit
bitmap_ip_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&bitmap_ip_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 86104744b..29dde2083 100644
--- a/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -36,6 +36,7 @@ IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip,mac");
#define MTYPE bitmap_ipmac
+#define HOST_MASK 32
#define IP_SET_BITMAP_STORED_TIMEOUT
enum {
@@ -46,24 +47,26 @@ enum {
/* Type structure */
struct bitmap_ipmac {
void *members; /* the set members */
- void *extensions; /* MAC + data extensions */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc; /* garbage collector */
+ unsigned char extensions[0] /* MAC + data extensions */
+ __aligned(__alignof__(u64));
};
/* ADT structure for generic function args */
struct bitmap_ipmac_adt_elem {
+ unsigned char ether[ETH_ALEN] __aligned(2);
u16 id;
- unsigned char *ether;
+ u16 add_mac;
};
struct bitmap_ipmac_elem {
unsigned char ether[ETH_ALEN];
unsigned char filled;
-} __attribute__ ((aligned));
+} __aligned(__alignof__(u64));
static inline u32
ip_to_id(const struct bitmap_ipmac *m, u32 ip)
@@ -71,11 +74,11 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip)
return ip - m->first_ip;
}
-static inline struct bitmap_ipmac_elem *
-get_elem(void *extensions, u16 id, size_t dsize)
-{
- return (struct bitmap_ipmac_elem *)(extensions + id * dsize);
-}
+#define get_elem(extensions, id, dsize) \
+ (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize))
+
+#define get_const_elem(extensions, id, dsize) \
+ (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize))
/* Common functions */
@@ -87,10 +90,9 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
if (!test_bit(e->id, map->members))
return 0;
- elem = get_elem(map->extensions, e->id, dsize);
- if (elem->filled == MAC_FILLED)
- return e->ether == NULL ||
- ether_addr_equal(e->ether, elem->ether);
+ elem = get_const_elem(map->extensions, e->id, dsize);
+ if (e->add_mac && elem->filled == MAC_FILLED)
+ return ether_addr_equal(e->ether, elem->ether);
/* Trigger kernel to fill out the ethernet address */
return -EAGAIN;
}
@@ -102,7 +104,7 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
if (!test_bit(id, map->members))
return 0;
- elem = get_elem(map->extensions, id, dsize);
+ elem = get_const_elem(map->extensions, id, dsize);
/* Timer not started for the incomplete elements */
return elem->filled == MAC_FILLED;
}
@@ -130,8 +132,9 @@ bitmap_ipmac_add_timeout(unsigned long *timeout,
/* If MAC is unset yet, we store plain timeout value
* because the timer is not activated yet
* and we can reuse it later when MAC is filled out,
- * possibly by the kernel */
- if (e->ether)
+ * possibly by the kernel
+ */
+ if (e->add_mac)
ip_set_timeout_set(timeout, t);
else
*timeout = t;
@@ -146,28 +149,35 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
struct bitmap_ipmac_elem *elem;
elem = get_elem(map->extensions, e->id, dsize);
- if (test_and_set_bit(e->id, map->members)) {
+ if (test_bit(e->id, map->members)) {
if (elem->filled == MAC_FILLED) {
- if (e->ether && (flags & IPSET_FLAG_EXIST))
- memcpy(elem->ether, e->ether, ETH_ALEN);
+ if (e->add_mac &&
+ (flags & IPSET_FLAG_EXIST) &&
+ !ether_addr_equal(e->ether, elem->ether)) {
+ /* memcpy isn't atomic */
+ clear_bit(e->id, map->members);
+ smp_mb__after_atomic();
+ ether_addr_copy(elem->ether, e->ether);
+ }
return IPSET_ADD_FAILED;
- } else if (!e->ether)
+ } else if (!e->add_mac)
/* Already added without ethernet address */
return IPSET_ADD_FAILED;
/* Fill the MAC address and trigger the timer activation */
- memcpy(elem->ether, e->ether, ETH_ALEN);
+ clear_bit(e->id, map->members);
+ smp_mb__after_atomic();
+ ether_addr_copy(elem->ether, e->ether);
elem->filled = MAC_FILLED;
return IPSET_ADD_START_STORED_TIMEOUT;
- } else if (e->ether) {
+ } else if (e->add_mac) {
/* We can store MAC too */
- memcpy(elem->ether, e->ether, ETH_ALEN);
+ ether_addr_copy(elem->ether, e->ether);
elem->filled = MAC_FILLED;
return 0;
- } else {
- elem->filled = MAC_UNSET;
- /* MAC is not stored yet, don't start timer */
- return IPSET_ADD_STORE_PLAIN_TIMEOUT;
}
+ elem->filled = MAC_UNSET;
+ /* MAC is not stored yet, don't start timer */
+ return IPSET_ADD_STORE_PLAIN_TIMEOUT;
}
static inline int
@@ -182,7 +192,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
u32 id, size_t dsize)
{
const struct bitmap_ipmac_elem *elem =
- get_elem(map->extensions, id, dsize);
+ get_const_elem(map->extensions, id, dsize);
return nla_put_ipaddr4(skb, IPSET_ATTR_IP,
htonl(map->first_ip + id)) ||
@@ -204,7 +214,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
{
struct bitmap_ipmac *map = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct bitmap_ipmac_adt_elem e = { .id = 0 };
+ struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
u32 ip;
@@ -222,7 +232,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
return -EINVAL;
e.id = ip_to_id(map, ip);
- e.ether = eth_hdr(skb)->h_source;
+ memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -238,20 +248,17 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip = 0;
int ret = 0;
- if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ if (unlikely(!tb[IPSET_ATTR_IP]))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -259,11 +266,10 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
return -IPSET_ERR_BITMAP_RANGE;
e.id = ip_to_id(map, ip);
- if (tb[IPSET_ATTR_ETHER])
- e.ether = nla_data(tb[IPSET_ATTR_ETHER]);
- else
- e.ether = NULL;
-
+ if (tb[IPSET_ATTR_ETHER]) {
+ memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ e.add_mac = 1;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_eexist(ret, flags) ? 0 : ret;
@@ -294,13 +300,6 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
- if (set->dsize) {
- map->extensions = ip_set_alloc(set->dsize * elements);
- if (!map->extensions) {
- kfree(map->members);
- return false;
- }
- }
map->first_ip = first_ip;
map->last_ip = last_ip;
map->elements = elements;
@@ -343,25 +342,27 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (cidr >= 32)
+ if (cidr >= HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(first_ip, last_ip, cidr);
- } else
+ } else {
return -IPSET_ERR_PROTOCOL;
+ }
elements = (u64)last_ip - first_ip + 1;
if (elements > IPSET_BITMAP_MAX_RANGE + 1)
return -IPSET_ERR_BITMAP_RANGE_SIZE;
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ set->dsize = ip_set_elem_len(set, tb,
+ sizeof(struct bitmap_ipmac_elem),
+ __alignof__(struct bitmap_ipmac_elem));
+ map = ip_set_alloc(sizeof(*map) + elements * set->dsize);
if (!map)
return -ENOMEM;
map->memsize = bitmap_bytes(0, elements - 1);
set->variant = &bitmap_ipmac;
- set->dsize = ip_set_elem_len(set, tb,
- sizeof(struct bitmap_ipmac_elem));
if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
kfree(map);
return -ENOMEM;
@@ -397,7 +398,8 @@ static struct ip_set_type bitmap_ipmac_type = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -414,6 +416,7 @@ bitmap_ipmac_init(void)
static void __exit
bitmap_ipmac_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&bitmap_ipmac_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_port.c b/kernel/net/netfilter/ipset/ip_set_bitmap_port.c
index 005dd3644..7f0c73335 100644
--- a/kernel/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/kernel/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -35,12 +35,13 @@ MODULE_ALIAS("ip_set_bitmap:port");
/* Type structure */
struct bitmap_port {
void *members; /* the set members */
- void *extensions; /* data extensions */
u16 first_port; /* host byte order, included in range */
u16 last_port; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc; /* garbage collection */
+ unsigned char extensions[0] /* data extensions */
+ __aligned(__alignof__(u64));
};
/* ADT structure for generic function args */
@@ -73,7 +74,7 @@ static inline int
bitmap_port_do_add(const struct bitmap_port_adt_elem *e,
struct bitmap_port *map, u32 flags, size_t dsize)
{
- return !!test_and_set_bit(e->id, map->members);
+ return !!test_bit(e->id, map->members);
}
static inline int
@@ -136,19 +137,13 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
u16 port_to;
int ret = 0;
- if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
+ return -IPSET_ERR_PROTOCOL;
+
port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
if (port < map->first_port || port > map->last_port)
return -IPSET_ERR_BITMAP_RANGE;
@@ -168,8 +163,9 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
if (port < map->first_port)
return -IPSET_ERR_BITMAP_RANGE;
}
- } else
+ } else {
port_to = port;
+ }
if (port_to > map->last_port)
return -IPSET_ERR_BITMAP_RANGE;
@@ -180,8 +176,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -214,13 +210,6 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
- if (set->dsize) {
- map->extensions = ip_set_alloc(set->dsize * map->elements);
- if (!map->extensions) {
- kfree(map->members);
- return false;
- }
- }
map->first_port = first_port;
map->last_port = last_port;
set->timeout = IPSET_NO_TIMEOUT;
@@ -237,6 +226,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
{
struct bitmap_port *map;
u16 first_port, last_port;
+ u32 elements;
if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
@@ -253,14 +243,15 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
last_port = tmp;
}
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ elements = last_port - first_port + 1;
+ set->dsize = ip_set_elem_len(set, tb, 0, 0);
+ map = ip_set_alloc(sizeof(*map) + elements * set->dsize);
if (!map)
return -ENOMEM;
- map->elements = last_port - first_port + 1;
+ map->elements = elements;
map->memsize = bitmap_bytes(0, map->elements);
set->variant = &bitmap_port;
- set->dsize = ip_set_elem_len(set, tb, 0);
if (!init_map_port(set, map, first_port, last_port)) {
kfree(map);
return -ENOMEM;
@@ -294,7 +285,8 @@ static struct ip_set_type bitmap_port_type = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -311,6 +303,7 @@ bitmap_port_init(void)
static void __exit
bitmap_port_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&bitmap_port_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_core.c b/kernel/net/netfilter/ipset/ip_set_core.c
index d259da3ce..54f3d7cb2 100644
--- a/kernel/net/netfilter/ipset/ip_set_core.c
+++ b/kernel/net/netfilter/ipset/ip_set_core.c
@@ -32,8 +32,10 @@ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */
struct ip_set_net {
struct ip_set * __rcu *ip_set_list; /* all individual sets */
ip_set_id_t ip_set_max; /* max number of sets */
- int is_deleted; /* deleted by ip_set_net_exit */
+ bool is_deleted; /* deleted by ip_set_net_exit */
+ bool is_destroyed; /* all sets are destroyed */
};
+
static int ip_set_net_id __read_mostly;
static inline struct ip_set_net *ip_set_pernet(struct net *net)
@@ -42,7 +44,7 @@ static inline struct ip_set_net *ip_set_pernet(struct net *net)
}
#define IP_SET_INC 64
-#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0)
+#define STRNCMP(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0)
static unsigned int max_sets;
@@ -59,8 +61,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
#define ip_set(inst, id) \
ip_set_dereference((inst)->ip_set_list)[id]
-/*
- * The set types are implemented in modules and registered set types
+/* The set types are implemented in modules and registered set types
* can be found in ip_set_type_list. Adding/deleting types is
* serialized by ip_set_type_mutex.
*/
@@ -85,7 +86,7 @@ find_set_type(const char *name, u8 family, u8 revision)
struct ip_set_type *type;
list_for_each_entry_rcu(type, &ip_set_type_list, list)
- if (STREQ(type->name, name) &&
+ if (STRNCMP(type->name, name) &&
(type->family == family ||
type->family == NFPROTO_UNSPEC) &&
revision >= type->revision_min &&
@@ -130,9 +131,10 @@ __find_set_type_get(const char *name, u8 family, u8 revision,
goto unlock;
}
/* Make sure the type is already loaded
- * but we don't support the revision */
+ * but we don't support the revision
+ */
list_for_each_entry_rcu(type, &ip_set_type_list, list)
- if (STREQ(type->name, name)) {
+ if (STRNCMP(type->name, name)) {
err = -IPSET_ERR_FIND_TYPE;
goto unlock;
}
@@ -166,7 +168,7 @@ __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max,
*min = 255; *max = 0;
rcu_read_lock();
list_for_each_entry_rcu(type, &ip_set_type_list, list)
- if (STREQ(type->name, name) &&
+ if (STRNCMP(type->name, name) &&
(type->family == family ||
type->family == NFPROTO_UNSPEC)) {
found = true;
@@ -208,15 +210,15 @@ ip_set_type_register(struct ip_set_type *type)
pr_warn("ip_set type %s, family %s with revision min %u already registered!\n",
type->name, family_name(type->family),
type->revision_min);
- ret = -EINVAL;
- goto unlock;
+ ip_set_type_unlock();
+ return -EINVAL;
}
list_add_rcu(&type->list, &ip_set_type_list);
pr_debug("type %s, family %s, revision %u:%u registered.\n",
type->name, family_name(type->family),
type->revision_min, type->revision_max);
-unlock:
ip_set_type_unlock();
+
return ret;
}
EXPORT_SYMBOL_GPL(ip_set_type_register);
@@ -230,12 +232,12 @@ ip_set_type_unregister(struct ip_set_type *type)
pr_warn("ip_set type %s, family %s with revision min %u not registered\n",
type->name, family_name(type->family),
type->revision_min);
- goto unlock;
+ ip_set_type_unlock();
+ return;
}
list_del_rcu(&type->list);
pr_debug("type %s, family %s with revision min %u unregistered.\n",
type->name, family_name(type->family), type->revision_min);
-unlock:
ip_set_type_unlock();
synchronize_rcu();
@@ -289,7 +291,7 @@ static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
int
ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr)
{
- struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1];
if (unlikely(!flag_nested(nla)))
return -IPSET_ERR_PROTOCOL;
@@ -306,7 +308,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
int
ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
{
- struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1];
if (unlikely(!flag_nested(nla)))
return -IPSET_ERR_PROTOCOL;
@@ -317,7 +319,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
return -IPSET_ERR_PROTOCOL;
memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]),
- sizeof(struct in6_addr));
+ sizeof(struct in6_addr));
return 0;
}
EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
@@ -362,25 +364,27 @@ add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[])
}
size_t
-ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len)
+ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len,
+ size_t align)
{
enum ip_set_ext_id id;
- size_t offset = 0;
u32 cadt_flags = 0;
if (tb[IPSET_ATTR_CADT_FLAGS])
cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_WITH_FORCEADD)
set->flags |= IPSET_CREATE_FLAG_FORCEADD;
+ if (!align)
+ align = 1;
for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
if (!add_extension(id, cadt_flags, tb))
continue;
- offset += ALIGN(len + offset, ip_set_extensions[id].align);
- set->offset[id] = offset;
+ len = ALIGN(len, ip_set_extensions[id].align);
+ set->offset[id] = len;
set->extensions |= ip_set_extensions[id].type;
- offset += ip_set_extensions[id].len;
+ len += ip_set_extensions[id].len;
}
- return len + offset;
+ return ALIGN(len, align);
}
EXPORT_SYMBOL_GPL(ip_set_elem_len);
@@ -389,13 +393,22 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext *ext)
{
u64 fullmark;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!(set->extensions & IPSET_EXT_TIMEOUT))
+ if (!SET_WITH_TIMEOUT(set))
return -IPSET_ERR_TIMEOUT;
ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
}
if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) {
- if (!(set->extensions & IPSET_EXT_COUNTER))
+ if (!SET_WITH_COUNTER(set))
return -IPSET_ERR_COUNTER;
if (tb[IPSET_ATTR_BYTES])
ext->bytes = be64_to_cpu(nla_get_be64(
@@ -405,25 +418,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
tb[IPSET_ATTR_PACKETS]));
}
if (tb[IPSET_ATTR_COMMENT]) {
- if (!(set->extensions & IPSET_EXT_COMMENT))
+ if (!SET_WITH_COMMENT(set))
return -IPSET_ERR_COMMENT;
ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]);
}
if (tb[IPSET_ATTR_SKBMARK]) {
- if (!(set->extensions & IPSET_EXT_SKBINFO))
+ if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK]));
ext->skbmark = fullmark >> 32;
ext->skbmarkmask = fullmark & 0xffffffff;
}
if (tb[IPSET_ATTR_SKBPRIO]) {
- if (!(set->extensions & IPSET_EXT_SKBINFO))
+ if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
ext->skbprio = be32_to_cpu(nla_get_be32(
tb[IPSET_ATTR_SKBPRIO]));
}
if (tb[IPSET_ATTR_SKBQUEUE]) {
- if (!(set->extensions & IPSET_EXT_SKBINFO))
+ if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
ext->skbqueue = be16_to_cpu(nla_get_be16(
tb[IPSET_ATTR_SKBQUEUE]));
@@ -432,8 +445,32 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
}
EXPORT_SYMBOL_GPL(ip_set_get_extensions);
-/*
- * Creating/destroying/renaming/swapping affect the existence and
+int
+ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
+ const void *e, bool active)
+{
+ if (SET_WITH_TIMEOUT(set)) {
+ unsigned long *timeout = ext_timeout(e, set);
+
+ if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(active ? ip_set_timeout_get(timeout)
+ : *timeout)))
+ return -EMSGSIZE;
+ }
+ if (SET_WITH_COUNTER(set) &&
+ ip_set_put_counter(skb, ext_counter(e, set)))
+ return -EMSGSIZE;
+ if (SET_WITH_COMMENT(set) &&
+ ip_set_put_comment(skb, ext_comment(e, set)))
+ return -EMSGSIZE;
+ if (SET_WITH_SKBINFO(set) &&
+ ip_set_put_skbinfo(skb, ext_skbinfo(e, set)))
+ return -EMSGSIZE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_put_extensions);
+
+/* Creating/destroying/renaming/swapping affect the existence and
* the properties of a set. All of these can be executed from userspace
* only and serialized by the nfnl mutex indirectly from nfnetlink.
*
@@ -460,8 +497,7 @@ __ip_set_put(struct ip_set *set)
write_unlock_bh(&ip_set_ref_lock);
}
-/*
- * Add, del and test set entries from kernel.
+/* Add, del and test set entries from kernel.
*
* The set behind the index must exist and must be referenced
* so it can't be destroyed (or changed) under our foot.
@@ -485,27 +521,26 @@ int
ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(
- dev_net(par->in ? par->in : par->out), index);
+ struct ip_set *set = ip_set_rcu_get(par->net, index);
int ret = 0;
- BUG_ON(set == NULL);
+ BUG_ON(!set);
pr_debug("set %s, index %u\n", set->name, index);
if (opt->dim < set->type->dimension ||
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return 0;
- read_lock_bh(&set->lock);
+ rcu_read_lock_bh();
ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
- read_unlock_bh(&set->lock);
+ rcu_read_unlock_bh();
if (ret == -EAGAIN) {
/* Type requests element to be completed */
pr_debug("element must be completed, ADD is triggered\n");
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
ret = 1;
} else {
/* --return-nomatch: invert matched element */
@@ -524,20 +559,19 @@ int
ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(
- dev_net(par->in ? par->in : par->out), index);
+ struct ip_set *set = ip_set_rcu_get(par->net, index);
int ret;
- BUG_ON(set == NULL);
+ BUG_ON(!set);
pr_debug("set %s, index %u\n", set->name, index);
if (opt->dim < set->type->dimension ||
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
return ret;
}
@@ -547,27 +581,25 @@ int
ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(
- dev_net(par->in ? par->in : par->out), index);
+ struct ip_set *set = ip_set_rcu_get(par->net, index);
int ret = 0;
- BUG_ON(set == NULL);
+ BUG_ON(!set);
pr_debug("set %s, index %u\n", set->name, index);
if (opt->dim < set->type->dimension ||
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
return ret;
}
EXPORT_SYMBOL_GPL(ip_set_del);
-/*
- * Find set by name, reference it once. The reference makes sure the
+/* Find set by name, reference it once. The reference makes sure the
* thing pointed to, does not go away under our feet.
*
*/
@@ -581,7 +613,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
rcu_read_lock();
for (i = 0; i < inst->ip_set_max; i++) {
s = rcu_dereference(inst->ip_set_list)[i];
- if (s != NULL && STREQ(s->name, name)) {
+ if (s && STRNCMP(s->name, name)) {
__ip_set_get(s);
index = i;
*set = s;
@@ -594,8 +626,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
}
EXPORT_SYMBOL_GPL(ip_set_get_byname);
-/*
- * If the given set pointer points to a valid set, decrement
+/* If the given set pointer points to a valid set, decrement
* reference count by 1. The caller shall not assume the index
* to be valid, after calling this function.
*
@@ -608,7 +639,7 @@ __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
rcu_read_lock();
set = rcu_dereference(inst->ip_set_list)[index];
- if (set != NULL)
+ if (set)
__ip_set_put(set);
rcu_read_unlock();
}
@@ -622,8 +653,7 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index)
}
EXPORT_SYMBOL_GPL(ip_set_put_byindex);
-/*
- * Get the name of a set behind a set index.
+/* Get the name of a set behind a set index.
* We assume the set is referenced, so it does exist and
* can't be destroyed. The set cannot be renamed due to
* the referencing either.
@@ -634,7 +664,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index)
{
const struct ip_set *set = ip_set_rcu_get(net, index);
- BUG_ON(set == NULL);
+ BUG_ON(!set);
BUG_ON(set->ref == 0);
/* Referenced, so it's safe */
@@ -642,13 +672,11 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index)
}
EXPORT_SYMBOL_GPL(ip_set_name_byindex);
-/*
- * Routines to call by external subsystems, which do not
+/* Routines to call by external subsystems, which do not
* call nfnl_lock for us.
*/
-/*
- * Find set by index, reference it once. The reference makes sure the
+/* Find set by index, reference it once. The reference makes sure the
* thing pointed to, does not go away under our feet.
*
* The nfnl mutex is used in the function.
@@ -674,8 +702,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
}
EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
-/*
- * If the given set pointer points to a valid set, decrement
+/* If the given set pointer points to a valid set, decrement
* reference count by 1. The caller shall not assume the index
* to be valid, after calling this function.
*
@@ -690,15 +717,14 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index)
nfnl_lock(NFNL_SUBSYS_IPSET);
if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
set = ip_set(inst, index);
- if (set != NULL)
+ if (set)
__ip_set_put(set);
}
nfnl_unlock(NFNL_SUBSYS_IPSET);
}
EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
-/*
- * Communication protocol with userspace over netlink.
+/* Communication protocol with userspace over netlink.
*
* The commands are serialized by the nfnl mutex.
*/
@@ -725,7 +751,7 @@ start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8),
sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ if (!nlh)
return NULL;
nfmsg = nlmsg_data(nlh);
@@ -758,7 +784,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
*id = IPSET_INVALID_ID;
for (i = 0; i < inst->ip_set_max; i++) {
set = ip_set(inst, i);
- if (set != NULL && STREQ(set->name, name)) {
+ if (set && STRNCMP(set->name, name)) {
*id = i;
break;
}
@@ -784,10 +810,10 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
*index = IPSET_INVALID_ID;
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s == NULL) {
+ if (!s) {
if (*index == IPSET_INVALID_ID)
*index = i;
- } else if (STREQ(name, s->name)) {
+ } else if (STRNCMP(name, s->name)) {
/* Name clash */
*set = s;
return -EEXIST;
@@ -816,18 +842,18 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set, *clash = NULL;
ip_set_id_t index = IPSET_INVALID_ID;
- struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
+ struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {};
const char *name, *typename;
u8 family, revision;
u32 flags = flag_exist(nlh);
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
- attr[IPSET_ATTR_TYPENAME] == NULL ||
- attr[IPSET_ATTR_REVISION] == NULL ||
- attr[IPSET_ATTR_FAMILY] == NULL ||
- (attr[IPSET_ATTR_DATA] != NULL &&
+ !attr[IPSET_ATTR_SETNAME] ||
+ !attr[IPSET_ATTR_TYPENAME] ||
+ !attr[IPSET_ATTR_REVISION] ||
+ !attr[IPSET_ATTR_FAMILY] ||
+ (attr[IPSET_ATTR_DATA] &&
!flag_nested(attr[IPSET_ATTR_DATA]))))
return -IPSET_ERR_PROTOCOL;
@@ -838,33 +864,29 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
name, typename, family_name(family), revision);
- /*
- * First, and without any locks, allocate and initialize
+ /* First, and without any locks, allocate and initialize
* a normal base set structure.
*/
- set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
+ set = kzalloc(sizeof(*set), GFP_KERNEL);
if (!set)
return -ENOMEM;
- rwlock_init(&set->lock);
+ spin_lock_init(&set->lock);
strlcpy(set->name, name, IPSET_MAXNAMELEN);
set->family = family;
set->revision = revision;
- /*
- * Next, check that we know the type, and take
+ /* Next, check that we know the type, and take
* a reference on the type, to make sure it stays available
* while constructing our new set.
*
* After referencing the type, we try to create the type
* specific part of the set without holding any locks.
*/
- ret = find_set_type_get(typename, family, revision, &(set->type));
+ ret = find_set_type_get(typename, family, revision, &set->type);
if (ret)
goto out;
- /*
- * Without holding any locks, create private part.
- */
+ /* Without holding any locks, create private part. */
if (attr[IPSET_ATTR_DATA] &&
nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
set->type->create_policy)) {
@@ -878,8 +900,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
/* BTW, ret==0 here. */
- /*
- * Here, we have a valid, constructed set and we are protected
+ /* Here, we have a valid, constructed set and we are protected
* by the nfnl mutex. Find the first free index in ip_set_list
* and check clashing.
*/
@@ -887,7 +908,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
if (ret == -EEXIST) {
/* If this is the same set and requested, ignore error */
if ((flags & IPSET_FLAG_EXIST) &&
- STREQ(set->type->name, clash->type->name) &&
+ STRNCMP(set->type->name, clash->type->name) &&
set->type->family == clash->type->family &&
set->type->revision_min == clash->type->revision_min &&
set->type->revision_max == clash->type->revision_max &&
@@ -902,7 +923,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
/* Wraparound */
goto cleanup;
- list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL);
+ list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL);
if (!list)
goto cleanup;
/* nfnl mutex is held, both lists are valid */
@@ -916,12 +937,11 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
inst->ip_set_max = i;
kfree(tmp);
ret = 0;
- } else if (ret)
+ } else if (ret) {
goto cleanup;
+ }
- /*
- * Finally! Add our shiny new set to the list, and be done.
- */
+ /* Finally! Add our shiny new set to the list, and be done. */
pr_debug("create: '%s' created with index %u!\n", set->name, index);
ip_set(inst, index) = set;
@@ -946,12 +966,9 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
};
static void
-ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)
+ip_set_destroy_set(struct ip_set *set)
{
- struct ip_set *set = ip_set(inst, index);
-
pr_debug("set: %s\n", set->name);
- ip_set(inst, index) = NULL;
/* Must call it without holding any lock */
set->variant->destroy(set);
@@ -986,30 +1003,36 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
if (!attr[IPSET_ATTR_SETNAME]) {
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s != NULL && s->ref) {
+ if (s && s->ref) {
ret = -IPSET_ERR_BUSY;
goto out;
}
}
+ inst->is_destroyed = true;
read_unlock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s != NULL)
- ip_set_destroy_set(inst, i);
+ if (s) {
+ ip_set(inst, i) = NULL;
+ ip_set_destroy_set(s);
+ }
}
+ /* Modified by ip_set_destroy() only, which is serialized */
+ inst->is_destroyed = false;
} else {
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
- if (s == NULL) {
+ if (!s) {
ret = -ENOENT;
goto out;
} else if (s->ref) {
ret = -IPSET_ERR_BUSY;
goto out;
}
+ ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock);
- ip_set_destroy_set(inst, i);
+ ip_set_destroy_set(s);
}
return 0;
out:
@@ -1024,9 +1047,9 @@ ip_set_flush_set(struct ip_set *set)
{
pr_debug("set: %s\n", set->name);
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
set->variant->flush(set);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
}
static int
@@ -1044,12 +1067,12 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
if (!attr[IPSET_ATTR_SETNAME]) {
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s != NULL)
+ if (s)
ip_set_flush_set(s);
}
} else {
s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (s == NULL)
+ if (!s)
return -ENOENT;
ip_set_flush_set(s);
@@ -1081,12 +1104,12 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
- attr[IPSET_ATTR_SETNAME2] == NULL))
+ !attr[IPSET_ATTR_SETNAME] ||
+ !attr[IPSET_ATTR_SETNAME2]))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
read_lock_bh(&ip_set_ref_lock);
@@ -1098,7 +1121,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s != NULL && STREQ(s->name, name2)) {
+ if (s && STRNCMP(s->name, name2)) {
ret = -IPSET_ERR_EXIST_SETNAME2;
goto out;
}
@@ -1130,23 +1153,24 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
char from_name[IPSET_MAXNAMELEN];
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
- attr[IPSET_ATTR_SETNAME2] == NULL))
+ !attr[IPSET_ATTR_SETNAME] ||
+ !attr[IPSET_ATTR_SETNAME2]))
return -IPSET_ERR_PROTOCOL;
from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&from_id);
- if (from == NULL)
+ if (!from)
return -ENOENT;
to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]),
&to_id);
- if (to == NULL)
+ if (!to)
return -IPSET_ERR_EXIST_SETNAME2;
/* Features must not change.
- * Not an artificial restriction anymore, as we must prevent
- * possible loops created by swapping in setlist type of sets. */
+	 * Not an artificial restriction anymore, as we must prevent
+ * possible loops created by swapping in setlist type of sets.
+ */
if (!(from->type->features == to->type->features &&
from->family == to->family))
return -IPSET_ERR_TYPE_MISMATCH;
@@ -1177,12 +1201,16 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
static int
ip_set_dump_done(struct netlink_callback *cb)
{
- struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
if (cb->args[IPSET_CB_ARG0]) {
- pr_debug("release set %s\n",
- ip_set(inst, cb->args[IPSET_CB_INDEX])->name);
- __ip_set_put_byindex(inst,
- (ip_set_id_t) cb->args[IPSET_CB_INDEX]);
+ struct ip_set_net *inst =
+ (struct ip_set_net *)cb->args[IPSET_CB_NET];
+ ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
+ struct ip_set *set = ip_set(inst, index);
+
+ if (set->variant->uref)
+ set->variant->uref(set, cb, false);
+ pr_debug("release set %s\n", set->name);
+ __ip_set_put_byindex(inst, index);
}
return 0;
}
@@ -1204,7 +1232,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
{
struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
- struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1];
struct nlattr *attr = (void *)nlh + min_len;
u32 dump_type;
ip_set_id_t index;
@@ -1213,27 +1241,23 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
nla_parse(cda, IPSET_ATTR_CMD_MAX,
attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
- /* cb->args[IPSET_CB_NET]: net namespace
- * [IPSET_CB_DUMP]: dump single set/all sets
- * [IPSET_CB_INDEX]: set index
- * [IPSET_CB_ARG0]: type specific
- */
-
if (cda[IPSET_ATTR_SETNAME]) {
struct ip_set *set;
set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
&index);
- if (set == NULL)
+ if (!set)
return -ENOENT;
dump_type = DUMP_ONE;
cb->args[IPSET_CB_INDEX] = index;
- } else
+ } else {
dump_type = DUMP_ALL;
+ }
if (cda[IPSET_ATTR_FLAGS]) {
u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
+
dump_type |= (f << 16);
}
cb->args[IPSET_CB_NET] = (unsigned long)inst;
@@ -1251,6 +1275,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));
u32 dump_type, dump_flags;
+ bool is_destroyed;
int ret = 0;
if (!cb->args[IPSET_CB_DUMP]) {
@@ -1258,7 +1283,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
if (ret < 0) {
nlh = nlmsg_hdr(cb->skb);
/* We have to create and send the error message
- * manually :-( */
+ * manually :-(
+ */
if (nlh->nlmsg_flags & NLM_F_ACK)
netlink_ack(cb->skb, nlh, ret);
return ret;
@@ -1276,13 +1302,21 @@ dump_last:
pr_debug("dump type, flag: %u %u index: %ld\n",
dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
- index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
+ index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
+ write_lock_bh(&ip_set_ref_lock);
set = ip_set(inst, index);
- if (set == NULL) {
+ is_destroyed = inst->is_destroyed;
+ if (!set || is_destroyed) {
+ write_unlock_bh(&ip_set_ref_lock);
if (dump_type == DUMP_ONE) {
ret = -ENOENT;
goto out;
}
+ if (is_destroyed) {
+ /* All sets are just being destroyed */
+ ret = 0;
+ goto out;
+ }
continue;
}
/* When dumping all sets, we must dump "sorted"
@@ -1290,14 +1324,17 @@ dump_last:
*/
if (dump_type != DUMP_ONE &&
((dump_type == DUMP_ALL) ==
- !!(set->type->features & IPSET_DUMP_LAST)))
+ !!(set->type->features & IPSET_DUMP_LAST))) {
+ write_unlock_bh(&ip_set_ref_lock);
continue;
+ }
pr_debug("List set: %s\n", set->name);
if (!cb->args[IPSET_CB_ARG0]) {
/* Start listing: make sure set won't be destroyed */
pr_debug("reference set\n");
- __ip_set_get(set);
+ set->ref++;
}
+ write_unlock_bh(&ip_set_ref_lock);
nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, flags,
IPSET_CMD_LIST);
@@ -1325,11 +1362,13 @@ dump_last:
goto release_refcount;
if (dump_flags & IPSET_FLAG_LIST_HEADER)
goto next_set;
+ if (set->variant->uref)
+ set->variant->uref(set, cb, true);
/* Fall through and add elements */
default:
- read_lock_bh(&set->lock);
+ rcu_read_lock_bh();
ret = set->variant->list(set, skb, cb);
- read_unlock_bh(&set->lock);
+ rcu_read_unlock_bh();
if (!cb->args[IPSET_CB_ARG0])
/* Set is done, proceed with next one */
goto next_set;
@@ -1341,6 +1380,8 @@ dump_last:
dump_type = DUMP_LAST;
cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16);
cb->args[IPSET_CB_INDEX] = 0;
+ if (set && set->variant->uref)
+ set->variant->uref(set, cb, false);
goto dump_last;
}
goto out;
@@ -1355,7 +1396,10 @@ next_set:
release_refcount:
/* If there was an error or set is done, release set */
if (ret || !cb->args[IPSET_CB_ARG0]) {
- pr_debug("release set %s\n", ip_set(inst, index)->name);
+ set = ip_set(inst, index);
+ if (set->variant->uref)
+ set->variant->uref(set, cb, false);
+ pr_debug("release set %s\n", set->name);
__ip_set_put_byindex(inst, index);
cb->args[IPSET_CB_ARG0] = 0;
}
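
The dump path above keeps its position and the per-set reference in cb->args[] between invocations, so a large set can be listed across several netlink messages and the reference is only dropped when the set is done or an error occurs. A toy, self-contained sketch of that resumable-callback shape (the fixed message size, string elements and slot names are stand-ins, not the netlink API):

#include <stdio.h>
#include <string.h>

enum { CB_INDEX, CB_ARG0, CB_MAX };

struct toy_cb { unsigned long args[CB_MAX]; };

static const char *elements[] = { "1.2.3.4", "5.6.7.8", "10.0.0.0/8", "192.168.1.1" };
#define NELEM (sizeof(elements) / sizeof(elements[0]))
#define MSG_ROOM 24                      /* pretend one skb holds ~24 bytes */

static int toy_dump(struct toy_cb *cb, char *msg, size_t room)
{
        size_t used = 0;
        unsigned long i;

        for (i = cb->args[CB_ARG0]; i < NELEM; i++) {
                size_t len = strlen(elements[i]) + 1;

                if (used + len > room) {
                        cb->args[CB_ARG0] = i;   /* resume here next time */
                        return 1;                /* more to come */
                }
                memcpy(msg + used, elements[i], len);
                used += len;
        }
        cb->args[CB_ARG0] = 0;                   /* set is done */
        return 0;
}

int main(void)
{
        struct toy_cb cb = { {0} };
        char msg[MSG_ROOM];
        int more, batch = 0;

        do {
                more = toy_dump(&cb, msg, sizeof(msg));
                printf("batch %d sent (resume index %lu)\n", ++batch, cb.args[CB_ARG0]);
        } while (more);
        return 0;
}
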
@@ -1407,9 +1451,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
do {
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
retried = true;
} while (ret == -EAGAIN &&
set->variant->resize &&
@@ -1425,12 +1469,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
size_t payload = min(SIZE_MAX,
sizeof(*errmsg) + nlmsg_len(nlh));
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
- struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1];
struct nlattr *cmdattr;
u32 *errline;
skb2 = nlmsg_new(payload, GFP_KERNEL);
- if (skb2 == NULL)
+ if (!skb2)
return -ENOMEM;
rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
@@ -1447,7 +1491,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
*errline = lineno;
- netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
/* Signal netlink not to send its ACK/errmsg. */
return -EINTR;
}
@@ -1462,25 +1507,25 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
{
struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
struct ip_set *set;
- struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
const struct nlattr *nla;
u32 flags = flag_exist(nlh);
bool use_lineno;
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
+ !attr[IPSET_ATTR_SETNAME] ||
!((attr[IPSET_ATTR_DATA] != NULL) ^
(attr[IPSET_ATTR_ADT] != NULL)) ||
- (attr[IPSET_ATTR_DATA] != NULL &&
+ (attr[IPSET_ATTR_DATA] &&
!flag_nested(attr[IPSET_ATTR_DATA])) ||
- (attr[IPSET_ATTR_ADT] != NULL &&
+ (attr[IPSET_ATTR_ADT] &&
(!flag_nested(attr[IPSET_ATTR_ADT]) ||
- attr[IPSET_ATTR_LINENO] == NULL))))
+ !attr[IPSET_ATTR_LINENO]))))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
use_lineno = !!attr[IPSET_ATTR_LINENO];
@@ -1517,25 +1562,25 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
{
struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
struct ip_set *set;
- struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
const struct nlattr *nla;
u32 flags = flag_exist(nlh);
bool use_lineno;
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
+ !attr[IPSET_ATTR_SETNAME] ||
!((attr[IPSET_ATTR_DATA] != NULL) ^
(attr[IPSET_ATTR_ADT] != NULL)) ||
- (attr[IPSET_ATTR_DATA] != NULL &&
+ (attr[IPSET_ATTR_DATA] &&
!flag_nested(attr[IPSET_ATTR_DATA])) ||
- (attr[IPSET_ATTR_ADT] != NULL &&
+ (attr[IPSET_ATTR_ADT] &&
(!flag_nested(attr[IPSET_ATTR_ADT]) ||
- attr[IPSET_ATTR_LINENO] == NULL))))
+ !attr[IPSET_ATTR_LINENO]))))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
use_lineno = !!attr[IPSET_ATTR_LINENO];
@@ -1572,26 +1617,26 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
{
struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
struct ip_set *set;
- struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
- attr[IPSET_ATTR_DATA] == NULL ||
+ !attr[IPSET_ATTR_SETNAME] ||
+ !attr[IPSET_ATTR_DATA] ||
!flag_nested(attr[IPSET_ATTR_DATA])))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
set->type->adt_policy))
return -IPSET_ERR_PROTOCOL;
- read_lock_bh(&set->lock);
+ rcu_read_lock_bh();
ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
- read_unlock_bh(&set->lock);
+ rcu_read_unlock_bh();
/* Userspace can't trigger element to be re-added */
if (ret == -EAGAIN)
ret = 1;
@@ -1613,15 +1658,15 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL))
+ !attr[IPSET_ATTR_SETNAME]))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL)
+ if (!skb2)
return -ENOMEM;
nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
@@ -1670,8 +1715,8 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb,
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_TYPENAME] == NULL ||
- attr[IPSET_ATTR_FAMILY] == NULL))
+ !attr[IPSET_ATTR_TYPENAME] ||
+ !attr[IPSET_ATTR_FAMILY]))
return -IPSET_ERR_PROTOCOL;
family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
@@ -1681,7 +1726,7 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb,
return ret;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL)
+ if (!skb2)
return -ENOMEM;
nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
@@ -1726,11 +1771,11 @@ ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
struct nlmsghdr *nlh2;
int ret = 0;
- if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
+ if (unlikely(!attr[IPSET_ATTR_PROTOCOL]))
return -IPSET_ERR_PROTOCOL;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL)
+ if (!skb2)
return -ENOMEM;
nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
@@ -1858,7 +1903,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
ret = -EFAULT;
goto done;
}
- op = (unsigned int *) data;
+ op = (unsigned int *)data;
if (*op < IP_SET_OP_VERSION) {
/* Check the version at the beginning of operations */
@@ -1970,10 +2015,11 @@ ip_set_net_init(struct net *net)
if (inst->ip_set_max >= IPSET_INVALID_ID)
inst->ip_set_max = IPSET_INVALID_ID - 1;
- list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL);
+ list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL);
if (!list)
return -ENOMEM;
- inst->is_deleted = 0;
+ inst->is_deleted = false;
+ inst->is_destroyed = false;
rcu_assign_pointer(inst->ip_set_list, list);
return 0;
}
@@ -1986,12 +2032,14 @@ ip_set_net_exit(struct net *net)
struct ip_set *set = NULL;
ip_set_id_t i;
- inst->is_deleted = 1; /* flag for ip_set_nfnl_put */
+ inst->is_deleted = true; /* flag for ip_set_nfnl_put */
for (i = 0; i < inst->ip_set_max; i++) {
set = ip_set(inst, i);
- if (set != NULL)
- ip_set_destroy_set(inst, i);
+ if (set) {
+ ip_set(inst, i) = NULL;
+ ip_set_destroy_set(set);
+ }
}
kfree(rcu_dereference_protected(inst->ip_set_list, 1));
}
@@ -2003,11 +2051,11 @@ static struct pernet_operations ip_set_net_ops = {
.size = sizeof(struct ip_set_net)
};
-
static int __init
ip_set_init(void)
{
int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+
if (ret != 0) {
pr_err("ip_set: cannot register with nfnetlink.\n");
return ret;
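
Both allocation sites for the ip_set_list pointer array move from kzalloc(n * size) to kcalloc(n, size). The point of the change is the multiplication-overflow check that kcalloc performs before allocating; a small userspace sketch of the same guard (checked_alloc is a hypothetical helper, calloc plays the role of kcalloc):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static void *checked_alloc(size_t n, size_t size)
{
        if (size && n > SIZE_MAX / size)   /* the check kcalloc/calloc does */
                return NULL;
        return calloc(n, size);
}

int main(void)
{
        void *ok = checked_alloc(1024, sizeof(void *));
        void *bad = checked_alloc(SIZE_MAX / 2, 4);   /* would overflow */

        printf("normal request: %s\n", ok ? "allocated" : "rejected");
        printf("overflowing request: %s\n", bad ? "allocated" : "rejected");
        free(ok);
        free(bad);
        return 0;
}

With a plain kzalloc(n * size), an attacker-influenced n could wrap the product and yield a short buffer that later writes overrun.
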
diff --git a/kernel/net/netfilter/ipset/ip_set_getport.c b/kernel/net/netfilter/ipset/ip_set_getport.c
index 29fb01ddf..42c3e3ba1 100644
--- a/kernel/net/netfilter/ipset/ip_set_getport.c
+++ b/kernel/net/netfilter/ipset/ip_set_getport.c
@@ -30,7 +30,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const struct tcphdr *th;
th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph);
- if (th == NULL)
+ if (!th)
/* No choice either */
return false;
@@ -42,7 +42,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const sctp_sctphdr_t *sh;
sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh);
- if (sh == NULL)
+ if (!sh)
/* No choice either */
return false;
@@ -55,7 +55,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const struct udphdr *uh;
uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph);
- if (uh == NULL)
+ if (!uh)
/* No choice either */
return false;
@@ -67,7 +67,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const struct icmphdr *ic;
ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
- if (ic == NULL)
+ if (!ic)
return false;
*port = (__force __be16)htons((ic->type << 8) | ic->code);
@@ -78,7 +78,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const struct icmp6hdr *ic;
ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
- if (ic == NULL)
+ if (!ic)
return false;
*port = (__force __be16)
@@ -98,7 +98,7 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
__be16 *port, u8 *proto)
{
const struct iphdr *iph = ip_hdr(skb);
- unsigned int protooff = ip_hdrlen(skb);
+ unsigned int protooff = skb_network_offset(skb) + ip_hdrlen(skb);
int protocol = iph->protocol;
/* See comments at tcp_match in ip_tables.c */
@@ -116,7 +116,8 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
return false;
default:
/* Other protocols doesn't have ports,
- so we can match fragments */
+ * so we can match fragments.
+ */
*proto = protocol;
return true;
}
@@ -135,7 +136,9 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
__be16 frag_off = 0;
nexthdr = ipv6_hdr(skb)->nexthdr;
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ protoff = ipv6_skip_exthdr(skb,
+ skb_network_offset(skb) +
+ sizeof(struct ipv6hdr), &nexthdr,
&frag_off);
if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
return false;
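
The last two hunks fix the transport-header offset for IPv4 and IPv6: the transport header starts at skb_network_offset() plus the IP header length, not at the IP header length alone, because the linear packet data may still have the link-layer header in front of the IP header. A self-contained illustration of the arithmetic on a hand-built toy packet (the layout, constants and port value are assumptions for the demo):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

int main(void)
{
        unsigned char pkt[64] = {0};
        const unsigned int network_offset = 14;    /* Ethernet header */
        const unsigned int ihl_bytes = 20;         /* IPv4 header, no options */
        uint16_t dport = htons(80);
        uint16_t parsed;

        pkt[network_offset] = 0x45;                /* version 4, ihl 5 */
        /* TCP destination port sits 2 bytes into the TCP header */
        memcpy(pkt + network_offset + ihl_bytes + 2, &dport, sizeof(dport));

        /* the fix: offset is network offset + IP header length */
        unsigned int protooff = network_offset + ihl_bytes;
        memcpy(&parsed, pkt + protooff + 2, sizeof(parsed));
        printf("destination port: %u\n", ntohs(parsed));

        /* the pre-fix offset would have read from the middle of the
         * IP header instead of the TCP header */
        memcpy(&parsed, pkt + ihl_bytes + 2, sizeof(parsed));
        printf("wrong offset reads: %u\n", ntohs(parsed));
        return 0;
}
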
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_gen.h b/kernel/net/netfilter/ipset/ip_set_hash_gen.h
index 974ff386d..e5336ab36 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/kernel/net/netfilter/ipset/ip_set_hash_gen.h
@@ -10,19 +10,19 @@
#include <linux/rcupdate.h>
#include <linux/jhash.h>
+#include <linux/types.h>
#include <linux/netfilter/ipset/ip_set_timeout.h>
-#ifndef rcu_dereference_bh
-#define rcu_dereference_bh(p) rcu_dereference(p)
-#endif
+
+#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c)
+#define ipset_dereference_protected(p, set) \
+ __ipset_dereference_protected(p, spin_is_locked(&(set)->lock))
#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
/* Hashing which uses arrays to resolve clashing. The hash table is resized
* (doubled) when searching becomes too long.
* Internally jhash is used with the assumption that the size of the
- * stored data is a multiple of sizeof(u32). If storage supports timeout,
- * the timeout field must be the last one in the data structure - that field
- * is ignored when computing the hash key.
+ * stored data is a multiple of sizeof(u32).
*
* Readers and resizing
*
@@ -35,7 +35,9 @@
/* Number of elements to store in an initial array block */
#define AHASH_INIT_SIZE 4
/* Max number of elements to store in an array block */
-#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE)
+#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE)
+/* Max number of elements in the array block when tuned */
+#define AHASH_MAX_TUNED 64
/* Max number of elements can be tuned */
#ifdef IP_SET_HASH_WITH_MULTI
@@ -53,8 +55,9 @@ tune_ahash_max(u8 curr, u32 multi)
/* Currently, at listing one hash bucket must fit into a message.
* Therefore we have a hard limit here.
*/
- return n > curr && n <= 64 ? n : curr;
+ return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
}
+
#define TUNE_AHASH_MAX(h, multi) \
((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
#else
@@ -64,18 +67,24 @@ tune_ahash_max(u8 curr, u32 multi)
/* A hash bucket */
struct hbucket {
- void *value; /* the array of the values */
+ struct rcu_head rcu; /* for call_rcu_bh */
+ /* Which positions are used in the array */
+ DECLARE_BITMAP(used, AHASH_MAX_TUNED);
u8 size; /* size of the array */
u8 pos; /* position of the first free entry */
+ unsigned char value[0] /* the array of the values */
+ __aligned(__alignof__(u64));
};
/* The hash table: the table size stored here in order to make resizing easy */
struct htable {
+ atomic_t ref; /* References for resizing */
+ atomic_t uref; /* References for dumping */
u8 htable_bits; /* size of hash table == 2^htable_bits */
- struct hbucket bucket[0]; /* hashtable buckets */
+ struct hbucket __rcu *bucket[0]; /* hashtable buckets */
};
-#define hbucket(h, i) (&((h)->bucket[i]))
+#define hbucket(h, i) ((h)->bucket[i])
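
The reworked struct hbucket drops the separately allocated value pointer in favour of a flexible value[] array plus a "used" bitmap, so a delete only clears a bit and an RCU reader never sees an element move underneath it. A userspace sketch of that layout and its bit helpers (the element type, sizes and helper names are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define AHASH_MAX_TUNED 64
#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define BITMAP_LONGS ((AHASH_MAX_TUNED + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct hbucket {
        unsigned long used[BITMAP_LONGS];  /* which slots hold live data */
        uint8_t size;                      /* allocated slots */
        uint8_t pos;                       /* first never-used slot */
        unsigned char value[];             /* slots 0..size-1, dsize bytes each */
};

static int test_bit(int i, const unsigned long *map)
{ return (map[i / BITS_PER_LONG] >> (i % BITS_PER_LONG)) & 1; }
static void set_bit(int i, unsigned long *map)
{ map[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG); }
static void clear_bit(int i, unsigned long *map)
{ map[i / BITS_PER_LONG] &= ~(1UL << (i % BITS_PER_LONG)); }

static void *ahash_data(struct hbucket *n, int i, size_t dsize)
{ return n->value + i * dsize; }

int main(void)
{
        size_t dsize = sizeof(int);
        struct hbucket *n = calloc(1, sizeof(*n) + 4 * dsize);

        n->size = 4;
        for (int v = 0; v < 3; v++) {          /* add three elements */
                memcpy(ahash_data(n, n->pos, dsize), &v, dsize);
                set_bit(n->pos++, n->used);
        }
        clear_bit(1, n->used);                 /* "delete" the middle one */
        for (int i = 0; i < n->pos; i++)
                if (test_bit(i, n->used))
                        printf("slot %d: %d\n", i, *(int *)ahash_data(n, i, dsize));
        free(n);
        return 0;
}

Holes left by cleared bits are reused by later adds or compacted away, as the following hunks show.
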
#ifndef IPSET_NET_COUNT
#define IPSET_NET_COUNT 1
@@ -83,8 +92,8 @@ struct htable {
/* Book-keeping of the prefixes added to the set */
struct net_prefixes {
- u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */
- u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */
+ u32 nets[IPSET_NET_COUNT]; /* number of elements for this cidr */
+ u8 cidr[IPSET_NET_COUNT]; /* the cidr value */
};
/* Compute the hash table size */
@@ -97,11 +106,11 @@ htable_size(u8 hbits)
if (hbits > 31)
return 0;
hsize = jhash_size(hbits);
- if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket)
+ if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *)
< hsize)
return 0;
- return hsize * sizeof(struct hbucket) + sizeof(struct htable);
+ return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}
/* Compute htable_bits from the user input parameter hashsize */
@@ -110,6 +119,7 @@ htable_bits(u32 hashsize)
{
/* Assume that hashsize == 2^htable_bits */
u8 bits = fls(hashsize - 1);
+
if (jhash_size(bits) != hashsize)
/* Round up to the first 2^n value */
bits = fls(hashsize);
@@ -117,30 +127,6 @@ htable_bits(u32 hashsize)
return bits;
}
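
A standalone sketch of the two sizing helpers just touched: htable_size() now charges one bucket *pointer* per slot (buckets are separately allocated and RCU-managed) and refuses byte counts that would overflow size_t, while htable_bits() rounds a user-supplied hashsize up to the next power of two. fls() is open-coded and the struct layouts are simplified, since this is a userspace illustration:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct hbucket;
struct htable { uint8_t htable_bits; struct hbucket *bucket[]; };

static size_t jhash_size(uint8_t bits) { return (size_t)1 << bits; }

static size_t htable_size(uint8_t hbits)
{
        size_t hsize;

        if (hbits > 31)
                return 0;
        hsize = jhash_size(hbits);
        /* reject a byte count that would not fit into size_t */
        if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize)
                return 0;
        return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}

static int fls32(uint32_t x)
{
        int r = 0;

        while (x) { x >>= 1; r++; }
        return r;
}

static uint8_t htable_bits(uint32_t hashsize)
{
        uint8_t bits = fls32(hashsize - 1);

        if (jhash_size(bits) != hashsize)
                bits = fls32(hashsize);    /* round up to the next 2^n */
        return bits;
}

int main(void)
{
        printf("hashsize 1000 -> %u bits, table %zu bytes\n",
               htable_bits(1000), htable_size(htable_bits(1000)));
        printf("hashsize 1024 -> %u bits\n", htable_bits(1024));
        return 0;
}
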
-static int
-hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
-{
- if (n->pos >= n->size) {
- void *tmp;
-
- if (n->size >= ahash_max)
- /* Trigger rehashing */
- return -EAGAIN;
-
- tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize,
- GFP_ATOMIC);
- if (!tmp)
- return -ENOMEM;
- if (n->size) {
- memcpy(tmp, n->value, n->size * dsize);
- kfree(n->value);
- }
- n->value = tmp;
- n->size += AHASH_INIT_SIZE;
- }
- return 0;
-}
-
#ifdef IP_SET_HASH_WITH_NETS
#if IPSET_NET_COUNT > 1
#define __CIDR(cidr, i) (cidr[i])
@@ -149,23 +135,31 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#endif
/* cidr + 1 is stored in net_prefixes to support /0 */
-#define SCIDR(cidr, i) (__CIDR(cidr, i) + 1)
+#define NCIDR_PUT(cidr) ((cidr) + 1)
+#define NCIDR_GET(cidr) ((cidr) - 1)
#ifdef IP_SET_HASH_WITH_NETS_PACKED
/* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */
-#define GCIDR(cidr, i) (__CIDR(cidr, i) + 1)
-#define NCIDR(cidr) (cidr)
+#define DCIDR_PUT(cidr) ((cidr) - 1)
+#define DCIDR_GET(cidr, i) (__CIDR(cidr, i) + 1)
#else
-#define GCIDR(cidr, i) (__CIDR(cidr, i))
-#define NCIDR(cidr) (cidr - 1)
+#define DCIDR_PUT(cidr) (cidr)
+#define DCIDR_GET(cidr, i) __CIDR(cidr, i)
#endif
+#define INIT_CIDR(cidr, host_mask) \
+ DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask))
+
#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
#ifdef IP_SET_HASH_WITH_NET0
+/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */
#define NLEN(family) (SET_HOST_MASK(family) + 1)
+#define CIDR_POS(c) ((c) - 1)
#else
+/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */
#define NLEN(family) SET_HOST_MASK(family)
+#define CIDR_POS(c) ((c) - 2)
#endif
#else
@@ -180,6 +174,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#undef mtype_data_equal
#undef mtype_do_data_match
#undef mtype_data_set_flags
+#undef mtype_data_reset_elem
#undef mtype_data_reset_flags
#undef mtype_data_netmask
#undef mtype_data_list
@@ -193,7 +188,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#undef mtype_ahash_memsize
#undef mtype_flush
#undef mtype_destroy
-#undef mtype_gc_init
#undef mtype_same_set
#undef mtype_kadt
#undef mtype_uadt
@@ -203,6 +197,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#undef mtype_del
#undef mtype_test_cidrs
#undef mtype_test
+#undef mtype_uref
#undef mtype_expire
#undef mtype_resize
#undef mtype_head
@@ -227,6 +222,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#define mtype_data_list IPSET_TOKEN(MTYPE, _data_list)
#define mtype_data_next IPSET_TOKEN(MTYPE, _data_next)
#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
+
#define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy)
#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup)
#define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr)
@@ -234,7 +230,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize)
#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
-#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
@@ -244,23 +239,36 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
#define mtype_test IPSET_TOKEN(MTYPE, _test)
+#define mtype_uref IPSET_TOKEN(MTYPE, _uref)
#define mtype_expire IPSET_TOKEN(MTYPE, _expire)
#define mtype_resize IPSET_TOKEN(MTYPE, _resize)
#define mtype_head IPSET_TOKEN(MTYPE, _head)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
+#ifndef MTYPE
+#error "MTYPE is not defined!"
+#endif
+
+#ifndef HOST_MASK
+#error "HOST_MASK is not defined!"
+#endif
+
#ifndef HKEY_DATALEN
#define HKEY_DATALEN sizeof(struct mtype_elem)
#endif
#define HKEY(data, initval, htable_bits) \
-(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \
+(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval) \
& jhash_mask(htable_bits))
#ifndef htype
+#ifndef HTYPE
+#error "HTYPE is not defined!"
+#endif /* HTYPE */
#define htype HTYPE
/* The generic hash structure */
@@ -280,18 +288,16 @@ struct htype {
#ifdef IP_SET_HASH_WITH_NETMASK
u8 netmask; /* netmask value for subnets to store */
#endif
-#ifdef IP_SET_HASH_WITH_RBTREE
- struct rb_root rbtree;
-#endif
#ifdef IP_SET_HASH_WITH_NETS
struct net_prefixes nets[0]; /* book-keeping of prefixes */
#endif
};
-#endif
+#endif /* htype */
#ifdef IP_SET_HASH_WITH_NETS
/* Network cidr size book keeping when the hash stores different
- * sized networks */
+ * sized networks. cidr == real cidr + 1 to support /0.
+ */
static void
mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
{
@@ -299,12 +305,12 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
/* Add in increasing prefix order, so larger cidr first */
for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) {
- if (j != -1)
+ if (j != -1) {
continue;
- else if (h->nets[i].cidr[n] < cidr)
+ } else if (h->nets[i].cidr[n] < cidr) {
j = i;
- else if (h->nets[i].cidr[n] == cidr) {
- h->nets[cidr - 1].nets[n]++;
+ } else if (h->nets[i].cidr[n] == cidr) {
+ h->nets[CIDR_POS(cidr)].nets[n]++;
return;
}
}
@@ -313,7 +319,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
h->nets[i].cidr[n] = h->nets[i - 1].cidr[n];
}
h->nets[i].cidr[n] = cidr;
- h->nets[cidr - 1].nets[n] = 1;
+ h->nets[CIDR_POS(cidr)].nets[n] = 1;
}
static void
@@ -322,15 +328,15 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
u8 i, j, net_end = nets_length - 1;
for (i = 0; i < nets_length; i++) {
- if (h->nets[i].cidr[n] != cidr)
- continue;
- h->nets[cidr -1].nets[n]--;
- if (h->nets[cidr -1].nets[n] > 0)
- return;
+ if (h->nets[i].cidr[n] != cidr)
+ continue;
+ h->nets[CIDR_POS(cidr)].nets[n]--;
+ if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
+ return;
for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
- h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
+ h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
h->nets[j].cidr[n] = 0;
- return;
+ return;
}
}
#endif
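
mtype_add_cidr()/mtype_del_cidr() above keep a per-prefix element count, ordered from largest cidr to smallest, so that the test-by-cidrs path knows which network sizes are present and tries them in order. A one-dimensional userspace sketch of the same book-keeping (cidr is stored as real cidr + 1, as in the IP_SET_HASH_WITH_NET0 case for IPv4; names and the fixed array are illustrative):

#include <stdio.h>
#include <stdint.h>

#define NLEN 33   /* /0 .. /32, stored as 1..33 */

struct net_prefixes { uint32_t nets; uint8_t cidr; };
static struct net_prefixes nets[NLEN];

static void add_cidr(uint8_t cidr)
{
        int i, j = -1;

        /* keep largest prefixes first; bump the count if already present */
        for (i = 0; i < NLEN && nets[i].cidr; i++) {
                if (j != -1)
                        continue;
                else if (nets[i].cidr < cidr)
                        j = i;                  /* insertion point */
                else if (nets[i].cidr == cidr) {
                        nets[cidr - 1].nets++;  /* CIDR_POS(cidr) */
                        return;
                }
        }
        if (j != -1)
                for (; i > j; i--)              /* shift smaller prefixes down */
                        nets[i].cidr = nets[i - 1].cidr;
        nets[i].cidr = cidr;
        nets[cidr - 1].nets = 1;
}

int main(void)
{
        add_cidr(25); add_cidr(33); add_cidr(25);   /* /24, /32, /24 again */
        for (int i = 0; i < NLEN && nets[i].cidr; i++)
                printf("cidr /%u, count %u\n", nets[i].cidr - 1,
                       nets[nets[i].cidr - 1].nets);
        return 0;
}
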
@@ -341,15 +347,18 @@ mtype_ahash_memsize(const struct htype *h, const struct htable *t,
u8 nets_length, size_t dsize)
{
u32 i;
- size_t memsize = sizeof(*h)
- + sizeof(*t)
+ struct hbucket *n;
+ size_t memsize = sizeof(*h) + sizeof(*t);
+
#ifdef IP_SET_HASH_WITH_NETS
- + sizeof(struct net_prefixes) * nets_length
+ memsize += sizeof(struct net_prefixes) * nets_length;
#endif
- + jhash_size(t->htable_bits) * sizeof(struct hbucket);
-
- for (i = 0; i < jhash_size(t->htable_bits); i++)
- memsize += t->bucket[i].size * dsize;
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = rcu_dereference_bh(hbucket(t, i));
+ if (!n)
+ continue;
+ memsize += sizeof(struct hbucket) + n->size * dsize;
+ }
return memsize;
}
@@ -364,7 +373,8 @@ mtype_ext_cleanup(struct ip_set *set, struct hbucket *n)
int i;
for (i = 0; i < n->pos; i++)
- ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
+ if (test_bit(i, n->used))
+ ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
}
/* Flush a hash type of set: destroy all elements */
@@ -376,16 +386,16 @@ mtype_flush(struct ip_set *set)
struct hbucket *n;
u32 i;
- t = rcu_dereference_bh_nfnl(h->table);
+ t = ipset_dereference_protected(h->table, set);
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = hbucket(t, i);
- if (n->size) {
- if (set->extensions & IPSET_EXT_DESTROY)
- mtype_ext_cleanup(set, n);
- n->size = n->pos = 0;
- /* FIXME: use slab cache */
- kfree(n->value);
- }
+ n = __ipset_dereference_protected(hbucket(t, i), 1);
+ if (!n)
+ continue;
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set, n);
+ /* FIXME: use slab cache */
+ rcu_assign_pointer(hbucket(t, i), NULL);
+ kfree_rcu(n, rcu);
}
#ifdef IP_SET_HASH_WITH_NETS
memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));
@@ -401,13 +411,13 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
u32 i;
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = hbucket(t, i);
- if (n->size) {
- if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
- mtype_ext_cleanup(set, n);
- /* FIXME: use slab cache */
- kfree(n->value);
- }
+ n = __ipset_dereference_protected(hbucket(t, i), 1);
+ if (!n)
+ continue;
+ if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
+ mtype_ext_cleanup(set, n);
+ /* FIXME: use slab cache */
+ kfree(n);
}
ip_set_free(t);
@@ -419,13 +429,11 @@ mtype_destroy(struct ip_set *set)
{
struct htype *h = set->data;
- if (set->extensions & IPSET_EXT_TIMEOUT)
+ if (SET_WITH_TIMEOUT(set))
del_timer_sync(&h->gc);
- mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true);
-#ifdef IP_SET_HASH_WITH_RBTREE
- rbtree_destroy(&h->rbtree);
-#endif
+ mtype_ahash_destroy(set,
+ __ipset_dereference_protected(h->table, 1), true);
kfree(h);
set->data = NULL;
@@ -437,7 +445,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
struct htype *h = set->data;
init_timer(&h->gc);
- h->gc.data = (unsigned long) set;
+ h->gc.data = (unsigned long)set;
h->gc.function = gc;
h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&h->gc);
@@ -468,63 +476,78 @@ static void
mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
{
struct htable *t;
- struct hbucket *n;
+ struct hbucket *n, *tmp;
struct mtype_elem *data;
- u32 i;
- int j;
+ u32 i, j, d;
#ifdef IP_SET_HASH_WITH_NETS
u8 k;
#endif
- rcu_read_lock_bh();
- t = rcu_dereference_bh(h->table);
+ t = ipset_dereference_protected(h->table, set);
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = hbucket(t, i);
- for (j = 0; j < n->pos; j++) {
+ n = __ipset_dereference_protected(hbucket(t, i), 1);
+ if (!n)
+ continue;
+ for (j = 0, d = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used)) {
+ d++;
+ continue;
+ }
data = ahash_data(n, j, dsize);
if (ip_set_timeout_expired(ext_timeout(data, set))) {
pr_debug("expired %u/%u\n", i, j);
+ clear_bit(j, n->used);
+ smp_mb__after_atomic();
#ifdef IP_SET_HASH_WITH_NETS
for (k = 0; k < IPSET_NET_COUNT; k++)
- mtype_del_cidr(h, SCIDR(data->cidr, k),
- nets_length, k);
+ mtype_del_cidr(h,
+ NCIDR_PUT(DCIDR_GET(data->cidr,
+ k)),
+ nets_length, k);
#endif
ip_set_ext_destroy(set, data);
- if (j != n->pos - 1)
- /* Not last one */
- memcpy(data,
- ahash_data(n, n->pos - 1, dsize),
- dsize);
- n->pos--;
h->elements--;
+ d++;
}
}
- if (n->pos + AHASH_INIT_SIZE < n->size) {
- void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
- * dsize,
- GFP_ATOMIC);
+ if (d >= AHASH_INIT_SIZE) {
+ if (d >= n->size) {
+ rcu_assign_pointer(hbucket(t, i), NULL);
+ kfree_rcu(n, rcu);
+ continue;
+ }
+ tmp = kzalloc(sizeof(*tmp) +
+ (n->size - AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
if (!tmp)
/* Still try to delete expired elements */
continue;
- n->size -= AHASH_INIT_SIZE;
- memcpy(tmp, n->value, n->size * dsize);
- kfree(n->value);
- n->value = tmp;
+ tmp->size = n->size - AHASH_INIT_SIZE;
+ for (j = 0, d = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, dsize);
+ memcpy(tmp->value + d * dsize, data, dsize);
+ set_bit(d, tmp->used);
+ d++;
+ }
+ tmp->pos = d;
+ rcu_assign_pointer(hbucket(t, i), tmp);
+ kfree_rcu(n, rcu);
}
}
- rcu_read_unlock_bh();
}
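
The expiry pass above no longer compacts elements in place; once a bucket has accumulated at least AHASH_INIT_SIZE holes it builds a smaller copy containing only the live slots and publishes it via rcu_assign_pointer()/kfree_rcu(). A plain-C sketch of that shrink step (single-threaded, with the RCU publication reduced to a pointer swap; the element type and byte-wide bitmap are stand-ins):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define AHASH_INIT_SIZE 4

struct bucket {
        uint8_t size, pos;
        uint8_t used[16];     /* 1 = slot holds a live element */
        int value[];
};

static struct bucket *shrink(struct bucket *n)
{
        int holes = 0, d = 0;
        struct bucket *tmp;

        for (int i = 0; i < n->pos; i++)
                holes += !n->used[i];
        if (holes < AHASH_INIT_SIZE)
                return n;                          /* not worth shrinking */

        tmp = calloc(1, sizeof(*tmp) + (n->size - AHASH_INIT_SIZE) * sizeof(int));
        if (!tmp)
                return n;                          /* keep the old bucket */
        tmp->size = n->size - AHASH_INIT_SIZE;
        for (int i = 0; i < n->pos; i++) {
                if (!n->used[i])
                        continue;
                tmp->value[d] = n->value[i];       /* compact live entries */
                tmp->used[d] = 1;
                d++;
        }
        tmp->pos = d;
        free(n);                                   /* kfree_rcu() in-kernel */
        return tmp;
}

int main(void)
{
        struct bucket *n = calloc(1, sizeof(*n) + 8 * sizeof(int));

        n->size = 8;
        for (int i = 0; i < 8; i++) {
                n->value[i] = i;
                n->used[i] = (i % 2);              /* every other slot deleted */
                n->pos++;
        }
        n = shrink(n);
        printf("bucket now %u slots, %u live\n", n->size, n->pos);
        free(n);
        return 0;
}
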
static void
mtype_gc(unsigned long ul_set)
{
- struct ip_set *set = (struct ip_set *) ul_set;
+ struct ip_set *set = (struct ip_set *)ul_set;
struct htype *h = set->data;
pr_debug("called\n");
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
mtype_expire(set, h, NLEN(set->family), set->dsize);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&h->gc);
@@ -532,93 +555,152 @@ mtype_gc(unsigned long ul_set)
/* Resize a hash: create a new hash table with doubling the hashsize
* and inserting the elements to it. Repeat until we succeed or
- * fail due to memory pressures. */
+ * fail due to memory pressures.
+ */
static int
mtype_resize(struct ip_set *set, bool retried)
{
struct htype *h = set->data;
- struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table);
- u8 htable_bits = orig->htable_bits;
+ struct htable *t, *orig;
+ u8 htable_bits;
+ size_t dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
+ struct mtype_elem *tmp;
#endif
struct mtype_elem *data;
struct mtype_elem *d;
struct hbucket *n, *m;
- u32 i, j;
+ u32 i, j, key;
int ret;
- /* Try to cleanup once */
- if (SET_WITH_TIMEOUT(set) && !retried) {
- i = h->elements;
- write_lock_bh(&set->lock);
- mtype_expire(set, set->data, NLEN(set->family), set->dsize);
- write_unlock_bh(&set->lock);
- if (h->elements < i)
- return 0;
- }
+#ifdef IP_SET_HASH_WITH_NETS
+ tmp = kmalloc(dsize, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+#endif
+ rcu_read_lock_bh();
+ orig = rcu_dereference_bh_nfnl(h->table);
+ htable_bits = orig->htable_bits;
+ rcu_read_unlock_bh();
retry:
ret = 0;
htable_bits++;
- pr_debug("attempt to resize set %s from %u to %u, t %p\n",
- set->name, orig->htable_bits, htable_bits, orig);
if (!htable_bits) {
/* In case we have plenty of memory :-) */
pr_warn("Cannot increase the hashsize of set %s further\n",
set->name);
- return -IPSET_ERR_HASH_FULL;
+ ret = -IPSET_ERR_HASH_FULL;
+ goto out;
+ }
+ t = ip_set_alloc(htable_size(htable_bits));
+ if (!t) {
+ ret = -ENOMEM;
+ goto out;
}
- t = ip_set_alloc(sizeof(*t)
- + jhash_size(htable_bits) * sizeof(struct hbucket));
- if (!t)
- return -ENOMEM;
t->htable_bits = htable_bits;
- read_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
+ orig = __ipset_dereference_protected(h->table, 1);
+ /* There can't be another parallel resizing, but dumping is possible */
+ atomic_set(&orig->ref, 1);
+ atomic_inc(&orig->uref);
+ pr_debug("attempt to resize set %s from %u to %u, t %p\n",
+ set->name, orig->htable_bits, htable_bits, orig);
for (i = 0; i < jhash_size(orig->htable_bits); i++) {
- n = hbucket(orig, i);
+ n = __ipset_dereference_protected(hbucket(orig, i), 1);
+ if (!n)
+ continue;
for (j = 0; j < n->pos; j++) {
- data = ahash_data(n, j, set->dsize);
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, dsize);
#ifdef IP_SET_HASH_WITH_NETS
+ /* We have readers running parallel with us,
+ * so the live data cannot be modified.
+ */
flags = 0;
+ memcpy(tmp, data, dsize);
+ data = tmp;
mtype_data_reset_flags(data, &flags);
#endif
- m = hbucket(t, HKEY(data, h->initval, htable_bits));
- ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize);
- if (ret < 0) {
-#ifdef IP_SET_HASH_WITH_NETS
- mtype_data_reset_flags(data, &flags);
-#endif
- read_unlock_bh(&set->lock);
- mtype_ahash_destroy(set, t, false);
- if (ret == -EAGAIN)
- goto retry;
- return ret;
+ key = HKEY(data, h->initval, htable_bits);
+ m = __ipset_dereference_protected(hbucket(t, key), 1);
+ if (!m) {
+ m = kzalloc(sizeof(*m) +
+ AHASH_INIT_SIZE * dsize,
+ GFP_ATOMIC);
+ if (!m) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+ m->size = AHASH_INIT_SIZE;
+ RCU_INIT_POINTER(hbucket(t, key), m);
+ } else if (m->pos >= m->size) {
+ struct hbucket *ht;
+
+ if (m->size >= AHASH_MAX(h)) {
+ ret = -EAGAIN;
+ } else {
+ ht = kzalloc(sizeof(*ht) +
+ (m->size + AHASH_INIT_SIZE)
+ * dsize,
+ GFP_ATOMIC);
+ if (!ht)
+ ret = -ENOMEM;
+ }
+ if (ret < 0)
+ goto cleanup;
+ memcpy(ht, m, sizeof(struct hbucket) +
+ m->size * dsize);
+ ht->size = m->size + AHASH_INIT_SIZE;
+ kfree(m);
+ m = ht;
+ RCU_INIT_POINTER(hbucket(t, key), ht);
}
- d = ahash_data(m, m->pos++, set->dsize);
- memcpy(d, data, set->dsize);
+ d = ahash_data(m, m->pos, dsize);
+ memcpy(d, data, dsize);
+ set_bit(m->pos++, m->used);
#ifdef IP_SET_HASH_WITH_NETS
mtype_data_reset_flags(d, &flags);
#endif
}
}
-
rcu_assign_pointer(h->table, t);
- read_unlock_bh(&set->lock);
+
+ spin_unlock_bh(&set->lock);
/* Give time to other readers of the set */
synchronize_rcu_bh();
pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
orig->htable_bits, orig, t->htable_bits, t);
- mtype_ahash_destroy(set, orig, false);
+ /* If there's nobody else dumping the table, destroy it */
+ if (atomic_dec_and_test(&orig->uref)) {
+ pr_debug("Table destroy by resize %p\n", orig);
+ mtype_ahash_destroy(set, orig, false);
+ }
- return 0;
+out:
+#ifdef IP_SET_HASH_WITH_NETS
+ kfree(tmp);
+#endif
+ return ret;
+
+cleanup:
+ atomic_set(&orig->ref, 0);
+ atomic_dec(&orig->uref);
+ spin_unlock_bh(&set->lock);
+ mtype_ahash_destroy(set, t, false);
+ if (ret == -EAGAIN)
+ goto retry;
+ goto out;
}
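
mtype_resize() now builds the doubled table under the spinlock only, rehashes every live element into it, publishes it with rcu_assign_pointer() and lets the last dumper free the superseded table. The essential rehash-into-a-bigger-table loop, reduced to a standalone toy (single-slot open-addressed buckets and a Fibonacci hash stand in for the real jhash-keyed bucket arrays):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct toy_table { uint8_t bits; uint32_t *slot; uint8_t *used; };

static uint32_t toy_hash(uint32_t v, uint8_t bits)
{ return (v * 2654435761u) >> (32 - bits); }

static struct toy_table *toy_alloc(uint8_t bits)
{
        struct toy_table *t = malloc(sizeof(*t));
        size_t n = (size_t)1 << bits;

        t->bits = bits;
        t->slot = calloc(n, sizeof(*t->slot));
        t->used = calloc(n, 1);
        return t;
}

static int toy_insert(struct toy_table *t, uint32_t v)
{
        uint32_t key = toy_hash(v, t->bits);

        if (t->used[key])
                return -1;                 /* clash: caller must resize */
        t->slot[key] = v;
        t->used[key] = 1;
        return 0;
}

static struct toy_table *toy_resize(struct toy_table *old)
{
        struct toy_table *t = toy_alloc(old->bits + 1);

        /* reinsert every live element; a clash here would force yet
         * another doubling in the real code (the retry: loop) */
        for (size_t i = 0; i < ((size_t)1 << old->bits); i++)
                if (old->used[i] && toy_insert(t, old->slot[i]) < 0)
                        printf("still clashing after doubling\n");
        return t;
}

int main(void)
{
        struct toy_table *t = toy_alloc(2), *bigger;

        for (uint32_t v = 1; v <= 5; v++)
                if (toy_insert(t, v) < 0)
                        printf("value %u clashes in the 4-slot table\n", v);
        bigger = toy_resize(t);
        printf("resized from %u to %u hash bits\n", t->bits, bigger->bits);
        free(t->slot); free(t->used); free(t);
        free(bigger->slot); free(bigger->used); free(bigger);
        return 0;
}
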
/* Add an element to a hash and update the internal counters when succeeded,
- * otherwise report the proper error code. */
+ * otherwise report the proper error code.
+ */
static int
mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct ip_set_ext *mext, u32 flags)
@@ -627,17 +709,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct htable *t;
const struct mtype_elem *d = value;
struct mtype_elem *data;
- struct hbucket *n;
- int i, ret = 0;
- int j = AHASH_MAX(h) + 1;
+ struct hbucket *n, *old = ERR_PTR(-ENOENT);
+ int i, j = -1;
bool flag_exist = flags & IPSET_FLAG_EXIST;
+ bool deleted = false, forceadd = false, reuse = false;
u32 key, multi = 0;
- rcu_read_lock_bh();
- t = rcu_dereference_bh(h->table);
+ if (h->elements >= h->maxelem) {
+ if (SET_WITH_TIMEOUT(set))
+ /* FIXME: when set is full, we slow down here */
+ mtype_expire(set, h, NLEN(set->family), set->dsize);
+ if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+ forceadd = true;
+ }
+
+ t = ipset_dereference_protected(h->table, set);
key = HKEY(value, h->initval, t->htable_bits);
- n = hbucket(t, key);
+ n = __ipset_dereference_protected(hbucket(t, key), 1);
+ if (!n) {
+ if (forceadd) {
+ if (net_ratelimit())
+ pr_warn("Set %s is full, maxelem %u reached\n",
+ set->name, h->maxelem);
+ return -IPSET_ERR_HASH_FULL;
+ } else if (h->elements >= h->maxelem) {
+ goto set_full;
+ }
+ old = NULL;
+ n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
+ GFP_ATOMIC);
+ if (!n)
+ return -ENOMEM;
+ n->size = AHASH_INIT_SIZE;
+ goto copy_elem;
+ }
for (i = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used)) {
+ /* Reuse first deleted entry */
+ if (j == -1) {
+ deleted = reuse = true;
+ j = i;
+ }
+ continue;
+ }
data = ahash_data(n, i, set->dsize);
if (mtype_data_equal(data, d, &multi)) {
if (flag_exist ||
@@ -645,85 +759,94 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
ip_set_timeout_expired(ext_timeout(data, set)))) {
/* Just the extensions could be overwritten */
j = i;
- goto reuse_slot;
- } else {
- ret = -IPSET_ERR_EXIST;
- goto out;
+ goto overwrite_extensions;
}
+ return -IPSET_ERR_EXIST;
}
/* Reuse first timed out entry */
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(data, set)) &&
- j != AHASH_MAX(h) + 1)
+ j == -1) {
j = i;
+ reuse = true;
+ }
}
- if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set) && n->pos) {
- /* Choosing the first entry in the array to replace */
- j = 0;
- goto reuse_slot;
- }
- if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem)
- /* FIXME: when set is full, we slow down here */
- mtype_expire(set, h, NLEN(set->family), set->dsize);
-
- if (h->elements >= h->maxelem) {
- if (net_ratelimit())
- pr_warn("Set %s is full, maxelem %u reached\n",
- set->name, h->maxelem);
- ret = -IPSET_ERR_HASH_FULL;
- goto out;
- }
-
-reuse_slot:
- if (j != AHASH_MAX(h) + 1) {
- /* Fill out reused slot */
+ if (reuse || forceadd) {
data = ahash_data(n, j, set->dsize);
+ if (!deleted) {
#ifdef IP_SET_HASH_WITH_NETS
- for (i = 0; i < IPSET_NET_COUNT; i++) {
- mtype_del_cidr(h, SCIDR(data->cidr, i),
- NLEN(set->family), i);
- mtype_add_cidr(h, SCIDR(d->cidr, i),
- NLEN(set->family), i);
- }
+ for (i = 0; i < IPSET_NET_COUNT; i++)
+ mtype_del_cidr(h,
+ NCIDR_PUT(DCIDR_GET(data->cidr, i)),
+ NLEN(set->family), i);
#endif
- ip_set_ext_destroy(set, data);
- } else {
- /* Use/create a new slot */
+ ip_set_ext_destroy(set, data);
+ h->elements--;
+ }
+ goto copy_data;
+ }
+ if (h->elements >= h->maxelem)
+ goto set_full;
+ /* Create a new slot */
+ if (n->pos >= n->size) {
TUNE_AHASH_MAX(h, multi);
- ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize);
- if (ret != 0) {
- if (ret == -EAGAIN)
- mtype_data_next(&h->next, d);
- goto out;
+ if (n->size >= AHASH_MAX(h)) {
+ /* Trigger rehashing */
+ mtype_data_next(&h->next, d);
+ return -EAGAIN;
}
- data = ahash_data(n, n->pos++, set->dsize);
+ old = n;
+ n = kzalloc(sizeof(*n) +
+ (old->size + AHASH_INIT_SIZE) * set->dsize,
+ GFP_ATOMIC);
+ if (!n)
+ return -ENOMEM;
+ memcpy(n, old, sizeof(struct hbucket) +
+ old->size * set->dsize);
+ n->size = old->size + AHASH_INIT_SIZE;
+ }
+
+copy_elem:
+ j = n->pos++;
+ data = ahash_data(n, j, set->dsize);
+copy_data:
+ h->elements++;
#ifdef IP_SET_HASH_WITH_NETS
- for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_add_cidr(h, SCIDR(d->cidr, i), NLEN(set->family),
- i);
+ for (i = 0; i < IPSET_NET_COUNT; i++)
+ mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)),
+ NLEN(set->family), i);
#endif
- h->elements++;
- }
memcpy(data, d, sizeof(struct mtype_elem));
+overwrite_extensions:
#ifdef IP_SET_HASH_WITH_NETS
mtype_data_set_flags(data, flags);
#endif
- if (SET_WITH_TIMEOUT(set))
- ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
if (SET_WITH_COUNTER(set))
ip_set_init_counter(ext_counter(data, set), ext);
if (SET_WITH_COMMENT(set))
ip_set_init_comment(ext_comment(data, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(data, set), ext);
+ /* Must come last for the case when timed out entry is reused */
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
+ smp_mb__before_atomic();
+ set_bit(j, n->used);
+ if (old != ERR_PTR(-ENOENT)) {
+ rcu_assign_pointer(hbucket(t, key), n);
+ if (old)
+ kfree_rcu(old, rcu);
+ }
-out:
- rcu_read_unlock_bh();
- return ret;
+ return 0;
+set_full:
+ if (net_ratelimit())
+ pr_warn("Set %s is full, maxelem %u reached\n",
+ set->name, h->maxelem);
+ return -IPSET_ERR_HASH_FULL;
}
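
The add path above now scans the bucket once and reuses the first slot it can: either a deleted slot (cleared bit) or a timed-out live slot, whichever comes first, and only appends at n->pos when neither exists. A compact sketch of that slot-selection order (struct slot and the boolean fields are illustrative stand-ins for the bitmap and timeout extension):

#include <stdio.h>

#define NSLOT 4

struct slot { int used; int expired; unsigned int val; };

static int pick_slot(const struct slot *b, int pos)
{
        int j = -1;

        for (int i = 0; i < pos; i++) {
                if (!b[i].used) {              /* first deleted slot */
                        if (j == -1)
                                j = i;
                        continue;
                }
                if (b[i].expired && j == -1)   /* first timed-out live slot */
                        j = i;
        }
        return j;                               /* -1 means append at n->pos */
}

int main(void)
{
        struct slot b[NSLOT] = {
                { 1, 0, 10 }, { 0, 0, 0 }, { 1, 1, 30 }, { 1, 0, 40 },
        };

        printf("new element goes into slot %d\n", pick_slot(b, NSLOT));
        return 0;
}
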
-/* Delete an element from the hash: swap it with the last element
- * and free up space if possible.
+/* Delete an element from the hash and free up space if possible.
*/
static int
mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
@@ -734,55 +857,70 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
const struct mtype_elem *d = value;
struct mtype_elem *data;
struct hbucket *n;
- int i, ret = -IPSET_ERR_EXIST;
-#ifdef IP_SET_HASH_WITH_NETS
- u8 j;
-#endif
+ int i, j, k, ret = -IPSET_ERR_EXIST;
u32 key, multi = 0;
+ size_t dsize = set->dsize;
- rcu_read_lock_bh();
- t = rcu_dereference_bh(h->table);
+ t = ipset_dereference_protected(h->table, set);
key = HKEY(value, h->initval, t->htable_bits);
- n = hbucket(t, key);
- for (i = 0; i < n->pos; i++) {
- data = ahash_data(n, i, set->dsize);
+ n = __ipset_dereference_protected(hbucket(t, key), 1);
+ if (!n)
+ goto out;
+ for (i = 0, k = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used)) {
+ k++;
+ continue;
+ }
+ data = ahash_data(n, i, dsize);
if (!mtype_data_equal(data, d, &multi))
continue;
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(data, set)))
goto out;
- if (i != n->pos - 1)
- /* Not last one */
- memcpy(data, ahash_data(n, n->pos - 1, set->dsize),
- set->dsize);
- n->pos--;
+ ret = 0;
+ clear_bit(i, n->used);
+ smp_mb__after_atomic();
+ if (i + 1 == n->pos)
+ n->pos--;
h->elements--;
#ifdef IP_SET_HASH_WITH_NETS
for (j = 0; j < IPSET_NET_COUNT; j++)
- mtype_del_cidr(h, SCIDR(d->cidr, j), NLEN(set->family),
- j);
+ mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
+ NLEN(set->family), j);
#endif
ip_set_ext_destroy(set, data);
- if (n->pos + AHASH_INIT_SIZE < n->size) {
- void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
- * set->dsize,
- GFP_ATOMIC);
- if (!tmp) {
- ret = 0;
+
+ for (; i < n->pos; i++) {
+ if (!test_bit(i, n->used))
+ k++;
+ }
+ if (n->pos == 0 && k == 0) {
+ rcu_assign_pointer(hbucket(t, key), NULL);
+ kfree_rcu(n, rcu);
+ } else if (k >= AHASH_INIT_SIZE) {
+ struct hbucket *tmp = kzalloc(sizeof(*tmp) +
+ (n->size - AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
+ if (!tmp)
goto out;
+ tmp->size = n->size - AHASH_INIT_SIZE;
+ for (j = 0, k = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, dsize);
+ memcpy(tmp->value + k * dsize, data, dsize);
+ set_bit(j, tmp->used);
+ k++;
}
- n->size -= AHASH_INIT_SIZE;
- memcpy(tmp, n->value, n->size * set->dsize);
- kfree(n->value);
- n->value = tmp;
+ tmp->pos = k;
+ rcu_assign_pointer(hbucket(t, key), tmp);
+ kfree_rcu(n, rcu);
}
- ret = 0;
goto out;
}
out:
- rcu_read_unlock_bh();
return ret;
}
@@ -801,7 +939,8 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
#ifdef IP_SET_HASH_WITH_NETS
/* Special test function which takes into account the different network
- * sizes added to the set */
+ * sizes added to the set
+ */
static int
mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
const struct ip_set_ext *ext,
@@ -824,16 +963,21 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) {
#if IPSET_NET_COUNT == 2
mtype_data_reset_elem(d, &orig);
- mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]), false);
+ mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false);
for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi;
k++) {
- mtype_data_netmask(d, NCIDR(h->nets[k].cidr[1]), true);
+ mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]),
+ true);
#else
- mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]));
+ mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]));
#endif
key = HKEY(d, h->initval, t->htable_bits);
- n = hbucket(t, key);
+ n = rcu_dereference_bh(hbucket(t, key));
+ if (!n)
+ continue;
for (i = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used))
+ continue;
data = ahash_data(n, i, set->dsize);
if (!mtype_data_equal(data, d, &multi))
continue;
@@ -871,13 +1015,13 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
int i, ret = 0;
u32 key, multi = 0;
- rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
#ifdef IP_SET_HASH_WITH_NETS
/* If we test an IP address and not a network address,
- * try all possible network sizes */
+ * try all possible network sizes
+ */
for (i = 0; i < IPSET_NET_COUNT; i++)
- if (GCIDR(d->cidr, i) != SET_HOST_MASK(set->family))
+ if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family))
break;
if (i == IPSET_NET_COUNT) {
ret = mtype_test_cidrs(set, d, ext, mext, flags);
@@ -886,8 +1030,14 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
#endif
key = HKEY(d, h->initval, t->htable_bits);
- n = hbucket(t, key);
+ n = rcu_dereference_bh(hbucket(t, key));
+ if (!n) {
+ ret = 0;
+ goto out;
+ }
for (i = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used))
+ continue;
data = ahash_data(n, i, set->dsize);
if (mtype_data_equal(data, d, &multi) &&
!(SET_WITH_TIMEOUT(set) &&
@@ -897,7 +1047,6 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
}
out:
- rcu_read_unlock_bh();
return ret;
}
@@ -909,15 +1058,19 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
const struct htable *t;
struct nlattr *nested;
size_t memsize;
+ u8 htable_bits;
+ rcu_read_lock_bh();
t = rcu_dereference_bh_nfnl(h->table);
memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize);
+ htable_bits = t->htable_bits;
+ rcu_read_unlock_bh();
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE,
- htonl(jhash_size(t->htable_bits))) ||
+ htonl(jhash_size(htable_bits))) ||
nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
goto nla_put_failure;
#ifdef IP_SET_HASH_WITH_NETMASK
@@ -941,32 +1094,63 @@ nla_put_failure:
return -EMSGSIZE;
}
+/* Make possible to run dumping parallel with resizing */
+static void
+mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+
+ if (start) {
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh_nfnl(h->table);
+ atomic_inc(&t->uref);
+ cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
+ rcu_read_unlock_bh();
+ } else if (cb->args[IPSET_CB_PRIVATE]) {
+ t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ /* Resizing didn't destroy the hash table */
+ pr_debug("Table destroy by dump: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
+ cb->args[IPSET_CB_PRIVATE] = 0;
+ }
+}
+
/* Reply a LIST/SAVE request: dump the elements of the specified set */
static int
mtype_list(const struct ip_set *set,
struct sk_buff *skb, struct netlink_callback *cb)
{
- const struct htype *h = set->data;
- const struct htable *t = rcu_dereference_bh_nfnl(h->table);
+ const struct htable *t;
struct nlattr *atd, *nested;
const struct hbucket *n;
const struct mtype_elem *e;
u32 first = cb->args[IPSET_CB_ARG0];
/* We assume that one hash bucket fills into one page */
void *incomplete;
- int i;
+ int i, ret = 0;
atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
+
pr_debug("list hash set %s\n", set->name);
+ t = (const struct htable *)cb->args[IPSET_CB_PRIVATE];
+ /* Expire may replace a hbucket with another one */
+ rcu_read_lock();
for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
cb->args[IPSET_CB_ARG0]++) {
incomplete = skb_tail_pointer(skb);
- n = hbucket(t, cb->args[IPSET_CB_ARG0]);
+ n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0]));
pr_debug("cb->arg bucket: %lu, t %p n %p\n",
cb->args[IPSET_CB_ARG0], t, n);
+ if (!n)
+ continue;
for (i = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used))
+ continue;
e = ahash_data(n, i, set->dsize);
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
@@ -977,9 +1161,10 @@ mtype_list(const struct ip_set *set,
if (!nested) {
if (cb->args[IPSET_CB_ARG0] == first) {
nla_nest_cancel(skb, atd);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
+ ret = -EMSGSIZE;
+ goto out;
+ }
+ goto nla_put_failure;
}
if (mtype_data_list(skb, e))
goto nla_put_failure;
@@ -992,7 +1177,7 @@ mtype_list(const struct ip_set *set,
/* Set listing finished */
cb->args[IPSET_CB_ARG0] = 0;
- return 0;
+ goto out;
nla_put_failure:
nlmsg_trim(skb, incomplete);
@@ -1000,20 +1185,24 @@ nla_put_failure:
pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n",
set->name);
cb->args[IPSET_CB_ARG0] = 0;
- return -EMSGSIZE;
+ ret = -EMSGSIZE;
+ } else {
+ ipset_nest_end(skb, atd);
}
- ipset_nest_end(skb, atd);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static int
IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt);
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt);
static int
IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
+ enum ipset_adt adt, u32 *lineno, u32 flags,
+ bool retried);
static const struct ip_set_type_variant mtype_variant = {
.kadt = mtype_kadt,
@@ -1027,6 +1216,7 @@ static const struct ip_set_type_variant mtype_variant = {
.flush = mtype_flush,
.head = mtype_head,
.list = mtype_list,
+ .uref = mtype_uref,
.resize = mtype_resize,
.same_set = mtype_same_set,
};
@@ -1045,7 +1235,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
u8 netmask;
#endif
size_t hsize;
- struct HTYPE *h;
+ struct htype *h;
struct htable *t;
#ifndef IP_SET_PROTO_UNDEF
@@ -1064,12 +1254,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
-#ifdef IP_SET_HASH_WITH_MARKMASK
- !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) ||
-#endif
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ /* Separated condition in order to avoid directive in argument list */
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK)))
+ return -IPSET_ERR_PROTOCOL;
+#endif
if (tb[IPSET_ATTR_HASHSIZE]) {
hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
@@ -1092,7 +1284,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
if (tb[IPSET_ATTR_MARKMASK]) {
- markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK]));
+ markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK]));
if (markmask == 0)
return -IPSET_ERR_INVALID_MARKMASK;
@@ -1137,12 +1329,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#endif
set->variant = &IPSET_TOKEN(HTYPE, 4_variant);
set->dsize = ip_set_elem_len(set, tb,
- sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)));
+ sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)),
+ __alignof__(struct IPSET_TOKEN(HTYPE, 4_elem)));
#ifndef IP_SET_PROTO_UNDEF
} else {
set->variant = &IPSET_TOKEN(HTYPE, 6_variant);
set->dsize = ip_set_elem_len(set, tb,
- sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)));
+ sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)),
+ __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem)));
}
#endif
if (tb[IPSET_ATTR_TIMEOUT]) {
@@ -1165,3 +1359,5 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
return 0;
}
#endif /* IP_SET_EMIT_CREATE */
+
+#undef HKEY_DATALEN
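
The ip_set_hash_gen.h hunks above move the generic hash over to copy-on-write buckets: deleted slots are only marked free in a `used` bitmap, readers skip them with test_bit(), and a writer that has to reshuffle a bucket builds a replacement and publishes it with rcu_assign_pointer()/kfree_rcu(), while the new mtype_uref() lets a netlink dump pin the table it started on. Below is a minimal userspace sketch of the bitmap-slot idea only; the struct layout, sizes and names are illustrative, not the kernel's, and the RCU publication step is left out.

#include <stdio.h>
#include <string.h>

#define SLOTS 8

/* Illustrative bucket: a fixed slot array plus a bitmap of live slots.
 * Deleting an element only clears its bit; readers skip dead slots,
 * which mirrors the test_bit(i, n->used) checks added by the patch. */
struct bucket {
    unsigned int used;  /* bit i set => slot i holds a live element */
    unsigned int pos;   /* number of slots handed out so far */
    int value[SLOTS];
};

static int bucket_add(struct bucket *b, int v)
{
    unsigned int i;

    /* Reuse a freed slot first, then append at the end. */
    for (i = 0; i < b->pos; i++) {
        if (!(b->used & (1u << i))) {
            b->value[i] = v;
            b->used |= 1u << i;
            return 0;
        }
    }
    if (b->pos >= SLOTS)
        return -1;          /* bucket full */
    b->value[b->pos] = v;
    b->used |= 1u << b->pos;
    b->pos++;
    return 0;
}

static void bucket_del(struct bucket *b, int v)
{
    unsigned int i;

    for (i = 0; i < b->pos; i++)
        if ((b->used & (1u << i)) && b->value[i] == v)
            b->used &= ~(1u << i);  /* slot kept, bit cleared */
}

int main(void)
{
    struct bucket b;
    unsigned int i;

    memset(&b, 0, sizeof(b));
    bucket_add(&b, 10);
    bucket_add(&b, 20);
    bucket_add(&b, 30);
    bucket_del(&b, 20);
    bucket_add(&b, 40);     /* lands in the slot freed above */

    for (i = 0; i < b.pos; i++)
        if (b.used & (1u << i))
            printf("slot %u: %d\n", i, b.value[i]);
    return 0;
}

In the kernel hunks the same idea is what makes the added "if (!test_bit(i, n->used)) continue;" checks necessary in every lookup and dump loop.
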
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ip.c b/kernel/net/netfilter/ipset/ip_set_hash_ip.c
index 76959d79e..9d6bf19f7 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ip.c
@@ -56,15 +56,15 @@ hash_ip4_data_equal(const struct hash_ip4_elem *e1,
return e1->ip == e2->ip;
}
-static inline bool
+static bool
hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e)
{
if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -74,7 +74,6 @@ hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e)
}
#define MTYPE hash_ip4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -109,20 +108,17 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip = 0, ip_to = 0, hosts;
int ret = 0;
- if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ if (unlikely(!tb[IPSET_ATTR_IP]))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -145,7 +141,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -162,8 +158,8 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -196,10 +192,10 @@ hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
{
if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -208,12 +204,9 @@ hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e)
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
-#undef HKEY_DATALEN
#define MTYPE hash_ip6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
@@ -247,22 +240,25 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
- if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ if (unlikely(!tb[IPSET_ATTR_IP]))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -301,7 +297,8 @@ static struct ip_set_type hash_ip_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -318,6 +315,7 @@ hash_ip_init(void)
static void __exit
hash_ip_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ip_type);
}
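
A recurring change in hash_ip4_uadt()/hash_ip6_uadt() above (and repeated in the other type files below) is splitting "ret = ip_set_get_...() || ip_set_get_extensions();" into two calls that are checked separately. The chained form only tells the caller that something failed, not which -IPSET_ERR_* code to propagate. A small stand-alone illustration; the helper names and error codes here are invented for the example:

#include <stdio.h>

#define ERR_BAD_ADDR  -10   /* illustrative error codes, not the ipset ones */
#define ERR_BAD_EXT   -20

static int parse_addr(int ok) { return ok ? 0 : ERR_BAD_ADDR; }
static int parse_ext(int ok)  { return ok ? 0 : ERR_BAD_EXT; }

int main(void)
{
    /* Chained form: any failure collapses to "1", the real code is gone. */
    int chained = parse_addr(0) || parse_ext(1);

    /* Split form: the first failing call's code is preserved. */
    int ret = parse_addr(0);
    if (!ret)
        ret = parse_ext(1);

    printf("chained=%d split=%d\n", chained, ret);  /* chained=1 split=-10 */
    return 0;
}
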
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c b/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c
index 7abf9788c..a0695a2ab 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -63,10 +63,10 @@ hash_ipmark4_data_list(struct sk_buff *skb,
if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -76,10 +76,8 @@ hash_ipmark4_data_next(struct hash_ipmark4_elem *next,
next->ip = d->ip;
}
-#define MTYPE hash_ipmark4
-#define PF 4
-#define HOST_MASK 32
-#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem)
+#define MTYPE hash_ipmark4
+#define HOST_MASK 32
#include "ip_set_hash_gen.h"
static int
@@ -110,25 +108,22 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip, ip_to = 0;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_attr_netorder(tb, IPSET_ATTR_MARK)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+ e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK]));
e.mark &= h->markmask;
if (adt == IPSET_TEST ||
@@ -147,7 +142,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -160,8 +155,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -191,10 +186,10 @@ hash_ipmark6_data_list(struct sk_buff *skb,
if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -204,18 +199,13 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
-#undef HKEY_DATALEN
#define MTYPE hash_ipmark6
-#define PF 6
#define HOST_MASK 128
-#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem)
-#define IP_SET_EMIT_CREATE
+#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
-
static int
hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
@@ -243,27 +233,30 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
+ !ip_set_attr_netorder(tb, IPSET_ATTR_MARK)))
return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
if (ret)
return ret;
- e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK]));
e.mark &= h->markmask;
if (adt == IPSET_TEST) {
@@ -274,10 +267,8 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
- return ret;
+ return 0;
}
static struct ip_set_type hash_ipmark_type __read_mostly = {
@@ -307,7 +298,8 @@ static struct ip_set_type hash_ipmark_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -324,6 +316,7 @@ hash_ipmark_init(void)
static void __exit
hash_ipmark_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ipmark_type);
}
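
The mark handling above now reads the attribute with nla_get_be32() before ntohl(), so the declared byte order of IPSET_ATTR_MARK matches how the value is converted. The sketch below shows the same convert-once-at-the-boundary pattern using the plain sockets byte-order helpers; the mask value is made up for the example.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
    /* A 32-bit value as it would arrive on the wire: big-endian. */
    uint32_t wire = htonl(0x0000002a);  /* mark 42 in network byte order */

    /* Convert once, at the boundary, then work in host order. */
    uint32_t mark = ntohl(wire);
    uint32_t markmask = 0xff;           /* illustrative mask */

    mark &= markmask;
    printf("mark=%u\n", mark);          /* prints 42 */
    return 0;
}
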
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipport.c b/kernel/net/netfilter/ipset/ip_set_hash_ipport.c
index dcbcceb9a..9d84b3dff 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -69,10 +69,10 @@ hash_ipport4_data_list(struct sk_buff *skb,
nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -83,10 +83,8 @@ hash_ipport4_data_next(struct hash_ipport4_elem *next,
next->port = d->port;
}
-#define MTYPE hash_ipport4
-#define PF 4
-#define HOST_MASK 32
-#define HKEY_DATALEN sizeof(struct hash_ipport4_elem)
+#define MTYPE hash_ipport4
+#define HOST_MASK 32
#include "ip_set_hash_gen.h"
static int
@@ -118,29 +116,23 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -148,8 +140,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
@@ -171,7 +164,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -195,8 +188,8 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
}
return ret;
@@ -231,10 +224,10 @@ hash_ipport6_data_list(struct sk_buff *skb,
nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -245,15 +238,11 @@ hash_ipport6_data_next(struct hash_ipport4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
-#undef HKEY_DATALEN
#define MTYPE hash_ipport6
-#define PF 6
#define HOST_MASK 128
-#define HKEY_DATALEN sizeof(struct hash_ipport6_elem)
-#define IP_SET_EMIT_CREATE
+#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
static int
@@ -285,31 +274,31 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
if (ret)
return ret;
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -317,8 +306,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
@@ -341,8 +331,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -376,7 +366,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -393,6 +384,7 @@ hash_ipport_init(void)
static void __exit
hash_ipport_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ipport_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c b/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c
index 7ef93fc88..215b7b942 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -63,17 +63,17 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
static bool
hash_ipportip4_data_list(struct sk_buff *skb,
- const struct hash_ipportip4_elem *data)
+ const struct hash_ipportip4_elem *data)
{
if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) ||
nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -86,7 +86,6 @@ hash_ipportip4_data_next(struct hash_ipportip4_elem *next,
/* Common functions */
#define MTYPE hash_ipportip4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -120,22 +119,19 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -143,10 +139,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -154,8 +147,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
@@ -177,7 +171,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -201,8 +195,8 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
}
return ret;
@@ -240,10 +234,10 @@ hash_ipportip6_data_list(struct sk_buff *skb,
nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -254,11 +248,9 @@ hash_ipportip6_data_next(struct hash_ipportip4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_ipportip6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
@@ -293,24 +285,27 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -318,10 +313,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -329,8 +321,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
@@ -353,8 +346,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -388,7 +381,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -405,6 +399,7 @@ hash_ipportip_init(void)
static void __exit
hash_ipportip_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ipportip_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c b/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c
index b6012ad92..9ca719625 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -114,10 +114,10 @@ hash_ipportnet4_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -130,7 +130,6 @@ hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next,
}
#define MTYPE hash_ipportnet4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -142,7 +141,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_ipportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet4_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -174,23 +173,20 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -205,10 +201,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
e.cidr = cidr - 1;
}
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -216,14 +209,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -249,7 +244,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -270,8 +265,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip2_from, ip2_to);
if (ip2_from + UINT_MAX == ip2_to)
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1);
+ }
if (retried)
ip = ntohl(h->next.ip);
@@ -294,8 +290,8 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip2 = ip2_last + 1;
}
}
@@ -367,10 +363,10 @@ hash_ipportnet6_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -381,11 +377,9 @@ hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_ipportnet6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
@@ -398,7 +392,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_ipportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet6_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -429,27 +423,28 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -466,10 +461,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
ip6_netmask(&e.ip2, e.cidr + 1);
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -477,14 +469,16 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -508,8 +502,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -547,7 +541,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -564,6 +559,7 @@ hash_ipportnet_init(void)
static void __exit
hash_ipportnet_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ipportnet_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_mac.c b/kernel/net/netfilter/ipset/ip_set_hash_mac.c
index 65690b52a..f1e7d2c0f 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_mac.c
@@ -52,7 +52,12 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1,
static inline bool
hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e)
{
- return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether);
+ if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
}
static inline void
@@ -62,7 +67,6 @@ hash_mac4_data_next(struct hash_mac4_elem *next,
}
#define MTYPE hash_mac4
-#define PF 4
#define HOST_MASK 32
#define IP_SET_EMIT_CREATE
#define IP_SET_PROTO_UNDEF
@@ -85,10 +89,10 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb,
return 0;
if (skb_mac_header(skb) < skb->head ||
- (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
return -EINVAL;
- memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
return -EINVAL;
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
@@ -103,22 +107,16 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
- if (unlikely(!tb[IPSET_ATTR_ETHER] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (unlikely(!tb[IPSET_ATTR_ETHER]))
+ return -IPSET_ERR_PROTOCOL;
+
ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]));
if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
return -IPSET_ERR_HASH_ELEM;
@@ -149,7 +147,8 @@ static struct ip_set_type hash_mac_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -166,6 +165,7 @@ hash_mac_init(void)
static void __exit
hash_mac_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_mac_type);
}
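
hash_mac4_data_list() above is rewritten from "return nla_put(...)" into the usual netlink shape: attempt the put, jump to a failure label if it does not fit, and return false/true instead of raw 0/1. A compact userspace sketch of that single-failure-label pattern, with an invented put_bytes() standing in for nla_put():

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Illustrative stand-in for the netlink put helpers: nonzero means the
 * message buffer is out of room. */
static int put_bytes(char *buf, size_t *off, size_t len,
                     const void *data, size_t n)
{
    if (*off + n > len)
        return 1;
    memcpy(buf + *off, data, n);
    *off += n;
    return 0;
}

/* Same shape as the patched data_list helpers: false on success,
 * true via the failure label when a put does not fit. */
static bool emit_mac(char *buf, size_t *off, size_t len,
                     const unsigned char ether[6])
{
    if (put_bytes(buf, off, len, ether, 6))
        goto put_failure;
    return false;

put_failure:
    return true;
}

int main(void)
{
    unsigned char mac[6] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
    char buf[4];
    size_t off = 0;

    printf("failed=%d\n", emit_mac(buf, &off, sizeof(buf), mac)); /* 1: no room */
    return 0;
}
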
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_net.c b/kernel/net/netfilter/ipset/ip_set_hash_net.c
index 6b3ac10ac..3e4bffdc1 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_net.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_net.c
@@ -95,10 +95,10 @@ hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data)
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -109,7 +109,6 @@ hash_net4_data_next(struct hash_net4_elem *next,
}
#define MTYPE hash_net4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -121,7 +120,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_net *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net4_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -147,21 +146,18 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip = 0, ip_to = 0, last;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -173,6 +169,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -180,7 +177,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
e.ip = htonl(ip & ip_set_hostmask(e.cidr));
ret = adtfn(set, &e, &ext, &ext, flags);
- return ip_set_enomatch(ret, flags, adt, set) ? -ret:
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
ip_set_eexist(ret, flags) ? 0 : ret;
}
@@ -202,8 +199,8 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip = last + 1;
}
return ret;
@@ -264,10 +261,10 @@ hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data)
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -277,11 +274,9 @@ hash_net6_data_next(struct hash_net4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_net6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
@@ -294,7 +289,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_net *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net6_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -318,36 +313,34 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- if (tb[IPSET_ATTR_CIDR])
+ if (tb[IPSET_ATTR_CIDR]) {
e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
-
- if (!e.cidr || e.cidr > HOST_MASK)
- return -IPSET_ERR_INVALID_CIDR;
+ if (!e.cidr || e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
ip6_netmask(&e.ip, e.cidr);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -383,7 +376,8 @@ static struct ip_set_type hash_net_type __read_mostly = {
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -400,6 +394,7 @@ hash_net_init(void)
static void __exit
hash_net_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_net_type);
}
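
hash_net4_uadt() above masks the address with ip_set_hostmask(e.cidr) before inserting, so only the network part of the element is stored. The helper below reproduces that masking in isolation; hostmask() is this example's own function, not the kernel's, and assumes IPv4 (32-bit) prefixes.

#include <stdio.h>
#include <stdint.h>

/* Host-order mask for a prefix length: /24 -> 0xffffff00, /0 -> 0. */
static uint32_t hostmask(unsigned int cidr)
{
    return cidr ? ~((1u << (32 - cidr)) - 1) : 0;
}

int main(void)
{
    uint32_t ip = (192u << 24) | (168u << 16) | (1u << 8) | 77u; /* 192.168.1.77 */
    unsigned int cidr = 24;
    uint32_t net = ip & hostmask(cidr);

    printf("network: %u.%u.%u.%u/%u\n",
           net >> 24, (net >> 16) & 0xff, (net >> 8) & 0xff, net & 0xff, cidr);
    return 0;                           /* prints 192.168.1.0/24 */
}
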
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netiface.c b/kernel/net/netfilter/ipset/ip_set_hash_netiface.c
index 380ef5148..43d8c9896 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -13,7 +13,6 @@
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/random.h>
-#include <linux/rbtree.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/netlink.h>
@@ -37,88 +36,13 @@ MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,iface");
-/* Interface name rbtree */
-
-struct iface_node {
- struct rb_node node;
- char iface[IFNAMSIZ];
-};
-
-#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface)
-
-static void
-rbtree_destroy(struct rb_root *root)
-{
- struct iface_node *node, *next;
-
- rbtree_postorder_for_each_entry_safe(node, next, root, node)
- kfree(node);
-
- *root = RB_ROOT;
-}
-
-static int
-iface_test(struct rb_root *root, const char **iface)
-{
- struct rb_node *n = root->rb_node;
-
- while (n) {
- const char *d = iface_data(n);
- int res = strcmp(*iface, d);
-
- if (res < 0)
- n = n->rb_left;
- else if (res > 0)
- n = n->rb_right;
- else {
- *iface = d;
- return 1;
- }
- }
- return 0;
-}
-
-static int
-iface_add(struct rb_root *root, const char **iface)
-{
- struct rb_node **n = &(root->rb_node), *p = NULL;
- struct iface_node *d;
-
- while (*n) {
- char *ifname = iface_data(*n);
- int res = strcmp(*iface, ifname);
-
- p = *n;
- if (res < 0)
- n = &((*n)->rb_left);
- else if (res > 0)
- n = &((*n)->rb_right);
- else {
- *iface = ifname;
- return 0;
- }
- }
-
- d = kzalloc(sizeof(*d), GFP_ATOMIC);
- if (!d)
- return -ENOMEM;
- strcpy(d->iface, *iface);
-
- rb_link_node(&d->node, p, n);
- rb_insert_color(&d->node, root);
-
- *iface = d->iface;
- return 0;
-}
-
/* Type specific function prefix */
#define HTYPE hash_netiface
#define IP_SET_HASH_WITH_NETS
-#define IP_SET_HASH_WITH_RBTREE
#define IP_SET_HASH_WITH_MULTI
#define IP_SET_HASH_WITH_NET0
-#define STREQ(a, b) (strcmp(a, b) == 0)
+#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ)
/* IPv4 variant */
@@ -137,7 +61,7 @@ struct hash_netiface4_elem {
u8 cidr;
u8 nomatch;
u8 elem;
- const char *iface;
+ char iface[IFNAMSIZ];
};
/* Common functions */
@@ -151,7 +75,7 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- ip1->iface == ip2->iface;
+ strcmp(ip1->iface, ip2->iface) == 0;
}
static inline int
@@ -193,10 +117,10 @@ hash_netiface4_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -207,7 +131,6 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next,
}
#define MTYPE hash_netiface4
-#define PF 4
#define HOST_MASK 32
#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed)
#include "ip_set_hash_gen.h"
@@ -220,7 +143,7 @@ static const char *get_physindev_name(const struct sk_buff *skb)
return dev ? dev->name : NULL;
}
-static const char *get_phyoutdev_name(const struct sk_buff *skb)
+static const char *get_physoutdev_name(const struct sk_buff *skb)
{
struct net_device *dev = nf_bridge_get_physoutdev(skb);
@@ -236,11 +159,10 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface4_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
.elem = 1,
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- int ret;
if (e.cidr == 0)
return -EINVAL;
@@ -250,35 +172,25 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
e.ip &= ip_set_netmask(e.cidr);
-#define IFACE(dir) (par->dir ? par->dir->name : NULL)
+#define IFACE(dir) (par->dir ? par->dir->name : "")
#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC)
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- e.iface = SRCDIR ? get_physindev_name(skb) :
- get_phyoutdev_name(skb);
+ const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ get_physoutdev_name(skb);
- if (!e.iface)
+ if (!eiface)
return -EINVAL;
+ STRLCPY(e.iface, eiface);
e.physdev = 1;
-#else
- e.iface = NULL;
#endif
- } else
- e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+ } else {
+ STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ }
- if (!e.iface)
+ if (strlen(e.iface) == 0)
return -EINVAL;
- ret = iface_test(&h->rbtree, &e.iface);
- if (adt == IPSET_ADD) {
- if (!ret) {
- ret = iface_add(&h->rbtree, &e.iface);
- if (ret)
- return ret;
- }
- } else if (!ret)
- return ret;
-
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -291,25 +203,21 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, last;
- char iface[IFNAMSIZ];
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!tb[IPSET_ATTR_IFACE] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -318,21 +226,11 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
-
- strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
- e.iface = iface;
- ret = iface_test(&h->rbtree, &e.iface);
- if (adt == IPSET_ADD) {
- if (!ret) {
- ret = iface_add(&h->rbtree, &e.iface);
- if (ret)
- return ret;
- }
- } else if (!ret)
- return ret;
+ nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_PHYSDEV)
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
@@ -353,8 +251,9 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (ip + UINT_MAX == ip_to)
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip, ip_to, e.cidr);
+ }
if (retried)
ip = ntohl(h->next.ip);
@@ -365,8 +264,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip = last + 1;
}
return ret;
@@ -388,7 +287,7 @@ struct hash_netiface6_elem {
u8 cidr;
u8 nomatch;
u8 elem;
- const char *iface;
+ char iface[IFNAMSIZ];
};
/* Common functions */
@@ -402,7 +301,7 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- ip1->iface == ip2->iface;
+ strcmp(ip1->iface, ip2->iface) == 0;
}
static inline int
@@ -444,10 +343,10 @@ hash_netiface6_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -457,12 +356,9 @@ hash_netiface6_data_next(struct hash_netiface4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
-#undef HKEY_DATALEN
#define MTYPE hash_netiface6
-#define PF 6
#define HOST_MASK 128
#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed)
#define IP_SET_EMIT_CREATE
@@ -476,11 +372,10 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface6_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
.elem = 1,
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- int ret;
if (e.cidr == 0)
return -EINVAL;
@@ -492,85 +387,64 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- e.iface = SRCDIR ? get_physindev_name(skb) :
- get_phyoutdev_name(skb);
- if (!e.iface)
- return -EINVAL;
+ const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ get_physoutdev_name(skb);
+ if (!eiface)
+ return -EINVAL;
+ STRLCPY(e.iface, eiface);
e.physdev = 1;
-#else
- e.iface = NULL;
#endif
- } else
- e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+ } else {
+ STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ }
- if (!e.iface)
+ if (strlen(e.iface) == 0)
return -EINVAL;
- ret = iface_test(&h->rbtree, &e.iface);
- if (adt == IPSET_ADD) {
- if (!ret) {
- ret = iface_add(&h->rbtree, &e.iface);
- if (ret)
- return ret;
- }
- } else if (!ret)
- return ret;
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- char iface[IFNAMSIZ];
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!tb[IPSET_ATTR_IFACE] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- if (tb[IPSET_ATTR_CIDR])
+ if (tb[IPSET_ATTR_CIDR]) {
e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (e.cidr > HOST_MASK)
- return -IPSET_ERR_INVALID_CIDR;
+ if (e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
+
ip6_netmask(&e.ip, e.cidr);
- strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
- e.iface = iface;
- ret = iface_test(&h->rbtree, &e.iface);
- if (adt == IPSET_ADD) {
- if (!ret) {
- ret = iface_add(&h->rbtree, &e.iface);
- if (ret)
- return ret;
- }
- } else if (!ret)
- return ret;
+ nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_PHYSDEV)
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
@@ -613,7 +487,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -630,6 +505,7 @@ hash_netiface_init(void)
static void __exit
hash_netiface_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_netiface_type);
}
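
The largest change above is in hash:net,iface: the per-set rbtree that interned interface names (so elements could be compared by pointer) is removed, and each element now carries the name inline in a char iface[IFNAMSIZ], filled with strlcpy()/nla_strlcpy() and compared with strcmp(). A tiny sketch of the inline-storage side of that trade-off; IFNAMSIZ is assumed to be 16 as in Linux, and snprintf() stands in here for strlcpy(), which is not in standard C.

#include <stdio.h>
#include <string.h>

#define IFNAMSIZ 16     /* matches the Linux definition */

struct elem {
    char iface[IFNAMSIZ];   /* name stored inline, as after the patch */
};

static void elem_set_iface(struct elem *e, const char *name)
{
    /* Bounded copy; the kernel code uses strlcpy()/nla_strlcpy(). */
    snprintf(e->iface, sizeof(e->iface), "%s", name);
}

static int elem_equal(const struct elem *a, const struct elem *b)
{
    return strcmp(a->iface, b->iface) == 0;
}

int main(void)
{
    struct elem a, b;

    elem_set_iface(&a, "eth0");
    elem_set_iface(&b, "eth0");

    /* With interned const char * pointers, a.iface == b.iface only held
     * when both pointed into the shared rbtree; inline storage needs a
     * string compare instead. */
    printf("equal=%d\n", elem_equal(&a, &b));   /* prints 1 */
    return 0;
}

Inline storage costs IFNAMSIZ bytes per element and a strcmp() per comparison, but it removes the shared rbtree and its bookkeeping from the lookup path, which fits the RCU-oriented rework in the rest of the series.
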
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netnet.c b/kernel/net/netfilter/ipset/ip_set_hash_netnet.c
index ea8772afb..a93dfebff 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -57,8 +57,8 @@ struct hash_netnet4_elem {
static inline bool
hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
- const struct hash_netnet4_elem *ip2,
- u32 *multi)
+ const struct hash_netnet4_elem *ip2,
+ u32 *multi)
{
return ip1->ipcmp == ip2->ipcmp &&
ip1->ccmp == ip2->ccmp;
@@ -84,7 +84,7 @@ hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags)
static inline void
hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem,
- struct hash_netnet4_elem *orig)
+ struct hash_netnet4_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
@@ -103,7 +103,7 @@ hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner)
static bool
hash_netnet4_data_list(struct sk_buff *skb,
- const struct hash_netnet4_elem *data)
+ const struct hash_netnet4_elem *data)
{
u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
@@ -122,28 +122,34 @@ nla_put_failure:
static inline void
hash_netnet4_data_next(struct hash_netnet4_elem *next,
- const struct hash_netnet4_elem *d)
+ const struct hash_netnet4_elem *d)
{
next->ipcmp = d->ipcmp;
}
#define MTYPE hash_netnet4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
+static void
+hash_netnet4_init(struct hash_netnet4_elem *e)
+{
+ e->cidr[0] = HOST_MASK;
+ e->cidr[1] = HOST_MASK;
+}
+
static int
hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
- e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
@@ -157,7 +163,7 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
static int
hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
@@ -165,45 +171,43 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, last;
u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2;
- u8 cidr, cidr2;
int ret;
- e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ hash_netnet4_init(&e);
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
if (tb[IPSET_ATTR_CIDR]) {
- cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > HOST_MASK)
+ e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- e.cidr[0] = cidr;
}
if (tb[IPSET_ATTR_CIDR2]) {
- cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
- if (!cidr2 || cidr2 > HOST_MASK)
+ e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- e.cidr[1] = cidr2;
}
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -226,8 +230,9 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (unlikely(ip + UINT_MAX == ip_to))
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip, ip_to, e.cidr[0]);
+ }
ip2_to = ip2_from;
if (tb[IPSET_ATTR_IP2_TO]) {
@@ -238,28 +243,27 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip2_from, ip2_to);
if (unlikely(ip2_from + UINT_MAX == ip2_to))
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
+ }
if (retried)
ip = ntohl(h->next.ip[0]);
while (!after(ip, ip_to)) {
e.ip[0] = htonl(ip);
- last = ip_set_range_to_cidr(ip, ip_to, &cidr);
- e.cidr[0] = cidr;
+ last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
ip2 = (retried &&
ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1])
: ip2_from;
while (!after(ip2, ip2_to)) {
e.ip[1] = htonl(ip2);
- last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2);
- e.cidr[1] = cidr2;
+ last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip2 = last2 + 1;
}
ip = last + 1;
@@ -283,8 +287,8 @@ struct hash_netnet6_elem {
static inline bool
hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
- const struct hash_netnet6_elem *ip2,
- u32 *multi)
+ const struct hash_netnet6_elem *ip2,
+ u32 *multi)
{
return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
@@ -311,7 +315,7 @@ hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags)
static inline void
hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem,
- struct hash_netnet6_elem *orig)
+ struct hash_netnet6_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
@@ -330,7 +334,7 @@ hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner)
static bool
hash_netnet6_data_list(struct sk_buff *skb,
- const struct hash_netnet6_elem *data)
+ const struct hash_netnet6_elem *data)
{
u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
@@ -349,34 +353,39 @@ nla_put_failure:
static inline void
hash_netnet6_data_next(struct hash_netnet4_elem *next,
- const struct hash_netnet6_elem *d)
+ const struct hash_netnet6_elem *d)
{
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_netnet6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
+static void
+hash_netnet6_init(struct hash_netnet6_elem *e)
+{
+ e->cidr[0] = HOST_MASK;
+ e->cidr[1] = HOST_MASK;
+}
+
static int
hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
- e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
- e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK;
+ e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6);
@@ -388,50 +397,53 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
static int
hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
- e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ hash_netnet6_init(&e);
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) ||
- ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]);
if (ret)
return ret;
- if (tb[IPSET_ATTR_CIDR])
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- if (tb[IPSET_ATTR_CIDR2])
+ if (tb[IPSET_ATTR_CIDR2]) {
e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
-
- if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
- e.cidr[1] > HOST_MASK)
- return -IPSET_ERR_INVALID_CIDR;
+ if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -470,7 +482,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -487,6 +500,7 @@ hash_netnet_init(void)
static void __exit
hash_netnet_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_netnet_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netport.c b/kernel/net/netfilter/ipset/ip_set_hash_netport.c
index c0ddb58d1..731813e0f 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_netport.c
@@ -110,10 +110,10 @@ hash_netport4_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -125,7 +125,6 @@ hash_netport4_data_next(struct hash_netport4_elem *next,
}
#define MTYPE hash_netport4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -137,7 +136,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_netport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport4_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -167,23 +166,20 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -194,10 +190,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
e.cidr = cidr - 1;
}
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -205,8 +198,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
@@ -215,6 +209,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -240,8 +235,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (ip + UINT_MAX == ip_to)
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip, ip_to, e.cidr + 1);
+ }
if (retried)
ip = ntohl(h->next.ip);
@@ -257,8 +253,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
ip = last + 1;
}
@@ -326,10 +322,10 @@ hash_netport6_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -340,11 +336,9 @@ hash_netport6_data_next(struct hash_netport4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_netport6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
@@ -357,7 +351,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_netport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport6_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -387,25 +381,22 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -417,10 +408,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
}
ip6_netmask(&e.ip, e.cidr + 1);
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -428,14 +416,16 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -459,8 +449,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -495,7 +485,8 @@ static struct ip_set_type hash_netport_type __read_mostly = {
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -512,6 +503,7 @@ hash_netport_init(void)
static void __exit
hash_netport_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_netport_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c b/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c
index bfaa94c7b..9a14c2378 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -54,7 +54,7 @@ struct hash_netportnet4_elem {
u16 ccmp;
};
u16 padding;
- u8 nomatch:1;
+ u8 nomatch;
u8 proto;
};
@@ -62,8 +62,8 @@ struct hash_netportnet4_elem {
static inline bool
hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
- const struct hash_netportnet4_elem *ip2,
- u32 *multi)
+ const struct hash_netportnet4_elem *ip2,
+ u32 *multi)
{
return ip1->ipcmp == ip2->ipcmp &&
ip1->ccmp == ip2->ccmp &&
@@ -91,7 +91,7 @@ hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags)
static inline void
hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem,
- struct hash_netportnet4_elem *orig)
+ struct hash_netportnet4_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
@@ -111,7 +111,7 @@ hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem,
static bool
hash_netportnet4_data_list(struct sk_buff *skb,
- const struct hash_netportnet4_elem *data)
+ const struct hash_netportnet4_elem *data)
{
u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
@@ -124,37 +124,43 @@ hash_netportnet4_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
- const struct hash_netportnet4_elem *d)
+ const struct hash_netportnet4_elem *d)
{
next->ipcmp = d->ipcmp;
next->port = d->port;
}
#define MTYPE hash_netportnet4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
+static void
+hash_netportnet4_init(struct hash_netportnet4_elem *e)
+{
+ e->cidr[0] = HOST_MASK;
+ e->cidr[1] = HOST_MASK;
+}
+
static int
hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
- e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
@@ -172,7 +178,7 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
static int
hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
@@ -181,49 +187,43 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to;
u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
bool with_ports = false;
- u8 cidr, cidr2;
int ret;
- e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ hash_netportnet4_init(&e);
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
if (tb[IPSET_ATTR_CIDR]) {
- cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > HOST_MASK)
+ e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- e.cidr[0] = cidr;
}
if (tb[IPSET_ATTR_CIDR2]) {
- cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
- if (!cidr || cidr > HOST_MASK)
+ e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- e.cidr[1] = cidr;
}
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -231,14 +231,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -262,8 +264,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (unlikely(ip + UINT_MAX == ip_to))
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip, ip_to, e.cidr[0]);
+ }
port_to = port = ntohs(e.port);
if (tb[IPSET_ATTR_PORT_TO]) {
@@ -281,16 +284,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip2_from, ip2_to);
if (unlikely(ip2_from + UINT_MAX == ip2_to))
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
+ }
if (retried)
ip = ntohl(h->next.ip[0]);
while (!after(ip, ip_to)) {
e.ip[0] = htonl(ip);
- ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr);
- e.cidr[0] = cidr;
+ ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port)
: port;
for (; p <= port_to; p++) {
@@ -301,13 +304,12 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
while (!after(ip2, ip2_to)) {
e.ip[1] = htonl(ip2);
ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
- &cidr2);
- e.cidr[1] = cidr2;
+ &e.cidr[1]);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip2 = ip2_last + 1;
}
}
@@ -326,7 +328,7 @@ struct hash_netportnet6_elem {
u16 ccmp;
};
u16 padding;
- u8 nomatch:1;
+ u8 nomatch;
u8 proto;
};
@@ -334,8 +336,8 @@ struct hash_netportnet6_elem {
static inline bool
hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
- const struct hash_netportnet6_elem *ip2,
- u32 *multi)
+ const struct hash_netportnet6_elem *ip2,
+ u32 *multi)
{
return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
@@ -364,7 +366,7 @@ hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags)
static inline void
hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem,
- struct hash_netportnet6_elem *orig)
+ struct hash_netportnet6_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
@@ -384,7 +386,7 @@ hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem,
static bool
hash_netportnet6_data_list(struct sk_buff *skb,
- const struct hash_netportnet6_elem *data)
+ const struct hash_netportnet6_elem *data)
{
u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
@@ -397,41 +399,46 @@ hash_netportnet6_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
- const struct hash_netportnet6_elem *d)
+ const struct hash_netportnet6_elem *d)
{
next->port = d->port;
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_netportnet6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
+static void
+hash_netportnet6_init(struct hash_netportnet6_elem *e)
+{
+ e->cidr[0] = HOST_MASK;
+ e->cidr[1] = HOST_MASK;
+}
+
static int
hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
- e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
@@ -449,7 +456,7 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
static int
hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
@@ -459,47 +466,46 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
- e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ hash_netportnet6_init(&e);
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) ||
- ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- if (tb[IPSET_ATTR_CIDR])
+ if (tb[IPSET_ATTR_CIDR]) {
e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- if (tb[IPSET_ATTR_CIDR2])
+ if (tb[IPSET_ATTR_CIDR2]) {
e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
-
- if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
- e.cidr[1] > HOST_MASK))
- return -IPSET_ERR_INVALID_CIDR;
+ if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -507,14 +513,16 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -538,8 +546,8 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -577,7 +585,8 @@ static struct ip_set_type hash_netportnet_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -594,6 +603,7 @@ hash_netportnet_init(void)
static void __exit
hash_netportnet_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_netportnet_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_list_set.c b/kernel/net/netfilter/ipset/ip_set_list_set.c
index f8f682806..bbede95c9 100644
--- a/kernel/net/netfilter/ipset/ip_set_list_set.c
+++ b/kernel/net/netfilter/ipset/ip_set_list_set.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/ip.h>
+#include <linux/rculist.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
@@ -27,8 +28,10 @@ MODULE_ALIAS("ip_set_list:set");
/* Member elements */
struct set_elem {
+ struct rcu_head rcu;
+ struct list_head list;
ip_set_id_t id;
-};
+} __aligned(__alignof__(u64));
struct set_adt_elem {
ip_set_id_t id;
@@ -41,12 +44,9 @@ struct list_set {
u32 size; /* size of set list array */
struct timer_list gc; /* garbage collection */
struct net *net; /* namespace */
- struct set_elem members[0]; /* the set members */
+ struct list_head members; /* the set members */
};
-#define list_set_elem(set, map, id) \
- (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize)
-
static int
list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
@@ -54,17 +54,14 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
{
struct list_set *map = set->data;
struct set_elem *e;
- u32 i, cmdflags = opt->cmdflags;
+ u32 cmdflags = opt->cmdflags;
int ret;
/* Don't lookup sub-counters at all */
opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS;
if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)
opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return 0;
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -91,13 +88,9 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb,
{
struct list_set *map = set->data;
struct set_elem *e;
- u32 i;
int ret;
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return 0;
+ list_for_each_entry(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -115,13 +108,9 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb,
{
struct list_set *map = set->data;
struct set_elem *e;
- u32 i;
int ret;
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return 0;
+ list_for_each_entry(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -138,110 +127,65 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ int ret = -EINVAL;
+ rcu_read_lock();
switch (adt) {
case IPSET_TEST:
- return list_set_ktest(set, skb, par, opt, &ext);
+ ret = list_set_ktest(set, skb, par, opt, &ext);
+ break;
case IPSET_ADD:
- return list_set_kadd(set, skb, par, opt, &ext);
+ ret = list_set_kadd(set, skb, par, opt, &ext);
+ break;
case IPSET_DEL:
- return list_set_kdel(set, skb, par, opt, &ext);
+ ret = list_set_kdel(set, skb, par, opt, &ext);
+ break;
default:
break;
}
- return -EINVAL;
-}
-
-static bool
-id_eq(const struct ip_set *set, u32 i, ip_set_id_t id)
-{
- const struct list_set *map = set->data;
- const struct set_elem *e;
-
- if (i >= map->size)
- return 0;
+ rcu_read_unlock();
- e = list_set_elem(set, map, i);
- return !!(e->id == id &&
- !(SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set))));
+ return ret;
}
-static int
-list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,
- const struct ip_set_ext *ext)
-{
- struct list_set *map = set->data;
- struct set_elem *e = list_set_elem(set, map, i);
+/* Userspace interfaces: we are protected by the nfnl mutex */
- if (e->id != IPSET_INVALID_ID) {
- if (i == map->size - 1) {
- /* Last element replaced: e.g. add new,before,last */
- ip_set_put_byindex(map->net, e->id);
- ip_set_ext_destroy(set, e);
- } else {
- struct set_elem *x = list_set_elem(set, map,
- map->size - 1);
-
- /* Last element pushed off */
- if (x->id != IPSET_INVALID_ID) {
- ip_set_put_byindex(map->net, x->id);
- ip_set_ext_destroy(set, x);
- }
- memmove(list_set_elem(set, map, i + 1), e,
- set->dsize * (map->size - (i + 1)));
- /* Extensions must be initialized to zero */
- memset(e, 0, set->dsize);
- }
- }
-
- e->id = d->id;
- if (SET_WITH_TIMEOUT(set))
- ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
- if (SET_WITH_COUNTER(set))
- ip_set_init_counter(ext_counter(e, set), ext);
- if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(e, set), ext);
- if (SET_WITH_SKBINFO(set))
- ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
- return 0;
-}
-
-static int
-list_set_del(struct ip_set *set, u32 i)
+static void
+__list_set_del(struct ip_set *set, struct set_elem *e)
{
struct list_set *map = set->data;
- struct set_elem *e = list_set_elem(set, map, i);
ip_set_put_byindex(map->net, e->id);
+ /* We may call it, because we don't have a to be destroyed
+ * extension which is used by the kernel.
+ */
ip_set_ext_destroy(set, e);
+ kfree_rcu(e, rcu);
+}
- if (i < map->size - 1)
- memmove(e, list_set_elem(set, map, i + 1),
- set->dsize * (map->size - (i + 1)));
+static inline void
+list_set_del(struct ip_set *set, struct set_elem *e)
+{
+ list_del_rcu(&e->list);
+ __list_set_del(set, e);
+}
- /* Last element */
- e = list_set_elem(set, map, map->size - 1);
- e->id = IPSET_INVALID_ID;
- return 0;
+static inline void
+list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
+{
+ list_replace_rcu(&old->list, &e->list);
+ __list_set_del(set, old);
}
static void
set_cleanup_entries(struct ip_set *set)
{
struct list_set *map = set->data;
- struct set_elem *e;
- u32 i = 0;
+ struct set_elem *e, *n;
- while (i < map->size) {
- e = list_set_elem(set, map, i);
- if (e->id != IPSET_INVALID_ID &&
- ip_set_timeout_expired(ext_timeout(e, set)))
- list_set_del(set, i);
- /* Check element moved to position i in next loop */
- else
- i++;
- }
+ list_for_each_entry_safe(e, n, &map->members, list)
+ if (ip_set_timeout_expired(ext_timeout(e, set)))
+ list_set_del(set, e);
}
static int
@@ -250,31 +194,46 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
{
struct list_set *map = set->data;
struct set_adt_elem *d = value;
- struct set_elem *e;
- u32 i;
+ struct set_elem *e, *next, *prev = NULL;
int ret;
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return 0;
- else if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set)))
+ list_for_each_entry(e, &map->members, list) {
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
continue;
- else if (e->id != d->id)
+ else if (e->id != d->id) {
+ prev = e;
continue;
+ }
- if (d->before == 0)
- return 1;
- else if (d->before > 0)
- ret = id_eq(set, i + 1, d->refid);
- else
- ret = i > 0 && id_eq(set, i - 1, d->refid);
+ if (d->before == 0) {
+ ret = 1;
+ } else if (d->before > 0) {
+ next = list_next_entry(e, list);
+ ret = !list_is_last(&e->list, &map->members) &&
+ next->id == d->refid;
+ } else {
+ ret = prev && prev->id == d->refid;
+ }
return ret;
}
return 0;
}
+static void
+list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext,
+ struct set_elem *e)
+{
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(e, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(e, set), ext);
+ if (SET_WITH_SKBINFO(set))
+ ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
+ /* Update timeout last */
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+}
static int
list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
@@ -282,60 +241,78 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
{
struct list_set *map = set->data;
struct set_adt_elem *d = value;
- struct set_elem *e;
+ struct set_elem *e, *n, *prev, *next;
bool flag_exist = flags & IPSET_FLAG_EXIST;
- u32 i, ret = 0;
if (SET_WITH_TIMEOUT(set))
set_cleanup_entries(set);
- /* Check already added element */
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- goto insert;
- else if (e->id != d->id)
+ /* Find where to add the new entry */
+ n = prev = next = NULL;
+ list_for_each_entry(e, &map->members, list) {
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
continue;
-
- if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) ||
- (d->before < 0 &&
- (i == 0 || !id_eq(set, i - 1, d->refid))))
- /* Before/after doesn't match */
+ else if (d->id == e->id)
+ n = e;
+ else if (d->before == 0 || e->id != d->refid)
+ continue;
+ else if (d->before > 0)
+ next = e;
+ else
+ prev = e;
+ }
+ /* Re-add already existing element */
+ if (n) {
+ if ((d->before > 0 && !next) ||
+ (d->before < 0 && !prev))
return -IPSET_ERR_REF_EXIST;
if (!flag_exist)
- /* Can't re-add */
return -IPSET_ERR_EXIST;
/* Update extensions */
- ip_set_ext_destroy(set, e);
+ ip_set_ext_destroy(set, n);
+ list_set_init_extensions(set, ext, n);
- if (SET_WITH_TIMEOUT(set))
- ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
- if (SET_WITH_COUNTER(set))
- ip_set_init_counter(ext_counter(e, set), ext);
- if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(e, set), ext);
- if (SET_WITH_SKBINFO(set))
- ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
/* Set is already added to the list */
ip_set_put_byindex(map->net, d->id);
return 0;
}
-insert:
- ret = -IPSET_ERR_LIST_FULL;
- for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- ret = d->before != 0 ? -IPSET_ERR_REF_EXIST
- : list_set_add(set, i, d, ext);
- else if (e->id != d->refid)
- continue;
- else if (d->before > 0)
- ret = list_set_add(set, i, d, ext);
- else if (i + 1 < map->size)
- ret = list_set_add(set, i + 1, d, ext);
+ /* Add new entry */
+ if (d->before == 0) {
+ /* Append */
+ n = list_empty(&map->members) ? NULL :
+ list_last_entry(&map->members, struct set_elem, list);
+ } else if (d->before > 0) {
+ /* Insert after next element */
+ if (!list_is_last(&next->list, &map->members))
+ n = list_next_entry(next, list);
+ } else {
+ /* Insert before prev element */
+ if (prev->list.prev != &map->members)
+ n = list_prev_entry(prev, list);
}
+ /* Can we replace a timed out entry? */
+ if (n &&
+ !(SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(n, set))))
+ n = NULL;
+
+ e = kzalloc(set->dsize, GFP_ATOMIC);
+ if (!e)
+ return -ENOMEM;
+ e->id = d->id;
+ INIT_LIST_HEAD(&e->list);
+ list_set_init_extensions(set, ext, e);
+ if (n)
+ list_set_replace(set, e, n);
+ else if (next)
+ list_add_tail_rcu(&e->list, &next->list);
+ else if (prev)
+ list_add_rcu(&e->list, &prev->list);
+ else
+ list_add_tail_rcu(&e->list, &map->members);
- return ret;
+ return 0;
}
static int
@@ -344,32 +321,30 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
{
struct list_set *map = set->data;
struct set_adt_elem *d = value;
- struct set_elem *e;
- u32 i;
-
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return d->before != 0 ? -IPSET_ERR_REF_EXIST
- : -IPSET_ERR_EXIST;
- else if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set)))
+ struct set_elem *e, *next, *prev = NULL;
+
+ list_for_each_entry(e, &map->members, list) {
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
continue;
- else if (e->id != d->id)
+ else if (e->id != d->id) {
+ prev = e;
continue;
+ }
- if (d->before == 0)
- return list_set_del(set, i);
- else if (d->before > 0) {
- if (!id_eq(set, i + 1, d->refid))
+ if (d->before > 0) {
+ next = list_next_entry(e, list);
+ if (list_is_last(&e->list, &map->members) ||
+ next->id != d->refid)
return -IPSET_ERR_REF_EXIST;
- return list_set_del(set, i);
- } else if (i == 0 || !id_eq(set, i - 1, d->refid))
- return -IPSET_ERR_REF_EXIST;
- else
- return list_set_del(set, i);
+ } else if (d->before < 0) {
+ if (!prev || prev->id != d->refid)
+ return -IPSET_ERR_REF_EXIST;
+ }
+ list_set_del(set, e);
+ return 0;
}
- return -IPSET_ERR_EXIST;
+ return d->before != 0 ? -IPSET_ERR_REF_EXIST : -IPSET_ERR_EXIST;
}
static int
@@ -383,19 +358,13 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set *s;
int ret = 0;
- if (unlikely(!tb[IPSET_ATTR_NAME] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (unlikely(!tb[IPSET_ATTR_NAME] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -410,6 +379,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
e.before = f & IPSET_FLAG_BEFORE;
}
@@ -447,27 +417,26 @@ static void
list_set_flush(struct ip_set *set)
{
struct list_set *map = set->data;
- struct set_elem *e;
- u32 i;
-
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id != IPSET_INVALID_ID) {
- ip_set_put_byindex(map->net, e->id);
- ip_set_ext_destroy(set, e);
- e->id = IPSET_INVALID_ID;
- }
- }
+ struct set_elem *e, *n;
+
+ list_for_each_entry_safe(e, n, &map->members, list)
+ list_set_del(set, e);
}
static void
list_set_destroy(struct ip_set *set)
{
struct list_set *map = set->data;
+ struct set_elem *e, *n;
if (SET_WITH_TIMEOUT(set))
del_timer_sync(&map->gc);
- list_set_flush(set);
+ list_for_each_entry_safe(e, n, &map->members, list) {
+ list_del(&e->list);
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+ kfree(e);
+ }
kfree(map);
set->data = NULL;
@@ -478,6 +447,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
{
const struct list_set *map = set->data;
struct nlattr *nested;
+ struct set_elem *e;
+ u32 n = 0;
+
+ list_for_each_entry(e, &map->members, list)
+ n++;
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
@@ -485,7 +459,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) + map->size * set->dsize)))
+ htonl(sizeof(*map) + n * set->dsize)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -502,18 +476,22 @@ list_set_list(const struct ip_set *set,
{
const struct list_set *map = set->data;
struct nlattr *atd, *nested;
- u32 i, first = cb->args[IPSET_CB_ARG0];
- const struct set_elem *e;
+ u32 i = 0, first = cb->args[IPSET_CB_ARG0];
+ struct set_elem *e;
+ int ret = 0;
atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
- for (; cb->args[IPSET_CB_ARG0] < map->size;
- cb->args[IPSET_CB_ARG0]++) {
- i = cb->args[IPSET_CB_ARG0];
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- goto finish;
+ list_for_each_entry(e, &map->members, list) {
+ if (i == first)
+ break;
+ i++;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_from(e, &map->members, list) {
+ i++;
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -521,9 +499,10 @@ list_set_list(const struct ip_set *set,
if (!nested) {
if (i == first) {
nla_nest_cancel(skb, atd);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
+ ret = -EMSGSIZE;
+ goto out;
+ }
+ goto nla_put_failure;
}
if (nla_put_string(skb, IPSET_ATTR_NAME,
ip_set_name_byindex(map->net, e->id)))
@@ -532,20 +511,23 @@ list_set_list(const struct ip_set *set,
goto nla_put_failure;
ipset_nest_end(skb, nested);
}
-finish:
+
ipset_nest_end(skb, atd);
/* Set listing finished */
cb->args[IPSET_CB_ARG0] = 0;
- return 0;
+ goto out;
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(i == first)) {
cb->args[IPSET_CB_ARG0] = 0;
- return -EMSGSIZE;
+ ret = -EMSGSIZE;
}
+ cb->args[IPSET_CB_ARG0] = i - 1;
ipset_nest_end(skb, atd);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static bool
@@ -577,12 +559,12 @@ static const struct ip_set_type_variant set_variant = {
static void
list_set_gc(unsigned long ul_set)
{
- struct ip_set *set = (struct ip_set *) ul_set;
+ struct ip_set *set = (struct ip_set *)ul_set;
struct list_set *map = set->data;
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
set_cleanup_entries(set);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&map->gc);
@@ -594,7 +576,7 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
struct list_set *map = set->data;
init_timer(&map->gc);
- map->gc.data = (unsigned long) set;
+ map->gc.data = (unsigned long)set;
map->gc.function = gc;
map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&map->gc);
@@ -606,24 +588,16 @@ static bool
init_list_set(struct net *net, struct ip_set *set, u32 size)
{
struct list_set *map;
- struct set_elem *e;
- u32 i;
- map = kzalloc(sizeof(*map) +
- min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize,
- GFP_KERNEL);
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
if (!map)
return false;
map->size = size;
map->net = net;
+ INIT_LIST_HEAD(&map->members);
set->data = map;
- for (i = 0; i < size; i++) {
- e = list_set_elem(set, map, i);
- e->id = IPSET_INVALID_ID;
- }
-
return true;
}
@@ -644,7 +618,8 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
size = IP_SET_LIST_MIN_SIZE;
set->variant = &set_variant;
- set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem));
+ set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem),
+ __alignof__(struct set_elem));
if (!init_list_set(net, set, size))
return -ENOMEM;
if (tb[IPSET_ATTR_TIMEOUT]) {
@@ -678,7 +653,8 @@ static struct ip_set_type list_set_type __read_mostly = {
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -695,6 +671,7 @@ list_set_init(void)
static void __exit
list_set_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&list_set_type);
}
diff --git a/kernel/net/netfilter/ipset/pfxlen.c b/kernel/net/netfilter/ipset/pfxlen.c
index 04d15fdc9..1c8a42c10 100644
--- a/kernel/net/netfilter/ipset/pfxlen.c
+++ b/kernel/net/netfilter/ipset/pfxlen.c
@@ -1,9 +1,7 @@
#include <linux/export.h>
#include <linux/netfilter/ipset/pfxlen.h>
-/*
- * Prefixlen maps for fast conversions, by Jan Engelhardt.
- */
+/* Prefixlen maps for fast conversions, by Jan Engelhardt. */
#define E(a, b, c, d) \
{.ip6 = { \
@@ -11,8 +9,7 @@
htonl(c), htonl(d), \
} }
-/*
- * This table works for both IPv4 and IPv6;
+/* This table works for both IPv4 and IPv6;
* just use prefixlen_netmask_map[prefixlength].ip.
*/
const union nf_inet_addr ip_set_netmask_map[] = {
@@ -149,13 +146,12 @@ const union nf_inet_addr ip_set_netmask_map[] = {
EXPORT_SYMBOL_GPL(ip_set_netmask_map);
#undef E
-#define E(a, b, c, d) \
- {.ip6 = { (__force __be32) a, (__force __be32) b, \
- (__force __be32) c, (__force __be32) d, \
+#define E(a, b, c, d) \
+ {.ip6 = { (__force __be32)a, (__force __be32)b, \
+ (__force __be32)c, (__force __be32)d, \
} }
-/*
- * This table works for both IPv4 and IPv6;
+/* This table works for both IPv4 and IPv6;
* just use prefixlen_hostmask_map[prefixlength].ip.
*/
const union nf_inet_addr ip_set_hostmask_map[] = {
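The comment kept in this hunk spells out how the two prefix-length tables are meant to be used: index either array with the prefix length and take the .ip member for IPv4 or .ip6 for IPv6. A minimal usage sketch follows, assuming an IPv4 address already in network byte order; example_ip4_network() is a hypothetical helper added for illustration and is not part of the patch:

#include <linux/netfilter/ipset/pfxlen.h>

/* Hypothetical helper: reduce an IPv4 address (network byte order) to its
 * /cidr network address by masking with the exported table, 0 <= cidr <= 32.
 */
static inline __be32 example_ip4_network(__be32 ip, u8 cidr)
{
	return ip & ip_set_netmask_map[cidr].ip;
}

As the reworked E() macros in this hunk show, ip_set_netmask_map stores its entries through htonl() while ip_set_hostmask_map stores the same masks without that conversion, for callers that keep addresses in host byte order.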
diff --git a/kernel/net/netfilter/ipvs/Kconfig b/kernel/net/netfilter/ipvs/Kconfig
index 3b6929dec..b32fb0dbe 100644
--- a/kernel/net/netfilter/ipvs/Kconfig
+++ b/kernel/net/netfilter/ipvs/Kconfig
@@ -162,6 +162,17 @@ config IP_VS_FO
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_OVF
+ tristate "weighted overflow scheduling"
+ ---help---
+ The weighted overflow scheduling algorithm directs network
+ connections to the server with the highest weight that is
+ currently available and overflows to the next when active
+ connections exceed the node's weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
config IP_VS_LBLC
tristate "locality-based least-connection scheduling"
---help---
diff --git a/kernel/net/netfilter/ipvs/Makefile b/kernel/net/netfilter/ipvs/Makefile
index 38b2723b2..67f3f4389 100644
--- a/kernel/net/netfilter/ipvs/Makefile
+++ b/kernel/net/netfilter/ipvs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o
+obj-$(CONFIG_IP_VS_OVF) += ip_vs_ovf.o
obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
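The IP_VS_OVF Kconfig entry and the Makefile hook above add the weighted overflow scheduler: traffic goes to the highest-weight destination that still has capacity and overflows to the next one once its active connections exceed its weight. Below is a minimal sketch of that selection rule written against the usual IPVS destination fields (weight, activeconns, IP_VS_DEST_F_OVERLOAD); it illustrates the algorithm as the help text describes it and is not the ip_vs_ovf.c introduced by the patch:

#include <net/ip_vs.h>

/* Illustrative only: weighted overflow pick.  Keep the highest-weight
 * destination whose active connections have not yet exceeded its weight;
 * overloaded or zero-weight destinations are skipped.
 */
static struct ip_vs_dest *example_ovf_pick(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *best = NULL;
	int w, best_w = 0;

	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		w = atomic_read(&dest->weight);
		if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
		    w <= 0 || atomic_read(&dest->activeconns) > w)
			continue;	/* full or unusable: overflow to the next */
		if (!best || w > best_w) {
			best = dest;
			best_w = w;
		}
	}
	return best;	/* NULL when every destination is saturated */
}

Because the pick depends only on the current weights and active connection counts, a destination that fills up is simply skipped on the next scheduling decision and load spills over to the next-heaviest server, which is exactly the behaviour the help text advertises.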
diff --git a/kernel/net/netfilter/ipvs/ip_vs_app.c b/kernel/net/netfilter/ipvs/ip_vs_app.c
index dfd7b65b3..0328f7250 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_app.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_app.c
@@ -75,7 +75,7 @@ static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
* Allocate/initialize app incarnation and register it in proto apps.
*/
static int
-ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+ip_vs_app_inc_new(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto,
__u16 port)
{
struct ip_vs_protocol *pp;
@@ -107,7 +107,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
}
}
- ret = pp->register_app(net, inc);
+ ret = pp->register_app(ipvs, inc);
if (ret)
goto out;
@@ -127,7 +127,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
* Release app incarnation
*/
static void
-ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
+ip_vs_app_inc_release(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
struct ip_vs_protocol *pp;
@@ -135,7 +135,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
return;
if (pp->unregister_app)
- pp->unregister_app(net, inc);
+ pp->unregister_app(ipvs, inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
pp->name, inc->name, ntohs(inc->port));
@@ -175,14 +175,14 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc)
* Register an application incarnation in protocol applications
*/
int
-register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto,
__u16 port)
{
int result;
mutex_lock(&__ip_vs_app_mutex);
- result = ip_vs_app_inc_new(net, app, proto, port);
+ result = ip_vs_app_inc_new(ipvs, app, proto, port);
mutex_unlock(&__ip_vs_app_mutex);
@@ -191,15 +191,11 @@ register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
/* Register application for netns */
-struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app)
+struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_app *a;
int err = 0;
- if (!ipvs)
- return ERR_PTR(-ENOENT);
-
mutex_lock(&__ip_vs_app_mutex);
list_for_each_entry(a, &ipvs->app_list, a_list) {
@@ -230,21 +226,17 @@ out_unlock:
* We are sure there are no app incarnations attached to services
* Caller should use synchronize_rcu() or rcu_barrier()
*/
-void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
+void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_app *a, *anxt, *inc, *nxt;
- if (!ipvs)
- return;
-
mutex_lock(&__ip_vs_app_mutex);
list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) {
if (app && strcmp(app->name, a->name))
continue;
list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) {
- ip_vs_app_inc_release(net, inc);
+ ip_vs_app_inc_release(ipvs, inc);
}
list_del(&a->a_list);
@@ -611,17 +603,19 @@ static const struct file_operations ip_vs_app_fops = {
};
#endif
-int __net_init ip_vs_app_net_init(struct net *net)
+int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net *net = ipvs->net;
INIT_LIST_HEAD(&ipvs->app_list);
proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
return 0;
}
-void __net_exit ip_vs_app_net_cleanup(struct net *net)
+void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
- unregister_ip_vs_app(net, NULL /* all */);
+ struct net *net = ipvs->net;
+
+ unregister_ip_vs_app(ipvs, NULL /* all */);
remove_proc_entry("ip_vs_app", net->proc_net);
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_conn.c b/kernel/net/netfilter/ipvs/ip_vs_conn.c
index b0f7b626b..85ca189bd 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_conn.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_conn.c
@@ -108,7 +108,7 @@ static inline void ct_write_unlock_bh(unsigned int key)
/*
* Returns hash value for IPVS connection entry
*/
-static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,
+static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
const union nf_inet_addr *addr,
__be16 port)
{
@@ -116,11 +116,11 @@ static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int pro
if (af == AF_INET6)
return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
(__force u32)port, proto, ip_vs_conn_rnd) ^
- ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+ ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
#endif
return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
ip_vs_conn_rnd) ^
- ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+ ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
}
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
@@ -141,14 +141,14 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
port = p->vport;
}
- return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
+ return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port);
}
static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+ ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
&cp->caddr, cp->cport, NULL, 0, &p);
if (cp->pe) {
@@ -279,7 +279,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
p->protocol == cp->protocol &&
- ip_vs_conn_net_eq(cp, p->net)) {
+ cp->ipvs == p->ipvs) {
if (!__ip_vs_conn_get(cp))
continue;
/* HIT */
@@ -314,33 +314,34 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
}
static int
-ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs,
+ int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
- int inverse, struct ip_vs_conn_param *p)
+ struct ip_vs_conn_param *p)
{
__be16 _ports[2], *pptr;
- struct net *net = skb_net(skb);
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return 1;
- if (likely(!inverse))
- ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+ if (likely(!ip_vs_iph_inverse(iph)))
+ ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr,
pptr[0], &iph->daddr, pptr[1], p);
else
- ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+ ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr,
pptr[1], &iph->saddr, pptr[0], p);
return 0;
}
struct ip_vs_conn *
-ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph, int inverse)
+ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af,
+ const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
{
struct ip_vs_conn_param p;
- if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
return NULL;
return ip_vs_conn_in_get(&p);
@@ -359,7 +360,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (unlikely(p->pe_data && p->pe->ct_match)) {
- if (!ip_vs_conn_net_eq(cp, p->net))
+ if (cp->ipvs != p->ipvs)
continue;
if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
if (__ip_vs_conn_get(cp))
@@ -377,7 +378,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
p->vport == cp->vport && p->cport == cp->cport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
p->protocol == cp->protocol &&
- ip_vs_conn_net_eq(cp, p->net)) {
+ cp->ipvs == p->ipvs) {
if (__ip_vs_conn_get(cp))
goto out;
}
@@ -418,7 +419,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
p->protocol == cp->protocol &&
- ip_vs_conn_net_eq(cp, p->net)) {
+ cp->ipvs == p->ipvs) {
if (!__ip_vs_conn_get(cp))
continue;
/* HIT */
@@ -439,12 +440,13 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
}
struct ip_vs_conn *
-ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph, int inverse)
+ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
+ const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
{
struct ip_vs_conn_param p;
- if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
return NULL;
return ip_vs_conn_out_get(&p);
@@ -638,7 +640,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
* so we can make the assumption that the svc_af is the same as the
* dest_af
*/
- dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr,
+ dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr,
cp->dport, &cp->vaddr, cp->vport,
cp->protocol, cp->fwmark, cp->flags);
if (dest) {
@@ -668,7 +670,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
#endif
ip_vs_bind_xmit(cp);
- pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
+ pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol);
if (pd && atomic_read(&pd->appcnt))
ip_vs_bind_app(cp, pd->pp);
}
@@ -746,7 +748,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs,
int ip_vs_check_template(struct ip_vs_conn *ct)
{
struct ip_vs_dest *dest = ct->dest;
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
+ struct netns_ipvs *ipvs = ct->ipvs;
/*
* Checking the dest server status.
@@ -800,8 +802,7 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head)
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
- struct net *net = ip_vs_conn_net(cp);
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct netns_ipvs *ipvs = cp->ipvs;
/*
* do I control anybody?
@@ -847,7 +848,7 @@ static void ip_vs_conn_expire(unsigned long data)
cp->timeout = 60*HZ;
if (ipvs->sync_state & IP_VS_STATE_MASTER)
- ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
+ ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));
ip_vs_conn_put(cp);
}
@@ -875,8 +876,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
- struct netns_ipvs *ipvs = net_ipvs(p->net);
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+ struct netns_ipvs *ipvs = p->ipvs;
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
p->protocol);
cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
@@ -887,7 +888,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
INIT_HLIST_NODE(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
- ip_vs_conn_net_set(cp, p->net);
+ cp->ipvs = ipvs;
cp->af = p->af;
cp->daf = dest_af;
cp->protocol = p->protocol;
@@ -1061,7 +1062,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
size_t len = 0;
char dbuf[IP_VS_ADDRSTRLEN];
- if (!ip_vs_conn_net_eq(cp, net))
+ if (!net_eq(cp->ipvs->net, net))
return 0;
if (cp->pe_data) {
pe_data[0] = ' ';
@@ -1146,7 +1147,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
const struct ip_vs_conn *cp = v;
struct net *net = seq_file_net(seq);
- if (!ip_vs_conn_net_eq(cp, net))
+ if (!net_eq(cp->ipvs->net, net))
return 0;
#ifdef CONFIG_IP_VS_IPV6
@@ -1240,7 +1241,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
}
/* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(struct net *net)
+void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
int idx;
struct ip_vs_conn *cp, *cp_c;
@@ -1256,7 +1257,7 @@ void ip_vs_random_dropentry(struct net *net)
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
- if (!ip_vs_conn_net_eq(cp, net))
+ if (cp->ipvs != ipvs)
continue;
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
@@ -1308,18 +1309,17 @@ void ip_vs_random_dropentry(struct net *net)
/*
* Flush all the connection entries in the ip_vs_conn_tab
*/
-static void ip_vs_conn_flush(struct net *net)
+static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
{
int idx;
struct ip_vs_conn *cp, *cp_c;
- struct netns_ipvs *ipvs = net_ipvs(net);
flush_again:
rcu_read_lock();
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
- if (!ip_vs_conn_net_eq(cp, net))
+ if (cp->ipvs != ipvs)
continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
@@ -1345,23 +1345,22 @@ flush_again:
/*
* per netns init and exit
*/
-int __net_init ip_vs_conn_net_init(struct net *net)
+int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
atomic_set(&ipvs->conn_count, 0);
- proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
- proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
+ proc_create("ip_vs_conn", 0, ipvs->net->proc_net, &ip_vs_conn_fops);
+ proc_create("ip_vs_conn_sync", 0, ipvs->net->proc_net,
+ &ip_vs_conn_sync_fops);
return 0;
}
-void __net_exit ip_vs_conn_net_cleanup(struct net *net)
+void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
/* flush all the connection entries first */
- ip_vs_conn_flush(net);
- remove_proc_entry("ip_vs_conn", net->proc_net);
- remove_proc_entry("ip_vs_conn_sync", net->proc_net);
+ ip_vs_conn_flush(ipvs);
+ remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
+ remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
}
int __init ip_vs_conn_init(void)
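
With cp->ipvs stored in every connection, the per-netns checks in ip_vs_conn.c reduce to a plain pointer compare, as the flush and random-drop paths above show. The sketch below is a hypothetical helper (not in this patch) that walks the shared ip_vs_conn_tab the same way and counts the entries owned by one namespace; it assumes it lives in ip_vs_conn.c so the file-local table and its size are in scope.

/* Hypothetical helper, assumed to sit in ip_vs_conn.c next to
 * ip_vs_conn_flush() so ip_vs_conn_tab[] and ip_vs_conn_tab_size
 * are visible.
 */
static unsigned int example_count_ns_conns(struct netns_ipvs *ipvs)
{
	unsigned int count = 0;
	struct ip_vs_conn *cp;
	int idx;

	rcu_read_lock();
	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
			if (cp->ipvs != ipvs)	/* owned by another netns */
				continue;
			count++;
		}
	}
	rcu_read_unlock();

	return count;
}
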
diff --git a/kernel/net/netfilter/ipvs/ip_vs_core.c b/kernel/net/netfilter/ipvs/ip_vs_core.c
index 5d2b806a8..f57b4dcdb 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_core.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_core.c
@@ -112,7 +112,7 @@ static inline void
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
- struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+ struct netns_ipvs *ipvs = cp->ipvs;
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
struct ip_vs_cpu_stats *s;
@@ -146,7 +146,7 @@ static inline void
ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
- struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+ struct netns_ipvs *ipvs = cp->ipvs;
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
struct ip_vs_cpu_stats *s;
@@ -179,7 +179,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
static inline void
ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
{
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct netns_ipvs *ipvs = svc->ipvs;
struct ip_vs_cpu_stats *s;
s = this_cpu_ptr(cp->dest->stats.cpustats);
@@ -215,7 +215,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
const union nf_inet_addr *vaddr, __be16 vport,
struct ip_vs_conn_param *p)
{
- ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr,
vport, p);
p->pe = rcu_dereference(svc->pe);
if (p->pe && p->pe->fill_param)
@@ -245,20 +245,30 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
union nf_inet_addr snet; /* source network of the client,
after masking */
+ const union nf_inet_addr *src_addr, *dst_addr;
+
+ if (likely(!ip_vs_iph_inverse(iph))) {
+ src_addr = &iph->saddr;
+ dst_addr = &iph->daddr;
+ } else {
+ src_addr = &iph->daddr;
+ dst_addr = &iph->saddr;
+ }
+
/* Mask saddr with the netmask to adjust template granularity */
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
- ipv6_addr_prefix(&snet.in6, &iph->saddr.in6,
+ ipv6_addr_prefix(&snet.in6, &src_addr->in6,
(__force __u32) svc->netmask);
else
#endif
- snet.ip = iph->saddr.ip & svc->netmask;
+ snet.ip = src_addr->ip & svc->netmask;
IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
"mnet %s\n",
- IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port),
- IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port),
+ IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port),
IP_VS_DBG_ADDR(svc->af, &snet));
/*
@@ -276,7 +286,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
*/
{
int protocol = iph->protocol;
- const union nf_inet_addr *vaddr = &iph->daddr;
+ const union nf_inet_addr *vaddr = dst_addr;
__be16 vport = 0;
if (dst_port == svc->port) {
@@ -319,7 +329,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* return *ignored=0 i.e. ICMP and NF_DROP
*/
sched = rcu_dereference(svc->scheduler);
- dest = sched->schedule(svc, skb, iph);
+ if (sched) {
+ /* read svc->sched_data after svc->scheduler */
+ smp_rmb();
+ dest = sched->schedule(svc, skb, iph);
+ } else {
+ dest = NULL;
+ }
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
@@ -360,8 +376,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/*
* Create a new connection according to the template
*/
- ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr,
- src_port, &iph->daddr, dst_port, &param);
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr,
+ src_port, dst_addr, dst_port, &param);
cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest,
skb->mark);
@@ -412,7 +428,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_conn *cp = NULL;
struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
- __be16 _ports[2], *pptr;
+ __be16 _ports[2], *pptr, cport, vport;
+ const void *caddr, *vaddr;
unsigned int flags;
*ignored = 1;
@@ -423,14 +440,26 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
if (pptr == NULL)
return NULL;
+ if (likely(!ip_vs_iph_inverse(iph))) {
+ cport = pptr[0];
+ caddr = &iph->saddr;
+ vport = pptr[1];
+ vaddr = &iph->daddr;
+ } else {
+ cport = pptr[1];
+ caddr = &iph->daddr;
+ vport = pptr[0];
+ vaddr = &iph->saddr;
+ }
+
/*
* FTPDATA needs this check when using local real server.
* Never schedule Active FTPDATA connections from real server.
* For LVS-NAT they must be already created. For other methods
* with persistence the connection is created on SYN+ACK.
*/
- if (pptr[0] == FTPDATA) {
- IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ if (cport == FTPDATA) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
"Not scheduling FTPDATA");
return NULL;
}
@@ -438,19 +467,25 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
/*
* Do not schedule replies from local real server.
*/
- if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- (cp = pp->conn_in_get(svc->af, skb, iph, 1))) {
- IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
- "Not scheduling reply for existing connection");
- __ip_vs_conn_put(cp);
- return NULL;
+ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) {
+ iph->hdr_flags ^= IP_VS_HDR_INVERSE;
+ cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph);
+ iph->hdr_flags ^= IP_VS_HDR_INVERSE;
+
+ if (cp) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
+ "Not scheduling reply for existing"
+ " connection");
+ __ip_vs_conn_put(cp);
+ return NULL;
+ }
}
/*
* Persistent service
*/
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
- return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored,
+ return ip_vs_sched_persist(svc, skb, cport, vport, ignored,
iph);
*ignored = 0;
@@ -458,7 +493,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
/*
* Non-persistent service
*/
- if (!svc->fwmark && pptr[1] != svc->port) {
+ if (!svc->fwmark && vport != svc->port) {
if (!svc->port)
pr_err("Schedule: port zero only supported "
"in persistent services, "
@@ -467,7 +502,13 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
}
sched = rcu_dereference(svc->scheduler);
- dest = sched->schedule(svc, skb, iph);
+ if (sched) {
+ /* read svc->sched_data after svc->scheduler */
+ smp_rmb();
+ dest = sched->schedule(svc, skb, iph);
+ } else {
+ dest = NULL;
+ }
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
@@ -483,11 +524,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
- &iph->saddr, pptr[0], &iph->daddr,
- pptr[1], &p);
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
+ caddr, cport, vaddr, vport, &p);
cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
- dest->port ? dest->port : pptr[1],
+ dest->port ? dest->port : vport,
flags, dest, skb->mark);
if (!cp) {
*ignored = -1;
@@ -507,6 +547,15 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
return cp;
}
+static inline int ip_vs_addr_is_unicast(struct net *net, int af,
+ union nf_inet_addr *addr)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST;
+#endif
+ return (inet_addr_type(net, addr->ip) == RTN_UNICAST);
+}
/*
* Pass or drop the packet.
@@ -516,33 +565,21 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
{
- __be16 _ports[2], *pptr;
-#ifdef CONFIG_SYSCTL
- struct net *net;
- struct netns_ipvs *ipvs;
- int unicast;
-#endif
+ __be16 _ports[2], *pptr, dport;
+ struct netns_ipvs *ipvs = svc->ipvs;
+ struct net *net = ipvs->net;
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
- if (pptr == NULL) {
+ if (!pptr)
return NF_DROP;
- }
-
-#ifdef CONFIG_SYSCTL
- net = skb_net(skb);
-
-#ifdef CONFIG_IP_VS_IPV6
- if (svc->af == AF_INET6)
- unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;
- else
-#endif
- unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);
+ dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0];
/* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is a non-local unicast, then create
a cache_bypass connection entry */
- ipvs = net_ipvs(net);
- if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
+ if (sysctl_cache_bypass(ipvs) && svc->fwmark &&
+ !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) &&
+ ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) {
int ret;
struct ip_vs_conn *cp;
unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
@@ -554,7 +591,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
&iph->saddr, pptr[0],
&iph->daddr, pptr[1], &p);
cp = ip_vs_conn_new(&p, svc->af, &daddr, 0,
@@ -578,7 +615,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ip_vs_conn_put(cp);
return ret;
}
-#endif
/*
* When the virtual ftp service is presented, packets destined
@@ -586,9 +622,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* listed in the ipvs table), pass the packets, because it is
* not ipvs job to decide to drop the packets.
*/
- if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
+ if (svc->port == FTPPORT && dport != FTPPORT)
return NF_ACCEPT;
+ if (unlikely(ip_vs_iph_icmp(iph)))
+ return NF_DROP;
+
/*
* Notify the client that the destination is unreachable, and
* release the socket buffer.
@@ -598,11 +637,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
*/
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6) {
- if (!skb->dev) {
- struct net *net_ = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net_->loopback_dev;
- }
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
} else
#endif
@@ -613,15 +649,13 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
#ifdef CONFIG_SYSCTL
-static int sysctl_snat_reroute(struct sk_buff *skb)
+static int sysctl_snat_reroute(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
return ipvs->sysctl_snat_reroute;
}
-static int sysctl_nat_icmp_send(struct net *net)
+static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
return ipvs->sysctl_nat_icmp_send;
}
@@ -632,8 +666,8 @@ static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
#else
-static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; }
-static int sysctl_nat_icmp_send(struct net *net) { return 0; }
+static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
+static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
#endif
@@ -652,12 +686,13 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
return IP_DEFRAG_VS_OUT;
}
-static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs,
+ struct sk_buff *skb, u_int32_t user)
{
int err;
local_bh_disable();
- err = ip_defrag(skb, user);
+ err = ip_defrag(ipvs->net, skb, user);
local_bh_enable();
if (!err)
ip_send_check(ip_hdr(skb));
@@ -665,10 +700,10 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
return err;
}
-static int ip_vs_route_me_harder(int af, struct sk_buff *skb,
- unsigned int hooknum)
+static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
+ struct sk_buff *skb, unsigned int hooknum)
{
- if (!sysctl_snat_reroute(skb))
+ if (!sysctl_snat_reroute(ipvs))
return 0;
/* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */
if (NF_INET_LOCAL_IN == hooknum)
@@ -678,12 +713,12 @@ static int ip_vs_route_me_harder(int af, struct sk_buff *skb,
struct dst_entry *dst = skb_dst(skb);
if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
- ip6_route_me_harder(skb) != 0)
+ ip6_route_me_harder(ipvs->net, skb) != 0)
return 1;
} else
#endif
if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
return 1;
return 0;
@@ -836,7 +871,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
#endif
ip_vs_nat_icmp(skb, pp, cp, 1);
- if (ip_vs_route_me_harder(af, skb, hooknum))
+ if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto out;
/* do the statistics and put it back */
@@ -860,8 +895,8 @@ out:
* Find any that might be relevant, check against existing connections.
* Currently handles error types - unreachable, quench, ttl exceeded.
*/
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
- unsigned int hooknum)
+static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ int *related, unsigned int hooknum)
{
struct iphdr *iph;
struct icmphdr _icmph, *ic;
@@ -876,7 +911,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
/* reassemble IP fragments */
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -922,10 +957,10 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
"Checking outgoing ICMP for");
- ip_vs_fill_ip4hdr(cih, &ciph);
- ciph.len += offset;
+ ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph);
+
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET, skb, &ciph, 1);
+ cp = pp->conn_out_get(ipvs, AF_INET, skb, &ciph);
if (!cp)
return NF_ACCEPT;
@@ -935,16 +970,16 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
}
#ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
- unsigned int hooknum, struct ip_vs_iphdr *ipvsh)
+static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ int *related, unsigned int hooknum,
+ struct ip_vs_iphdr *ipvsh)
{
struct icmp6hdr _icmph, *ic;
- struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */
struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
union nf_inet_addr snet;
- unsigned int writable;
+ unsigned int offset;
*related = 1;
ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
@@ -972,31 +1007,23 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
ic->icmp6_type, ntohs(icmpv6_id(ic)),
&ipvsh->saddr, &ipvsh->daddr);
- /* Now find the contained IP header */
- ciph.len = ipvsh->len + sizeof(_icmph);
- ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
- if (ip6h == NULL)
+ if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph),
+ true, &ciph))
return NF_ACCEPT; /* The packet looks wrong, ignore */
- ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */
- ciph.daddr.in6 = ip6h->daddr;
- /* skip possible IPv6 exthdrs of contained IPv6 packet */
- ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
- if (ciph.protocol < 0)
- return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
pp = ip_vs_proto_get(ciph.protocol);
if (!pp)
return NF_ACCEPT;
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);
+ cp = pp->conn_out_get(ipvs, AF_INET6, skb, &ciph);
if (!cp)
return NF_ACCEPT;
snet.in6 = ciph.saddr.in6;
- writable = ciph.len;
+ offset = ciph.len;
return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
- pp, writable, sizeof(struct ipv6hdr),
+ pp, offset, sizeof(struct ipv6hdr),
hooknum);
}
#endif
@@ -1081,7 +1108,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
{
struct ip_vs_protocol *pp = pd->pp;
- IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
+ IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
if (!skb_make_writable(skb, iph->len))
goto drop;
@@ -1115,10 +1142,10 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* if it came from this machine itself. So re-compute
* the routing information.
*/
- if (ip_vs_route_me_harder(af, skb, hooknum))
+ if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto drop;
- IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
+ IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
@@ -1143,13 +1170,13 @@ drop:
* Check if outgoing packet belongs to the established ip_vs_conn.
*/
static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
- struct net *net = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
+ struct sock *sk;
EnterFunction(11);
@@ -1157,29 +1184,27 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (skb->ipvs_property)
return NF_ACCEPT;
+ sk = skb_to_full_sk(skb);
/* Bad... Do not break raw sockets */
- if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
return NF_ACCEPT;
}
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
- net = skb_net(skb);
- if (!net_ipvs(net)->enable)
+ if (!ipvs->enable)
return NF_ACCEPT;
- ip_vs_fill_iph_skb(af, skb, &iph);
+ ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
- int verdict = ip_vs_out_icmp_v6(skb, &related,
+ int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,
hooknum, &iph);
if (related)
@@ -1189,13 +1214,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
- int verdict = ip_vs_out_icmp(skb, &related, hooknum);
+ int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);
if (related)
return verdict;
}
- pd = ip_vs_proto_data_get(net, iph.protocol);
+ pd = ip_vs_proto_data_get(ipvs, iph.protocol);
if (unlikely(!pd))
return NF_ACCEPT;
pp = pd->pp;
@@ -1205,21 +1230,21 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (af == AF_INET)
#endif
if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
- if (ip_vs_gather_frags(skb,
+ if (ip_vs_gather_frags(ipvs, skb,
ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
- ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
+ ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);
}
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(af, skb, &iph, 0);
+ cp = pp->conn_out_get(ipvs, af, skb, &iph);
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
- if (sysctl_nat_icmp_send(net) &&
+ if (sysctl_nat_icmp_send(ipvs) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
pp->protocol == IPPROTO_SCTP)) {
@@ -1229,7 +1254,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
pptr[0])) {
/*
* Notify the real server: there is no
@@ -1246,7 +1271,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (!skb->dev)
- skb->dev = net->loopback_dev;
+ skb->dev = ipvs->net->loopback_dev;
icmpv6_send(skb,
ICMPV6_DEST_UNREACH,
ICMPV6_PORT_UNREACH,
@@ -1260,7 +1285,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
}
}
}
- IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
"ip_vs_out: packet continues traversal as normal");
return NF_ACCEPT;
}
@@ -1271,10 +1296,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_reply4(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_out(ops->hooknum, skb, AF_INET);
+ return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
}
/*
@@ -1282,10 +1307,10 @@ ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_local_reply4(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_out(ops->hooknum, skb, AF_INET);
+ return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1296,10 +1321,10 @@ ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_reply6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_out(ops->hooknum, skb, AF_INET6);
+ return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
}
/*
@@ -1307,14 +1332,51 @@ ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_local_reply6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_out(ops->hooknum, skb, AF_INET6);
+ return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
}
#endif
+static unsigned int
+ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_protocol *pp = pd->pp;
+
+ if (!iph->fragoffs) {
+ /* No (second) fragments need to enter here, as nf_defrag_ipv6
+ * replayed fragment zero will already have created the cp
+ */
+
+ /* Schedule and create new connection entry into cpp */
+ if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph))
+ return 0;
+ }
+
+ if (unlikely(!*cpp)) {
+ /* sorry, all this trouble for a no-hit :) */
+ IP_VS_DBG_PKT(12, af, pp, skb, iph->off,
+ "ip_vs_in: packet continues traversal as normal");
+ if (iph->fragoffs) {
+ /* Fragment that couldn't be mapped to a conn entry
+ * is missing module nf_defrag_ipv6
+ */
+ IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+ IP_VS_DBG_PKT(7, af, pp, skb, iph->off,
+ "unhandled fragment");
+ }
+ *verdict = NF_ACCEPT;
+ return 0;
+ }
+
+ return 1;
+}
+
/*
* Handle ICMP messages in the outside-to-inside direction (incoming).
* Find any that might be relevant, check against existing connections,
@@ -1322,9 +1384,9 @@ ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Currently handles error types - unreachable, quench, ttl exceeded.
*/
static int
-ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
+ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
+ unsigned int hooknum)
{
- struct net *net = NULL;
struct iphdr *iph;
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
@@ -1333,13 +1395,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
unsigned int offset, offset2, ihl, verdict;
- bool ipip;
+ bool ipip, new_cp = false;
*related = 1;
/* reassemble IP fragments */
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -1373,8 +1435,6 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- net = skb_net(skb);
-
/* Special case for errors for IPIP packets */
ipip = false;
if (cih->protocol == IPPROTO_IPIP) {
@@ -1390,7 +1450,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
ipip = true;
}
- pd = ip_vs_proto_data_get(net, cih->protocol);
+ pd = ip_vs_proto_data_get(ipvs, cih->protocol);
if (!pd)
return NF_ACCEPT;
pp = pd->pp;
@@ -1404,15 +1464,24 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
"Checking incoming ICMP for");
offset2 = offset;
- ip_vs_fill_ip4hdr(cih, &ciph);
- ciph.len += offset;
+ ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph);
offset = ciph.len;
+
/* The embedded headers contain source and dest in reverse order.
* For IPIP this is error for request, not for reply.
*/
- cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1);
- if (!cp)
- return NF_ACCEPT;
+ cp = pp->conn_in_get(ipvs, AF_INET, skb, &ciph);
+
+ if (!cp) {
+ int v;
+
+ if (!sysctl_schedule_icmp(ipvs))
+ return NF_ACCEPT;
+
+ if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
+ return v;
+ new_cp = true;
+ }
verdict = NF_DROP;
@@ -1443,7 +1512,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
skb_reset_network_header(skb);
IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
- ipv4_update_pmtu(skb, dev_net(skb->dev),
+ ipv4_update_pmtu(skb, ipvs->net,
mtu, 0, 0, 0, 0);
/* Client uses PMTUD? */
if (!(frag_off & htons(IP_DF)))
@@ -1489,23 +1558,26 @@ ignore_ipip:
verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
out:
- __ip_vs_conn_put(cp);
+ if (likely(!new_cp))
+ __ip_vs_conn_put(cp);
+ else
+ ip_vs_conn_put(cp);
return verdict;
}
#ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
- unsigned int hooknum, struct ip_vs_iphdr *iph)
+static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ int *related, unsigned int hooknum,
+ struct ip_vs_iphdr *iph)
{
- struct net *net = NULL;
- struct ipv6hdr _ip6h, *ip6h;
struct icmp6hdr _icmph, *ic;
struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
- unsigned int offs_ciph, writable, verdict;
+ unsigned int offset, verdict;
+ bool new_cp = false;
*related = 1;
@@ -1534,21 +1606,11 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
ic->icmp6_type, ntohs(icmpv6_id(ic)),
&iph->saddr, &iph->daddr);
- /* Now find the contained IP header */
- ciph.len = iph->len + sizeof(_icmph);
- offs_ciph = ciph.len; /* Save ip header offset */
- ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
- if (ip6h == NULL)
- return NF_ACCEPT; /* The packet looks wrong, ignore */
- ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */
- ciph.daddr.in6 = ip6h->daddr;
- /* skip possible IPv6 exthdrs of contained IPv6 packet */
- ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
- if (ciph.protocol < 0)
- return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
-
- net = skb_net(skb);
- pd = ip_vs_proto_data_get(net, ciph.protocol);
+ offset = iph->len + sizeof(_icmph);
+ if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph))
+ return NF_ACCEPT;
+
+ pd = ip_vs_proto_data_get(ipvs, ciph.protocol);
if (!pd)
return NF_ACCEPT;
pp = pd->pp;
@@ -1557,36 +1619,49 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
if (ciph.fragoffs)
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
"Checking incoming ICMPv6 for");
/* The embedded headers contain source and dest in reverse order
* if not from localhost
*/
- cp = pp->conn_in_get(AF_INET6, skb, &ciph,
- (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
+ cp = pp->conn_in_get(ipvs, AF_INET6, skb, &ciph);
+
+ if (!cp) {
+ int v;
+
+ if (!sysctl_schedule_icmp(ipvs))
+ return NF_ACCEPT;
+
+ if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph))
+ return v;
+
+ new_cp = true;
+ }
- if (!cp)
- return NF_ACCEPT;
/* VS/TUN, VS/DR and LOCALNODE just let it go */
if ((hooknum == NF_INET_LOCAL_OUT) &&
(IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
- __ip_vs_conn_put(cp);
- return NF_ACCEPT;
+ verdict = NF_ACCEPT;
+ goto out;
}
/* do the statistics and put it back */
ip_vs_in_stats(cp, skb);
/* Need to mangle contained IPv6 header in ICMPv6 packet */
- writable = ciph.len;
+ offset = ciph.len;
if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
IPPROTO_SCTP == ciph.protocol)
- writable += 2 * sizeof(__u16); /* Also mangle ports */
+ offset += 2 * sizeof(__u16); /* Also mangle ports */
- verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);
+ verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph);
- __ip_vs_conn_put(cp);
+out:
+ if (likely(!new_cp))
+ __ip_vs_conn_put(cp);
+ else
+ ip_vs_conn_put(cp);
return verdict;
}
@@ -1598,16 +1673,15 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
* and send it on its way...
*/
static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
- struct net *net;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
- struct netns_ipvs *ipvs;
int conn_reuse_mode;
+ struct sock *sk;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -1621,7 +1695,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (unlikely((skb->pkt_type != PACKET_HOST &&
hooknum != NF_INET_LOCAL_OUT) ||
!skb_dst(skb))) {
- ip_vs_fill_iph_skb(af, skb, &iph);
+ ip_vs_fill_iph_skb(af, skb, false, &iph);
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
" ignored in hook %u\n",
skb->pkt_type, iph.protocol,
@@ -1629,20 +1703,17 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
- net = skb_net(skb);
- ipvs = net_ipvs(net);
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- ip_vs_fill_iph_skb(af, skb, &iph);
+ ip_vs_fill_iph_skb(af, skb, false, &iph);
/* Bad... Do not break raw sockets */
- if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ sk = skb_to_full_sk(skb);
+ if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
return NF_ACCEPT;
}
@@ -1650,8 +1721,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
- int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
- &iph);
+ int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related,
+ hooknum, &iph);
if (related)
return verdict;
@@ -1660,21 +1731,30 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
- int verdict = ip_vs_in_icmp(skb, &related, hooknum);
+ int verdict = ip_vs_in_icmp(ipvs, skb, &related,
+ hooknum);
if (related)
return verdict;
}
/* Protocol supported? */
- pd = ip_vs_proto_data_get(net, iph.protocol);
- if (unlikely(!pd))
+ pd = ip_vs_proto_data_get(ipvs, iph.protocol);
+ if (unlikely(!pd)) {
+ /* The only way we'll see this packet again is if it's
+ * encapsulated, so mark it with ipvs_property=1 so we
+ * skip it if we're ignoring tunneled packets
+ */
+ if (sysctl_ignore_tunneled(ipvs))
+ skb->ipvs_property = 1;
+
return NF_ACCEPT;
+ }
pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
- cp = pp->conn_in_get(af, skb, &iph, 0);
+ cp = pp->conn_in_get(ipvs, af, skb, &iph);
conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
if (conn_reuse_mode && !iph.fragoffs &&
@@ -1688,32 +1768,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
cp = NULL;
}
- if (unlikely(!cp) && !iph.fragoffs) {
- /* No (second) fragments need to enter here, as nf_defrag_ipv6
- * replayed fragment zero will already have created the cp
- */
+ if (unlikely(!cp)) {
int v;
- /* Schedule and create new connection entry into &cp */
- if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
+ if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
return v;
}
- if (unlikely(!cp)) {
- /* sorry, all this trouble for a no-hit :) */
- IP_VS_DBG_PKT(12, af, pp, skb, 0,
- "ip_vs_in: packet continues traversal as normal");
- if (iph.fragoffs) {
- /* Fragment that couldn't be mapped to a conn entry
- * is missing module nf_defrag_ipv6
- */
- IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
- IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
- }
- return NF_ACCEPT;
- }
+ IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
- IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
@@ -1753,7 +1816,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
pkts = atomic_add_return(1, &cp->in_pkts);
if (ipvs->sync_state & IP_VS_STATE_MASTER)
- ip_vs_sync_conn(net, cp, pkts);
+ ip_vs_sync_conn(ipvs, cp, pkts);
ip_vs_conn_put(cp);
return ret;
@@ -1764,10 +1827,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
* Schedule and forward packets from remote clients
*/
static unsigned int
-ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_remote_request4(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_in(ops->hooknum, skb, AF_INET);
+ return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
}
/*
@@ -1775,10 +1838,10 @@ ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Schedule and forward packets from local clients
*/
static unsigned int
-ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_local_request4(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_in(ops->hooknum, skb, AF_INET);
+ return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1788,10 +1851,10 @@ ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Schedule and forward packets from remote clients
*/
static unsigned int
-ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_remote_request6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_in(ops->hooknum, skb, AF_INET6);
+ return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
}
/*
@@ -1799,10 +1862,10 @@ ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Schedule and forward packets from local clients
*/
static unsigned int
-ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_local_request6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_in(ops->hooknum, skb, AF_INET6);
+ return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
}
#endif
@@ -1818,46 +1881,40 @@ ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
* and send them to ip_vs_in_icmp.
*/
static unsigned int
-ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
int r;
- struct net *net;
- struct netns_ipvs *ipvs;
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
return NF_ACCEPT;
/* ipvs enabled in this netns ? */
- net = skb_net(skb);
- ipvs = net_ipvs(net);
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- return ip_vs_in_icmp(skb, &r, ops->hooknum);
+ return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
}
#ifdef CONFIG_IP_VS_IPV6
static unsigned int
-ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
int r;
- struct net *net;
- struct netns_ipvs *ipvs;
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
struct ip_vs_iphdr iphdr;
- ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
+ ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
if (iphdr.protocol != IPPROTO_ICMPV6)
return NF_ACCEPT;
/* ipvs enabled in this netns ? */
- net = skb_net(skb);
- ipvs = net_ipvs(net);
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
+ return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
}
#endif
@@ -1866,7 +1923,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 2,
@@ -1876,7 +1932,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
* applied to IPVS. */
{
.hook = ip_vs_remote_request4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 1,
@@ -1884,7 +1939,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 1,
@@ -1892,7 +1946,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 2,
@@ -1901,7 +1954,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 99,
@@ -1909,7 +1961,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
@@ -1918,7 +1969,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 2,
@@ -1928,7 +1978,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
* applied to IPVS. */
{
.hook = ip_vs_remote_request6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 1,
@@ -1936,7 +1985,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 1,
@@ -1944,7 +1992,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 2,
@@ -1953,7 +2000,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp_v6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 99,
@@ -1961,7 +2007,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 100,
@@ -1987,22 +2032,22 @@ static int __net_init __ip_vs_init(struct net *net)
atomic_inc(&ipvs_netns_cnt);
net->ipvs = ipvs;
- if (ip_vs_estimator_net_init(net) < 0)
+ if (ip_vs_estimator_net_init(ipvs) < 0)
goto estimator_fail;
- if (ip_vs_control_net_init(net) < 0)
+ if (ip_vs_control_net_init(ipvs) < 0)
goto control_fail;
- if (ip_vs_protocol_net_init(net) < 0)
+ if (ip_vs_protocol_net_init(ipvs) < 0)
goto protocol_fail;
- if (ip_vs_app_net_init(net) < 0)
+ if (ip_vs_app_net_init(ipvs) < 0)
goto app_fail;
- if (ip_vs_conn_net_init(net) < 0)
+ if (ip_vs_conn_net_init(ipvs) < 0)
goto conn_fail;
- if (ip_vs_sync_net_init(net) < 0)
+ if (ip_vs_sync_net_init(ipvs) < 0)
goto sync_fail;
printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
@@ -2013,15 +2058,15 @@ static int __net_init __ip_vs_init(struct net *net)
*/
sync_fail:
- ip_vs_conn_net_cleanup(net);
+ ip_vs_conn_net_cleanup(ipvs);
conn_fail:
- ip_vs_app_net_cleanup(net);
+ ip_vs_app_net_cleanup(ipvs);
app_fail:
- ip_vs_protocol_net_cleanup(net);
+ ip_vs_protocol_net_cleanup(ipvs);
protocol_fail:
- ip_vs_control_net_cleanup(net);
+ ip_vs_control_net_cleanup(ipvs);
control_fail:
- ip_vs_estimator_net_cleanup(net);
+ ip_vs_estimator_net_cleanup(ipvs);
estimator_fail:
net->ipvs = NULL;
return -ENOMEM;
@@ -2029,22 +2074,25 @@ estimator_fail:
static void __net_exit __ip_vs_cleanup(struct net *net)
{
- ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */
- ip_vs_conn_net_cleanup(net);
- ip_vs_app_net_cleanup(net);
- ip_vs_protocol_net_cleanup(net);
- ip_vs_control_net_cleanup(net);
- ip_vs_estimator_net_cleanup(net);
- IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */
+ ip_vs_conn_net_cleanup(ipvs);
+ ip_vs_app_net_cleanup(ipvs);
+ ip_vs_protocol_net_cleanup(ipvs);
+ ip_vs_control_net_cleanup(ipvs);
+ ip_vs_estimator_net_cleanup(ipvs);
+ IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
net->ipvs = NULL;
}
static void __net_exit __ip_vs_dev_cleanup(struct net *net)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
EnterFunction(2);
- net_ipvs(net)->enable = 0; /* Disable packet reception */
+ ipvs->enable = 0; /* Disable packet reception */
smp_wmb();
- ip_vs_sync_net_cleanup(net);
+ ip_vs_sync_net_cleanup(ipvs);
LeaveFunction(2);
}
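
The hook conversions above follow the 4.4 netfilter convention: hooks take (void *priv, skb, state), the owning netns comes from state->net rather than from the skb, and nf_hook_ops loses its .owner field. A condensed, illustrative example of the resulting shape (assuming it sits in ip_vs_core.c next to ip_vs_in) looks like this:

static unsigned int
example_request4(void *priv, struct sk_buff *skb,
		 const struct nf_hook_state *state)
{
	struct netns_ipvs *ipvs = net_ipvs(state->net);

	if (unlikely(!ipvs || !ipvs->enable))
		return NF_ACCEPT;

	/* hook number now comes from the state, not from nf_hook_ops */
	return ip_vs_in(ipvs, state->hook, skb, AF_INET);
}

static struct nf_hook_ops example_ops __read_mostly = {
	.hook		= example_request4,
	.pf		= NFPROTO_IPV4,
	.hooknum	= NF_INET_LOCAL_IN,
	.priority	= NF_IP_PRI_NAT_SRC - 1,
	/* no .owner: the field was dropped from nf_hook_ops in 4.4 */
};

Deriving ipvs from state->net on every invocation is what lets the patch delete the skb_net(skb) lookups that were scattered through ip_vs_out(), ip_vs_in() and their ICMP helpers.
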
diff --git a/kernel/net/netfilter/ipvs/ip_vs_ctl.c b/kernel/net/netfilter/ipvs/ip_vs_ctl.c
index 285eae3a1..e7c1b052c 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_ctl.c
@@ -228,7 +228,7 @@ static void defense_work_handler(struct work_struct *work)
update_defense_level(ipvs);
if (atomic_read(&ipvs->dropentry))
- ip_vs_random_dropentry(ipvs->net);
+ ip_vs_random_dropentry(ipvs);
schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
}
#endif
@@ -263,7 +263,7 @@ static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
* Returns hash value for virtual service
*/
static inline unsigned int
-ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
+ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
const union nf_inet_addr *addr, __be16 port)
{
register unsigned int porth = ntohs(port);
@@ -276,7 +276,7 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
addr->ip6[2]^addr->ip6[3];
#endif
ahash = ntohl(addr_fold);
- ahash ^= ((size_t) net >> 8);
+ ahash ^= ((size_t) ipvs >> 8);
return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
IP_VS_SVC_TAB_MASK;
@@ -285,9 +285,9 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
/*
* Returns hash value of fwmark for virtual service lookup
*/
-static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
+static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark)
{
- return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
+ return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
}
/*
@@ -309,14 +309,14 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
/*
* Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
*/
- hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol,
&svc->addr, svc->port);
hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
* Hash it by fwmark in svc_fwm_table
*/
- hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
+ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark);
hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
@@ -357,21 +357,21 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
* Get service by {netns, proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
unsigned int hash;
struct ip_vs_service *svc;
/* Check for "full" addressed entries */
- hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
+ hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport);
hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
&& (svc->protocol == protocol)
- && net_eq(svc->net, net)) {
+ && (svc->ipvs == ipvs)) {
/* HIT */
return svc;
}
@@ -385,17 +385,17 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol,
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
{
unsigned int hash;
struct ip_vs_service *svc;
/* Check for fwmark addressed entries */
- hash = ip_vs_svc_fwm_hashkey(net, fwmark);
+ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark);
hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
if (svc->fwmark == fwmark && svc->af == af
- && net_eq(svc->net, net)) {
+ && (svc->ipvs == ipvs)) {
/* HIT */
return svc;
}
@@ -406,17 +406,16 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
/* Find service, called under RCU lock */
struct ip_vs_service *
-ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
- struct netns_ipvs *ipvs = net_ipvs(net);
/*
* Check the table hashed by fwmark first
*/
if (fwmark) {
- svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
if (svc)
goto out;
}
@@ -425,7 +424,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
- svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
+ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
@@ -435,7 +434,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
- svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
+ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
@@ -443,7 +442,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
/*
* Check if the catch-all port (port zero) exists
*/
- svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
+ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
}
out:
@@ -543,10 +542,9 @@ static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
}
/* Check if real service by <proto,addr,port> is present */
-bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
const union nf_inet_addr *daddr, __be16 dport)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
unsigned int hash;
struct ip_vs_dest *dest;
@@ -601,7 +599,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
* on the backup.
* Called under RCU lock, no refcnt is returned.
*/
-struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af,
+struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
const union nf_inet_addr *daddr,
__be16 dport,
const union nf_inet_addr *vaddr,
@@ -612,7 +610,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af,
struct ip_vs_service *svc;
__be16 port = dport;
- svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport);
+ svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
@@ -660,7 +658,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
const union nf_inet_addr *daddr, __be16 dport)
{
struct ip_vs_dest *dest;
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct netns_ipvs *ipvs = svc->ipvs;
/*
* Find the destination in trash
@@ -715,10 +713,9 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
* are expired, and the refcnt of each destination in the trash must
* be 0, so we simply release them here.
*/
-static void ip_vs_trash_cleanup(struct net *net)
+static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
{
struct ip_vs_dest *dest, *nxt;
- struct netns_ipvs *ipvs = net_ipvs(net);
del_timer_sync(&ipvs->dest_trash_timer);
/* No need to use dest_trash_lock */
@@ -788,7 +785,7 @@ static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct netns_ipvs *ipvs = svc->ipvs;
struct ip_vs_service *old_svc;
struct ip_vs_scheduler *sched;
int conn_flags;
@@ -842,15 +839,16 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
__ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
- sched = rcu_dereference_protected(svc->scheduler, 1);
if (add) {
- ip_vs_start_estimator(svc->net, &dest->stats);
+ ip_vs_start_estimator(svc->ipvs, &dest->stats);
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
- if (sched->add_dest)
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched && sched->add_dest)
sched->add_dest(svc, dest);
} else {
- if (sched->upd_dest)
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched && sched->upd_dest)
sched->upd_dest(svc, dest);
}
}
@@ -873,12 +871,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
- !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
+ !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
return -EINVAL;
} else
#endif
{
- atype = inet_addr_type(svc->net, udest->addr.ip);
+ atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
}
@@ -1035,12 +1033,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete a destination (must be already unlinked from the service)
*/
-static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
bool cleanup)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- ip_vs_stop_estimator(net, &dest->stats);
+ ip_vs_stop_estimator(ipvs, &dest->stats);
/*
* Remove it from the d-linked list with the real services.
@@ -1078,13 +1074,13 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
svc->num_dests--;
if (dest->af != svc->af)
- net_ipvs(svc->net)->mixed_address_family_dests--;
+ svc->ipvs->mixed_address_family_dests--;
if (svcupd) {
struct ip_vs_scheduler *sched;
sched = rcu_dereference_protected(svc->scheduler, 1);
- if (sched->del_dest)
+ if (sched && sched->del_dest)
sched->del_dest(svc, dest);
}
}
@@ -1119,7 +1115,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete the destination
*/
- __ip_vs_del_dest(svc->net, dest, false);
+ __ip_vs_del_dest(svc->ipvs, dest, false);
LeaveFunction(2);
@@ -1128,8 +1124,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
static void ip_vs_dest_trash_expire(unsigned long data)
{
- struct net *net = (struct net *) data;
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct netns_ipvs *ipvs = (struct netns_ipvs *)data;
struct ip_vs_dest *dest, *next;
unsigned long now = jiffies;
@@ -1162,24 +1157,26 @@ static void ip_vs_dest_trash_expire(unsigned long data)
* Add a service into the service hash table
*/
static int
-ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
+ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
int ret = 0, i;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
- struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
/* Lookup the scheduler by 'u->sched_name' */
- sched = ip_vs_scheduler_get(u->sched_name);
- if (sched == NULL) {
- pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
- ret = -ENOENT;
- goto out_err;
+ if (strcmp(u->sched_name, "none")) {
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (!sched) {
+ pr_info("Scheduler module ip_vs_%s not found\n",
+ u->sched_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
}
if (u->pe_name && *u->pe_name) {
@@ -1233,17 +1230,19 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
- svc->net = net;
+ svc->ipvs = ipvs;
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
- ret = ip_vs_bind_scheduler(svc, sched);
- if (ret)
- goto out_err;
- sched = NULL;
+ if (sched) {
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret)
+ goto out_err;
+ sched = NULL;
+ }
/* Bind the ct retriever */
RCU_INIT_POINTER(svc->pe, pe);
@@ -1255,7 +1254,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
- ip_vs_start_estimator(net, &svc->stats);
+ ip_vs_start_estimator(ipvs, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
@@ -1291,17 +1290,20 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
{
- struct ip_vs_scheduler *sched, *old_sched;
+ struct ip_vs_scheduler *sched = NULL, *old_sched;
struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
/*
* Lookup the scheduler, by 'u->sched_name'
*/
- sched = ip_vs_scheduler_get(u->sched_name);
- if (sched == NULL) {
- pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
- return -ENOENT;
+ if (strcmp(u->sched_name, "none")) {
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (!sched) {
+ pr_info("Scheduler module ip_vs_%s not found\n",
+ u->sched_name);
+ return -ENOENT;
+ }
}
old_sched = sched;
@@ -1329,14 +1331,20 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
old_sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched != old_sched) {
+ if (old_sched) {
+ ip_vs_unbind_scheduler(svc, old_sched);
+ RCU_INIT_POINTER(svc->scheduler, NULL);
+ /* Wait all svc->sched_data users */
+ synchronize_rcu();
+ }
/* Bind the new scheduler */
- ret = ip_vs_bind_scheduler(svc, sched);
- if (ret) {
- old_sched = sched;
- goto out;
+ if (sched) {
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret) {
+ ip_vs_scheduler_put(sched);
+ goto out;
+ }
}
- /* Unbind the old scheduler on success */
- ip_vs_unbind_scheduler(svc, old_sched);
}
/*
@@ -1366,7 +1374,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
struct ip_vs_pe *old_pe;
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct netns_ipvs *ipvs = svc->ipvs;
pr_info("%s: enter\n", __func__);
@@ -1374,7 +1382,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
if (svc->af == AF_INET)
ipvs->num_services--;
- ip_vs_stop_estimator(svc->net, &svc->stats);
+ ip_vs_stop_estimator(svc->ipvs, &svc->stats);
/* Unbind scheduler */
old_sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -1390,7 +1398,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(svc->net, dest, cleanup);
+ __ip_vs_del_dest(svc->ipvs, dest, cleanup);
}
/*
@@ -1441,7 +1449,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Flush all the virtual services
*/
-static int ip_vs_flush(struct net *net, bool cleanup)
+static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
{
int idx;
struct ip_vs_service *svc;
@@ -1453,7 +1461,7 @@ static int ip_vs_flush(struct net *net, bool cleanup)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
s_list) {
- if (net_eq(svc->net, net))
+ if (svc->ipvs == ipvs)
ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1464,7 +1472,7 @@ static int ip_vs_flush(struct net *net, bool cleanup)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
f_list) {
- if (net_eq(svc->net, net))
+ if (svc->ipvs == ipvs)
ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1476,12 +1484,12 @@ static int ip_vs_flush(struct net *net, bool cleanup)
* Delete service by {netns} in the service table.
* Called by __ip_vs_cleanup()
*/
-void ip_vs_service_net_cleanup(struct net *net)
+void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs)
{
EnterFunction(2);
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
- ip_vs_flush(net, true);
+ ip_vs_flush(ipvs, true);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
}
@@ -1525,7 +1533,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (net_eq(svc->net, net)) {
+ if (svc->ipvs == ipvs) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
ip_vs_forget_dev(dest, dev);
@@ -1534,7 +1542,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
}
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (net_eq(svc->net, net)) {
+ if (svc->ipvs == ipvs) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
ip_vs_forget_dev(dest, dev);
@@ -1568,26 +1576,26 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
return 0;
}
-static int ip_vs_zero_all(struct net *net)
+static int ip_vs_zero_all(struct netns_ipvs *ipvs)
{
int idx;
struct ip_vs_service *svc;
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (net_eq(svc->net, net))
+ if (svc->ipvs == ipvs)
ip_vs_zero_service(svc);
}
}
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (net_eq(svc->net, net))
+ if (svc->ipvs == ipvs)
ip_vs_zero_service(svc);
}
}
- ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
+ ip_vs_zero_stats(&ipvs->tot_stats);
return 0;
}
@@ -1600,7 +1608,7 @@ static int
proc_do_defense_mode(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- struct net *net = current->nsproxy->net_ns;
+ struct netns_ipvs *ipvs = table->extra2;
int *valp = table->data;
int val = *valp;
int rc;
@@ -1611,7 +1619,7 @@ proc_do_defense_mode(struct ctl_table *table, int write,
/* Restore the correct value */
*valp = val;
} else {
- update_defense_level(net_ipvs(net));
+ update_defense_level(ipvs);
}
}
return rc;
@@ -1829,6 +1837,18 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "schedule_icmp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ignore_tunneled",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
@@ -1874,6 +1894,7 @@ static inline const char *ip_vs_fwd_name(unsigned int flags)
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_iter *iter = seq->private;
int idx;
struct ip_vs_service *svc;
@@ -1881,7 +1902,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
- if (net_eq(svc->net, net) && pos-- == 0) {
+ if ((svc->ipvs == ipvs) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
return svc;
@@ -1893,7 +1914,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
f_list) {
- if (net_eq(svc->net, net) && pos-- == 0) {
+ if ((svc->ipvs == ipvs) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
return svc;
@@ -1982,6 +2003,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
const struct ip_vs_iter *iter = seq->private;
const struct ip_vs_dest *dest;
struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
+ char *sched_name = sched ? sched->name : "none";
if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
@@ -1990,18 +2012,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
ip_vs_proto_name(svc->protocol),
&svc->addr.in6,
ntohs(svc->port),
- sched->name);
+ sched_name);
else
#endif
seq_printf(seq, "%s %08X:%04X %s %s ",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr.ip),
ntohs(svc->port),
- sched->name,
+ sched_name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
} else {
seq_printf(seq, "FWM %08X %s %s",
- svc->fwmark, sched->name,
+ svc->fwmark, sched_name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
}
@@ -2180,7 +2202,7 @@ static const struct file_operations ip_vs_stats_percpu_fops = {
/*
* Set timeout values for tcp tcpfin udp in the timeout_table.
*/
-static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
+static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
struct ip_vs_proto_data *pd;
@@ -2193,13 +2215,13 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
#ifdef CONFIG_IP_VS_PROTO_TCP
if (u->tcp_timeout) {
- pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
= u->tcp_timeout * HZ;
}
if (u->tcp_fin_timeout) {
- pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
= u->tcp_fin_timeout * HZ;
}
@@ -2207,7 +2229,7 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
#ifdef CONFIG_IP_VS_PROTO_UDP
if (u->udp_timeout) {
- pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
pd->timeout_table[IP_VS_UDP_S_NORMAL]
= u->udp_timeout * HZ;
}
@@ -2319,24 +2341,34 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
cmd == IP_VS_SO_SET_STOPDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- mutex_lock(&ipvs->sync_mutex);
- if (cmd == IP_VS_SO_SET_STARTDAEMON)
- ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
- dm->syncid);
- else
- ret = stop_sync_thread(net, dm->state);
- mutex_unlock(&ipvs->sync_mutex);
+ if (cmd == IP_VS_SO_SET_STARTDAEMON) {
+ struct ipvs_sync_daemon_cfg cfg;
+
+ memset(&cfg, 0, sizeof(cfg));
+ strlcpy(cfg.mcast_ifn, dm->mcast_ifn,
+ sizeof(cfg.mcast_ifn));
+ cfg.syncid = dm->syncid;
+ rtnl_lock();
+ mutex_lock(&ipvs->sync_mutex);
+ ret = start_sync_thread(ipvs, &cfg, dm->state);
+ mutex_unlock(&ipvs->sync_mutex);
+ rtnl_unlock();
+ } else {
+ mutex_lock(&ipvs->sync_mutex);
+ ret = stop_sync_thread(ipvs, dm->state);
+ mutex_unlock(&ipvs->sync_mutex);
+ }
goto out_dec;
}
mutex_lock(&__ip_vs_mutex);
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
- ret = ip_vs_flush(net, false);
+ ret = ip_vs_flush(ipvs, false);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
- ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
+ ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
}
@@ -2351,7 +2383,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_ZERO) {
/* if no service address is set, zero counters in all */
if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
- ret = ip_vs_zero_all(net);
+ ret = ip_vs_zero_all(ipvs);
goto out_unlock;
}
}
@@ -2369,10 +2401,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Lookup the exact service by <protocol, addr, port> or fwmark */
rcu_read_lock();
if (usvc.fwmark == 0)
- svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
+ svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
- svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
rcu_read_unlock();
if (cmd != IP_VS_SO_SET_ADD
@@ -2386,7 +2418,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (svc != NULL)
ret = -EEXIST;
else
- ret = ip_vs_add_service(net, &usvc, &svc);
+ ret = ip_vs_add_service(ipvs, &usvc, &svc);
break;
case IP_VS_SO_SET_EDIT:
ret = ip_vs_edit_service(svc, &usvc);
@@ -2427,13 +2459,15 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
struct ip_vs_scheduler *sched;
struct ip_vs_kstats kstats;
+ char *sched_name;
sched = rcu_dereference_protected(src->scheduler, 1);
+ sched_name = sched ? sched->name : "none";
dst->protocol = src->protocol;
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
+ strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2443,7 +2477,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
}
static inline int
-__ip_vs_get_service_entries(struct net *net,
+__ip_vs_get_service_entries(struct netns_ipvs *ipvs,
const struct ip_vs_get_services *get,
struct ip_vs_get_services __user *uptr)
{
@@ -2455,7 +2489,7 @@ __ip_vs_get_service_entries(struct net *net,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET || !net_eq(svc->net, net))
+ if (svc->af != AF_INET || (svc->ipvs != ipvs))
continue;
if (count >= get->num_services)
@@ -2474,7 +2508,7 @@ __ip_vs_get_service_entries(struct net *net,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET || !net_eq(svc->net, net))
+ if (svc->af != AF_INET || (svc->ipvs != ipvs))
continue;
if (count >= get->num_services)
@@ -2494,7 +2528,7 @@ out:
}
static inline int
-__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
+__ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get,
struct ip_vs_get_dests __user *uptr)
{
struct ip_vs_service *svc;
@@ -2503,9 +2537,9 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
rcu_read_lock();
if (get->fwmark)
- svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark);
else
- svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
+ svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr,
get->port);
rcu_read_unlock();
@@ -2550,7 +2584,7 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
}
static inline void
-__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
+__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
struct ip_vs_proto_data *pd;
@@ -2559,12 +2593,12 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
memset(u, 0, sizeof (*u));
#ifdef CONFIG_IP_VS_PROTO_TCP
- pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
- pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
u->udp_timeout =
pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
@@ -2627,15 +2661,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
mutex_lock(&ipvs->sync_mutex);
if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
sizeof(d[0].mcast_ifn));
- d[0].syncid = ipvs->master_syncid;
+ d[0].syncid = ipvs->mcfg.syncid;
}
if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
sizeof(d[1].mcast_ifn));
- d[1].syncid = ipvs->backup_syncid;
+ d[1].syncid = ipvs->bcfg.syncid;
}
if (copy_to_user(user, &d, sizeof(d)) != 0)
ret = -EFAULT;
@@ -2683,7 +2717,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_service_entries(net, get, user);
+ ret = __ip_vs_get_service_entries(ipvs, get, user);
}
break;
@@ -2697,9 +2731,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
addr.ip = entry->addr;
rcu_read_lock();
if (entry->fwmark)
- svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark);
else
- svc = __ip_vs_service_find(net, AF_INET,
+ svc = __ip_vs_service_find(ipvs, AF_INET,
entry->protocol, &addr,
entry->port);
rcu_read_unlock();
@@ -2725,7 +2759,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_dest_entries(net, get, user);
+ ret = __ip_vs_get_dest_entries(ipvs, get, user);
}
break;
@@ -2733,7 +2767,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(net, &t);
+ __ip_vs_get_timeouts(ipvs, &t);
if (copy_to_user(user, &t, sizeof(t)) != 0)
ret = -EFAULT;
}
@@ -2790,6 +2824,11 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
[IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
.len = IP_VS_IFNAME_MAXLEN },
[IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
+ [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 },
+ [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 },
+ [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) },
+ [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 },
+ [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 },
};
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
@@ -2892,6 +2931,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
struct ip_vs_flags flags = { .flags = svc->flags,
.mask = ~0 };
struct ip_vs_kstats kstats;
+ char *sched_name;
nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
if (!nl_service)
@@ -2910,8 +2950,9 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
}
sched = rcu_dereference_protected(svc->scheduler, 1);
+ sched_name = sched ? sched->name : "none";
pe = rcu_dereference_protected(svc->pe, 1);
- if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
+ if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
(pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
@@ -2961,12 +3002,13 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
int idx = 0, i;
int start = cb->args[0];
struct ip_vs_service *svc;
- struct net *net = skb_sknet(skb);
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
- if (++idx <= start || !net_eq(svc->net, net))
+ if (++idx <= start || (svc->ipvs != ipvs))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2977,7 +3019,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
- if (++idx <= start || !net_eq(svc->net, net))
+ if (++idx <= start || (svc->ipvs != ipvs))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2993,7 +3035,7 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_parse_service(struct net *net,
+static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
struct ip_vs_service_user_kern *usvc,
struct nlattr *nla, int full_entry,
struct ip_vs_service **ret_svc)
@@ -3038,9 +3080,9 @@ static int ip_vs_genl_parse_service(struct net *net,
rcu_read_lock();
if (usvc->fwmark)
- svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
else
- svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
+ svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
rcu_read_unlock();
*ret_svc = svc;
@@ -3078,14 +3120,14 @@ static int ip_vs_genl_parse_service(struct net *net,
return 0;
}
-static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
+ ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, 0, &svc);
return ret ? ERR_PTR(ret) : svc;
}
@@ -3160,7 +3202,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
- struct net *net = skb_sknet(skb);
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&__ip_vs_mutex);
@@ -3170,7 +3213,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
goto out_err;
- svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
+ svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc) || svc == NULL)
goto out_err;
@@ -3246,7 +3289,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
}
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
- const char *mcast_ifn, __u32 syncid)
+ struct ipvs_sync_daemon_cfg *c)
{
struct nlattr *nl_daemon;
@@ -3255,9 +3298,23 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
return -EMSGSIZE;
if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
- nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
- nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
+ nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
+ nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
+ nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
+ nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
+ nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
goto nla_put_failure;
+#ifdef CONFIG_IP_VS_IPV6
+ if (c->mcast_af == AF_INET6) {
+ if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
+ &c->mcast_group.in6))
+ goto nla_put_failure;
+ } else
+#endif
+ if (c->mcast_af == AF_INET &&
+ nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
+ c->mcast_group.ip))
+ goto nla_put_failure;
nla_nest_end(skb, nl_daemon);
return 0;
@@ -3268,7 +3325,7 @@ nla_put_failure:
}
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
- const char *mcast_ifn, __u32 syncid,
+ struct ipvs_sync_daemon_cfg *c,
struct netlink_callback *cb)
{
void *hdr;
@@ -3278,7 +3335,7 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
if (!hdr)
return -EMSGSIZE;
- if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+ if (ip_vs_genl_fill_daemon(skb, state, c))
goto nla_put_failure;
genlmsg_end(skb, hdr);
@@ -3292,14 +3349,13 @@ nla_put_failure:
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct net *net = skb_sknet(skb);
+ struct net *net = sock_net(skb->sk);
struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&ipvs->sync_mutex);
if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
- ipvs->master_mcast_ifn,
- ipvs->master_syncid, cb) < 0)
+ &ipvs->mcfg, cb) < 0)
goto nla_put_failure;
cb->args[0] = 1;
@@ -3307,8 +3363,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
- ipvs->backup_mcast_ifn,
- ipvs->backup_syncid, cb) < 0)
+ &ipvs->bcfg, cb) < 0)
goto nla_put_failure;
cb->args[1] = 1;
@@ -3320,39 +3375,90 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
+static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
+ struct ipvs_sync_daemon_cfg c;
+ struct nlattr *a;
+ int ret;
+
+ memset(&c, 0, sizeof(c));
if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
+ strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+ sizeof(c.mcast_ifn));
+ c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
+
+ a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
+ if (a)
+ c.sync_maxlen = nla_get_u16(a);
+
+ a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
+ if (a) {
+ c.mcast_af = AF_INET;
+ c.mcast_group.ip = nla_get_in_addr(a);
+ if (!ipv4_is_multicast(c.mcast_group.ip))
+ return -EINVAL;
+ } else {
+ a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
+ if (a) {
+#ifdef CONFIG_IP_VS_IPV6
+ int addr_type;
+
+ c.mcast_af = AF_INET6;
+ c.mcast_group.in6 = nla_get_in6_addr(a);
+ addr_type = ipv6_addr_type(&c.mcast_group.in6);
+ if (!(addr_type & IPV6_ADDR_MULTICAST))
+ return -EINVAL;
+#else
+ return -EAFNOSUPPORT;
+#endif
+ }
+ }
+
+ a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
+ if (a)
+ c.mcast_port = nla_get_u16(a);
+
+ a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
+ if (a)
+ c.mcast_ttl = nla_get_u8(a);
/* The synchronization protocol is incompatible with mixed family
* services
*/
- if (net_ipvs(net)->mixed_address_family_dests > 0)
+ if (ipvs->mixed_address_family_dests > 0)
return -EINVAL;
- return start_sync_thread(net,
- nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
- nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
- nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
+ rtnl_lock();
+ mutex_lock(&ipvs->sync_mutex);
+ ret = start_sync_thread(ipvs, &c,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ mutex_unlock(&ipvs->sync_mutex);
+ rtnl_unlock();
+ return ret;
}
-static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
+static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
+ int ret;
+
if (!attrs[IPVS_DAEMON_ATTR_STATE])
return -EINVAL;
- return stop_sync_thread(net,
- nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ mutex_lock(&ipvs->sync_mutex);
+ ret = stop_sync_thread(ipvs,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ mutex_unlock(&ipvs->sync_mutex);
+ return ret;
}
-static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
+static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(net, &t);
+ __ip_vs_get_timeouts(ipvs, &t);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
@@ -3364,38 +3470,33 @@ static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
- return ip_vs_set_timeout(net, &t);
+ return ip_vs_set_timeout(ipvs, &t);
}
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
- int ret = 0, cmd;
- struct net *net;
- struct netns_ipvs *ipvs;
+ int ret = -EINVAL, cmd;
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- net = skb_sknet(skb);
- ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
- mutex_lock(&ipvs->sync_mutex);
if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
info->attrs[IPVS_CMD_ATTR_DAEMON],
- ip_vs_daemon_policy)) {
- ret = -EINVAL;
+ ip_vs_daemon_policy))
goto out;
- }
if (cmd == IPVS_CMD_NEW_DAEMON)
- ret = ip_vs_genl_new_daemon(net, daemon_attrs);
+ ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
else
- ret = ip_vs_genl_del_daemon(net, daemon_attrs);
-out:
- mutex_unlock(&ipvs->sync_mutex);
+ ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
}
+
+out:
return ret;
}
@@ -3406,22 +3507,22 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
struct ip_vs_dest_user_kern udest;
int ret = 0, cmd;
int need_full_svc = 0, need_full_dest = 0;
- struct net *net;
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- net = skb_sknet(skb);
cmd = info->genlhdr->cmd;
mutex_lock(&__ip_vs_mutex);
if (cmd == IPVS_CMD_FLUSH) {
- ret = ip_vs_flush(net, false);
+ ret = ip_vs_flush(ipvs, false);
goto out;
} else if (cmd == IPVS_CMD_SET_CONFIG) {
- ret = ip_vs_genl_set_config(net, info->attrs);
+ ret = ip_vs_genl_set_config(ipvs, info->attrs);
goto out;
} else if (cmd == IPVS_CMD_ZERO &&
!info->attrs[IPVS_CMD_ATTR_SERVICE]) {
- ret = ip_vs_zero_all(net);
+ ret = ip_vs_zero_all(ipvs);
goto out;
}
@@ -3431,7 +3532,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
need_full_svc = 1;
- ret = ip_vs_genl_parse_service(net, &usvc,
+ ret = ip_vs_genl_parse_service(ipvs, &usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
need_full_svc, &svc);
if (ret)
@@ -3470,7 +3571,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
/* The synchronization protocol is incompatible
* with mixed family services
*/
- if (net_ipvs(net)->sync_state) {
+ if (ipvs->sync_state) {
ret = -EINVAL;
goto out;
}
@@ -3490,7 +3591,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
switch (cmd) {
case IPVS_CMD_NEW_SERVICE:
if (svc == NULL)
- ret = ip_vs_add_service(net, &usvc, &svc);
+ ret = ip_vs_add_service(ipvs, &usvc, &svc);
else
ret = -EEXIST;
break;
@@ -3528,9 +3629,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *msg;
void *reply;
int ret, cmd, reply_cmd;
- struct net *net;
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- net = skb_sknet(skb);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3559,7 +3660,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_service *svc;
- svc = ip_vs_genl_find_service(net,
+ svc = ip_vs_genl_find_service(ipvs,
info->attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc)) {
ret = PTR_ERR(svc);
@@ -3580,7 +3681,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(net, &t);
+ __ip_vs_get_timeouts(ipvs, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
t.tcp_timeout) ||
@@ -3735,10 +3836,10 @@ static void ip_vs_genl_unregister(void)
 * per netns init/exit func.
*/
#ifdef CONFIG_SYSCTL
-static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
+static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
+ struct net *net = ipvs->net;
int idx;
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ctl_table *tbl;
atomic_set(&ipvs->dropentry, 0);
@@ -3757,6 +3858,10 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
} else
tbl = vs_vars;
/* Initialize sysctl defaults */
+ for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
+ if (tbl[idx].proc_handler == proc_do_defense_mode)
+ tbl[idx].extra2 = ipvs;
+ }
idx = 0;
ipvs->sysctl_amemthresh = 1024;
tbl[idx++].data = &ipvs->sysctl_amemthresh;
@@ -3798,7 +3903,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
tbl[idx++].data = &ipvs->sysctl_backup_only;
ipvs->sysctl_conn_reuse_mode = 1;
tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
-
+ tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
+ tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
if (ipvs->sysctl_hdr == NULL) {
@@ -3806,7 +3912,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
kfree(tbl);
return -ENOMEM;
}
- ip_vs_start_estimator(net, &ipvs->tot_stats);
+ ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
ipvs->sysctl_tbl = tbl;
/* Schedule defense work */
INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
@@ -3815,14 +3921,14 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
return 0;
}
-static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net *net = ipvs->net;
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
- ip_vs_stop_estimator(net, &ipvs->tot_stats);
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats);
if (!net_eq(net, &init_net))
kfree(ipvs->sysctl_tbl);
@@ -3830,8 +3936,8 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
#else
-static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
-static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
+static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }
#endif
@@ -3839,10 +3945,10 @@ static struct notifier_block ip_vs_dst_notifier = {
.notifier_call = ip_vs_dst_event,
};
-int __net_init ip_vs_control_net_init(struct net *net)
+int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
+ struct net *net = ipvs->net;
int i, idx;
- struct netns_ipvs *ipvs = net_ipvs(net);
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
@@ -3851,7 +3957,7 @@ int __net_init ip_vs_control_net_init(struct net *net)
INIT_LIST_HEAD(&ipvs->dest_trash);
spin_lock_init(&ipvs->dest_trash_lock);
setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
- (unsigned long) net);
+ (unsigned long) ipvs);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
@@ -3873,7 +3979,7 @@ int __net_init ip_vs_control_net_init(struct net *net)
proc_create("ip_vs_stats_percpu", 0, net->proc_net,
&ip_vs_stats_percpu_fops);
- if (ip_vs_control_net_init_sysctl(net))
+ if (ip_vs_control_net_init_sysctl(ipvs))
goto err;
return 0;
@@ -3883,12 +3989,12 @@ err:
return -ENOMEM;
}
-void __net_exit ip_vs_control_net_cleanup(struct net *net)
+void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net *net = ipvs->net;
- ip_vs_trash_cleanup(net);
- ip_vs_control_net_cleanup_sysctl(net);
+ ip_vs_trash_cleanup(ipvs);
+ ip_vs_control_net_cleanup_sysctl(ipvs);
remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
remove_proc_entry("ip_vs_stats", net->proc_net);
remove_proc_entry("ip_vs", net->proc_net);
diff --git a/kernel/net/netfilter/ipvs/ip_vs_est.c b/kernel/net/netfilter/ipvs/ip_vs_est.c
index ef0eb0a8d..457c6c193 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_est.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_est.c
@@ -102,10 +102,8 @@ static void estimation_timer(unsigned long arg)
struct ip_vs_estimator *e;
struct ip_vs_stats *s;
u64 rate;
- struct net *net = (struct net *)arg;
- struct netns_ipvs *ipvs;
+ struct netns_ipvs *ipvs = (struct netns_ipvs *)arg;
- ipvs = net_ipvs(net);
spin_lock(&ipvs->est_lock);
list_for_each_entry(e, &ipvs->est_list, list) {
s = container_of(e, struct ip_vs_stats, est);
@@ -140,9 +138,8 @@ static void estimation_timer(unsigned long arg)
mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
-void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
+void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
INIT_LIST_HEAD(&est->list);
@@ -152,9 +149,8 @@ void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
spin_unlock_bh(&ipvs->est_lock);
}
-void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats)
+void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
spin_lock_bh(&ipvs->est_lock);
@@ -192,18 +188,16 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
dst->outbps = (e->outbps + 0xF) >> 5;
}
-int __net_init ip_vs_estimator_net_init(struct net *net)
+int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
INIT_LIST_HEAD(&ipvs->est_list);
spin_lock_init(&ipvs->est_lock);
- setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
+ setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)ipvs);
mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
return 0;
}
-void __net_exit ip_vs_estimator_net_cleanup(struct net *net)
+void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
- del_timer_sync(&net_ipvs(net)->est_timer);
+ del_timer_sync(&ipvs->est_timer);
}
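The estimator change applies the same idea to timer callbacks: setup_timer() now receives the netns_ipvs pointer itself as the unsigned long data argument, so estimation_timer() can cast it back without a net_ipvs() lookup. A rough standalone illustration of that cast-through-unsigned-long pattern, with invented names (est_ctx, timer_cb), might look like this:

/* Rough sketch only: passing a context pointer through an unsigned long
 * timer argument and casting it back in the callback.  est_ctx and
 * timer_cb are invented names, not kernel symbols.
 */
#include <stdio.h>

struct est_ctx {
	int ticks;
};

static void timer_cb(unsigned long arg)
{
	struct est_ctx *ctx = (struct est_ctx *)arg;   /* cast back */

	ctx->ticks++;
	printf("estimator tick %d\n", ctx->ticks);
}

int main(void)
{
	struct est_ctx ctx = { .ticks = 0 };

	/* the moral equivalent of setup_timer(&t, timer_cb, (unsigned long)&ctx) */
	unsigned long data = (unsigned long)&ctx;

	timer_cb(data);   /* simulate one timer expiry */
	return 0;
}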
diff --git a/kernel/net/netfilter/ipvs/ip_vs_ftp.c b/kernel/net/netfilter/ipvs/ip_vs_ftp.c
index 5d3daae98..d30c327bb 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_ftp.c
@@ -181,7 +181,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
int ret = 0;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
- struct net *net;
*diff = 0;
@@ -223,14 +222,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ ip_vs_conn_fill_param(cp->ipvs, AF_INET,
iph->protocol, &from, port,
&cp->caddr, 0, &p);
n_cp = ip_vs_conn_out_get(&p);
}
if (!n_cp) {
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+ ip_vs_conn_fill_param(cp->ipvs,
AF_INET, IPPROTO_TCP, &cp->caddr,
0, &cp->vaddr, port, &p);
/* As above, this is ipv4 only */
@@ -289,9 +288,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* would be adjusted twice.
*/
- net = skb_net(skb);
cp->app_data = NULL;
- ip_vs_tcp_conn_listen(net, n_cp);
+ ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
return ret;
}
@@ -320,7 +318,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
union nf_inet_addr to;
__be16 port;
struct ip_vs_conn *n_cp;
- struct net *net;
/* no diff required for incoming packets */
*diff = 0;
@@ -392,7 +389,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ ip_vs_conn_fill_param(cp->ipvs, AF_INET,
iph->protocol, &to, port, &cp->vaddr,
htons(ntohs(cp->vport)-1), &p);
n_cp = ip_vs_conn_in_get(&p);
@@ -413,8 +410,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
/*
* Move tunnel to listen state
*/
- net = skb_net(skb);
- ip_vs_tcp_conn_listen(net, n_cp);
+ ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
return 1;
@@ -447,14 +443,14 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
if (!ipvs)
return -ENOENT;
- app = register_ip_vs_app(net, &ip_vs_ftp);
+ app = register_ip_vs_app(ipvs, &ip_vs_ftp);
if (IS_ERR(app))
return PTR_ERR(app);
for (i = 0; i < ports_count; i++) {
if (!ports[i])
continue;
- ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
+ ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
if (ret)
goto err_unreg;
pr_info("%s: loaded support on port[%d] = %d\n",
@@ -463,7 +459,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
return 0;
err_unreg:
- unregister_ip_vs_app(net, &ip_vs_ftp);
+ unregister_ip_vs_app(ipvs, &ip_vs_ftp);
return ret;
}
/*
@@ -471,7 +467,12 @@ err_unreg:
*/
static void __ip_vs_ftp_exit(struct net *net)
{
- unregister_ip_vs_app(net, &ip_vs_ftp);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return;
+
+ unregister_ip_vs_app(ipvs, &ip_vs_ftp);
}
static struct pernet_operations ip_vs_ftp_ops = {
diff --git a/kernel/net/netfilter/ipvs/ip_vs_lblc.c b/kernel/net/netfilter/ipvs/ip_vs_lblc.c
index 127f14046..cccf4d637 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_lblc.c
@@ -250,8 +250,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)
static int sysctl_lblc_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
- return ipvs->sysctl_lblc_expiration;
+ return svc->ipvs->sysctl_lblc_expiration;
#else
return DEFAULT_EXPIRATION;
#endif
diff --git a/kernel/net/netfilter/ipvs/ip_vs_lblcr.c b/kernel/net/netfilter/ipvs/ip_vs_lblcr.c
index 2229d2d8b..796d70e47 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -415,8 +415,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
- return ipvs->sysctl_lblcr_expiration;
+ return svc->ipvs->sysctl_lblcr_expiration;
#else
return DEFAULT_EXPIRATION;
#endif
diff --git a/kernel/net/netfilter/ipvs/ip_vs_nfct.c b/kernel/net/netfilter/ipvs/ip_vs_nfct.c
index 5882bbfd1..30434fb13 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_nfct.c
@@ -161,7 +161,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
/* RS->CLIENT */
orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
+ ip_vs_conn_fill_param(net_ipvs(net), exp->tuple.src.l3num, orig->dst.protonum,
&orig->src.u3, orig->src.u.tcp.port,
&orig->dst.u3, orig->dst.u.tcp.port, &p);
cp = ip_vs_conn_out_get(&p);
@@ -274,8 +274,7 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
" for conn " FMT_CONN "\n",
__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
- h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
- &tuple);
+ h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
/* Show what happens instead of calling nf_ct_kill() */
diff --git a/kernel/net/netfilter/ipvs/ip_vs_ovf.c b/kernel/net/netfilter/ipvs/ip_vs_ovf.c
new file mode 100644
index 000000000..f7d62c3b7
--- /dev/null
+++ b/kernel/net/netfilter/ipvs/ip_vs_ovf.c
@@ -0,0 +1,86 @@
+/*
+ * IPVS: Overflow-Connection Scheduling module
+ *
+ * Authors: Raducu Deaconu <rhadoo_io@yahoo.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Scheduler implements "overflow" load balancing according to the number of
+ * active connections: it will keep all connections to the node with the
+ * highest weight and overflow to the next node if the number of connections
+ * exceeds the node's weight.
+ * Note that this scheduler might not be suitable for UDP because it only uses
+ * active connections.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/* OVF Connection scheduling */
+static struct ip_vs_dest *
+ip_vs_ovf_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *h = NULL;
+ int hw = 0, w;
+
+ IP_VS_DBG(6, "ip_vs_ovf_schedule(): Scheduling...\n");
+ /* select the node with highest weight, go to next in line if active
+ * connections exceed weight
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ w = atomic_read(&dest->weight);
+ if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+ atomic_read(&dest->activeconns) > w ||
+ w == 0)
+ continue;
+ if (!h || w > hw) {
+ h = dest;
+ hw = w;
+ }
+ }
+
+ if (h) {
+ IP_VS_DBG_BUF(6, "OVF: server %s:%u active %d w %d\n",
+ IP_VS_DBG_ADDR(h->af, &h->addr),
+ ntohs(h->port),
+ atomic_read(&h->activeconns),
+ atomic_read(&h->weight));
+ return h;
+ }
+
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+}
+
+static struct ip_vs_scheduler ip_vs_ovf_scheduler = {
+ .name = "ovf",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_ovf_scheduler.n_list),
+ .schedule = ip_vs_ovf_schedule,
+};
+
+static int __init ip_vs_ovf_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_ovf_scheduler);
+}
+
+static void __exit ip_vs_ovf_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_ovf_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_ovf_init);
+module_exit(ip_vs_ovf_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c b/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c
index bed5f7042..1b8d594e4 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -70,7 +70,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
const char *dptr;
int retc;
- ip_vs_fill_iph_skb(p->af, skb, &iph);
+ ip_vs_fill_iph_skb(p->af, skb, false, &iph);
/* Only useful with UDP */
if (iph.protocol != IPPROTO_UDP)
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto.c b/kernel/net/netfilter/ipvs/ip_vs_proto.c
index 939f7fbe9..8ae480715 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto.c
@@ -63,9 +63,8 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
 * register an ipvs protocol's netns related data
*/
static int
-register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
struct ip_vs_proto_data *pd =
kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL);
@@ -79,7 +78,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
atomic_set(&pd->appcnt, 0); /* Init app counter */
if (pp->init_netns != NULL) {
- int ret = pp->init_netns(net, pd);
+ int ret = pp->init_netns(ipvs, pd);
if (ret) {
/* unlink and free proto data */
ipvs->proto_data_table[hash] = pd->next;
@@ -116,9 +115,8 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
 * unregister an ipvs protocol's netns data
*/
static int
-unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data **pd_p;
unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol);
@@ -127,7 +125,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
if (*pd_p == pd) {
*pd_p = pd->next;
if (pd->pp->exit_netns != NULL)
- pd->pp->exit_netns(net, pd);
+ pd->pp->exit_netns(ipvs, pd);
kfree(pd);
return 0;
}
@@ -156,8 +154,8 @@ EXPORT_SYMBOL(ip_vs_proto_get);
/*
* get ip_vs_protocol object data by netns and proto
*/
-static struct ip_vs_proto_data *
-__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
{
struct ip_vs_proto_data *pd;
unsigned int hash = IP_VS_PROTO_HASH(proto);
@@ -169,14 +167,6 @@ __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
return NULL;
}
-
-struct ip_vs_proto_data *
-ip_vs_proto_data_get(struct net *net, unsigned short proto)
-{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- return __ipvs_proto_data_get(ipvs, proto);
-}
EXPORT_SYMBOL(ip_vs_proto_data_get);
/*
@@ -317,7 +307,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
/*
* per network name-space init
*/
-int __net_init ip_vs_protocol_net_init(struct net *net)
+int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs)
{
int i, ret;
static struct ip_vs_protocol *protos[] = {
@@ -339,27 +329,26 @@ int __net_init ip_vs_protocol_net_init(struct net *net)
};
for (i = 0; i < ARRAY_SIZE(protos); i++) {
- ret = register_ip_vs_proto_netns(net, protos[i]);
+ ret = register_ip_vs_proto_netns(ipvs, protos[i]);
if (ret < 0)
goto cleanup;
}
return 0;
cleanup:
- ip_vs_protocol_net_cleanup(net);
+ ip_vs_protocol_net_cleanup(ipvs);
return ret;
}
-void __net_exit ip_vs_protocol_net_cleanup(struct net *net)
+void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd;
int i;
/* unregister all the ipvs proto data for this netns */
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
while ((pd = ipvs->proto_data_table[i]) != NULL)
- unregister_ip_vs_proto_netns(net, pd);
+ unregister_ip_vs_proto_netns(ipvs, pd);
}
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 5de3dd312..5320d3997 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -41,30 +41,28 @@ struct isakmp_hdr {
#define PORT_ISAKMP 500
static void
-ah_esp_conn_fill_param_proto(struct net *net, int af,
- const struct ip_vs_iphdr *iph, int inverse,
+ah_esp_conn_fill_param_proto(struct netns_ipvs *ipvs, int af,
+ const struct ip_vs_iphdr *iph,
struct ip_vs_conn_param *p)
{
- if (likely(!inverse))
- ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+ if (likely(!ip_vs_iph_inverse(iph)))
+ ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP,
&iph->saddr, htons(PORT_ISAKMP),
&iph->daddr, htons(PORT_ISAKMP), p);
else
- ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP,
&iph->daddr, htons(PORT_ISAKMP),
&iph->saddr, htons(PORT_ISAKMP), p);
}
static struct ip_vs_conn *
-ah_esp_conn_in_get(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph,
- int inverse)
+ah_esp_conn_in_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
- struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(ipvs, af, iph, &p);
cp = ip_vs_conn_in_get(&p);
if (!cp) {
/*
@@ -73,7 +71,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb,
*/
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
"%s%s %s->%s\n",
- inverse ? "ICMP+" : "",
+ ip_vs_iph_icmp(iph) ? "ICMP+" : "",
ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
@@ -84,19 +82,18 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb,
static struct ip_vs_conn *
-ah_esp_conn_out_get(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph, int inverse)
+ah_esp_conn_out_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
- struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(ipvs, af, iph, &p);
cp = ip_vs_conn_out_get(&p);
if (!cp) {
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
"%s%s %s->%s\n",
- inverse ? "ICMP+" : "",
+ ip_vs_iph_icmp(iph) ? "ICMP+" : "",
ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
@@ -107,7 +104,8 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
static int
-ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ah_esp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 5b84c0b56..010ddeec1 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -9,35 +9,44 @@
#include <net/ip_vs.h>
static int
-sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
- struct net *net;
struct ip_vs_service *svc;
- struct netns_ipvs *ipvs;
sctp_chunkhdr_t _schunkh, *sch;
sctp_sctphdr_t *sh, _sctph;
-
- sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
- if (sh == NULL) {
- *verdict = NF_DROP;
- return 0;
+ __be16 _ports[2], *ports = NULL;
+
+ if (likely(!ip_vs_iph_icmp(iph))) {
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (sh) {
+ sch = skb_header_pointer(
+ skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(_schunkh), &_schunkh);
+ if (sch && (sch->type == SCTP_CID_INIT ||
+ sysctl_sloppy_sctp(ipvs)))
+ ports = &sh->source;
+ }
+ } else {
+ ports = skb_header_pointer(
+ skb, iph->len, sizeof(_ports), &_ports);
}
- sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
- sizeof(_schunkh), &_schunkh);
- if (sch == NULL) {
+ if (!ports) {
*verdict = NF_DROP;
return 0;
}
- net = skb_net(skb);
- ipvs = net_ipvs(net);
rcu_read_lock();
- if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) &&
- (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
- &iph->daddr, sh->dest))) {
+ if (likely(!ip_vs_iph_inverse(iph)))
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->daddr, ports[1]);
+ else
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->saddr, ports[0]);
+ if (svc) {
int ignored;
if (ip_vs_todrop(ipvs)) {
@@ -474,14 +483,13 @@ static inline __u16 sctp_app_hashkey(__be16 port)
& SCTP_APP_TAB_MASK;
}
-static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
+static int sctp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
- struct netns_ipvs *ipvs = net_ipvs(net);
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP);
hash = sctp_app_hashkey(port);
@@ -498,9 +506,9 @@ out:
return ret;
}
-static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
+static void sctp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP);
atomic_dec(&pd->appcnt);
list_del_rcu(&inc->p_list);
@@ -508,7 +516,7 @@ static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
{
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ struct netns_ipvs *ipvs = cp->ipvs;
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -549,10 +557,8 @@ out:
* timeouts is netns related now.
* ---------------------------------------------
*/
-static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __ip_vs_sctp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
sizeof(sctp_timeouts));
@@ -561,7 +567,7 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
return 0;
}
-static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
+static void __ip_vs_sctp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
kfree(pd->timeout_table);
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 8e92beb0c..d7024b2ed 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -32,27 +32,47 @@
#include <net/ip_vs.h>
static int
-tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
- struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
- struct netns_ipvs *ipvs;
+ __be16 _ports[2], *ports = NULL;
- th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
- if (th == NULL) {
+ /* In the event of icmp, we're only guaranteed to have the first 8
+ * bytes of the transport header, so we only check the rest of the
+ * TCP packet for non-ICMP packets
+ */
+ if (likely(!ip_vs_iph_icmp(iph))) {
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th) {
+ if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
+ return 1;
+ ports = &th->source;
+ }
+ } else {
+ ports = skb_header_pointer(
+ skb, iph->len, sizeof(_ports), &_ports);
+ }
+
+ if (!ports) {
*verdict = NF_DROP;
return 0;
}
- net = skb_net(skb);
- ipvs = net_ipvs(net);
+
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
rcu_read_lock();
- if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
- (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
- &iph->daddr, th->dest))) {
+
+ if (likely(!ip_vs_iph_inverse(iph)))
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->daddr, ports[1]);
+ else
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->saddr, ports[0]);
+
+ if (svc) {
int ignored;
if (ip_vs_todrop(ipvs)) {
@@ -571,14 +591,13 @@ static inline __u16 tcp_app_hashkey(__be16 port)
}
-static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
+static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
- struct netns_ipvs *ipvs = net_ipvs(net);
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
hash = tcp_app_hashkey(port);
@@ -597,9 +616,9 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
static void
-tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
+tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
atomic_dec(&pd->appcnt);
list_del_rcu(&inc->p_list);
@@ -609,7 +628,7 @@ tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ struct netns_ipvs *ipvs = cp->ipvs;
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -653,9 +672,9 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/*
* Set LISTEN timeout. (ip_vs_conn_put will setup timer)
*/
-void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
+void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
{
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);
spin_lock_bh(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
@@ -668,10 +687,8 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
* timeouts is netns related now.
* ---------------------------------------------
*/
-static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
sizeof(tcp_timeouts));
@@ -681,7 +698,7 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
return 0;
}
-static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
+static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
kfree(pd->timeout_table);
}
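
The comment added in tcp_conn_schedule explains the trick shared by the SCTP, TCP and UDP hunks: for packets embedded in ICMP errors only the first 8 bytes of the transport header are guaranteed, and since all three protocols put the source and destination ports in the first 4 bytes, the scheduler can read just a two-element __be16 array. A short stand-alone sketch of that extraction, assuming a plain byte buffer instead of an skb (extract_ports is a hypothetical helper, not kernel API):

/* Minimal sketch of the "first 8 bytes" idea on a user-space buffer. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

/* TCP, UDP and SCTP all start with source port (2 bytes) then destination
 * port (2 bytes), so 4 bytes of transport header are enough to pick ports. */
static int extract_ports(const uint8_t *l4, size_t len,
			 uint16_t *sport, uint16_t *dport)
{
	uint16_t ports[2];

	if (len < sizeof(ports))
		return -1;
	memcpy(ports, l4, sizeof(ports));
	*sport = ntohs(ports[0]);
	*dport = ntohs(ports[1]);
	return 0;
}

int main(void)
{
	/* First 8 bytes of a UDP header: sport 5000, dport 53, len, csum. */
	uint8_t hdr[8] = { 0x13, 0x88, 0x00, 0x35, 0x00, 0x08, 0x00, 0x00 };
	uint16_t sport, dport;

	if (!extract_ports(hdr, sizeof(hdr), &sport, &dport))
		printf("sport=%u dport=%u\n", sport, dport);
	return 0;
}

With the ports in hand, the inverse flag decides whether the service lookup matches ports[1] against daddr (normal direction) or ports[0] against saddr (inverse direction), as the hunks above show.
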
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c
index b62a3c0ff..e494e9a88 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -29,28 +29,42 @@
#include <net/ip6_checksum.h>
static int
-udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
- struct net *net;
struct ip_vs_service *svc;
struct udphdr _udph, *uh;
+ __be16 _ports[2], *ports = NULL;
- /* IPv6 fragments, only first fragment will hit this */
- uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
- if (uh == NULL) {
+ if (likely(!ip_vs_iph_icmp(iph))) {
+ /* IPv6 fragments, only first fragment will hit this */
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+ if (uh)
+ ports = &uh->source;
+ } else {
+ ports = skb_header_pointer(
+ skb, iph->len, sizeof(_ports), &_ports);
+ }
+
+ if (!ports) {
*verdict = NF_DROP;
return 0;
}
- net = skb_net(skb);
+
rcu_read_lock();
- svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
- &iph->daddr, uh->dest);
+ if (likely(!ip_vs_iph_inverse(iph)))
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->daddr, ports[1]);
+ else
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->saddr, ports[0]);
+
if (svc) {
int ignored;
- if (ip_vs_todrop(net_ipvs(net))) {
+ if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -348,14 +362,13 @@ static inline __u16 udp_app_hashkey(__be16 port)
}
-static int udp_register_app(struct net *net, struct ip_vs_app *inc)
+static int udp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
- struct netns_ipvs *ipvs = net_ipvs(net);
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
hash = udp_app_hashkey(port);
@@ -374,9 +387,9 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc)
static void
-udp_unregister_app(struct net *net, struct ip_vs_app *inc)
+udp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
atomic_dec(&pd->appcnt);
list_del_rcu(&inc->p_list);
@@ -385,7 +398,7 @@ udp_unregister_app(struct net *net, struct ip_vs_app *inc)
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ struct netns_ipvs *ipvs = cp->ipvs;
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -456,10 +469,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction,
cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
}
-static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
sizeof(udp_timeouts));
@@ -468,7 +479,7 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
return 0;
}
-static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
+static void __udp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
kfree(pd->timeout_table);
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_sched.c b/kernel/net/netfilter/ipvs/ip_vs_sched.c
index 199760c71..a2ff7d746 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_sched.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_sched.c
@@ -74,7 +74,7 @@ void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
if (sched->done_service)
sched->done_service(svc);
- /* svc->scheduler can not be set to NULL */
+ /* svc->scheduler can be set to NULL only by caller */
}
@@ -137,7 +137,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
{
- if (scheduler && scheduler->module)
+ if (scheduler)
module_put(scheduler->module);
}
@@ -147,21 +147,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
{
- struct ip_vs_scheduler *sched;
+ struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
+ char *sched_name = sched ? sched->name : "none";
- sched = rcu_dereference(svc->scheduler);
if (svc->fwmark) {
IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
- sched->name, svc->fwmark, svc->fwmark, msg);
+ sched_name, svc->fwmark, svc->fwmark, msg);
#ifdef CONFIG_IP_VS_IPV6
} else if (svc->af == AF_INET6) {
IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
- sched->name, ip_vs_proto_name(svc->protocol),
+ sched_name, ip_vs_proto_name(svc->protocol),
&svc->addr.in6, ntohs(svc->port), msg);
#endif
} else {
IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
- sched->name, ip_vs_proto_name(svc->protocol),
+ sched_name, ip_vs_proto_name(svc->protocol),
&svc->addr.ip, ntohs(svc->port), msg);
}
}
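
The ip_vs_sched.c hunk makes the error path tolerate a service whose scheduler pointer is NULL by printing "none" instead of dereferencing it. A compact user-space model of that guard, with printf standing in for IP_VS_ERR_RL and scheduler_err as a hypothetical helper:

/* Small model of the NULL-tolerant error path; rcu_dereference and the IPVS
 * logging macros are replaced by plain pointers and printf. */
#include <stdio.h>

struct scheduler {
	const char *name;
};

static void scheduler_err(const struct scheduler *sched, const char *msg)
{
	const char *sched_name = sched ? sched->name : "none";

	/* An unbound service no longer dereferences a NULL scheduler here. */
	printf("%s: %s\n", sched_name, msg);
}

int main(void)
{
	struct scheduler rr = { .name = "rr" };

	scheduler_err(&rr, "no destination available");
	scheduler_err(NULL, "no destination available");
	return 0;
}
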
diff --git a/kernel/net/netfilter/ipvs/ip_vs_sh.c b/kernel/net/netfilter/ipvs/ip_vs_sh.c
index 98a13433b..1e373a5e4 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_sh.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_sh.c
@@ -280,35 +280,29 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
static inline __be16
ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
- __be16 port;
- struct tcphdr _tcph, *th;
- struct udphdr _udph, *uh;
- sctp_sctphdr_t _sctph, *sh;
+ __be16 _ports[2], *ports;
+ /* At this point we know that we have a valid packet of some kind.
+ * Because ICMP packets are only guaranteed to have the first 8
+ * bytes, let's just grab the ports. Fortunately they're in the
+ * same position for all three of the protocols we care about.
+ */
switch (iph->protocol) {
case IPPROTO_TCP:
- th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
- if (unlikely(th == NULL))
- return 0;
- port = th->source;
- break;
case IPPROTO_UDP:
- uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
- if (unlikely(uh == NULL))
- return 0;
- port = uh->source;
- break;
case IPPROTO_SCTP:
- sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
- if (unlikely(sh == NULL))
+ ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
+ &_ports);
+ if (unlikely(!ports))
return 0;
- port = sh->source;
- break;
+
+ if (likely(!ip_vs_iph_inverse(iph)))
+ return ports[0];
+ else
+ return ports[1];
default:
- port = 0;
+ return 0;
}
-
- return port;
}
@@ -322,6 +316,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_dest *dest;
struct ip_vs_sh_state *s;
__be16 port = 0;
+ const union nf_inet_addr *hash_addr;
+
+ hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
@@ -331,9 +328,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
s = (struct ip_vs_sh_state *) svc->sched_data;
if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
- dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+ dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port);
else
- dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+ dest = ip_vs_sh_get(svc, s, hash_addr, port);
if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
@@ -341,7 +338,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
}
IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph->saddr),
+ IP_VS_DBG_ADDR(svc->af, hash_addr),
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port));
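
ip_vs_sh now hashes on the client side of the flow regardless of packet direction: for normal packets that is the source address and ports[0], for inverse (reply or ICMP-embedded) packets the destination address and ports[1]. A toy model of that selection follows, using a simplified flow tuple rather than ip_vs_iphdr; sh_hash_input is an illustrative name, not kernel API.

/* Toy illustration of direction-aware hashing input selection. */
#include <stdint.h>
#include <stdio.h>

struct flow {
	uint32_t saddr, daddr;
	uint16_t sport, dport;
	int inverse;             /* set for reply/ICMP-embedded direction */
};

/* Pick the client side of the flow, the same way ip_vs_sh now chooses
 * saddr/ports[0] or daddr/ports[1]. */
static uint32_t sh_hash_input(const struct flow *f, uint16_t *port)
{
	if (!f->inverse) {
		*port = f->sport;
		return f->saddr;
	}
	*port = f->dport;
	return f->daddr;
}

int main(void)
{
	struct flow fwd = { .saddr = 0x0a000001, .daddr = 0x0a000002,
			    .sport = 40000, .dport = 80, .inverse = 0 };
	struct flow rev = { .saddr = 0x0a000002, .daddr = 0x0a000001,
			    .sport = 80, .dport = 40000, .inverse = 1 };
	uint16_t p1, p2;
	uint32_t a1 = sh_hash_input(&fwd, &p1);
	uint32_t a2 = sh_hash_input(&rev, &p2);

	printf("same client key: %d\n", a1 == a2 && p1 == p2);
	return 0;
}

Both directions of a connection therefore hash to the same key, so source hashing keeps selecting the same real server for replies and related ICMP.
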
diff --git a/kernel/net/netfilter/ipvs/ip_vs_sync.c b/kernel/net/netfilter/ipvs/ip_vs_sync.c
index 19b9cce6c..803001a45 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_sync.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_sync.c
@@ -193,7 +193,7 @@ union ip_vs_sync_conn {
#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
struct ip_vs_sync_thread_data {
- struct net *net;
+ struct netns_ipvs *ipvs;
struct socket *sock;
char *buf;
int id;
@@ -262,6 +262,11 @@ struct ip_vs_sync_mesg {
/* ip_vs_sync_conn entries start here */
};
+union ipvs_sockaddr {
+ struct sockaddr_in in;
+ struct sockaddr_in6 in6;
+};
+
struct ip_vs_sync_buff {
struct list_head list;
unsigned long firstuse;
@@ -320,26 +325,28 @@ sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
* Create a new sync buffer for Version 1 proto.
*/
static inline struct ip_vs_sync_buff *
-ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
{
struct ip_vs_sync_buff *sb;
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
- sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
+ ipvs->mcfg.sync_maxlen);
+ sb->mesg = kmalloc(len, GFP_ATOMIC);
if (!sb->mesg) {
kfree(sb);
return NULL;
}
sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
sb->mesg->version = SYNC_PROTO_VER;
- sb->mesg->syncid = ipvs->master_syncid;
+ sb->mesg->syncid = ipvs->mcfg.syncid;
sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
sb->mesg->nr_conns = 0;
sb->mesg->spare = 0;
sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
- sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+ sb->end = (unsigned char *)sb->mesg + len;
sb->firstuse = jiffies;
return sb;
@@ -402,7 +409,7 @@ select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
* Create a new sync buffer for Version 0 proto.
*/
static inline struct ip_vs_sync_buff *
-ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
{
struct ip_vs_sync_buff *sb;
struct ip_vs_sync_mesg_v0 *mesg;
@@ -410,17 +417,19 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
- sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
+ ipvs->mcfg.sync_maxlen);
+ sb->mesg = kmalloc(len, GFP_ATOMIC);
if (!sb->mesg) {
kfree(sb);
return NULL;
}
mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
mesg->nr_conns = 0;
- mesg->syncid = ipvs->master_syncid;
+ mesg->syncid = ipvs->mcfg.syncid;
mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
- sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+ sb->end = (unsigned char *)mesg + len;
sb->firstuse = jiffies;
return sb;
}
@@ -524,16 +533,15 @@ set:
* Version 0 , could be switched in by sys_ctl.
* Add an ip_vs_conn information into the current sync_buff.
*/
-static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
int pkts)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg_v0 *m;
struct ip_vs_sync_conn_v0 *s;
struct ip_vs_sync_buff *buff;
struct ipvs_master_sync_state *ms;
int id;
- int len;
+ unsigned int len;
if (unlikely(cp->af != AF_INET))
return;
@@ -553,17 +561,19 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
id = select_master_thread_id(ipvs, cp);
ms = &ipvs->ms[id];
buff = ms->sync_buff;
+ len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+ SIMPLE_CONN_SIZE;
if (buff) {
m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
/* Send buffer if it is for v1 */
- if (!m->nr_conns) {
+ if (buff->head + len > buff->end || !m->nr_conns) {
sb_queue_tail(ipvs, ms);
ms->sync_buff = NULL;
buff = NULL;
}
}
if (!buff) {
- buff = ip_vs_sync_buff_create_v0(ipvs);
+ buff = ip_vs_sync_buff_create_v0(ipvs, len);
if (!buff) {
spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
@@ -572,8 +582,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
ms->sync_buff = buff;
}
- len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
- SIMPLE_CONN_SIZE;
m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
s = (struct ip_vs_sync_conn_v0 *) buff->head;
@@ -597,12 +605,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
m->nr_conns++;
m->size = htons(ntohs(m->size) + len);
buff->head += len;
-
- /* check if there is a space for next one */
- if (buff->head + FULL_CONN_SIZE > buff->end) {
- sb_queue_tail(ipvs, ms);
- ms->sync_buff = NULL;
- }
spin_unlock_bh(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
@@ -612,7 +614,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
pkts = atomic_add_return(1, &cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
- ip_vs_sync_conn(net, cp->control, pkts);
+ ip_vs_sync_conn(ipvs, cp, pkts);
}
}
@@ -621,9 +623,8 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
* Called by ip_vs_in.
* Sending Version 1 messages
*/
-void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
+void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg *m;
union ip_vs_sync_conn *s;
struct ip_vs_sync_buff *buff;
@@ -634,7 +635,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
/* Handle old version of the protocol */
if (sysctl_sync_ver(ipvs) == 0) {
- ip_vs_sync_conn_v0(net, cp, pkts);
+ ip_vs_sync_conn_v0(ipvs, cp, pkts);
return;
}
/* Do not sync ONE PACKET */
@@ -694,7 +695,7 @@ sloop:
}
if (!buff) {
- buff = ip_vs_sync_buff_create(ipvs);
+ buff = ip_vs_sync_buff_create(ipvs, len);
if (!buff) {
spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
@@ -781,21 +782,21 @@ control:
* fill_param used by version 1
*/
static inline int
-ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc,
struct ip_vs_conn_param *p,
__u8 *pe_data, unsigned int pe_data_len,
__u8 *pe_name, unsigned int pe_name_len)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
- ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+ ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol,
(const union nf_inet_addr *)&sc->v6.caddr,
sc->v6.cport,
(const union nf_inet_addr *)&sc->v6.vaddr,
sc->v6.vport, p);
else
#endif
- ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+ ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol,
(const union nf_inet_addr *)&sc->v4.caddr,
sc->v4.cport,
(const union nf_inet_addr *)&sc->v4.vaddr,
@@ -834,7 +835,7 @@ ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
* Param: ...
* timeout is in sec.
*/
-static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param,
unsigned int flags, unsigned int state,
unsigned int protocol, unsigned int type,
const union nf_inet_addr *daddr, __be16 dport,
@@ -843,7 +844,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
{
struct ip_vs_dest *dest;
struct ip_vs_conn *cp;
- struct netns_ipvs *ipvs = net_ipvs(net);
if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
cp = ip_vs_conn_in_get(param);
@@ -901,7 +901,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
* with synchronization, so we can make the assumption that
* the svc_af is the same as the dest_af
*/
- dest = ip_vs_find_dest(net, type, type, daddr, dport,
+ dest = ip_vs_find_dest(ipvs, type, type, daddr, dport,
param->vaddr, param->vport, protocol,
fwmark, flags);
@@ -938,7 +938,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
} else {
struct ip_vs_proto_data *pd;
- pd = ip_vs_proto_data_get(net, protocol);
+ pd = ip_vs_proto_data_get(ipvs, protocol);
if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
cp->timeout = pd->timeout_table[state];
else
@@ -950,7 +950,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
/*
* Process received multicast message for Version 0
*/
-static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer,
const size_t buflen)
{
struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
@@ -1006,14 +1006,14 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer,
}
}
- ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+ ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
(const union nf_inet_addr *)&s->caddr,
s->cport,
(const union nf_inet_addr *)&s->vaddr,
s->vport, &param);
/* Send timeout as Zero */
- ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+ ip_vs_proc_conn(ipvs, &param, flags, state, s->protocol, AF_INET,
(union nf_inet_addr *)&s->daddr, s->dport,
0, 0, opt);
}
@@ -1064,7 +1064,7 @@ static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
/*
* Process a Version 1 sync. connection
*/
-static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end)
{
struct ip_vs_sync_conn_options opt;
union ip_vs_sync_conn *s;
@@ -1168,21 +1168,21 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
state = 0;
}
}
- if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+ if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data,
pe_data_len, pe_name, pe_name_len)) {
retc = 50;
goto out;
}
/* If only IPv4, just silent skip IPv6 */
if (af == AF_INET)
- ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+ ip_vs_proc_conn(ipvs, &param, flags, state, s->v4.protocol, af,
(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
);
#ifdef CONFIG_IP_VS_IPV6
else
- ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+ ip_vs_proc_conn(ipvs, &param, flags, state, s->v6.protocol, af,
(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
@@ -1201,10 +1201,9 @@ out:
* ip_vs_conn entries.
* Handles Version 0 & 1
*/
-static void ip_vs_process_message(struct net *net, __u8 *buffer,
+static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
const size_t buflen)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
__u8 *p, *msg_end;
int i, nr_conns;
@@ -1219,7 +1218,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
return;
}
/* SyncID sanity check */
- if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
return;
}
@@ -1254,7 +1253,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
return;
}
/* Process a single sync_conn */
- retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ retc = ip_vs_proc_sync_conn(ipvs, p, msg_end);
if (retc < 0) {
IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
retc);
@@ -1265,7 +1264,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
}
} else {
/* Old type of message */
- ip_vs_process_message_v0(net, buffer, buflen);
+ ip_vs_process_message_v0(ipvs, buffer, buflen);
return;
}
}
@@ -1303,6 +1302,14 @@ static void set_mcast_loop(struct sock *sk, u_char loop)
/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
lock_sock(sk);
inet->mc_loop = loop ? 1 : 0;
+#ifdef CONFIG_IP_VS_IPV6
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ /* IPV6_MULTICAST_LOOP */
+ np->mc_loop = loop ? 1 : 0;
+ }
+#endif
release_sock(sk);
}
@@ -1316,6 +1323,33 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
lock_sock(sk);
inet->mc_ttl = ttl;
+#ifdef CONFIG_IP_VS_IPV6
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ /* IPV6_MULTICAST_HOPS */
+ np->mcast_hops = ttl;
+ }
+#endif
+ release_sock(sk);
+}
+
+/* Control fragmentation of messages */
+static void set_mcast_pmtudisc(struct sock *sk, int val)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
+ lock_sock(sk);
+ inet->pmtudisc = val;
+#ifdef CONFIG_IP_VS_IPV6
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ /* IPV6_MTU_DISCOVER */
+ np->pmtudisc = val;
+ }
+#endif
release_sock(sk);
}
@@ -1338,44 +1372,15 @@ static int set_mcast_if(struct sock *sk, char *ifname)
lock_sock(sk);
inet->mc_index = dev->ifindex;
/* inet->mc_addr = 0; */
- release_sock(sk);
-
- return 0;
-}
-
+#ifdef CONFIG_IP_VS_IPV6
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
-/*
- * Set the maximum length of sync message according to the
- * specified interface's MTU.
- */
-static int set_sync_mesg_maxlen(struct net *net, int sync_state)
-{
- struct netns_ipvs *ipvs = net_ipvs(net);
- struct net_device *dev;
- int num;
-
- if (sync_state == IP_VS_STATE_MASTER) {
- dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
- if (!dev)
- return -ENODEV;
-
- num = (dev->mtu - sizeof(struct iphdr) -
- sizeof(struct udphdr) -
- SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
- ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
- SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
- IP_VS_DBG(7, "setting the maximum length of sync sending "
- "message %d.\n", ipvs->send_mesg_maxlen);
- } else if (sync_state == IP_VS_STATE_BACKUP) {
- dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
- if (!dev)
- return -ENODEV;
-
- ipvs->recv_mesg_maxlen = dev->mtu -
- sizeof(struct iphdr) - sizeof(struct udphdr);
- IP_VS_DBG(7, "setting the maximum length of sync receiving "
- "message %d.\n", ipvs->recv_mesg_maxlen);
+ /* IPV6_MULTICAST_IF */
+ np->mcast_oif = dev->ifindex;
}
+#endif
+ release_sock(sk);
return 0;
}
@@ -1405,15 +1410,34 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
mreq.imr_ifindex = dev->ifindex;
- rtnl_lock();
lock_sock(sk);
ret = ip_mc_join_group(sk, &mreq);
release_sock(sk);
- rtnl_unlock();
return ret;
}
+#ifdef CONFIG_IP_VS_IPV6
+static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
+ char *ifname)
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev;
+ int ret;
+
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
+ return -ENODEV;
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ lock_sock(sk);
+ ret = ipv6_sock_mc_join(sk, dev->ifindex, addr);
+ release_sock(sk);
+
+ return ret;
+}
+#endif
static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
@@ -1442,53 +1466,69 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
}
+static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
+ struct ipvs_sync_daemon_cfg *c, int id)
+{
+ if (AF_INET6 == c->mcast_af) {
+ sa->in6 = (struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_port = htons(c->mcast_port + id),
+ };
+ sa->in6.sin6_addr = c->mcast_group.in6;
+ *salen = sizeof(sa->in6);
+ } else {
+ sa->in = (struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_port = htons(c->mcast_port + id),
+ };
+ sa->in.sin_addr = c->mcast_group.in;
+ *salen = sizeof(sa->in);
+ }
+}
+
/*
* Set up sending multicast socket over UDP
*/
-static struct socket *make_send_sock(struct net *net, int id)
+static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
/* multicast addr */
- struct sockaddr_in mcast_addr = {
- .sin_family = AF_INET,
- .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
- .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
- };
+ union ipvs_sockaddr mcast_addr;
struct socket *sock;
- int result;
+ int result, salen;
- /* First create a socket move it to right name space later */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ /* First create a socket */
+ result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
+ IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
- /*
- * Kernel sockets that are a part of a namespace, should not
- * hold a reference to a namespace in order to allow to stop it.
- * After sk_change_net should be released using sk_release_kernel.
- */
- sk_change_net(sock->sk, net);
- result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
+ result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn);
if (result < 0) {
pr_err("Error setting outbound mcast interface\n");
goto error;
}
set_mcast_loop(sock->sk, 0);
- set_mcast_ttl(sock->sk, 1);
+ set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
+ /* Allow fragmentation if MTU changes */
+ set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
result = sysctl_sync_sock_size(ipvs);
if (result > 0)
set_sock_size(sock->sk, 1, result);
- result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
+ if (AF_INET == ipvs->mcfg.mcast_af)
+ result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn);
+ else
+ result = 0;
if (result < 0) {
pr_err("Error binding address of the mcast interface\n");
goto error;
}
+ get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
- sizeof(struct sockaddr), 0);
+ salen, 0);
if (result < 0) {
pr_err("Error connecting to the multicast addr\n");
goto error;
@@ -1497,7 +1537,7 @@ static struct socket *make_send_sock(struct net *net, int id)
return sock;
error:
- sk_release_kernel(sock->sk);
+ sock_release(sock);
return ERR_PTR(result);
}
@@ -1505,47 +1545,42 @@ error:
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket *make_receive_sock(struct net *net, int id)
+static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
/* multicast addr */
- struct sockaddr_in mcast_addr = {
- .sin_family = AF_INET,
- .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
- .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
- };
+ union ipvs_sockaddr mcast_addr;
struct socket *sock;
- int result;
+ int result, salen;
/* First create a socket */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
+ IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
- /*
- * Kernel sockets that are a part of a namespace, should not
- * hold a reference to a namespace in order to allow to stop it.
- * After sk_change_net should be released using sk_release_kernel.
- */
- sk_change_net(sock->sk, net);
/* it is equivalent to the REUSEADDR option in user-space */
sock->sk->sk_reuse = SK_CAN_REUSE;
result = sysctl_sync_sock_size(ipvs);
if (result > 0)
set_sock_size(sock->sk, 0, result);
- result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
- sizeof(struct sockaddr));
+ get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
+ result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
if (result < 0) {
pr_err("Error binding to the multicast addr\n");
goto error;
}
/* join the multicast group */
- result = join_mcast_group(sock->sk,
- (struct in_addr *) &mcast_addr.sin_addr,
- ipvs->backup_mcast_ifn);
+#ifdef CONFIG_IP_VS_IPV6
+ if (ipvs->bcfg.mcast_af == AF_INET6)
+ result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
+ ipvs->bcfg.mcast_ifn);
+ else
+#endif
+ result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
+ ipvs->bcfg.mcast_ifn);
if (result < 0) {
pr_err("Error joining to the multicast group\n");
goto error;
@@ -1554,7 +1589,7 @@ static struct socket *make_receive_sock(struct net *net, int id)
return sock;
error:
- sk_release_kernel(sock->sk);
+ sock_release(sock);
return ERR_PTR(result);
}
@@ -1646,14 +1681,14 @@ next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
- struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct netns_ipvs *ipvs = tinfo->ipvs;
struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
struct sock *sk = tinfo->sock->sk;
struct ip_vs_sync_buff *sb;
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
"syncid = %d, id = %d\n",
- ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
+ ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);
for (;;) {
sb = next_sync_buff(ipvs, ms);
@@ -1692,7 +1727,7 @@ done:
ip_vs_sync_buff_release(sb);
/* release the sending multicast socket */
- sk_release_kernel(tinfo->sock->sk);
+ sock_release(tinfo->sock);
kfree(tinfo);
return 0;
@@ -1702,12 +1737,12 @@ done:
static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
- struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct netns_ipvs *ipvs = tinfo->ipvs;
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
"syncid = %d, id = %d\n",
- ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
+ ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
while (!kthread_should_stop()) {
wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -1717,19 +1752,19 @@ static int sync_thread_backup(void *data)
/* do we have data now? */
while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
- ipvs->recv_mesg_maxlen);
+ ipvs->bcfg.sync_maxlen);
if (len <= 0) {
if (len != -EAGAIN)
pr_err("receiving message error\n");
break;
}
- ip_vs_process_message(tinfo->net, tinfo->buf, len);
+ ip_vs_process_message(ipvs, tinfo->buf, len);
}
}
/* release the sending multicast socket */
- sk_release_kernel(tinfo->sock->sk);
+ sock_release(tinfo->sock);
kfree(tinfo->buf);
kfree(tinfo);
@@ -1737,16 +1772,18 @@ static int sync_thread_backup(void *data)
}
-int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
+ int state)
{
struct ip_vs_sync_thread_data *tinfo;
struct task_struct **array = NULL, *task;
struct socket *sock;
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net_device *dev;
char *name;
int (*threadfn)(void *data);
- int id, count;
+ int id, count, hlen;
int result = -ENOMEM;
+ u16 mtu, min_mtu;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
@@ -1758,22 +1795,46 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
} else
count = ipvs->threads_mask + 1;
+ if (c->mcast_af == AF_UNSPEC) {
+ c->mcast_af = AF_INET;
+ c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
+ }
+ if (!c->mcast_port)
+ c->mcast_port = IP_VS_SYNC_PORT;
+ if (!c->mcast_ttl)
+ c->mcast_ttl = 1;
+
+ dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
+ if (!dev) {
+ pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
+ return -ENODEV;
+ }
+ hlen = (AF_INET6 == c->mcast_af) ?
+ sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
+ sizeof(struct iphdr) + sizeof(struct udphdr);
+ mtu = (state == IP_VS_STATE_BACKUP) ?
+ clamp(dev->mtu, 1500U, 65535U) : 1500U;
+ min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;
+
+ if (c->sync_maxlen)
+ c->sync_maxlen = clamp_t(unsigned int,
+ c->sync_maxlen, min_mtu,
+ 65535 - hlen);
+ else
+ c->sync_maxlen = mtu - hlen;
+
if (state == IP_VS_STATE_MASTER) {
if (ipvs->ms)
return -EEXIST;
- strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
- sizeof(ipvs->master_mcast_ifn));
- ipvs->master_syncid = syncid;
+ ipvs->mcfg = *c;
name = "ipvs-m:%d:%d";
threadfn = sync_thread_master;
} else if (state == IP_VS_STATE_BACKUP) {
if (ipvs->backup_threads)
return -EEXIST;
- strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
- sizeof(ipvs->backup_mcast_ifn));
- ipvs->backup_syncid = syncid;
+ ipvs->bcfg = *c;
name = "ipvs-b:%d:%d";
threadfn = sync_thread_backup;
} else {
@@ -1801,14 +1862,13 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
if (!array)
goto out;
}
- set_sync_mesg_maxlen(net, state);
tinfo = NULL;
for (id = 0; id < count; id++) {
if (state == IP_VS_STATE_MASTER)
- sock = make_send_sock(net, id);
+ sock = make_send_sock(ipvs, id);
else
- sock = make_receive_sock(net, id);
+ sock = make_receive_sock(ipvs, id);
if (IS_ERR(sock)) {
result = PTR_ERR(sock);
goto outtinfo;
@@ -1816,10 +1876,10 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
goto outsocket;
- tinfo->net = net;
+ tinfo->ipvs = ipvs;
tinfo->sock = sock;
if (state == IP_VS_STATE_BACKUP) {
- tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+ tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
GFP_KERNEL);
if (!tinfo->buf)
goto outtinfo;
@@ -1854,11 +1914,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
return 0;
outsocket:
- sk_release_kernel(sock->sk);
+ sock_release(sock);
outtinfo:
if (tinfo) {
- sk_release_kernel(tinfo->sock->sk);
+ sock_release(tinfo->sock);
kfree(tinfo->buf);
kfree(tinfo);
}
@@ -1880,9 +1940,8 @@ out:
}
-int stop_sync_thread(struct net *net, int state)
+int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct task_struct **array;
int id;
int retc = -EINVAL;
@@ -1948,27 +2007,24 @@ int stop_sync_thread(struct net *net, int state)
/*
* Initialize data struct for each netns
*/
-int __net_init ip_vs_sync_net_init(struct net *net)
+int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
__mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
spin_lock_init(&ipvs->sync_lock);
spin_lock_init(&ipvs->sync_buff_lock);
return 0;
}
-void ip_vs_sync_net_cleanup(struct net *net)
+void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
{
int retc;
- struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&ipvs->sync_mutex);
- retc = stop_sync_thread(net, IP_VS_STATE_MASTER);
+ retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Master Daemon\n");
- retc = stop_sync_thread(net, IP_VS_STATE_BACKUP);
+ retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Backup Daemon\n");
mutex_unlock(&ipvs->sync_mutex);
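
Two related size calculations appear in the ip_vs_sync.c hunks: start_sync_thread derives sync_maxlen from the interface MTU minus the IP/UDP header length (or clamps a user-supplied value into range), and the buffer constructors allocate max(entry length + message header, sync_maxlen) so a single oversized entry still fits. A rough user-space model of both is sketched below, assuming IPv4 header sizes and treating the entry and header lengths as illustrative constants; sync_maxlen, buff_len and clamp_uint are hypothetical helpers, not the kernel functions.

/* Rough model of the sync message sizing visible in these hunks. */
#include <stdio.h>

#define IPHDR_LEN   20
#define UDPHDR_LEN  8

static unsigned int clamp_uint(unsigned int v, unsigned int lo, unsigned int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Backup side: accept anything from 1024 up to what fits in one UDP datagram;
 * a requested value of 0 means "derive from the interface MTU". */
static unsigned int sync_maxlen(unsigned int requested, unsigned int mtu)
{
	unsigned int hlen = IPHDR_LEN + UDPHDR_LEN;
	unsigned int dev_mtu = clamp_uint(mtu, 1500, 65535);

	if (requested)
		return clamp_uint(requested, 1024, 65535 - hlen);
	return dev_mtu - hlen;
}

/* Master side: a buffer must hold at least one full entry plus the header. */
static unsigned int buff_len(unsigned int entry_len, unsigned int hdr_len,
			     unsigned int maxlen)
{
	unsigned int len = entry_len + hdr_len;

	return len > maxlen ? len : maxlen;
}

int main(void)
{
	printf("default maxlen: %u\n", sync_maxlen(0, 1500));   /* 1472 */
	printf("jumbo maxlen:   %u\n", sync_maxlen(0, 9000));   /* 8972 */
	printf("buffer length:  %u\n", buff_len(36, 8, 1472));  /* 1472 */
	return 0;
}
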
diff --git a/kernel/net/netfilter/ipvs/ip_vs_xmit.c b/kernel/net/netfilter/ipvs/ip_vs_xmit.c
index 19986ec5f..3264cb49b 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_xmit.c
@@ -130,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
- fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
FLOWI_FLAG_KNOWN_NH : 0;
@@ -213,19 +212,20 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
}
-static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
+static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
+ int rt_mode,
struct ip_vs_iphdr *ipvsh,
struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
if (skb_af == AF_INET6) {
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = ipvs->net;
if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
if (!skb->dev)
skb->dev = net->loopback_dev;
/* only send ICMP too big on first fragment */
- if (!ipvsh->fragoffs)
+ if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
IP_VS_DBG(1, "frag needed for %pI6c\n",
&ipv6_hdr(skb)->saddr);
@@ -234,8 +234,6 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
} else
#endif
{
- struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
-
/* If we're going to tunnel the packet and pmtu discovery
* is disabled, we'll just fragment it anyway
*/
@@ -243,7 +241,8 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
return true;
if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
- skb->len > mtu && !skb_is_gso(skb))) {
+ skb->len > mtu && !skb_is_gso(skb) &&
+ !ip_vs_iph_icmp(ipvsh))) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
IP_VS_DBG(1, "frag needed for %pI4\n",
@@ -257,11 +256,12 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
/* Get route to destination or remote server */
static int
-__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
+__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
+ struct ip_vs_dest *dest,
__be32 daddr, int rt_mode, __be32 *ret_saddr,
struct ip_vs_iphdr *ipvsh)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = ipvs->net;
struct ip_vs_dest_dst *dest_dst;
struct rtable *rt; /* Route to the other host */
int mtu;
@@ -337,7 +337,7 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
maybe_update_pmtu(skb_af, skb, mtu);
}
- if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu))
+ if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
goto err_put;
skb_dst_drop(skb);
@@ -364,13 +364,16 @@ err_unreach:
#ifdef CONFIG_IP_VS_IPV6
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
- struct in6_addr *ret_saddr, int do_xfrm)
+ struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
struct dst_entry *dst;
struct flowi6 fl6 = {
.daddr = *daddr,
};
+ if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
+ fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
+
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error)
goto out_err;
@@ -400,11 +403,12 @@ out_err:
* Get route to destination or remote server
*/
static int
-__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
+__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
+ struct ip_vs_dest *dest,
struct in6_addr *daddr, struct in6_addr *ret_saddr,
struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = ipvs->net;
struct ip_vs_dest_dst *dest_dst;
struct rt6_info *rt; /* Route to the other host */
struct dst_entry *dst;
@@ -427,7 +431,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
}
dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
&dest_dst->dst_saddr.in6,
- do_xfrm);
+ do_xfrm, rt_mode);
if (!dst) {
__ip_vs_dst_set(dest, NULL, NULL, 0);
spin_unlock_bh(&dest->dst_lock);
@@ -435,7 +439,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
goto err_unreach;
}
rt = (struct rt6_info *) dst;
- cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ cookie = rt6_get_cookie(rt);
__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
@@ -446,7 +450,8 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
*ret_saddr = dest_dst->dst_saddr.in6;
} else {
noref = 0;
- dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+ dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
+ rt_mode);
if (!dst)
goto err_unreach;
rt = (struct rt6_info *) dst;
@@ -481,7 +486,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
maybe_update_pmtu(skb_af, skb, mtu);
}
- if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu))
+ if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
goto err_put;
skb_dst_drop(skb);
@@ -501,6 +506,13 @@ err_put:
return -1;
err_unreach:
+ /* The ip6_link_failure function requires the dev field to be set
+ * in order to get the net (further for the sake of fwmark
+ * reflection).
+ */
+ if (!skb->dev)
+ skb->dev = skb_dst(skb)->dev;
+
dst_link_failure(skb);
return -1;
}
@@ -519,10 +531,27 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
if (ret == NF_ACCEPT) {
nf_reset(skb);
skb_forward_csum(skb);
+ if (!skb->sk)
+ skb_sender_cpu_clear(skb);
}
return ret;
}
+/* In the event of a remote destination, it's possible that we would have
+ * matches against an old socket (particularly a TIME-WAIT socket). This
+ * causes havoc down the line (ip_local_out et. al. expect regular sockets
+ * and invalid memory accesses will happen) so simply drop the association
+ * in this case.
+*/
+static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
+{
+ /* If dev is set, the packet came from the LOCAL_IN callback and
+ * not from a local TCP socket.
+ */
+ if (skb->dev)
+ skb_orphan(skb);
+}
+
/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
struct ip_vs_conn *cp, int local)
@@ -534,12 +563,23 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 1);
+
+ /* Remove the early_demux association unless it's bound for the
+ * exact same port and address on this host after translation.
+ */
+ if (!local || cp->vport != cp->dport ||
+ !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
+ ip_vs_drop_early_demux_sk(skb);
+
if (!local) {
skb_forward_csum(skb);
- NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
- NULL, skb_dst(skb)->dev, dst_output_sk);
+ if (!skb->sk)
+ skb_sender_cpu_clear(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
+ NULL, skb_dst(skb)->dev, dst_output);
} else
ret = NF_ACCEPT;
+
return ret;
}
@@ -553,9 +593,12 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
ip_vs_notrack(skb);
if (!local) {
+ ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
- NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
- NULL, skb_dst(skb)->dev, dst_output_sk);
+ if (!skb->sk)
+ skb_sender_cpu_clear(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
+ NULL, skb_dst(skb)->dev, dst_output);
} else
ret = NF_ACCEPT;
return ret;
@@ -588,7 +631,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr,
+ if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
goto tx_error;
@@ -615,10 +658,13 @@ int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+
EnterFunction(10);
rcu_read_lock();
- if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL,
+ if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
+ &iph->daddr, NULL,
ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
goto tx_error;
@@ -665,7 +711,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
was_input = rt_is_input_route(skb_rtable(skb));
- local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR, NULL, ipvsh);
@@ -682,7 +728,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
- IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
"ip_vs_nat_xmit(): "
"stopping DNAT to local address");
goto tx_error;
@@ -692,8 +738,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* From world but DNAT to loopback address? */
if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
- IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
- "stopping DNAT to loopback address");
+ IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
+ "ip_vs_nat_xmit(): stopping DNAT to loopback "
+ "address");
goto tx_error;
}
@@ -710,7 +757,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_hdr(skb)->daddr = cp->daddr.ip;
ip_send_check(ip_hdr(skb));
- IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
+ IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
@@ -753,7 +800,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ &cp->daddr.in6,
NULL, ipvsh, 0,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
@@ -771,7 +819,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
- IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to local address");
goto tx_error;
@@ -781,8 +829,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* From world but DNAT to loopback address? */
if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
- IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+ ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to loopback address");
goto tx_error;
@@ -800,7 +848,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error;
ipv6_hdr(skb)->daddr = cp->daddr.in6;
- IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
+ IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
@@ -841,6 +889,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
struct ipv6hdr *old_ipv6h = NULL;
#endif
+ ip_vs_drop_early_demux_sk(skb);
+
if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb)
@@ -924,8 +974,8 @@ int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct net *net = skb_net(skb);
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct netns_ipvs *ipvs = cp->ipvs;
+ struct net *net = ipvs->net;
struct rtable *rt; /* Route to the other host */
__be32 saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
@@ -941,7 +991,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_CONNECT |
@@ -999,7 +1049,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
- ip_local_out(skb);
+ ip_local_out(net, skb->sk, skb);
else if (ret == NF_DROP)
kfree_skb(skb);
rcu_read_unlock();
@@ -1035,7 +1085,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ &cp->daddr.in6,
&saddr, ipvsh, 1,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
@@ -1090,7 +1141,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
- ip6_local_out(skb);
+ ip6_local_out(cp->ipvs->net, skb->sk, skb);
else if (ret == NF_DROP)
kfree_skb(skb);
rcu_read_unlock();
@@ -1122,7 +1173,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
@@ -1161,10 +1212,12 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ &cp->daddr.in6,
NULL, ipvsh, 0,
IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL);
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_KNOWN_NH);
if (local < 0)
goto tx_error;
if (local) {
@@ -1229,7 +1282,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
rcu_read_lock();
- local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
+ local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
NULL, iph);
if (local < 0)
goto tx_error;
@@ -1321,8 +1374,8 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
rcu_read_lock();
- local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
- NULL, ipvsh, 0, rt_mode);
+ local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
if (local < 0)
goto tx_error;
rt = (struct rt6_info *) skb_dst(skb);
@@ -1346,7 +1399,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* From world but DNAT to loopback address? */
if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI6\n",
__func__, &cp->daddr.in6);
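The ip_vs_xmit.c hunks above all follow one pattern: the transmitters stop deriving the network namespace from the skb and instead take it from the connection, then thread it through the route lookups and ip_local_out(). A minimal sketch of the resulting call shape, assuming the usual cp/skb/ipvsh variables of an IPVS xmit handler (condensed from the hunks above, not a complete function):

	struct netns_ipvs *ipvs = cp->ipvs;	/* netns now comes from the connection */
	struct net *net = ipvs->net;
	int local;

	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	/* ... DNAT or encapsulation ... */
	ip_local_out(net, skb->sk, skb);	/* ip_local_out() now takes net and sk */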
diff --git a/kernel/net/netfilter/nf_conntrack_core.c b/kernel/net/netfilter/nf_conntrack_core.c
index 13fad8668..3cb3cb831 100644
--- a/kernel/net/netfilter/nf_conntrack_core.c
+++ b/kernel/net/netfilter/nf_conntrack_core.c
@@ -126,7 +126,7 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
unsigned int nf_conntrack_hash_rnd __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
-static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple)
{
unsigned int n;
@@ -135,7 +135,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
* three bytes manually.
*/
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+ return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^
(((__force __u16)tuple->dst.u.all << 16) |
tuple->dst.protonum));
}
@@ -151,15 +151,15 @@ static u32 hash_bucket(u32 hash, const struct net *net)
}
static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- u16 zone, unsigned int size)
+ unsigned int size)
{
- return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
+ return __hash_bucket(hash_conntrack_raw(tuple), size);
}
-static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
+static inline u_int32_t hash_conntrack(const struct net *net,
const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, zone, net->ct.htable_size);
+ return __hash_conntrack(tuple, net->ct.htable_size);
}
bool
@@ -168,6 +168,7 @@ nf_ct_get_tuple(const struct sk_buff *skb,
unsigned int dataoff,
u_int16_t l3num,
u_int8_t protonum,
+ struct net *net,
struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
@@ -181,12 +182,13 @@ nf_ct_get_tuple(const struct sk_buff *skb,
tuple->dst.protonum = protonum;
tuple->dst.dir = IP_CT_DIR_ORIGINAL;
- return l4proto->pkt_to_tuple(skb, dataoff, tuple);
+ return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
- u_int16_t l3num, struct nf_conntrack_tuple *tuple)
+ u_int16_t l3num,
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
@@ -205,7 +207,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
l4proto = __nf_ct_l4proto_find(l3num, protonum);
- ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
+ ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
l3proto, l4proto);
rcu_read_unlock();
@@ -287,6 +289,40 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
spin_unlock(&pcpu->lock);
}
+/* Released via destroy_conntrack() */
+struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
+ const struct nf_conntrack_zone *zone,
+ gfp_t flags)
+{
+ struct nf_conn *tmpl;
+
+ tmpl = kzalloc(sizeof(*tmpl), flags);
+ if (tmpl == NULL)
+ return NULL;
+
+ tmpl->status = IPS_TEMPLATE;
+ write_pnet(&tmpl->ct_net, net);
+
+ if (nf_ct_zone_add(tmpl, flags, zone) < 0)
+ goto out_free;
+
+ atomic_set(&tmpl->ct_general.use, 0);
+
+ return tmpl;
+out_free:
+ kfree(tmpl);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
+
+void nf_ct_tmpl_free(struct nf_conn *tmpl)
+{
+ nf_ct_ext_destroy(tmpl);
+ nf_ct_ext_free(tmpl);
+ kfree(tmpl);
+}
+EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
+
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
@@ -298,6 +334,10 @@ destroy_conntrack(struct nf_conntrack *nfct)
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));
+ if (unlikely(nf_ct_is_template(ct))) {
+ nf_ct_tmpl_free(ct);
+ return;
+ }
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
if (l4proto && l4proto->destroy)
@@ -329,7 +369,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
- u16 zone = nf_ct_zone(ct);
unsigned int sequence;
nf_ct_helper_destroy(ct);
@@ -337,9 +376,9 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
local_bh_disable();
do {
sequence = read_seqcount_begin(&net->ct.generation);
- hash = hash_conntrack(net, zone,
+ hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- reply_hash = hash_conntrack(net, zone,
+ reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
@@ -387,8 +426,8 @@ static void death_by_timeout(unsigned long ul_conntrack)
static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
- const struct nf_conntrack_tuple *tuple,
- u16 zone)
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
{
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -396,8 +435,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
* so we need to check that the conntrack is confirmed
*/
return nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone(ct) == zone &&
- nf_ct_is_confirmed(ct);
+ nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
+ nf_ct_is_confirmed(ct);
}
/*
@@ -406,7 +445,7 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
* and recheck nf_ct_tuple_equal(tuple, &h->tuple)
*/
static struct nf_conntrack_tuple_hash *
-____nf_conntrack_find(struct net *net, u16 zone,
+____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
@@ -442,7 +481,7 @@ begin:
/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
-__nf_conntrack_find_get(struct net *net, u16 zone,
+__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
@@ -469,11 +508,11 @@ begin:
}
struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
+nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple, zone));
+ hash_conntrack_raw(tuple));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -492,11 +531,11 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
+ const struct nf_conntrack_zone *zone;
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- u16 zone;
unsigned int sequence;
zone = nf_ct_zone(ct);
@@ -504,9 +543,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
local_bh_disable();
do {
sequence = read_seqcount_begin(&net->ct.generation);
- hash = hash_conntrack(net, zone,
+ hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- reply_hash = hash_conntrack(net, zone,
+ reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
@@ -514,12 +553,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
&h->tuple) &&
- zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+ NF_CT_DIRECTION(h)))
goto out;
hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
&h->tuple) &&
- zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+ NF_CT_DIRECTION(h)))
goto out;
add_timer(&ct->timeout);
@@ -540,32 +581,11 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
-/* deletion from this larval template list happens via nf_ct_put() */
-void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
-{
- struct ct_pcpu *pcpu;
-
- __set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
- __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
- nf_conntrack_get(&tmpl->ct_general);
-
- /* add this conntrack to the (per cpu) tmpl list */
- local_bh_disable();
- tmpl->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
-
- spin_lock(&pcpu->lock);
- /* Overload tuple linked list to put us in template list. */
- hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->tmpl);
- spin_unlock_bh(&pcpu->lock);
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
-
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
+ const struct nf_conntrack_zone *zone;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@@ -574,7 +594,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
- u16 zone;
unsigned int sequence;
ct = nf_ct_get(skb, &ctinfo);
@@ -595,7 +614,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
/* reuse the hash saved before */
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
hash = hash_bucket(hash, net);
- reply_hash = hash_conntrack(net, zone,
+ reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
@@ -627,12 +646,14 @@ __nf_conntrack_confirm(struct sk_buff *skb)
hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
&h->tuple) &&
- zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+ NF_CT_DIRECTION(h)))
goto out;
hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
&h->tuple) &&
- zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+ NF_CT_DIRECTION(h)))
goto out;
/* Timer relative to confirmation time, not original
@@ -685,11 +706,14 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack)
{
struct net *net = nf_ct_net(ignored_conntrack);
+ const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
struct nf_conn *ct;
- u16 zone = nf_ct_zone(ignored_conntrack);
- unsigned int hash = hash_conntrack(net, zone, tuple);
+ unsigned int hash;
+
+ zone = nf_ct_zone(ignored_conntrack);
+ hash = hash_conntrack(net, tuple);
/* Disable BHs the entire time since we need to disable them at
* least once for the stats anyway.
@@ -699,7 +723,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
ct = nf_ct_tuplehash_to_ctrack(h);
if (ct != ignored_conntrack &&
nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone(ct) == zone) {
+ nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) {
NF_CT_STAT_INC(net, found);
rcu_read_unlock_bh();
return 1;
@@ -788,7 +812,8 @@ void init_nf_conntrack_hash_rnd(void)
}
static struct nf_conn *
-__nf_conntrack_alloc(struct net *net, u16 zone,
+__nf_conntrack_alloc(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
gfp_t gfp, u32 hash)
@@ -798,7 +823,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
if (unlikely(!nf_conntrack_hash_rnd)) {
init_nf_conntrack_hash_rnd();
/* recompute the hash as nf_conntrack_hash_rnd is initialized */
- hash = hash_conntrack_raw(orig, zone);
+ hash = hash_conntrack_raw(orig);
}
/* We don't want any race condition at early drop stage */
@@ -818,10 +843,9 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
* SLAB_DESTROY_BY_RCU.
*/
ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
- if (ct == NULL) {
- atomic_dec(&net->ct.count);
- return ERR_PTR(-ENOMEM);
- }
+ if (ct == NULL)
+ goto out;
+
spin_lock_init(&ct->lock);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
@@ -835,31 +859,24 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
memset(&ct->__nfct_init_offset[0], 0,
offsetof(struct nf_conn, proto) -
offsetof(struct nf_conn, __nfct_init_offset[0]));
-#ifdef CONFIG_NF_CONNTRACK_ZONES
- if (zone) {
- struct nf_conntrack_zone *nf_ct_zone;
- nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
- if (!nf_ct_zone)
- goto out_free;
- nf_ct_zone->id = zone;
- }
-#endif
+ if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0)
+ goto out_free;
+
/* Because we use RCU lookups, we set ct_general.use to zero before
* this is inserted in any list.
*/
atomic_set(&ct->ct_general.use, 0);
return ct;
-
-#ifdef CONFIG_NF_CONNTRACK_ZONES
out_free:
- atomic_dec(&net->ct.count);
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+out:
+ atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
-#endif
}
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+struct nf_conn *nf_conntrack_alloc(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
gfp_t gfp)
@@ -901,8 +918,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_tuple repl_tuple;
struct nf_conntrack_ecache *ecache;
struct nf_conntrack_expect *exp = NULL;
- u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
+ struct nf_conntrack_zone tmp;
unsigned int *timeouts;
if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
@@ -910,6 +928,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
return NULL;
}
+ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
hash);
if (IS_ERR(ct))
@@ -921,10 +940,13 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
- if (timeout_ext)
- timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
- else
+ if (timeout_ext) {
+ timeouts = nf_ct_timeout_data(timeout_ext);
+ if (unlikely(!timeouts))
+ timeouts = l4proto->get_timeouts(net);
+ } else {
timeouts = l4proto->get_timeouts(net);
+ }
if (!l4proto->new(ct, skb, dataoff, timeouts)) {
nf_conntrack_free(ct);
@@ -933,7 +955,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
if (timeout_ext)
- nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC);
+ nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
+ GFP_ATOMIC);
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
@@ -1004,21 +1027,23 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
int *set_reply,
enum ip_conntrack_info *ctinfo)
{
+ const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_zone tmp;
struct nf_conn *ct;
- u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
- dataoff, l3num, protonum, &tuple, l3proto,
+ dataoff, l3num, protonum, net, &tuple, l3proto,
l4proto)) {
pr_debug("resolve_normal_ct: Can't get tuple\n");
return NULL;
}
/* look for tuple match */
- hash = hash_conntrack_raw(&tuple, zone);
+ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
+ hash = hash_conntrack_raw(&tuple);
h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
@@ -1522,10 +1547,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
sz = nr_slots * sizeof(struct hlist_nulls_head);
hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
get_order(sz));
- if (!hash) {
- printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
+ if (!hash)
hash = vzalloc(sz);
- }
if (hash && nulls)
for (i = 0; i < nr_slots; i++)
@@ -1576,8 +1599,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
- bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
- hashsize);
+ bucket = __hash_conntrack(&h->tuple, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
@@ -1751,7 +1773,6 @@ int nf_conntrack_init_net(struct net *net)
spin_lock_init(&pcpu->lock);
INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
- INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
}
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
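With the per-cpu template list removed, conntrack templates become plain kzalloc'd objects created and destroyed through the two new helpers above. A minimal caller-side sketch, assuming a valid struct net *net and ignoring extension setup:

	struct nf_conntrack_zone zone;
	struct nf_conn *tmpl;

	nf_ct_zone_init(&zone, NF_CT_DEFAULT_ZONE_ID, NF_CT_DEFAULT_ZONE_DIR, 0);

	tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
	if (!tmpl)
		return -ENOMEM;

	/* ... attach helper/timeout extensions and use it as a template ... */

	nf_ct_tmpl_free(tmpl);		/* owner drops it again when done */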
diff --git a/kernel/net/netfilter/nf_conntrack_expect.c b/kernel/net/netfilter/nf_conntrack_expect.c
index 7a17070c5..acf5c7b3f 100644
--- a/kernel/net/netfilter/nf_conntrack_expect.c
+++ b/kernel/net/netfilter/nf_conntrack_expect.c
@@ -88,7 +88,8 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple
}
struct nf_conntrack_expect *
-__nf_ct_expect_find(struct net *net, u16 zone,
+__nf_ct_expect_find(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_expect *i;
@@ -100,7 +101,7 @@ __nf_ct_expect_find(struct net *net, u16 zone,
h = nf_ct_expect_dst_hash(tuple);
hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone(i->master) == zone)
+ nf_ct_zone_equal_any(i->master, zone))
return i;
}
return NULL;
@@ -109,7 +110,8 @@ EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
/* Just find a expectation corresponding to a tuple. */
struct nf_conntrack_expect *
-nf_ct_expect_find_get(struct net *net, u16 zone,
+nf_ct_expect_find_get(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_expect *i;
@@ -127,7 +129,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
/* If an expectation for this connection is found, it gets delete from
* global list then returned. */
struct nf_conntrack_expect *
-nf_ct_find_expectation(struct net *net, u16 zone,
+nf_ct_find_expectation(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_expect *i, *exp = NULL;
@@ -140,7 +143,7 @@ nf_ct_find_expectation(struct net *net, u16 zone,
hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone(i->master) == zone) {
+ nf_ct_zone_equal_any(i->master, zone)) {
exp = i;
break;
}
@@ -219,16 +222,17 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
}
- return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
+ return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
+ nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
static inline int expect_matches(const struct nf_conntrack_expect *a,
const struct nf_conntrack_expect *b)
{
return a->master == b->master && a->class == b->class &&
- nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
- nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
- nf_ct_zone(a->master) == nf_ct_zone(b->master);
+ nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
+ nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
+ nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
/* Generally a bad idea to call this: could have matched already. */
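The expectation lookups above now compare zones through nf_ct_zone_equal_any() instead of comparing two u16 values. A plausible shape for that helper, shown purely as an assumption (the real definition lives in include/net/netfilter/nf_conntrack_zones.h and is not part of this diff):

	static inline bool nf_ct_zone_equal_any(const struct nf_conn *a,
						const struct nf_conntrack_zone *b)
	{
	#ifdef CONFIG_NF_CONNTRACK_ZONES
		return nf_ct_zone(a)->id == b->id;	/* id match, either direction */
	#else
		return true;
	#endif
	}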
diff --git a/kernel/net/netfilter/nf_conntrack_h323_main.c b/kernel/net/netfilter/nf_conntrack_h323_main.c
index 1d69f5b97..9511af04d 100644
--- a/kernel/net/netfilter/nf_conntrack_h323_main.c
+++ b/kernel/net/netfilter/nf_conntrack_h323_main.c
@@ -779,8 +779,8 @@ static int callforward_do_filter(struct net *net,
flowi6_to_flowi(&fl1), false)) {
if (!afinfo->route(net, (struct dst_entry **)&rt2,
flowi6_to_flowi(&fl2), false)) {
- if (ipv6_addr_equal(rt6_nexthop(rt1),
- rt6_nexthop(rt2)) &&
+ if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr),
+ rt6_nexthop(rt2, &fl2.daddr)) &&
rt1->dst.dev == rt2->dst.dev)
ret = 1;
dst_release(&rt2->dst);
diff --git a/kernel/net/netfilter/nf_conntrack_labels.c b/kernel/net/netfilter/nf_conntrack_labels.c
index bb53f120e..3ce5c314e 100644
--- a/kernel/net/netfilter/nf_conntrack_labels.c
+++ b/kernel/net/netfilter/nf_conntrack_labels.c
@@ -14,6 +14,8 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>
+static spinlock_t nf_connlabels_lock;
+
static unsigned int label_bits(const struct nf_conn_labels *l)
{
unsigned int longs = l->words;
@@ -48,7 +50,6 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit)
}
EXPORT_SYMBOL_GPL(nf_connlabel_set);
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static void replace_u32(u32 *address, u32 mask, u32 new)
{
u32 old, tmp;
@@ -89,7 +90,35 @@ int nf_connlabels_replace(struct nf_conn *ct,
return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_replace);
-#endif
+
+int nf_connlabels_get(struct net *net, unsigned int n_bits)
+{
+ size_t words;
+
+ if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE))
+ return -ERANGE;
+
+ words = BITS_TO_LONGS(n_bits);
+
+ spin_lock(&nf_connlabels_lock);
+ net->ct.labels_used++;
+ if (words > net->ct.label_words)
+ net->ct.label_words = words;
+ spin_unlock(&nf_connlabels_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_get);
+
+void nf_connlabels_put(struct net *net)
+{
+ spin_lock(&nf_connlabels_lock);
+ net->ct.labels_used--;
+ if (net->ct.labels_used == 0)
+ net->ct.label_words = 0;
+ spin_unlock(&nf_connlabels_lock);
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_put);
static struct nf_ct_ext_type labels_extend __read_mostly = {
.len = sizeof(struct nf_conn_labels),
@@ -99,6 +128,7 @@ static struct nf_ct_ext_type labels_extend __read_mostly = {
int nf_conntrack_labels_init(void)
{
+ spin_lock_init(&nf_connlabels_lock);
return nf_ct_extend_register(&labels_extend);
}
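The new nf_connlabels_get()/nf_connlabels_put() pair turns the connlabel area into a refcounted per-netns resource. A minimal usage sketch; the wrapper functions and the 128-bit reservation are illustrative only, not part of this patch:

	static int my_labels_init(struct net *net)	/* hypothetical caller */
	{
		int err;

		err = nf_connlabels_get(net, 128);	/* reserve room for 128 label bits */
		if (err)
			return err;
		/* ... later set bits via nf_connlabel_set(ct, bit) ... */
		return 0;
	}

	static void my_labels_exit(struct net *net)
	{
		nf_connlabels_put(net);			/* drop the reservation again */
	}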
diff --git a/kernel/net/netfilter/nf_conntrack_netlink.c b/kernel/net/netfilter/nf_conntrack_netlink.c
index d1c23940a..9f5272968 100644
--- a/kernel/net/netfilter/nf_conntrack_netlink.c
+++ b/kernel/net/netfilter/nf_conntrack_netlink.c
@@ -128,6 +128,20 @@ ctnetlink_dump_tuples(struct sk_buff *skb,
}
static inline int
+ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype,
+ const struct nf_conntrack_zone *zone, int dir)
+{
+ if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir)
+ return 0;
+ if (nla_put_be16(skb, attrtype, htons(zone->id)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status)))
@@ -458,6 +472,7 @@ static int
ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
struct nf_conn *ct)
{
+ const struct nf_conntrack_zone *zone;
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
@@ -473,11 +488,16 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
+ zone = nf_ct_zone(ct);
+
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_ORIG) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
@@ -485,10 +505,13 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_REPL) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- if (nf_ct_zone(ct) &&
- nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
+ NF_CT_DEFAULT_ZONE_DIR) < 0)
goto nla_put_failure;
if (ctnetlink_dump_status(skb, ct) < 0 ||
@@ -598,7 +621,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
- + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */
#endif
+ ctnetlink_proto_size(ct)
+ ctnetlink_label_size(ct)
@@ -609,6 +632,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
static int
ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
{
+ const struct nf_conntrack_zone *zone;
struct net *net;
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
@@ -655,11 +679,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
nfmsg->res_id = 0;
rcu_read_lock();
+ zone = nf_ct_zone(ct);
+
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_ORIG) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
@@ -667,10 +696,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_REPL) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- if (nf_ct_zone(ct) &&
- nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
+ NF_CT_DEFAULT_ZONE_DIR) < 0)
goto nla_put_failure;
if (ctnetlink_dump_id(skb, ct) < 0)
@@ -920,15 +952,54 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr,
return ret;
}
+static int
+ctnetlink_parse_zone(const struct nlattr *attr,
+ struct nf_conntrack_zone *zone)
+{
+ nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID,
+ NF_CT_DEFAULT_ZONE_DIR, 0);
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ if (attr)
+ zone->id = ntohs(nla_get_be16(attr));
+#else
+ if (attr)
+ return -EOPNOTSUPP;
+#endif
+ return 0;
+}
+
+static int
+ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type,
+ struct nf_conntrack_zone *zone)
+{
+ int ret;
+
+ if (zone->id != NF_CT_DEFAULT_ZONE_ID)
+ return -EINVAL;
+
+ ret = ctnetlink_parse_zone(attr, zone);
+ if (ret < 0)
+ return ret;
+
+ if (type == CTA_TUPLE_REPLY)
+ zone->dir = NF_CT_ZONE_DIR_REPL;
+ else
+ zone->dir = NF_CT_ZONE_DIR_ORIG;
+
+ return 0;
+}
+
static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
[CTA_TUPLE_IP] = { .type = NLA_NESTED },
[CTA_TUPLE_PROTO] = { .type = NLA_NESTED },
+ [CTA_TUPLE_ZONE] = { .type = NLA_U16 },
};
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
struct nf_conntrack_tuple *tuple,
- enum ctattr_type type, u_int8_t l3num)
+ enum ctattr_type type, u_int8_t l3num,
+ struct nf_conntrack_zone *zone)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
@@ -955,6 +1026,16 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
if (err < 0)
return err;
+ if (tb[CTA_TUPLE_ZONE]) {
+ if (!zone)
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE],
+ type, zone);
+ if (err < 0)
+ return err;
+ }
+
/* orig and expect tuples get DIR_ORIGINAL */
if (type == CTA_TUPLE_REPLY)
tuple->dst.dir = IP_CT_DIR_REPLY;
@@ -964,21 +1045,6 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
return 0;
}
-static int
-ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)
-{
- if (attr)
-#ifdef CONFIG_NF_CONNTRACK_ZONES
- *zone = ntohs(nla_get_be16(attr));
-#else
- return -EOPNOTSUPP;
-#endif
- else
- *zone = 0;
-
- return 0;
-}
-
static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
[CTA_HELP_NAME] = { .type = NLA_NUL_STRING,
.len = NF_CT_HELPER_NAME_LEN - 1 },
@@ -1058,7 +1124,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
struct nf_conn *ct;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1066,9 +1132,11 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
return err;
if (cda[CTA_TUPLE_ORIG])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
+ u3, &zone);
else if (cda[CTA_TUPLE_REPLY])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
+ u3, &zone);
else {
return ctnetlink_flush_conntrack(net, cda,
NETLINK_CB(skb).portid,
@@ -1078,7 +1146,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, zone, &tuple);
+ h = nf_conntrack_find_get(net, &zone, &tuple);
if (!h)
return -ENOENT;
@@ -1112,7 +1180,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
struct sk_buff *skb2 = NULL;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -1138,16 +1206,18 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
return err;
if (cda[CTA_TUPLE_ORIG])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
+ u3, &zone);
else if (cda[CTA_TUPLE_REPLY])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
+ u3, &zone);
else
return -EINVAL;
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, zone, &tuple);
+ h = nf_conntrack_find_get(net, &zone, &tuple);
if (!h)
return -ENOENT;
@@ -1645,7 +1715,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
}
static struct nf_conn *
-ctnetlink_create_conntrack(struct net *net, u16 zone,
+ctnetlink_create_conntrack(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nlattr * const cda[],
struct nf_conntrack_tuple *otuple,
struct nf_conntrack_tuple *rtuple,
@@ -1761,7 +1832,8 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
struct nf_conntrack_tuple_hash *master_h;
struct nf_conn *master_ct;
- err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3);
+ err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER,
+ u3, NULL);
if (err < 0)
goto err2;
@@ -1804,7 +1876,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nf_conn *ct;
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1812,21 +1884,23 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
return err;
if (cda[CTA_TUPLE_ORIG]) {
- err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3);
+ err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG,
+ u3, &zone);
if (err < 0)
return err;
}
if (cda[CTA_TUPLE_REPLY]) {
- err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3);
+ err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY,
+ u3, &zone);
if (err < 0)
return err;
}
if (cda[CTA_TUPLE_ORIG])
- h = nf_conntrack_find_get(net, zone, &otuple);
+ h = nf_conntrack_find_get(net, &zone, &otuple);
else if (cda[CTA_TUPLE_REPLY])
- h = nf_conntrack_find_get(net, zone, &rtuple);
+ h = nf_conntrack_find_get(net, &zone, &rtuple);
if (h == NULL) {
err = -ENOENT;
@@ -1836,7 +1910,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
return -EINVAL;
- ct = ctnetlink_create_conntrack(net, zone, cda, &otuple,
+ ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple,
&rtuple, u3);
if (IS_ERR(ct))
return PTR_ERR(ct);
@@ -2059,9 +2133,9 @@ ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *mask);
-#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
static size_t
-ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
+ctnetlink_glue_build_size(const struct nf_conn *ct)
{
return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+ 3 * nla_total_size(0) /* CTA_TUPLE_IP */
@@ -2082,23 +2156,40 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
- + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */
#endif
+ ctnetlink_proto_size(ct)
;
}
-static int
-ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
+static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, ctinfo);
+ if (ct && nf_ct_is_untracked(ct))
+ ct = NULL;
+
+ return ct;
+}
+
+static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
{
+ const struct nf_conntrack_zone *zone;
struct nlattr *nest_parms;
rcu_read_lock();
+ zone = nf_ct_zone(ct);
+
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_ORIG) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
@@ -2106,12 +2197,14 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_REPL) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- if (nf_ct_zone(ct)) {
- if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
- goto nla_put_failure;
- }
+ if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
+ NF_CT_DEFAULT_ZONE_DIR) < 0)
+ goto nla_put_failure;
if (ctnetlink_dump_id(skb, ct) < 0)
goto nla_put_failure;
@@ -2154,7 +2247,32 @@ nla_put_failure:
}
static int
-ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
+ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ u_int16_t ct_attr, u_int16_t ct_info_attr)
+{
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, ct_attr | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (__ctnetlink_glue_build(skb, ct) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+
+ if (nla_put_be32(skb, ct_info_attr, htonl(ctinfo)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static int
+ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
{
int err;
@@ -2194,7 +2312,7 @@ ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
}
static int
-ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
+ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct)
{
struct nlattr *cda[CTA_MAX+1];
int ret;
@@ -2204,31 +2322,31 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
return ret;
spin_lock_bh(&nf_conntrack_expect_lock);
- ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
+ ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct);
spin_unlock_bh(&nf_conntrack_expect_lock);
return ret;
}
-static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda,
- const struct nf_conn *ct,
- struct nf_conntrack_tuple *tuple,
- struct nf_conntrack_tuple *mask)
+static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda,
+ const struct nf_conn *ct,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask)
{
int err;
err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
- nf_ct_l3num(ct));
+ nf_ct_l3num(ct), NULL);
if (err < 0)
return err;
return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
- nf_ct_l3num(ct));
+ nf_ct_l3num(ct), NULL);
}
static int
-ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
- u32 portid, u32 report)
+ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
+ u32 portid, u32 report)
{
struct nlattr *cda[CTA_EXPECT_MAX+1];
struct nf_conntrack_tuple tuple, mask;
@@ -2240,8 +2358,8 @@ ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
if (err < 0)
return err;
- err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda,
- ct, &tuple, &mask);
+ err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda,
+ ct, &tuple, &mask);
if (err < 0)
return err;
@@ -2268,14 +2386,24 @@ ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
return 0;
}
-static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
- .build_size = ctnetlink_nfqueue_build_size,
- .build = ctnetlink_nfqueue_build,
- .parse = ctnetlink_nfqueue_parse,
- .attach_expect = ctnetlink_nfqueue_attach_expect,
- .seq_adjust = nf_ct_tcp_seqadj_set,
+static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int diff)
+{
+ if (!(ct->status & IPS_NAT_MASK))
+ return;
+
+ nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff);
+}
+
+static struct nfnl_ct_hook ctnetlink_glue_hook = {
+ .get_ct = ctnetlink_glue_get_ct,
+ .build_size = ctnetlink_glue_build_size,
+ .build = ctnetlink_glue_build,
+ .parse = ctnetlink_glue_parse,
+ .attach_expect = ctnetlink_glue_attach_expect,
+ .seq_adjust = ctnetlink_glue_seqadj,
};
-#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
+#endif /* CONFIG_NETFILTER_NETLINK_GLUE_CT */
/***********************************************************************
* EXPECT
@@ -2612,23 +2740,22 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
- u16 zone = 0;
+ struct nf_conntrack_zone zone;
struct netlink_dump_control c = {
.dump = ctnetlink_exp_ct_dump_table,
.done = ctnetlink_exp_done,
};
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
+ u3, NULL);
if (err < 0)
return err;
- if (cda[CTA_EXPECT_ZONE]) {
- err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
- if (err < 0)
- return err;
- }
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
- h = nf_conntrack_find_get(net, zone, &tuple);
+ h = nf_conntrack_find_get(net, &zone, &tuple);
if (!h)
return -ENOENT;
@@ -2652,7 +2779,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
struct sk_buff *skb2;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -2672,16 +2799,18 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
return err;
if (cda[CTA_EXPECT_TUPLE])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+ u3, NULL);
else if (cda[CTA_EXPECT_MASTER])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
+ u3, NULL);
else
return -EINVAL;
if (err < 0)
return err;
- exp = nf_ct_expect_find_get(net, zone, &tuple);
+ exp = nf_ct_expect_find_get(net, &zone, &tuple);
if (!exp)
return -ENOENT;
@@ -2732,8 +2861,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct hlist_node *next;
u_int8_t u3 = nfmsg->nfgen_family;
+ struct nf_conntrack_zone zone;
unsigned int i;
- u16 zone;
int err;
if (cda[CTA_EXPECT_TUPLE]) {
@@ -2742,12 +2871,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
if (err < 0)
return err;
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+ u3, NULL);
if (err < 0)
return err;
/* bump usage count to 2 */
- exp = nf_ct_expect_find_get(net, zone, &tuple);
+ exp = nf_ct_expect_find_get(net, &zone, &tuple);
if (!exp)
return -ENOENT;
@@ -2849,7 +2979,8 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr,
return -EINVAL;
err = ctnetlink_parse_tuple((const struct nlattr * const *)tb,
- &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3);
+ &nat_tuple, CTA_EXPECT_NAT_TUPLE,
+ u3, NULL);
if (err < 0)
return err;
@@ -2937,7 +3068,8 @@ err_out:
}
static int
-ctnetlink_create_expect(struct net *net, u16 zone,
+ctnetlink_create_expect(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nlattr * const cda[],
u_int8_t u3, u32 portid, int report)
{
@@ -2949,13 +3081,16 @@ ctnetlink_create_expect(struct net *net, u16 zone,
int err;
/* caller guarantees that those three CTA_EXPECT_* exist */
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+ u3, NULL);
if (err < 0)
return err;
- err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
+ err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK,
+ u3, NULL);
if (err < 0)
return err;
- err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
+ err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER,
+ u3, NULL);
if (err < 0)
return err;
@@ -2995,11 +3130,6 @@ ctnetlink_create_expect(struct net *net, u16 zone,
}
err = nf_ct_expect_related_report(exp, portid, report);
- if (err < 0)
- goto err_exp;
-
- return 0;
-err_exp:
nf_ct_expect_put(exp);
err_ct:
nf_ct_put(ct);
@@ -3016,7 +3146,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
struct nf_conntrack_expect *exp;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
if (!cda[CTA_EXPECT_TUPLE]
@@ -3028,19 +3158,18 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
if (err < 0)
return err;
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+ u3, NULL);
if (err < 0)
return err;
spin_lock_bh(&nf_conntrack_expect_lock);
- exp = __nf_ct_expect_find(net, zone, &tuple);
-
+ exp = __nf_ct_expect_find(net, &zone, &tuple);
if (!exp) {
spin_unlock_bh(&nf_conntrack_expect_lock);
err = -ENOENT;
if (nlh->nlmsg_flags & NLM_F_CREATE) {
- err = ctnetlink_create_expect(net, zone, cda,
- u3,
+ err = ctnetlink_create_expect(net, &zone, cda, u3,
NETLINK_CB(skb).portid,
nlmsg_report(nlh));
}
@@ -3258,9 +3387,9 @@ static int __init ctnetlink_init(void)
pr_err("ctnetlink_init: cannot register pernet operations\n");
goto err_unreg_exp_subsys;
}
-#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
/* setup interaction between nf_queue and nf_conntrack_netlink. */
- RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook);
+ RCU_INIT_POINTER(nfnl_ct_hook, &ctnetlink_glue_hook);
#endif
return 0;
@@ -3279,8 +3408,8 @@ static void __exit ctnetlink_exit(void)
unregister_pernet_subsys(&ctnetlink_net_ops);
nfnetlink_subsys_unregister(&ctnl_exp_subsys);
nfnetlink_subsys_unregister(&ctnl_subsys);
-#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
- RCU_INIT_POINTER(nfq_ct_hook, NULL);
+#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
+ RCU_INIT_POINTER(nfnl_ct_hook, NULL);
#endif
}
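Two themes run through the ctnetlink changes: zones are parsed into a struct nf_conntrack_zone (with an optional per-direction CTA_TUPLE_ZONE nested inside each tuple), and the nfqueue-only ct hook becomes the generic nfnl_ct glue hook. A condensed sketch of the parse order the GET/DEL/NEW handlers above now share, assuming the usual cda[]/net/u3 variables of a ctnetlink handler:

	struct nf_conntrack_zone zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	int err;

	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);	/* global zone first */
	if (err < 0)
		return err;

	/* a CTA_TUPLE_ZONE inside the tuple may then set a per-direction zone */
	err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3, &zone);
	if (err < 0)
		return err;

	h = nf_conntrack_find_get(net, &zone, &tuple);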
diff --git a/kernel/net/netfilter/nf_conntrack_pptp.c b/kernel/net/netfilter/nf_conntrack_pptp.c
index 825c3e3f8..5588c7ae1 100644
--- a/kernel/net/netfilter/nf_conntrack_pptp.c
+++ b/kernel/net/netfilter/nf_conntrack_pptp.c
@@ -143,13 +143,14 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
const struct nf_conntrack_tuple *t)
{
const struct nf_conntrack_tuple_hash *h;
+ const struct nf_conntrack_zone *zone;
struct nf_conntrack_expect *exp;
struct nf_conn *sibling;
- u16 zone = nf_ct_zone(ct);
pr_debug("trying to timeout ct or exp for tuple ");
nf_ct_dump_tuple(t);
+ zone = nf_ct_zone(ct);
h = nf_conntrack_find_get(net, zone, t);
if (h) {
sibling = nf_ct_tuplehash_to_ctrack(h);
diff --git a/kernel/net/netfilter/nf_conntrack_proto_dccp.c b/kernel/net/netfilter/nf_conntrack_proto_dccp.c
index 6dd995c7c..fce1b1cca 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_dccp.c
@@ -398,7 +398,7 @@ static inline struct dccp_net *dccp_pernet(struct net *net)
}
static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
struct dccp_hdr _hdr, *dh;
diff --git a/kernel/net/netfilter/nf_conntrack_proto_generic.c b/kernel/net/netfilter/nf_conntrack_proto_generic.c
index 60865f110..86dc752e5 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_generic.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_generic.c
@@ -45,7 +45,7 @@ static inline struct nf_generic_net *generic_pernet(struct net *net)
static bool generic_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
tuple->src.u.all = 0;
tuple->dst.u.all = 0;
@@ -90,7 +90,13 @@ static int generic_packet(struct nf_conn *ct,
static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff, unsigned int *timeouts)
{
- return nf_generic_should_process(nf_ct_protonum(ct));
+ bool ret;
+
+ ret = nf_generic_should_process(nf_ct_protonum(ct));
+ if (!ret)
+ pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n",
+ nf_ct_protonum(ct));
+ return ret;
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
diff --git a/kernel/net/netfilter/nf_conntrack_proto_gre.c b/kernel/net/netfilter/nf_conntrack_proto_gre.c
index 7648674f2..a96451a7a 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_gre.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_gre.c
@@ -190,9 +190,8 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
/* gre hdr info to tuple */
static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
- struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev);
const struct gre_hdr_pptp *pgrehdr;
struct gre_hdr_pptp _pgrehdr;
__be16 srckey;
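Across dccp, generic, gre and the trackers that follow, every l4 pkt_to_tuple() callback gains a struct net * argument so protocols such as GRE no longer have to recover the namespace from the skb. The callback's new shape, as invoked by nf_ct_get_tuple() earlier in this diff (sketch of the assumed nf_conntrack_l4proto member, shown for orientation only):

	bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff,
			     struct net *net, struct nf_conntrack_tuple *tuple);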
diff --git a/kernel/net/netfilter/nf_conntrack_proto_sctp.c b/kernel/net/netfilter/nf_conntrack_proto_sctp.c
index b45da90fa..9578a7c37 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_sctp.c
@@ -42,6 +42,8 @@ static const char *const sctp_conntrack_names[] = {
"SHUTDOWN_SENT",
"SHUTDOWN_RECD",
"SHUTDOWN_ACK_SENT",
+ "HEARTBEAT_SENT",
+ "HEARTBEAT_ACKED",
};
#define SECS * HZ
@@ -57,6 +59,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
[SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000,
[SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000,
[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS,
+ [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS,
};
#define sNO SCTP_CONNTRACK_NONE
@@ -67,6 +71,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
+#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT
+#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED
#define sIV SCTP_CONNTRACK_MAX
/*
@@ -88,6 +94,10 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
to that of the SHUTDOWN chunk.
CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
the SHUTDOWN chunk. Connection is closed.
+HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow.
+HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to
+ that of the HEARTBEAT chunk. Secondary connection is
+ established.
*/
/* TODO
@@ -97,36 +107,40 @@ CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
- Check the error type in the reply dir before transitioning from
cookie echoed to closed.
- Sec 5.2.4 of RFC 2960
- - Multi Homing support.
+ - Full Multi Homing support.
*/
/* SCTP conntrack state transitions */
-static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
-/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
-/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
-/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/
-/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
+/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA},
+/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
+/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}
},
{
/* REPLY */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
-/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
-/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
-/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
-/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA},
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA},
+/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
+/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA}
}
};
@@ -142,7 +156,7 @@ static inline struct sctp_net *sctp_pernet(struct net *net)
}
static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct sctphdr *hp;
struct sctphdr _hdr;
@@ -278,9 +292,16 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
i = 8;
break;
+ case SCTP_CID_HEARTBEAT:
+ pr_debug("SCTP_CID_HEARTBEAT");
+ i = 9;
+ break;
+ case SCTP_CID_HEARTBEAT_ACK:
+ pr_debug("SCTP_CID_HEARTBEAT_ACK");
+ i = 10;
+ break;
default:
- /* Other chunks like DATA, SACK, HEARTBEAT and
- its ACK do not cause a change in state */
+ /* Other chunks like DATA or SACK do not change the state */
pr_debug("Unknown chunk type, Will stay in %s\n",
sctp_conntrack_names[cur_state]);
return cur_state;
@@ -329,6 +350,8 @@ static int sctp_packet(struct nf_conn *ct,
!test_bit(SCTP_CID_COOKIE_ECHO, map) &&
!test_bit(SCTP_CID_ABORT, map) &&
!test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
sh->vtag != ct->proto.sctp.vtag[dir]) {
pr_debug("Verification tag check failed\n");
goto out;
@@ -357,6 +380,16 @@ static int sctp_packet(struct nf_conn *ct,
/* Sec 8.5.1 (D) */
if (sh->vtag != ct->proto.sctp.vtag[dir])
goto out_unlock;
+ } else if (sch->type == SCTP_CID_HEARTBEAT ||
+ sch->type == SCTP_CID_HEARTBEAT_ACK) {
+ if (ct->proto.sctp.vtag[dir] == 0) {
+ pr_debug("Setting vtag %x for dir %d\n",
+ sh->vtag, dir);
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ } else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
+ pr_debug("Verification tag check failed\n");
+ goto out_unlock;
+ }
}
old_state = ct->proto.sctp.state;
@@ -466,6 +499,10 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* Sec 8.5.1 (A) */
return false;
}
+ } else if (sch->type == SCTP_CID_HEARTBEAT) {
+ pr_debug("Setting vtag %x for secondary conntrack\n",
+ sh->vtag);
+ ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
}
/* If it is a shutdown ack OOTB packet, we expect a return
shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
@@ -610,6 +647,8 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
[CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 },
};
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
@@ -658,6 +697,18 @@ static struct ctl_table sctp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "nf_conntrack_sctp_timeout_heartbeat_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_sctp_timeout_heartbeat_acked",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
{ }
};
@@ -730,6 +781,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
+ pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT];
+ pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED];
#endif
return 0;
}
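
The SCTP tracker changes above add two chunk indices (9 for HEARTBEAT, 10 for HEARTBEAT_ACK) plus the sHS/sHA columns, and heartbeat traffic now learns the verification tag per direction. A minimal sketch of how the extended table is consulted, assuming the existing sctp_conntracks array layout ([direction][chunk index][current state]) and using i for the index computed by sctp_new_state():

	/* dir is IP_CT_DIR_ORIGINAL or IP_CT_DIR_REPLY, i is the chunk
	 * index from sctp_new_state() (9 = HEARTBEAT, 10 = HEARTBEAT_ACK
	 * after this change), cur_state is the current SCTP_CONNTRACK_*
	 * value; the result is the next conntrack state.
	 */
	new_state = sctp_conntracks[dir][i][cur_state];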
diff --git a/kernel/net/netfilter/nf_conntrack_proto_tcp.c b/kernel/net/netfilter/nf_conntrack_proto_tcp.c
index 70383de72..278f3b935 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_tcp.c
@@ -277,7 +277,7 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net)
}
static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct tcphdr *hp;
struct tcphdr _hdr;
diff --git a/kernel/net/netfilter/nf_conntrack_proto_udp.c b/kernel/net/netfilter/nf_conntrack_proto_udp.c
index 6957281ff..478f92f83 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_udp.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_udp.c
@@ -38,6 +38,7 @@ static inline struct nf_udp_net *udp_pernet(struct net *net)
static bool udp_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
+ struct net *net,
struct nf_conntrack_tuple *tuple)
{
const struct udphdr *hp;
diff --git a/kernel/net/netfilter/nf_conntrack_proto_udplite.c b/kernel/net/netfilter/nf_conntrack_proto_udplite.c
index c5903d164..1ac8ee13a 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_udplite.c
@@ -48,6 +48,7 @@ static inline struct udplite_net *udplite_pernet(struct net *net)
static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
+ struct net *net,
struct nf_conntrack_tuple *tuple)
{
const struct udphdr *hp;
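
The same mechanical change runs through the SCTP, TCP, UDP and UDP-Lite trackers: the l4proto ->pkt_to_tuple callback gains a struct net * argument. A minimal sketch of the updated callback shape, modelled on the UDP handler (the function name here is illustrative, not part of the patch):

	static bool example_pkt_to_tuple(const struct sk_buff *skb,
					 unsigned int dataoff,
					 struct net *net,
					 struct nf_conntrack_tuple *tuple)
	{
		const struct udphdr *hp;
		struct udphdr _hdr;

		/* Pull the transport header; bail out if it is truncated. */
		hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
		if (hp == NULL)
			return false;

		tuple->src.u.udp.port = hp->source;
		tuple->dst.u.udp.port = hp->dest;
		return true;
	}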
diff --git a/kernel/net/netfilter/nf_conntrack_seqadj.c b/kernel/net/netfilter/nf_conntrack_seqadj.c
index ce3e840c8..dff0f0cc5 100644
--- a/kernel/net/netfilter/nf_conntrack_seqadj.c
+++ b/kernel/net/netfilter/nf_conntrack_seqadj.c
@@ -103,9 +103,9 @@ static void nf_ct_sack_block_adjust(struct sk_buff *skb,
ntohl(sack->end_seq), ntohl(new_end_seq));
inet_proto_csum_replace4(&tcph->check, skb,
- sack->start_seq, new_start_seq, 0);
+ sack->start_seq, new_start_seq, false);
inet_proto_csum_replace4(&tcph->check, skb,
- sack->end_seq, new_end_seq, 0);
+ sack->end_seq, new_end_seq, false);
sack->start_seq = new_start_seq;
sack->end_seq = new_end_seq;
sackoff += sizeof(*sack);
@@ -193,8 +193,9 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
newseq = htonl(ntohl(tcph->seq) + seqoff);
newack = htonl(ntohl(tcph->ack_seq) - ackoff);
- inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
- inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack,
+ false);
pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
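
The checksum-fixup hunks here (and in the NAT l4 protocol helpers further down) track the inet_proto_csum_replace*() signature change: the final argument is now a bool stating whether the replaced value is covered by the pseudo-header. A one-line usage sketch, with old_seq/new_seq standing in for the rewritten values:

	/* Update the TCP checksum after rewriting a 32-bit sequence field;
	 * 'false' because the field is not part of the pseudo-header.
	 */
	inet_proto_csum_replace4(&tcph->check, skb, old_seq, new_seq, false);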
diff --git a/kernel/net/netfilter/nf_conntrack_standalone.c b/kernel/net/netfilter/nf_conntrack_standalone.c
index fc823fa5d..1fb3cacc0 100644
--- a/kernel/net/netfilter/nf_conntrack_standalone.c
+++ b/kernel/net/netfilter/nf_conntrack_standalone.c
@@ -140,6 +140,35 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
}
#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct,
+ int dir)
+{
+ const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
+
+ if (zone->dir != dir)
+ return;
+ switch (zone->dir) {
+ case NF_CT_DEFAULT_ZONE_DIR:
+ seq_printf(s, "zone=%u ", zone->id);
+ break;
+ case NF_CT_ZONE_DIR_ORIG:
+ seq_printf(s, "zone-orig=%u ", zone->id);
+ break;
+ case NF_CT_ZONE_DIR_REPL:
+ seq_printf(s, "zone-reply=%u ", zone->id);
+ break;
+ default:
+ break;
+ }
+}
+#else
+static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct,
+ int dir)
+{
+}
+#endif
+
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
{
@@ -202,6 +231,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
l3proto, l4proto);
+ ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG);
+
if (seq_has_overflowed(s))
goto release;
@@ -214,6 +245,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
l3proto, l4proto);
+ ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
+
if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
goto release;
@@ -228,11 +261,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
#endif
ct_show_secctx(s, ct);
-
-#ifdef CONFIG_NF_CONNTRACK_ZONES
- seq_printf(s, "zone=%u ", nf_ct_zone(ct));
-#endif
-
+ ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR);
ct_show_delta_time(s, ct);
seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
diff --git a/kernel/net/netfilter/nf_internals.h b/kernel/net/netfilter/nf_internals.h
index ea7f36784..065522564 100644
--- a/kernel/net/netfilter/nf_internals.h
+++ b/kernel/net/netfilter/nf_internals.h
@@ -19,6 +19,7 @@ unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
/* nf_queue.c */
int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem,
struct nf_hook_state *state, unsigned int queuenum);
+void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops);
int __init netfilter_queue_init(void);
/* nf_log.c */
diff --git a/kernel/net/netfilter/nf_log.c b/kernel/net/netfilter/nf_log.c
index 675d12c69..a5d41dfa9 100644
--- a/kernel/net/netfilter/nf_log.c
+++ b/kernel/net/netfilter/nf_log.c
@@ -107,12 +107,17 @@ EXPORT_SYMBOL(nf_log_register);
void nf_log_unregister(struct nf_logger *logger)
{
+ const struct nf_logger *log;
int i;
mutex_lock(&nf_log_mutex);
- for (i = 0; i < NFPROTO_NUMPROTO; i++)
- RCU_INIT_POINTER(loggers[i][logger->type], NULL);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ log = nft_log_dereference(loggers[i][logger->type]);
+ if (log == logger)
+ RCU_INIT_POINTER(loggers[i][logger->type], NULL);
+ }
mutex_unlock(&nf_log_mutex);
+ synchronize_rcu();
}
EXPORT_SYMBOL(nf_log_unregister);
diff --git a/kernel/net/netfilter/nf_nat_core.c b/kernel/net/netfilter/nf_nat_core.c
index 4e0b47831..06a9f4577 100644
--- a/kernel/net/netfilter/nf_nat_core.c
+++ b/kernel/net/netfilter/nf_nat_core.c
@@ -83,7 +83,7 @@ out:
rcu_read_unlock();
}
-int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
+int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
{
struct flowi fl;
unsigned int hh_len;
@@ -99,7 +99,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
dst = ((struct xfrm_dst *)dst)->route;
dst_hold(dst);
- dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
+ dst = xfrm_lookup(net, dst, &fl, skb->sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
@@ -118,14 +118,13 @@ EXPORT_SYMBOL(nf_xfrm_me_harder);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
-hash_by_src(const struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
/* Original src, to ensure we map it consistently if poss. */
hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
- tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd);
+ tuple->dst.protonum ^ nf_conntrack_hash_rnd);
return reciprocal_scale(hash, net->ct.nat_htable_size);
}
@@ -185,20 +184,22 @@ same_src(const struct nf_conn *ct,
/* Only called for SRC manip */
static int
-find_appropriate_src(struct net *net, u16 zone,
+find_appropriate_src(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_nat_l3proto *l3proto,
const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *result,
const struct nf_nat_range *range)
{
- unsigned int h = hash_by_src(net, zone, tuple);
+ unsigned int h = hash_by_src(net, tuple);
const struct nf_conn_nat *nat;
const struct nf_conn *ct;
hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
ct = nat->ct;
- if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
+ if (same_src(ct, tuple) &&
+ nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
/* Copy source part from reply tuple. */
nf_ct_invert_tuplepr(result,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -218,7 +219,8 @@ find_appropriate_src(struct net *net, u16 zone,
* the ip with the lowest src-ip/dst-ip/proto usage.
*/
static void
-find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
+find_best_ips_proto(const struct nf_conntrack_zone *zone,
+ struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
const struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
@@ -258,7 +260,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
*/
j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
range->flags & NF_NAT_RANGE_PERSISTENT ?
- 0 : (__force u32)tuple->dst.u3.all[max] ^ zone);
+ 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);
full_range = false;
for (i = 0; i <= max; i++) {
@@ -297,10 +299,12 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
+ const struct nf_conntrack_zone *zone;
const struct nf_nat_l3proto *l3proto;
const struct nf_nat_l4proto *l4proto;
struct net *net = nf_ct_net(ct);
- u16 zone = nf_ct_zone(ct);
+
+ zone = nf_ct_zone(ct);
rcu_read_lock();
l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
@@ -420,7 +424,7 @@ nf_nat_setup_info(struct nf_conn *ct,
if (maniptype == NF_NAT_MANIP_SRC) {
unsigned int srchash;
- srchash = hash_by_src(net, nf_ct_zone(ct),
+ srchash = hash_by_src(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_lock);
/* nf_conntrack_alter_reply might re-allocate extension area */
diff --git a/kernel/net/netfilter/nf_nat_proto_dccp.c b/kernel/net/netfilter/nf_nat_proto_dccp.c
index b8067b53f..15c47b246 100644
--- a/kernel/net/netfilter/nf_nat_proto_dccp.c
+++ b/kernel/net/netfilter/nf_nat_proto_dccp.c
@@ -69,7 +69,7 @@ dccp_manip_pkt(struct sk_buff *skb,
l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum,
tuple, maniptype);
inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
- 0);
+ false);
return true;
}
diff --git a/kernel/net/netfilter/nf_nat_proto_tcp.c b/kernel/net/netfilter/nf_nat_proto_tcp.c
index 37f5505f4..4f8820fc5 100644
--- a/kernel/net/netfilter/nf_nat_proto_tcp.c
+++ b/kernel/net/netfilter/nf_nat_proto_tcp.c
@@ -70,7 +70,7 @@ tcp_manip_pkt(struct sk_buff *skb,
return true;
l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
- inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
+ inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false);
return true;
}
diff --git a/kernel/net/netfilter/nf_nat_proto_udp.c b/kernel/net/netfilter/nf_nat_proto_udp.c
index b0ede2f0d..b1e627227 100644
--- a/kernel/net/netfilter/nf_nat_proto_udp.c
+++ b/kernel/net/netfilter/nf_nat_proto_udp.c
@@ -57,7 +57,7 @@ udp_manip_pkt(struct sk_buff *skb,
l3proto->csum_update(skb, iphdroff, &hdr->check,
tuple, maniptype);
inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
- 0);
+ false);
if (!hdr->check)
hdr->check = CSUM_MANGLED_0;
}
diff --git a/kernel/net/netfilter/nf_nat_proto_udplite.c b/kernel/net/netfilter/nf_nat_proto_udplite.c
index 368f14e01..58340c97b 100644
--- a/kernel/net/netfilter/nf_nat_proto_udplite.c
+++ b/kernel/net/netfilter/nf_nat_proto_udplite.c
@@ -56,7 +56,7 @@ udplite_manip_pkt(struct sk_buff *skb,
}
l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
- inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
+ inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false);
if (!hdr->check)
hdr->check = CSUM_MANGLED_0;
diff --git a/kernel/net/netfilter/nf_nat_redirect.c b/kernel/net/netfilter/nf_nat_redirect.c
index 97b75f9bf..d43869879 100644
--- a/kernel/net/netfilter/nf_nat_redirect.c
+++ b/kernel/net/netfilter/nf_nat_redirect.c
@@ -55,7 +55,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
rcu_read_lock();
indev = __in_dev_get_rcu(skb->dev);
- if (indev != NULL) {
+ if (indev && indev->ifa_list) {
ifa = indev->ifa_list;
newdst = ifa->ifa_local;
}
diff --git a/kernel/net/netfilter/nf_queue.c b/kernel/net/netfilter/nf_queue.c
index 2e88032cd..5baa8e24e 100644
--- a/kernel/net/netfilter/nf_queue.c
+++ b/kernel/net/netfilter/nf_queue.c
@@ -69,19 +69,14 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
dev_put(physdev);
}
#endif
- /* Drop reference to owner of hook which queued us. */
- module_put(entry->elem->owner);
}
EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
/* Bump dev refs so they don't vanish while packet is out */
-bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
- if (!try_module_get(entry->elem->owner))
- return false;
-
if (state->in)
dev_hold(state->in);
if (state->out)
@@ -100,11 +95,20 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
dev_hold(physdev);
}
#endif
-
- return true;
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
+void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
+{
+ const struct nf_queue_handler *qh;
+
+ rcu_read_lock();
+ qh = rcu_dereference(queue_handler);
+ if (qh)
+ qh->nf_hook_drop(net, ops);
+ rcu_read_unlock();
+}
+
/*
* Any packet that leaves via this function must come back
* through nf_reinject().
@@ -120,22 +124,20 @@ int nf_queue(struct sk_buff *skb,
const struct nf_queue_handler *qh;
/* QUEUE == DROP if no one is waiting, to be safe. */
- rcu_read_lock();
-
qh = rcu_dereference(queue_handler);
if (!qh) {
status = -ESRCH;
- goto err_unlock;
+ goto err;
}
afinfo = nf_get_afinfo(state->pf);
if (!afinfo)
- goto err_unlock;
+ goto err;
entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
if (!entry) {
status = -ENOMEM;
- goto err_unlock;
+ goto err;
}
*entry = (struct nf_queue_entry) {
@@ -145,16 +147,11 @@ int nf_queue(struct sk_buff *skb,
.size = sizeof(*entry) + afinfo->route_key_size,
};
- if (!nf_queue_entry_get_refs(entry)) {
- status = -ECANCELED;
- goto err_unlock;
- }
+ nf_queue_entry_get_refs(entry);
skb_dst_force(skb);
afinfo->saveroute(skb, entry);
status = qh->outfn(entry, queuenum);
- rcu_read_unlock();
-
if (status < 0) {
nf_queue_entry_release_refs(entry);
goto err;
@@ -162,8 +159,6 @@ int nf_queue(struct sk_buff *skb,
return 0;
-err_unlock:
- rcu_read_unlock();
err:
kfree(entry);
return status;
@@ -176,19 +171,15 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
const struct nf_afinfo *afinfo;
int err;
- rcu_read_lock();
-
nf_queue_entry_release_refs(entry);
/* Continue traversal iff userspace said ok... */
- if (verdict == NF_REPEAT) {
- elem = list_entry(elem->list.prev, struct nf_hook_ops, list);
- verdict = NF_ACCEPT;
- }
+ if (verdict == NF_REPEAT)
+ verdict = elem->hook(elem->priv, skb, &entry->state);
if (verdict == NF_ACCEPT) {
afinfo = nf_get_afinfo(entry->state.pf);
- if (!afinfo || afinfo->reroute(skb, entry) < 0)
+ if (!afinfo || afinfo->reroute(entry->state.net, skb, entry) < 0)
verdict = NF_DROP;
}
@@ -196,7 +187,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
if (verdict == NF_ACCEPT) {
next_hook:
- verdict = nf_iterate(&nf_hooks[entry->state.pf][entry->state.hook],
+ verdict = nf_iterate(entry->state.hook_list,
skb, &entry->state, &elem);
}
@@ -204,15 +195,13 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
case NF_ACCEPT:
case NF_STOP:
local_bh_disable();
- entry->state.okfn(entry->state.sk, skb);
+ entry->state.okfn(entry->state.net, entry->state.sk, skb);
local_bh_enable();
break;
case NF_QUEUE:
err = nf_queue(skb, elem, &entry->state,
verdict >> NF_VERDICT_QBITS);
if (err < 0) {
- if (err == -ECANCELED)
- goto next_hook;
if (err == -ESRCH &&
(verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
goto next_hook;
@@ -224,7 +213,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
default:
kfree_skb(skb);
}
- rcu_read_unlock();
+
kfree(entry);
}
EXPORT_SYMBOL(nf_reinject);
diff --git a/kernel/net/netfilter/nf_synproxy_core.c b/kernel/net/netfilter/nf_synproxy_core.c
index 52e20c9a4..c8a4a48bc 100644
--- a/kernel/net/netfilter/nf_synproxy_core.c
+++ b/kernel/net/netfilter/nf_synproxy_core.c
@@ -11,15 +11,18 @@
#include <asm/unaligned.h>
#include <net/tcp.h>
#include <net/netns/generic.h>
+#include <linux/proc_fs.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_tcpudp.h>
#include <linux/netfilter/xt_SYNPROXY.h>
+
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_zones.h>
int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
@@ -185,7 +188,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
const struct nf_conn_synproxy *synproxy)
{
unsigned int optoff, optend;
- u32 *ptr, old;
+ __be32 *ptr, old;
if (synproxy->tsoff == 0)
return 1;
@@ -213,18 +216,18 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
if (op[0] == TCPOPT_TIMESTAMP &&
op[1] == TCPOLEN_TIMESTAMP) {
if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
- ptr = (u32 *)&op[2];
+ ptr = (__be32 *)&op[2];
old = *ptr;
*ptr = htonl(ntohl(*ptr) -
synproxy->tsoff);
} else {
- ptr = (u32 *)&op[6];
+ ptr = (__be32 *)&op[6];
old = *ptr;
*ptr = htonl(ntohl(*ptr) +
synproxy->tsoff);
}
inet_proto_csum_replace4(&th->check, skb,
- old, *ptr, 0);
+ old, *ptr, false);
return 1;
}
optoff += op[1];
@@ -348,23 +351,20 @@ static void __net_exit synproxy_proc_exit(struct net *net)
static int __net_init synproxy_net_init(struct net *net)
{
struct synproxy_net *snet = synproxy_pernet(net);
- struct nf_conntrack_tuple t;
struct nf_conn *ct;
int err = -ENOMEM;
- memset(&t, 0, sizeof(t));
- ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL);
- if (IS_ERR(ct)) {
- err = PTR_ERR(ct);
+ ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL);
+ if (!ct)
goto err1;
- }
if (!nfct_seqadj_ext_add(ct))
goto err2;
if (!nfct_synproxy_ext_add(ct))
goto err2;
- nf_conntrack_tmpl_insert(net, ct);
+ __set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ nf_conntrack_get(&ct->ct_general);
snet->tmpl = ct;
snet->stats = alloc_percpu(struct synproxy_stats);
@@ -380,7 +380,7 @@ static int __net_init synproxy_net_init(struct net *net)
err3:
free_percpu(snet->stats);
err2:
- nf_conntrack_free(ct);
+ nf_ct_tmpl_free(ct);
err1:
return err;
}
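
The synproxy per-net setup now allocates its conntrack template with nf_ct_tmpl_alloc() instead of nf_conntrack_alloc() plus nf_conntrack_tmpl_insert(). A condensed sketch of the resulting pattern, assuming (as the hunk's error handling implies) that the allocator returns NULL on failure:

	ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL);
	if (!ct)
		goto err1;
	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
	nf_conntrack_get(&ct->ct_general);
	/* on the error path the template is released with nf_ct_tmpl_free(ct) */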
diff --git a/kernel/net/netfilter/nf_tables_api.c b/kernel/net/netfilter/nf_tables_api.c
index 34ded0931..2cb429d34 100644
--- a/kernel/net/netfilter/nf_tables_api.c
+++ b/kernel/net/netfilter/nf_tables_api.c
@@ -89,6 +89,7 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload)
}
static void nft_ctx_init(struct nft_ctx *ctx,
+ struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
struct nft_af_info *afi,
@@ -96,7 +97,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
struct nft_chain *chain,
const struct nlattr * const *nla)
{
- ctx->net = sock_net(skb->sk);
+ ctx->net = net;
ctx->afi = afi;
ctx->table = table;
ctx->chain = chain;
@@ -127,13 +128,50 @@ static void nft_trans_destroy(struct nft_trans *trans)
kfree(trans);
}
+int nft_register_basechain(struct nft_base_chain *basechain,
+ unsigned int hook_nops)
+{
+ struct net *net = read_pnet(&basechain->pnet);
+
+ if (basechain->flags & NFT_BASECHAIN_DISABLED)
+ return 0;
+
+ return nf_register_net_hooks(net, basechain->ops, hook_nops);
+}
+EXPORT_SYMBOL_GPL(nft_register_basechain);
+
+void nft_unregister_basechain(struct nft_base_chain *basechain,
+ unsigned int hook_nops)
+{
+ struct net *net = read_pnet(&basechain->pnet);
+
+ if (basechain->flags & NFT_BASECHAIN_DISABLED)
+ return;
+
+ nf_unregister_net_hooks(net, basechain->ops, hook_nops);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_basechain);
+
+static int nf_tables_register_hooks(const struct nft_table *table,
+ struct nft_chain *chain,
+ unsigned int hook_nops)
+{
+ if (table->flags & NFT_TABLE_F_DORMANT ||
+ !(chain->flags & NFT_BASE_CHAIN))
+ return 0;
+
+ return nft_register_basechain(nft_base_chain(chain), hook_nops);
+}
+
static void nf_tables_unregister_hooks(const struct nft_table *table,
- const struct nft_chain *chain,
+ struct nft_chain *chain,
unsigned int hook_nops)
{
- if (!(table->flags & NFT_TABLE_F_DORMANT) &&
- chain->flags & NFT_BASE_CHAIN)
- nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops);
+ if (table->flags & NFT_TABLE_F_DORMANT ||
+ !(chain->flags & NFT_BASE_CHAIN))
+ return;
+
+ nft_unregister_basechain(nft_base_chain(chain), hook_nops);
}
/* Internal table flags */
@@ -560,7 +598,7 @@ static int nf_tables_table_enable(const struct nft_af_info *afi,
if (!(chain->flags & NFT_BASE_CHAIN))
continue;
- err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+ err = nft_register_basechain(nft_base_chain(chain), afi->nops);
if (err < 0)
goto err;
@@ -575,20 +613,20 @@ err:
if (i-- <= 0)
break;
- nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops);
+ nft_unregister_basechain(nft_base_chain(chain), afi->nops);
}
return err;
}
static void nf_tables_table_disable(const struct nft_af_info *afi,
- struct nft_table *table)
+ struct nft_table *table)
{
struct nft_chain *chain;
list_for_each_entry(chain, &table->chains, list) {
if (chain->flags & NFT_BASE_CHAIN)
- nf_unregister_hooks(nft_base_chain(chain)->ops,
- afi->nops);
+ nft_unregister_basechain(nft_base_chain(chain),
+ afi->nops);
}
}
@@ -635,15 +673,14 @@ err:
return ret;
}
-static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newtable(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nlattr *name;
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
u32 flags = 0;
struct nft_ctx ctx;
@@ -669,7 +706,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
return nf_tables_updtable(&ctx);
}
@@ -679,30 +716,32 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
return -EINVAL;
}
+ err = -EAFNOSUPPORT;
if (!try_module_get(afi->owner))
- return -EAFNOSUPPORT;
+ goto err1;
err = -ENOMEM;
table = kzalloc(sizeof(*table), GFP_KERNEL);
if (table == NULL)
- goto err1;
+ goto err2;
nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
INIT_LIST_HEAD(&table->chains);
INIT_LIST_HEAD(&table->sets);
table->flags = flags;
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
if (err < 0)
- goto err2;
+ goto err3;
list_add_tail_rcu(&table->list, &afi->tables);
return 0;
-err2:
+err3:
kfree(table);
-err1:
+err2:
module_put(afi->owner);
+err1:
return err;
}
@@ -771,18 +810,17 @@ out:
return err;
}
-static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_deltable(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
- nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla);
if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
return nft_flush(&ctx, family);
@@ -881,6 +919,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
[NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 },
[NFTA_HOOK_PRIORITY] = { .type = NLA_U32 },
+ [NFTA_HOOK_DEV] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
};
static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
@@ -954,6 +994,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
goto nla_put_failure;
+ if (basechain->dev_name[0] &&
+ nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
@@ -1165,16 +1208,20 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
BUG_ON(chain->use > 0);
if (chain->flags & NFT_BASE_CHAIN) {
- module_put(nft_base_chain(chain)->type->owner);
- free_percpu(nft_base_chain(chain)->stats);
- kfree(nft_base_chain(chain));
+ struct nft_base_chain *basechain = nft_base_chain(chain);
+
+ module_put(basechain->type->owner);
+ free_percpu(basechain->stats);
+ if (basechain->ops[0].dev != NULL)
+ dev_put(basechain->ops[0].dev);
+ kfree(basechain);
} else {
kfree(chain);
}
}
-static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newchain(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -1184,8 +1231,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
struct nft_chain *chain;
struct nft_base_chain *basechain = NULL;
struct nlattr *ha[NFTA_HOOK_MAX + 1];
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct net_device *dev = NULL;
u8 policy = NF_ACCEPT;
u64 handle = 0;
unsigned int i;
@@ -1264,7 +1311,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
return PTR_ERR(stats);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN,
sizeof(struct nft_trans_chain));
if (trans == NULL) {
@@ -1325,17 +1372,43 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
return -ENOENT;
hookfn = type->hooks[hooknum];
+ if (afi->flags & NFT_AF_NEEDS_DEV) {
+ char ifname[IFNAMSIZ];
+
+ if (!ha[NFTA_HOOK_DEV]) {
+ module_put(type->owner);
+ return -EOPNOTSUPP;
+ }
+
+ nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
+ dev = dev_get_by_name(net, ifname);
+ if (!dev) {
+ module_put(type->owner);
+ return -ENOENT;
+ }
+ } else if (ha[NFTA_HOOK_DEV]) {
+ module_put(type->owner);
+ return -EOPNOTSUPP;
+ }
+
basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
if (basechain == NULL) {
module_put(type->owner);
+ if (dev != NULL)
+ dev_put(dev);
return -ENOMEM;
}
+ if (dev != NULL)
+ strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
+
if (nla[NFTA_CHAIN_COUNTERS]) {
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
if (IS_ERR(stats)) {
module_put(type->owner);
kfree(basechain);
+ if (dev != NULL)
+ dev_put(dev);
return PTR_ERR(stats);
}
basechain->stats = stats;
@@ -1344,6 +1417,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
if (stats == NULL) {
module_put(type->owner);
kfree(basechain);
+ if (dev != NULL)
+ dev_put(dev);
return -ENOMEM;
}
rcu_assign_pointer(basechain->stats, stats);
@@ -1356,11 +1431,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
for (i = 0; i < afi->nops; i++) {
ops = &basechain->ops[i];
ops->pf = family;
- ops->owner = afi->owner;
ops->hooknum = hooknum;
ops->priority = priority;
ops->priv = chain;
ops->hook = afi->hooks[ops->hooknum];
+ ops->dev = dev;
if (hookfn)
ops->hook = hookfn;
if (afi->hook_ops_init)
@@ -1380,14 +1455,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
chain->table = table;
nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
- if (!(table->flags & NFT_TABLE_F_DORMANT) &&
- chain->flags & NFT_BASE_CHAIN) {
- err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
- if (err < 0)
- goto err1;
- }
+ err = nf_tables_register_hooks(table, chain, afi->nops);
+ if (err < 0)
+ goto err1;
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN);
if (err < 0)
goto err2;
@@ -1402,15 +1474,14 @@ err1:
return err;
}
-static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delchain(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
struct nft_chain *chain;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
@@ -1432,7 +1503,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
if (chain->use > 0)
return -EBUSY;
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
return nft_delchain(&ctx);
}
@@ -1936,13 +2007,12 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
static struct nft_expr_info *info;
-static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newrule(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_chain *chain;
struct nft_rule *rule, *old_rule = NULL;
@@ -2001,7 +2071,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
return PTR_ERR(old_rule);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
n = 0;
size = 0;
@@ -2102,13 +2172,12 @@ err1:
return err;
}
-static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delrule(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_chain *chain = NULL;
struct nft_rule *rule;
@@ -2131,7 +2200,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
return PTR_ERR(chain);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
@@ -2270,12 +2339,11 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
- struct net *net = sock_net(skb->sk);
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi = NULL;
struct nft_table *table = NULL;
@@ -2297,7 +2365,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
return -ENOENT;
}
- nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
return 0;
}
@@ -2549,6 +2617,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_ctx ctx;
struct sk_buff *skb2;
@@ -2556,7 +2625,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
int err;
/* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
@@ -2619,14 +2688,13 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
return 0;
}
-static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newset(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nft_set_ops *ops;
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
@@ -2724,7 +2792,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
if (IS_ERR(table))
return PTR_ERR(table);
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
if (IS_ERR(set)) {
@@ -2808,8 +2876,8 @@ static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set
nft_set_destroy(set);
}
-static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delset(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -2822,7 +2890,7 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
if (nla[NFTA_SET_TABLE] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
@@ -2950,7 +3018,7 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
+static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -2959,7 +3027,6 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
if (IS_ERR(afi))
@@ -2971,7 +3038,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
if (!trans && (table->flags & NFT_TABLE_INACTIVE))
return -ENOENT;
- nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
return 0;
}
@@ -3061,6 +3128,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_set_dump_args args;
struct nft_ctx ctx;
@@ -3076,8 +3144,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
if (err < 0)
return err;
- err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla,
- false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh,
+ (void *)nla, false);
if (err < 0)
return err;
@@ -3138,11 +3206,12 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_ctx ctx;
int err;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false);
if (err < 0)
return err;
@@ -3454,11 +3523,10 @@ err1:
return err;
}
-static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
- struct net *net = sock_net(skb->sk);
const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
@@ -3467,7 +3535,7 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, true);
if (err < 0)
return err;
@@ -3549,8 +3617,8 @@ err1:
return err;
}
-static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nlattr *attr;
@@ -3561,7 +3629,7 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false);
if (err < 0)
return err;
@@ -3956,7 +4024,8 @@ static int nf_tables_abort(struct sk_buff *skb)
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list,
+ list) {
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
diff --git a/kernel/net/netfilter/nf_tables_core.c b/kernel/net/netfilter/nf_tables_core.c
index f153b0707..f3695a497 100644
--- a/kernel/net/netfilter/nf_tables_core.c
+++ b/kernel/net/netfilter/nf_tables_core.c
@@ -48,9 +48,7 @@ static void __nft_trace_packet(const struct nft_pktinfo *pkt,
const struct nft_chain *chain,
int rulenum, enum nft_trace type)
{
- struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
-
- nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
+ nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in,
pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
chain->table->name, chain->name, comments[type],
rulenum);
@@ -111,10 +109,10 @@ struct nft_jumpstack {
};
unsigned int
-nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
+nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
- const struct nft_chain *chain = ops->priv, *basechain = chain;
- const struct net *net = read_pnet(&nft_base_chain(basechain)->pnet);
+ const struct nft_chain *chain = priv, *basechain = chain;
+ const struct net *net = pkt->net;
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
struct nft_regs regs;
diff --git a/kernel/net/netfilter/nf_tables_netdev.c b/kernel/net/netfilter/nf_tables_netdev.c
new file mode 100644
index 000000000..edb3502f2
--- /dev/null
+++ b/kernel/net/netfilter/nf_tables_netdev.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/netfilter/nf_tables.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+
+static inline void
+nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct iphdr *iph, _iph;
+ u32 len, thoff;
+
+ nft_set_pktinfo(pkt, skb, state);
+
+ iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph),
+ &_iph);
+ if (!iph)
+ return;
+
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 || iph->version != 4)
+ return;
+
+ len = ntohs(iph->tot_len);
+ thoff = iph->ihl * 4;
+ if (skb->len < len)
+ return;
+ else if (len < thoff)
+ return;
+
+ pkt->tprot = iph->protocol;
+ pkt->xt.thoff = thoff;
+ pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET;
+}
+
+static inline void
+__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct ipv6hdr *ip6h, _ip6h;
+ unsigned int thoff = 0;
+ unsigned short frag_off;
+ int protohdr;
+ u32 pkt_len;
+
+ ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h),
+ &_ip6h);
+ if (!ip6h)
+ return;
+
+ if (ip6h->version != 6)
+ return;
+
+ pkt_len = ntohs(ip6h->payload_len);
+ if (pkt_len + sizeof(*ip6h) > skb->len)
+ return;
+
+ protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
+ if (protohdr < 0)
+ return;
+
+ pkt->tprot = protohdr;
+ pkt->xt.thoff = thoff;
+ pkt->xt.fragoff = frag_off;
+#endif
+}
+
+static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ nft_set_pktinfo(pkt, skb, state);
+ __nft_netdev_set_pktinfo_ipv6(pkt, skb, state);
+}
+
+static unsigned int
+nft_do_chain_netdev(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nft_netdev_set_pktinfo_ipv4(&pkt, skb, state);
+ break;
+ case htons(ETH_P_IPV6):
+ nft_netdev_set_pktinfo_ipv6(&pkt, skb, state);
+ break;
+ default:
+ nft_set_pktinfo(&pkt, skb, state);
+ break;
+ }
+
+ return nft_do_chain(&pkt, priv);
+}
+
+static struct nft_af_info nft_af_netdev __read_mostly = {
+ .family = NFPROTO_NETDEV,
+ .nhooks = NF_NETDEV_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .flags = NFT_AF_NEEDS_DEV,
+ .nops = 1,
+ .hooks = {
+ [NF_NETDEV_INGRESS] = nft_do_chain_netdev,
+ },
+};
+
+static int nf_tables_netdev_init_net(struct net *net)
+{
+ net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.netdev == NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev));
+
+ if (nft_register_afinfo(net, net->nft.netdev) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.netdev);
+ return -ENOMEM;
+}
+
+static void nf_tables_netdev_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.netdev);
+ kfree(net->nft.netdev);
+}
+
+static struct pernet_operations nf_tables_netdev_net_ops = {
+ .init = nf_tables_netdev_init_net,
+ .exit = nf_tables_netdev_exit_net,
+};
+
+static const struct nf_chain_type nft_filter_chain_netdev = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_NETDEV,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_NETDEV_INGRESS),
+};
+
+static void nft_netdev_event(unsigned long event, struct nft_af_info *afi,
+ struct net_device *dev, struct nft_table *table,
+ struct nft_base_chain *basechain)
+{
+ switch (event) {
+ case NETDEV_REGISTER:
+ if (strcmp(basechain->dev_name, dev->name) != 0)
+ return;
+
+ BUG_ON(!(basechain->flags & NFT_BASECHAIN_DISABLED));
+
+ dev_hold(dev);
+ basechain->ops[0].dev = dev;
+ basechain->flags &= ~NFT_BASECHAIN_DISABLED;
+ if (!(table->flags & NFT_TABLE_F_DORMANT))
+ nft_register_basechain(basechain, afi->nops);
+ break;
+ case NETDEV_UNREGISTER:
+ if (strcmp(basechain->dev_name, dev->name) != 0)
+ return;
+
+ BUG_ON(basechain->flags & NFT_BASECHAIN_DISABLED);
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT))
+ nft_unregister_basechain(basechain, afi->nops);
+
+ dev_put(basechain->ops[0].dev);
+ basechain->ops[0].dev = NULL;
+ basechain->flags |= NFT_BASECHAIN_DISABLED;
+ break;
+ case NETDEV_CHANGENAME:
+ if (dev->ifindex != basechain->ops[0].dev->ifindex)
+ return;
+
+ strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
+ break;
+ }
+}
+
+static int nf_tables_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_chain *chain;
+
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) {
+ if (afi->family != NFPROTO_NETDEV)
+ continue;
+
+ list_for_each_entry(table, &afi->tables, list) {
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ nft_netdev_event(event, afi, dev, table,
+ nft_base_chain(chain));
+ }
+ }
+ }
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nf_tables_netdev_notifier = {
+ .notifier_call = nf_tables_netdev_event,
+};
+
+static int __init nf_tables_netdev_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&nft_filter_chain_netdev);
+ ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&nft_filter_chain_netdev);
+
+ register_netdevice_notifier(&nf_tables_netdev_notifier);
+
+ return ret;
+}
+
+static void __exit nf_tables_netdev_exit(void)
+{
+ unregister_netdevice_notifier(&nf_tables_netdev_notifier);
+ unregister_pernet_subsys(&nf_tables_netdev_net_ops);
+ nft_unregister_chain_type(&nft_filter_chain_netdev);
+}
+
+module_init(nf_tables_netdev_init);
+module_exit(nf_tables_netdev_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */
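
The new netdev family attaches base chains to a single device at the NF_NETDEV_INGRESS hook; the device name comes from the NFTA_HOOK_DEV attribute handled in nf_tables_api.c above, and the notifier keeps the hook registered across device unregister/re-register. A sketch of the hook ops a netdev base chain ends up with (field values taken from the hunks above):

	ops->pf       = NFPROTO_NETDEV;
	ops->hooknum  = NF_NETDEV_INGRESS;
	ops->priority = priority;          /* from NFTA_HOOK_PRIORITY */
	ops->priv     = chain;
	ops->hook     = nft_do_chain_netdev;
	ops->dev      = dev;               /* from NFTA_HOOK_DEV */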
diff --git a/kernel/net/netfilter/nfnetlink.c b/kernel/net/netfilter/nfnetlink.c
index 8b117c90e..77afe913d 100644
--- a/kernel/net/netfilter/nfnetlink.c
+++ b/kernel/net/netfilter/nfnetlink.c
@@ -64,7 +64,7 @@ void nfnl_unlock(__u8 subsys_id)
EXPORT_SYMBOL_GPL(nfnl_unlock);
#ifdef CONFIG_PROVE_LOCKING
-int lockdep_nfnl_is_held(u8 subsys_id)
+bool lockdep_nfnl_is_held(u8 subsys_id)
{
return lockdep_is_held(&table[subsys_id].mutex);
}
@@ -269,6 +269,12 @@ static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb)
}
}
+enum {
+ NFNL_BATCH_FAILURE = (1 << 0),
+ NFNL_BATCH_DONE = (1 << 1),
+ NFNL_BATCH_REPLAY = (1 << 2),
+};
+
static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
u_int16_t subsys_id)
{
@@ -276,19 +282,19 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
- bool success = true, done = false;
static LIST_HEAD(err_list);
+ u32 status;
int err;
if (subsys_id >= NFNL_SUBSYS_COUNT)
return netlink_ack(skb, nlh, -EINVAL);
replay:
+ status = 0;
+
skb = netlink_skb_clone(oskb, GFP_KERNEL);
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM);
- skb->sk = oskb->sk;
-
nfnl_lock(subsys_id);
ss = rcu_dereference_protected(table[subsys_id].subsys,
lockdep_is_held(&table[subsys_id].mutex));
@@ -336,10 +342,10 @@ replay:
if (type == NFNL_MSG_BATCH_BEGIN) {
/* Malformed: Batch begin twice */
nfnl_err_reset(&err_list);
- success = false;
+ status |= NFNL_BATCH_FAILURE;
goto done;
} else if (type == NFNL_MSG_BATCH_END) {
- done = true;
+ status |= NFNL_BATCH_DONE;
goto done;
} else if (type < NLMSG_MIN_TYPE) {
err = -EINVAL;
@@ -373,7 +379,7 @@ replay:
goto ack;
if (nc->call_batch) {
- err = nc->call_batch(net->nfnl, skb, nlh,
+ err = nc->call_batch(net, net->nfnl, skb, nlh,
(const struct nlattr **)cda);
}
@@ -382,11 +388,8 @@ replay:
* original skb.
*/
if (err == -EAGAIN) {
- nfnl_err_reset(&err_list);
- ss->abort(oskb);
- nfnl_unlock(subsys_id);
- kfree_skb(skb);
- goto replay;
+ status |= NFNL_BATCH_REPLAY;
+ goto next;
}
}
ack:
@@ -402,7 +405,7 @@ ack:
*/
nfnl_err_reset(&err_list);
netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM);
- success = false;
+ status |= NFNL_BATCH_FAILURE;
goto done;
}
/* We don't stop processing the batch on errors, thus,
@@ -410,19 +413,26 @@ ack:
* triggers.
*/
if (err)
- success = false;
+ status |= NFNL_BATCH_FAILURE;
}
-
+next:
msglen = NLMSG_ALIGN(nlh->nlmsg_len);
if (msglen > skb->len)
msglen = skb->len;
skb_pull(skb, msglen);
}
done:
- if (success && done)
+ if (status & NFNL_BATCH_REPLAY) {
+ ss->abort(oskb);
+ nfnl_err_reset(&err_list);
+ nfnl_unlock(subsys_id);
+ kfree_skb(skb);
+ goto replay;
+ } else if (status == NFNL_BATCH_DONE) {
ss->commit(oskb);
- else
+ } else {
ss->abort(oskb);
+ }
nfnl_err_deliver(&err_list, oskb);
nfnl_unlock(subsys_id);
@@ -432,6 +442,7 @@ done:
static void nfnetlink_rcv(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ u_int16_t res_id;
int msglen;
if (nlh->nlmsg_len < NLMSG_HDRLEN ||
@@ -456,7 +467,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
nfgenmsg = nlmsg_data(nlh);
skb_pull(skb, msglen);
- nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id);
+ /* Work around old nft using host byte order */
+ if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+ res_id = NFNL_SUBSYS_NFTABLES;
+ else
+ res_id = ntohs(nfgenmsg->res_id);
+ nfnetlink_rcv_batch(skb, nlh, res_id);
} else {
netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
}
@@ -474,7 +490,7 @@ static int nfnetlink_bind(struct net *net, int group)
type = nfnl_group2type[group];
rcu_read_lock();
- ss = nfnetlink_get_subsys(type);
+ ss = nfnetlink_get_subsys(type << 8);
rcu_read_unlock();
if (!ss)
request_module("nfnetlink-subsys-%d", type);
diff --git a/kernel/net/netfilter/nfnetlink_acct.c b/kernel/net/netfilter/nfnetlink_acct.c
index c18af2f63..fefbf5f0b 100644
--- a/kernel/net/netfilter/nfnetlink_acct.c
+++ b/kernel/net/netfilter/nfnetlink_acct.c
@@ -27,8 +27,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
-static LIST_HEAD(nfnl_acct_list);
-
struct nf_acct {
atomic64_t pkts;
atomic64_t bytes;
@@ -53,6 +51,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
const struct nlmsghdr *nlh, const struct nlattr * const tb[])
{
struct nf_acct *nfacct, *matching = NULL;
+ struct net *net = sock_net(nfnl);
char *acct_name;
unsigned int size = 0;
u32 flags = 0;
@@ -64,7 +63,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
if (strlen(acct_name) == 0)
return -EINVAL;
- list_for_each_entry(nfacct, &nfnl_acct_list, head) {
+ list_for_each_entry(nfacct, &net->nfnl_acct_list, head) {
if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -124,7 +123,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
}
atomic_set(&nfacct->refcnt, 1);
- list_add_tail_rcu(&nfacct->head, &nfnl_acct_list);
+ list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list);
return 0;
}
@@ -185,6 +184,7 @@ nla_put_failure:
static int
nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
struct nf_acct *cur, *last;
const struct nfacct_filter *filter = cb->data;
@@ -196,7 +196,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
if (last) {
if (cur != last)
continue;
@@ -257,6 +257,7 @@ static int
nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
const struct nlmsghdr *nlh, const struct nlattr * const tb[])
{
+ struct net *net = sock_net(nfnl);
int ret = -ENOENT;
struct nf_acct *cur;
char *acct_name;
@@ -283,7 +284,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
return -EINVAL;
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &nfnl_acct_list, head) {
+ list_for_each_entry(cur, &net->nfnl_acct_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
@@ -336,19 +337,20 @@ static int
nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
const struct nlmsghdr *nlh, const struct nlattr * const tb[])
{
+ struct net *net = sock_net(nfnl);
char *acct_name;
struct nf_acct *cur;
int ret = -ENOENT;
if (!tb[NFACCT_NAME]) {
- list_for_each_entry(cur, &nfnl_acct_list, head)
+ list_for_each_entry(cur, &net->nfnl_acct_list, head)
nfnl_acct_try_del(cur);
return 0;
}
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &nfnl_acct_list, head) {
+ list_for_each_entry(cur, &net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -394,12 +396,12 @@ static const struct nfnetlink_subsystem nfnl_acct_subsys = {
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
-struct nf_acct *nfnl_acct_find_get(const char *acct_name)
+struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name)
{
struct nf_acct *cur, *acct = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
continue;
@@ -422,7 +424,9 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get);
void nfnl_acct_put(struct nf_acct *acct)
{
- atomic_dec(&acct->refcnt);
+ if (atomic_dec_and_test(&acct->refcnt))
+ kfree_rcu(acct, rcu_head);
+
module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(nfnl_acct_put);
@@ -478,34 +482,59 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
}
EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
+static int __net_init nfnl_acct_net_init(struct net *net)
+{
+ INIT_LIST_HEAD(&net->nfnl_acct_list);
+
+ return 0;
+}
+
+static void __net_exit nfnl_acct_net_exit(struct net *net)
+{
+ struct nf_acct *cur, *tmp;
+
+ list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) {
+ list_del_rcu(&cur->head);
+
+ if (atomic_dec_and_test(&cur->refcnt))
+ kfree_rcu(cur, rcu_head);
+ }
+}
+
+static struct pernet_operations nfnl_acct_ops = {
+ .init = nfnl_acct_net_init,
+ .exit = nfnl_acct_net_exit,
+};
+
static int __init nfnl_acct_init(void)
{
int ret;
+ ret = register_pernet_subsys(&nfnl_acct_ops);
+ if (ret < 0) {
+ pr_err("nfnl_acct_init: failed to register pernet ops\n");
+ goto err_out;
+ }
+
pr_info("nfnl_acct: registering with nfnetlink.\n");
ret = nfnetlink_subsys_register(&nfnl_acct_subsys);
if (ret < 0) {
pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
- goto err_out;
+ goto cleanup_pernet;
}
return 0;
+
+cleanup_pernet:
+ unregister_pernet_subsys(&nfnl_acct_ops);
err_out:
return ret;
}
static void __exit nfnl_acct_exit(void)
{
- struct nf_acct *cur, *tmp;
-
pr_info("nfnl_acct: unregistering from nfnetlink.\n");
nfnetlink_subsys_unregister(&nfnl_acct_subsys);
-
- list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) {
- list_del_rcu(&cur->head);
- /* We are sure that our objects have no clients at this point,
- * it's safe to release them all without checking refcnt. */
- kfree_rcu(cur, rcu_head);
- }
+ unregister_pernet_subsys(&nfnl_acct_ops);
}
module_init(nfnl_acct_init);
diff --git a/kernel/net/netfilter/nfnetlink_cttimeout.c b/kernel/net/netfilter/nfnetlink_cttimeout.c
index 476accd17..c7a2d0e1c 100644
--- a/kernel/net/netfilter/nfnetlink_cttimeout.c
+++ b/kernel/net/netfilter/nfnetlink_cttimeout.c
@@ -291,6 +291,34 @@ cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb,
return ret;
}
+static void untimeout(struct nf_conntrack_tuple_hash *i,
+ struct ctnl_timeout *timeout)
+{
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
+ struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
+
+ if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+}
+
+static void ctnl_untimeout(struct ctnl_timeout *timeout)
+{
+ struct nf_conntrack_tuple_hash *h;
+ const struct hlist_nulls_node *nn;
+ int i;
+
+ local_bh_disable();
+ for (i = 0; i < init_net.ct.htable_size; i++) {
+ spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ if (i < init_net.ct.htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &init_net.ct.hash[i], hnnode)
+ untimeout(h, timeout);
+ }
+ spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ }
+ local_bh_enable();
+}
+
/* try to delete object, fail if it is still in use. */
static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
{
@@ -301,6 +329,7 @@ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
/* We are protected by nfnl mutex. */
list_del_rcu(&timeout->head);
nf_ct_l4proto_put(timeout->l4proto);
+ ctnl_untimeout(timeout);
kfree_rcu(timeout, rcu_head);
} else {
/* still in use, restore reference counter. */
@@ -567,6 +596,10 @@ static void __exit cttimeout_exit(void)
pr_info("cttimeout: unregistering from nfnetlink.\n");
nfnetlink_subsys_unregister(&cttimeout_subsys);
+
+ /* Make sure no conntrack objects refer to custom timeouts anymore. */
+ ctnl_untimeout(NULL);
+
list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) {
list_del_rcu(&cur->head);
/* We are sure that our objects have no clients at this point,
@@ -579,6 +612,7 @@ static void __exit cttimeout_exit(void)
RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+ rcu_barrier();
}
module_init(cttimeout_init);
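
The new ctnl_untimeout() above walks every bucket of the conntrack hash table and clears stale timeout references while holding nf_conntrack_locks[i % CONNTRACK_LOCKS], a lock-striping scheme in which a small, fixed array of spinlocks protects an arbitrarily large hash table. A minimal userspace sketch of just the bucket-to-lock mapping follows; the table and lock counts are illustrative, not the kernel's actual values.

#include <stdio.h>

#define HTABLE_SIZE	16384	/* stand-in for the runtime-sized conntrack hash */
#define NLOCKS		1024	/* stand-in for CONNTRACK_LOCKS */

int main(void)
{
	unsigned int per_lock[NLOCKS] = { 0 };

	/* Bucket i is protected by lock (i % NLOCKS), so a full walk only
	 * ever needs NLOCKS locks no matter how large the table grows;
	 * each lock covers every NLOCKS-th bucket. */
	for (unsigned int i = 0; i < HTABLE_SIZE; i++)
		per_lock[i % NLOCKS]++;

	printf("lock 0 covers %u buckets; lock %u covers %u buckets\n",
	       per_lock[0], NLOCKS - 1, per_lock[NLOCKS - 1]);
	return 0;
}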
diff --git a/kernel/net/netfilter/nfnetlink_log.c b/kernel/net/netfilter/nfnetlink_log.c
index 4ef1fae84..740cce468 100644
--- a/kernel/net/netfilter/nfnetlink_log.c
+++ b/kernel/net/netfilter/nfnetlink_log.c
@@ -27,6 +27,7 @@
#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_log.h>
+#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/spinlock.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
@@ -401,7 +402,9 @@ __build_packet_message(struct nfnl_log_net *log,
unsigned int hooknum,
const struct net_device *indev,
const struct net_device *outdev,
- const char *prefix, unsigned int plen)
+ const char *prefix, unsigned int plen,
+ const struct nfnl_ct_hook *nfnl_ct,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
{
struct nfulnl_msg_packet_hdr pmsg;
struct nlmsghdr *nlh;
@@ -538,9 +541,9 @@ __build_packet_message(struct nfnl_log_net *log,
if (skb->tstamp.tv64) {
struct nfulnl_msg_packet_timestamp ts;
- struct timeval tv = ktime_to_timeval(skb->tstamp);
- ts.sec = cpu_to_be64(tv.tv_sec);
- ts.usec = cpu_to_be64(tv.tv_usec);
+ struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
+ ts.sec = cpu_to_be64(kts.tv_sec);
+ ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts))
goto nla_put_failure;
@@ -575,6 +578,10 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(atomic_inc_return(&log->global_seq))))
goto nla_put_failure;
+ if (ct && nfnl_ct->build(inst->skb, ct, ctinfo,
+ NFULA_CT, NFULA_CT_INFO) < 0)
+ goto nla_put_failure;
+
if (data_len) {
struct nlattr *nla;
int size = nla_attr_size(data_len);
@@ -598,8 +605,6 @@ nla_put_failure:
return -1;
}
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
static struct nf_loginfo default_loginfo = {
.type = NF_LOG_TYPE_ULOG,
.u = {
@@ -622,12 +627,16 @@ nfulnl_log_packet(struct net *net,
const struct nf_loginfo *li_user,
const char *prefix)
{
- unsigned int size, data_len;
+ size_t size;
+ unsigned int data_len;
struct nfulnl_instance *inst;
const struct nf_loginfo *li;
unsigned int qthreshold;
unsigned int plen;
struct nfnl_log_net *log = nfnl_log_pernet(net);
+ const struct nfnl_ct_hook *nfnl_ct = NULL;
+ struct nf_conn *ct = NULL;
+ enum ip_conntrack_info uninitialized_var(ctinfo);
if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
@@ -673,6 +682,14 @@ nfulnl_log_packet(struct net *net,
size += nla_total_size(sizeof(u_int32_t));
if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
size += nla_total_size(sizeof(u_int32_t));
+ if (inst->flags & NFULNL_CFG_F_CONNTRACK) {
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+ if (nfnl_ct != NULL) {
+ ct = nfnl_ct->get_ct(skb, &ctinfo);
+ if (ct != NULL)
+ size += nfnl_ct->build_size(ct);
+ }
+ }
qthreshold = inst->qthreshold;
/* per-rule qthreshold overrides per-instance */
@@ -717,7 +734,8 @@ nfulnl_log_packet(struct net *net,
inst->qlen++;
__build_packet_message(log, inst, skb, data_len, pf,
- hooknum, in, out, prefix, plen);
+ hooknum, in, out, prefix, plen,
+ nfnl_ct, ct, ctinfo);
if (inst->qlen >= qthreshold)
__nfulnl_flush(inst);
@@ -807,6 +825,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
struct net *net = sock_net(ctnl);
struct nfnl_log_net *log = nfnl_log_pernet(net);
int ret = 0;
+ u16 flags = 0;
if (nfula[NFULA_CFG_CMD]) {
u_int8_t pf = nfmsg->nfgen_family;
@@ -828,6 +847,28 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out_put;
}
+ /* Check whether we support these flags in the first place; their
+ * dependencies must also be present so that we do not break atomicity.
+ */
+ if (nfula[NFULA_CFG_FLAGS]) {
+ flags = ntohs(nla_get_be16(nfula[NFULA_CFG_FLAGS]));
+
+ if ((flags & NFULNL_CFG_F_CONNTRACK) &&
+ !rcu_access_pointer(nfnl_ct_hook)) {
+#ifdef CONFIG_MODULES
+ nfnl_unlock(NFNL_SUBSYS_ULOG);
+ request_module("ip_conntrack_netlink");
+ nfnl_lock(NFNL_SUBSYS_ULOG);
+ if (rcu_access_pointer(nfnl_ct_hook)) {
+ ret = -EAGAIN;
+ goto out_put;
+ }
+#endif
+ ret = -EOPNOTSUPP;
+ goto out_put;
+ }
+ }
+
if (cmd != NULL) {
switch (cmd->command) {
case NFULNL_CFG_CMD_BIND:
@@ -856,16 +897,15 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
ret = -ENOTSUPP;
break;
}
+ } else if (!inst) {
+ ret = -ENODEV;
+ goto out;
}
if (nfula[NFULA_CFG_MODE]) {
- struct nfulnl_msg_config_mode *params;
- params = nla_data(nfula[NFULA_CFG_MODE]);
+ struct nfulnl_msg_config_mode *params =
+ nla_data(nfula[NFULA_CFG_MODE]);
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
nfulnl_set_mode(inst, params->copy_mode,
ntohl(params->copy_range));
}
@@ -873,42 +913,23 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
if (nfula[NFULA_CFG_TIMEOUT]) {
__be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]);
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
nfulnl_set_timeout(inst, ntohl(timeout));
}
if (nfula[NFULA_CFG_NLBUFSIZ]) {
__be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]);
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
}
if (nfula[NFULA_CFG_QTHRESH]) {
__be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]);
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
nfulnl_set_qthresh(inst, ntohl(qthresh));
}
- if (nfula[NFULA_CFG_FLAGS]) {
- __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]);
-
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
- nfulnl_set_flags(inst, ntohs(flags));
- }
+ if (nfula[NFULA_CFG_FLAGS])
+ nfulnl_set_flags(inst, flags);
out_put:
instance_put(inst);
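
The timestamp hunks above (here and in the queue path below) swap ktime_to_timeval() for ktime_to_timespec64() so the conversion stays y2038-safe, while the netlink attribute continues to carry a sec/usec pair; the microsecond field is simply the nanosecond field divided down. A tiny standalone illustration of that arithmetic, with made-up values:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000

int main(void)
{
	/* A timespec64-like value: 64-bit seconds plus nanoseconds. */
	int64_t tv_sec  = 1460000000;	/* made-up timestamp */
	long    tv_nsec = 123456789;

	/* The attribute still carries sec/usec, so only the sub-second
	 * part needs converting. */
	uint64_t sec  = (uint64_t)tv_sec;
	uint64_t usec = (uint64_t)(tv_nsec / NSEC_PER_USEC);

	printf("sec=%llu usec=%llu\n",
	       (unsigned long long)sec, (unsigned long long)usec);
	return 0;
}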
diff --git a/kernel/net/netfilter/nfnetlink_queue_core.c b/kernel/net/netfilter/nfnetlink_queue.c
index 11c7682fa..861c66152 100644
--- a/kernel/net/netfilter/nfnetlink_queue_core.c
+++ b/kernel/net/netfilter/nfnetlink_queue.c
@@ -28,12 +28,12 @@
#include <linux/netfilter_bridge.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_queue.h>
+#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/netfilter/nf_queue.h>
#include <net/netns/generic.h>
-#include <net/netfilter/nfnetlink_queue.h>
#include <linux/atomic.h>
@@ -278,13 +278,30 @@ nla_put_failure:
return -1;
}
+static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
+{
+ u32 seclen = 0;
+#if IS_ENABLED(CONFIG_NETWORK_SECMARK)
+ if (!skb || !sk_fullsock(skb->sk))
+ return 0;
+
+ read_lock_bh(&skb->sk->sk_callback_lock);
+
+ if (skb->secmark)
+ security_secid_to_secctx(skb->secmark, secdata, &seclen);
+
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+#endif
+ return seclen;
+}
+
static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nf_queue_entry *entry,
__be32 **packet_id_ptr)
{
size_t size;
- size_t data_len = 0, cap_len = 0;
+ size_t data_len = 0, cap_len = 0, rem_len = 0;
unsigned int hlen = 0;
struct sk_buff *skb;
struct nlattr *nla;
@@ -296,7 +313,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct net_device *outdev;
struct nf_conn *ct = NULL;
enum ip_conntrack_info uninitialized_var(ctinfo);
+ struct nfnl_ct_hook *nfnl_ct;
bool csum_verify;
+ char *secdata = NULL;
+ u32 seclen = 0;
size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
@@ -341,18 +361,32 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
hlen = min_t(unsigned int, hlen, data_len);
size += sizeof(struct nlattr) + hlen;
cap_len = entskb->len;
+ rem_len = data_len - hlen;
break;
}
- if (queue->flags & NFQA_CFG_F_CONNTRACK)
- ct = nfqnl_ct_get(entskb, &size, &ctinfo);
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+
+ if (queue->flags & NFQA_CFG_F_CONNTRACK) {
+ if (nfnl_ct != NULL) {
+ ct = nfnl_ct->get_ct(entskb, &ctinfo);
+ if (ct != NULL)
+ size += nfnl_ct->build_size(ct);
+ }
+ }
if (queue->flags & NFQA_CFG_F_UID_GID) {
size += (nla_total_size(sizeof(u_int32_t)) /* uid */
+ nla_total_size(sizeof(u_int32_t))); /* gid */
}
- skb = nfnetlink_alloc_skb(net, size, queue->peer_portid,
+ if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) {
+ seclen = nfqnl_get_sk_secctx(entskb, &secdata);
+ if (seclen)
+ size += nla_total_size(seclen);
+ }
+
+ skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid,
GFP_ATOMIC);
if (!skb) {
skb_tx_error(entskb);
@@ -467,9 +501,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (entskb->tstamp.tv64) {
struct nfqnl_msg_packet_timestamp ts;
- struct timeval tv = ktime_to_timeval(entskb->tstamp);
- ts.sec = cpu_to_be64(tv.tv_sec);
- ts.usec = cpu_to_be64(tv.tv_usec);
+ struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
+
+ ts.sec = cpu_to_be64(kts.tv_sec);
+ ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts))
goto nla_put_failure;
@@ -479,7 +514,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
goto nla_put_failure;
- if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
+ if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata))
+ goto nla_put_failure;
+
+ if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
goto nla_put_failure;
if (cap_len > data_len &&
@@ -569,12 +607,9 @@ static struct nf_queue_entry *
nf_queue_entry_dup(struct nf_queue_entry *e)
{
struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
- if (entry) {
- if (nf_queue_entry_get_refs(entry))
- return entry;
- kfree(entry);
- }
- return NULL;
+ if (entry)
+ nf_queue_entry_get_refs(entry);
+ return entry;
}
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
@@ -641,8 +676,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
struct nfqnl_instance *queue;
struct sk_buff *skb, *segs;
int err = -ENOBUFS;
- struct net *net = dev_net(entry->state.in ?
- entry->state.in : entry->state.out);
+ struct net *net = entry->state.net;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
/* rcu_read_lock()ed by nf_hook_slow() */
@@ -670,7 +704,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
nf_bridge_adjust_skb_data(skb);
segs = skb_gso_segment(skb, 0);
/* Does not use PTR_ERR to limit the number of error codes that can be
- * returned by nf_queue. For instance, callers rely on -ECANCELED to
+ * returned by nf_queue. For instance, callers rely on -ESRCH to
* mean 'ignore this hook'.
*/
if (IS_ERR_OR_NULL(segs))
@@ -806,8 +840,6 @@ nfqnl_dev_drop(struct net *net, int ifindex)
rcu_read_unlock();
}
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
static int
nfqnl_rcv_dev_event(struct notifier_block *this,
unsigned long event, void *ptr)
@@ -824,6 +856,27 @@ static struct notifier_block nfqnl_dev_notifier = {
.notifier_call = nfqnl_rcv_dev_event,
};
+static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr)
+{
+ return entry->elem == (struct nf_hook_ops *)ops_ptr;
+}
+
+static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook)
+{
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct nfqnl_instance *inst;
+ struct hlist_head *head = &q->instance_table[i];
+
+ hlist_for_each_entry_rcu(inst, head, hlist)
+ nfqnl_flush(inst, nf_hook_cmp, (unsigned long)hook);
+ }
+ rcu_read_unlock();
+}
+
static int
nfqnl_rcv_nl_event(struct notifier_block *this,
unsigned long event, void *ptr)
@@ -954,6 +1007,28 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
return 0;
}
+static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[],
+ struct nf_queue_entry *entry,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nf_conn *ct;
+
+ ct = nfnl_ct->get_ct(entry->skb, ctinfo);
+ if (ct == NULL)
+ return NULL;
+
+ if (nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0)
+ return NULL;
+
+ if (nfqa[NFQA_EXP])
+ nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct,
+ NETLINK_CB(entry->skb).portid,
+ nlmsg_report(nlh));
+ return ct;
+}
+
static int
nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
@@ -967,6 +1042,7 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
unsigned int verdict;
struct nf_queue_entry *entry;
enum ip_conntrack_info uninitialized_var(ctinfo);
+ struct nfnl_ct_hook *nfnl_ct;
struct nf_conn *ct = NULL;
struct net *net = sock_net(ctnl);
@@ -989,13 +1065,12 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
if (entry == NULL)
return -ENOENT;
+ /* rcu lock already held from nfnl->call_rcu. */
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+
if (nfqa[NFQA_CT]) {
- ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo);
- if (ct && nfqa[NFQA_EXP]) {
- nfqnl_attach_expect(ct, nfqa[NFQA_EXP],
- NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
- }
+ if (nfnl_ct != NULL)
+ ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
}
if (nfqa[NFQA_PAYLOAD]) {
@@ -1006,8 +1081,8 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
payload_len, entry, diff) < 0)
verdict = NF_DROP;
- if (ct)
- nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff);
+ if (ct && diff)
+ nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff);
}
if (nfqa[NFQA_MARK])
@@ -1031,7 +1106,8 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
};
static const struct nf_queue_handler nfqh = {
- .outfn = &nfqnl_enqueue_packet,
+ .outfn = &nfqnl_enqueue_packet,
+ .nf_hook_drop = &nfqnl_nf_hook_drop,
};
static int
@@ -1142,7 +1218,12 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
ret = -EOPNOTSUPP;
goto err_out_unlock;
}
-
+#if !IS_ENABLED(CONFIG_NETWORK_SECMARK)
+ if (flags & mask & NFQA_CFG_F_SECCTX) {
+ ret = -EOPNOTSUPP;
+ goto err_out_unlock;
+ }
+#endif
spin_lock_bh(&queue->lock);
queue->flags &= ~mask;
queue->flags |= flags & mask;
@@ -1257,7 +1338,7 @@ static int seq_show(struct seq_file *s, void *v)
inst->copy_mode, inst->copy_range,
inst->queue_dropped, inst->queue_user_dropped,
inst->id_sequence, 1);
- return seq_has_overflowed(s);
+ return 0;
}
static const struct seq_operations nfqnl_seq_ops = {
@@ -1338,6 +1419,7 @@ static int __init nfnetlink_queue_init(void)
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_queue_net_ops);
out:
return status;
}
diff --git a/kernel/net/netfilter/nfnetlink_queue_ct.c b/kernel/net/netfilter/nfnetlink_queue_ct.c
deleted file mode 100644
index 96cac50e0..000000000
--- a/kernel/net/netfilter/nfnetlink_queue_ct.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/skbuff.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_queue.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nfnetlink_queue.h>
-
-struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size,
- enum ip_conntrack_info *ctinfo)
-{
- struct nfq_ct_hook *nfq_ct;
- struct nf_conn *ct;
-
- /* rcu_read_lock()ed by __nf_queue already. */
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return NULL;
-
- ct = nf_ct_get(entskb, ctinfo);
- if (ct) {
- if (!nf_ct_is_untracked(ct))
- *size += nfq_ct->build_size(ct);
- else
- ct = NULL;
- }
- return ct;
-}
-
-struct nf_conn *
-nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr,
- enum ip_conntrack_info *ctinfo)
-{
- struct nfq_ct_hook *nfq_ct;
- struct nf_conn *ct;
-
- /* rcu_read_lock()ed by __nf_queue already. */
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return NULL;
-
- ct = nf_ct_get(skb, ctinfo);
- if (ct && !nf_ct_is_untracked(ct))
- nfq_ct->parse(attr, ct);
-
- return ct;
-}
-
-int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- struct nfq_ct_hook *nfq_ct;
- struct nlattr *nest_parms;
- u_int32_t tmp;
-
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return 0;
-
- nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED);
- if (!nest_parms)
- goto nla_put_failure;
-
- if (nfq_ct->build(skb, ct) < 0)
- goto nla_put_failure;
-
- nla_nest_end(skb, nest_parms);
-
- tmp = ctinfo;
- if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp)))
- goto nla_put_failure;
-
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
-void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
- enum ip_conntrack_info ctinfo, int diff)
-{
- struct nfq_ct_hook *nfq_ct;
-
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return;
-
- if ((ct->status & IPS_NAT_MASK) && diff)
- nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
-}
-
-int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
- u32 portid, u32 report)
-{
- struct nfq_ct_hook *nfq_ct;
-
- if (nf_ct_is_untracked(ct))
- return 0;
-
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return -EOPNOTSUPP;
-
- return nfq_ct->attach_expect(attr, ct, portid, report);
-}
diff --git a/kernel/net/netfilter/nft_compat.c b/kernel/net/netfilter/nft_compat.c
index 7f29cfc76..9c8fab001 100644
--- a/kernel/net/netfilter/nft_compat.c
+++ b/kernel/net/netfilter/nft_compat.c
@@ -161,6 +161,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par,
par->hook_mask = 0;
}
par->family = ctx->afi->family;
+ par->nft_compat = true;
}
static void target_compat_from_user(struct xt_target *t, void *in, void *out)
@@ -377,6 +378,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
par->hook_mask = 0;
}
par->family = ctx->afi->family;
+ par->nft_compat = true;
}
static void match_compat_from_user(struct xt_match *m, void *in, void *out)
@@ -617,6 +619,13 @@ struct nft_xt {
static struct nft_expr_type nft_match_type;
+static bool nft_match_cmp(const struct xt_match *match,
+ const char *name, u32 rev, u32 family)
+{
+ return strcmp(match->name, name) == 0 && match->revision == rev &&
+ (match->family == NFPROTO_UNSPEC || match->family == family);
+}
+
static const struct nft_expr_ops *
nft_match_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -624,7 +633,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
struct nft_xt *nft_match;
struct xt_match *match;
char *mt_name;
- __u32 rev, family;
+ u32 rev, family;
if (tb[NFTA_MATCH_NAME] == NULL ||
tb[NFTA_MATCH_REV] == NULL ||
@@ -639,8 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
list_for_each_entry(nft_match, &nft_match_list, head) {
struct xt_match *match = nft_match->ops.data;
- if (strcmp(match->name, mt_name) == 0 &&
- match->revision == rev && match->family == family) {
+ if (nft_match_cmp(match, mt_name, rev, family)) {
if (!try_module_get(match->me))
return ERR_PTR(-ENOENT);
@@ -691,6 +699,13 @@ static LIST_HEAD(nft_target_list);
static struct nft_expr_type nft_target_type;
+static bool nft_target_cmp(const struct xt_target *tg,
+ const char *name, u32 rev, u32 family)
+{
+ return strcmp(tg->name, name) == 0 && tg->revision == rev &&
+ (tg->family == NFPROTO_UNSPEC || tg->family == family);
+}
+
static const struct nft_expr_ops *
nft_target_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -698,7 +713,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
struct nft_xt *nft_target;
struct xt_target *target;
char *tg_name;
- __u32 rev, family;
+ u32 rev, family;
if (tb[NFTA_TARGET_NAME] == NULL ||
tb[NFTA_TARGET_REV] == NULL ||
@@ -713,8 +728,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
list_for_each_entry(nft_target, &nft_target_list, head) {
struct xt_target *target = nft_target->ops.data;
- if (strcmp(target->name, tg_name) == 0 &&
- target->revision == rev && target->family == family) {
+ if (nft_target_cmp(target, tg_name, rev, family)) {
if (!try_module_get(target->me))
return ERR_PTR(-ENOENT);
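
nft_match_cmp()/nft_target_cmp() above relax the family comparison so that extensions registered for NFPROTO_UNSPEC (any family) can also satisfy a lookup made for a specific family. A trivial standalone sketch of the wildcard comparison; the struct is a stand-in, only the protocol constants mirror the kernel's values.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define NFPROTO_UNSPEC	0	/* "any family" */
#define NFPROTO_IPV4	2
#define NFPROTO_IPV6	10

struct xt_match_like {
	const char *name;
	uint32_t revision;
	uint16_t family;
};

/* A match registered for NFPROTO_UNSPEC now also satisfies a lookup
 * for a specific family, instead of requiring an exact match. */
static int match_cmp(const struct xt_match_like *m,
		     const char *name, uint32_t rev, uint16_t family)
{
	return strcmp(m->name, name) == 0 && m->revision == rev &&
	       (m->family == NFPROTO_UNSPEC || m->family == family);
}

int main(void)
{
	struct xt_match_like limit = { "limit", 0, NFPROTO_UNSPEC };

	printf("ipv4 lookup: %d\n", match_cmp(&limit, "limit", 0, NFPROTO_IPV4));
	printf("ipv6 lookup: %d\n", match_cmp(&limit, "limit", 0, NFPROTO_IPV6));
	return 0;
}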
diff --git a/kernel/net/netfilter/nft_counter.c b/kernel/net/netfilter/nft_counter.c
index 175912392..c7808fc19 100644
--- a/kernel/net/netfilter/nft_counter.c
+++ b/kernel/net/netfilter/nft_counter.c
@@ -18,39 +18,66 @@
#include <net/netfilter/nf_tables.h>
struct nft_counter {
- seqlock_t lock;
u64 bytes;
u64 packets;
};
+struct nft_counter_percpu {
+ struct nft_counter counter;
+ struct u64_stats_sync syncp;
+};
+
+struct nft_counter_percpu_priv {
+ struct nft_counter_percpu __percpu *counter;
+};
+
static void nft_counter_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_counter *priv = nft_expr_priv(expr);
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct nft_counter_percpu *this_cpu;
+
+ local_bh_disable();
+ this_cpu = this_cpu_ptr(priv->counter);
+ u64_stats_update_begin(&this_cpu->syncp);
+ this_cpu->counter.bytes += pkt->skb->len;
+ this_cpu->counter.packets++;
+ u64_stats_update_end(&this_cpu->syncp);
+ local_bh_enable();
+}
- write_seqlock_bh(&priv->lock);
- priv->bytes += pkt->skb->len;
- priv->packets++;
- write_sequnlock_bh(&priv->lock);
+static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter,
+ struct nft_counter *total)
+{
+ const struct nft_counter_percpu *cpu_stats;
+ u64 bytes, packets;
+ unsigned int seq;
+ int cpu;
+
+ memset(total, 0, sizeof(*total));
+ for_each_possible_cpu(cpu) {
+ cpu_stats = per_cpu_ptr(counter, cpu);
+ do {
+ seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ bytes = cpu_stats->counter.bytes;
+ packets = cpu_stats->counter.packets;
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+
+ total->packets += packets;
+ total->bytes += bytes;
+ }
}
static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
- struct nft_counter *priv = nft_expr_priv(expr);
- unsigned int seq;
- u64 bytes;
- u64 packets;
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct nft_counter total;
- do {
- seq = read_seqbegin(&priv->lock);
- bytes = priv->bytes;
- packets = priv->packets;
- } while (read_seqretry(&priv->lock, seq));
+ nft_counter_fetch(priv->counter, &total);
- if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes)))
- goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets)))
+ if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) ||
+ nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets)))
goto nla_put_failure;
return 0;
@@ -67,24 +94,71 @@ static int nft_counter_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_counter *priv = nft_expr_priv(expr);
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct nft_counter_percpu __percpu *cpu_stats;
+ struct nft_counter_percpu *this_cpu;
+
+ cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
+ if (cpu_stats == NULL)
+ return -ENOMEM;
+
+ preempt_disable();
+ this_cpu = this_cpu_ptr(cpu_stats);
+ if (tb[NFTA_COUNTER_PACKETS]) {
+ this_cpu->counter.packets =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ }
+ if (tb[NFTA_COUNTER_BYTES]) {
+ this_cpu->counter.bytes =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ }
+ preempt_enable();
+ priv->counter = cpu_stats;
+ return 0;
+}
- if (tb[NFTA_COUNTER_PACKETS])
- priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
- if (tb[NFTA_COUNTER_BYTES])
- priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+static void nft_counter_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- seqlock_init(&priv->lock);
+ free_percpu(priv->counter);
+}
+
+static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
+ struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
+ struct nft_counter_percpu __percpu *cpu_stats;
+ struct nft_counter_percpu *this_cpu;
+ struct nft_counter total;
+
+ nft_counter_fetch(priv->counter, &total);
+
+ cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu,
+ GFP_ATOMIC);
+ if (cpu_stats == NULL)
+ return -ENOMEM;
+
+ preempt_disable();
+ this_cpu = this_cpu_ptr(cpu_stats);
+ this_cpu->counter.packets = total.packets;
+ this_cpu->counter.bytes = total.bytes;
+ preempt_enable();
+
+ priv_clone->counter = cpu_stats;
return 0;
}
static struct nft_expr_type nft_counter_type;
static const struct nft_expr_ops nft_counter_ops = {
.type = &nft_counter_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_counter)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)),
.eval = nft_counter_eval,
.init = nft_counter_init,
+ .destroy = nft_counter_destroy,
.dump = nft_counter_dump,
+ .clone = nft_counter_clone,
};
static struct nft_expr_type nft_counter_type __read_mostly = {
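
nft_counter above replaces the single seqlock-protected counter with one counter per CPU guarded by u64_stats_sync, so the packet path only ever writes its local slot; the dump and clone paths sum a snapshot over all CPUs. A small userspace model of the fetch-and-sum step (a plain array stands in for the per-CPU allocation, and the u64_stats retry loop is only described in a comment):

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 4

struct counter {
	uint64_t packets;
	uint64_t bytes;
};

/* One counter per CPU; the hot path only ever touches its own slot,
 * so there is no shared lock and no cross-CPU cacheline bouncing. */
static struct counter percpu[NR_CPUS] = {
	{ 10, 1500 }, { 3, 180 }, { 0, 0 }, { 7, 4200 },
};

static void fetch_total(struct counter *total)
{
	total->packets = 0;
	total->bytes = 0;
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		/* In the kernel each read is wrapped in
		 * u64_stats_fetch_begin_irq()/retry_irq() so a 32-bit
		 * reader never sees a torn 64-bit value. */
		total->packets += percpu[cpu].packets;
		total->bytes += percpu[cpu].bytes;
	}
}

int main(void)
{
	struct counter total;

	fetch_total(&total);
	printf("packets=%llu bytes=%llu\n",
	       (unsigned long long)total.packets,
	       (unsigned long long)total.bytes);
	return 0;
}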
diff --git a/kernel/net/netfilter/nft_ct.c b/kernel/net/netfilter/nft_ct.c
index 8cbca3432..939921532 100644
--- a/kernel/net/netfilter/nft_ct.c
+++ b/kernel/net/netfilter/nft_ct.c
@@ -366,6 +366,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
switch (priv->key) {
+ case NFT_CT_L3PROTOCOL:
case NFT_CT_PROTOCOL:
case NFT_CT_SRC:
case NFT_CT_DST:
diff --git a/kernel/net/netfilter/nft_dynset.c b/kernel/net/netfilter/nft_dynset.c
index 513a8ef60..9dec3bd1b 100644
--- a/kernel/net/netfilter/nft_dynset.c
+++ b/kernel/net/netfilter/nft_dynset.c
@@ -50,8 +50,9 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
}
ext = nft_set_elem_ext(set, elem);
- if (priv->expr != NULL)
- nft_expr_clone(nft_set_ext_expr(ext), priv->expr);
+ if (priv->expr != NULL &&
+ nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0)
+ return NULL;
return elem;
}
diff --git a/kernel/net/netfilter/nft_limit.c b/kernel/net/netfilter/nft_limit.c
index 435c1ccd6..5d67938f8 100644
--- a/kernel/net/netfilter/nft_limit.c
+++ b/kernel/net/netfilter/nft_limit.c
@@ -20,63 +20,79 @@
static DEFINE_SPINLOCK(limit_lock);
struct nft_limit {
+ u64 last;
u64 tokens;
+ u64 tokens_max;
u64 rate;
- u64 unit;
- unsigned long stamp;
+ u64 nsecs;
+ u32 burst;
};
-static void nft_limit_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost)
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ u64 now, tokens;
+ s64 delta;
spin_lock_bh(&limit_lock);
- if (time_after_eq(jiffies, priv->stamp)) {
- priv->tokens = priv->rate;
- priv->stamp = jiffies + priv->unit * HZ;
- }
-
- if (priv->tokens >= 1) {
- priv->tokens--;
+ now = ktime_get_ns();
+ tokens = limit->tokens + now - limit->last;
+ if (tokens > limit->tokens_max)
+ tokens = limit->tokens_max;
+
+ limit->last = now;
+ delta = tokens - cost;
+ if (delta >= 0) {
+ limit->tokens = delta;
spin_unlock_bh(&limit_lock);
- return;
+ return false;
}
+ limit->tokens = tokens;
spin_unlock_bh(&limit_lock);
-
- regs->verdict.code = NFT_BREAK;
+ return true;
}
-static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = {
- [NFTA_LIMIT_RATE] = { .type = NLA_U64 },
- [NFTA_LIMIT_UNIT] = { .type = NLA_U64 },
-};
-
-static int nft_limit_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
+static int nft_limit_init(struct nft_limit *limit,
const struct nlattr * const tb[])
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ u64 unit;
if (tb[NFTA_LIMIT_RATE] == NULL ||
tb[NFTA_LIMIT_UNIT] == NULL)
return -EINVAL;
- priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
- priv->unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
- priv->stamp = jiffies + priv->unit * HZ;
- priv->tokens = priv->rate;
+ limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
+ limit->nsecs = unit * NSEC_PER_SEC;
+ if (limit->rate == 0 || limit->nsecs < unit)
+ return -EOVERFLOW;
+ limit->tokens = limit->tokens_max = limit->nsecs;
+
+ if (tb[NFTA_LIMIT_BURST]) {
+ u64 rate;
+
+ limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
+
+ rate = limit->rate + limit->burst;
+ if (rate < limit->rate)
+ return -EOVERFLOW;
+
+ limit->rate = rate;
+ }
+ limit->last = ktime_get_ns();
+
return 0;
}
-static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit,
+ enum nft_limit_type type)
{
- const struct nft_limit *priv = nft_expr_priv(expr);
+ u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC);
+ u64 rate = limit->rate - limit->burst;
- if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate)))
- goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit)))
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) ||
+ nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) ||
+ nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) ||
+ nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)))
goto nla_put_failure;
return 0;
@@ -84,18 +100,114 @@ nla_put_failure:
return -1;
}
+struct nft_limit_pkts {
+ struct nft_limit limit;
+ u64 cost;
+};
+
+static void nft_limit_pkts_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_limit_pkts *priv = nft_expr_priv(expr);
+
+ if (nft_limit_eval(&priv->limit, priv->cost))
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = {
+ [NFTA_LIMIT_RATE] = { .type = NLA_U64 },
+ [NFTA_LIMIT_UNIT] = { .type = NLA_U64 },
+ [NFTA_LIMIT_BURST] = { .type = NLA_U32 },
+ [NFTA_LIMIT_TYPE] = { .type = NLA_U32 },
+};
+
+static int nft_limit_pkts_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ int err;
+
+ err = nft_limit_init(&priv->limit, tb);
+ if (err < 0)
+ return err;
+
+ priv->cost = div_u64(priv->limit.nsecs, priv->limit.rate);
+ return 0;
+}
+
+static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_limit_pkts *priv = nft_expr_priv(expr);
+
+ return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
+}
+
static struct nft_expr_type nft_limit_type;
-static const struct nft_expr_ops nft_limit_ops = {
+static const struct nft_expr_ops nft_limit_pkts_ops = {
+ .type = &nft_limit_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .eval = nft_limit_pkts_eval,
+ .init = nft_limit_pkts_init,
+ .dump = nft_limit_pkts_dump,
+};
+
+static void nft_limit_pkt_bytes_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_limit *priv = nft_expr_priv(expr);
+ u64 cost = div_u64(priv->nsecs * pkt->skb->len, priv->rate);
+
+ if (nft_limit_eval(priv, cost))
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_limit_pkt_bytes_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_limit *priv = nft_expr_priv(expr);
+
+ return nft_limit_init(priv, tb);
+}
+
+static int nft_limit_pkt_bytes_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_limit *priv = nft_expr_priv(expr);
+
+ return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
+}
+
+static const struct nft_expr_ops nft_limit_pkt_bytes_ops = {
.type = &nft_limit_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_limit)),
- .eval = nft_limit_eval,
- .init = nft_limit_init,
- .dump = nft_limit_dump,
+ .eval = nft_limit_pkt_bytes_eval,
+ .init = nft_limit_pkt_bytes_init,
+ .dump = nft_limit_pkt_bytes_dump,
};
+static const struct nft_expr_ops *
+nft_limit_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_LIMIT_TYPE] == NULL)
+ return &nft_limit_pkts_ops;
+
+ switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) {
+ case NFT_LIMIT_PKTS:
+ return &nft_limit_pkts_ops;
+ case NFT_LIMIT_PKT_BYTES:
+ return &nft_limit_pkt_bytes_ops;
+ }
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
static struct nft_expr_type nft_limit_type __read_mostly = {
.name = "limit",
- .ops = &nft_limit_ops,
+ .select_ops = nft_limit_select_ops,
.policy = nft_limit_policy,
.maxattr = NFTA_LIMIT_MAX,
.flags = NFT_EXPR_STATEFUL,
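
The reworked nft_limit keeps its token bucket in nanoseconds: the bucket holds at most nsecs (one accounting unit expressed in ns, with any burst folded into the rate), refills by the time elapsed since the previous packet, and each packet consumes nsecs / rate (the byte-based variant scales the cost by skb->len). A standalone model of that arithmetic under an assumed arrival pattern (a 5 packets/second limit with packets arriving every 10 ms):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

struct limit {
	uint64_t last;		/* time of previous evaluation (ns) */
	uint64_t tokens;	/* remaining budget (ns) */
	uint64_t tokens_max;	/* bucket depth: unit * NSEC_PER_SEC */
	uint64_t rate;		/* packets (or bytes) per unit */
	uint64_t nsecs;		/* unit * NSEC_PER_SEC */
};

/* Returns 1 when the packet exceeds the limit (NFT_BREAK in nft), 0 otherwise. */
static int limit_eval(struct limit *l, uint64_t now, uint64_t cost)
{
	uint64_t tokens = l->tokens + (now - l->last);
	int64_t delta;

	if (tokens > l->tokens_max)
		tokens = l->tokens_max;

	l->last = now;
	delta = (int64_t)(tokens - cost);
	if (delta >= 0) {
		l->tokens = (uint64_t)delta;
		return 0;	/* within the limit */
	}
	l->tokens = tokens;
	return 1;		/* over the limit */
}

int main(void)
{
	/* 5 packets per second: each packet costs nsecs/rate = 200 ms. */
	struct limit l = {
		.last = 0, .rate = 5, .nsecs = NSEC_PER_SEC,
		.tokens = NSEC_PER_SEC, .tokens_max = NSEC_PER_SEC,
	};
	uint64_t cost = l.nsecs / l.rate;
	uint64_t times_ms[] = { 0, 10, 20, 30, 40, 50, 60 };

	for (unsigned int i = 0; i < sizeof(times_ms) / sizeof(times_ms[0]); i++) {
		uint64_t now = times_ms[i] * 1000000ULL;

		printf("t=%3llums -> %s\n", (unsigned long long)times_ms[i],
		       limit_eval(&l, now, cost) ? "drop" : "pass");
	}
	return 0;
}

With these parameters the first five packets drain the full bucket and pass; the remaining arrivals in the same second are over the limit until enough time has been credited back.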
diff --git a/kernel/net/netfilter/nft_log.c b/kernel/net/netfilter/nft_log.c
index a13d6a386..319c22b4b 100644
--- a/kernel/net/netfilter/nft_log.c
+++ b/kernel/net/netfilter/nft_log.c
@@ -31,9 +31,8 @@ static void nft_log_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
const struct nft_log *priv = nft_expr_priv(expr);
- struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
- nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in,
+ nf_log_packet(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in,
pkt->out, &priv->loginfo, "%s", priv->prefix);
}
diff --git a/kernel/net/netfilter/nft_meta.c b/kernel/net/netfilter/nft_meta.c
index 52561e1c3..9dfaf4d55 100644
--- a/kernel/net/netfilter/nft_meta.c
+++ b/kernel/net/netfilter/nft_meta.c
@@ -31,6 +31,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
const struct net_device *in = pkt->in, *out = pkt->out;
+ struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
switch (priv->key) {
@@ -42,7 +43,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*(__be16 *)dest = skb->protocol;
break;
case NFT_META_NFPROTO:
- *dest = pkt->ops->pf;
+ *dest = pkt->pf;
break;
case NFT_META_L4PROTO:
*dest = pkt->tprot;
@@ -86,33 +87,35 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*(u16 *)dest = out->type;
break;
case NFT_META_SKUID:
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
goto err;
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket == NULL ||
- skb->sk->sk_socket->file == NULL) {
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket == NULL ||
+ sk->sk_socket->file == NULL) {
+ read_unlock_bh(&sk->sk_callback_lock);
goto err;
}
*dest = from_kuid_munged(&init_user_ns,
- skb->sk->sk_socket->file->f_cred->fsuid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ sk->sk_socket->file->f_cred->fsuid);
+ read_unlock_bh(&sk->sk_callback_lock);
break;
case NFT_META_SKGID:
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
goto err;
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket == NULL ||
- skb->sk->sk_socket->file == NULL) {
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket == NULL ||
+ sk->sk_socket->file == NULL) {
+ read_unlock_bh(&sk->sk_callback_lock);
goto err;
}
*dest = from_kgid_munged(&init_user_ns,
- skb->sk->sk_socket->file->f_cred->fsgid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ sk->sk_socket->file->f_cred->fsgid);
+ read_unlock_bh(&sk->sk_callback_lock);
break;
#ifdef CONFIG_IP_ROUTE_CLASSID
case NFT_META_RTCLASSID: {
@@ -135,7 +138,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
}
- switch (pkt->ops->pf) {
+ switch (pkt->pf) {
case NFPROTO_IPV4:
if (ipv4_is_multicast(ip_hdr(skb)->daddr))
*dest = PACKET_MULTICAST;
@@ -166,11 +169,14 @@ void nft_meta_get_eval(const struct nft_expr *expr,
goto err;
*dest = out->group;
break;
+#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
goto err;
- *dest = skb->sk->sk_classid;
+ *dest = sk->sk_classid;
break;
+#endif
default:
WARN_ON(1);
goto err;
@@ -246,7 +252,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
case NFT_META_CPU:
case NFT_META_IIFGROUP:
case NFT_META_OIFGROUP:
+#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
+#endif
len = sizeof(u32);
break;
case NFT_META_IIFNAME:
diff --git a/kernel/net/netfilter/nft_payload.c b/kernel/net/netfilter/nft_payload.c
index 94fb3b27a..09b4b07eb 100644
--- a/kernel/net/netfilter/nft_payload.c
+++ b/kernel/net/netfilter/nft_payload.c
@@ -9,6 +9,7 @@
*/
#include <linux/kernel.h>
+#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
@@ -17,6 +18,53 @@
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+/* Add the VLAN header into the user buffer if the tag was removed by offloads. */
+static bool
+nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
+{
+ int mac_off = skb_mac_header(skb) - skb->data;
+ u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d;
+ struct vlan_ethhdr veth;
+
+ vlanh = (u8 *) &veth;
+ if (offset < ETH_HLEN) {
+ u8 ethlen = min_t(u8, len, ETH_HLEN - offset);
+
+ if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN))
+ return false;
+
+ veth.h_vlan_proto = skb->vlan_proto;
+
+ memcpy(dst_u8, vlanh + offset, ethlen);
+
+ len -= ethlen;
+ if (len == 0)
+ return true;
+
+ dst_u8 += ethlen;
+ offset = ETH_HLEN;
+ } else if (offset >= VLAN_ETH_HLEN) {
+ offset -= VLAN_HLEN;
+ goto skip;
+ }
+
+ veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+ veth.h_vlan_encapsulated_proto = skb->protocol;
+
+ vlanh += offset;
+
+ vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset);
+ memcpy(dst_u8, vlanh, vlan_len);
+
+ len -= vlan_len;
+ if (!len)
+ return true;
+
+ dst_u8 += vlan_len;
+ skip:
+ return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0;
+}
+
static void nft_payload_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -26,10 +74,18 @@ static void nft_payload_eval(const struct nft_expr *expr,
u32 *dest = &regs->data[priv->dreg];
int offset;
+ dest[priv->len / NFT_REG32_SIZE] = 0;
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
if (!skb_mac_header_was_set(skb))
goto err;
+
+ if (skb_vlan_tag_present(skb)) {
+ if (!nft_payload_copy_vlan(dest, skb,
+ priv->offset, priv->len))
+ goto err;
+ return;
+ }
offset = skb_mac_header(skb) - skb->data;
break;
case NFT_PAYLOAD_NETWORK_HEADER:
@@ -43,7 +99,6 @@ static void nft_payload_eval(const struct nft_expr *expr,
}
offset += priv->offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
if (skb_copy_bits(skb, offset, dest, priv->len) < 0)
goto err;
return;
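
nft_payload_copy_vlan() above handles the case where hardware offload has stripped the 802.1Q tag from the frame data: it rebuilds an 18-byte VLAN Ethernet header from the 14-byte MAC header plus the out-of-band tag, and serves reads that cross the tag from that temporary copy. A userspace sketch of the header reconstruction, using made-up addresses and a fixed tag value:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>	/* htons */

#define ETH_ALEN 6
#define ETH_HLEN 14

struct vlan_ethhdr {
	uint8_t  h_dest[ETH_ALEN];
	uint8_t  h_source[ETH_ALEN];
	uint16_t h_vlan_proto;			/* 0x8100 */
	uint16_t h_vlan_TCI;			/* PCP/DEI/VID */
	uint16_t h_vlan_encapsulated_proto;
} __attribute__((packed));

int main(void)
{
	/* Frame as seen after offload: plain Ethernet header, tag stripped. */
	uint8_t mac_hdr[ETH_HLEN] = {
		0x02, 0x00, 0x00, 0x00, 0x00, 0x01,	/* dest */
		0x02, 0x00, 0x00, 0x00, 0x00, 0x02,	/* source */
		0x08, 0x00,				/* IPv4 ethertype */
	};
	uint16_t vlan_tci = 100;	/* tag carried out of band (skb->vlan_tci) */
	struct vlan_ethhdr veth;
	uint8_t buf[4];

	/* Rebuild the on-wire layout: dest + source from the frame, then
	 * the 802.1Q ethertype, the TCI, and the encapsulated protocol. */
	memcpy(&veth, mac_hdr, 2 * ETH_ALEN);
	veth.h_vlan_proto = htons(0x8100);
	veth.h_vlan_TCI = htons(vlan_tci);
	memcpy(&veth.h_vlan_encapsulated_proto, mac_hdr + 2 * ETH_ALEN, 2);

	/* A payload read of link-layer bytes [12..15] now sees the tag
	 * (81 00 00 64) instead of the stripped frame contents. */
	memcpy(buf, (uint8_t *)&veth + 12, sizeof(buf));
	printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]);
	return 0;
}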
diff --git a/kernel/net/netfilter/nft_queue.c b/kernel/net/netfilter/nft_queue.c
index 96805d21d..61d216eb7 100644
--- a/kernel/net/netfilter/nft_queue.c
+++ b/kernel/net/netfilter/nft_queue.c
@@ -42,7 +42,7 @@ static void nft_queue_eval(const struct nft_expr *expr,
queue = priv->queuenum + cpu % priv->queues_total;
} else {
queue = nfqueue_hash(pkt->skb, queue,
- priv->queues_total, pkt->ops->pf,
+ priv->queues_total, pkt->pf,
jhash_initval);
}
}
diff --git a/kernel/net/netfilter/nft_reject_inet.c b/kernel/net/netfilter/nft_reject_inet.c
index 635dbba93..759ca5248 100644
--- a/kernel/net/netfilter/nft_reject_inet.c
+++ b/kernel/net/netfilter/nft_reject_inet.c
@@ -22,38 +22,37 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
struct nft_reject *priv = nft_expr_priv(expr);
- struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out);
- switch (pkt->ops->pf) {
+ switch (pkt->pf) {
case NFPROTO_IPV4:
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
nf_send_unreach(pkt->skb, priv->icmp_code,
- pkt->ops->hooknum);
+ pkt->hook);
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->skb, pkt->ops->hooknum);
+ nf_send_reset(pkt->net, pkt->skb, pkt->hook);
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach(pkt->skb,
nft_reject_icmp_code(priv->icmp_code),
- pkt->ops->hooknum);
+ pkt->hook);
break;
}
break;
case NFPROTO_IPV6:
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach6(net, pkt->skb, priv->icmp_code,
- pkt->ops->hooknum);
+ nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code,
+ pkt->hook);
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(net, pkt->skb, pkt->ops->hooknum);
+ nf_send_reset6(pkt->net, pkt->skb, pkt->hook);
break;
case NFT_REJECT_ICMPX_UNREACH:
- nf_send_unreach6(net, pkt->skb,
+ nf_send_unreach6(pkt->net, pkt->skb,
nft_reject_icmpv6_code(priv->icmp_code),
- pkt->ops->hooknum);
+ pkt->hook);
break;
}
break;
diff --git a/kernel/net/netfilter/x_tables.c b/kernel/net/netfilter/x_tables.c
index 51a459c3c..d4aaad747 100644
--- a/kernel/net/netfilter/x_tables.c
+++ b/kernel/net/netfilter/x_tables.c
@@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
[NFPROTO_IPV6] = "ip6",
};
-/* Allow this many total (re)entries. */
-static const unsigned int xt_jumpstack_multiplier = 2;
-
/* Registration hooks for targets. */
int xt_register_target(struct xt_target *target)
{
@@ -658,35 +655,23 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
struct xt_table_info *xt_alloc_table_info(unsigned int size)
{
- struct xt_table_info *newinfo;
- int cpu;
+ struct xt_table_info *info = NULL;
+ size_t sz = sizeof(*info) + size;
/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
return NULL;
- newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
- if (!newinfo)
- return NULL;
-
- newinfo->size = size;
-
- for_each_possible_cpu(cpu) {
- if (size <= PAGE_SIZE)
- newinfo->entries[cpu] = kmalloc_node(size,
- GFP_KERNEL,
- cpu_to_node(cpu));
- else
- newinfo->entries[cpu] = vmalloc_node(size,
- cpu_to_node(cpu));
-
- if (newinfo->entries[cpu] == NULL) {
- xt_free_table_info(newinfo);
+ if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+ info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+ if (!info) {
+ info = vmalloc(sz);
+ if (!info)
return NULL;
- }
}
-
- return newinfo;
+ memset(info, 0, sizeof(*info));
+ info->size = size;
+ return info;
}
EXPORT_SYMBOL(xt_alloc_table_info);
@@ -694,18 +679,13 @@ void xt_free_table_info(struct xt_table_info *info)
{
int cpu;
- for_each_possible_cpu(cpu)
- kvfree(info->entries[cpu]);
-
if (info->jumpstack != NULL) {
for_each_possible_cpu(cpu)
kvfree(info->jumpstack[cpu]);
kvfree(info->jumpstack);
}
- free_percpu(info->stackptr);
-
- kfree(info);
+ kvfree(info);
}
EXPORT_SYMBOL(xt_free_table_info);
@@ -747,15 +727,14 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock);
DEFINE_PER_CPU(seqcount_t, xt_recseq);
EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
+struct static_key xt_tee_enabled __read_mostly;
+EXPORT_SYMBOL_GPL(xt_tee_enabled);
+
static int xt_jumpstack_alloc(struct xt_table_info *i)
{
unsigned int size;
int cpu;
- i->stackptr = alloc_percpu(unsigned int);
- if (i->stackptr == NULL)
- return -ENOMEM;
-
size = sizeof(void **) * nr_cpu_ids;
if (size > PAGE_SIZE)
i->jumpstack = vzalloc(size);
@@ -764,8 +743,21 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
if (i->jumpstack == NULL)
return -ENOMEM;
- i->stacksize *= xt_jumpstack_multiplier;
- size = sizeof(void *) * i->stacksize;
+ /* ruleset without jumps -- no stack needed */
+ if (i->stacksize == 0)
+ return 0;
+
+ /* The jumpstack needs to be able to record two full call chains: one
+ * from the initial rule set traversal, plus one for a table re-entry
+ * via -j TEE, without clobbering the call chain that brought us to
+ * the TEE target.
+ *
+ * This is done by allocating two jumpstacks per CPU; on re-entry
+ * the upper half of the stack is used.
+ *
+ * See the jumpstack setup in ipt_do_table() for more details.
+ */
+ size = sizeof(void *) * i->stacksize * 2u;
for_each_possible_cpu(cpu) {
if (size > PAGE_SIZE)
i->jumpstack[cpu] = vmalloc_node(size,
@@ -947,11 +939,9 @@ static int xt_table_seq_show(struct seq_file *seq, void *v)
{
struct xt_table *table = list_entry(v, struct xt_table, list);
- if (strlen(table->name)) {
+ if (*table->name)
seq_printf(seq, "%s\n", table->name);
- return seq_has_overflowed(seq);
- } else
- return 0;
+ return 0;
}
static const struct seq_operations xt_table_seq_ops = {
@@ -1087,10 +1077,8 @@ static int xt_match_seq_show(struct seq_file *seq, void *v)
if (trav->curr == trav->head)
return 0;
match = list_entry(trav->curr, struct xt_match, list);
- if (*match->name == '\0')
- return 0;
- seq_printf(seq, "%s\n", match->name);
- return seq_has_overflowed(seq);
+ if (*match->name)
+ seq_printf(seq, "%s\n", match->name);
}
return 0;
}
@@ -1142,10 +1130,8 @@ static int xt_target_seq_show(struct seq_file *seq, void *v)
if (trav->curr == trav->head)
return 0;
target = list_entry(trav->curr, struct xt_target, list);
- if (*target->name == '\0')
- return 0;
- seq_printf(seq, "%s\n", target->name);
- return seq_has_overflowed(seq);
+ if (*target->name)
+ seq_printf(seq, "%s\n", target->name);
}
return 0;
}
@@ -1207,7 +1193,6 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
if (!(hook_mask & 1))
continue;
ops[i].hook = fn;
- ops[i].owner = table->me;
ops[i].pf = table->af;
ops[i].hooknum = hooknum;
ops[i].priority = table->priority;
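
As the comment in xt_jumpstack_alloc() above explains, a single allocation of 2 * stacksize entries per CPU now serves both the initial traversal and one TEE re-entry. A trivial sketch of the index split (the stack depth is an arbitrary example value):

#include <stdio.h>

#define STACKSIZE 8	/* deepest call chain in the ruleset (i->stacksize) */

/* The first traversal uses slots [0, STACKSIZE); a single re-entry
 * through the TEE target starts at slot STACKSIZE, so it can never
 * clobber the frames of the chain that invoked TEE. */
static unsigned int stack_base(int in_tee_reentry)
{
	return in_tee_reentry ? STACKSIZE : 0;
}

int main(void)
{
	printf("normal traversal pushes from slot %u\n", stack_base(0));
	printf("TEE re-entry pushes from slot %u\n", stack_base(1));
	return 0;
}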
diff --git a/kernel/net/netfilter/xt_CT.c b/kernel/net/netfilter/xt_CT.c
index 75747aecd..e7ac07e53 100644
--- a/kernel/net/netfilter/xt_CT.c
+++ b/kernel/net/netfilter/xt_CT.c
@@ -171,6 +171,9 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
if (timeout_ext == NULL)
ret = -ENOMEM;
+ rcu_read_unlock();
+ return ret;
+
err_put_timeout:
__xt_ct_tg_timeout_put(timeout);
out:
@@ -181,10 +184,23 @@ out:
#endif
}
+static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info)
+{
+ switch (info->flags & (XT_CT_ZONE_DIR_ORIG |
+ XT_CT_ZONE_DIR_REPL)) {
+ case XT_CT_ZONE_DIR_ORIG:
+ return NF_CT_ZONE_DIR_ORIG;
+ case XT_CT_ZONE_DIR_REPL:
+ return NF_CT_ZONE_DIR_REPL;
+ default:
+ return NF_CT_DEFAULT_ZONE_DIR;
+ }
+}
+
static int xt_ct_tg_check(const struct xt_tgchk_param *par,
struct xt_ct_target_info_v1 *info)
{
- struct nf_conntrack_tuple t;
+ struct nf_conntrack_zone zone;
struct nf_conn *ct;
int ret = -EOPNOTSUPP;
@@ -194,7 +210,9 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
}
#ifndef CONFIG_NF_CONNTRACK_ZONES
- if (info->zone)
+ if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG |
+ XT_CT_ZONE_DIR_REPL |
+ XT_CT_ZONE_MARK))
goto err1;
#endif
@@ -202,11 +220,17 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
if (ret < 0)
goto err1;
- memset(&t, 0, sizeof(t));
- ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL);
- ret = PTR_ERR(ct);
- if (IS_ERR(ct))
+ memset(&zone, 0, sizeof(zone));
+ zone.id = info->zone;
+ zone.dir = xt_ct_flags_to_dir(info);
+ if (info->flags & XT_CT_ZONE_MARK)
+ zone.flags |= NF_CT_FLAG_MARK;
+
+ ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL);
+ if (!ct) {
+ ret = -ENOMEM;
goto err2;
+ }
ret = 0;
if ((info->ct_events || info->exp_events) &&
@@ -227,14 +251,14 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
if (ret < 0)
goto err3;
}
-
- nf_conntrack_tmpl_insert(par->net, ct);
+ __set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ nf_conntrack_get(&ct->ct_general);
out:
info->ct = ct;
return 0;
err3:
- nf_conntrack_free(ct);
+ nf_ct_tmpl_free(ct);
err2:
nf_ct_l3proto_module_put(par->family);
err1:
@@ -297,8 +321,10 @@ static void xt_ct_destroy_timeout(struct nf_conn *ct)
if (timeout_put) {
timeout_ext = nf_ct_timeout_find(ct);
- if (timeout_ext)
+ if (timeout_ext) {
timeout_put(timeout_ext->timeout);
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ }
}
rcu_read_unlock();
#endif
diff --git a/kernel/net/netfilter/xt_IDLETIMER.c b/kernel/net/netfilter/xt_IDLETIMER.c
index f407ebc13..29d2c31f4 100644
--- a/kernel/net/netfilter/xt_IDLETIMER.c
+++ b/kernel/net/netfilter/xt_IDLETIMER.c
@@ -126,6 +126,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
goto out;
}
+ sysfs_attr_init(&info->timer->attr.attr);
info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
if (!info->timer->attr.attr.name) {
ret = -ENOMEM;
diff --git a/kernel/net/netfilter/xt_LOG.c b/kernel/net/netfilter/xt_LOG.c
index c13b79440..1763ab82b 100644
--- a/kernel/net/netfilter/xt_LOG.c
+++ b/kernel/net/netfilter/xt_LOG.c
@@ -33,7 +33,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_log_info *loginfo = par->targinfo;
struct nf_loginfo li;
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = loginfo->level;
diff --git a/kernel/net/netfilter/xt_NFLOG.c b/kernel/net/netfilter/xt_NFLOG.c
index fb7497c92..a1fa2c800 100644
--- a/kernel/net/netfilter/xt_NFLOG.c
+++ b/kernel/net/netfilter/xt_NFLOG.c
@@ -26,7 +26,7 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_nflog_info *info = par->targinfo;
struct nf_loginfo li;
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
diff --git a/kernel/net/netfilter/xt_TCPMSS.c b/kernel/net/netfilter/xt_TCPMSS.c
index e762de5ee..b7c43def0 100644
--- a/kernel/net/netfilter/xt_TCPMSS.c
+++ b/kernel/net/netfilter/xt_TCPMSS.c
@@ -108,7 +108,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
return -1;
if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
if (dst_mtu(skb_dst(skb)) <= minlen) {
@@ -144,7 +144,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
inet_proto_csum_replace2(&tcph->check, skb,
htons(oldmss), htons(newmss),
- 0);
+ false);
return 0;
}
}
@@ -185,18 +185,18 @@ tcpmss_mangle_packet(struct sk_buff *skb,
memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr));
inet_proto_csum_replace2(&tcph->check, skb,
- htons(len), htons(len + TCPOLEN_MSS), 1);
+ htons(len), htons(len + TCPOLEN_MSS), true);
opt[0] = TCPOPT_MSS;
opt[1] = TCPOLEN_MSS;
opt[2] = (newmss & 0xff00) >> 8;
opt[3] = newmss & 0x00ff;
- inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0);
+ inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false);
oldval = ((__be16 *)tcph)[6];
tcph->doff += TCPOLEN_MSS/4;
inet_proto_csum_replace2(&tcph->check, skb,
- oldval, ((__be16 *)tcph)[6], 0);
+ oldval, ((__be16 *)tcph)[6], false);
return TCPOLEN_MSS;
}
@@ -277,6 +277,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
"FORWARD, OUTPUT and POSTROUTING hooks\n");
return -EINVAL;
}
+ if (par->nft_compat)
+ return 0;
+
xt_ematch_foreach(ematch, e)
if (find_syn_match(ematch))
return 0;
@@ -299,6 +302,9 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
"FORWARD, OUTPUT and POSTROUTING hooks\n");
return -EINVAL;
}
+ if (par->nft_compat)
+ return 0;
+
xt_ematch_foreach(ematch, e)
if (find_syn_match(ematch))
return 0;
diff --git a/kernel/net/netfilter/xt_TCPOPTSTRIP.c b/kernel/net/netfilter/xt_TCPOPTSTRIP.c
index 625fa1d63..eb92bffff 100644
--- a/kernel/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/kernel/net/netfilter/xt_TCPOPTSTRIP.c
@@ -80,7 +80,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
n <<= 8;
}
inet_proto_csum_replace2(&tcph->check, skb, htons(o),
- htons(n), 0);
+ htons(n), false);
}
memset(opt + i, TCPOPT_NOP, optl);
}
diff --git a/kernel/net/netfilter/xt_TEE.c b/kernel/net/netfilter/xt_TEE.c
index 292934d23..3eff7b67c 100644
--- a/kernel/net/netfilter/xt_TEE.c
+++ b/kernel/net/netfilter/xt_TEE.c
@@ -10,26 +10,15 @@
* modify it under the terms of the GNU General Public License
* version 2 or later, as published by the Free Software Foundation.
*/
-#include <linux/ip.h>
#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/route.h>
#include <linux/skbuff.h>
-#include <linux/notifier.h>
-#include <net/checksum.h>
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <net/route.h>
+#include <linux/route.h>
#include <linux/netfilter/x_tables.h>
+#include <net/route.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+#include <net/netfilter/ipv6/nf_dup_ipv6.h>
#include <linux/netfilter/xt_TEE.h>
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-# define WITH_CONNTRACK 1
-# include <net/netfilter/nf_conntrack.h>
-#endif
-
struct xt_tee_priv {
struct notifier_block notifier;
struct xt_tee_tginfo *tginfo;
@@ -37,162 +26,27 @@ struct xt_tee_priv {
};
static const union nf_inet_addr tee_zero_address;
-static DEFINE_PER_CPU(bool, tee_active);
-
-static struct net *pick_net(struct sk_buff *skb)
-{
-#ifdef CONFIG_NET_NS
- const struct dst_entry *dst;
-
- if (skb->dev != NULL)
- return dev_net(skb->dev);
- dst = skb_dst(skb);
- if (dst != NULL && dst->dev != NULL)
- return dev_net(dst->dev);
-#endif
- return &init_net;
-}
-
-static bool
-tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
-{
- const struct iphdr *iph = ip_hdr(skb);
- struct net *net = pick_net(skb);
- struct rtable *rt;
- struct flowi4 fl4;
-
- memset(&fl4, 0, sizeof(fl4));
- if (info->priv) {
- if (info->priv->oif == -1)
- return false;
- fl4.flowi4_oif = info->priv->oif;
- }
- fl4.daddr = info->gw.ip;
- fl4.flowi4_tos = RT_TOS(iph->tos);
- fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
- fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
- rt = ip_route_output_key(net, &fl4);
- if (IS_ERR(rt))
- return false;
-
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- skb->dev = rt->dst.dev;
- skb->protocol = htons(ETH_P_IP);
- return true;
-}
static unsigned int
tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tee_tginfo *info = par->targinfo;
- struct iphdr *iph;
+ int oif = info->priv ? info->priv->oif : 0;
- if (__this_cpu_read(tee_active))
- return XT_CONTINUE;
- /*
- * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
- * the original skb, which should continue on its way as if nothing has
- * happened. The copy should be independently delivered to the TEE
- * --gateway.
- */
- skb = pskb_copy(skb, GFP_ATOMIC);
- if (skb == NULL)
- return XT_CONTINUE;
-
-#ifdef WITH_CONNTRACK
- /* Avoid counting cloned packets towards the original connection. */
- nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
-#endif
- /*
- * If we are in PREROUTING/INPUT, the checksum must be recalculated
- * since the length could have changed as a result of defragmentation.
- *
- * We also decrease the TTL to mitigate potential TEE loops
- * between two hosts.
- *
- * Set %IP_DF so that the original source is notified of a potentially
- * decreased MTU on the clone route. IPv6 does this too.
- */
- iph = ip_hdr(skb);
- iph->frag_off |= htons(IP_DF);
- if (par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_IN)
- --iph->ttl;
- ip_send_check(iph);
+ nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif);
- if (tee_tg_route4(skb, info)) {
- __this_cpu_write(tee_active, true);
- ip_local_out(skb);
- __this_cpu_write(tee_active, false);
- } else {
- kfree_skb(skb);
- }
return XT_CONTINUE;
}
-#if IS_ENABLED(CONFIG_IPV6)
-static bool
-tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
-{
- const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct net *net = pick_net(skb);
- struct dst_entry *dst;
- struct flowi6 fl6;
-
- memset(&fl6, 0, sizeof(fl6));
- if (info->priv) {
- if (info->priv->oif == -1)
- return false;
- fl6.flowi6_oif = info->priv->oif;
- }
- fl6.daddr = info->gw.in6;
- fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
- (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
- dst = ip6_route_output(net, NULL, &fl6);
- if (dst->error) {
- dst_release(dst);
- return false;
- }
- skb_dst_drop(skb);
- skb_dst_set(skb, dst);
- skb->dev = dst->dev;
- skb->protocol = htons(ETH_P_IPV6);
- return true;
-}
-
+#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
static unsigned int
tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tee_tginfo *info = par->targinfo;
+ int oif = info->priv ? info->priv->oif : 0;
- if (__this_cpu_read(tee_active))
- return XT_CONTINUE;
- skb = pskb_copy(skb, GFP_ATOMIC);
- if (skb == NULL)
- return XT_CONTINUE;
+ nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif);
-#ifdef WITH_CONNTRACK
- nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
-#endif
- if (par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_IN) {
- struct ipv6hdr *iph = ipv6_hdr(skb);
- --iph->hop_limit;
- }
- if (tee_tg_route6(skb, info)) {
- __this_cpu_write(tee_active, true);
- ip6_local_out(skb);
- __this_cpu_write(tee_active, false);
- } else {
- kfree_skb(skb);
- }
return XT_CONTINUE;
}
#endif
@@ -251,6 +105,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
} else
info->priv = NULL;
+ static_key_slow_inc(&xt_tee_enabled);
return 0;
}
@@ -262,6 +117,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par)
unregister_netdevice_notifier(&info->priv->notifier);
kfree(info->priv);
}
+ static_key_slow_dec(&xt_tee_enabled);
}
static struct xt_target tee_tg_reg[] __read_mostly = {
@@ -275,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
.destroy = tee_tg_destroy,
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
{
.name = "TEE",
.revision = 1,
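The clone-and-route logic that used to live in xt_TEE is now provided by nf_dup_ipv4()/nf_dup_ipv6(), and the tee_tg_check()/tee_tg_destroy() hunks above additionally flip the xt_tee_enabled static key so the packet fast path only pays for TEE while at least one -j TEE rule is loaded. A minimal sketch of that static-key pattern, assuming the standard jump-label API; every example_* name is illustrative and not part of this patch:

#include <linux/jump_label.h>

static struct static_key example_enabled = STATIC_KEY_INIT_FALSE;

static int example_checkentry(void)	/* rule added: enable the key */
{
	static_key_slow_inc(&example_enabled);
	return 0;
}

static void example_destroy(void)	/* rule removed: disable it again */
{
	static_key_slow_dec(&example_enabled);
}

static void example_fast_path(void)
{
	if (static_key_false(&example_enabled)) {
		/* only reached while at least one rule holds the key */
	}
}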
diff --git a/kernel/net/netfilter/xt_TPROXY.c b/kernel/net/netfilter/xt_TPROXY.c
index cca96cec1..3ab591e73 100644
--- a/kernel/net/netfilter/xt_TPROXY.c
+++ b/kernel/net/netfilter/xt_TPROXY.c
@@ -250,8 +250,8 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
* no such listener is found, or NULL if the TCP header is incomplete.
*/
static struct sock *
-tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
- struct sock *sk)
+tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
+ __be32 laddr, __be16 lport, struct sock *sk)
{
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr _hdr, *hp;
@@ -267,13 +267,12 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ sk2 = nf_tproxy_get_sock_v4(net, iph->protocol,
iph->saddr, laddr ? laddr : iph->daddr,
hp->source, lport ? lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
@@ -291,7 +290,7 @@ nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
}
static unsigned int
-tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
u_int32_t mark_mask, u_int32_t mark_value)
{
const struct iphdr *iph = ip_hdr(skb);
@@ -306,7 +305,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ sk = nf_tproxy_get_sock_v4(net, iph->protocol,
iph->saddr, iph->daddr,
hp->source, hp->dest,
skb->dev, NFT_LOOKUP_ESTABLISHED);
@@ -318,11 +317,11 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
if (sk && sk->sk_state == TCP_TIME_WAIT)
/* reopening a TIME_WAIT connection needs special handling */
- sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
+ sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
else if (!sk)
/* no, there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ sk = nf_tproxy_get_sock_v4(net, iph->protocol,
iph->saddr, laddr,
hp->source, lport,
skb->dev, NFT_LOOKUP_LISTENER);
@@ -352,7 +351,7 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info *tgi = par->targinfo;
- return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+ return tproxy_tg4(par->net, skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
}
static unsigned int
@@ -360,7 +359,7 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
- return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+ return tproxy_tg4(par->net, skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
}
#ifdef XT_TPROXY_HAVE_IPV6
@@ -430,15 +429,14 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ sk2 = nf_tproxy_get_sock_v6(par->net, tproto,
&iph->saddr,
tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
hp->source,
tgi->lport ? tgi->lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
@@ -474,7 +472,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ sk = nf_tproxy_get_sock_v6(par->net, tproto,
&iph->saddr, &iph->daddr,
hp->source, hp->dest,
par->in, NFT_LOOKUP_ESTABLISHED);
@@ -489,7 +487,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
else if (!sk)
/* no there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ sk = nf_tproxy_get_sock_v6(par->net, tproto,
&iph->saddr, laddr,
hp->source, lport,
par->in, NFT_LOOKUP_LISTENER);
diff --git a/kernel/net/netfilter/xt_addrtype.c b/kernel/net/netfilter/xt_addrtype.c
index fab6eea1b..11d609199 100644
--- a/kernel/net/netfilter/xt_addrtype.c
+++ b/kernel/net/netfilter/xt_addrtype.c
@@ -73,7 +73,7 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
if (dev == NULL && rt->rt6i_flags & RTF_LOCAL)
ret |= XT_ADDRTYPE_LOCAL;
- if (rt->rt6i_flags & RTF_ANYCAST)
+ if (ipv6_anycast_destination((struct dst_entry *)rt, addr))
ret |= XT_ADDRTYPE_ANYCAST;
dst_release(&rt->dst);
@@ -125,7 +125,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev,
static bool
addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
const struct xt_addrtype_info *info = par->matchinfo;
const struct iphdr *iph = ip_hdr(skb);
bool ret = true;
@@ -143,7 +143,7 @@ addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
static bool
addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
const struct xt_addrtype_info_v1 *info = par->matchinfo;
const struct iphdr *iph;
const struct net_device *dev = NULL;
diff --git a/kernel/net/netfilter/xt_connlabel.c b/kernel/net/netfilter/xt_connlabel.c
index 9f8719df2..bb9cbeb18 100644
--- a/kernel/net/netfilter/xt_connlabel.c
+++ b/kernel/net/netfilter/xt_connlabel.c
@@ -42,10 +42,6 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
XT_CONNLABEL_OP_SET;
struct xt_connlabel_mtinfo *info = par->matchinfo;
int ret;
- size_t words;
-
- if (info->bit > XT_CONNLABEL_MAXBIT)
- return -ERANGE;
if (info->options & ~options) {
pr_err("Unknown options in mask %x\n", info->options);
@@ -59,19 +55,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
return ret;
}
- par->net->ct.labels_used++;
- words = BITS_TO_LONGS(info->bit+1);
- if (words > par->net->ct.label_words)
- par->net->ct.label_words = words;
-
+ ret = nf_connlabels_get(par->net, info->bit + 1);
+ if (ret < 0)
+ nf_ct_l3proto_module_put(par->family);
return ret;
}
static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
{
- par->net->ct.labels_used--;
- if (par->net->ct.labels_used == 0)
- par->net->ct.label_words = 0;
+ nf_connlabels_put(par->net);
nf_ct_l3proto_module_put(par->family);
}
diff --git a/kernel/net/netfilter/xt_connlimit.c b/kernel/net/netfilter/xt_connlimit.c
index 29ba6218a..99bbc8298 100644
--- a/kernel/net/netfilter/xt_connlimit.c
+++ b/kernel/net/netfilter/xt_connlimit.c
@@ -134,7 +134,7 @@ static bool add_hlist(struct hlist_head *head,
static unsigned int check_hlist(struct net *net,
struct hlist_head *head,
const struct nf_conntrack_tuple *tuple,
- u16 zone,
+ const struct nf_conntrack_zone *zone,
bool *addit)
{
const struct nf_conntrack_tuple_hash *found;
@@ -201,7 +201,7 @@ static unsigned int
count_tree(struct net *net, struct rb_root *root,
const struct nf_conntrack_tuple *tuple,
const union nf_inet_addr *addr, const union nf_inet_addr *mask,
- u8 family, u16 zone)
+ u8 family, const struct nf_conntrack_zone *zone)
{
struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
struct rb_node **rbnode, *parent;
@@ -290,7 +290,8 @@ static int count_them(struct net *net,
const struct nf_conntrack_tuple *tuple,
const union nf_inet_addr *addr,
const union nf_inet_addr *mask,
- u_int8_t family, u16 zone)
+ u_int8_t family,
+ const struct nf_conntrack_zone *zone)
{
struct rb_root *root;
int count;
@@ -316,22 +317,22 @@ static int count_them(struct net *net,
static bool
connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
const struct xt_connlimit_info *info = par->matchinfo;
union nf_inet_addr addr;
struct nf_conntrack_tuple tuple;
const struct nf_conntrack_tuple *tuple_ptr = &tuple;
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct;
unsigned int connections;
- u16 zone = NF_CT_DEFAULT_ZONE;
ct = nf_ct_get(skb, &ctinfo);
if (ct != NULL) {
tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
zone = nf_ct_zone(ct);
} else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- par->family, &tuple)) {
+ par->family, net, &tuple)) {
goto hotdrop;
}
diff --git a/kernel/net/netfilter/xt_ipvs.c b/kernel/net/netfilter/xt_ipvs.c
index 8d47c3780..71a9d95e0 100644
--- a/kernel/net/netfilter/xt_ipvs.c
+++ b/kernel/net/netfilter/xt_ipvs.c
@@ -48,6 +48,7 @@ static bool
ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_ipvs_mtinfo *data = par->matchinfo;
+ struct netns_ipvs *ipvs = net_ipvs(par->net);
/* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
const u_int8_t family = par->family;
struct ip_vs_iphdr iph;
@@ -67,7 +68,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
goto out;
}
- ip_vs_fill_iph_skb(family, skb, &iph);
+ ip_vs_fill_iph_skb(family, skb, true, &iph);
if (data->bitmask & XT_IPVS_PROTO)
if ((iph.protocol == data->l4proto) ^
@@ -85,7 +86,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */);
+ cp = pp->conn_out_get(ipvs, family, skb, &iph);
if (unlikely(cp == NULL)) {
match = false;
goto out;
diff --git a/kernel/net/netfilter/xt_mark.c b/kernel/net/netfilter/xt_mark.c
index 233452387..ebd41dc50 100644
--- a/kernel/net/netfilter/xt_mark.c
+++ b/kernel/net/netfilter/xt_mark.c
@@ -23,6 +23,7 @@ MODULE_ALIAS("ipt_mark");
MODULE_ALIAS("ip6t_mark");
MODULE_ALIAS("ipt_MARK");
MODULE_ALIAS("ip6t_MARK");
+MODULE_ALIAS("arpt_MARK");
static unsigned int
mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
diff --git a/kernel/net/netfilter/xt_nfacct.c b/kernel/net/netfilter/xt_nfacct.c
index 8c646ed9c..3048a7e3a 100644
--- a/kernel/net/netfilter/xt_nfacct.c
+++ b/kernel/net/netfilter/xt_nfacct.c
@@ -37,7 +37,7 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
struct xt_nfacct_match_info *info = par->matchinfo;
struct nf_acct *nfacct;
- nfacct = nfnl_acct_find_get(info->name);
+ nfacct = nfnl_acct_find_get(par->net, info->name);
if (nfacct == NULL) {
pr_info("xt_nfacct: accounting object with name `%s' "
"does not exists\n", info->name);
diff --git a/kernel/net/netfilter/xt_osf.c b/kernel/net/netfilter/xt_osf.c
index 0778855ea..df8801e02 100644
--- a/kernel/net/netfilter/xt_osf.c
+++ b/kernel/net/netfilter/xt_osf.c
@@ -200,7 +200,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
unsigned char opts[MAX_IPOPTLEN];
const struct xt_osf_finger *kf;
const struct xt_osf_user_finger *f;
- struct net *net = dev_net(p->in ? p->in : p->out);
+ struct net *net = p->net;
if (!info)
return false;
diff --git a/kernel/net/netfilter/xt_owner.c b/kernel/net/netfilter/xt_owner.c
index ca2e577ed..1302b475a 100644
--- a/kernel/net/netfilter/xt_owner.c
+++ b/kernel/net/netfilter/xt_owner.c
@@ -14,6 +14,7 @@
#include <linux/skbuff.h>
#include <linux/file.h>
#include <net/sock.h>
+#include <net/inet_sock.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_owner.h>
@@ -33,8 +34,9 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_owner_match_info *info = par->matchinfo;
const struct file *filp;
+ struct sock *sk = skb_to_full_sk(skb);
- if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+ if (sk == NULL || sk->sk_socket == NULL)
return (info->match ^ info->invert) == 0;
else if (info->match & info->invert & XT_OWNER_SOCKET)
/*
@@ -43,7 +45,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
*/
return false;
- filp = skb->sk->sk_socket->file;
+ filp = sk->sk_socket->file;
if (filp == NULL)
return ((info->match ^ info->invert) &
(XT_OWNER_UID | XT_OWNER_GID)) == 0;
diff --git a/kernel/net/netfilter/xt_recent.c b/kernel/net/netfilter/xt_recent.c
index 45e1b30e4..d725a2774 100644
--- a/kernel/net/netfilter/xt_recent.c
+++ b/kernel/net/netfilter/xt_recent.c
@@ -237,7 +237,7 @@ static void recent_table_flush(struct recent_table *t)
static bool
recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
struct recent_net *recent_net = recent_pernet(net);
const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
struct recent_table *t;
diff --git a/kernel/net/netfilter/xt_set.c b/kernel/net/netfilter/xt_set.c
index 89045982e..5669e5b45 100644
--- a/kernel/net/netfilter/xt_set.c
+++ b/kernel/net/netfilter/xt_set.c
@@ -9,14 +9,16 @@
*/
/* Kernel module which implements the set match and SET target
- * for netfilter/iptables. */
+ * for netfilter/iptables.
+ */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/xt_set.h>
+#include <linux/netfilter/ipset/ip_set.h>
#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <uapi/linux/netfilter/xt_set.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
@@ -52,6 +54,7 @@ static bool
set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v0 *info = par->matchinfo;
+
ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
info->match_set.u.compat.flags, 0, UINT_MAX);
@@ -68,10 +71,10 @@ compat_flags(struct xt_set_info_v0 *info)
info->u.compat.dim = IPSET_DIM_ZERO;
if (info->u.flags[0] & IPSET_MATCH_INV)
info->u.compat.flags |= IPSET_INV_MATCH;
- for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) {
+ for (i = 0; i < IPSET_DIM_MAX - 1 && info->u.flags[i]; i++) {
info->u.compat.dim++;
if (info->u.flags[i] & IPSET_SRC)
- info->u.compat.flags |= (1<<info->u.compat.dim);
+ info->u.compat.flags |= (1 << info->u.compat.dim);
}
}
@@ -88,7 +91,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)
info->match_set.index);
return -ENOENT;
}
- if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ if (info->match_set.u.flags[IPSET_DIM_MAX - 1] != 0) {
pr_warn("Protocol error: set match dimension is over the limit!\n");
ip_set_nfnl_put(par->net, info->match_set.index);
return -ERANGE;
@@ -114,6 +117,7 @@ static bool
set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v1 *info = par->matchinfo;
+
ADT_OPT(opt, par->family, info->match_set.dim,
info->match_set.flags, 0, UINT_MAX);
@@ -178,9 +182,10 @@ static bool
set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v3 *info = par->matchinfo;
+ int ret;
+
ADT_OPT(opt, par->family, info->match_set.dim,
info->match_set.flags, info->flags, UINT_MAX);
- int ret;
if (info->packets.op != IPSET_COUNTER_NONE ||
info->bytes.op != IPSET_COUNTER_NONE)
@@ -224,9 +229,10 @@ static bool
set_match_v4(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v4 *info = par->matchinfo;
+ int ret;
+
ADT_OPT(opt, par->family, info->match_set.dim,
info->match_set.flags, info->flags, UINT_MAX);
- int ret;
if (info->packets.op != IPSET_COUNTER_NONE ||
info->bytes.op != IPSET_COUNTER_NONE)
@@ -252,6 +258,7 @@ static unsigned int
set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v0 *info = par->targinfo;
+
ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim,
info->add_set.u.compat.flags, 0, UINT_MAX);
ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim,
@@ -290,8 +297,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
return -ENOENT;
}
}
- if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 ||
- info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ if (info->add_set.u.flags[IPSET_DIM_MAX - 1] != 0 ||
+ info->del_set.u.flags[IPSET_DIM_MAX - 1] != 0) {
pr_warn("Protocol error: SET target dimension is over the limit!\n");
if (info->add_set.index != IPSET_INVALID_ID)
ip_set_nfnl_put(par->net, info->add_set.index);
@@ -324,6 +331,7 @@ static unsigned int
set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v1 *info = par->targinfo;
+
ADT_OPT(add_opt, par->family, info->add_set.dim,
info->add_set.flags, 0, UINT_MAX);
ADT_OPT(del_opt, par->family, info->del_set.dim,
@@ -392,6 +400,7 @@ static unsigned int
set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v2 *info = par->targinfo;
+
ADT_OPT(add_opt, par->family, info->add_set.dim,
info->add_set.flags, info->flags, info->timeout);
ADT_OPT(del_opt, par->family, info->del_set.dim,
@@ -399,8 +408,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
/* Normalize to fit into jiffies */
if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
- add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
- add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
+ add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC)
+ add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC;
if (info->add_set.index != IPSET_INVALID_ID)
ip_set_add(info->add_set.index, skb, par, &add_opt);
if (info->del_set.index != IPSET_INVALID_ID)
@@ -418,6 +427,8 @@ static unsigned int
set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
+ int ret;
+
ADT_OPT(add_opt, par->family, info->add_set.dim,
info->add_set.flags, info->flags, info->timeout);
ADT_OPT(del_opt, par->family, info->del_set.dim,
@@ -425,12 +436,10 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
ADT_OPT(map_opt, par->family, info->map_set.dim,
info->map_set.flags, 0, UINT_MAX);
- int ret;
-
/* Normalize to fit into jiffies */
if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
- add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
- add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
+ add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC)
+ add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC;
if (info->add_set.index != IPSET_INVALID_ID)
ip_set_add(info->add_set.index, skb, par, &add_opt);
if (info->del_set.index != IPSET_INVALID_ID)
@@ -456,7 +465,6 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-
static int
set_target_v3_checkentry(const struct xt_tgchk_param *par)
{
@@ -496,8 +504,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
!(par->hook_mask & (1 << NF_INET_FORWARD |
1 << NF_INET_LOCAL_OUT |
1 << NF_INET_POST_ROUTING))) {
- pr_warn("mapping of prio or/and queue is allowed only"
- "from OUTPUT/FORWARD/POSTROUTING chains\n");
+ pr_warn("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
return -EINVAL;
}
index = ip_set_nfnl_get_byindex(par->net,
@@ -518,8 +525,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
if (info->add_set.dim > IPSET_DIM_MAX ||
info->del_set.dim > IPSET_DIM_MAX ||
info->map_set.dim > IPSET_DIM_MAX) {
- pr_warn("Protocol error: SET target dimension "
- "is over the limit!\n");
+ pr_warn("Protocol error: SET target dimension is over the limit!\n");
if (info->add_set.index != IPSET_INVALID_ID)
ip_set_nfnl_put(par->net, info->add_set.index);
if (info->del_set.index != IPSET_INVALID_ID)
@@ -545,7 +551,6 @@ set_target_v3_destroy(const struct xt_tgdtor_param *par)
ip_set_nfnl_put(par->net, info->map_set.index);
}
-
static struct xt_match set_matches[] __read_mostly = {
{
.name = "set",
diff --git a/kernel/net/netfilter/xt_socket.c b/kernel/net/netfilter/xt_socket.c
index e092cb046..2ec08f04b 100644
--- a/kernel/net/netfilter/xt_socket.c
+++ b/kernel/net/netfilter/xt_socket.c
@@ -143,7 +143,8 @@ static bool xt_socket_sk_is_transparent(struct sock *sk)
}
}
-static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb,
+static struct sock *xt_socket_lookup_slow_v4(struct net *net,
+ const struct sk_buff *skb,
const struct net_device *indev)
{
const struct iphdr *iph = ip_hdr(skb);
@@ -197,7 +198,7 @@ static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb,
}
#endif
- return xt_socket_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr,
+ return xt_socket_get_sock_v4(net, protocol, saddr, daddr,
sport, dport, indev);
}
@@ -205,10 +206,11 @@ static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
const struct xt_socket_mtinfo1 *info)
{
+ struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
if (!sk)
- sk = xt_socket_lookup_slow_v4(skb, par->in);
+ sk = xt_socket_lookup_slow_v4(par->net, skb, par->in);
if (sk) {
bool wildcard;
bool transparent = true;
@@ -226,6 +228,10 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
if (info->flags & XT_SOCKET_TRANSPARENT)
transparent = xt_socket_sk_is_transparent(sk);
+ if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
+ transparent)
+ pskb->mark = sk->sk_mark;
+
if (sk != skb->sk)
sock_gen_put(sk);
@@ -247,7 +253,7 @@ socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
}
static bool
-socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
{
return socket_match(skb, par, par->matchinfo);
}
@@ -330,7 +336,8 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol,
return NULL;
}
-static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb,
+static struct sock *xt_socket_lookup_slow_v6(struct net *net,
+ const struct sk_buff *skb,
const struct net_device *indev)
{
__be16 uninitialized_var(dport), uninitialized_var(sport);
@@ -366,18 +373,19 @@ static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb,
return NULL;
}
- return xt_socket_get_sock_v6(dev_net(skb->dev), tproto, saddr, daddr,
+ return xt_socket_get_sock_v6(net, tproto, saddr, daddr,
sport, dport, indev);
}
static bool
-socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+ struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
if (!sk)
- sk = xt_socket_lookup_slow_v6(skb, par->in);
+ sk = xt_socket_lookup_slow_v6(par->net, skb, par->in);
if (sk) {
bool wildcard;
bool transparent = true;
@@ -395,6 +403,10 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
if (info->flags & XT_SOCKET_TRANSPARENT)
transparent = xt_socket_sk_is_transparent(sk);
+ if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
+ transparent)
+ pskb->mark = sk->sk_mark;
+
if (sk != skb->sk)
sock_gen_put(sk);
@@ -428,6 +440,19 @@ static int socket_mt_v2_check(const struct xt_mtchk_param *par)
return 0;
}
+static int socket_mt_v3_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_socket_mtinfo3 *info =
+ (struct xt_socket_mtinfo3 *)par->matchinfo;
+
+ if (info->flags & ~XT_SOCKET_FLAGS_V3) {
+ pr_info("unknown flags 0x%x\n",
+ info->flags & ~XT_SOCKET_FLAGS_V3);
+ return -EINVAL;
+ }
+ return 0;
+}
+
static struct xt_match socket_mt_reg[] __read_mostly = {
{
.name = "socket",
@@ -442,7 +467,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV4,
- .match = socket_mt4_v1_v2,
+ .match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -454,7 +479,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV6,
- .match = socket_mt6_v1_v2,
+ .match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -466,7 +491,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 2,
.family = NFPROTO_IPV4,
- .match = socket_mt4_v1_v2,
+ .match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v2_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -478,7 +503,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 2,
.family = NFPROTO_IPV6,
- .match = socket_mt6_v1_v2,
+ .match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v2_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -486,6 +511,30 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.me = THIS_MODULE,
},
#endif
+ {
+ .name = "socket",
+ .revision = 3,
+ .family = NFPROTO_IPV4,
+ .match = socket_mt4_v1_v2_v3,
+ .checkentry = socket_mt_v3_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#ifdef XT_SOCKET_HAVE_IPV6
+ {
+ .name = "socket",
+ .revision = 3,
+ .family = NFPROTO_IPV6,
+ .match = socket_mt6_v1_v2_v3,
+ .checkentry = socket_mt_v3_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init socket_mt_init(void)
diff --git a/kernel/net/netlink/af_netlink.c b/kernel/net/netlink/af_netlink.c
index 980121e75..59651af8c 100644
--- a/kernel/net/netlink/af_netlink.c
+++ b/kernel/net/netlink/af_netlink.c
@@ -76,17 +76,19 @@ struct listeners {
};
/* state bits */
-#define NETLINK_CONGESTED 0x0
+#define NETLINK_S_CONGESTED 0x0
/* flags */
-#define NETLINK_KERNEL_SOCKET 0x1
-#define NETLINK_RECV_PKTINFO 0x2
-#define NETLINK_BROADCAST_SEND_ERROR 0x4
-#define NETLINK_RECV_NO_ENOBUFS 0x8
+#define NETLINK_F_KERNEL_SOCKET 0x1
+#define NETLINK_F_RECV_PKTINFO 0x2
+#define NETLINK_F_BROADCAST_SEND_ERROR 0x4
+#define NETLINK_F_RECV_NO_ENOBUFS 0x8
+#define NETLINK_F_LISTEN_ALL_NSID 0x10
+#define NETLINK_F_CAP_ACK 0x20
static inline int netlink_is_kernel(struct sock *sk)
{
- return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET;
+ return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET;
}
struct netlink_table *nl_table __read_mostly;
@@ -175,7 +177,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt)
out:
spin_unlock(&netlink_tap_lock);
- if (found && nt->module)
+ if (found)
module_put(nt->module);
return found ? 0 : -ENODEV;
@@ -278,8 +280,9 @@ static void netlink_overrun(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
- if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
- if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
+ if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) {
+ if (!test_and_set_bit(NETLINK_S_CONGESTED,
+ &nlk_sk(sk)->state)) {
sk->sk_err = ENOBUFS;
sk->sk_error_report(sk);
}
@@ -292,8 +295,8 @@ static void netlink_rcv_wake(struct sock *sk)
struct netlink_sock *nlk = nlk_sk(sk);
if (skb_queue_empty(&sk->sk_receive_queue))
- clear_bit(NETLINK_CONGESTED, &nlk->state);
- if (!test_bit(NETLINK_CONGESTED, &nlk->state))
+ clear_bit(NETLINK_S_CONGESTED, &nlk->state);
+ if (!test_bit(NETLINK_S_CONGESTED, &nlk->state))
wake_up_interruptible(&nlk->wait);
}
@@ -608,16 +611,6 @@ netlink_current_frame(const struct netlink_ring *ring,
return netlink_lookup_frame(ring, ring->head, status);
}
-static struct nl_mmap_hdr *
-netlink_previous_frame(const struct netlink_ring *ring,
- enum nl_mmap_status status)
-{
- unsigned int prev;
-
- prev = ring->head ? ring->head - 1 : ring->frame_max;
- return netlink_lookup_frame(ring, prev, status);
-}
-
static void netlink_increment_head(struct netlink_ring *ring)
{
ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
@@ -625,11 +618,11 @@ static void netlink_increment_head(struct netlink_ring *ring)
static void netlink_forward_ring(struct netlink_ring *ring)
{
- unsigned int head = ring->head, pos = head;
+ unsigned int head = ring->head;
const struct nl_mmap_hdr *hdr;
do {
- hdr = __netlink_lookup_frame(ring, pos);
+ hdr = __netlink_lookup_frame(ring, ring->head);
if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
break;
if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
@@ -638,6 +631,21 @@ static void netlink_forward_ring(struct netlink_ring *ring)
} while (ring->head != head);
}
+static bool netlink_has_valid_frame(struct netlink_ring *ring)
+{
+ unsigned int head = ring->head, pos = head;
+ const struct nl_mmap_hdr *hdr;
+
+ do {
+ hdr = __netlink_lookup_frame(ring, pos);
+ if (hdr->nm_status == NL_MMAP_STATUS_VALID)
+ return true;
+ pos = pos != 0 ? pos - 1 : ring->frame_max;
+ } while (pos != head);
+
+ return false;
+}
+
static bool netlink_dump_space(struct netlink_sock *nlk)
{
struct netlink_ring *ring = &nlk->rx_ring;
@@ -683,13 +691,19 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock,
mask = datagram_poll(file, sock, wait);
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (nlk->rx_ring.pg_vec) {
- netlink_forward_ring(&nlk->rx_ring);
- if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED))
- mask |= POLLIN | POLLRDNORM;
+ /* We could already have received frames in the normal receive
+ * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
+ * so if mask contains pollin/etc already, there's no point
+ * walking the ring.
+ */
+ if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (nlk->rx_ring.pg_vec) {
+ if (netlink_has_valid_frame(&nlk->rx_ring))
+ mask |= POLLIN | POLLRDNORM;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
}
- spin_unlock_bh(&sk->sk_receive_queue.lock);
spin_lock_bh(&sk->sk_write_queue.lock);
if (nlk->tx_ring.pg_vec) {
@@ -1118,6 +1132,7 @@ static int netlink_insert(struct sock *sk, u32 portid)
if (err == -EEXIST)
err = -EADDRINUSE;
sock_put(sk);
+ goto err;
}
/* We need to ensure that the socket is hashed and visible. */
@@ -1157,14 +1172,15 @@ static struct proto netlink_proto = {
};
static int __netlink_create(struct net *net, struct socket *sock,
- struct mutex *cb_mutex, int protocol)
+ struct mutex *cb_mutex, int protocol,
+ int kern)
{
struct sock *sk;
struct netlink_sock *nlk;
sock->ops = &netlink_ops;
- sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
+ sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
if (!sk)
return -ENOMEM;
@@ -1226,7 +1242,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
if (err < 0)
goto out;
- err = __netlink_create(net, sock, cb_mutex, protocol);
+ err = __netlink_create(net, sock, cb_mutex, protocol, kern);
if (err < 0)
goto out_module;
@@ -1336,20 +1352,24 @@ static int netlink_autobind(struct socket *sock)
struct netlink_table *table = &nl_table[sk->sk_protocol];
s32 portid = task_tgid_vnr(current);
int err;
- static s32 rover = -4097;
+ s32 rover = -4096;
+ bool ok;
retry:
cond_resched();
rcu_read_lock();
- if (__netlink_lookup(table, portid, net)) {
+ ok = !__netlink_lookup(table, portid, net);
+ rcu_read_unlock();
+ if (!ok) {
/* Bind collision, search negative portid values. */
- portid = rover--;
- if (rover > -4097)
+ if (rover == -4096)
+ /* rover will be in range [S32_MIN, -4097] */
+ rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN);
+ else if (rover >= -4096)
rover = -4097;
- rcu_read_unlock();
+ portid = rover--;
goto retry;
}
- rcu_read_unlock();
err = netlink_insert(sk, portid);
if (err == -EADDRINUSE)
@@ -1708,7 +1728,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
nlk = nlk_sk(sk);
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- test_bit(NETLINK_CONGESTED, &nlk->state)) &&
+ test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
!netlink_skb_is_mmaped(skb)) {
DECLARE_WAITQUEUE(wait, current);
if (!*timeo) {
@@ -1723,7 +1743,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
add_wait_queue(&nlk->wait, &wait);
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- test_bit(NETLINK_CONGESTED, &nlk->state)) &&
+ test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
!sock_flag(sk, SOCK_DEAD))
*timeo = schedule_timeout(*timeo);
@@ -1856,15 +1876,16 @@ retry:
}
EXPORT_SYMBOL(netlink_unicast);
-struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
- u32 dst_portid, gfp_t gfp_mask)
+struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
+ unsigned int ldiff, u32 dst_portid,
+ gfp_t gfp_mask)
{
#ifdef CONFIG_NETLINK_MMAP
+ unsigned int maxlen, linear_size;
struct sock *sk = NULL;
struct sk_buff *skb;
struct netlink_ring *ring;
struct nl_mmap_hdr *hdr;
- unsigned int maxlen;
sk = netlink_getsockbyportid(ssk, dst_portid);
if (IS_ERR(sk))
@@ -1875,7 +1896,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
if (ring->pg_vec == NULL)
goto out_put;
- if (ring->frame_size - NL_MMAP_HDRLEN < size)
+ /* We need to account the full linear size needed as a ring
+ * slot cannot have non-linear parts.
+ */
+ linear_size = size + ldiff;
+ if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
goto out_put;
skb = alloc_skb_head(gfp_mask);
@@ -1889,13 +1914,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
/* check again under lock */
maxlen = ring->frame_size - NL_MMAP_HDRLEN;
- if (maxlen < size)
+ if (maxlen < linear_size)
goto out_free;
netlink_forward_ring(ring);
hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
if (hdr == NULL)
goto err2;
+
netlink_ring_setup_skb(skb, sk, ring, hdr);
netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
atomic_inc(&ring->pending);
@@ -1921,7 +1947,7 @@ out:
#endif
return alloc_skb(size, gfp_mask);
}
-EXPORT_SYMBOL_GPL(netlink_alloc_skb);
+EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
int netlink_has_listeners(struct sock *sk, unsigned int group)
{
@@ -1947,7 +1973,7 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
struct netlink_sock *nlk = nlk_sk(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
- !test_bit(NETLINK_CONGESTED, &nlk->state)) {
+ !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
netlink_skb_set_owner_r(skb, sk);
__netlink_sendskb(sk, skb);
return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
@@ -1983,8 +2009,17 @@ static void do_one_broadcast(struct sock *sk,
!test_bit(p->group - 1, nlk->groups))
return;
- if (!net_eq(sock_net(sk), p->net))
- return;
+ if (!net_eq(sock_net(sk), p->net)) {
+ if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
+ return;
+
+ if (!peernet_has_id(sock_net(sk), p->net))
+ return;
+
+ if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
+ CAP_NET_BROADCAST))
+ return;
+ }
if (p->failure) {
netlink_overrun(sk);
@@ -2008,23 +2043,33 @@ static void do_one_broadcast(struct sock *sk,
netlink_overrun(sk);
/* Clone failed. Notify ALL listeners. */
p->failure = 1;
- if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
p->delivery_failure = 1;
- } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
+ goto out;
+ }
+ if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
kfree_skb(p->skb2);
p->skb2 = NULL;
- } else if (sk_filter(sk, p->skb2)) {
+ goto out;
+ }
+ if (sk_filter(sk, p->skb2)) {
kfree_skb(p->skb2);
p->skb2 = NULL;
- } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
+ goto out;
+ }
+ NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
+ NETLINK_CB(p->skb2).nsid_is_set = true;
+ val = netlink_broadcast_deliver(sk, p->skb2);
+ if (val < 0) {
netlink_overrun(sk);
- if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
p->delivery_failure = 1;
} else {
p->congested |= val;
p->delivered = 1;
p->skb2 = NULL;
}
+out:
sock_put(sk);
}
@@ -2071,7 +2116,7 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid
consume_skb(info.skb2);
if (info.delivered) {
- if (info.congested && (allocation & __GFP_WAIT))
+ if (info.congested && gfpflags_allow_blocking(allocation))
yield();
return 0;
}
@@ -2109,7 +2154,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
!test_bit(p->group - 1, nlk->groups))
goto out;
- if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) {
+ if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) {
ret = 1;
goto out;
}
@@ -2128,7 +2173,7 @@ out:
* @code: error code, must be negative (as usual in kernelspace)
*
* This function returns the number of broadcast listeners that have set the
- * NETLINK_RECV_NO_ENOBUFS socket option.
+ * NETLINK_NO_ENOBUFS socket option.
*/
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{
@@ -2188,9 +2233,9 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
switch (optname) {
case NETLINK_PKTINFO:
if (val)
- nlk->flags |= NETLINK_RECV_PKTINFO;
+ nlk->flags |= NETLINK_F_RECV_PKTINFO;
else
- nlk->flags &= ~NETLINK_RECV_PKTINFO;
+ nlk->flags &= ~NETLINK_F_RECV_PKTINFO;
err = 0;
break;
case NETLINK_ADD_MEMBERSHIP:
@@ -2219,18 +2264,18 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
}
case NETLINK_BROADCAST_ERROR:
if (val)
- nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
+ nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR;
else
- nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
+ nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR;
err = 0;
break;
case NETLINK_NO_ENOBUFS:
if (val) {
- nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
- clear_bit(NETLINK_CONGESTED, &nlk->state);
+ nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS;
+ clear_bit(NETLINK_S_CONGESTED, &nlk->state);
wake_up_interruptible(&nlk->wait);
} else {
- nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
+ nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS;
}
err = 0;
break;
@@ -2253,6 +2298,23 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
break;
}
#endif /* CONFIG_NETLINK_MMAP */
+ case NETLINK_LISTEN_ALL_NSID:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
+ return -EPERM;
+
+ if (val)
+ nlk->flags |= NETLINK_F_LISTEN_ALL_NSID;
+ else
+ nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID;
+ err = 0;
+ break;
+ case NETLINK_CAP_ACK:
+ if (val)
+ nlk->flags |= NETLINK_F_CAP_ACK;
+ else
+ nlk->flags &= ~NETLINK_F_CAP_ACK;
+ err = 0;
+ break;
default:
err = -ENOPROTOOPT;
}
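The two new cases above expose NETLINK_LISTEN_ALL_NSID and NETLINK_CAP_ACK to userspace. A hedged userspace sketch of enabling both on a route socket, assuming uapi headers new enough to define the two constants; the SOL_NETLINK fallback covers older libcs, and error handling is kept minimal:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270		/* from <linux/socket.h>, for older libcs */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Trim error acks to the bare nlmsgerr header instead of echoing
	 * the whole original request back. */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_CAP_ACK, &one, sizeof(one)))
		perror("NETLINK_CAP_ACK");

	/* Also receive broadcasts originating in peer network namespaces;
	 * the kernel checks CAP_NET_BROADCAST, so this fails for
	 * unprivileged callers. */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID,
		       &one, sizeof(one)))
		perror("NETLINK_LISTEN_ALL_NSID");

	return 0;
}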
@@ -2279,7 +2341,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
if (len < sizeof(int))
return -EINVAL;
len = sizeof(int);
- val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
+ val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0;
if (put_user(len, optlen) ||
put_user(val, optval))
return -EFAULT;
@@ -2289,7 +2351,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
if (len < sizeof(int))
return -EINVAL;
len = sizeof(int);
- val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
+ val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0;
if (put_user(len, optlen) ||
put_user(val, optval))
return -EFAULT;
@@ -2299,7 +2361,39 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
if (len < sizeof(int))
return -EINVAL;
len = sizeof(int);
- val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0;
+ val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0;
+ if (put_user(len, optlen) ||
+ put_user(val, optval))
+ return -EFAULT;
+ err = 0;
+ break;
+ case NETLINK_LIST_MEMBERSHIPS: {
+ int pos, idx, shift;
+
+ err = 0;
+ netlink_lock_table();
+ for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
+ if (len - pos < sizeof(u32))
+ break;
+
+ idx = pos / sizeof(unsigned long);
+ shift = (pos % sizeof(unsigned long)) * 8;
+ if (put_user((u32)(nlk->groups[idx] >> shift),
+ (u32 __user *)(optval + pos))) {
+ err = -EFAULT;
+ break;
+ }
+ }
+ if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen))
+ err = -EFAULT;
+ netlink_unlock_table();
+ break;
+ }
+ case NETLINK_CAP_ACK:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0;
if (put_user(len, optlen) ||
put_user(val, optval))
return -EFAULT;
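NETLINK_LIST_MEMBERSHIPS above lets a process read back the multicast groups its netlink socket has joined. A hedged userspace sketch of the usual two-pass call (size probe, then read), assuming the constant is present in the uapi header; group numbering is 1-based, so bit i of the bitmap means group i + 1:

#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270		/* from <linux/socket.h>, for older libcs */
#endif

static void dump_memberships(int fd)
{
	unsigned int *groups;
	socklen_t len = 0;
	unsigned int i;

	/* First call with a zero-length buffer: the kernel only reports
	 * the required size back through optlen. */
	if (getsockopt(fd, SOL_NETLINK, NETLINK_LIST_MEMBERSHIPS, NULL, &len))
		return;

	groups = calloc(1, len);
	if (!groups)
		return;

	if (!getsockopt(fd, SOL_NETLINK, NETLINK_LIST_MEMBERSHIPS,
			groups, &len))
		for (i = 0; i < len * 8; i++)
			if (groups[i / 32] & (1U << (i % 32)))
				printf("joined group %u\n", i + 1);

	free(groups);
}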
@@ -2319,6 +2413,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}
+static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
+{
+ if (!NETLINK_CB(skb).nsid_is_set)
+ return;
+
+ put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int),
+ &NETLINK_CB(skb).nsid);
+}
+
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
@@ -2367,7 +2471,7 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
* sendmsg(), but that's what we've got...
*/
if (netlink_tx_is_mmaped(sk) &&
- msg->msg_iter.type == ITER_IOVEC &&
+ iter_is_iovec(&msg->msg_iter) &&
msg->msg_iter.nr_segs == 1 &&
msg->msg_iter.iov->iov_base == NULL) {
err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
@@ -2473,8 +2577,10 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
msg->msg_namelen = sizeof(*addr);
}
- if (nlk->flags & NETLINK_RECV_PKTINFO)
+ if (nlk->flags & NETLINK_F_RECV_PKTINFO)
netlink_cmsg_recv_pktinfo(msg, skb);
+ if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
+ netlink_cmsg_listen_all_nsid(sk, msg, skb);
memset(&scm, 0, sizeof(scm));
scm.creds = *NETLINK_CREDS(skb);
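With NETLINK_F_LISTEN_ALL_NSID set, each queued skb carries the peer namespace id and netlink_cmsg_listen_all_nsid() above delivers it as ancillary data on recvmsg(). A hedged userspace sketch of picking that control message out of the receive path, assuming 4.2+ uapi headers:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif

static ssize_t recv_with_nsid(int fd, void *buf, size_t len)
{
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	ssize_t n = recvmsg(fd, &msg, 0);

	if (n < 0)
		return n;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_NETLINK &&
		    cmsg->cmsg_type == NETLINK_LISTEN_ALL_NSID) {
			int nsid;

			memcpy(&nsid, CMSG_DATA(cmsg), sizeof(nsid));
			printf("sender lives in peer netns id %d\n", nsid);
		}
	}
	return n;
}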
@@ -2528,17 +2634,10 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module,
if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
return NULL;
- /*
- * We have to just have a reference on the net from sk, but don't
- * get_net it. Besides, we cannot get and then put the net here.
- * So we create one inside init_net and the move it to net.
- */
-
- if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)
+ if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0)
goto out_sock_release_nosk;
sk = sock->sk;
- sk_change_net(sk, net);
if (!cfg || cfg->groups < 32)
groups = 32;
@@ -2557,7 +2656,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module,
goto out_sock_release;
nlk = nlk_sk(sk);
- nlk->flags |= NETLINK_KERNEL_SOCKET;
+ nlk->flags |= NETLINK_F_KERNEL_SOCKET;
netlink_table_grab();
if (!nl_table[unit].registered) {
@@ -2594,7 +2693,10 @@ EXPORT_SYMBOL(__netlink_kernel_create);
void
netlink_kernel_release(struct sock *sk)
{
- sk_release_kernel(sk);
+ if (sk == NULL || sk->sk_socket == NULL)
+ return;
+
+ sock_release(sk->sk_socket);
}
EXPORT_SYMBOL(netlink_kernel_release);
@@ -2683,6 +2785,7 @@ static int netlink_dump(struct sock *sk)
struct sk_buff *skb = NULL;
struct nlmsghdr *nlh;
int len, err = -ENOBUFS;
+ int alloc_min_size;
int alloc_size;
mutex_lock(nlk->cb_mutex);
@@ -2691,9 +2794,6 @@ static int netlink_dump(struct sock *sk)
goto errout_skb;
}
- cb = &nlk->cb;
- alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
-
if (!netlink_rx_is_mmaped(sk) &&
atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
goto errout_skb;
@@ -2703,23 +2803,35 @@ static int netlink_dump(struct sock *sk)
* to reduce number of system calls on dump operations, if user
* ever provided a big enough buffer.
*/
- if (alloc_size < nlk->max_recvmsg_len) {
- skb = netlink_alloc_skb(sk,
- nlk->max_recvmsg_len,
- nlk->portid,
+ cb = &nlk->cb;
+ alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
+
+ if (alloc_min_size < nlk->max_recvmsg_len) {
+ alloc_size = nlk->max_recvmsg_len;
+ skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
GFP_KERNEL |
__GFP_NOWARN |
__GFP_NORETRY);
- /* available room should be exact amount to avoid MSG_TRUNC */
- if (skb)
- skb_reserve(skb, skb_tailroom(skb) -
- nlk->max_recvmsg_len);
}
- if (!skb)
+ if (!skb) {
+ alloc_size = alloc_min_size;
skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
GFP_KERNEL);
+ }
if (!skb)
goto errout_skb;
+
+ /* Trim skb to allocated size. User is expected to provide buffer as
+	 * large as max(min_dump_alloc, 16KiB (max_recvmsg_len capped at
+ * netlink_recvmsg())). dump will pack as many smaller messages as
+ * could fit within the allocated skb. skb is typically allocated
+ * with larger space than required (could be as much as near 2x the
+ * requested size with align to next power of 2 approach). Allowing
+ * dump to use the excess space makes it difficult for a user to have a
+ * reasonable static buffer based on the expected largest dump of a
+ * single netdev. The outcome is MSG_TRUNC error.
+ */
+ skb_reserve(skb, skb_tailroom(skb) - alloc_size);
netlink_skb_set_owner_r(skb, sk);
len = cb->dump(skb, cb);
@@ -2841,9 +2953,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
struct nlmsghdr *rep;
struct nlmsgerr *errmsg;
size_t payload = sizeof(*errmsg);
+ struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk);
- /* error messages get the original request appened */
- if (err)
+	/* Error messages get the original request appended, unless the user
+ * requests to cap the error message.
+ */
+ if (!(nlk->flags & NETLINK_F_CAP_ACK) && err)
payload += nlmsg_len(nlh);
skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
@@ -2866,7 +2981,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
NLMSG_ERROR, payload, 0);
errmsg = nlmsg_data(rep);
errmsg->error = err;
- memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh));
+ memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh));
netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
}
EXPORT_SYMBOL(netlink_ack);
diff --git a/kernel/net/netlink/genetlink.c b/kernel/net/netlink/genetlink.c
index 2ed5f9647..bc0e504f3 100644
--- a/kernel/net/netlink/genetlink.c
+++ b/kernel/net/netlink/genetlink.c
@@ -39,7 +39,7 @@ void genl_unlock(void)
EXPORT_SYMBOL(genl_unlock);
#ifdef CONFIG_LOCKDEP
-int lockdep_genl_is_held(void)
+bool lockdep_genl_is_held(void)
{
return lockdep_is_held(&genl_mutex);
}
@@ -1136,19 +1136,19 @@ int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb,
}
EXPORT_SYMBOL(genlmsg_multicast_allns);
-void genl_notify(struct genl_family *family,
- struct sk_buff *skb, struct net *net, u32 portid, u32 group,
- struct nlmsghdr *nlh, gfp_t flags)
+void genl_notify(struct genl_family *family, struct sk_buff *skb,
+ struct genl_info *info, u32 group, gfp_t flags)
{
+ struct net *net = genl_info_net(info);
struct sock *sk = net->genl_sock;
int report = 0;
- if (nlh)
- report = nlmsg_report(nlh);
+ if (info->nlhdr)
+ report = nlmsg_report(info->nlhdr);
if (WARN_ON_ONCE(group >= family->n_mcgrps))
return;
group = family->mcgrp_offset + group;
- nlmsg_notify(sk, skb, portid, group, report, flags);
+ nlmsg_notify(sk, skb, info->snd_portid, group, report, flags);
}
EXPORT_SYMBOL(genl_notify);
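Callers of genl_notify() now hand over the genl_info of the request instead of separate net/portid/nlh arguments. A hedged sketch of a doit handler using the new signature; example_family, EXAMPLE_CMD_NOTIFY and the empty message are illustrative, and the family is assumed to have at least one multicast group registered:

#include <net/genetlink.h>

static struct genl_family example_family;	/* registered elsewhere */
#define EXAMPLE_CMD_NOTIFY	1		/* illustrative command id */

static int example_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *hdr;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
			  &example_family, 0, EXAMPLE_CMD_NOTIFY);
	if (!hdr) {
		nlmsg_free(msg);
		return -EMSGSIZE;
	}
	genlmsg_end(msg, hdr);

	/* netns, portid and nlmsghdr are all derived from info now;
	 * 0 is the group index within example_family's mcgrps[]. */
	genl_notify(&example_family, msg, info, 0, GFP_KERNEL);
	return 0;
}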
diff --git a/kernel/net/netrom/af_netrom.c b/kernel/net/netrom/af_netrom.c
index b987fd56c..ed212ffc1 100644
--- a/kernel/net/netrom/af_netrom.c
+++ b/kernel/net/netrom/af_netrom.c
@@ -433,7 +433,7 @@ static int nr_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_SEQPACKET || protocol != 0)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto);
+ sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern);
if (sk == NULL)
return -ENOMEM;
@@ -476,7 +476,7 @@ static struct sock *nr_make_new(struct sock *osk)
if (osk->sk_type != SOCK_SEQPACKET)
return NULL;
- sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot);
+ sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0);
if (sk == NULL)
return NULL;
diff --git a/kernel/net/netrom/nr_route.c b/kernel/net/netrom/nr_route.c
index 96b64d2f6..d72a4f155 100644
--- a/kernel/net/netrom/nr_route.c
+++ b/kernel/net/netrom/nr_route.c
@@ -31,7 +31,6 @@
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
-#include <linux/netfilter.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <net/netrom.h>
diff --git a/kernel/net/nfc/af_nfc.c b/kernel/net/nfc/af_nfc.c
index 2277276f5..54e40fa47 100644
--- a/kernel/net/nfc/af_nfc.c
+++ b/kernel/net/nfc/af_nfc.c
@@ -40,7 +40,7 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto,
read_lock(&proto_tab_lock);
if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) {
- rc = proto_tab[proto]->create(net, sock, proto_tab[proto]);
+ rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern);
module_put(proto_tab[proto]->owner);
}
read_unlock(&proto_tab_lock);
diff --git a/kernel/net/nfc/core.c b/kernel/net/nfc/core.c
index cff3f1614..1fe3d3b36 100644
--- a/kernel/net/nfc/core.c
+++ b/kernel/net/nfc/core.c
@@ -449,7 +449,7 @@ error:
* @dev: The nfc device that found the target
* @target_idx: index of the target that must be deactivated
*/
-int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx)
+int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode)
{
int rc = 0;
@@ -476,7 +476,7 @@ int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx)
if (dev->ops->check_presence)
del_timer_sync(&dev->check_pres_timer);
- dev->ops->deactivate_target(dev, dev->active_target);
+ dev->ops->deactivate_target(dev, dev->active_target, mode);
dev->active_target = NULL;
error:
diff --git a/kernel/net/nfc/digital_core.c b/kernel/net/nfc/digital_core.c
index 009bcf317..23c2a118a 100644
--- a/kernel/net/nfc/digital_core.c
+++ b/kernel/net/nfc/digital_core.c
@@ -631,7 +631,8 @@ static int digital_activate_target(struct nfc_dev *nfc_dev,
}
static void digital_deactivate_target(struct nfc_dev *nfc_dev,
- struct nfc_target *target)
+ struct nfc_target *target,
+ u8 mode)
{
struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
diff --git a/kernel/net/nfc/hci/core.c b/kernel/net/nfc/hci/core.c
index 6e061da22..2b0f0ac49 100644
--- a/kernel/net/nfc/hci/core.c
+++ b/kernel/net/nfc/hci/core.c
@@ -678,7 +678,8 @@ static int hci_activate_target(struct nfc_dev *nfc_dev,
}
static void hci_deactivate_target(struct nfc_dev *nfc_dev,
- struct nfc_target *target)
+ struct nfc_target *target,
+ u8 mode)
{
}
diff --git a/kernel/net/nfc/hci/llc.c b/kernel/net/nfc/hci/llc.c
index 1b90c0531..1399a03fa 100644
--- a/kernel/net/nfc/hci/llc.c
+++ b/kernel/net/nfc/hci/llc.c
@@ -144,11 +144,13 @@ inline int nfc_llc_start(struct nfc_llc *llc)
{
return llc->ops->start(llc);
}
+EXPORT_SYMBOL(nfc_llc_start);
inline int nfc_llc_stop(struct nfc_llc *llc)
{
return llc->ops->stop(llc);
}
+EXPORT_SYMBOL(nfc_llc_stop);
inline void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb)
{
diff --git a/kernel/net/nfc/llcp.h b/kernel/net/nfc/llcp.h
index de1789e3c..1f68724d4 100644
--- a/kernel/net/nfc/llcp.h
+++ b/kernel/net/nfc/llcp.h
@@ -225,7 +225,7 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local,
struct sk_buff *skb, u8 direction);
/* Sock API */
-struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp);
+struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern);
void nfc_llcp_sock_free(struct nfc_llcp_sock *sock);
void nfc_llcp_accept_unlink(struct sock *sk);
void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk);
diff --git a/kernel/net/nfc/llcp_core.c b/kernel/net/nfc/llcp_core.c
index b18f07ccb..98876274a 100644
--- a/kernel/net/nfc/llcp_core.c
+++ b/kernel/net/nfc/llcp_core.c
@@ -934,7 +934,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
sock->ssap = ssap;
}
- new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC);
+ new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0);
if (new_sk == NULL) {
reason = LLCP_DM_REJ;
release_sock(&sock->sk);
diff --git a/kernel/net/nfc/llcp_sock.c b/kernel/net/nfc/llcp_sock.c
index 9578bd6a4..ecf0a0196 100644
--- a/kernel/net/nfc/llcp_sock.c
+++ b/kernel/net/nfc/llcp_sock.c
@@ -572,7 +572,7 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock,
if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED)
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
pr_debug("mask 0x%x\n", mask);
@@ -942,12 +942,12 @@ static void llcp_sock_destruct(struct sock *sk)
}
}
-struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp)
+struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern)
{
struct sock *sk;
struct nfc_llcp_sock *llcp_sock;
- sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto);
+ sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern);
if (!sk)
return NULL;
@@ -993,7 +993,7 @@ void nfc_llcp_sock_free(struct nfc_llcp_sock *sock)
}
static int llcp_sock_create(struct net *net, struct socket *sock,
- const struct nfc_protocol *nfc_proto)
+ const struct nfc_protocol *nfc_proto, int kern)
{
struct sock *sk;
@@ -1009,7 +1009,7 @@ static int llcp_sock_create(struct net *net, struct socket *sock,
else
sock->ops = &llcp_sock_ops;
- sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC);
+ sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern);
if (sk == NULL)
return -ENOMEM;
diff --git a/kernel/net/nfc/nci/Kconfig b/kernel/net/nfc/nci/Kconfig
index a4f1e42e3..85d4819ab 100644
--- a/kernel/net/nfc/nci/Kconfig
+++ b/kernel/net/nfc/nci/Kconfig
@@ -12,10 +12,17 @@ config NFC_NCI
config NFC_NCI_SPI
depends on NFC_NCI && SPI
select CRC_CCITT
- bool "NCI over SPI protocol support"
+ tristate "NCI over SPI protocol support"
default n
help
NCI (NFC Controller Interface) is a communication protocol between
an NFC Controller (NFCC) and a Device Host (DH).
Say yes if you use an NCI driver that requires SPI link layer.
+
+config NFC_NCI_UART
+ depends on NFC_NCI && TTY
+ tristate "NCI over UART protocol support"
+ default n
+ help
+ Say yes if you use an NCI driver that requires UART link layer.
diff --git a/kernel/net/nfc/nci/Makefile b/kernel/net/nfc/nci/Makefile
index 7ed894926..0ca31d9bf 100644
--- a/kernel/net/nfc/nci/Makefile
+++ b/kernel/net/nfc/nci/Makefile
@@ -6,4 +6,8 @@ obj-$(CONFIG_NFC_NCI) += nci.o
nci-objs := core.o data.o lib.o ntf.o rsp.o hci.o
-nci-$(CONFIG_NFC_NCI_SPI) += spi.o
+nci_spi-y += spi.o
+obj-$(CONFIG_NFC_NCI_SPI) += nci_spi.o
+
+nci_uart-y += uart.o
+obj-$(CONFIG_NFC_NCI_UART) += nci_uart.o
diff --git a/kernel/net/nfc/nci/core.c b/kernel/net/nfc/nci/core.c
index 49ff32106..10c99a578 100644
--- a/kernel/net/nfc/nci/core.c
+++ b/kernel/net/nfc/nci/core.c
@@ -28,6 +28,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__
#include <linux/module.h>
+#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/completion.h>
@@ -63,6 +64,19 @@ struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev,
return NULL;
}
+int nci_get_conn_info_by_id(struct nci_dev *ndev, u8 id)
+{
+ struct nci_conn_info *conn_info;
+
+ list_for_each_entry(conn_info, &ndev->conn_info_list, list) {
+ if (conn_info->id == id)
+ return conn_info->conn_id;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(nci_get_conn_info_by_id);
+
/* ---- NCI requests ---- */
void nci_req_complete(struct nci_dev *ndev, int result)
@@ -73,6 +87,7 @@ void nci_req_complete(struct nci_dev *ndev, int result)
complete(&ndev->req_completion);
}
}
+EXPORT_SYMBOL(nci_req_complete);
static void nci_req_cancel(struct nci_dev *ndev, int err)
{
@@ -323,6 +338,60 @@ static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt)
sizeof(struct nci_rf_deactivate_cmd), &cmd);
}
+struct nci_cmd_param {
+ __u16 opcode;
+ size_t len;
+ __u8 *payload;
+};
+
+static void nci_generic_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_cmd_param *param =
+ (struct nci_cmd_param *)opt;
+
+ nci_send_cmd(ndev, param->opcode, param->len, param->payload);
+}
+
+int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload)
+{
+ struct nci_cmd_param param;
+
+ param.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY, oid);
+ param.len = len;
+ param.payload = payload;
+
+ return __nci_request(ndev, nci_generic_req, (unsigned long)&param,
+ msecs_to_jiffies(NCI_CMD_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_prop_cmd);
+
+int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload)
+{
+ struct nci_cmd_param param;
+
+ param.opcode = opcode;
+ param.len = len;
+ param.payload = payload;
+
+ return __nci_request(ndev, nci_generic_req, (unsigned long)&param,
+ msecs_to_jiffies(NCI_CMD_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_core_cmd);
+
+int nci_core_reset(struct nci_dev *ndev)
+{
+ return __nci_request(ndev, nci_reset_req, 0,
+ msecs_to_jiffies(NCI_RESET_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_core_reset);
+
+int nci_core_init(struct nci_dev *ndev)
+{
+ return __nci_request(ndev, nci_init_req, 0,
+ msecs_to_jiffies(NCI_INIT_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_core_init);
+
static int nci_open_device(struct nci_dev *ndev)
{
int rc = 0;
@@ -343,17 +412,26 @@ static int nci_open_device(struct nci_dev *ndev)
set_bit(NCI_INIT, &ndev->flags);
- rc = __nci_request(ndev, nci_reset_req, 0,
- msecs_to_jiffies(NCI_RESET_TIMEOUT));
+ if (ndev->ops->init)
+ rc = ndev->ops->init(ndev);
+
+ if (!rc) {
+ rc = __nci_request(ndev, nci_reset_req, 0,
+ msecs_to_jiffies(NCI_RESET_TIMEOUT));
+ }
- if (ndev->ops->setup)
- ndev->ops->setup(ndev);
+ if (!rc && ndev->ops->setup) {
+ rc = ndev->ops->setup(ndev);
+ }
if (!rc) {
rc = __nci_request(ndev, nci_init_req, 0,
msecs_to_jiffies(NCI_INIT_TIMEOUT));
}
+ if (!rc && ndev->ops->post_setup)
+ rc = ndev->ops->post_setup(ndev);
+
if (!rc) {
rc = __nci_request(ndev, nci_init_complete_req, 0,
msecs_to_jiffies(NCI_INIT_TIMEOUT));
@@ -407,6 +485,12 @@ static int nci_close_device(struct nci_dev *ndev)
set_bit(NCI_INIT, &ndev->flags);
__nci_request(ndev, nci_reset_req, 0,
msecs_to_jiffies(NCI_RESET_TIMEOUT));
+
+ /* After this point our queues are empty
+ * and no works are scheduled.
+ */
+ ndev->ops->close(ndev);
+
clear_bit(NCI_INIT, &ndev->flags);
del_timer_sync(&ndev->cmd_timer);
@@ -414,10 +498,6 @@ static int nci_close_device(struct nci_dev *ndev)
/* Flush cmd wq */
flush_workqueue(ndev->cmd_wq);
- /* After this point our queues are empty
- * and no works are scheduled. */
- ndev->ops->close(ndev);
-
/* Clear flags */
ndev->flags = 0;
@@ -486,7 +566,7 @@ static void nci_nfcee_discover_req(struct nci_dev *ndev, unsigned long opt)
int nci_nfcee_discover(struct nci_dev *ndev, u8 action)
{
- return nci_request(ndev, nci_nfcee_discover_req, action,
+ return __nci_request(ndev, nci_nfcee_discover_req, action,
msecs_to_jiffies(NCI_CMD_TIMEOUT));
}
EXPORT_SYMBOL(nci_nfcee_discover);
@@ -507,8 +587,9 @@ int nci_nfcee_mode_set(struct nci_dev *ndev, u8 nfcee_id, u8 nfcee_mode)
cmd.nfcee_id = nfcee_id;
cmd.nfcee_mode = nfcee_mode;
- return nci_request(ndev, nci_nfcee_mode_set_req, (unsigned long)&cmd,
- msecs_to_jiffies(NCI_CMD_TIMEOUT));
+ return __nci_request(ndev, nci_nfcee_mode_set_req,
+ (unsigned long)&cmd,
+ msecs_to_jiffies(NCI_CMD_TIMEOUT));
}
EXPORT_SYMBOL(nci_nfcee_mode_set);
@@ -534,12 +615,19 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type,
if (!cmd)
return -ENOMEM;
+ if (!number_destination_params)
+ return -EINVAL;
+
cmd->destination_type = destination_type;
cmd->number_destination_params = number_destination_params;
memcpy(cmd->params, params, params_len);
data.cmd = cmd;
- ndev->cur_id = params->value[DEST_SPEC_PARAMS_ID_INDEX];
+
+ if (params->length > 0)
+ ndev->cur_id = params->value[DEST_SPEC_PARAMS_ID_INDEX];
+ else
+ ndev->cur_id = 0;
r = __nci_request(ndev, nci_core_conn_create_req,
(unsigned long)&data,
@@ -558,8 +646,8 @@ static void nci_core_conn_close_req(struct nci_dev *ndev, unsigned long opt)
int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id)
{
- return nci_request(ndev, nci_core_conn_close_req, conn_id,
- msecs_to_jiffies(NCI_CMD_TIMEOUT));
+ return __nci_request(ndev, nci_core_conn_close_req, conn_id,
+ msecs_to_jiffies(NCI_CMD_TIMEOUT));
}
EXPORT_SYMBOL(nci_core_conn_close);
@@ -747,9 +835,11 @@ static int nci_activate_target(struct nfc_dev *nfc_dev,
}
static void nci_deactivate_target(struct nfc_dev *nfc_dev,
- struct nfc_target *target)
+ struct nfc_target *target,
+ __u8 mode)
{
struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+ u8 nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE;
pr_debug("entry\n");
@@ -760,9 +850,14 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev,
ndev->target_active_prot = 0;
+ switch (mode) {
+ case NFC_TARGET_MODE_SLEEP:
+ nci_mode = NCI_DEACTIVATE_TYPE_SLEEP_MODE;
+ break;
+ }
+
if (atomic_read(&ndev->state) == NCI_POLL_ACTIVE) {
- nci_request(ndev, nci_rf_deactivate_req,
- NCI_DEACTIVATE_TYPE_SLEEP_MODE,
+ nci_request(ndev, nci_rf_deactivate_req, nci_mode,
msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
}
}
@@ -796,7 +891,7 @@ static int nci_dep_link_down(struct nfc_dev *nfc_dev)
pr_debug("entry\n");
if (nfc_dev->rf_mode == NFC_RF_INITIATOR) {
- nci_deactivate_target(nfc_dev, NULL);
+ nci_deactivate_target(nfc_dev, NULL, NCI_DEACTIVATE_TYPE_IDLE_MODE);
} else {
if (atomic_read(&ndev->state) == NCI_LISTEN_ACTIVE ||
atomic_read(&ndev->state) == NCI_DISCOVERY) {
@@ -961,6 +1056,14 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops,
return NULL;
ndev->ops = ops;
+
+ if (ops->n_prop_ops > NCI_MAX_PROPRIETARY_CMD) {
+ pr_err("Too many proprietary commands: %zd\n",
+ ops->n_prop_ops);
+ ops->prop_ops = NULL;
+ ops->n_prop_ops = 0;
+ }
+
ndev->tx_headroom = tx_headroom;
ndev->tx_tailroom = tx_tailroom;
init_completion(&ndev->req_completion);
@@ -1115,7 +1218,7 @@ int nci_recv_frame(struct nci_dev *ndev, struct sk_buff *skb)
}
EXPORT_SYMBOL(nci_recv_frame);
-static int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb)
+int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb)
{
pr_debug("len %d\n", skb->len);
@@ -1133,6 +1236,7 @@ static int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb)
return ndev->ops->send(ndev, skb);
}
+EXPORT_SYMBOL(nci_send_frame);
/* Send NCI command */
int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload)
@@ -1164,6 +1268,81 @@ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload)
return 0;
}
+EXPORT_SYMBOL(nci_send_cmd);
+
+/* Proprietary commands API */
+static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops,
+ size_t n_ops,
+ __u16 opcode)
+{
+ size_t i;
+ struct nci_driver_ops *op;
+
+ if (!ops || !n_ops)
+ return NULL;
+
+ for (i = 0; i < n_ops; i++) {
+ op = &ops[i];
+ if (op->opcode == opcode)
+ return op;
+ }
+
+ return NULL;
+}
+
+static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode,
+ struct sk_buff *skb, struct nci_driver_ops *ops,
+ size_t n_ops)
+{
+ struct nci_driver_ops *op;
+
+ op = ops_cmd_lookup(ops, n_ops, rsp_opcode);
+ if (!op || !op->rsp)
+ return -ENOTSUPP;
+
+ return op->rsp(ndev, skb);
+}
+
+static int nci_op_ntf_packet(struct nci_dev *ndev, __u16 ntf_opcode,
+ struct sk_buff *skb, struct nci_driver_ops *ops,
+ size_t n_ops)
+{
+ struct nci_driver_ops *op;
+
+ op = ops_cmd_lookup(ops, n_ops, ntf_opcode);
+ if (!op || !op->ntf)
+ return -ENOTSUPP;
+
+ return op->ntf(ndev, skb);
+}
+
+int nci_prop_rsp_packet(struct nci_dev *ndev, __u16 opcode,
+ struct sk_buff *skb)
+{
+ return nci_op_rsp_packet(ndev, opcode, skb, ndev->ops->prop_ops,
+ ndev->ops->n_prop_ops);
+}
+
+int nci_prop_ntf_packet(struct nci_dev *ndev, __u16 opcode,
+ struct sk_buff *skb)
+{
+ return nci_op_ntf_packet(ndev, opcode, skb, ndev->ops->prop_ops,
+ ndev->ops->n_prop_ops);
+}
+
+int nci_core_rsp_packet(struct nci_dev *ndev, __u16 opcode,
+ struct sk_buff *skb)
+{
+ return nci_op_rsp_packet(ndev, opcode, skb, ndev->ops->core_ops,
+ ndev->ops->n_core_ops);
+}
+
+int nci_core_ntf_packet(struct nci_dev *ndev, __u16 opcode,
+ struct sk_buff *skb)
+{
+ return nci_op_ntf_packet(ndev, opcode, skb, ndev->ops->core_ops,
+ ndev->ops->n_core_ops);
+}
/* ---- NCI TX Data worker thread ---- */
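[Illustrative sketch only, not part of the patch; the OID 0x20 and all acme_* names are invented. A driver could hook into the init callback and proprietary-command dispatch added to core.c above roughly as follows:]

static int acme_prop_rsp(struct nci_dev *ndev, struct sk_buff *skb)
{
	/* Parse the vendor response, then release the pending request. */
	nci_req_complete(ndev, NCI_STATUS_OK);
	return 0;
}

static struct nci_driver_ops acme_prop_ops[] = {
	{
		.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY, 0x20),
		.rsp    = acme_prop_rsp,
	},
};

static int acme_init(struct nci_dev *ndev)
{
	__u8 cfg[] = { 0x01 };

	/* Runs from nci_open_device() before CORE_RESET/CORE_INIT. */
	return nci_prop_cmd(ndev, 0x20, sizeof(cfg), cfg);
}

static struct nci_ops acme_nci_ops = {
	/* .open/.close/.send as for any NCI driver, plus: */
	.init       = acme_init,
	.prop_ops   = acme_prop_ops,
	.n_prop_ops = ARRAY_SIZE(acme_prop_ops),
};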
diff --git a/kernel/net/nfc/nci/data.c b/kernel/net/nfc/nci/data.c
index 566466d90..dbd242544 100644
--- a/kernel/net/nfc/nci/data.c
+++ b/kernel/net/nfc/nci/data.c
@@ -90,6 +90,18 @@ static inline void nci_push_data_hdr(struct nci_dev *ndev,
nci_pbf_set((__u8 *)hdr, pbf);
}
+int nci_conn_max_data_pkt_payload_size(struct nci_dev *ndev, __u8 conn_id)
+{
+ struct nci_conn_info *conn_info;
+
+ conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id);
+ if (!conn_info)
+ return -EPROTO;
+
+ return conn_info->max_pkt_payload_len;
+}
+EXPORT_SYMBOL(nci_conn_max_data_pkt_payload_size);
+
static int nci_queue_tx_data_frags(struct nci_dev *ndev,
__u8 conn_id,
struct sk_buff *skb) {
@@ -203,6 +215,7 @@ free_exit:
exit:
return rc;
}
+EXPORT_SYMBOL(nci_send_data);
/* ----------------- NCI RX Data ----------------- */
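[Illustrative use of the two symbols exported from data.c above; the conn_id and calling context are hypothetical:]

static int acme_send(struct nci_dev *ndev, u8 conn_id, struct sk_buff *skb)
{
	int max_len = nci_conn_max_data_pkt_payload_size(ndev, conn_id);

	if (max_len < 0)	/* -EPROTO: unknown conn_id */
		return max_len;

	/* nci_send_data() queues the frame on the NCI TX path and
	 * fragments it internally when it exceeds max_len.
	 */
	return nci_send_data(ndev, conn_id, skb);
}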
diff --git a/kernel/net/nfc/nci/hci.c b/kernel/net/nfc/nci/hci.c
index b33fed6d1..2aedac15c 100644
--- a/kernel/net/nfc/nci/hci.c
+++ b/kernel/net/nfc/nci/hci.c
@@ -70,6 +70,7 @@ struct nci_hcp_packet {
#define NCI_HCI_ANY_SET_PARAMETER 0x01
#define NCI_HCI_ANY_GET_PARAMETER 0x02
#define NCI_HCI_ANY_CLOSE_PIPE 0x04
+#define NCI_HCI_ADM_CLEAR_ALL_PIPE 0x14
#define NCI_HFP_NO_CHAINING 0x80
@@ -78,6 +79,8 @@ struct nci_hcp_packet {
#define NCI_EVT_HOT_PLUG 0x03
#define NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY 0x01
+#define NCI_HCI_ADM_CREATE_PIPE 0x10
+#define NCI_HCI_ADM_DELETE_PIPE 0x11
/* HCP headers */
#define NCI_HCI_HCP_PACKET_HEADER_LEN 1
@@ -101,6 +104,20 @@ struct nci_hcp_packet {
#define NCI_HCP_MSG_GET_CMD(header) (header & 0x3f)
#define NCI_HCP_MSG_GET_PIPE(header) (header & 0x7f)
+static int nci_hci_result_to_errno(u8 result)
+{
+ switch (result) {
+ case NCI_HCI_ANY_OK:
+ return 0;
+ case NCI_HCI_ANY_E_REG_PAR_UNKNOWN:
+ return -EOPNOTSUPP;
+ case NCI_HCI_ANY_E_TIMEOUT:
+ return -ETIME;
+ default:
+ return -1;
+ }
+}
+
/* HCI core */
static void nci_hci_reset_pipes(struct nci_hci_dev *hdev)
{
@@ -146,18 +163,18 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe,
if (!conn_info)
return -EPROTO;
- skb = nci_skb_alloc(ndev, 2 + conn_info->max_pkt_payload_len +
+ i = 0;
+ skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len +
NCI_DATA_HDR_SIZE, GFP_KERNEL);
if (!skb)
return -ENOMEM;
- skb_reserve(skb, 2 + NCI_DATA_HDR_SIZE);
+ skb_reserve(skb, NCI_DATA_HDR_SIZE + 2);
*skb_push(skb, 1) = data_type;
- i = 0;
- len = conn_info->max_pkt_payload_len;
-
do {
+ len = conn_info->max_pkt_payload_len;
+
/* If last packet add NCI_HFP_NO_CHAINING */
if (i + conn_info->max_pkt_payload_len -
(skb->len + 1) >= data_len) {
@@ -177,9 +194,15 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe,
return r;
i += len;
+
if (i < data_len) {
- skb_trim(skb, 0);
- skb_pull(skb, len);
+ skb = nci_skb_alloc(ndev,
+ conn_info->max_pkt_payload_len +
+ NCI_DATA_HDR_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_reserve(skb, NCI_DATA_HDR_SIZE + 1);
}
} while (i < data_len);
@@ -212,7 +235,8 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd,
const u8 *param, size_t param_len,
struct sk_buff **skb)
{
- struct nci_conn_info *conn_info;
+ struct nci_hcp_message *message;
+ struct nci_conn_info *conn_info;
struct nci_data data;
int r;
u8 pipe = ndev->hci_dev->gate2pipe[gate];
@@ -232,14 +256,34 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd,
r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data,
msecs_to_jiffies(NCI_DATA_TIMEOUT));
-
- if (r == NCI_STATUS_OK && skb)
- *skb = conn_info->rx_skb;
+ if (r == NCI_STATUS_OK) {
+ message = (struct nci_hcp_message *)conn_info->rx_skb->data;
+ r = nci_hci_result_to_errno(
+ NCI_HCP_MSG_GET_CMD(message->header));
+ skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN);
+
+ if (!r && skb)
+ *skb = conn_info->rx_skb;
+ }
return r;
}
EXPORT_SYMBOL(nci_hci_send_cmd);
+int nci_hci_clear_all_pipes(struct nci_dev *ndev)
+{
+ int r;
+
+ r = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE,
+ NCI_HCI_ADM_CLEAR_ALL_PIPE, NULL, 0, NULL);
+ if (r < 0)
+ return r;
+
+ nci_hci_reset_pipes(ndev->hci_dev);
+ return r;
+}
+EXPORT_SYMBOL(nci_hci_clear_all_pipes);
+
static void nci_hci_event_received(struct nci_dev *ndev, u8 pipe,
u8 event, struct sk_buff *skb)
{
@@ -328,9 +372,6 @@ static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe,
struct nci_conn_info *conn_info;
u8 status = result;
- if (result != NCI_HCI_ANY_OK)
- goto exit;
-
conn_info = ndev->hci_dev->conn_info;
if (!conn_info) {
status = NCI_STATUS_REJECTED;
@@ -340,7 +381,7 @@ static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe,
conn_info->rx_skb = skb;
exit:
- nci_req_complete(ndev, status);
+ nci_req_complete(ndev, NCI_STATUS_OK);
}
/* Receive hcp message for pipe, with type and cmd.
@@ -366,7 +407,7 @@ static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe,
break;
}
- nci_req_complete(ndev, 0);
+ nci_req_complete(ndev, NCI_STATUS_OK);
}
static void nci_hci_msg_rx_work(struct work_struct *work)
@@ -378,7 +419,7 @@ static void nci_hci_msg_rx_work(struct work_struct *work)
u8 pipe, type, instruction;
while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) {
- pipe = skb->data[0];
+ pipe = NCI_HCP_MSG_GET_PIPE(skb->data[0]);
skb_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN);
message = (struct nci_hcp_message *)skb->data;
type = NCI_HCP_MSG_GET_TYPE(message->header);
@@ -395,7 +436,7 @@ void nci_hci_data_received_cb(void *context,
{
struct nci_dev *ndev = (struct nci_dev *)context;
struct nci_hcp_packet *packet;
- u8 pipe, type, instruction;
+ u8 pipe, type;
struct sk_buff *hcp_skb;
struct sk_buff *frag_skb;
int msg_len;
@@ -415,7 +456,7 @@ void nci_hci_data_received_cb(void *context,
/* it's the last fragment. Does it need re-aggregation? */
if (skb_queue_len(&ndev->hci_dev->rx_hcp_frags)) {
- pipe = packet->header & NCI_HCI_FRAGMENT;
+ pipe = NCI_HCP_MSG_GET_PIPE(packet->header);
skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb);
msg_len = 0;
@@ -434,7 +475,7 @@ void nci_hci_data_received_cb(void *context,
*skb_put(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN) = pipe;
skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) {
- msg_len = frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN;
+ msg_len = frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN;
memcpy(skb_put(hcp_skb, msg_len), frag_skb->data +
NCI_HCI_HCP_PACKET_HEADER_LEN, msg_len);
}
@@ -452,11 +493,10 @@ void nci_hci_data_received_cb(void *context,
packet = (struct nci_hcp_packet *)hcp_skb->data;
type = NCI_HCP_MSG_GET_TYPE(packet->message.header);
if (type == NCI_HCI_HCP_RESPONSE) {
- pipe = packet->header;
- instruction = NCI_HCP_MSG_GET_CMD(packet->message.header);
- skb_pull(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN +
- NCI_HCI_HCP_MESSAGE_HEADER_LEN);
- nci_hci_hcp_message_rx(ndev, pipe, type, instruction, hcp_skb);
+ pipe = NCI_HCP_MSG_GET_PIPE(packet->header);
+ skb_pull(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN);
+ nci_hci_hcp_message_rx(ndev, pipe, type,
+ NCI_STATUS_OK, hcp_skb);
} else {
skb_queue_tail(&ndev->hci_dev->msg_rx_queue, hcp_skb);
schedule_work(&ndev->hci_dev->msg_rx_work);
@@ -485,9 +525,47 @@ int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe)
}
EXPORT_SYMBOL(nci_hci_open_pipe);
+static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host,
+ u8 dest_gate, int *result)
+{
+ u8 pipe;
+ struct sk_buff *skb;
+ struct nci_hci_create_pipe_params params;
+ struct nci_hci_create_pipe_resp *resp;
+
+ pr_debug("gate=%d\n", dest_gate);
+
+ params.src_gate = NCI_HCI_ADMIN_GATE;
+ params.dest_host = dest_host;
+ params.dest_gate = dest_gate;
+
+ *result = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE,
+ NCI_HCI_ADM_CREATE_PIPE,
+ (u8 *)&params, sizeof(params), &skb);
+ if (*result < 0)
+ return NCI_HCI_INVALID_PIPE;
+
+ resp = (struct nci_hci_create_pipe_resp *)skb->data;
+ pipe = resp->pipe;
+ kfree_skb(skb);
+
+ pr_debug("pipe created=%d\n", pipe);
+
+ return pipe;
+}
+
+static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe)
+{
+ pr_debug("\n");
+
+ return nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE,
+ NCI_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL);
+}
+
int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx,
const u8 *param, size_t param_len)
{
+ struct nci_hcp_message *message;
struct nci_conn_info *conn_info;
struct nci_data data;
int r;
@@ -520,6 +598,12 @@ int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx,
r = nci_request(ndev, nci_hci_send_data_req,
(unsigned long)&data,
msecs_to_jiffies(NCI_DATA_TIMEOUT));
+ if (r == NCI_STATUS_OK) {
+ message = (struct nci_hcp_message *)conn_info->rx_skb->data;
+ r = nci_hci_result_to_errno(
+ NCI_HCP_MSG_GET_CMD(message->header));
+ skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN);
+ }
kfree(tmp);
return r;
@@ -529,6 +613,7 @@ EXPORT_SYMBOL(nci_hci_set_param);
int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx,
struct sk_buff **skb)
{
+ struct nci_hcp_message *message;
struct nci_conn_info *conn_info;
struct nci_data data;
int r;
@@ -553,8 +638,15 @@ int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx,
r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data,
msecs_to_jiffies(NCI_DATA_TIMEOUT));
- if (r == NCI_STATUS_OK)
- *skb = conn_info->rx_skb;
+ if (r == NCI_STATUS_OK) {
+ message = (struct nci_hcp_message *)conn_info->rx_skb->data;
+ r = nci_hci_result_to_errno(
+ NCI_HCP_MSG_GET_CMD(message->header));
+ skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN);
+
+ if (!r && skb)
+ *skb = conn_info->rx_skb;
+ }
return r;
}
@@ -563,6 +655,7 @@ EXPORT_SYMBOL(nci_hci_get_param);
int nci_hci_connect_gate(struct nci_dev *ndev,
u8 dest_host, u8 dest_gate, u8 pipe)
{
+ bool pipe_created = false;
int r;
if (pipe == NCI_HCI_DO_NOT_OPEN_PIPE)
@@ -581,12 +674,26 @@ int nci_hci_connect_gate(struct nci_dev *ndev,
case NCI_HCI_ADMIN_GATE:
pipe = NCI_HCI_ADMIN_PIPE;
break;
+ default:
+ pipe = nci_hci_create_pipe(ndev, dest_host, dest_gate, &r);
+ if (pipe < 0)
+ return r;
+ pipe_created = true;
+ break;
}
open_pipe:
r = nci_hci_open_pipe(ndev, pipe);
- if (r < 0)
+ if (r < 0) {
+ if (pipe_created) {
+ if (nci_hci_delete_pipe(ndev, pipe) < 0) {
+ /* TODO: Cannot clean by deleting pipe...
+ * -> inconsistent state
+ */
+ }
+ }
return r;
+ }
ndev->hci_dev->pipes[pipe].gate = dest_gate;
ndev->hci_dev->pipes[pipe].host = dest_host;
@@ -639,23 +746,24 @@ int nci_hci_dev_session_init(struct nci_dev *ndev)
ndev->hci_dev->init_data.gates[0].gate,
ndev->hci_dev->init_data.gates[0].pipe);
if (r < 0)
- goto exit;
+ return r;
r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE,
NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb);
if (r < 0)
- goto exit;
+ return r;
if (skb->len &&
skb->len == strlen(ndev->hci_dev->init_data.session_id) &&
- memcmp(ndev->hci_dev->init_data.session_id,
- skb->data, skb->len) == 0 &&
+ !memcmp(ndev->hci_dev->init_data.session_id, skb->data, skb->len) &&
ndev->ops->hci_load_session) {
/* Restore gate<->pipe table from some proprietary location. */
r = ndev->ops->hci_load_session(ndev);
+ } else {
+ r = nci_hci_clear_all_pipes(ndev);
if (r < 0)
goto exit;
- } else {
+
r = nci_hci_dev_connect_gates(ndev,
ndev->hci_dev->init_data.gate_count,
ndev->hci_dev->init_data.gates);
@@ -667,8 +775,6 @@ int nci_hci_dev_session_init(struct nci_dev *ndev)
ndev->hci_dev->init_data.session_id,
strlen(ndev->hci_dev->init_data.session_id));
}
- if (r == 0)
- goto exit;
exit:
kfree_skb(skb);
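[Hedged sketch of the caller-visible effect of the HCI changes above: the send helpers now return a negative errno derived from the HCP result byte, and a stale session can be dropped with nci_hci_clear_all_pipes(). The wrapper function is hypothetical; the gate/parameter constants are the ones already used in this file:]

static int acme_hci_check_session(struct nci_dev *ndev)
{
	struct sk_buff *skb;
	int r;

	r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE,
			      NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb);
	if (r == -EOPNOTSUPP || r == -ETIME)
		/* translated from ANY_E_REG_PAR_UNKNOWN / ANY_E_TIMEOUT */
		return nci_hci_clear_all_pipes(ndev);
	if (r < 0)
		return r;

	/* ... inspect skb->data ... */
	kfree_skb(skb);
	return 0;
}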
diff --git a/kernel/net/nfc/nci/ntf.c b/kernel/net/nfc/nci/ntf.c
index 321807107..2ada2b39e 100644
--- a/kernel/net/nfc/nci/ntf.c
+++ b/kernel/net/nfc/nci/ntf.c
@@ -758,6 +758,15 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb)
/* strip the nci control header */
skb_pull(skb, NCI_CTRL_HDR_SIZE);
+ if (nci_opcode_gid(ntf_opcode) == NCI_GID_PROPRIETARY) {
+ if (nci_prop_ntf_packet(ndev, ntf_opcode, skb) == -ENOTSUPP) {
+ pr_err("unsupported ntf opcode 0x%x\n",
+ ntf_opcode);
+ }
+
+ goto end;
+ }
+
switch (ntf_opcode) {
case NCI_OP_CORE_CONN_CREDITS_NTF:
nci_core_conn_credits_ntf_packet(ndev, skb);
@@ -796,5 +805,7 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb)
break;
}
+ nci_core_ntf_packet(ndev, ntf_opcode, skb);
+end:
kfree_skb(skb);
}
diff --git a/kernel/net/nfc/nci/rsp.c b/kernel/net/nfc/nci/rsp.c
index 02486bc2c..9b6eb913d 100644
--- a/kernel/net/nfc/nci/rsp.c
+++ b/kernel/net/nfc/nci/rsp.c
@@ -296,6 +296,15 @@ void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
/* strip the nci control header */
skb_pull(skb, NCI_CTRL_HDR_SIZE);
+ if (nci_opcode_gid(rsp_opcode) == NCI_GID_PROPRIETARY) {
+ if (nci_prop_rsp_packet(ndev, rsp_opcode, skb) == -ENOTSUPP) {
+ pr_err("unsupported rsp opcode 0x%x\n",
+ rsp_opcode);
+ }
+
+ goto end;
+ }
+
switch (rsp_opcode) {
case NCI_OP_CORE_RESET_RSP:
nci_core_reset_rsp_packet(ndev, skb);
@@ -346,6 +355,8 @@ void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
break;
}
+ nci_core_rsp_packet(ndev, rsp_opcode, skb);
+end:
kfree_skb(skb);
/* trigger the next cmd */
diff --git a/kernel/net/nfc/nci/spi.c b/kernel/net/nfc/nci/spi.c
index ec250e777..d904cd2f1 100644
--- a/kernel/net/nfc/nci/spi.c
+++ b/kernel/net/nfc/nci/spi.c
@@ -18,6 +18,8 @@
#define pr_fmt(fmt) "nci_spi: %s: " fmt, __func__
+#include <linux/module.h>
+
#include <linux/export.h>
#include <linux/spi/spi.h>
#include <linux/crc-ccitt.h>
@@ -56,6 +58,7 @@ static int __nci_spi_send(struct nci_spi *nspi, struct sk_buff *skb,
}
t.cs_change = cs_change;
t.delay_usecs = nspi->xfer_udelay;
+ t.speed_hz = nspi->xfer_speed_hz;
spi_message_init(&m);
spi_message_add_tail(&t, &m);
@@ -142,7 +145,8 @@ struct nci_spi *nci_spi_allocate_spi(struct spi_device *spi,
nspi->acknowledge_mode = acknowledge_mode;
nspi->xfer_udelay = delay;
-
+ /* Use controller max SPI speed by default */
+ nspi->xfer_speed_hz = 0;
nspi->spi = spi;
nspi->ndev = ndev;
init_completion(&nspi->req_completion);
@@ -195,12 +199,14 @@ static struct sk_buff *__nci_spi_read(struct nci_spi *nspi)
tx.tx_buf = req;
tx.len = 2;
tx.cs_change = 0;
+ tx.speed_hz = nspi->xfer_speed_hz;
spi_message_add_tail(&tx, &m);
memset(&rx, 0, sizeof(struct spi_transfer));
rx.rx_buf = resp_hdr;
rx.len = 2;
rx.cs_change = 1;
+ rx.speed_hz = nspi->xfer_speed_hz;
spi_message_add_tail(&rx, &m);
ret = spi_sync(nspi->spi, &m);
@@ -224,6 +230,7 @@ static struct sk_buff *__nci_spi_read(struct nci_spi *nspi)
rx.len = rx_len;
rx.cs_change = 0;
rx.delay_usecs = nspi->xfer_udelay;
+ rx.speed_hz = nspi->xfer_speed_hz;
spi_message_add_tail(&rx, &m);
ret = spi_sync(nspi->spi, &m);
@@ -320,3 +327,5 @@ done:
return skb;
}
EXPORT_SYMBOL_GPL(nci_spi_read);
+
+MODULE_LICENSE("GPL");
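[Illustration of the new per-transfer SPI speed field; the driver context and values are hypothetical. Leaving the field at 0 keeps the controller's maximum speed, as initialised above:]

	/* inside a hypothetical SPI driver probe() */
	struct nci_spi *nspi;

	nspi = nci_spi_allocate_spi(spi, NCI_SPI_CRC_ENABLED, 50, ndev);
	if (!nspi)
		return -ENOMEM;
	nspi->xfer_speed_hz = 4000000;	/* cap transfers at 4 MHz */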
diff --git a/kernel/net/nfc/nci/uart.c b/kernel/net/nfc/nci/uart.c
new file mode 100644
index 000000000..21d887567
--- /dev/null
+++ b/kernel/net/nfc/nci/uart.c
@@ -0,0 +1,494 @@
+/*
+ * Copyright (C) 2015, Marvell International Ltd.
+ *
+ * This software file (the "File") is distributed by Marvell International
+ * Ltd. under the terms of the GNU General Public License Version 2, June 1991
+ * (the "License"). You may use, redistribute and/or modify this File in
+ * accordance with the terms and conditions of the License, a copy of which
+ * is available on the worldwide web at
+ * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
+ *
+ * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY DISCLAIMED. The License provides additional details about
+ * this warranty disclaimer.
+ */
+
+/* Inspired (hugely) by HCI LDISC implementation in Bluetooth.
+ *
+ * Copyright (C) 2000-2001 Qualcomm Incorporated
+ * Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com>
+ * Copyright (C) 2004-2005 Marcel Holtmann <marcel@holtmann.org>
+ */
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/interrupt.h>
+#include <linux/ptrace.h>
+#include <linux/poll.h>
+
+#include <linux/slab.h>
+#include <linux/tty.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/signal.h>
+#include <linux/ioctl.h>
+#include <linux/skbuff.h>
+
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+
+/* TX states */
+#define NCI_UART_SENDING 1
+#define NCI_UART_TX_WAKEUP 2
+
+static struct nci_uart *nci_uart_drivers[NCI_UART_DRIVER_MAX];
+
+static inline struct sk_buff *nci_uart_dequeue(struct nci_uart *nu)
+{
+ struct sk_buff *skb = nu->tx_skb;
+
+ if (!skb)
+ skb = skb_dequeue(&nu->tx_q);
+ else
+ nu->tx_skb = NULL;
+
+ return skb;
+}
+
+static inline int nci_uart_queue_empty(struct nci_uart *nu)
+{
+ if (nu->tx_skb)
+ return 0;
+
+ return skb_queue_empty(&nu->tx_q);
+}
+
+static int nci_uart_tx_wakeup(struct nci_uart *nu)
+{
+ if (test_and_set_bit(NCI_UART_SENDING, &nu->tx_state)) {
+ set_bit(NCI_UART_TX_WAKEUP, &nu->tx_state);
+ return 0;
+ }
+
+ schedule_work(&nu->write_work);
+
+ return 0;
+}
+
+static void nci_uart_write_work(struct work_struct *work)
+{
+ struct nci_uart *nu = container_of(work, struct nci_uart, write_work);
+ struct tty_struct *tty = nu->tty;
+ struct sk_buff *skb;
+
+restart:
+ clear_bit(NCI_UART_TX_WAKEUP, &nu->tx_state);
+
+ if (nu->ops.tx_start)
+ nu->ops.tx_start(nu);
+
+ while ((skb = nci_uart_dequeue(nu))) {
+ int len;
+
+ set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
+ len = tty->ops->write(tty, skb->data, skb->len);
+ skb_pull(skb, len);
+ if (skb->len) {
+ nu->tx_skb = skb;
+ break;
+ }
+ kfree_skb(skb);
+ }
+
+ if (test_bit(NCI_UART_TX_WAKEUP, &nu->tx_state))
+ goto restart;
+
+ if (nu->ops.tx_done && nci_uart_queue_empty(nu))
+ nu->ops.tx_done(nu);
+
+ clear_bit(NCI_UART_SENDING, &nu->tx_state);
+}
+
+static int nci_uart_set_driver(struct tty_struct *tty, unsigned int driver)
+{
+ struct nci_uart *nu = NULL;
+ int ret;
+
+ if (driver >= NCI_UART_DRIVER_MAX)
+ return -EINVAL;
+
+ if (!nci_uart_drivers[driver])
+ return -ENOENT;
+
+ nu = kzalloc(sizeof(*nu), GFP_KERNEL);
+ if (!nu)
+ return -ENOMEM;
+
+ memcpy(nu, nci_uart_drivers[driver], sizeof(struct nci_uart));
+ nu->tty = tty;
+ tty->disc_data = nu;
+ skb_queue_head_init(&nu->tx_q);
+ INIT_WORK(&nu->write_work, nci_uart_write_work);
+ spin_lock_init(&nu->rx_lock);
+
+ ret = nu->ops.open(nu);
+ if (ret) {
+ tty->disc_data = NULL;
+ kfree(nu);
+ } else if (!try_module_get(nu->owner)) {
+ nu->ops.close(nu);
+ tty->disc_data = NULL;
+ kfree(nu);
+ return -ENOENT;
+ }
+ return ret;
+}
+
+/* ------ LDISC part ------ */
+
+/* nci_uart_tty_open
+ *
+ * Called when line discipline changed to NCI_UART.
+ *
+ * Arguments:
+ * tty pointer to tty info structure
+ * Return Value:
+ * 0 if success, otherwise error code
+ */
+static int nci_uart_tty_open(struct tty_struct *tty)
+{
+ /* Error if the tty has no write op instead of leaving an exploitable
+ * hole
+ */
+ if (!tty->ops->write)
+ return -EOPNOTSUPP;
+
+ tty->disc_data = NULL;
+ tty->receive_room = 65536;
+
+ /* Flush any pending characters in the driver and line discipline. */
+
+ /* FIXME: why is this needed. Note don't use ldisc_ref here as the
+ * open path is before the ldisc is referencable.
+ */
+
+ if (tty->ldisc->ops->flush_buffer)
+ tty->ldisc->ops->flush_buffer(tty);
+ tty_driver_flush_buffer(tty);
+
+ return 0;
+}
+
+/* nci_uart_tty_close()
+ *
+ * Called when the line discipline is changed to something
+ * else, the tty is closed, or the tty detects a hangup.
+ */
+static void nci_uart_tty_close(struct tty_struct *tty)
+{
+ struct nci_uart *nu = (void *)tty->disc_data;
+
+ /* Detach from the tty */
+ tty->disc_data = NULL;
+
+ if (!nu)
+ return;
+
+ if (nu->tx_skb)
+ kfree_skb(nu->tx_skb);
+ if (nu->rx_skb)
+ kfree_skb(nu->rx_skb);
+
+ skb_queue_purge(&nu->tx_q);
+
+ nu->ops.close(nu);
+ nu->tty = NULL;
+ module_put(nu->owner);
+
+ cancel_work_sync(&nu->write_work);
+
+ kfree(nu);
+}
+
+/* nci_uart_tty_wakeup()
+ *
+ * Callback for transmit wakeup. Called when low level
+ * device driver can accept more send data.
+ *
+ * Arguments: tty pointer to associated tty instance data
+ * Return Value: None
+ */
+static void nci_uart_tty_wakeup(struct tty_struct *tty)
+{
+ struct nci_uart *nu = (void *)tty->disc_data;
+
+ if (!nu)
+ return;
+
+ clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
+
+ if (tty != nu->tty)
+ return;
+
+ nci_uart_tx_wakeup(nu);
+}
+
+/* nci_uart_tty_receive()
+ *
+ * Called by tty low level driver when receive data is
+ * available.
+ *
+ * Arguments: tty pointer to tty instance data
+ * data pointer to received data
+ * flags pointer to flags for data
+ * count count of received data in bytes
+ *
+ * Return Value: None
+ */
+static void nci_uart_tty_receive(struct tty_struct *tty, const u8 *data,
+ char *flags, int count)
+{
+ struct nci_uart *nu = (void *)tty->disc_data;
+
+ if (!nu || tty != nu->tty)
+ return;
+
+ spin_lock(&nu->rx_lock);
+ nu->ops.recv_buf(nu, (void *)data, flags, count);
+ spin_unlock(&nu->rx_lock);
+
+ tty_unthrottle(tty);
+}
+
+/* nci_uart_tty_ioctl()
+ *
+ * Process IOCTL system call for the tty device.
+ *
+ * Arguments:
+ *
+ * tty pointer to tty instance data
+ * file pointer to open file object for device
+ * cmd IOCTL command code
+ * arg argument for IOCTL call (cmd dependent)
+ *
+ * Return Value: Command dependent
+ */
+static int nci_uart_tty_ioctl(struct tty_struct *tty, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct nci_uart *nu = (void *)tty->disc_data;
+ int err = 0;
+
+ switch (cmd) {
+ case NCIUARTSETDRIVER:
+ if (!nu)
+ return nci_uart_set_driver(tty, (unsigned int)arg);
+ else
+ return -EBUSY;
+ break;
+ default:
+ err = n_tty_ioctl_helper(tty, file, cmd, arg);
+ break;
+ }
+
+ return err;
+}
+
+/* We don't provide read/write/poll interface for user space. */
+static ssize_t nci_uart_tty_read(struct tty_struct *tty, struct file *file,
+ unsigned char __user *buf, size_t nr)
+{
+ return 0;
+}
+
+static ssize_t nci_uart_tty_write(struct tty_struct *tty, struct file *file,
+ const unsigned char *data, size_t count)
+{
+ return 0;
+}
+
+static unsigned int nci_uart_tty_poll(struct tty_struct *tty,
+ struct file *filp, poll_table *wait)
+{
+ return 0;
+}
+
+static int nci_uart_send(struct nci_uart *nu, struct sk_buff *skb)
+{
+ /* Queue TX packet */
+ skb_queue_tail(&nu->tx_q, skb);
+
+ /* Try to start TX (if possible) */
+ nci_uart_tx_wakeup(nu);
+
+ return 0;
+}
+
+/* -- Default recv_buf handler --
+ *
+ * This handler supposes that NCI frames are sent over UART link without any
+ * framing. It reads the NCI header, retrieves the packet size and, once all
+ * packet bytes are received, passes the packet to the nci_uart driver for processing.
+ */
+static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data,
+ char *flags, int count)
+{
+ int chunk_len;
+
+ if (!nu->ndev) {
+ nfc_err(nu->tty->dev,
+ "receive data from tty but no NCI dev is attached yet, drop buffer\n");
+ return 0;
+ }
+
+ /* Decode all incoming data in packets
+ * and enqueue them for processing.
+ */
+ while (count > 0) {
+ /* If this is the first data of a packet, allocate a buffer */
+ if (!nu->rx_skb) {
+ nu->rx_packet_len = -1;
+ nu->rx_skb = nci_skb_alloc(nu->ndev,
+ NCI_MAX_PACKET_SIZE,
+ GFP_KERNEL);
+ if (!nu->rx_skb)
+ return -ENOMEM;
+ }
+
+ /* Eat byte after byte till full packet header is received */
+ if (nu->rx_skb->len < NCI_CTRL_HDR_SIZE) {
+ *skb_put(nu->rx_skb, 1) = *data++;
+ --count;
+ continue;
+ }
+
+ /* Header was received but packet len was not read */
+ if (nu->rx_packet_len < 0)
+ nu->rx_packet_len = NCI_CTRL_HDR_SIZE +
+ nci_plen(nu->rx_skb->data);
+
+ /* Compute how many bytes are missing and how many bytes can
+ * be consumed.
+ */
+ chunk_len = nu->rx_packet_len - nu->rx_skb->len;
+ if (count < chunk_len)
+ chunk_len = count;
+ memcpy(skb_put(nu->rx_skb, chunk_len), data, chunk_len);
+ data += chunk_len;
+ count -= chunk_len;
+
+ /* Check if the packet is fully received */
+ if (nu->rx_packet_len == nu->rx_skb->len) {
+ /* Pass RX packet to driver */
+ if (nu->ops.recv(nu, nu->rx_skb) != 0)
+ nfc_err(nu->tty->dev, "corrupted RX packet\n");
+ /* Next packet will be a new one */
+ nu->rx_skb = NULL;
+ }
+ }
+
+ return 0;
+}
+
+/* -- Default recv handler -- */
+static int nci_uart_default_recv(struct nci_uart *nu, struct sk_buff *skb)
+{
+ return nci_recv_frame(nu->ndev, skb);
+}
+
+int nci_uart_register(struct nci_uart *nu)
+{
+ if (!nu || !nu->ops.open ||
+ !nu->ops.recv || !nu->ops.close)
+ return -EINVAL;
+
+ /* Set the send callback */
+ nu->ops.send = nci_uart_send;
+
+ /* Install default handlers if not overridden */
+ if (!nu->ops.recv_buf)
+ nu->ops.recv_buf = nci_uart_default_recv_buf;
+ if (!nu->ops.recv)
+ nu->ops.recv = nci_uart_default_recv;
+
+ /* Add this driver in the driver list */
+ if (nci_uart_drivers[nu->driver]) {
+ pr_err("driver %d is already registered\n", nu->driver);
+ return -EBUSY;
+ }
+ nci_uart_drivers[nu->driver] = nu;
+
+ pr_info("NCI uart driver '%s [%d]' registered\n", nu->name, nu->driver);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nci_uart_register);
+
+void nci_uart_unregister(struct nci_uart *nu)
+{
+ pr_info("NCI uart driver '%s [%d]' unregistered\n", nu->name,
+ nu->driver);
+
+ /* Remove this driver from the driver list */
+ nci_uart_drivers[nu->driver] = NULL;
+}
+EXPORT_SYMBOL_GPL(nci_uart_unregister);
+
+void nci_uart_set_config(struct nci_uart *nu, int baudrate, int flow_ctrl)
+{
+ struct ktermios new_termios;
+
+ if (!nu->tty)
+ return;
+
+ down_read(&nu->tty->termios_rwsem);
+ new_termios = nu->tty->termios;
+ up_read(&nu->tty->termios_rwsem);
+ tty_termios_encode_baud_rate(&new_termios, baudrate, baudrate);
+
+ if (flow_ctrl)
+ new_termios.c_cflag |= CRTSCTS;
+ else
+ new_termios.c_cflag &= ~CRTSCTS;
+
+ tty_set_termios(nu->tty, &new_termios);
+}
+EXPORT_SYMBOL_GPL(nci_uart_set_config);
+
+static struct tty_ldisc_ops nci_uart_ldisc = {
+ .magic = TTY_LDISC_MAGIC,
+ .owner = THIS_MODULE,
+ .name = "n_nci",
+ .open = nci_uart_tty_open,
+ .close = nci_uart_tty_close,
+ .read = nci_uart_tty_read,
+ .write = nci_uart_tty_write,
+ .poll = nci_uart_tty_poll,
+ .receive_buf = nci_uart_tty_receive,
+ .write_wakeup = nci_uart_tty_wakeup,
+ .ioctl = nci_uart_tty_ioctl,
+};
+
+static int __init nci_uart_init(void)
+{
+ memset(nci_uart_drivers, 0, sizeof(nci_uart_drivers));
+ return tty_register_ldisc(N_NCI, &nci_uart_ldisc);
+}
+
+static void __exit nci_uart_exit(void)
+{
+ tty_unregister_ldisc(N_NCI);
+}
+
+module_init(nci_uart_init);
+module_exit(nci_uart_exit);
+
+MODULE_AUTHOR("Marvell International Ltd.");
+MODULE_DESCRIPTION("NFC NCI UART driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_LDISC(N_NCI);
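[A hypothetical line-discipline based NCI driver could register against the new UART core roughly as follows; all acme_* names are invented, and the driver slot must be one of the NCI_UART_DRIVER_* values:]

static int acme_uart_open(struct nci_uart *nu)
{
	/* Allocate and register the nci_dev here, store it in nu->ndev. */
	return 0;
}

static void acme_uart_close(struct nci_uart *nu)
{
	/* Unregister and free the nci_dev. */
}

static int acme_uart_recv(struct nci_uart *nu, struct sk_buff *skb)
{
	/* Complete frames from the default recv_buf handler land here. */
	return nci_recv_frame(nu->ndev, skb);
}

static struct nci_uart acme_uart_driver = {
	.owner	= THIS_MODULE,
	.name	= "acme_uart",
	.driver	= NCI_UART_DRIVER_MARVELL,	/* example slot */
	.ops	= {
		.open	= acme_uart_open,
		.close	= acme_uart_close,
		.recv	= acme_uart_recv,
		/* .recv_buf left NULL: nci_uart_register() installs the
		 * default NCI framing handler defined above.
		 */
	},
};

static int __init acme_uart_init(void)
{
	return nci_uart_register(&acme_uart_driver);
}

static void __exit acme_uart_exit(void)
{
	nci_uart_unregister(&acme_uart_driver);
}

module_init(acme_uart_init);
module_exit(acme_uart_exit);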
diff --git a/kernel/net/nfc/netlink.c b/kernel/net/nfc/netlink.c
index 376303671..f58c1fba1 100644
--- a/kernel/net/nfc/netlink.c
+++ b/kernel/net/nfc/netlink.c
@@ -5,6 +5,12 @@
* Lauro Ramos Venancio <lauro.venancio@openbossa.org>
* Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
*
+ * Vendor commands implementation based on net/wireless/nl80211.c
+ * which is:
+ *
+ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -57,6 +63,8 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
[NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING,
.len = NFC_FIRMWARE_NAME_MAXSIZE },
[NFC_ATTR_SE_APDU] = { .type = NLA_BINARY },
+ [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY },
+
};
static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = {
@@ -877,7 +885,7 @@ static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info)
target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]);
protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]);
- nfc_deactivate_target(dev, target_idx);
+ nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP);
rc = nfc_activate_target(dev, target_idx, protocol);
nfc_put_device(dev);
@@ -1101,10 +1109,8 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info)
idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
dev = nfc_get_device(idx);
- if (!dev) {
- rc = -ENODEV;
- goto exit;
- }
+ if (!dev)
+ return -ENODEV;
device_lock(&dev->dev);
@@ -1489,6 +1495,131 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info)
return nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx);
}
+static int nfc_genl_vendor_cmd(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ struct nfc_vendor_cmd *cmd;
+ u32 dev_idx, vid, subcmd;
+ u8 *data;
+ size_t data_len;
+ int i, err;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ !info->attrs[NFC_ATTR_VENDOR_ID] ||
+ !info->attrs[NFC_ATTR_VENDOR_SUBCMD])
+ return -EINVAL;
+
+ dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+ vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]);
+ subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]);
+
+ dev = nfc_get_device(dev_idx);
+ if (!dev || !dev->vendor_cmds || !dev->n_vendor_cmds)
+ return -ENODEV;
+
+ if (info->attrs[NFC_ATTR_VENDOR_DATA]) {
+ data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]);
+ data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]);
+ if (data_len == 0)
+ return -EINVAL;
+ } else {
+ data = NULL;
+ data_len = 0;
+ }
+
+ for (i = 0; i < dev->n_vendor_cmds; i++) {
+ cmd = &dev->vendor_cmds[i];
+
+ if (cmd->vendor_id != vid || cmd->subcmd != subcmd)
+ continue;
+
+ dev->cur_cmd_info = info;
+ err = cmd->doit(dev, data, data_len);
+ dev->cur_cmd_info = NULL;
+ return err;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+/* message building helper */
+static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
+ int flags, u8 cmd)
+{
+ /* since there is no private header just add the generic one */
+ return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd);
+}
+
+static struct sk_buff *
+__nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen,
+ u32 portid, u32 seq,
+ enum nfc_attrs attr,
+ u32 oui, u32 subcmd, gfp_t gfp)
+{
+ struct sk_buff *skb;
+ void *hdr;
+
+ skb = nlmsg_new(approxlen + 100, gfp);
+ if (!skb)
+ return NULL;
+
+ hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR);
+ if (!hdr) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd))
+ goto nla_put_failure;
+
+ ((void **)skb->cb)[0] = dev;
+ ((void **)skb->cb)[1] = hdr;
+
+ return skb;
+
+nla_put_failure:
+ kfree_skb(skb);
+ return NULL;
+}
+
+struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev,
+ enum nfc_attrs attr,
+ u32 oui, u32 subcmd,
+ int approxlen)
+{
+ if (WARN_ON(!dev->cur_cmd_info))
+ return NULL;
+
+ return __nfc_alloc_vendor_cmd_skb(dev, approxlen,
+ dev->cur_cmd_info->snd_portid,
+ dev->cur_cmd_info->snd_seq, attr,
+ oui, subcmd, GFP_KERNEL);
+}
+EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb);
+
+int nfc_vendor_cmd_reply(struct sk_buff *skb)
+{
+ struct nfc_dev *dev = ((void **)skb->cb)[0];
+ void *hdr = ((void **)skb->cb)[1];
+
+ /* clear CB data for netlink core to own from now on */
+ memset(skb->cb, 0, sizeof(skb->cb));
+
+ if (WARN_ON(!dev->cur_cmd_info)) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ genlmsg_end(skb, hdr);
+ return genlmsg_reply(skb, dev->cur_cmd_info);
+}
+EXPORT_SYMBOL(nfc_vendor_cmd_reply);
+
static const struct genl_ops nfc_genl_ops[] = {
{
.cmd = NFC_CMD_GET_DEVICE,
@@ -1579,6 +1710,11 @@ static const struct genl_ops nfc_genl_ops[] = {
.doit = nfc_genl_activate_target,
.policy = nfc_genl_policy,
},
+ {
+ .cmd = NFC_CMD_VENDOR,
+ .doit = nfc_genl_vendor_cmd,
+ .policy = nfc_genl_policy,
+ },
};
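[Sketch of a driver-side vendor command behind the new NFC_CMD_VENDOR path; the OUI, sub-command and payload are placeholders. The reply helpers are the ones exported above, and the table would be attached to the nfc_dev through the matching nfc.h helper, which is outside this hunk:]

#define ACME_VENDOR_OUI		0x123456
#define ACME_SUBCMD_GET_VER	0x01

static int acme_vendor_get_ver(struct nfc_dev *dev, void *data,
			       size_t data_len)
{
	u8 ver[2] = { 0x01, 0x00 };	/* placeholder payload */
	struct sk_buff *msg;

	msg = __nfc_alloc_vendor_cmd_reply_skb(dev, NFC_ATTR_VENDOR_DATA,
					       ACME_VENDOR_OUI,
					       ACME_SUBCMD_GET_VER,
					       sizeof(ver));
	if (!msg)
		return -ENOMEM;

	if (nla_put(msg, NFC_ATTR_VENDOR_DATA, sizeof(ver), ver)) {
		kfree_skb(msg);
		return -EMSGSIZE;
	}

	return nfc_vendor_cmd_reply(msg);
}

static struct nfc_vendor_cmd acme_vendor_cmds[] = {
	{
		.vendor_id = ACME_VENDOR_OUI,
		.subcmd    = ACME_SUBCMD_GET_VER,
		.doit      = acme_vendor_get_ver,
	},
};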
diff --git a/kernel/net/nfc/nfc.h b/kernel/net/nfc/nfc.h
index a8ce80b47..c20b784ad 100644
--- a/kernel/net/nfc/nfc.h
+++ b/kernel/net/nfc/nfc.h
@@ -25,12 +25,15 @@
#include <net/nfc/nfc.h>
#include <net/sock.h>
+#define NFC_TARGET_MODE_IDLE 0
+#define NFC_TARGET_MODE_SLEEP 1
+
struct nfc_protocol {
int id;
struct proto *proto;
struct module *owner;
int (*create)(struct net *net, struct socket *sock,
- const struct nfc_protocol *nfc_proto);
+ const struct nfc_protocol *nfc_proto, int kern);
};
struct nfc_rawsock {
@@ -147,7 +150,7 @@ int nfc_dep_link_down(struct nfc_dev *dev);
int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol);
-int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx);
+int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode);
int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb,
data_exchange_cb_t cb, void *cb_context);
diff --git a/kernel/net/nfc/rawsock.c b/kernel/net/nfc/rawsock.c
index 82b4e8024..e386e6c90 100644
--- a/kernel/net/nfc/rawsock.c
+++ b/kernel/net/nfc/rawsock.c
@@ -321,7 +321,8 @@ static void rawsock_destruct(struct sock *sk)
if (sk->sk_state == TCP_ESTABLISHED) {
nfc_deactivate_target(nfc_rawsock(sk)->dev,
- nfc_rawsock(sk)->target_idx);
+ nfc_rawsock(sk)->target_idx,
+ NFC_TARGET_MODE_IDLE);
nfc_put_device(nfc_rawsock(sk)->dev);
}
@@ -334,7 +335,7 @@ static void rawsock_destruct(struct sock *sk)
}
static int rawsock_create(struct net *net, struct socket *sock,
- const struct nfc_protocol *nfc_proto)
+ const struct nfc_protocol *nfc_proto, int kern)
{
struct sock *sk;
@@ -348,7 +349,7 @@ static int rawsock_create(struct net *net, struct socket *sock,
else
sock->ops = &rawsock_ops;
- sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto);
+ sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern);
if (!sk)
return -ENOMEM;
diff --git a/kernel/net/openvswitch/Kconfig b/kernel/net/openvswitch/Kconfig
index ed6b0f8dd..d143aa9f6 100644
--- a/kernel/net/openvswitch/Kconfig
+++ b/kernel/net/openvswitch/Kconfig
@@ -5,6 +5,8 @@
config OPENVSWITCH
tristate "Open vSwitch"
depends on INET
+ depends on !NF_CONNTRACK || \
+ (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))
select LIBCRC32C
select MPLS
select NET_MPLS_GSO
@@ -34,7 +36,7 @@ config OPENVSWITCH
config OPENVSWITCH_GRE
tristate "Open vSwitch GRE tunneling support"
depends on OPENVSWITCH
- depends on NET_IPGRE_DEMUX
+ depends on NET_IPGRE
default OPENVSWITCH
---help---
If you say Y here, then the Open vSwitch will be able to create GRE
diff --git a/kernel/net/openvswitch/Makefile b/kernel/net/openvswitch/Makefile
index 91b947841..60f809085 100644
--- a/kernel/net/openvswitch/Makefile
+++ b/kernel/net/openvswitch/Makefile
@@ -15,6 +15,10 @@ openvswitch-y := \
vport-internal_dev.o \
vport-netdev.o
+ifneq ($(CONFIG_NF_CONNTRACK),)
+openvswitch-y += conntrack.o
+endif
+
+obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
-obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o
obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o
diff --git a/kernel/net/openvswitch/actions.c b/kernel/net/openvswitch/actions.c
index b491c1c29..c88d0f2d3 100644
--- a/kernel/net/openvswitch/actions.c
+++ b/kernel/net/openvswitch/actions.c
@@ -22,6 +22,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/openvswitch.h>
+#include <linux/netfilter_ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -29,8 +30,10 @@
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
+#include <net/dst.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/ip6_fib.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/mpls.h>
@@ -38,6 +41,7 @@
#include "datapath.h"
#include "flow.h"
+#include "conntrack.h"
#include "vport.h"
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
@@ -52,6 +56,20 @@ struct deferred_action {
struct sw_flow_key pkt_key;
};
+#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN)
+struct ovs_frag_data {
+ unsigned long dst;
+ struct vport *vport;
+ struct ovs_skb_cb cb;
+ __be16 inner_protocol;
+ __u16 vlan_tci;
+ __be16 vlan_proto;
+ unsigned int l2_len;
+ u8 l2_data[MAX_L2_LEN];
+};
+
+static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
+
#define DEFERRED_ACTION_FIFO_SIZE 10
struct action_fifo {
int head;
@@ -185,10 +203,6 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
return 0;
}
-/* 'KEY' must not have any bits set outside of the 'MASK' */
-#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
-#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK))
-
static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
const __be32 *mpls_lse, const __be32 *mask)
{
@@ -201,7 +215,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
return err;
stack = (__be32 *)skb_mpls_header(skb);
- lse = MASKED(*stack, *mpls_lse, *mask);
+ lse = OVS_MASKED(*stack, *mpls_lse, *mask);
if (skb->ip_summed == CHECKSUM_COMPLETE) {
__be32 diff[] = { ~(*stack), lse };
@@ -244,9 +258,9 @@ static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_)
const u16 *src = (const u16 *)src_;
const u16 *mask = (const u16 *)mask_;
- SET_MASKED(dst[0], src[0], mask[0]);
- SET_MASKED(dst[1], src[1], mask[1]);
- SET_MASKED(dst[2], src[2], mask[2]);
+ OVS_SET_MASKED(dst[0], src[0], mask[0]);
+ OVS_SET_MASKED(dst[1], src[1], mask[1]);
+ OVS_SET_MASKED(dst[2], src[2], mask[2]);
}
static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
@@ -273,28 +287,36 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}
-static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
- __be32 *addr, __be32 new_addr)
+static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
+ __be32 addr, __be32 new_addr)
{
int transport_len = skb->len - skb_transport_offset(skb);
+ if (nh->frag_off & htons(IP_OFFSET))
+ return;
+
if (nh->protocol == IPPROTO_TCP) {
if (likely(transport_len >= sizeof(struct tcphdr)))
inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
- *addr, new_addr, 1);
+ addr, new_addr, true);
} else if (nh->protocol == IPPROTO_UDP) {
if (likely(transport_len >= sizeof(struct udphdr))) {
struct udphdr *uh = udp_hdr(skb);
if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace4(&uh->check, skb,
- *addr, new_addr, 1);
+ addr, new_addr, true);
if (!uh->check)
uh->check = CSUM_MANGLED_0;
}
}
}
+}
+static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
+ __be32 *addr, __be32 new_addr)
+{
+ update_ip_l4_checksum(skb, nh, *addr, new_addr);
csum_replace4(&nh->check, *addr, new_addr);
skb_clear_hash(skb);
*addr = new_addr;
@@ -308,14 +330,14 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
if (l4_proto == NEXTHDR_TCP) {
if (likely(transport_len >= sizeof(struct tcphdr)))
inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
- addr, new_addr, 1);
+ addr, new_addr, true);
} else if (l4_proto == NEXTHDR_UDP) {
if (likely(transport_len >= sizeof(struct udphdr))) {
struct udphdr *uh = udp_hdr(skb);
if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace16(&uh->check, skb,
- addr, new_addr, 1);
+ addr, new_addr, true);
if (!uh->check)
uh->check = CSUM_MANGLED_0;
}
@@ -323,17 +345,17 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
} else if (l4_proto == NEXTHDR_ICMP) {
if (likely(transport_len >= sizeof(struct icmp6hdr)))
inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum,
- skb, addr, new_addr, 1);
+ skb, addr, new_addr, true);
}
}
static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4],
const __be32 mask[4], __be32 masked[4])
{
- masked[0] = MASKED(old[0], addr[0], mask[0]);
- masked[1] = MASKED(old[1], addr[1], mask[1]);
- masked[2] = MASKED(old[2], addr[2], mask[2]);
- masked[3] = MASKED(old[3], addr[3], mask[3]);
+ masked[0] = OVS_MASKED(old[0], addr[0], mask[0]);
+ masked[1] = OVS_MASKED(old[1], addr[1], mask[1]);
+ masked[2] = OVS_MASKED(old[2], addr[2], mask[2]);
+ masked[3] = OVS_MASKED(old[3], addr[3], mask[3]);
}
static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
@@ -350,15 +372,15 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask)
{
/* Bits 21-24 are always unmasked, so this retains their values. */
- SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
- SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
- SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
+ OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
+ OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
+ OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
}
static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl,
u8 mask)
{
- new_ttl = MASKED(nh->ttl, new_ttl, mask);
+ new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask);
csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
nh->ttl = new_ttl;
@@ -384,7 +406,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key,
* makes sense to check if the value actually changed.
*/
if (mask->ipv4_src) {
- new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);
+ new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);
if (unlikely(new_addr != nh->saddr)) {
set_ip_addr(skb, nh, &nh->saddr, new_addr);
@@ -392,7 +414,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key,
}
}
if (mask->ipv4_dst) {
- new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);
+ new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);
if (unlikely(new_addr != nh->daddr)) {
set_ip_addr(skb, nh, &nh->daddr, new_addr);
@@ -480,7 +502,8 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
*(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
}
if (mask->ipv6_hlimit) {
- SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit);
+ OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit,
+ mask->ipv6_hlimit);
flow_key->ip.ttl = nh->hop_limit;
}
return 0;
@@ -490,7 +513,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
static void set_tp_port(struct sk_buff *skb, __be16 *port,
__be16 new_port, __sum16 *check)
{
- inet_proto_csum_replace2(check, skb, *port, new_port, 0);
+ inet_proto_csum_replace2(check, skb, *port, new_port, false);
*port = new_port;
}
@@ -509,8 +532,8 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key,
uh = udp_hdr(skb);
/* Either of the masks is non-zero, so do not bother checking them. */
- src = MASKED(uh->source, key->udp_src, mask->udp_src);
- dst = MASKED(uh->dest, key->udp_dst, mask->udp_dst);
+ src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src);
+ dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst);
if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) {
if (likely(src != uh->source)) {
@@ -550,12 +573,12 @@ static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key,
return err;
th = tcp_hdr(skb);
- src = MASKED(th->source, key->tcp_src, mask->tcp_src);
+ src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src);
if (likely(src != th->source)) {
set_tp_port(skb, &th->source, src, &th->check);
flow_key->tp.src = src;
}
- dst = MASKED(th->dest, key->tcp_dst, mask->tcp_dst);
+ dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst);
if (likely(dst != th->dest)) {
set_tp_port(skb, &th->dest, dst, &th->check);
flow_key->tp.dst = dst;
@@ -582,8 +605,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
old_csum = sh->checksum;
old_correct_csum = sctp_compute_cksum(skb, sctphoff);
- sh->source = MASKED(sh->source, key->sctp_src, mask->sctp_src);
- sh->dest = MASKED(sh->dest, key->sctp_dst, mask->sctp_dst);
+ sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src);
+ sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst);
new_csum = sctp_compute_cksum(skb, sctphoff);
@@ -597,28 +620,162 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}
-static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
+static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
+ struct vport *vport = data->vport;
+
+ if (skb_cow_head(skb, data->l2_len) < 0) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ __skb_dst_copy(skb, data->dst);
+ *OVS_CB(skb) = data->cb;
+ skb->inner_protocol = data->inner_protocol;
+ skb->vlan_tci = data->vlan_tci;
+ skb->vlan_proto = data->vlan_proto;
+
+ /* Reconstruct the MAC header. */
+ skb_push(skb, data->l2_len);
+ memcpy(skb->data, &data->l2_data, data->l2_len);
+ ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len);
+ skb_reset_mac_header(skb);
+
+ ovs_vport_send(vport, skb);
+ return 0;
+}
+
+static unsigned int
+ovs_dst_get_mtu(const struct dst_entry *dst)
+{
+ return dst->dev->mtu;
+}
+
+static struct dst_ops ovs_dst_ops = {
+ .family = AF_UNSPEC,
+ .mtu = ovs_dst_get_mtu,
+};
+
+/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is
+ * ovs_vport_output(), which is called once per fragmented packet.
+ */
+static void prepare_frag(struct vport *vport, struct sk_buff *skb)
+{
+ unsigned int hlen = skb_network_offset(skb);
+ struct ovs_frag_data *data;
+
+ data = this_cpu_ptr(&ovs_frag_data_storage);
+ data->dst = skb->_skb_refdst;
+ data->vport = vport;
+ data->cb = *OVS_CB(skb);
+ data->inner_protocol = skb->inner_protocol;
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
+ data->l2_len = hlen;
+ memcpy(&data->l2_data, skb->data, hlen);
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ skb_pull(skb, hlen);
+}
+
+static void ovs_fragment(struct net *net, struct vport *vport,
+ struct sk_buff *skb, u16 mru, __be16 ethertype)
+{
+ if (skb_network_offset(skb) > MAX_L2_LEN) {
+ OVS_NLERR(1, "L2 header too long to fragment");
+ goto err;
+ }
+
+ if (ethertype == htons(ETH_P_IP)) {
+ struct dst_entry ovs_dst;
+ unsigned long orig_dst;
+
+ prepare_frag(vport, skb);
+ dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
+ DST_OBSOLETE_NONE, DST_NOCOUNT);
+ ovs_dst.dev = vport->dev;
+
+ orig_dst = skb->_skb_refdst;
+ skb_dst_set_noref(skb, &ovs_dst);
+ IPCB(skb)->frag_max_size = mru;
+
+ ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
+ refdst_drop(orig_dst);
+ } else if (ethertype == htons(ETH_P_IPV6)) {
+ const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+ unsigned long orig_dst;
+ struct rt6_info ovs_rt;
+
+ if (!v6ops) {
+ goto err;
+ }
+
+ prepare_frag(vport, skb);
+ memset(&ovs_rt, 0, sizeof(ovs_rt));
+ dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
+ DST_OBSOLETE_NONE, DST_NOCOUNT);
+ ovs_rt.dst.dev = vport->dev;
+
+ orig_dst = skb->_skb_refdst;
+ skb_dst_set_noref(skb, &ovs_rt.dst);
+ IP6CB(skb)->frag_max_size = mru;
+
+ v6ops->fragment(net, skb->sk, skb, ovs_vport_output);
+ refdst_drop(orig_dst);
+ } else {
+ WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
+ ovs_vport_name(vport), ntohs(ethertype), mru,
+ vport->dev->mtu);
+ goto err;
+ }
+
+ return;
+err:
+ kfree_skb(skb);
+}
+
+static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
+ struct sw_flow_key *key)
{
struct vport *vport = ovs_vport_rcu(dp, out_port);
- if (likely(vport))
- ovs_vport_send(vport, skb);
- else
+ if (likely(vport)) {
+ u16 mru = OVS_CB(skb)->mru;
+
+ if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
+ ovs_vport_send(vport, skb);
+ } else if (mru <= vport->dev->mtu) {
+ struct net *net = read_pnet(&dp->net);
+ __be16 ethertype = key->eth.type;
+
+ if (!is_flow_key_valid(key)) {
+ if (eth_p_mpls(skb->protocol))
+ ethertype = skb->inner_protocol;
+ else
+ ethertype = vlan_get_protocol(skb);
+ }
+
+ ovs_fragment(net, vport, skb, mru, ethertype);
+ } else {
+ kfree_skb(skb);
+ }
+ } else {
kfree_skb(skb);
+ }
}
static int output_userspace(struct datapath *dp, struct sk_buff *skb,
- struct sw_flow_key *key, const struct nlattr *attr)
+ struct sw_flow_key *key, const struct nlattr *attr,
+ const struct nlattr *actions, int actions_len)
{
- struct ovs_tunnel_info info;
struct dp_upcall_info upcall;
const struct nlattr *a;
int rem;
+ memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_ACTION;
- upcall.userdata = NULL;
- upcall.portid = 0;
- upcall.egress_tun_info = NULL;
+ upcall.mru = OVS_CB(skb)->mru;
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
a = nla_next(a, &rem)) {
@@ -639,11 +796,18 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
if (vport) {
int err;
- err = ovs_vport_get_egress_tun_info(vport, skb,
- &info);
+ err = dev_fill_metadata_dst(vport->dev, skb);
if (!err)
- upcall.egress_tun_info = &info;
+ upcall.egress_tun_info = skb_tunnel_info(skb);
}
+
+ break;
+ }
+
+ case OVS_USERSPACE_ATTR_ACTIONS: {
+ /* Include actions. */
+ upcall.actions = actions;
+ upcall.actions_len = actions_len;
break;
}
@@ -654,7 +818,8 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
}
static int sample(struct datapath *dp, struct sk_buff *skb,
- struct sw_flow_key *key, const struct nlattr *attr)
+ struct sw_flow_key *key, const struct nlattr *attr,
+ const struct nlattr *actions, int actions_len)
{
const struct nlattr *acts_list = NULL;
const struct nlattr *a;
@@ -662,9 +827,12 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
a = nla_next(a, &rem)) {
+ u32 probability;
+
switch (nla_type(a)) {
case OVS_SAMPLE_ATTR_PROBABILITY:
- if (prandom_u32() >= nla_get_u32(a))
+ probability = nla_get_u32(a);
+ if (!probability || prandom_u32() > probability)
return 0;
break;
@@ -688,7 +856,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
*/
if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
nla_is_last(a, rem)))
- return output_userspace(dp, skb, key, a);
+ return output_userspace(dp, skb, key, a, actions, actions_len);
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb)
@@ -726,7 +894,11 @@ static int execute_set_action(struct sk_buff *skb,
{
/* Only tunnel set execution is supported without a mask. */
if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) {
- OVS_CB(skb)->egress_tun_info = nla_data(a);
+ struct ovs_tunnel_info *tun = nla_data(a);
+
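+ /* The tunnel metadata lives in a metadata_dst; take a reference and
+ * attach it as the skb's dst in place of any previous one.
+ */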
+ skb_dst_drop(skb);
+ dst_hold((struct dst_entry *)tun->tun_dst);
+ skb_dst_set(skb, (struct dst_entry *)tun->tun_dst);
return 0;
}
@@ -744,12 +916,13 @@ static int execute_masked_set_action(struct sk_buff *skb,
switch (nla_type(a)) {
case OVS_KEY_ATTR_PRIORITY:
- SET_MASKED(skb->priority, nla_get_u32(a), *get_mask(a, u32 *));
+ OVS_SET_MASKED(skb->priority, nla_get_u32(a),
+ *get_mask(a, u32 *));
flow_key->phy.priority = skb->priority;
break;
case OVS_KEY_ATTR_SKB_MARK:
- SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *));
+ OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *));
flow_key->phy.skb_mark = skb->mark;
break;
@@ -792,6 +965,13 @@ static int execute_masked_set_action(struct sk_buff *skb,
err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
__be32 *));
break;
+
+ case OVS_KEY_ATTR_CT_STATE:
+ case OVS_KEY_ATTR_CT_ZONE:
+ case OVS_KEY_ATTR_CT_MARK:
+ case OVS_KEY_ATTR_CT_LABELS:
+ err = -EINVAL;
+ break;
}
return err;
@@ -861,7 +1041,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
if (out_skb)
- do_output(dp, out_skb, prev_port);
+ do_output(dp, out_skb, prev_port, key);
prev_port = -1;
}
@@ -872,7 +1052,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
break;
case OVS_ACTION_ATTR_USERSPACE:
- output_userspace(dp, skb, key, a);
+ output_userspace(dp, skb, key, a, attr, len);
break;
case OVS_ACTION_ATTR_HASH:
@@ -916,7 +1096,22 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
break;
case OVS_ACTION_ATTR_SAMPLE:
- err = sample(dp, skb, key, a);
+ err = sample(dp, skb, key, a, attr, len);
+ break;
+
+ case OVS_ACTION_ATTR_CT:
+ if (!is_flow_key_valid(key)) {
+ err = ovs_flow_key_update(skb, key);
+ if (err)
+ return err;
+ }
+
+ err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
+ nla_data(a));
+
+ /* Hide stolen IP fragments from user space. */
+ if (err)
+ return err == -EINPROGRESS ? 0 : err;
break;
}
@@ -927,7 +1122,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
}
if (prev_port != -1)
- do_output(dp, skb, prev_port);
+ do_output(dp, skb, prev_port, key);
else
consume_skb(skb);
@@ -969,7 +1164,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
int err;
this_cpu_inc(exec_actions_level);
- OVS_CB(skb)->egress_tun_info = NULL;
err = do_execute_actions(dp, skb, key,
acts->actions, acts->actions_len);
diff --git a/kernel/net/openvswitch/conntrack.c b/kernel/net/openvswitch/conntrack.c
new file mode 100644
index 000000000..e004067ec
--- /dev/null
+++ b/kernel/net/openvswitch/conntrack.c
@@ -0,0 +1,790 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/openvswitch.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+#include "datapath.h"
+#include "conntrack.h"
+#include "flow.h"
+#include "flow_netlink.h"
+
+struct ovs_ct_len_tbl {
+ size_t maxlen;
+ size_t minlen;
+};
+
+/* Metadata mark for masked write to conntrack mark */
+struct md_mark {
+ u32 value;
+ u32 mask;
+};
+
+/* Metadata label for masked write to conntrack label. */
+struct md_labels {
+ struct ovs_key_ct_labels value;
+ struct ovs_key_ct_labels mask;
+};
+
+/* Conntrack action context for execution. */
+struct ovs_conntrack_info {
+ struct nf_conntrack_helper *helper;
+ struct nf_conntrack_zone zone;
+ struct nf_conn *ct;
+ u8 commit : 1;
+ u16 family;
+ struct md_mark mark;
+ struct md_labels labels;
+};
+
+static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
+
+static u16 key_to_nfproto(const struct sw_flow_key *key)
+{
+ switch (ntohs(key->eth.type)) {
+ case ETH_P_IP:
+ return NFPROTO_IPV4;
+ case ETH_P_IPV6:
+ return NFPROTO_IPV6;
+ default:
+ return NFPROTO_UNSPEC;
+ }
+}
+
+/* Map SKB connection state into the values used by flow definition. */
+static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
+{
+ u8 ct_state = OVS_CS_F_TRACKED;
+
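+ /* The first switch derives the direction (reply) bit, the second the
+ * connection state bits, from the conntrack info value.
+ */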
+ switch (ctinfo) {
+ case IP_CT_ESTABLISHED_REPLY:
+ case IP_CT_RELATED_REPLY:
+ case IP_CT_NEW_REPLY:
+ ct_state |= OVS_CS_F_REPLY_DIR;
+ break;
+ default:
+ break;
+ }
+
+ switch (ctinfo) {
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ ct_state |= OVS_CS_F_ESTABLISHED;
+ break;
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ ct_state |= OVS_CS_F_RELATED;
+ break;
+ case IP_CT_NEW:
+ case IP_CT_NEW_REPLY:
+ ct_state |= OVS_CS_F_NEW;
+ break;
+ default:
+ break;
+ }
+
+ return ct_state;
+}
+
+static u32 ovs_ct_get_mark(const struct nf_conn *ct)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ return ct ? ct->mark : 0;
+#else
+ return 0;
+#endif
+}
+
+static void ovs_ct_get_labels(const struct nf_conn *ct,
+ struct ovs_key_ct_labels *labels)
+{
+ struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
+
+ if (cl) {
+ size_t len = cl->words * sizeof(long);
+
+ if (len > OVS_CT_LABELS_LEN)
+ len = OVS_CT_LABELS_LEN;
+ else if (len < OVS_CT_LABELS_LEN)
+ memset(labels, 0, OVS_CT_LABELS_LEN);
+ memcpy(labels, cl->bits, len);
+ } else {
+ memset(labels, 0, OVS_CT_LABELS_LEN);
+ }
+}
+
+static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
+ const struct nf_conntrack_zone *zone,
+ const struct nf_conn *ct)
+{
+ key->ct.state = state;
+ key->ct.zone = zone->id;
+ key->ct.mark = ovs_ct_get_mark(ct);
+ ovs_ct_get_labels(ct, &key->ct.labels);
+}
+
+/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
+ * previously sent the packet to conntrack via the ct action.
+ */
+static void ovs_ct_update_key(const struct sk_buff *skb,
+ const struct ovs_conntrack_info *info,
+ struct sw_flow_key *key, bool post_ct)
+{
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ u8 state = 0;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ state = ovs_ct_get_state(ctinfo);
+ if (!nf_ct_is_confirmed(ct))
+ state |= OVS_CS_F_NEW;
+ if (ct->master)
+ state |= OVS_CS_F_RELATED;
+ zone = nf_ct_zone(ct);
+ } else if (post_ct) {
+ state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
+ if (info)
+ zone = &info->zone;
+ }
+ __ovs_ct_update_key(key, state, zone, ct);
+}
+
+void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
+{
+ ovs_ct_update_key(skb, NULL, key, false);
+}
+
+int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
+{
+ if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
+ return -EMSGSIZE;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone))
+ return -EMSGSIZE;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark))
+ return -EMSGSIZE;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels),
+ &key->ct.labels))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
+ u32 ct_mark, u32 mask)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ u32 new_mark;
+
+ /* The connection could be invalid, in which case set_mark is a no-op. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return 0;
+
+ new_mark = ct_mark | (ct->mark & ~(mask));
+ if (ct->mark != new_mark) {
+ ct->mark = new_mark;
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ key->ct.mark = new_mark;
+ }
+
+ return 0;
+#else
+ return -ENOTSUPP;
+#endif
+}
+
+static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
+ const struct ovs_key_ct_labels *labels,
+ const struct ovs_key_ct_labels *mask)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_labels *cl;
+ struct nf_conn *ct;
+ int err;
+
+ /* The connection could be invalid, in which case set_labels is a no-op. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return 0;
+
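+ /* Make sure the connection carries a labels extension large enough to
+ * hold the OVS ct_labels before replacing them.
+ */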
+ cl = nf_ct_labels_find(ct);
+ if (!cl) {
+ nf_ct_labels_ext_add(ct);
+ cl = nf_ct_labels_find(ct);
+ }
+ if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN)
+ return -ENOSPC;
+
+ err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
+ OVS_CT_LABELS_LEN / sizeof(u32));
+ if (err)
+ return err;
+
+ ovs_ct_get_labels(ct, &key->ct.labels);
+ return 0;
+}
+
+/* 'skb' should already be pulled to nh_ofs. */
+static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
+{
+ const struct nf_conntrack_helper *helper;
+ const struct nf_conn_help *help;
+ enum ip_conntrack_info ctinfo;
+ unsigned int protoff;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ protoff = ip_hdrlen(skb);
+ break;
+ case NFPROTO_IPV6: {
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+ int ofs;
+
+ ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+ protoff = ofs;
+ break;
+ }
+ default:
+ WARN_ONCE(1, "helper invoked on non-IP family!");
+ return NF_DROP;
+ }
+
+ return helper->help(skb, protoff, ct, ctinfo);
+}
+
+/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
+ * value if 'skb' is freed.
+ */
+static int handle_fragments(struct net *net, struct sw_flow_key *key,
+ u16 zone, struct sk_buff *skb)
+{
+ struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
+
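+ /* The IPv4/IPv6 defragmentation paths below reuse skb->cb, so the OVS
+ * metadata saved in 'ovs_cb' is written back once reassembly is done.
+ */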
+ if (key->eth.type == htons(ETH_P_IP)) {
+ enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+ int err;
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ err = ip_defrag(net, skb, user);
+ if (err)
+ return err;
+
+ ovs_cb.mru = IPCB(skb)->frag_max_size;
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ } else if (key->eth.type == htons(ETH_P_IPV6)) {
+ enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+ struct sk_buff *reasm;
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+ reasm = nf_ct_frag6_gather(net, skb, user);
+ if (!reasm)
+ return -EINPROGRESS;
+
+ if (skb == reasm) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /* Don't free 'skb' even though it is one of the original
+ * fragments, as we're going to morph it into the head.
+ */
+ skb_get(skb);
+ nf_ct_frag6_consume_orig(reasm);
+
+ key->ip.proto = ipv6_hdr(reasm)->nexthdr;
+ skb_morph(skb, reasm);
+ skb->next = reasm->next;
+ consume_skb(reasm);
+ ovs_cb.mru = IP6CB(skb)->frag_max_size;
+#endif
+ } else {
+ kfree_skb(skb);
+ return -EPFNOSUPPORT;
+ }
+
+ key->ip.frag = OVS_FRAG_TYPE_NONE;
+ skb_clear_hash(skb);
+ skb->ignore_df = 1;
+ *OVS_CB(skb) = ovs_cb;
+
+ return 0;
+}
+
+static struct nf_conntrack_expect *
+ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
+ u16 proto, const struct sk_buff *skb)
+{
+ struct nf_conntrack_tuple tuple;
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
+ return NULL;
+ return __nf_ct_expect_find(net, zone, &tuple);
+}
+
+/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
+static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
+ const struct ovs_conntrack_info *info)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+ if (!net_eq(net, read_pnet(&ct->ct_net)))
+ return false;
+ if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct)))
+ return false;
+ if (info->helper) {
+ struct nf_conn_help *help;
+
+ help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
+ if (help && rcu_access_pointer(help->helper) != info->helper)
+ return false;
+ }
+
+ return true;
+}
+
+static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb)
+{
+ /* If we are recirculating packets to match on conntrack fields and
+ * committing with a separate conntrack action, then we don't need to
+ * actually run the packet through conntrack twice unless it's for a
+ * different zone.
+ */
+ if (!skb_nfct_cached(net, skb, info)) {
+ struct nf_conn *tmpl = info->ct;
+
+ /* Associate skb with specified zone. */
+ if (tmpl) {
+ if (skb->nfct)
+ nf_conntrack_put(skb->nfct);
+ nf_conntrack_get(&tmpl->ct_general);
+ skb->nfct = &tmpl->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ }
+
+ if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING,
+ skb) != NF_ACCEPT)
+ return -ENOENT;
+
+ if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+ WARN_ONCE(1, "helper rejected packet");
+ return -EINVAL;
+ }
+ }
+
+ ovs_ct_update_key(skb, info, key, true);
+
+ return 0;
+}
+
+/* Lookup connection and read fields into key. */
+static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb)
+{
+ struct nf_conntrack_expect *exp;
+
+ exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
+ if (exp) {
+ u8 state;
+
+ state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
+ __ovs_ct_update_key(key, state, &info->zone, exp->master);
+ } else {
+ int err;
+
+ err = __ovs_ct_lookup(net, key, info, skb);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* Lookup connection and confirm if unconfirmed. */
+static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb)
+{
+ u8 state;
+ int err;
+
+ state = key->ct.state;
+ if (key->ct.zone == info->zone.id &&
+ ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) {
+ /* Previous lookup has shown that this connection is already
+ * tracked and committed. Skip committing.
+ */
+ return 0;
+ }
+
+ err = __ovs_ct_lookup(net, key, info, skb);
+ if (err)
+ return err;
+ if (nf_conntrack_confirm(skb) != NF_ACCEPT)
+ return -EINVAL;
+
+ return 0;
+}
+
+static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
+{
+ size_t i;
+
+ for (i = 0; i < sizeof(*labels); i++)
+ if (labels->ct_labels[i])
+ return true;
+
+ return false;
+}
+
+/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
+ * value if 'skb' is freed.
+ */
+int ovs_ct_execute(struct net *net, struct sk_buff *skb,
+ struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info)
+{
+ int nh_ofs;
+ int err;
+
+ /* The conntrack module expects to be working at L3. */
+ nh_ofs = skb_network_offset(skb);
+ skb_pull(skb, nh_ofs);
+
+ if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
+ err = handle_fragments(net, key, info->zone.id, skb);
+ if (err)
+ return err;
+ }
+
+ if (info->commit)
+ err = ovs_ct_commit(net, key, info, skb);
+ else
+ err = ovs_ct_lookup(net, key, info, skb);
+ if (err)
+ goto err;
+
+ if (info->mark.mask) {
+ err = ovs_ct_set_mark(skb, key, info->mark.value,
+ info->mark.mask);
+ if (err)
+ goto err;
+ }
+ if (labels_nonzero(&info->labels.mask))
+ err = ovs_ct_set_labels(skb, key, &info->labels.value,
+ &info->labels.mask);
+err:
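+ /* Both the success and error paths end up here: restore skb->data to
+ * the L2 header before handing the packet back or freeing it.
+ */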
+ skb_push(skb, nh_ofs);
+ if (err)
+ kfree_skb(skb);
+ return err;
+}
+
+static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
+ const struct sw_flow_key *key, bool log)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
+
+ helper = nf_conntrack_helper_try_module_get(name, info->family,
+ key->ip.proto);
+ if (!helper) {
+ OVS_NLERR(log, "Unknown helper \"%s\"", name);
+ return -EINVAL;
+ }
+
+ help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
+ if (!help) {
+ module_put(helper->me);
+ return -ENOMEM;
+ }
+
+ rcu_assign_pointer(help->helper, helper);
+ info->helper = helper;
+ return 0;
+}
+
+static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
+ [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
+ [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
+ .maxlen = sizeof(u16) },
+ [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark),
+ .maxlen = sizeof(struct md_mark) },
+ [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels),
+ .maxlen = sizeof(struct md_labels) },
+ [OVS_CT_ATTR_HELPER] = { .minlen = 1,
+ .maxlen = NF_CT_HELPER_NAME_LEN }
+};
+
+static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
+ const char **helper, bool log)
+{
+ struct nlattr *a;
+ int rem;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ int maxlen = ovs_ct_attr_lens[type].maxlen;
+ int minlen = ovs_ct_attr_lens[type].minlen;
+
+ if (type > OVS_CT_ATTR_MAX) {
+ OVS_NLERR(log,
+ "Unknown conntrack attr (type=%d, max=%d)",
+ type, OVS_CT_ATTR_MAX);
+ return -EINVAL;
+ }
+ if (nla_len(a) < minlen || nla_len(a) > maxlen) {
+ OVS_NLERR(log,
+ "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)",
+ type, nla_len(a), maxlen);
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case OVS_CT_ATTR_COMMIT:
+ info->commit = true;
+ break;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ case OVS_CT_ATTR_ZONE:
+ info->zone.id = nla_get_u16(a);
+ break;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case OVS_CT_ATTR_MARK: {
+ struct md_mark *mark = nla_data(a);
+
+ if (!mark->mask) {
+ OVS_NLERR(log, "ct_mark mask cannot be 0");
+ return -EINVAL;
+ }
+ info->mark = *mark;
+ break;
+ }
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case OVS_CT_ATTR_LABELS: {
+ struct md_labels *labels = nla_data(a);
+
+ if (!labels_nonzero(&labels->mask)) {
+ OVS_NLERR(log, "ct_labels mask cannot be 0");
+ return -EINVAL;
+ }
+ info->labels = *labels;
+ break;
+ }
+#endif
+ case OVS_CT_ATTR_HELPER:
+ *helper = nla_data(a);
+ if (!memchr(*helper, '\0', nla_len(a))) {
+ OVS_NLERR(log, "Invalid conntrack helper");
+ return -EINVAL;
+ }
+ break;
+ default:
+ OVS_NLERR(log, "Unknown conntrack attr (%d)",
+ type);
+ return -EINVAL;
+ }
+ }
+
+ if (rem > 0) {
+ OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr)
+{
+ if (attr == OVS_KEY_ATTR_CT_STATE)
+ return true;
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ attr == OVS_KEY_ATTR_CT_ZONE)
+ return true;
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ attr == OVS_KEY_ATTR_CT_MARK)
+ return true;
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ attr == OVS_KEY_ATTR_CT_LABELS) {
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+ return ovs_net->xt_label;
+ }
+
+ return false;
+}
+
+int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ struct sw_flow_actions **sfa, bool log)
+{
+ struct ovs_conntrack_info ct_info;
+ const char *helper = NULL;
+ u16 family;
+ int err;
+
+ family = key_to_nfproto(key);
+ if (family == NFPROTO_UNSPEC) {
+ OVS_NLERR(log, "ct family unspecified");
+ return -EINVAL;
+ }
+
+ memset(&ct_info, 0, sizeof(ct_info));
+ ct_info.family = family;
+
+ nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID,
+ NF_CT_DEFAULT_ZONE_DIR, 0);
+
+ err = parse_ct(attr, &ct_info, &helper, log);
+ if (err)
+ return err;
+
+ /* Set up template for tracking connections in specific zones. */
+ ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL);
+ if (!ct_info.ct) {
+ OVS_NLERR(log, "Failed to allocate conntrack template");
+ return -ENOMEM;
+ }
+
+ __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
+ nf_conntrack_get(&ct_info.ct->ct_general);
+
+ if (helper) {
+ err = ovs_ct_add_helper(&ct_info, helper, key, log);
+ if (err)
+ goto err_free_ct;
+ }
+
+ err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info,
+ sizeof(ct_info), log);
+ if (err)
+ goto err_free_ct;
+
+ return 0;
+err_free_ct:
+ __ovs_ct_free_action(&ct_info);
+ return err;
+}
+
+int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
+ struct sk_buff *skb)
+{
+ struct nlattr *start;
+
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_CT);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT))
+ return -EMSGSIZE;
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
+ return -EMSGSIZE;
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask &&
+ nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark),
+ &ct_info->mark))
+ return -EMSGSIZE;
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ labels_nonzero(&ct_info->labels.mask) &&
+ nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels),
+ &ct_info->labels))
+ return -EMSGSIZE;
+ if (ct_info->helper) {
+ if (nla_put_string(skb, OVS_CT_ATTR_HELPER,
+ ct_info->helper->name))
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, start);
+
+ return 0;
+}
+
+void ovs_ct_free_action(const struct nlattr *a)
+{
+ struct ovs_conntrack_info *ct_info = nla_data(a);
+
+ __ovs_ct_free_action(ct_info);
+}
+
+static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
+{
+ if (ct_info->helper)
+ module_put(ct_info->helper->me);
+ if (ct_info->ct)
+ nf_ct_put(ct_info->ct);
+}
+
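+/* Ask the connlabel subsystem for enough label bits to cover the OVS
+ * ct_labels key; if that fails, label matching is disabled for this
+ * net namespace.
+ */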
+void ovs_ct_init(struct net *net)
+{
+ unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+ if (nf_connlabels_get(net, n_bits)) {
+ ovs_net->xt_label = false;
+ OVS_NLERR(true, "Failed to set connlabel length");
+ } else {
+ ovs_net->xt_label = true;
+ }
+}
+
+void ovs_ct_exit(struct net *net)
+{
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+ if (ovs_net->xt_label)
+ nf_connlabels_put(net);
+}
diff --git a/kernel/net/openvswitch/conntrack.h b/kernel/net/openvswitch/conntrack.h
new file mode 100644
index 000000000..a7544f405
--- /dev/null
+++ b/kernel/net/openvswitch/conntrack.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OVS_CONNTRACK_H
+#define OVS_CONNTRACK_H 1
+
+#include "flow.h"
+
+struct ovs_conntrack_info;
+enum ovs_key_attr;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+void ovs_ct_init(struct net *);
+void ovs_ct_exit(struct net *);
+bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
+int ovs_ct_copy_action(struct net *, const struct nlattr *,
+ const struct sw_flow_key *, struct sw_flow_actions **,
+ bool log);
+int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *);
+
+int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
+ const struct ovs_conntrack_info *);
+
+void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
+int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb);
+void ovs_ct_free_action(const struct nlattr *a);
+
+#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
+ OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
+ OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+#else
+#include <linux/errno.h>
+
+static inline void ovs_ct_init(struct net *net) { }
+
+static inline void ovs_ct_exit(struct net *net) { }
+
+static inline bool ovs_ct_verify(struct net *net, int attr)
+{
+ return false;
+}
+
+static inline int ovs_ct_copy_action(struct net *net, const struct nlattr *nla,
+ const struct sw_flow_key *key,
+ struct sw_flow_actions **acts, bool log)
+{
+ return -ENOTSUPP;
+}
+
+static inline int ovs_ct_action_to_attr(const struct ovs_conntrack_info *info,
+ struct sk_buff *skb)
+{
+ return -ENOTSUPP;
+}
+
+static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
+ struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info)
+{
+ kfree_skb(skb);
+ return -ENOTSUPP;
+}
+
+static inline void ovs_ct_fill_key(const struct sk_buff *skb,
+ struct sw_flow_key *key)
+{
+ key->ct.state = 0;
+ key->ct.zone = 0;
+ key->ct.mark = 0;
+ memset(&key->ct.labels, 0, sizeof(key->ct.labels));
+}
+
+static inline int ovs_ct_put_key(const struct sw_flow_key *key,
+ struct sk_buff *skb)
+{
+ return 0;
+}
+
+static inline void ovs_ct_free_action(const struct nlattr *a) { }
+
+#define CT_SUPPORTED_MASK 0
+#endif /* CONFIG_NF_CONNTRACK */
+#endif /* ovs_conntrack.h */
diff --git a/kernel/net/openvswitch/datapath.c b/kernel/net/openvswitch/datapath.c
index 27e14962b..deadfdab1 100644
--- a/kernel/net/openvswitch/datapath.c
+++ b/kernel/net/openvswitch/datapath.c
@@ -91,8 +91,7 @@ static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
static void ovs_notify(struct genl_family *family,
struct sk_buff *skb, struct genl_info *info)
{
- genl_notify(family, skb, genl_info_net(info), info->snd_portid,
- 0, info->nlhdr, GFP_KERNEL);
+ genl_notify(family, skb, info, 0, GFP_KERNEL);
}
/**
@@ -176,7 +175,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
const char *ovs_dp_name(const struct datapath *dp)
{
struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
- return vport->ops->get_name(vport);
+ return ovs_vport_name(vport);
}
static int get_dpifindex(const struct datapath *dp)
@@ -188,7 +187,7 @@ static int get_dpifindex(const struct datapath *dp)
local = ovs_vport_rcu(dp, OVSP_LOCAL);
if (local)
- ifindex = netdev_vport_priv(local)->dev->ifindex;
+ ifindex = local->dev->ifindex;
else
ifindex = 0;
@@ -272,10 +271,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
struct dp_upcall_info upcall;
int error;
+ memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_MISS;
- upcall.userdata = NULL;
upcall.portid = ovs_vport_find_upcall_portid(p, skb);
- upcall.egress_tun_info = NULL;
+ upcall.mru = OVS_CB(skb)->mru;
error = ovs_dp_upcall(dp, skb, key, &upcall);
if (unlikely(error))
kfree_skb(skb);
@@ -337,12 +336,10 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
unsigned short gso_type = skb_shinfo(skb)->gso_type;
struct sw_flow_key later_key;
struct sk_buff *segs, *nskb;
- struct ovs_skb_cb ovs_cb;
int err;
- ovs_cb = *OVS_CB(skb);
+ BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET);
segs = __skb_gso_segment(skb, NETIF_F_SG, false);
- *OVS_CB(skb) = ovs_cb;
if (IS_ERR(segs))
return PTR_ERR(segs);
if (segs == NULL)
@@ -360,7 +357,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
/* Queue all of the segments. */
skb = segs;
do {
- *OVS_CB(skb) = ovs_cb;
if (gso_type & SKB_GSO_UDP && skb != segs)
key = &later_key;
@@ -397,9 +393,27 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
if (upcall_info->egress_tun_info)
size += nla_total_size(ovs_tun_key_attr_size());
+ /* OVS_PACKET_ATTR_ACTIONS */
+ if (upcall_info->actions_len)
+ size += nla_total_size(upcall_info->actions_len);
+
+ /* OVS_PACKET_ATTR_MRU */
+ if (upcall_info->mru)
+ size += nla_total_size(sizeof(upcall_info->mru));
+
return size;
}
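+/* Zero-pad the Netlink message up to the next NLA_ALIGN boundary unless
+ * userspace negotiated unaligned buffers (OVS_DP_F_UNALIGNED).
+ */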
+static void pad_packet(struct datapath *dp, struct sk_buff *skb)
+{
+ if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
+ size_t plen = NLA_ALIGN(skb->len) - skb->len;
+
+ if (plen > 0)
+ memset(skb_put(skb, plen), 0, plen);
+ }
+}
+
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
const struct dp_upcall_info *upcall_info)
@@ -472,12 +486,33 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
if (upcall_info->egress_tun_info) {
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
- err = ovs_nla_put_egress_tunnel_key(user_skb,
- upcall_info->egress_tun_info);
+ err = ovs_nla_put_tunnel_info(user_skb,
+ upcall_info->egress_tun_info);
BUG_ON(err);
nla_nest_end(user_skb, nla);
}
+ if (upcall_info->actions_len) {
+ nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);
+ err = ovs_nla_put_actions(upcall_info->actions,
+ upcall_info->actions_len,
+ user_skb);
+ if (!err)
+ nla_nest_end(user_skb, nla);
+ else
+ nla_nest_cancel(user_skb, nla);
+ }
+
+ /* Add OVS_PACKET_ATTR_MRU */
+ if (upcall_info->mru) {
+ if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
+ upcall_info->mru)) {
+ err = -ENOBUFS;
+ goto out;
+ }
+ pad_packet(dp, user_skb);
+ }
+
/* Only reserve room for attribute header, packet data is added
* in skb_zerocopy() */
if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
@@ -491,12 +526,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
goto out;
/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
- if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
- size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len;
-
- if (plen > 0)
- memset(skb_put(user_skb, plen), 0, plen);
- }
+ pad_packet(dp, user_skb);
((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
@@ -513,6 +543,7 @@ out:
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
struct ovs_header *ovs_header = info->userhdr;
+ struct net *net = sock_net(skb->sk);
struct nlattr **a = info->attrs;
struct sw_flow_actions *acts;
struct sk_buff *packet;
@@ -521,6 +552,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct datapath *dp;
struct ethhdr *eth;
struct vport *input_vport;
+ u16 mru = 0;
int len;
int err;
bool log = !a[OVS_PACKET_ATTR_PROBE];
@@ -545,34 +577,40 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
/* Normally, setting the skb 'protocol' field would be handled by a
* call to eth_type_trans(), but it assumes there's a sending
* device, which we may not have. */
- if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
+ if (eth_proto_is_802_3(eth->h_proto))
packet->protocol = eth->h_proto;
else
packet->protocol = htons(ETH_P_802_2);
+ /* Set packet's mru */
+ if (a[OVS_PACKET_ATTR_MRU]) {
+ mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
+ packet->ignore_df = 1;
+ }
+ OVS_CB(packet)->mru = mru;
+
/* Build an sw_flow for sending this packet. */
flow = ovs_flow_alloc();
err = PTR_ERR(flow);
if (IS_ERR(flow))
goto err_kfree_skb;
- err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet,
- &flow->key, log);
+ err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY],
+ packet, &flow->key, log);
if (err)
goto err_flow_free;
- err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
+ err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS],
&flow->key, &acts, log);
if (err)
goto err_flow_free;
rcu_assign_pointer(flow->sf_acts, acts);
- OVS_CB(packet)->egress_tun_info = NULL;
packet->priority = flow->key.phy.priority;
packet->mark = flow->key.phy.skb_mark;
rcu_read_lock();
- dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
+ dp = get_dp_rcu(net, ovs_header->dp_ifindex);
err = -ENODEV;
if (!dp)
goto err_unlock;
@@ -584,6 +622,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
if (!input_vport)
goto err_unlock;
+ packet->dev = input_vport->dev;
OVS_CB(packet)->input_vport = input_vport;
sf_acts = rcu_dereference(flow->sf_acts);
@@ -610,6 +649,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
+ [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
};
static const struct genl_ops dp_packet_genl_ops[] = {
@@ -699,7 +739,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
/* OVS_FLOW_ATTR_ACTIONS */
if (should_fill_actions(ufid_flags))
- len += nla_total_size(acts->actions_len);
+ len += nla_total_size(acts->orig_len);
return len
+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
@@ -866,6 +906,7 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
+ struct net *net = sock_net(skb->sk);
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow *flow = NULL, *new_flow;
@@ -901,7 +942,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
/* Extract key. */
ovs_match_init(&match, &key, &mask);
- error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY],
+ error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
if (error)
goto err_kfree_flow;
@@ -915,8 +956,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
goto err_kfree_flow;
/* Validate actions. */
- error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
- &acts, log);
+ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
+ &new_flow->key, &acts, log);
if (error) {
OVS_NLERR(log, "Flow actions may not be safe on all matching packets.");
goto err_kfree_flow;
@@ -930,7 +971,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
ovs_lock();
- dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ dp = get_dp(net, ovs_header->dp_ifindex);
if (unlikely(!dp)) {
error = -ENODEV;
goto err_unlock_ovs;
@@ -1004,7 +1045,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
ovs_unlock();
- ovs_nla_free_flow_actions(old_acts);
+ ovs_nla_free_flow_actions_rcu(old_acts);
ovs_flow_free(new_flow, false);
}
@@ -1016,7 +1057,7 @@ err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
- kfree(acts);
+ ovs_nla_free_flow_actions(acts);
err_kfree_flow:
ovs_flow_free(new_flow, false);
error:
@@ -1024,7 +1065,8 @@ error:
}
/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
-static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
+static struct sw_flow_actions *get_flow_actions(struct net *net,
+ const struct nlattr *a,
const struct sw_flow_key *key,
const struct sw_flow_mask *mask,
bool log)
@@ -1034,7 +1076,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
int error;
ovs_flow_mask_key(&masked_key, key, true, mask);
- error = ovs_nla_copy_actions(a, &masked_key, &acts, log);
+ error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log);
if (error) {
OVS_NLERR(log,
"Actions may not be safe on all matching packets");
@@ -1046,6 +1088,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
+ struct net *net = sock_net(skb->sk);
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
@@ -1070,15 +1113,15 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
ovs_match_init(&match, &key, &mask);
- error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY],
+ error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
if (error)
goto error;
/* Validate actions. */
if (a[OVS_FLOW_ATTR_ACTIONS]) {
- acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask,
- log);
+ acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
+ &mask, log);
if (IS_ERR(acts)) {
error = PTR_ERR(acts);
goto error;
@@ -1094,7 +1137,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
}
ovs_lock();
- dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ dp = get_dp(net, ovs_header->dp_ifindex);
if (unlikely(!dp)) {
error = -ENODEV;
goto err_unlock_ovs;
@@ -1129,7 +1172,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
info, OVS_FLOW_CMD_NEW, false,
ufid_flags);
- if (unlikely(IS_ERR(reply))) {
+ if (IS_ERR(reply)) {
error = PTR_ERR(reply);
goto err_unlock_ovs;
}
@@ -1143,7 +1186,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
if (reply)
ovs_notify(&dp_flow_genl_family, reply, info);
if (old_acts)
- ovs_nla_free_flow_actions(old_acts);
+ ovs_nla_free_flow_actions_rcu(old_acts);
return 0;
@@ -1151,7 +1194,7 @@ err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
- kfree(acts);
+ ovs_nla_free_flow_actions(acts);
error:
return error;
}
@@ -1160,6 +1203,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
+ struct net *net = sock_net(skb->sk);
struct sw_flow_key key;
struct sk_buff *reply;
struct sw_flow *flow;
@@ -1174,7 +1218,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
ovs_match_init(&match, &key, NULL);
- err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL,
+ err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL,
log);
} else if (!ufid_present) {
OVS_NLERR(log,
@@ -1218,6 +1262,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
+ struct net *net = sock_net(skb->sk);
struct sw_flow_key key;
struct sk_buff *reply;
struct sw_flow *flow = NULL;
@@ -1232,8 +1277,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
ovs_match_init(&match, &key, NULL);
- err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL,
- log);
+ err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
+ NULL, log);
if (unlikely(err))
return err;
}
@@ -1786,7 +1831,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
nla_put_string(skb, OVS_VPORT_ATTR_NAME,
- vport->ops->get_name(vport)))
+ ovs_vport_name(vport)))
goto nla_put_failure;
ovs_vport_get_stats(vport, &vport_stats);
@@ -2189,6 +2234,7 @@ static int __net_init ovs_init_net(struct net *net)
INIT_LIST_HEAD(&ovs_net->dps);
INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
+ ovs_ct_init(net);
return 0;
}
@@ -2205,13 +2251,10 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
struct vport *vport;
hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) {
- struct netdev_vport *netdev_vport;
-
if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL)
continue;
- netdev_vport = netdev_vport_priv(vport);
- if (dev_net(netdev_vport->dev) == dnet)
+ if (dev_net(vport->dev) == dnet)
list_add(&vport->detach_list, head);
}
}
@@ -2226,6 +2269,7 @@ static void __net_exit ovs_exit_net(struct net *dnet)
struct net *net;
LIST_HEAD(head);
+ ovs_ct_exit(dnet);
ovs_lock();
list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
__dp_destroy(dp);
diff --git a/kernel/net/openvswitch/datapath.h b/kernel/net/openvswitch/datapath.h
index 4ec4a480b..67bdecd9f 100644
--- a/kernel/net/openvswitch/datapath.h
+++ b/kernel/net/openvswitch/datapath.h
@@ -25,10 +25,11 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/u64_stats_sync.h>
+#include <net/ip_tunnels.h>
+#include "conntrack.h"
#include "flow.h"
#include "flow_table.h"
-#include "vport.h"
#define DP_MAX_PORTS USHRT_MAX
#define DP_VPORT_HASH_BUCKETS 1024
@@ -92,14 +93,14 @@ struct datapath {
/**
* struct ovs_skb_cb - OVS data in skb CB
- * @egress_tun_key: Tunnel information about this packet on egress path.
- * NULL if the packet is not being tunneled.
* @input_vport: The original vport packet came in on. This value is cached
* when a packet is received by OVS.
+ * @mru: The maximum received fragment size; 0 if the packet is not
+ * fragmented.
*/
struct ovs_skb_cb {
- struct ovs_tunnel_info *egress_tun_info;
struct vport *input_vport;
+ u16 mru;
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -112,12 +113,16 @@ struct ovs_skb_cb {
* then no packet is sent and the packet is accounted in the datapath's @n_lost
* counter.
* @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY.
+ * @mru: If not zero, the maximum received IP fragment size.
*/
struct dp_upcall_info {
- const struct ovs_tunnel_info *egress_tun_info;
+ struct ip_tunnel_info *egress_tun_info;
const struct nlattr *userdata;
+ const struct nlattr *actions;
+ int actions_len;
u32 portid;
u8 cmd;
+ u16 mru;
};
/**
@@ -128,7 +133,9 @@ struct dp_upcall_info {
struct ovs_net {
struct list_head dps;
struct work_struct dp_notify_work;
- struct vport_net vport_net;
+
+ /* Module reference for configuring conntrack. */
+ bool xt_label;
};
extern int ovs_net_id;
@@ -197,6 +204,10 @@ void ovs_dp_notify_wq(struct work_struct *work);
int action_fifos_init(void);
void action_fifos_exit(void);
+/* 'KEY' must not have any bits set outside of the 'MASK' */
+#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
+#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK))
+
#define OVS_NLERR(logging_allowed, fmt, ...) \
do { \
if (logging_allowed && net_ratelimit()) \
diff --git a/kernel/net/openvswitch/dp_notify.c b/kernel/net/openvswitch/dp_notify.c
index 2c631fe76..653d073ba 100644
--- a/kernel/net/openvswitch/dp_notify.c
+++ b/kernel/net/openvswitch/dp_notify.c
@@ -58,13 +58,10 @@ void ovs_dp_notify_wq(struct work_struct *work)
struct hlist_node *n;
hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) {
- struct netdev_vport *netdev_vport;
-
- if (vport->ops->type != OVS_VPORT_TYPE_NETDEV)
+ if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL)
continue;
- netdev_vport = netdev_vport_priv(vport);
- if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH))
+ if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH))
dp_detach_port_notify(vport);
}
}
diff --git a/kernel/net/openvswitch/flow.c b/kernel/net/openvswitch/flow.c
index 2dacc7b5a..0ea128eee 100644
--- a/kernel/net/openvswitch/flow.c
+++ b/kernel/net/openvswitch/flow.c
@@ -46,9 +46,11 @@
#include <net/mpls.h>
#include <net/ndisc.h>
+#include "conntrack.h"
#include "datapath.h"
#include "flow.h"
#include "flow_netlink.h"
+#include "vport.h"
u64 ovs_flow_used_time(unsigned long flow_jiffies)
{
@@ -271,8 +273,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
key->ipv6.addr.dst = nh->daddr;
payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
- if (unlikely(payload_ofs < 0))
- return -EINVAL;
if (frag_off) {
if (frag_off & htons(~0x7))
@@ -283,6 +283,13 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
key->ip.frag = OVS_FRAG_TYPE_NONE;
}
+ /* Delayed handling of error in ipv6_skip_exthdr() as it
+ * always sets frag_off to a valid value which may be
+ * used to set key->ip.frag above.
+ */
+ if (unlikely(payload_ofs < 0))
+ return -EPROTO;
+
nh_len = payload_ofs - nh_ofs;
skb_set_transport_header(skb, nh_ofs + nh_len);
key->ip.proto = nexthdr;
@@ -332,7 +339,7 @@ static __be16 parse_ethertype(struct sk_buff *skb)
proto = *(__be16 *) skb->data;
__skb_pull(skb, sizeof(__be16));
- if (ntohs(proto) >= ETH_P_802_3_MIN)
+ if (eth_proto_is_802_3(proto))
return proto;
if (skb->len < sizeof(struct llc_snap_hdr))
@@ -349,7 +356,7 @@ static __be16 parse_ethertype(struct sk_buff *skb)
__skb_pull(skb, sizeof(struct llc_snap_hdr));
- if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN)
+ if (eth_proto_is_802_3(llc->ethertype))
return llc->ethertype;
return htons(ETH_P_802_2);
@@ -622,12 +629,16 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
nh_len = parse_ipv6hdr(skb, key);
if (unlikely(nh_len < 0)) {
- memset(&key->ip, 0, sizeof(key->ip));
- memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr));
- if (nh_len == -EINVAL) {
+ switch (nh_len) {
+ case -EINVAL:
+ memset(&key->ip, 0, sizeof(key->ip));
+ memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr));
+ /* fall-through */
+ case -EPROTO:
skb->transport_header = skb->network_header;
error = 0;
- } else {
+ break;
+ default:
error = nh_len;
}
return error;
@@ -682,24 +693,27 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
return key_extract(skb, key);
}
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
/* Extract metadata from packet. */
if (tun_info) {
- memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
+ key->tun_proto = ip_tunnel_info_af(tun_info);
+ memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));
- if (tun_info->options) {
+ if (tun_info->options_len) {
BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
8)) - 1
> sizeof(key->tun_opts));
- memcpy(TUN_METADATA_OPTS(key, tun_info->options_len),
- tun_info->options, tun_info->options_len);
+
+ ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len),
+ tun_info);
key->tun_opts_len = tun_info->options_len;
} else {
key->tun_opts_len = 0;
}
} else {
+ key->tun_proto = 0;
key->tun_opts_len = 0;
memset(&key->tun_key, 0, sizeof(key->tun_key));
}
@@ -707,13 +721,14 @@ int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
key->phy.priority = skb->priority;
key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
key->phy.skb_mark = skb->mark;
+ ovs_ct_fill_key(skb, key);
key->ovs_flow_hash = 0;
key->recirc_id = 0;
return key_extract(skb, key);
}
-int ovs_flow_key_extract_userspace(const struct nlattr *attr,
+int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
struct sk_buff *skb,
struct sw_flow_key *key, bool log)
{
@@ -722,7 +737,7 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr,
memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE);
/* Extract metadata from netlink attributes. */
- err = ovs_nla_get_flow_metadata(attr, key, log);
+ err = ovs_nla_get_flow_metadata(net, attr, key, log);
if (err)
return err;
diff --git a/kernel/net/openvswitch/flow.h b/kernel/net/openvswitch/flow.h
index a076e445c..1d055c559 100644
--- a/kernel/net/openvswitch/flow.h
+++ b/kernel/net/openvswitch/flow.h
@@ -32,31 +32,11 @@
#include <linux/time.h>
#include <linux/flex_array.h>
#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+#include <net/dst_metadata.h>
struct sk_buff;
-/* Used to memset ovs_key_ipv4_tunnel padding. */
-#define OVS_TUNNEL_KEY_SIZE \
- (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \
- FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst))
-
-struct ovs_key_ipv4_tunnel {
- __be64 tun_id;
- __be32 ipv4_src;
- __be32 ipv4_dst;
- __be16 tun_flags;
- u8 ipv4_tos;
- u8 ipv4_ttl;
- __be16 tp_src;
- __be16 tp_dst;
-} __packed __aligned(4); /* Minimize padding. */
-
-struct ovs_tunnel_info {
- struct ovs_key_ipv4_tunnel tunnel;
- const void *options;
- u8 options_len;
-};
-
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
@@ -66,54 +46,9 @@ struct ovs_tunnel_info {
#define TUN_METADATA_OPTS(flow_key, opt_len) \
((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len)))
-static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
- __be32 saddr, __be32 daddr,
- u8 tos, u8 ttl,
- __be16 tp_src,
- __be16 tp_dst,
- __be64 tun_id,
- __be16 tun_flags,
- const void *opts,
- u8 opts_len)
-{
- tun_info->tunnel.tun_id = tun_id;
- tun_info->tunnel.ipv4_src = saddr;
- tun_info->tunnel.ipv4_dst = daddr;
- tun_info->tunnel.ipv4_tos = tos;
- tun_info->tunnel.ipv4_ttl = ttl;
- tun_info->tunnel.tun_flags = tun_flags;
-
- /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
- * the upper tunnel are used.
- * E.g: GRE over IPSEC, the tp_src and tp_port are zero.
- */
- tun_info->tunnel.tp_src = tp_src;
- tun_info->tunnel.tp_dst = tp_dst;
-
- /* Clear struct padding. */
- if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE)
- memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE,
- 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
-
- tun_info->options = opts;
- tun_info->options_len = opts_len;
-}
-
-static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
- const struct iphdr *iph,
- __be16 tp_src,
- __be16 tp_dst,
- __be64 tun_id,
- __be16 tun_flags,
- const void *opts,
- u8 opts_len)
-{
- __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr,
- iph->tos, iph->ttl,
- tp_src, tp_dst,
- tun_id, tun_flags,
- opts, opts_len);
-}
+struct ovs_tunnel_info {
+ struct metadata_dst *tun_dst;
+};
#define OVS_SW_FLOW_KEY_METADATA_SIZE \
(offsetof(struct sw_flow_key, recirc_id) + \
@@ -122,12 +57,13 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
struct sw_flow_key {
u8 tun_opts[255];
u8 tun_opts_len;
- struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
+ struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */
struct {
u32 priority; /* Packet QoS priority. */
u32 skb_mark; /* SKB mark. */
u16 in_port; /* Input switch port (or DP_MAX_PORTS). */
} __packed phy; /* Safe when right after 'tun_key'. */
+ u8 tun_proto; /* Protocol of encapsulating tunnel. */
u32 ovs_flow_hash; /* Datapath computed hash value. */
u32 recirc_id; /* Recirculation ID. */
struct {
@@ -176,6 +112,14 @@ struct sw_flow_key {
} nd;
} ipv6;
};
+ struct {
+ /* Connection tracking fields. */
+ u16 zone;
+ u32 mark;
+ u8 state;
+ struct ovs_key_ct_labels labels;
+ } ct;
+
} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
struct sw_flow_key_range {
@@ -209,6 +153,7 @@ struct sw_flow_id {
struct sw_flow_actions {
struct rcu_head rcu;
+ size_t orig_len; /* From flow_cmd_new netlink actions size */
u32 actions_len;
struct nlattr actions[];
};
@@ -273,11 +218,11 @@ void ovs_flow_stats_clear(struct sw_flow *);
u64 ovs_flow_used_time(unsigned long flow_jiffies);
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb,
struct sw_flow_key *key);
/* Extract key from packet coming from userspace. */
-int ovs_flow_key_extract_userspace(const struct nlattr *attr,
+int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
struct sk_buff *skb,
struct sw_flow_key *key, bool log);
diff --git a/kernel/net/openvswitch/flow_netlink.c b/kernel/net/openvswitch/flow_netlink.c
index c691b1a1e..d1bd4a45c 100644
--- a/kernel/net/openvswitch/flow_netlink.c
+++ b/kernel/net/openvswitch/flow_netlink.c
@@ -47,9 +47,9 @@
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/mpls.h>
+#include <net/vxlan.h>
#include "flow_netlink.h"
-#include "vport-vxlan.h"
struct ovs_len_tbl {
int len;
@@ -57,6 +57,7 @@ struct ovs_len_tbl {
};
#define OVS_ATTR_NESTED -1
+#define OVS_ATTR_VARIABLE -2
static void update_range(struct sw_flow_match *match,
size_t offset, size_t size, bool is_mask)
@@ -261,8 +262,8 @@ size_t ovs_tun_key_attr_size(void)
* updating this function.
*/
return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */
- + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */
- + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */
+ + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */
+ + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */
+ nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */
+ nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */
+ nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
@@ -281,7 +282,7 @@ size_t ovs_key_attr_size(void)
/* Whenever adding new OVS_KEY_ FIELDS, we should consider
* updating this function.
*/
- BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22);
+ BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26);
return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
+ nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
@@ -290,6 +291,10 @@ size_t ovs_key_attr_size(void)
+ nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */
+ nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */
+ nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */
+ + nla_total_size(4) /* OVS_KEY_ATTR_CT_STATE */
+ + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */
+ + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */
+ + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */
+ nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
+ nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ nla_total_size(4) /* OVS_KEY_ATTR_VLAN */
@@ -300,6 +305,10 @@ size_t ovs_key_attr_size(void)
+ nla_total_size(28); /* OVS_KEY_ATTR_ND */
}
+static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = {
+ [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) },
+};
+
static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
[OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) },
[OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) },
@@ -311,8 +320,11 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
[OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) },
[OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) },
[OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 },
- [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED },
- [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED },
+ [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE },
+ [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED,
+ .next = ovs_vxlan_ext_key_lens },
+ [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+ [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) },
};
/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
@@ -339,8 +351,19 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
[OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED,
.next = ovs_tunnel_key_lens, },
[OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) },
+ [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) },
+ [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },
};
+static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
+{
+ return expected_len == attr_len ||
+ expected_len == OVS_ATTR_NESTED ||
+ expected_len == OVS_ATTR_VARIABLE;
+}
+
static bool is_all_zero(const u8 *fp, size_t size)
{
int i;
@@ -380,7 +403,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr,
}
expected_len = ovs_key_lens[type].len;
- if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) {
+ if (!check_attr_len(nla_len(nla), expected_len)) {
OVS_NLERR(log, "Key %d has unexpected len %d expected %d",
type, nla_len(nla), expected_len);
return -EINVAL;
@@ -465,29 +488,50 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a,
return 0;
}
-static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = {
- [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 },
-};
-
-static int vxlan_tun_opt_from_nlattr(const struct nlattr *a,
+static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,
struct sw_flow_match *match, bool is_mask,
bool log)
{
- struct nlattr *tb[OVS_VXLAN_EXT_MAX+1];
+ struct nlattr *a;
+ int rem;
unsigned long opt_key_offset;
- struct ovs_vxlan_opts opts;
- int err;
+ struct vxlan_metadata opts;
BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
- err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy);
- if (err < 0)
- return err;
-
memset(&opts, 0, sizeof(opts));
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
- if (tb[OVS_VXLAN_EXT_GBP])
- opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]);
+ if (type > OVS_VXLAN_EXT_MAX) {
+ OVS_NLERR(log, "VXLAN extension %d out of range max %d",
+ type, OVS_VXLAN_EXT_MAX);
+ return -EINVAL;
+ }
+
+ if (!check_attr_len(nla_len(a),
+ ovs_vxlan_ext_key_lens[type].len)) {
+ OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d",
+ type, nla_len(a),
+ ovs_vxlan_ext_key_lens[type].len);
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case OVS_VXLAN_EXT_GBP:
+ opts.gbp = nla_get_u32(a);
+ break;
+ default:
+ OVS_NLERR(log, "Unknown VXLAN extension attribute %d",
+ type);
+ return -EINVAL;
+ }
+ }
+ if (rem) {
+ OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.",
+ rem);
+ return -EINVAL;
+ }
if (!is_mask)
SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false);
@@ -500,15 +544,15 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a,
return 0;
}
-static int ipv4_tun_from_nlattr(const struct nlattr *attr,
- struct sw_flow_match *match, bool is_mask,
- bool log)
+static int ip_tun_from_nlattr(const struct nlattr *attr,
+ struct sw_flow_match *match, bool is_mask,
+ bool log)
{
- struct nlattr *a;
- int rem;
- bool ttl = false;
+ bool ttl = false, ipv4 = false, ipv6 = false;
__be16 tun_flags = 0;
int opts_type = 0;
+ struct nlattr *a;
+ int rem;
nla_for_each_nested(a, attr, rem) {
int type = nla_type(a);
@@ -520,8 +564,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
return -EINVAL;
}
- if (ovs_tunnel_key_lens[type].len != nla_len(a) &&
- ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) {
+ if (!check_attr_len(nla_len(a),
+ ovs_tunnel_key_lens[type].len)) {
OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d",
type, nla_len(a), ovs_tunnel_key_lens[type].len);
return -EINVAL;
@@ -534,19 +578,31 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
tun_flags |= TUNNEL_KEY;
break;
case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+ SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src,
nla_get_in_addr(a), is_mask);
+ ipv4 = true;
break;
case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+ SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst,
nla_get_in_addr(a), is_mask);
+ ipv4 = true;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV6_SRC:
+ SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src,
+ nla_get_in6_addr(a), is_mask);
+ ipv6 = true;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV6_DST:
+ SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst,
+ nla_get_in6_addr(a), is_mask);
+ ipv6 = true;
break;
case OVS_TUNNEL_KEY_ATTR_TOS:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+ SW_FLOW_KEY_PUT(match, tun_key.tos,
nla_get_u8(a), is_mask);
break;
case OVS_TUNNEL_KEY_ATTR_TTL:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+ SW_FLOW_KEY_PUT(match, tun_key.ttl,
nla_get_u8(a), is_mask);
ttl = true;
break;
@@ -594,28 +650,46 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
opts_type = type;
break;
default:
- OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d",
+ OVS_NLERR(log, "Unknown IP tunnel attribute %d",
type);
return -EINVAL;
}
}
SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
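+ /* The tunnel address family is never wildcarded: masks always match it
+ * exactly, while keys record whichever family the attributes carried.
+ */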
+ if (is_mask)
+ SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true);
+ else
+ SW_FLOW_KEY_PUT(match, tun_proto, ipv6 ? AF_INET6 : AF_INET,
+ false);
if (rem > 0) {
- OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.",
+ OVS_NLERR(log, "IP tunnel attribute has %d unknown bytes.",
rem);
return -EINVAL;
}
+ if (ipv4 && ipv6) {
+ OVS_NLERR(log, "Mixed IPv4 and IPv6 tunnel attributes");
+ return -EINVAL;
+ }
+
if (!is_mask) {
- if (!match->key->tun_key.ipv4_dst) {
+ if (!ipv4 && !ipv6) {
+ OVS_NLERR(log, "IP tunnel dst address not specified");
+ return -EINVAL;
+ }
+ if (ipv4 && !match->key->tun_key.u.ipv4.dst) {
OVS_NLERR(log, "IPv4 tunnel dst address is zero");
return -EINVAL;
}
+ if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) {
+ OVS_NLERR(log, "IPv6 tunnel dst address is zero");
+ return -EINVAL;
+ }
if (!ttl) {
- OVS_NLERR(log, "IPv4 tunnel TTL not specified.");
+ OVS_NLERR(log, "IP tunnel TTL not specified.");
return -EINVAL;
}
}
@@ -626,7 +700,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
static int vxlan_opt_to_nlattr(struct sk_buff *skb,
const void *tun_opts, int swkey_tun_opts_len)
{
- const struct ovs_vxlan_opts *opts = tun_opts;
+ const struct vxlan_metadata *opts = tun_opts;
struct nlattr *nla;
nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS);
@@ -640,25 +714,40 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb,
return 0;
}
-static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *output,
- const void *tun_opts, int swkey_tun_opts_len)
+static int __ip_tun_to_nlattr(struct sk_buff *skb,
+ const struct ip_tunnel_key *output,
+ const void *tun_opts, int swkey_tun_opts_len,
+ unsigned short tun_proto)
{
if (output->tun_flags & TUNNEL_KEY &&
nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
return -EMSGSIZE;
- if (output->ipv4_src &&
- nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC,
- output->ipv4_src))
- return -EMSGSIZE;
- if (output->ipv4_dst &&
- nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST,
- output->ipv4_dst))
- return -EMSGSIZE;
- if (output->ipv4_tos &&
- nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+ switch (tun_proto) {
+ case AF_INET:
+ if (output->u.ipv4.src &&
+ nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC,
+ output->u.ipv4.src))
+ return -EMSGSIZE;
+ if (output->u.ipv4.dst &&
+ nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST,
+ output->u.ipv4.dst))
+ return -EMSGSIZE;
+ break;
+ case AF_INET6:
+ if (!ipv6_addr_any(&output->u.ipv6.src) &&
+ nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_SRC,
+ &output->u.ipv6.src))
+ return -EMSGSIZE;
+ if (!ipv6_addr_any(&output->u.ipv6.dst) &&
+ nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_DST,
+ &output->u.ipv6.dst))
+ return -EMSGSIZE;
+ break;
+ }
+ if (output->tos &&
+ nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos))
return -EMSGSIZE;
- if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
+ if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl))
return -EMSGSIZE;
if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
@@ -675,7 +764,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
if ((output->tun_flags & TUNNEL_OAM) &&
nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
return -EMSGSIZE;
- if (tun_opts) {
+ if (swkey_tun_opts_len) {
if (output->tun_flags & TUNNEL_GENEVE_OPT &&
nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
swkey_tun_opts_len, tun_opts))
@@ -688,9 +777,10 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
return 0;
}
-static int ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *output,
- const void *tun_opts, int swkey_tun_opts_len)
+static int ip_tun_to_nlattr(struct sk_buff *skb,
+ const struct ip_tunnel_key *output,
+ const void *tun_opts, int swkey_tun_opts_len,
+ unsigned short tun_proto)
{
struct nlattr *nla;
int err;
@@ -699,7 +789,8 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
if (!nla)
return -EMSGSIZE;
- err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len);
+ err = __ip_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len,
+ tun_proto);
if (err)
return err;
@@ -707,17 +798,18 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
return 0;
}
-int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
- const struct ovs_tunnel_info *egress_tun_info)
+int ovs_nla_put_tunnel_info(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
{
- return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel,
- egress_tun_info->options,
- egress_tun_info->options_len);
+ return __ip_tun_to_nlattr(skb, &tun_info->key,
+ ip_tunnel_info_opts(tun_info),
+ tun_info->options_len,
+ ip_tunnel_info_af(tun_info));
}
-static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
- const struct nlattr **a, bool is_mask,
- bool log)
+static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
+ u64 *attrs, const struct nlattr **a,
+ bool is_mask, bool log)
{
if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) {
u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]);
@@ -763,21 +855,58 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
*attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
}
if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
- if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
- is_mask, log) < 0)
+ if (ip_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
+ is_mask, log) < 0)
return -EINVAL;
*attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
}
+
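+ /* Conntrack keys are accepted only when the kernel supports the
+ * corresponding field, as reported by ovs_ct_verify().
+ */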
+ if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) &&
+ ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) {
+ u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]);
+
+ if (ct_state & ~CT_SUPPORTED_MASK) {
+ OVS_NLERR(log, "ct_state flags %08x unsupported",
+ ct_state);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
+ }
+ if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) &&
+ ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) {
+ u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]);
+
+ SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE);
+ }
+ if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) &&
+ ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) {
+ u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]);
+
+ SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK);
+ }
+ if (*attrs & (1 << OVS_KEY_ATTR_CT_LABELS) &&
+ ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABELS)) {
+ const struct ovs_key_ct_labels *cl;
+
+ cl = nla_data(a[OVS_KEY_ATTR_CT_LABELS]);
+ SW_FLOW_KEY_MEMCPY(match, ct.labels, cl->ct_labels,
+ sizeof(*cl), is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
+ }
return 0;
}
-static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
- const struct nlattr **a, bool is_mask,
- bool log)
+static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
+ u64 attrs, const struct nlattr **a,
+ bool is_mask, bool log)
{
int err;
- err = metadata_from_nlattrs(match, &attrs, a, is_mask, log);
+ err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log);
if (err)
return err;
@@ -816,7 +945,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
if (is_mask) {
/* Always exact match EtherType. */
eth_type = htons(0xffff);
- } else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+ } else if (!eth_proto_is_802_3(eth_type)) {
OVS_NLERR(log, "EtherType %x is less than min %x",
ntohs(eth_type), ETH_P_802_3_MIN);
return -EINVAL;
@@ -1012,10 +1141,16 @@ static void nlattr_set(struct nlattr *attr, u8 val,
/* The nlattr stream should already have been validated */
nla_for_each_nested(nla, attr, rem) {
- if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED)
- nlattr_set(nla, val, tbl[nla_type(nla)].next);
- else
+ if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) {
+ if (tbl[nla_type(nla)].next)
+ tbl = tbl[nla_type(nla)].next;
+ nlattr_set(nla, val, tbl);
+ } else {
memset(nla_data(nla), val, nla_len(nla));
+ }
+
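+ /* Never wildcard ct_state bits the datapath cannot match on. */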
+ if (nla_type(nla) == OVS_KEY_ATTR_CT_STATE)
+ *(u32 *)nla_data(nla) &= CT_SUPPORTED_MASK;
}
}
@@ -1029,6 +1164,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val)
* mask. In case the 'mask' is NULL, the flow is treated as exact match
* flow. Otherwise, it is treated as a wildcarded flow, except the mask
* does not include any don't care bit.
+ * @net: Used to determine per-namespace field support.
* @match: receives the extracted flow match information.
* @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
* sequence. The fields should be those of the packet that triggered the creation
@@ -1039,7 +1175,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val)
* probing for feature compatibility this should be passed in as false to
* suppress unnecessary error logging.
*/
-int ovs_nla_get_match(struct sw_flow_match *match,
+int ovs_nla_get_match(struct net *net, struct sw_flow_match *match,
const struct nlattr *nla_key,
const struct nlattr *nla_mask,
bool log)
@@ -1089,7 +1225,7 @@ int ovs_nla_get_match(struct sw_flow_match *match,
}
}
- err = ovs_key_from_nlattrs(match, key_attrs, a, false, log);
+ err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log);
if (err)
return err;
@@ -1116,7 +1252,7 @@ int ovs_nla_get_match(struct sw_flow_match *match,
/* The userspace does not send tunnel attributes that
* are 0, but we should not wildcard them nonetheless.
*/
- if (match->key->tun_key.ipv4_dst)
+ if (match->key->tun_proto)
SW_FLOW_KEY_MEMSET_FIELD(match, tun_key,
0xff, true);
@@ -1169,7 +1305,8 @@ int ovs_nla_get_match(struct sw_flow_match *match,
}
}
- err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log);
+ err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true,
+ log);
if (err)
goto free_newmask;
}
@@ -1250,7 +1387,7 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
* extracted from the packet itself.
*/
-int ovs_nla_get_flow_metadata(const struct nlattr *attr,
+int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr,
struct sw_flow_key *key,
bool log)
{
@@ -1266,9 +1403,10 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr,
memset(&match, 0, sizeof(match));
match.key = key;
+ memset(&key->ct, 0, sizeof(key->ct));
key->phy.in_port = DP_MAX_PORTS;
- return metadata_from_nlattrs(&match, &attrs, a, false, log);
+ return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
}
static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
@@ -1287,14 +1425,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
goto nla_put_failure;
- if ((swkey->tun_key.ipv4_dst || is_mask)) {
+ if ((swkey->tun_proto || is_mask)) {
const void *opts = NULL;
if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len);
- if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
- swkey->tun_opts_len))
+ if (ip_tun_to_nlattr(skb, &output->tun_key, opts,
+ swkey->tun_opts_len, swkey->tun_proto))
goto nla_put_failure;
}
@@ -1314,6 +1452,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
goto nla_put_failure;
+ if (ovs_ct_put_key(output, skb))
+ goto nla_put_failure;
+
nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
if (!nla)
goto nla_put_failure;
@@ -1548,11 +1689,51 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log)
return sfa;
}
+static void ovs_nla_free_set_action(const struct nlattr *a)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ struct ovs_tunnel_info *ovs_tun;
+
+ switch (nla_type(ovs_key)) {
+ case OVS_KEY_ATTR_TUNNEL_INFO:
+ ovs_tun = nla_data(ovs_key);
+ dst_release((struct dst_entry *)ovs_tun->tun_dst);
+ break;
+ }
+}
+
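+/* Release the resources referenced by individual actions (tunnel metadata
+ * dsts, conntrack state) before freeing the action list itself.
+ */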
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ const struct nlattr *a;
+ int rem;
+
+ if (!sf_acts)
+ return;
+
+ nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) {
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_SET:
+ ovs_nla_free_set_action(a);
+ break;
+ case OVS_ACTION_ATTR_CT:
+ ovs_ct_free_action(a);
+ break;
+ }
+ }
+
+ kfree(sf_acts);
+}
+
+static void __ovs_nla_free_flow_actions(struct rcu_head *head)
+{
+ ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu));
+}
+
/* Schedules 'sf_acts' to be freed after the next RCU grace period.
* The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts)
{
- kfree_rcu(sf_acts, rcu);
+ call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions);
}
static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
@@ -1582,6 +1763,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
acts->actions_len = (*sfa)->actions_len;
+ acts->orig_len = (*sfa)->orig_len;
kfree(*sfa);
*sfa = acts;
@@ -1609,8 +1791,8 @@ static struct nlattr *__add_action(struct sw_flow_actions **sfa,
return a;
}
-static int add_action(struct sw_flow_actions **sfa, int attrtype,
- void *data, int len, bool log)
+int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data,
+ int len, bool log)
{
struct nlattr *a;
@@ -1625,7 +1807,7 @@ static inline int add_nested_action_start(struct sw_flow_actions **sfa,
int used = (*sfa)->actions_len;
int err;
- err = add_action(sfa, attrtype, NULL, 0, log);
+ err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log);
if (err)
return err;
@@ -1641,12 +1823,12 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa,
a->nla_len = sfa->actions_len - st_offset;
}
-static int __ovs_nla_copy_actions(const struct nlattr *attr,
+static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
int depth, struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci, bool log);
-static int validate_and_copy_sample(const struct nlattr *attr,
+static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key, int depth,
struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci, bool log)
@@ -1678,15 +1860,15 @@ static int validate_and_copy_sample(const struct nlattr *attr,
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log);
if (start < 0)
return start;
- err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY,
- nla_data(probability), sizeof(u32), log);
+ err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY,
+ nla_data(probability), sizeof(u32), log);
if (err)
return err;
st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log);
if (st_acts < 0)
return st_acts;
- err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa,
+ err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa,
eth_type, vlan_tci, log);
if (err)
return err;
@@ -1746,12 +1928,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
{
struct sw_flow_match match;
struct sw_flow_key key;
- struct ovs_tunnel_info *tun_info;
+ struct metadata_dst *tun_dst;
+ struct ip_tunnel_info *tun_info;
+ struct ovs_tunnel_info *ovs_tun;
struct nlattr *a;
int err = 0, start, opts_type;
ovs_match_init(&match, &key, NULL);
- opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log);
+ opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
if (opts_type < 0)
return opts_type;
@@ -1771,27 +1955,33 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
if (start < 0)
return start;
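+ /* The metadata dst is reference counted; the reference taken here is
+ * dropped by ovs_nla_free_set_action() when the flow goes away.
+ */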
+ tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL);
+ if (!tun_dst)
+ return -ENOMEM;
+
a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
- sizeof(*tun_info) + key.tun_opts_len, log);
- if (IS_ERR(a))
+ sizeof(*ovs_tun), log);
+ if (IS_ERR(a)) {
+ dst_release((struct dst_entry *)tun_dst);
return PTR_ERR(a);
+ }
- tun_info = nla_data(a);
- tun_info->tunnel = key.tun_key;
- tun_info->options_len = key.tun_opts_len;
+ ovs_tun = nla_data(a);
+ ovs_tun->tun_dst = tun_dst;
- if (tun_info->options_len) {
- /* We need to store the options in the action itself since
- * everything else will go away after flow setup. We can append
- * it to tun_info and then point there.
- */
- memcpy((tun_info + 1),
- TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len);
- tun_info->options = (tun_info + 1);
- } else {
- tun_info->options = NULL;
- }
+ tun_info = &tun_dst->u.tun_info;
+ tun_info->mode = IP_TUNNEL_INFO_TX;
+ if (key.tun_proto == AF_INET6)
+ tun_info->mode |= IP_TUNNEL_INFO_IPV6;
+ tun_info->key = key.tun_key;
+ /* We need to store the options in the action itself since
+ * everything else will go away after flow setup. We can append
+ * it to tun_info and then point there.
+ */
+ ip_tunnel_info_opts_set(tun_info,
+ TUN_METADATA_OPTS(&key, key.tun_opts_len),
+ key.tun_opts_len);
add_nested_action_end(*sfa, start);
return err;
@@ -1829,8 +2019,7 @@ static int validate_set(const struct nlattr *a,
key_len /= 2;
if (key_type > OVS_KEY_ATTR_MAX ||
- (ovs_key_lens[key_type].len != key_len &&
- ovs_key_lens[key_type].len != OVS_ATTR_NESTED))
+ !check_attr_len(key_len, ovs_key_lens[key_type].len))
return -EINVAL;
if (masked && !validate_masked(nla_data(ovs_key), key_len))
@@ -1843,6 +2032,8 @@ static int validate_set(const struct nlattr *a,
case OVS_KEY_ATTR_PRIORITY:
case OVS_KEY_ATTR_SKB_MARK:
+ case OVS_KEY_ATTR_CT_MARK:
+ case OVS_KEY_ATTR_CT_LABELS:
case OVS_KEY_ATTR_ETHERNET:
break;
@@ -2008,7 +2199,7 @@ static int copy_action(const struct nlattr *from,
return 0;
}
-static int __ovs_nla_copy_actions(const struct nlattr *attr,
+static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
int depth, struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci, bool log)
@@ -2032,7 +2223,8 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr,
[OVS_ACTION_ATTR_SET] = (u32)-1,
[OVS_ACTION_ATTR_SET_MASKED] = (u32)-1,
[OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
- [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash)
+ [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
+ [OVS_ACTION_ATTR_CT] = (u32)-1,
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -2139,13 +2331,20 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr,
break;
case OVS_ACTION_ATTR_SAMPLE:
- err = validate_and_copy_sample(a, key, depth, sfa,
+ err = validate_and_copy_sample(net, a, key, depth, sfa,
eth_type, vlan_tci, log);
if (err)
return err;
skip_copy = true;
break;
+ case OVS_ACTION_ATTR_CT:
+ err = ovs_ct_copy_action(net, a, key, sfa, log);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
@@ -2164,7 +2363,7 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr,
}
/* 'key' must be the masked key. */
-int ovs_nla_copy_actions(const struct nlattr *attr,
+int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
struct sw_flow_actions **sfa, bool log)
{
@@ -2174,10 +2373,11 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
if (IS_ERR(*sfa))
return PTR_ERR(*sfa);
- err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type,
+ (*sfa)->orig_len = nla_len(attr);
+ err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type,
key->eth.tci, log);
if (err)
- kfree(*sfa);
+ ovs_nla_free_flow_actions(*sfa);
return err;
}
@@ -2227,16 +2427,17 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
switch (key_type) {
case OVS_KEY_ATTR_TUNNEL_INFO: {
- struct ovs_tunnel_info *tun_info = nla_data(ovs_key);
+ struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key);
+ struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info;
start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
if (!start)
return -EMSGSIZE;
- err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
- tun_info->options_len ?
- tun_info->options : NULL,
- tun_info->options_len);
+ err = ip_tun_to_nlattr(skb, &tun_info->key,
+ ip_tunnel_info_opts(tun_info),
+ tun_info->options_len,
+ ip_tunnel_info_af(tun_info));
if (err)
return err;
nla_nest_end(skb, start);
@@ -2298,6 +2499,13 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
if (err)
return err;
break;
+
+ case OVS_ACTION_ATTR_CT:
+ err = ovs_ct_action_to_attr(nla_data(a), skb);
+ if (err)
+ return err;
+ break;
+
default:
if (nla_put(skb, type, nla_len(a), nla_data(a)))
return -EMSGSIZE;
diff --git a/kernel/net/openvswitch/flow_netlink.h b/kernel/net/openvswitch/flow_netlink.h
index 5c3d75bff..47dd142ec 100644
--- a/kernel/net/openvswitch/flow_netlink.h
+++ b/kernel/net/openvswitch/flow_netlink.h
@@ -45,29 +45,34 @@ void ovs_match_init(struct sw_flow_match *match,
int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
int attr, bool is_mask, struct sk_buff *);
-int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *,
- bool log);
+int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *,
+ struct sw_flow_key *, bool log);
int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb);
int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb);
int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb);
-int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key,
- const struct nlattr *mask, bool log);
-int ovs_nla_put_egress_tunnel_key(struct sk_buff *,
- const struct ovs_tunnel_info *);
+int ovs_nla_get_match(struct net *, struct sw_flow_match *,
+ const struct nlattr *key, const struct nlattr *mask,
+ bool log);
+
+int ovs_nla_put_tunnel_info(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info);
bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log);
int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
const struct sw_flow_key *key, bool log);
u32 ovs_nla_get_ufid_flags(const struct nlattr *attr);
-int ovs_nla_copy_actions(const struct nlattr *attr,
+int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
struct sw_flow_actions **sfa, bool log);
+int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype,
+ void *data, int len, bool log);
int ovs_nla_put_actions(const struct nlattr *attr,
int len, struct sk_buff *skb);
void ovs_nla_free_flow_actions(struct sw_flow_actions *);
+void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *);
#endif /* flow_netlink.h */
diff --git a/kernel/net/openvswitch/flow_table.c b/kernel/net/openvswitch/flow_table.c
index aa349514e..d073fff82 100644
--- a/kernel/net/openvswitch/flow_table.c
+++ b/kernel/net/openvswitch/flow_table.c
@@ -18,6 +18,7 @@
#include "flow.h"
#include "datapath.h"
+#include "flow_netlink.h"
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
@@ -92,7 +93,8 @@ struct sw_flow *ovs_flow_alloc(void)
/* Initialize the default stat node. */
stats = kmem_cache_alloc_node(flow_stats_cache,
- GFP_KERNEL | __GFP_ZERO, 0);
+ GFP_KERNEL | __GFP_ZERO,
+ node_online(0) ? 0 : NUMA_NO_NODE);
if (!stats)
goto err;
@@ -144,7 +146,8 @@ static void flow_free(struct sw_flow *flow)
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
- kfree((struct sw_flow_actions __force *)flow->sf_acts);
+ if (flow->sf_acts)
+ ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
for_each_node(node)
if (flow->stats[node])
kmem_cache_free(flow_stats_cache,
@@ -425,7 +428,7 @@ static u32 flow_hash(const struct sw_flow_key *key,
static int flow_key_start(const struct sw_flow_key *key)
{
- if (key->tun_key.ipv4_dst)
+ if (key->tun_proto)
return 0;
else
return rounddown(offsetof(struct sw_flow_key, phy),
@@ -753,7 +756,7 @@ int ovs_flow_init(void)
BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
- + (num_possible_nodes()
+ + (nr_node_ids
* sizeof(struct flow_stats *)),
0, 0, NULL);
if (flow_cache == NULL)
diff --git a/kernel/net/openvswitch/vport-geneve.c b/kernel/net/openvswitch/vport-geneve.c
index bf02fd580..e41cd12d9 100644
--- a/kernel/net/openvswitch/vport-geneve.c
+++ b/kernel/net/openvswitch/vport-geneve.c
@@ -26,113 +26,42 @@
#include "datapath.h"
#include "vport.h"
+#include "vport-netdev.h"
static struct vport_ops ovs_geneve_vport_ops;
-
/**
* struct geneve_port - Keeps track of open UDP ports
- * @gs: The socket created for this port number.
- * @name: vport name.
+ * @port_no: destination port.
*/
struct geneve_port {
- struct geneve_sock *gs;
- char name[IFNAMSIZ];
+ u16 port_no;
};
-static LIST_HEAD(geneve_ports);
-
static inline struct geneve_port *geneve_vport(const struct vport *vport)
{
return vport_priv(vport);
}
-static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
-{
- return (struct genevehdr *)(udp_hdr(skb) + 1);
-}
-
-/* Convert 64 bit tunnel ID to 24 bit VNI. */
-static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
-{
-#ifdef __BIG_ENDIAN
- vni[0] = (__force __u8)(tun_id >> 16);
- vni[1] = (__force __u8)(tun_id >> 8);
- vni[2] = (__force __u8)tun_id;
-#else
- vni[0] = (__force __u8)((__force u64)tun_id >> 40);
- vni[1] = (__force __u8)((__force u64)tun_id >> 48);
- vni[2] = (__force __u8)((__force u64)tun_id >> 56);
-#endif
-}
-
-/* Convert 24 bit VNI to 64 bit tunnel ID. */
-static __be64 vni_to_tunnel_id(const __u8 *vni)
-{
-#ifdef __BIG_ENDIAN
- return (vni[0] << 16) | (vni[1] << 8) | vni[2];
-#else
- return (__force __be64)(((__force u64)vni[0] << 40) |
- ((__force u64)vni[1] << 48) |
- ((__force u64)vni[2] << 56));
-#endif
-}
-
-static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
-{
- struct vport *vport = gs->rcv_data;
- struct genevehdr *geneveh = geneve_hdr(skb);
- int opts_len;
- struct ovs_tunnel_info tun_info;
- __be64 key;
- __be16 flags;
-
- opts_len = geneveh->opt_len * 4;
-
- flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
- (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) |
- (geneveh->oam ? TUNNEL_OAM : 0) |
- (geneveh->critical ? TUNNEL_CRIT_OPT : 0);
-
- key = vni_to_tunnel_id(geneveh->vni);
-
- ovs_flow_tun_info_init(&tun_info, ip_hdr(skb),
- udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, flags,
- geneveh->options, opts_len);
-
- ovs_vport_receive(vport, skb, &tun_info);
-}
-
static int geneve_get_options(const struct vport *vport,
struct sk_buff *skb)
{
struct geneve_port *geneve_port = geneve_vport(vport);
- struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk);
- if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport)))
+ if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no))
return -EMSGSIZE;
return 0;
}
-static void geneve_tnl_destroy(struct vport *vport)
-{
- struct geneve_port *geneve_port = geneve_vport(vport);
-
- geneve_sock_release(geneve_port->gs);
-
- ovs_vport_deferred_free(vport);
-}
-
static struct vport *geneve_tnl_create(const struct vport_parms *parms)
{
struct net *net = ovs_dp_get_net(parms->dp);
struct nlattr *options = parms->options;
struct geneve_port *geneve_port;
- struct geneve_sock *gs;
+ struct net_device *dev;
struct vport *vport;
struct nlattr *a;
- int err;
u16 dst_port;
+ int err;
if (!options) {
err = -EINVAL;
@@ -154,106 +83,40 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
return vport;
geneve_port = geneve_vport(vport);
- strncpy(geneve_port->name, parms->name, IFNAMSIZ);
+ geneve_port->port_no = dst_port;
- gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0);
- if (IS_ERR(gs)) {
+ rtnl_lock();
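+ /* Create a collect-metadata geneve device bound to dst_port; packets
+ * are transmitted through it via dev_queue_xmit().
+ */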
+ dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
ovs_vport_free(vport);
- return (void *)gs;
+ return ERR_CAST(dev);
}
- geneve_port->gs = gs;
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
return vport;
error:
return ERR_PTR(err);
}
-static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
+static struct vport *geneve_create(const struct vport_parms *parms)
{
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct ovs_tunnel_info *tun_info;
- struct net *net = ovs_dp_get_net(vport->dp);
- struct geneve_port *geneve_port = geneve_vport(vport);
- __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
- __be16 sport;
- struct rtable *rt;
- struct flowi4 fl;
- u8 vni[3], opts_len, *opts;
- __be16 df;
- int err;
-
- tun_info = OVS_CB(skb)->egress_tun_info;
- if (unlikely(!tun_info)) {
- err = -EINVAL;
- goto error;
- }
-
- tun_key = &tun_info->tunnel;
- rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto error;
- }
-
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
- tunnel_id_to_vni(tun_key->tun_id, vni);
- skb->ignore_df = 1;
-
- if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) {
- opts = (u8 *)tun_info->options;
- opts_len = tun_info->options_len;
- } else {
- opts = NULL;
- opts_len = 0;
- }
-
- err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
- tun_key->ipv4_dst, tun_key->ipv4_tos,
- tun_key->ipv4_ttl, df, sport, dport,
- tun_key->tun_flags, vni, opts_len, opts,
- !!(tun_key->tun_flags & TUNNEL_CSUM), false);
- if (err < 0)
- ip_rt_put(rt);
- return err;
-
-error:
- kfree_skb(skb);
- return err;
-}
-
-static const char *geneve_get_name(const struct vport *vport)
-{
- struct geneve_port *geneve_port = geneve_vport(vport);
-
- return geneve_port->name;
-}
+ struct vport *vport;
-static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
-{
- struct geneve_port *geneve_port = geneve_vport(vport);
- struct net *net = ovs_dp_get_net(vport->dp);
- __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
- __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
+ vport = geneve_tnl_create(parms);
+ if (IS_ERR(vport))
+ return vport;
- /* Get tp_src and tp_dst, refert to geneve_build_header().
- */
- return ovs_tunnel_get_egress_info(egress_tun_info,
- ovs_dp_get_net(vport->dp),
- OVS_CB(skb)->egress_tun_info,
- IPPROTO_UDP, skb->mark, sport, dport);
+ return ovs_netdev_link(vport, parms->name);
}
static struct vport_ops ovs_geneve_vport_ops = {
.type = OVS_VPORT_TYPE_GENEVE,
- .create = geneve_tnl_create,
- .destroy = geneve_tnl_destroy,
- .get_name = geneve_get_name,
+ .create = geneve_create,
+ .destroy = ovs_netdev_tunnel_destroy,
.get_options = geneve_get_options,
- .send = geneve_tnl_send,
- .owner = THIS_MODULE,
- .get_egress_tun_info = geneve_get_egress_tun_info,
+ .send = dev_queue_xmit,
};
static int __init ovs_geneve_tnl_init(void)
diff --git a/kernel/net/openvswitch/vport-gre.c b/kernel/net/openvswitch/vport-gre.c
index f17ac9642..7f8897f33 100644
--- a/kernel/net/openvswitch/vport-gre.c
+++ b/kernel/net/openvswitch/vport-gre.c
@@ -45,254 +45,50 @@
#include "datapath.h"
#include "vport.h"
+#include "vport-netdev.h"
static struct vport_ops ovs_gre_vport_ops;
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 be64_get_low32(__be64 x)
+static struct vport *gre_tnl_create(const struct vport_parms *parms)
{
-#ifdef __BIG_ENDIAN
- return (__force __be32)x;
-#else
- return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
-static __be16 filter_tnl_flags(__be16 flags)
-{
- return flags & (TUNNEL_CSUM | TUNNEL_KEY);
-}
-
-static struct sk_buff *__build_header(struct sk_buff *skb,
- int tunnel_hlen)
-{
- struct tnl_ptk_info tpi;
- const struct ovs_key_ipv4_tunnel *tun_key;
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
-
- skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
- if (IS_ERR(skb))
- return skb;
-
- tpi.flags = filter_tnl_flags(tun_key->tun_flags);
- tpi.proto = htons(ETH_P_TEB);
- tpi.key = be64_get_low32(tun_key->tun_id);
- tpi.seq = 0;
- gre_build_header(skb, &tpi, tunnel_hlen);
-
- return skb;
-}
-
-static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be64)((__force u64)seq << 32 | (__force u32)key);
-#else
- return (__force __be64)((__force u64)key << 32 | (__force u32)seq);
-#endif
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static int gre_rcv(struct sk_buff *skb,
- const struct tnl_ptk_info *tpi)
-{
- struct ovs_tunnel_info tun_info;
- struct ovs_net *ovs_net;
- struct vport *vport;
- __be64 key;
-
- ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
- vport = rcu_dereference(ovs_net->vport_net.gre_vport);
- if (unlikely(!vport))
- return PACKET_REJECT;
-
- key = key_to_tunnel_id(tpi->key, tpi->seq);
- ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
- filter_tnl_flags(tpi->flags), NULL, 0);
-
- ovs_vport_receive(vport, skb, &tun_info);
- return PACKET_RCVD;
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static int gre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
-{
- struct ovs_net *ovs_net;
+ struct net *net = ovs_dp_get_net(parms->dp);
+ struct net_device *dev;
struct vport *vport;
- ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
- vport = rcu_dereference(ovs_net->vport_net.gre_vport);
-
- if (unlikely(!vport))
- return PACKET_REJECT;
- else
- return PACKET_RCVD;
-}
-
-static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct flowi4 fl;
- struct rtable *rt;
- int min_headroom;
- int tunnel_hlen;
- __be16 df;
- int err;
-
- if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
- err = -EINVAL;
- goto err_free_skb;
- }
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
- rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto err_free_skb;
- }
-
- tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags);
-
- min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
- + tunnel_hlen + sizeof(struct iphdr)
- + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
- if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
- int head_delta = SKB_DATA_ALIGN(min_headroom -
- skb_headroom(skb) +
- 16);
- err = pskb_expand_head(skb, max_t(int, head_delta, 0),
- 0, GFP_ATOMIC);
- if (unlikely(err))
- goto err_free_rt;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (unlikely(!skb)) {
- err = -ENOMEM;
- goto err_free_rt;
- }
-
- /* Push Tunnel header. */
- skb = __build_header(skb, tunnel_hlen);
- if (IS_ERR(skb)) {
- err = PTR_ERR(skb);
- skb = NULL;
- goto err_free_rt;
+ vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ rtnl_lock();
+ dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ return ERR_CAST(dev);
}
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
- htons(IP_DF) : 0;
-
- skb->ignore_df = 1;
-
- return iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
- tun_key->ipv4_dst, IPPROTO_GRE,
- tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false);
-err_free_rt:
- ip_rt_put(rt);
-err_free_skb:
- kfree_skb(skb);
- return err;
-}
-
-static struct gre_cisco_protocol gre_protocol = {
- .handler = gre_rcv,
- .err_handler = gre_err,
- .priority = 1,
-};
-
-static int gre_ports;
-static int gre_init(void)
-{
- int err;
-
- gre_ports++;
- if (gre_ports > 1)
- return 0;
-
- err = gre_cisco_register(&gre_protocol);
- if (err)
- pr_warn("cannot register gre protocol handler\n");
-
- return err;
-}
-
-static void gre_exit(void)
-{
- gre_ports--;
- if (gre_ports > 0)
- return;
-
- gre_cisco_unregister(&gre_protocol);
-}
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
-static const char *gre_get_name(const struct vport *vport)
-{
- return vport_priv(vport);
+ return vport;
}
static struct vport *gre_create(const struct vport_parms *parms)
{
- struct net *net = ovs_dp_get_net(parms->dp);
- struct ovs_net *ovs_net;
struct vport *vport;
- int err;
- err = gre_init();
- if (err)
- return ERR_PTR(err);
-
- ovs_net = net_generic(net, ovs_net_id);
- if (ovsl_dereference(ovs_net->vport_net.gre_vport)) {
- vport = ERR_PTR(-EEXIST);
- goto error;
- }
-
- vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms);
+ vport = gre_tnl_create(parms);
if (IS_ERR(vport))
- goto error;
-
- strncpy(vport_priv(vport), parms->name, IFNAMSIZ);
- rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport);
- return vport;
+ return vport;
-error:
- gre_exit();
- return vport;
-}
-
-static void gre_tnl_destroy(struct vport *vport)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct ovs_net *ovs_net;
-
- ovs_net = net_generic(net, ovs_net_id);
-
- RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL);
- ovs_vport_deferred_free(vport);
- gre_exit();
-}
-
-static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
-{
- return ovs_tunnel_get_egress_info(egress_tun_info,
- ovs_dp_get_net(vport->dp),
- OVS_CB(skb)->egress_tun_info,
- IPPROTO_GRE, skb->mark, 0, 0);
+ return ovs_netdev_link(vport, parms->name);
}
static struct vport_ops ovs_gre_vport_ops = {
.type = OVS_VPORT_TYPE_GRE,
.create = gre_create,
- .destroy = gre_tnl_destroy,
- .get_name = gre_get_name,
- .send = gre_tnl_send,
- .get_egress_tun_info = gre_get_egress_tun_info,
- .owner = THIS_MODULE,
+ .send = dev_queue_xmit,
+ .destroy = ovs_netdev_tunnel_destroy,
};
static int __init ovs_gre_tnl_init(void)
diff --git a/kernel/net/openvswitch/vport-internal_dev.c b/kernel/net/openvswitch/vport-internal_dev.c
index 6a55f7105..ec76398a7 100644
--- a/kernel/net/openvswitch/vport-internal_dev.c
+++ b/kernel/net/openvswitch/vport-internal_dev.c
@@ -43,35 +43,26 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev)
return netdev_priv(netdev);
}
-/* This function is only called by the kernel network layer.*/
-static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev,
- struct rtnl_link_stats64 *stats)
-{
- struct vport *vport = ovs_internal_dev_get_vport(netdev);
- struct ovs_vport_stats vport_stats;
-
- ovs_vport_get_stats(vport, &vport_stats);
-
- /* The tx and rx stats need to be swapped because the
- * switch and host OS have opposite perspectives. */
- stats->rx_packets = vport_stats.tx_packets;
- stats->tx_packets = vport_stats.rx_packets;
- stats->rx_bytes = vport_stats.tx_bytes;
- stats->tx_bytes = vport_stats.rx_bytes;
- stats->rx_errors = vport_stats.tx_errors;
- stats->tx_errors = vport_stats.rx_errors;
- stats->rx_dropped = vport_stats.tx_dropped;
- stats->tx_dropped = vport_stats.rx_dropped;
-
- return stats;
-}
-
/* Called with rcu_read_lock_bh. */
static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
{
+ int len, err;
+
+ len = skb->len;
rcu_read_lock();
- ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
+ err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
rcu_read_unlock();
+
+ if (likely(!err)) {
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += len;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+ } else {
+ netdev->stats.tx_errors++;
+ }
return 0;
}
@@ -115,13 +106,45 @@ static void internal_dev_destructor(struct net_device *dev)
free_netdev(dev);
}
+static struct rtnl_link_stats64 *
+internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+ int i;
+
+ memset(stats, 0, sizeof(*stats));
+ stats->rx_errors = dev->stats.rx_errors;
+ stats->tx_errors = dev->stats.tx_errors;
+ stats->tx_dropped = dev->stats.tx_dropped;
+ stats->rx_dropped = dev->stats.rx_dropped;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_sw_netstats *percpu_stats;
+ struct pcpu_sw_netstats local_stats;
+ unsigned int start;
+
+ percpu_stats = per_cpu_ptr(dev->tstats, i);
+
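+ /* Re-read the snapshot if a concurrent writer updated the counters
+ * while we were copying them.
+ */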
+ do {
+ start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
+ local_stats = *percpu_stats;
+ } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
+
+ stats->rx_bytes += local_stats.rx_bytes;
+ stats->rx_packets += local_stats.rx_packets;
+ stats->tx_bytes += local_stats.tx_bytes;
+ stats->tx_packets += local_stats.tx_packets;
+ }
+
+ return stats;
+}
+
static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_change_mtu = internal_dev_change_mtu,
- .ndo_get_stats64 = internal_dev_get_stats,
+ .ndo_get_stats64 = internal_get_stats,
};
static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -135,7 +158,7 @@ static void do_setup(struct net_device *netdev)
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
- netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH;
netdev->destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
@@ -156,49 +179,51 @@ static void do_setup(struct net_device *netdev)
static struct vport *internal_dev_create(const struct vport_parms *parms)
{
struct vport *vport;
- struct netdev_vport *netdev_vport;
struct internal_dev *internal_dev;
int err;
- vport = ovs_vport_alloc(sizeof(struct netdev_vport),
- &ovs_internal_vport_ops, parms);
+ vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
goto error;
}
- netdev_vport = netdev_vport_priv(vport);
-
- netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
- parms->name, NET_NAME_UNKNOWN,
- do_setup);
- if (!netdev_vport->dev) {
+ vport->dev = alloc_netdev(sizeof(struct internal_dev),
+ parms->name, NET_NAME_UNKNOWN, do_setup);
+ if (!vport->dev) {
err = -ENOMEM;
goto error_free_vport;
}
+ vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!vport->dev->tstats) {
+ err = -ENOMEM;
+ goto error_free_netdev;
+ }
- dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp));
- internal_dev = internal_dev_priv(netdev_vport->dev);
+ dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
+ internal_dev = internal_dev_priv(vport->dev);
internal_dev->vport = vport;
/* Restrict bridge port to current netns. */
if (vport->port_no == OVSP_LOCAL)
- netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL;
+ vport->dev->features |= NETIF_F_NETNS_LOCAL;
rtnl_lock();
- err = register_netdevice(netdev_vport->dev);
+ err = register_netdevice(vport->dev);
if (err)
- goto error_free_netdev;
+ goto error_unlock;
- dev_set_promiscuity(netdev_vport->dev, 1);
+ dev_set_promiscuity(vport->dev, 1);
rtnl_unlock();
- netif_start_queue(netdev_vport->dev);
+ netif_start_queue(vport->dev);
return vport;
-error_free_netdev:
+error_unlock:
rtnl_unlock();
- free_netdev(netdev_vport->dev);
+ free_percpu(vport->dev->tstats);
+error_free_netdev:
+ free_netdev(vport->dev);
error_free_vport:
ovs_vport_free(vport);
error:
@@ -207,49 +232,49 @@ error:
static void internal_dev_destroy(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
- netif_stop_queue(netdev_vport->dev);
+ netif_stop_queue(vport->dev);
rtnl_lock();
- dev_set_promiscuity(netdev_vport->dev, -1);
+ dev_set_promiscuity(vport->dev, -1);
/* unregister_netdevice() waits for an RCU grace period. */
- unregister_netdevice(netdev_vport->dev);
-
+ unregister_netdevice(vport->dev);
+ free_percpu(vport->dev->tstats);
rtnl_unlock();
}
-static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
+static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
{
- struct net_device *netdev = netdev_vport_priv(vport)->dev;
- int len;
+ struct net_device *netdev = skb->dev;
+ struct pcpu_sw_netstats *stats;
if (unlikely(!(netdev->flags & IFF_UP))) {
kfree_skb(skb);
- return 0;
+ netdev->stats.rx_dropped++;
+ return NETDEV_TX_OK;
}
- len = skb->len;
-
skb_dst_drop(skb);
nf_reset(skb);
secpath_reset(skb);
- skb->dev = netdev;
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, netdev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
- netif_rx(skb);
+ stats = this_cpu_ptr(netdev->tstats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ u64_stats_update_end(&stats->syncp);
- return len;
+ netif_rx(skb);
+ return NETDEV_TX_OK;
}
static struct vport_ops ovs_internal_vport_ops = {
.type = OVS_VPORT_TYPE_INTERNAL,
.create = internal_dev_create,
.destroy = internal_dev_destroy,
- .get_name = ovs_netdev_get_name,
.send = internal_dev_recv,
};
diff --git a/kernel/net/openvswitch/vport-netdev.c b/kernel/net/openvswitch/vport-netdev.c
index 33e6d6e29..6b0190b98 100644
--- a/kernel/net/openvswitch/vport-netdev.c
+++ b/kernel/net/openvswitch/vport-netdev.c
@@ -26,18 +26,24 @@
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/openvswitch.h>
+#include <linux/export.h>
-#include <net/llc.h>
+#include <net/ip_tunnels.h>
+#include <net/rtnetlink.h>
#include "datapath.h"
+#include "vport.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
static struct vport_ops ovs_netdev_vport_ops;
/* Must be called with rcu_read_lock. */
-static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
+static void netdev_port_receive(struct sk_buff *skb)
{
+ struct vport *vport;
+
+ vport = ovs_netdev_get_vport(skb->dev);
if (unlikely(!vport))
goto error;
@@ -53,10 +59,8 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
skb_push(skb, ETH_HLEN);
ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
-
- ovs_vport_receive(vport, skb, NULL);
+ ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
return;
-
error:
kfree_skb(skb);
}
@@ -65,15 +69,11 @@ error:
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
- struct vport *vport;
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
- vport = ovs_netdev_get_vport(skb->dev);
-
- netdev_port_receive(vport, skb);
-
+ netdev_port_receive(skb);
return RX_HANDLER_CONSUMED;
}
@@ -83,139 +83,116 @@ static struct net_device *get_dpdev(const struct datapath *dp)
local = ovs_vport_ovsl(dp, OVSP_LOCAL);
BUG_ON(!local);
- return netdev_vport_priv(local)->dev;
+ return local->dev;
}
-static struct vport *netdev_create(const struct vport_parms *parms)
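+/* Bind an already-allocated vport to the named net_device and hook the
+ * device's receive path into the datapath.
+ */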
+struct vport *ovs_netdev_link(struct vport *vport, const char *name)
{
- struct vport *vport;
- struct netdev_vport *netdev_vport;
int err;
- vport = ovs_vport_alloc(sizeof(struct netdev_vport),
- &ovs_netdev_vport_ops, parms);
- if (IS_ERR(vport)) {
- err = PTR_ERR(vport);
- goto error;
- }
-
- netdev_vport = netdev_vport_priv(vport);
-
- netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
- if (!netdev_vport->dev) {
+ vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name);
+ if (!vport->dev) {
err = -ENODEV;
goto error_free_vport;
}
- if (netdev_vport->dev->flags & IFF_LOOPBACK ||
- netdev_vport->dev->type != ARPHRD_ETHER ||
- ovs_is_internal_dev(netdev_vport->dev)) {
+ if (vport->dev->flags & IFF_LOOPBACK ||
+ vport->dev->type != ARPHRD_ETHER ||
+ ovs_is_internal_dev(vport->dev)) {
err = -EINVAL;
goto error_put;
}
rtnl_lock();
- err = netdev_master_upper_dev_link(netdev_vport->dev,
+ err = netdev_master_upper_dev_link(vport->dev,
get_dpdev(vport->dp));
if (err)
goto error_unlock;
- err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
+ err = netdev_rx_handler_register(vport->dev, netdev_frame_hook,
vport);
if (err)
goto error_master_upper_dev_unlink;
- dev_disable_lro(netdev_vport->dev);
- dev_set_promiscuity(netdev_vport->dev, 1);
- netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
+ dev_disable_lro(vport->dev);
+ dev_set_promiscuity(vport->dev, 1);
+ vport->dev->priv_flags |= IFF_OVS_DATAPATH;
rtnl_unlock();
return vport;
error_master_upper_dev_unlink:
- netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
+ netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp));
error_unlock:
rtnl_unlock();
error_put:
- dev_put(netdev_vport->dev);
+ dev_put(vport->dev);
error_free_vport:
ovs_vport_free(vport);
-error:
return ERR_PTR(err);
}
+EXPORT_SYMBOL_GPL(ovs_netdev_link);
-static void free_port_rcu(struct rcu_head *rcu)
+static struct vport *netdev_create(const struct vport_parms *parms)
{
- struct netdev_vport *netdev_vport = container_of(rcu,
- struct netdev_vport, rcu);
+ struct vport *vport;
+
+ vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
- dev_put(netdev_vport->dev);
- ovs_vport_free(vport_from_priv(netdev_vport));
+ return ovs_netdev_link(vport, parms->name);
}
-void ovs_netdev_detach_dev(struct vport *vport)
+static void vport_netdev_free(struct rcu_head *rcu)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+ struct vport *vport = container_of(rcu, struct vport, rcu);
+ if (vport->dev)
+ dev_put(vport->dev);
+ ovs_vport_free(vport);
+}
+
+void ovs_netdev_detach_dev(struct vport *vport)
+{
ASSERT_RTNL();
- netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
- netdev_rx_handler_unregister(netdev_vport->dev);
- netdev_upper_dev_unlink(netdev_vport->dev,
- netdev_master_upper_dev_get(netdev_vport->dev));
- dev_set_promiscuity(netdev_vport->dev, -1);
+ vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
+ netdev_rx_handler_unregister(vport->dev);
+ netdev_upper_dev_unlink(vport->dev,
+ netdev_master_upper_dev_get(vport->dev));
+ dev_set_promiscuity(vport->dev, -1);
}
+EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev);
static void netdev_destroy(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
rtnl_lock();
- if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
ovs_netdev_detach_dev(vport);
rtnl_unlock();
- call_rcu(&netdev_vport->rcu, free_port_rcu);
-}
-
-const char *ovs_netdev_get_name(const struct vport *vport)
-{
- const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- return netdev_vport->dev->name;
-}
-
-static unsigned int packet_length(const struct sk_buff *skb)
-{
- unsigned int length = skb->len - ETH_HLEN;
-
- if (skb->protocol == htons(ETH_P_8021Q))
- length -= VLAN_HLEN;
-
- return length;
+ call_rcu(&vport->rcu, vport_netdev_free);
}
-static int netdev_send(struct vport *vport, struct sk_buff *skb)
+void ovs_netdev_tunnel_destroy(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- int mtu = netdev_vport->dev->mtu;
- int len;
-
- if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
- net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
- netdev_vport->dev->name,
- packet_length(skb), mtu);
- goto drop;
- }
-
- skb->dev = netdev_vport->dev;
- len = skb->len;
- dev_queue_xmit(skb);
+ rtnl_lock();
+ if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ ovs_netdev_detach_dev(vport);
- return len;
+ /* We can be invoked by both explicit vport deletion and
+ * underlying netdev deregistration; delete the link only
+ * if it's not already shutting down.
+ */
+ if (vport->dev->reg_state == NETREG_REGISTERED)
+ rtnl_delete_link(vport->dev);
+ dev_put(vport->dev);
+ vport->dev = NULL;
+ rtnl_unlock();
-drop:
- kfree_skb(skb);
- return 0;
+ call_rcu(&vport->rcu, vport_netdev_free);
}
+EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy);
/* Returns null if this device is not attached to a datapath. */
struct vport *ovs_netdev_get_vport(struct net_device *dev)
@@ -231,8 +208,7 @@ static struct vport_ops ovs_netdev_vport_ops = {
.type = OVS_VPORT_TYPE_NETDEV,
.create = netdev_create,
.destroy = netdev_destroy,
- .get_name = ovs_netdev_get_name,
- .send = netdev_send,
+ .send = dev_queue_xmit,
};
int __init ovs_netdev_init(void)
diff --git a/kernel/net/openvswitch/vport-netdev.h b/kernel/net/openvswitch/vport-netdev.h
index 6f7038e79..19e29c12a 100644
--- a/kernel/net/openvswitch/vport-netdev.h
+++ b/kernel/net/openvswitch/vport-netdev.h
@@ -26,22 +26,11 @@
struct vport *ovs_netdev_get_vport(struct net_device *dev);
-struct netdev_vport {
- struct rcu_head rcu;
-
- struct net_device *dev;
-};
-
-static inline struct netdev_vport *
-netdev_vport_priv(const struct vport *vport)
-{
- return vport_priv(vport);
-}
-
-const char *ovs_netdev_get_name(const struct vport *);
+struct vport *ovs_netdev_link(struct vport *vport, const char *name);
void ovs_netdev_detach_dev(struct vport *);
int __init ovs_netdev_init(void);
void ovs_netdev_exit(void);
+void ovs_netdev_tunnel_destroy(struct vport *vport);
#endif /* vport_netdev.h */
diff --git a/kernel/net/openvswitch/vport-vxlan.c b/kernel/net/openvswitch/vport-vxlan.c
index 6d39766e7..d933cb89e 100644
--- a/kernel/net/openvswitch/vport-vxlan.c
+++ b/kernel/net/openvswitch/vport-vxlan.c
@@ -17,94 +17,37 @@
* 02110-1301, USA
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/net.h>
-#include <linux/rculist.h>
-#include <linux/udp.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/openvswitch.h>
#include <linux/module.h>
-
-#include <net/icmp.h>
-#include <net/ip.h>
#include <net/udp.h>
#include <net/ip_tunnels.h>
#include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
#include <net/vxlan.h>
#include "datapath.h"
#include "vport.h"
-#include "vport-vxlan.h"
-
-/**
- * struct vxlan_port - Keeps track of open UDP ports
- * @vs: vxlan_sock created for the port.
- * @name: vport name.
- */
-struct vxlan_port {
- struct vxlan_sock *vs;
- char name[IFNAMSIZ];
- u32 exts; /* VXLAN_F_* in <net/vxlan.h> */
-};
-
-static struct vport_ops ovs_vxlan_vport_ops;
-
-static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
-{
- return vport_priv(vport);
-}
+#include "vport-netdev.h"
-/* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
- struct vxlan_metadata *md)
-{
- struct ovs_tunnel_info tun_info;
- struct vxlan_port *vxlan_port;
- struct vport *vport = vs->data;
- struct iphdr *iph;
- struct ovs_vxlan_opts opts = {
- .gbp = md->gbp,
- };
- __be64 key;
- __be16 flags;
-
- flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0);
- vxlan_port = vxlan_vport(vport);
- if (vxlan_port->exts & VXLAN_F_GBP && md->gbp)
- flags |= TUNNEL_VXLAN_OPT;
-
- /* Save outer tunnel values */
- iph = ip_hdr(skb);
- key = cpu_to_be64(ntohl(md->vni) >> 8);
- ovs_flow_tun_info_init(&tun_info, iph,
- udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, flags, &opts, sizeof(opts));
-
- ovs_vport_receive(vport, skb, &tun_info);
-}
+static struct vport_ops ovs_vxlan_netdev_vport_ops;
static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+ struct vxlan_dev *vxlan = netdev_priv(vport->dev);
+ __be16 dst_port = vxlan->cfg.dst_port;
if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
return -EMSGSIZE;
- if (vxlan_port->exts) {
+ if (vxlan->flags & VXLAN_F_GBP) {
struct nlattr *exts;
exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
if (!exts)
return -EMSGSIZE;
- if (vxlan_port->exts & VXLAN_F_GBP &&
+ if (vxlan->flags & VXLAN_F_GBP &&
nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
return -EMSGSIZE;
@@ -114,23 +57,14 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
return 0;
}
-static void vxlan_tnl_destroy(struct vport *vport)
-{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
-
- vxlan_sock_release(vxlan_port->vs);
-
- ovs_vport_deferred_free(vport);
-}
-
-static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = {
+static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = {
[OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
};
-static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
+static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr,
+ struct vxlan_config *conf)
{
- struct nlattr *exts[OVS_VXLAN_EXT_MAX+1];
- struct vxlan_port *vxlan_port;
+ struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1];
int err;
if (nla_len(attr) < sizeof(struct nlattr))
@@ -140,10 +74,8 @@ static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
if (err < 0)
return err;
- vxlan_port = vxlan_vport(vport);
-
if (exts[OVS_VXLAN_EXT_GBP])
- vxlan_port->exts |= VXLAN_F_GBP;
+ conf->flags |= VXLAN_F_GBP;
return 0;
}
@@ -152,166 +84,84 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
{
struct net *net = ovs_dp_get_net(parms->dp);
struct nlattr *options = parms->options;
- struct vxlan_port *vxlan_port;
- struct vxlan_sock *vs;
+ struct net_device *dev;
struct vport *vport;
struct nlattr *a;
- u16 dst_port;
int err;
+ struct vxlan_config conf = {
+ .no_share = true,
+ .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX,
+ };
if (!options) {
err = -EINVAL;
goto error;
}
+
a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
if (a && nla_len(a) == sizeof(u16)) {
- dst_port = nla_get_u16(a);
+ conf.dst_port = htons(nla_get_u16(a));
} else {
/* Require destination port from userspace. */
err = -EINVAL;
goto error;
}
- vport = ovs_vport_alloc(sizeof(struct vxlan_port),
- &ovs_vxlan_vport_ops, parms);
+ vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms);
if (IS_ERR(vport))
return vport;
- vxlan_port = vxlan_vport(vport);
- strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
-
a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
if (a) {
- err = vxlan_configure_exts(vport, a);
+ err = vxlan_configure_exts(vport, a, &conf);
if (err) {
ovs_vport_free(vport);
goto error;
}
}
- vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true,
- vxlan_port->exts);
- if (IS_ERR(vs)) {
+ rtnl_lock();
+ dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
ovs_vport_free(vport);
- return (void *)vs;
+ return ERR_CAST(dev);
}
- vxlan_port->vs = vs;
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
return vport;
-
error:
return ERR_PTR(err);
}
-static int vxlan_ext_gbp(struct sk_buff *skb)
+static struct vport *vxlan_create(const struct vport_parms *parms)
{
- const struct ovs_tunnel_info *tun_info;
- const struct ovs_vxlan_opts *opts;
-
- tun_info = OVS_CB(skb)->egress_tun_info;
- opts = tun_info->options;
-
- if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT &&
- tun_info->options_len >= sizeof(*opts))
- return opts->gbp;
- else
- return 0;
-}
-
-static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- struct sock *sk = vxlan_port->vs->sock->sk;
- __be16 dst_port = inet_sk(sk)->inet_sport;
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct vxlan_metadata md = {0};
- struct rtable *rt;
- struct flowi4 fl;
- __be16 src_port;
- __be16 df;
- int err;
- u32 vxflags;
-
- if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
- err = -EINVAL;
- goto error;
- }
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
- rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto error;
- }
-
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
- htons(IP_DF) : 0;
-
- skb->ignore_df = 1;
-
- src_port = udp_flow_src_port(net, skb, 0, 0, true);
- md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
- md.gbp = vxlan_ext_gbp(skb);
- vxflags = vxlan_port->exts |
- (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0);
-
- err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
- src_port, dst_port,
- &md, false, vxflags);
- if (err < 0)
- ip_rt_put(rt);
- return err;
-error:
- kfree_skb(skb);
- return err;
-}
-
-static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
- __be16 src_port;
- int port_min;
- int port_max;
-
- inet_get_local_port_range(net, &port_min, &port_max);
- src_port = udp_flow_src_port(net, skb, 0, 0, true);
+ struct vport *vport;
- return ovs_tunnel_get_egress_info(egress_tun_info, net,
- OVS_CB(skb)->egress_tun_info,
- IPPROTO_UDP, skb->mark,
- src_port, dst_port);
-}
+ vport = vxlan_tnl_create(parms);
+ if (IS_ERR(vport))
+ return vport;
-static const char *vxlan_get_name(const struct vport *vport)
-{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- return vxlan_port->name;
+ return ovs_netdev_link(vport, parms->name);
}
-static struct vport_ops ovs_vxlan_vport_ops = {
- .type = OVS_VPORT_TYPE_VXLAN,
- .create = vxlan_tnl_create,
- .destroy = vxlan_tnl_destroy,
- .get_name = vxlan_get_name,
- .get_options = vxlan_get_options,
- .send = vxlan_tnl_send,
- .get_egress_tun_info = vxlan_get_egress_tun_info,
- .owner = THIS_MODULE,
+static struct vport_ops ovs_vxlan_netdev_vport_ops = {
+ .type = OVS_VPORT_TYPE_VXLAN,
+ .create = vxlan_create,
+ .destroy = ovs_netdev_tunnel_destroy,
+ .get_options = vxlan_get_options,
+ .send = dev_queue_xmit,
};
static int __init ovs_vxlan_tnl_init(void)
{
- return ovs_vport_ops_register(&ovs_vxlan_vport_ops);
+ return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops);
}
static void __exit ovs_vxlan_tnl_exit(void)
{
- ovs_vport_ops_unregister(&ovs_vxlan_vport_ops);
+ ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops);
}
module_init(ovs_vxlan_tnl_init);
diff --git a/kernel/net/openvswitch/vport-vxlan.h b/kernel/net/openvswitch/vport-vxlan.h
deleted file mode 100644
index 4b08233e7..000000000
--- a/kernel/net/openvswitch/vport-vxlan.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef VPORT_VXLAN_H
-#define VPORT_VXLAN_H 1
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-struct ovs_vxlan_opts {
- __u32 gbp;
-};
-
-#endif
diff --git a/kernel/net/openvswitch/vport.c b/kernel/net/openvswitch/vport.c
index 067a3fff1..31cbc8c5c 100644
--- a/kernel/net/openvswitch/vport.c
+++ b/kernel/net/openvswitch/vport.c
@@ -34,9 +34,6 @@
#include "vport.h"
#include "vport-internal_dev.h"
-static void ovs_vport_record_error(struct vport *,
- enum vport_err_type err_type);
-
static LIST_HEAD(vport_ops_list);
/* Protected by RCU read lock for reading, ovs_mutex for writing. */
@@ -74,7 +71,7 @@ static struct hlist_head *hash_bucket(const struct net *net, const char *name)
return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)];
}
-int ovs_vport_ops_register(struct vport_ops *ops)
+int __ovs_vport_ops_register(struct vport_ops *ops)
{
int err = -EEXIST;
struct vport_ops *o;
@@ -90,7 +87,7 @@ errout:
ovs_unlock();
return err;
}
-EXPORT_SYMBOL_GPL(ovs_vport_ops_register);
+EXPORT_SYMBOL_GPL(__ovs_vport_ops_register);
void ovs_vport_ops_unregister(struct vport_ops *ops)
{
@@ -113,7 +110,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
struct vport *vport;
hlist_for_each_entry_rcu(vport, bucket, hash_node)
- if (!strcmp(name, vport->ops->get_name(vport)) &&
+ if (!strcmp(name, ovs_vport_name(vport)) &&
net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
@@ -157,12 +154,6 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
return ERR_PTR(-EINVAL);
}
- vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!vport->percpu_stats) {
- kfree(vport);
- return ERR_PTR(-ENOMEM);
- }
-
return vport;
}
EXPORT_SYMBOL_GPL(ovs_vport_alloc);
@@ -183,7 +174,6 @@ void ovs_vport_free(struct vport *vport)
* it is safe to use raw dereference.
*/
kfree(rcu_dereference_raw(vport->upcall_portids));
- free_percpu(vport->percpu_stats);
kfree(vport);
}
EXPORT_SYMBOL_GPL(ovs_vport_free);
@@ -226,7 +216,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms)
}
bucket = hash_bucket(ovs_dp_get_net(vport->dp),
- vport->ops->get_name(vport));
+ ovs_vport_name(vport));
hlist_add_head_rcu(&vport->hash_node, bucket);
return vport;
}
@@ -266,8 +256,8 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
*
* @vport: vport to delete.
*
- * Detaches @vport from its datapath and destroys it. It is possible to fail
- * for reasons such as lack of memory. ovs_mutex must be held.
+ * Detaches @vport from its datapath and destroys it. ovs_mutex must
+ * be held.
*/
void ovs_vport_del(struct vport *vport)
{
@@ -290,41 +280,19 @@ void ovs_vport_del(struct vport *vport)
*/
void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
{
- int i;
-
- memset(stats, 0, sizeof(*stats));
-
- /* We potentially have 2 sources of stats that need to be combined:
- * those we have collected (split into err_stats and percpu_stats) from
- * set_stats() and device error stats from netdev->get_stats() (for
- * errors that happen downstream and therefore aren't reported through
- * our vport_record_error() function).
- * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS).
- * netdev-stats can be directly read over netlink-ioctl.
- */
-
- stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors);
- stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors);
- stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped);
- stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped);
-
- for_each_possible_cpu(i) {
- const struct pcpu_sw_netstats *percpu_stats;
- struct pcpu_sw_netstats local_stats;
- unsigned int start;
-
- percpu_stats = per_cpu_ptr(vport->percpu_stats, i);
-
- do {
- start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
- local_stats = *percpu_stats;
- } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
-
- stats->rx_bytes += local_stats.rx_bytes;
- stats->rx_packets += local_stats.rx_packets;
- stats->tx_bytes += local_stats.tx_bytes;
- stats->tx_packets += local_stats.tx_packets;
- }
+ const struct rtnl_link_stats64 *dev_stats;
+ struct rtnl_link_stats64 temp;
+
+ dev_stats = dev_get_stats(vport->dev, &temp);
+ stats->rx_errors = dev_stats->rx_errors;
+ stats->tx_errors = dev_stats->tx_errors;
+ stats->tx_dropped = dev_stats->tx_dropped;
+ stats->rx_dropped = dev_stats->rx_dropped;
+
+ stats->rx_bytes = dev_stats->rx_bytes;
+ stats->rx_packets = dev_stats->rx_packets;
+ stats->tx_bytes = dev_stats->tx_bytes;
+ stats->tx_packets = dev_stats->tx_packets;
}
/**
@@ -468,94 +436,34 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
* Must be called with rcu_read_lock. The packet cannot be shared and
* skb->data should point to the Ethernet header.
*/
-void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
- const struct ovs_tunnel_info *tun_info)
+int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
+ const struct ip_tunnel_info *tun_info)
{
- struct pcpu_sw_netstats *stats;
struct sw_flow_key key;
int error;
- stats = this_cpu_ptr(vport->percpu_stats);
- u64_stats_update_begin(&stats->syncp);
- stats->rx_packets++;
- stats->rx_bytes += skb->len +
- (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
- u64_stats_update_end(&stats->syncp);
-
OVS_CB(skb)->input_vport = vport;
- OVS_CB(skb)->egress_tun_info = NULL;
+ OVS_CB(skb)->mru = 0;
+ if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) {
+ u32 mark;
+
+ mark = skb->mark;
+ skb_scrub_packet(skb, true);
+ skb->mark = mark;
+ tun_info = NULL;
+ }
+
/* Extract flow from 'skb' into 'key'. */
error = ovs_flow_key_extract(tun_info, skb, &key);
if (unlikely(error)) {
kfree_skb(skb);
- return;
+ return error;
}
ovs_dp_process_packet(skb, &key);
+ return 0;
}
EXPORT_SYMBOL_GPL(ovs_vport_receive);
-/**
- * ovs_vport_send - send a packet on a device
- *
- * @vport: vport on which to send the packet
- * @skb: skb to send
- *
- * Sends the given packet and returns the length of data sent. Either ovs
- * lock or rcu_read_lock must be held.
- */
-int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
-{
- int sent = vport->ops->send(vport, skb);
-
- if (likely(sent > 0)) {
- struct pcpu_sw_netstats *stats;
-
- stats = this_cpu_ptr(vport->percpu_stats);
-
- u64_stats_update_begin(&stats->syncp);
- stats->tx_packets++;
- stats->tx_bytes += sent;
- u64_stats_update_end(&stats->syncp);
- } else if (sent < 0) {
- ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
- } else {
- ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
- }
- return sent;
-}
-
-/**
- * ovs_vport_record_error - indicate device error to generic stats layer
- *
- * @vport: vport that encountered the error
- * @err_type: one of enum vport_err_type types to indicate the error type
- *
- * If using the vport generic stats layer indicate that an error of the given
- * type has occurred.
- */
-static void ovs_vport_record_error(struct vport *vport,
- enum vport_err_type err_type)
-{
- switch (err_type) {
- case VPORT_E_RX_DROPPED:
- atomic_long_inc(&vport->err_stats.rx_dropped);
- break;
-
- case VPORT_E_RX_ERROR:
- atomic_long_inc(&vport->err_stats.rx_errors);
- break;
-
- case VPORT_E_TX_DROPPED:
- atomic_long_inc(&vport->err_stats.tx_dropped);
- break;
-
- case VPORT_E_TX_ERROR:
- atomic_long_inc(&vport->err_stats.tx_errors);
- break;
- }
-
-}
-
static void free_vport_rcu(struct rcu_head *rcu)
{
struct vport *vport = container_of(rcu, struct vport, rcu);
@@ -572,56 +480,32 @@ void ovs_vport_deferred_free(struct vport *vport)
}
EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
- struct net *net,
- const struct ovs_tunnel_info *tun_info,
- u8 ipproto,
- u32 skb_mark,
- __be16 tp_src,
- __be16 tp_dst)
+static unsigned int packet_length(const struct sk_buff *skb)
{
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct rtable *rt;
- struct flowi4 fl;
-
- if (unlikely(!tun_info))
- return -EINVAL;
-
- tun_key = &tun_info->tunnel;
-
- /* Route lookup to get source IP address.
- * The process may need to be changed if the corresponding process
- * in vports ops changed.
- */
- rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto);
- if (IS_ERR(rt))
- return PTR_ERR(rt);
+ unsigned int length = skb->len - ETH_HLEN;
- ip_rt_put(rt);
+ if (skb->protocol == htons(ETH_P_8021Q))
+ length -= VLAN_HLEN;
- /* Generate egress_tun_info based on tun_info,
- * saddr, tp_src and tp_dst
- */
- __ovs_flow_tun_info_init(egress_tun_info,
- fl.saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos,
- tun_key->ipv4_ttl,
- tp_src, tp_dst,
- tun_key->tun_id,
- tun_key->tun_flags,
- tun_info->options,
- tun_info->options_len);
-
- return 0;
+ return length;
}
-EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info);
-int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *info)
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
- /* get_egress_tun_info() is only implemented on tunnel ports. */
- if (unlikely(!vport->ops->get_egress_tun_info))
- return -EINVAL;
+ int mtu = vport->dev->mtu;
+
+ if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
+ net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
+ vport->dev->name,
+ packet_length(skb), mtu);
+ vport->dev->stats.tx_errors++;
+ goto drop;
+ }
+
+ skb->dev = vport->dev;
+ vport->ops->send(skb);
+ return;
- return vport->ops->get_egress_tun_info(vport, skb, info);
+drop:
+ kfree_skb(skb);
}
diff --git a/kernel/net/openvswitch/vport.h b/kernel/net/openvswitch/vport.h
index bc85331a6..8ea3a9698 100644
--- a/kernel/net/openvswitch/vport.h
+++ b/kernel/net/openvswitch/vport.h
@@ -35,10 +35,6 @@ struct vport_parms;
/* The following definitions are for users of the vport subsystem: */
-struct vport_net {
- struct vport __rcu *gre_vport;
-};
-
int ovs_vport_init(void);
void ovs_vport_exit(void);
@@ -56,26 +52,6 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids);
int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *);
u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *);
-int ovs_vport_send(struct vport *, struct sk_buff *);
-
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
- struct net *net,
- const struct ovs_tunnel_info *tun_info,
- u8 ipproto,
- u32 skb_mark,
- __be16 tp_src,
- __be16 tp_dst);
-int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *info);
-
-/* The following definitions are for implementers of vport devices: */
-
-struct vport_err_stats {
- atomic_long_t rx_dropped;
- atomic_long_t rx_errors;
- atomic_long_t tx_dropped;
- atomic_long_t tx_errors;
-};
/**
* struct vport_portids - array of netlink portids of a vport.
* must be protected by rcu.
@@ -101,12 +77,10 @@ struct vport_portids {
* @hash_node: Element in @dev_table hash table in vport.c.
* @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
* @ops: Class structure.
- * @percpu_stats: Points to per-CPU statistics used and maintained by vport
- * @err_stats: Points to error statistics used and maintained by vport
* @detach_list: list used for detaching vport in net-exit call.
*/
struct vport {
- struct rcu_head rcu;
+ struct net_device *dev;
struct datapath *dp;
struct vport_portids __rcu *upcall_portids;
u16 port_no;
@@ -115,10 +89,8 @@ struct vport {
struct hlist_node dp_hash_node;
const struct vport_ops *ops;
- struct pcpu_sw_netstats __percpu *percpu_stats;
-
- struct vport_err_stats err_stats;
struct list_head detach_list;
+ struct rcu_head rcu;
};
/**
@@ -155,11 +127,8 @@ struct vport_parms {
* @get_options: Appends vport-specific attributes for the configuration of an
* existing vport to a &struct sk_buff. May be %NULL for a vport that does not
* have any configuration.
- * @get_name: Get the device's name.
- * @send: Send a packet on the device. Returns the length of the packet sent,
+ * @send: Send a packet on the device.
* zero for dropped packets or negative for error.
- * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for
- * a packet.
*/
struct vport_ops {
enum ovs_vport_type type;
@@ -171,24 +140,11 @@ struct vport_ops {
int (*set_options)(struct vport *, struct nlattr *);
int (*get_options)(const struct vport *, struct sk_buff *);
- /* Called with rcu_read_lock or ovs_mutex. */
- const char *(*get_name)(const struct vport *);
-
- int (*send)(struct vport *, struct sk_buff *);
- int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
- struct ovs_tunnel_info *);
-
+ netdev_tx_t (*send) (struct sk_buff *skb);
struct module *owner;
struct list_head list;
};
-enum vport_err_type {
- VPORT_E_RX_DROPPED,
- VPORT_E_RX_ERROR,
- VPORT_E_TX_DROPPED,
- VPORT_E_TX_ERROR,
-};
-
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
const struct vport_parms *);
void ovs_vport_free(struct vport *);
@@ -225,8 +181,8 @@ static inline struct vport *vport_from_priv(void *priv)
return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
}
-void ovs_vport_receive(struct vport *, struct sk_buff *,
- const struct ovs_tunnel_info *);
+int ovs_vport_receive(struct vport *, struct sk_buff *,
+ const struct ip_tunnel_info *);
static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
@@ -235,11 +191,22 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
}
-int ovs_vport_ops_register(struct vport_ops *ops);
+static inline const char *ovs_vport_name(struct vport *vport)
+{
+ return vport->dev->name;
+}
+
+int __ovs_vport_ops_register(struct vport_ops *ops);
+#define ovs_vport_ops_register(ops) \
+ ({ \
+ (ops)->owner = THIS_MODULE; \
+ __ovs_vport_ops_register(ops); \
+ })
+
void ovs_vport_ops_unregister(struct vport_ops *ops);
static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
- const struct ovs_key_ipv4_tunnel *key,
+ const struct ip_tunnel_key *key,
u32 mark,
struct flowi4 *fl,
u8 protocol)
@@ -247,13 +214,16 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
struct rtable *rt;
memset(fl, 0, sizeof(*fl));
- fl->daddr = key->ipv4_dst;
- fl->saddr = key->ipv4_src;
- fl->flowi4_tos = RT_TOS(key->ipv4_tos);
+ fl->daddr = key->u.ipv4.dst;
+ fl->saddr = key->u.ipv4.src;
+ fl->flowi4_tos = RT_TOS(key->tos);
fl->flowi4_mark = mark;
fl->flowi4_proto = protocol;
rt = ip_route_output_key(net, fl);
return rt;
}
+
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb);
+
#endif /* vport.h */
diff --git a/kernel/net/packet/af_packet.c b/kernel/net/packet/af_packet.c
index f9f259247..2c23bae0e 100644
--- a/kernel/net/packet/af_packet.c
+++ b/kernel/net/packet/af_packet.c
@@ -93,6 +93,7 @@
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
+#include <linux/bpf.h>
#include "internal.h"
@@ -230,6 +231,8 @@ struct packet_skb_cb {
} sa;
};
+#define vio_le() virtio_legacy_is_little_endian()
+
#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
@@ -519,13 +522,11 @@ static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
}
static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
- int tx_ring,
struct sk_buff_head *rb_queue)
{
struct tpacket_kbdq_core *pkc;
- pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
- GET_PBDQC_FROM_RB(&po->rx_ring);
+ pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
spin_lock_bh(&rb_queue->lock);
pkc->delete_blk_timer = 1;
@@ -544,15 +545,11 @@ static void prb_init_blk_timer(struct packet_sock *po,
pkc->retire_blk_timer.expires = jiffies;
}
-static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
+static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
struct tpacket_kbdq_core *pkc;
- if (tx_ring)
- BUG();
-
- pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
- GET_PBDQC_FROM_RB(&po->rx_ring);
+ pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}
@@ -608,7 +605,7 @@ static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
static void init_prb_bdqc(struct packet_sock *po,
struct packet_ring_buffer *rb,
struct pgv *pg_vec,
- union tpacket_req_u *req_u, int tx_ring)
+ union tpacket_req_u *req_u)
{
struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd;
@@ -635,7 +632,7 @@ static void init_prb_bdqc(struct packet_sock *po,
p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
prb_init_ft_ops(p1, req_u);
- prb_setup_retire_blk_timer(po, tx_ring);
+ prb_setup_retire_blk_timer(po);
prb_open_block(p1, pbd);
}
@@ -1235,27 +1232,81 @@ static void packet_free_pending(struct packet_sock *po)
free_percpu(po->tx_ring.pending_refcnt);
}
-static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+#define ROOM_POW_OFF 2
+#define ROOM_NONE 0x0
+#define ROOM_LOW 0x1
+#define ROOM_NORMAL 0x2
+
+static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+{
+ int idx, len;
+
+ len = po->rx_ring.frame_max + 1;
+ idx = po->rx_ring.head;
+ if (pow_off)
+ idx += len >> pow_off;
+ if (idx >= len)
+ idx -= len;
+ return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+{
+ int idx, len;
+
+ len = po->rx_ring.prb_bdqc.knum_blocks;
+ idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+ if (pow_off)
+ idx += len >> pow_off;
+ if (idx >= len)
+ idx -= len;
+ return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
struct sock *sk = &po->sk;
- bool has_room;
+ int ret = ROOM_NONE;
+
+ if (po->prot_hook.func != tpacket_rcv) {
+ int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+ - (skb ? skb->truesize : 0);
+ if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+ return ROOM_NORMAL;
+ else if (avail > 0)
+ return ROOM_LOW;
+ else
+ return ROOM_NONE;
+ }
- if (po->prot_hook.func != tpacket_rcv)
- return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
- <= sk->sk_rcvbuf;
+ if (po->tp_version == TPACKET_V3) {
+ if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
+ ret = ROOM_NORMAL;
+ else if (__tpacket_v3_has_room(po, 0))
+ ret = ROOM_LOW;
+ } else {
+ if (__tpacket_has_room(po, ROOM_POW_OFF))
+ ret = ROOM_NORMAL;
+ else if (__tpacket_has_room(po, 0))
+ ret = ROOM_LOW;
+ }
- spin_lock(&sk->sk_receive_queue.lock);
- if (po->tp_version == TPACKET_V3)
- has_room = prb_lookup_block(po, &po->rx_ring,
- po->rx_ring.prb_bdqc.kactive_blk_num,
- TP_STATUS_KERNEL);
- else
- has_room = packet_lookup_frame(po, &po->rx_ring,
- po->rx_ring.head,
- TP_STATUS_KERNEL);
- spin_unlock(&sk->sk_receive_queue.lock);
+ return ret;
+}
- return has_room;
+static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+ int ret;
+ bool has_room;
+
+ spin_lock_bh(&po->sk.sk_receive_queue.lock);
+ ret = __packet_rcv_has_room(po, skb);
+ has_room = ret == ROOM_NORMAL;
+ if (po->pressure == has_room)
+ po->pressure = !has_room;
+ spin_unlock_bh(&po->sk.sk_receive_queue.lock);
+
+ return ret;
}
static void packet_sock_destruct(struct sock *sk)
@@ -1273,6 +1324,20 @@ static void packet_sock_destruct(struct sock *sk)
sk_refcnt_debug_dec(sk);
}
+static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
+{
+ u32 rxhash;
+ int i, count = 0;
+
+ rxhash = skb_get_hash(skb);
+ for (i = 0; i < ROLLOVER_HLEN; i++)
+ if (po->rollover->history[i] == rxhash)
+ count++;
+
+ po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
+ return count > (ROLLOVER_HLEN >> 1);
+}
+
static unsigned int fanout_demux_hash(struct packet_fanout *f,
struct sk_buff *skb,
unsigned int num)
@@ -1305,22 +1370,40 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f,
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
struct sk_buff *skb,
- unsigned int idx, unsigned int skip,
+ unsigned int idx, bool try_self,
unsigned int num)
{
- unsigned int i, j;
+ struct packet_sock *po, *po_next, *po_skip = NULL;
+ unsigned int i, j, room = ROOM_NONE;
- i = j = min_t(int, f->next[idx], num - 1);
+ po = pkt_sk(f->arr[idx]);
+
+ if (try_self) {
+ room = packet_rcv_has_room(po, skb);
+ if (room == ROOM_NORMAL ||
+ (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
+ return idx;
+ po_skip = po;
+ }
+
+ i = j = min_t(int, po->rollover->sock, num - 1);
do {
- if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
+ po_next = pkt_sk(f->arr[i]);
+ if (po_next != po_skip && !po_next->pressure &&
+ packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j)
- f->next[idx] = i;
+ po->rollover->sock = i;
+ atomic_long_inc(&po->rollover->num);
+ if (room == ROOM_LOW)
+ atomic_long_inc(&po->rollover->num_huge);
return i;
}
+
if (++i == num)
i = 0;
} while (i != j);
+ atomic_long_inc(&po->rollover->num_failed);
return idx;
}
@@ -1331,6 +1414,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f,
return skb_get_queue_mapping(skb) % num;
}
+static unsigned int fanout_demux_bpf(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ struct bpf_prog *prog;
+ unsigned int ret = 0;
+
+ rcu_read_lock();
+ prog = rcu_dereference(f->bpf_prog);
+ if (prog)
+ ret = bpf_prog_run_clear_cb(prog, skb) % num;
+ rcu_read_unlock();
+
+ return ret;
+}
+
static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
return f->flags & (flag >> 8);
@@ -1341,17 +1440,17 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
{
struct packet_fanout *f = pt->af_packet_priv;
unsigned int num = READ_ONCE(f->num_members);
+ struct net *net = read_pnet(&f->net);
struct packet_sock *po;
unsigned int idx;
- if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
- !num) {
+ if (!net_eq(dev_net(dev), net) || !num) {
kfree_skb(skb);
return 0;
}
if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
- skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
+ skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
if (!skb)
return 0;
}
@@ -1373,17 +1472,18 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
idx = fanout_demux_qm(f, skb, num);
break;
case PACKET_FANOUT_ROLLOVER:
- idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
+ idx = fanout_demux_rollover(f, skb, 0, false, num);
+ break;
+ case PACKET_FANOUT_CBPF:
+ case PACKET_FANOUT_EBPF:
+ idx = fanout_demux_bpf(f, skb, num);
break;
}
- po = pkt_sk(f->arr[idx]);
- if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
- unlikely(!packet_rcv_has_room(po, skb))) {
- idx = fanout_demux_rollover(f, skb, idx, idx, num);
- po = pkt_sk(f->arr[idx]);
- }
+ if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
+ idx = fanout_demux_rollover(f, skb, idx, true, num);
+ po = pkt_sk(f->arr[idx]);
return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
@@ -1420,10 +1520,107 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
- if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
- return true;
+ if (sk->sk_family != PF_PACKET)
+ return false;
- return false;
+ return ptype->af_packet_priv == pkt_sk(sk)->fanout;
+}
+
+static void fanout_init_data(struct packet_fanout *f)
+{
+ switch (f->type) {
+ case PACKET_FANOUT_LB:
+ atomic_set(&f->rr_cur, 0);
+ break;
+ case PACKET_FANOUT_CBPF:
+ case PACKET_FANOUT_EBPF:
+ RCU_INIT_POINTER(f->bpf_prog, NULL);
+ break;
+ }
+}
+
+static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
+{
+ struct bpf_prog *old;
+
+ spin_lock(&f->lock);
+ old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
+ rcu_assign_pointer(f->bpf_prog, new);
+ spin_unlock(&f->lock);
+
+ if (old) {
+ synchronize_net();
+ bpf_prog_destroy(old);
+ }
+}
+
+static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
+ unsigned int len)
+{
+ struct bpf_prog *new;
+ struct sock_fprog fprog;
+ int ret;
+
+ if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+ if (len != sizeof(fprog))
+ return -EINVAL;
+ if (copy_from_user(&fprog, data, len))
+ return -EFAULT;
+
+ ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
+ if (ret)
+ return ret;
+
+ __fanout_set_data_bpf(po->fanout, new);
+ return 0;
+}
+
+static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
+ unsigned int len)
+{
+ struct bpf_prog *new;
+ u32 fd;
+
+ if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+ if (len != sizeof(fd))
+ return -EINVAL;
+ if (copy_from_user(&fd, data, len))
+ return -EFAULT;
+
+ new = bpf_prog_get(fd);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+ bpf_prog_put(new);
+ return -EINVAL;
+ }
+
+ __fanout_set_data_bpf(po->fanout, new);
+ return 0;
+}
+
+static int fanout_set_data(struct packet_sock *po, char __user *data,
+ unsigned int len)
+{
+ switch (po->fanout->type) {
+ case PACKET_FANOUT_CBPF:
+ return fanout_set_data_cbpf(po, data, len);
+ case PACKET_FANOUT_EBPF:
+ return fanout_set_data_ebpf(po, data, len);
+ default:
+ return -EINVAL;
+ };
+}
+
+static void fanout_release_data(struct packet_fanout *f)
+{
+ switch (f->type) {
+ case PACKET_FANOUT_CBPF:
+ case PACKET_FANOUT_EBPF:
+ __fanout_set_data_bpf(f, NULL);
+ };
}
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
@@ -1443,6 +1640,8 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
case PACKET_FANOUT_CPU:
case PACKET_FANOUT_RND:
case PACKET_FANOUT_QM:
+ case PACKET_FANOUT_CBPF:
+ case PACKET_FANOUT_EBPF:
break;
default:
return -EINVAL;
@@ -1454,6 +1653,16 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
if (po->fanout)
return -EALREADY;
+ if (type == PACKET_FANOUT_ROLLOVER ||
+ (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
+ po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
+ if (!po->rollover)
+ return -ENOMEM;
+ atomic_long_set(&po->rollover->num, 0);
+ atomic_long_set(&po->rollover->num_huge, 0);
+ atomic_long_set(&po->rollover->num_failed, 0);
+ }
+
mutex_lock(&fanout_mutex);
match = NULL;
list_for_each_entry(f, &fanout_list, list) {
@@ -1475,10 +1684,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->id = id;
match->type = type;
match->flags = flags;
- atomic_set(&match->rr_cur, 0);
INIT_LIST_HEAD(&match->list);
spin_lock_init(&match->lock);
atomic_set(&match->sk_ref, 0);
+ fanout_init_data(match);
match->prot_hook.type = po->prot_hook.type;
match->prot_hook.dev = po->prot_hook.dev;
match->prot_hook.func = packet_rcv_fanout;
@@ -1502,6 +1711,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
}
out:
mutex_unlock(&fanout_mutex);
+ if (err) {
+ kfree(po->rollover);
+ po->rollover = NULL;
+ }
return err;
}
@@ -1520,9 +1733,27 @@ static void fanout_release(struct sock *sk)
if (atomic_dec_and_test(&f->sk_ref)) {
list_del(&f->list);
dev_remove_pack(&f->prot_hook);
+ fanout_release_data(f);
kfree(f);
}
mutex_unlock(&fanout_mutex);
+
+ if (po->rollover)
+ kfree_rcu(po->rollover, rcu);
+}
+
+static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
+ struct sk_buff *skb)
+{
+ /* Earlier code assumed this would be a VLAN pkt, double-check
+ * this now that we have the actual packet in hand. We can only
+ * do this check on Ethernet devices.
+ */
+ if (unlikely(dev->type != ARPHRD_ETHER))
+ return false;
+
+ skb_reset_mac_header(skb);
+ return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}
static const struct proto_ops packet_ops;
@@ -1686,18 +1917,10 @@ retry:
goto retry;
}
- if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
- /* Earlier code assumed this would be a VLAN pkt,
- * double-check this now that we have the actual
- * packet in hand.
- */
- struct ethhdr *ehdr;
- skb_reset_mac_header(skb);
- ehdr = eth_hdr(skb);
- if (ehdr->h_proto != htons(ETH_P_8021Q)) {
- err = -EMSGSIZE;
- goto out_unlock;
- }
+ if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
+ !packet_extra_vlan_len_allowed(dev, skb)) {
+ err = -EMSGSIZE;
+ goto out_unlock;
}
skb->protocol = proto;
@@ -1723,16 +1946,16 @@ out_free:
return err;
}
-static unsigned int run_filter(const struct sk_buff *skb,
- const struct sock *sk,
- unsigned int res)
+static unsigned int run_filter(struct sk_buff *skb,
+ const struct sock *sk,
+ unsigned int res)
{
struct sk_filter *filter;
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter != NULL)
- res = SK_RUN_FILTER(filter, skb);
+ res = bpf_prog_run_clear_cb(filter->prog, skb);
rcu_read_unlock();
return res;
@@ -2107,8 +2330,8 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
static bool ll_header_truncated(const struct net_device *dev, int len)
{
/* net device doesn't like empty head */
- if (unlikely(len <= dev->hard_header_len)) {
- net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
+ if (unlikely(len < dev->hard_header_len)) {
+ net_warn_ratelimited("%s: packet size is too short (%d < %d)\n",
current->comm, len, dev->hard_header_len);
return true;
}
@@ -2116,6 +2339,15 @@ static bool ll_header_truncated(const struct net_device *dev, int len)
return false;
}
+static void tpacket_set_protocol(const struct net_device *dev,
+ struct sk_buff *skb)
+{
+ if (dev->type == ARPHRD_ETHER) {
+ skb_reset_mac_header(skb);
+ skb->protocol = eth_hdr(skb)->h_proto;
+ }
+}
+
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
void *frame, struct net_device *dev, int size_max,
__be16 proto, unsigned char *addr, int hlen)
@@ -2152,8 +2384,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
skb_reserve(skb, hlen);
skb_reset_network_header(skb);
- if (!packet_use_direct_xmit(po))
- skb_probe_transport_header(skb, 0);
if (unlikely(po->tp_tx_has_off)) {
int off_min, off_max, off;
off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
@@ -2199,6 +2429,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
dev->hard_header_len);
if (unlikely(err))
return err;
+ if (!skb->protocol)
+ tpacket_set_protocol(dev, skb);
data += dev->hard_header_len;
to_write -= dev->hard_header_len;
@@ -2233,6 +2465,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
len = ((to_write > len_max) ? len_max : to_write);
}
+ skb_probe_transport_header(skb, 0);
+
return tp_len;
}
@@ -2277,12 +2511,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_put;
- reserve = dev->hard_header_len + VLAN_HLEN;
+ if (po->sk.sk_socket->type == SOCK_RAW)
+ reserve = dev->hard_header_len;
size_max = po->tx_ring.frame_size
- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
- if (size_max > dev->mtu + reserve)
- size_max = dev->mtu + reserve;
+ if (size_max > dev->mtu + reserve + VLAN_HLEN)
+ size_max = dev->mtu + reserve + VLAN_HLEN;
do {
ph = packet_current_frame(po, &po->tx_ring,
@@ -2309,18 +2544,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
addr, hlen);
if (likely(tp_len >= 0) &&
- tp_len > dev->mtu + dev->hard_header_len) {
- struct ethhdr *ehdr;
- /* Earlier code assumed this would be a VLAN pkt,
- * double-check this now that we have the actual
- * packet in hand.
- */
+ tp_len > dev->mtu + reserve &&
+ !packet_extra_vlan_len_allowed(dev, skb))
+ tp_len = -EMSGSIZE;
- skb_reset_mac_header(skb);
- ehdr = eth_hdr(skb);
- if (ehdr->h_proto != htons(ETH_P_8021Q))
- tp_len = -EMSGSIZE;
- }
if (unlikely(tp_len < 0)) {
if (po->tp_loss) {
__packet_set_status(po, ph,
@@ -2414,6 +2641,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
__be16 proto;
unsigned char *addr;
int err, reserve = 0;
+ struct sockcm_cookie sockc;
struct virtio_net_hdr vnet_hdr = { 0 };
int offset = 0;
int vnet_hdr_len;
@@ -2449,6 +2677,13 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_unlock;
+ sockc.mark = sk->sk_mark;
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (unlikely(err))
+ goto out_unlock;
+ }
+
if (sock->type == SOCK_RAW)
reserve = dev->hard_header_len;
if (po->has_vnet_hdr) {
@@ -2466,15 +2701,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
goto out_unlock;
if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
- (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
- __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
- __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
- vnet_hdr.hdr_len = __cpu_to_virtio16(false,
- __virtio16_to_cpu(false, vnet_hdr.csum_start) +
- __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
+ (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) +
+ __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 >
+ __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len)))
+ vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(),
+ __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) +
+ __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2);
err = -EINVAL;
- if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
+ if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len)
goto out_unlock;
if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
@@ -2517,7 +2752,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
- __virtio16_to_cpu(false, vnet_hdr.hdr_len),
+ __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
msg->msg_flags & MSG_DONTWAIT, &err);
if (skb == NULL)
goto out_unlock;
@@ -2541,31 +2776,23 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
- if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
- /* Earlier code assumed this would be a VLAN pkt,
- * double-check this now that we have the actual
- * packet in hand.
- */
- struct ethhdr *ehdr;
- skb_reset_mac_header(skb);
- ehdr = eth_hdr(skb);
- if (ehdr->h_proto != htons(ETH_P_8021Q)) {
- err = -EMSGSIZE;
- goto out_free;
- }
+ if (!gso_type && (len > dev->mtu + reserve + extra_len) &&
+ !packet_extra_vlan_len_allowed(dev, skb)) {
+ err = -EMSGSIZE;
+ goto out_free;
}
skb->protocol = proto;
skb->dev = dev;
skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = sockc.mark;
packet_pick_tx_queue(dev, skb);
if (po->has_vnet_hdr) {
if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
- u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
+ u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start);
+ u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset);
if (!skb_partial_csum_set(skb, s, o)) {
err = -EINVAL;
goto out_free;
@@ -2573,7 +2800,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
}
skb_shinfo(skb)->gso_size =
- __virtio16_to_cpu(false, vnet_hdr.gso_size);
+ __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size);
skb_shinfo(skb)->gso_type = gso_type;
/* Header must be checked, and gso_segs computed. */
@@ -2583,8 +2810,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
len += vnet_hdr_len;
}
- if (!packet_use_direct_xmit(po))
- skb_probe_transport_header(skb, reserve);
+ skb_probe_transport_header(skb, reserve);
+
if (unlikely(extra_len == 4))
skb->no_fcs = 1;
@@ -2687,22 +2914,40 @@ static int packet_release(struct socket *sock)
* Attach a packet hook.
*/
-static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
+static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
+ __be16 proto)
{
struct packet_sock *po = pkt_sk(sk);
struct net_device *dev_curr;
__be16 proto_curr;
bool need_rehook;
+ struct net_device *dev = NULL;
+ int ret = 0;
+ bool unlisted = false;
- if (po->fanout) {
- if (dev)
- dev_put(dev);
-
+ if (po->fanout)
return -EINVAL;
- }
lock_sock(sk);
spin_lock(&po->bind_lock);
+ rcu_read_lock();
+
+ if (name) {
+ dev = dev_get_by_name_rcu(sock_net(sk), name);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+ } else if (ifindex) {
+ dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ if (dev)
+ dev_hold(dev);
proto_curr = po->prot_hook.type;
dev_curr = po->prot_hook.dev;
@@ -2710,14 +2955,29 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
need_rehook = proto_curr != proto || dev_curr != dev;
if (need_rehook) {
- unregister_prot_hook(sk, true);
+ if (po->running) {
+ rcu_read_unlock();
+ __unregister_prot_hook(sk, true);
+ rcu_read_lock();
+ dev_curr = po->prot_hook.dev;
+ if (dev)
+ unlisted = !dev_get_by_index_rcu(sock_net(sk),
+ dev->ifindex);
+ }
po->num = proto;
po->prot_hook.type = proto;
- po->prot_hook.dev = dev;
- po->ifindex = dev ? dev->ifindex : 0;
- packet_cached_dev_assign(po, dev);
+ if (unlikely(unlisted)) {
+ dev_put(dev);
+ po->prot_hook.dev = NULL;
+ po->ifindex = -1;
+ packet_cached_dev_reset(po);
+ } else {
+ po->prot_hook.dev = dev;
+ po->ifindex = dev ? dev->ifindex : 0;
+ packet_cached_dev_assign(po, dev);
+ }
}
if (dev_curr)
dev_put(dev_curr);
@@ -2725,7 +2985,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
if (proto == 0 || !need_rehook)
goto out_unlock;
- if (!dev || (dev->flags & IFF_UP)) {
+ if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
register_prot_hook(sk);
} else {
sk->sk_err = ENETDOWN;
@@ -2734,9 +2994,10 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
}
out_unlock:
+ rcu_read_unlock();
spin_unlock(&po->bind_lock);
release_sock(sk);
- return 0;
+ return ret;
}
/*
@@ -2748,8 +3009,6 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
{
struct sock *sk = sock->sk;
char name[15];
- struct net_device *dev;
- int err = -ENODEV;
/*
* Check legality
@@ -2759,19 +3018,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
return -EINVAL;
strlcpy(name, uaddr->sa_data, sizeof(name));
- dev = dev_get_by_name(sock_net(sk), name);
- if (dev)
- err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
- return err;
+ return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
}
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
struct sock *sk = sock->sk;
- struct net_device *dev = NULL;
- int err;
-
/*
* Check legality
@@ -2782,16 +3035,8 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len
if (sll->sll_family != AF_PACKET)
return -EINVAL;
- if (sll->sll_ifindex) {
- err = -ENODEV;
- dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
- if (dev == NULL)
- goto out;
- }
- err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
-
-out:
- return err;
+ return packet_do_bind(sk, NULL, sll->sll_ifindex,
+ sll->sll_protocol ? : pkt_sk(sk)->num);
}
static struct proto packet_proto = {
@@ -2821,7 +3066,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
sock->state = SS_UNCONNECTED;
err = -ENOBUFS;
- sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
+ sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
if (sk == NULL)
goto out;
@@ -2851,6 +3096,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
spin_lock_init(&po->bind_lock);
mutex_init(&po->pg_vec_lock);
+ po->rollover = NULL;
po->prot_hook.func = packet_rcv;
if (sock->type == SOCK_PACKET)
@@ -2928,6 +3174,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (skb == NULL)
goto out;
+ if (pkt_sk(sk)->pressure)
+ packet_rcv_has_room(pkt_sk(sk), NULL);
+
if (pkt_sk(sk)->has_vnet_hdr) {
struct virtio_net_hdr vnet_hdr = { 0 };
@@ -2943,9 +3192,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
/* This is a hint as to how much should be linear. */
vnet_hdr.hdr_len =
- __cpu_to_virtio16(false, skb_headlen(skb));
+ __cpu_to_virtio16(vio_le(), skb_headlen(skb));
vnet_hdr.gso_size =
- __cpu_to_virtio16(false, sinfo->gso_size);
+ __cpu_to_virtio16(vio_le(), sinfo->gso_size);
if (sinfo->gso_type & SKB_GSO_TCPV4)
vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
else if (sinfo->gso_type & SKB_GSO_TCPV6)
@@ -2963,9 +3212,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (skb->ip_summed == CHECKSUM_PARTIAL) {
vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
- vnet_hdr.csum_start = __cpu_to_virtio16(false,
+ vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(),
skb_checksum_start_offset(skb));
- vnet_hdr.csum_offset = __cpu_to_virtio16(false,
+ vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(),
skb->csum_offset);
} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
@@ -3432,6 +3681,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
return fanout_add(sk, val & 0xffff, val >> 16);
}
+ case PACKET_FANOUT_DATA:
+ {
+ if (!po->fanout)
+ return -EINVAL;
+
+ return fanout_set_data(po, optval, optlen);
+ }
case PACKET_TX_HAS_OFF:
{
unsigned int val;
@@ -3471,6 +3727,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
struct packet_sock *po = pkt_sk(sk);
void *data = &val;
union tpacket_stats_u st;
+ struct tpacket_rollover_stats rstats;
if (level != SOL_PACKET)
return -ENOPROTOOPT;
@@ -3546,6 +3803,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
((u32)po->fanout->flags << 24)) :
0);
break;
+ case PACKET_ROLLOVER_STATS:
+ if (!po->rollover)
+ return -EINVAL;
+ rstats.tp_all = atomic_long_read(&po->rollover->num);
+ rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
+ rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
+ data = &rstats;
+ lv = sizeof(rstats);
+ break;
case PACKET_TX_HAS_OFF:
val = po->tp_tx_has_off;
break;
@@ -3683,6 +3949,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
TP_STATUS_KERNEL))
mask |= POLLIN | POLLRDNORM;
}
+ if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+ po->pressure = 0;
spin_unlock_bh(&sk->sk_receive_queue.lock);
spin_lock_bh(&sk->sk_write_queue.lock);
if (po->tx_ring.pg_vec) {
@@ -3842,7 +4110,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
err = -EINVAL;
if (unlikely((int)req->tp_block_size <= 0))
goto out;
- if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
+ if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
goto out;
if (po->tp_version >= TPACKET_V3 &&
(int)(req->tp_block_size -
@@ -3854,8 +4122,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
goto out;
- rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
- if (unlikely(rb->frames_per_block <= 0))
+ rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
+ if (unlikely(rb->frames_per_block == 0))
goto out;
if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
req->tp_frame_nr))
@@ -3872,7 +4140,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
* it above but just being paranoid
*/
if (!tx_ring)
- init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
+ init_prb_bdqc(po, rb, pg_vec, req_u);
break;
default:
break;
@@ -3932,7 +4200,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (closing && (po->tp_version > TPACKET_V2)) {
/* Because we don't support block-based V3 on tx-ring */
if (!tx_ring)
- prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
+ prb_shutdown_retire_blk_timer(po, rb_queue);
}
release_sock(sk);
diff --git a/kernel/net/packet/internal.h b/kernel/net/packet/internal.h
index fe6e20cae..9ee46314b 100644
--- a/kernel/net/packet/internal.h
+++ b/kernel/net/packet/internal.h
@@ -79,15 +79,27 @@ struct packet_fanout {
u16 id;
u8 type;
u8 flags;
- atomic_t rr_cur;
+ union {
+ atomic_t rr_cur;
+ struct bpf_prog __rcu *bpf_prog;
+ };
struct list_head list;
struct sock *arr[PACKET_FANOUT_MAX];
- int next[PACKET_FANOUT_MAX];
spinlock_t lock;
atomic_t sk_ref;
struct packet_type prot_hook ____cacheline_aligned_in_smp;
};
+struct packet_rollover {
+ int sock;
+ struct rcu_head rcu;
+ atomic_long_t num;
+ atomic_long_t num_huge;
+ atomic_long_t num_failed;
+#define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32))
+ u32 history[ROLLOVER_HLEN] ____cacheline_aligned;
+} ____cacheline_aligned_in_smp;
+
struct packet_sock {
/* struct sock has to be the first member of packet_sock */
struct sock sk;
@@ -102,8 +114,10 @@ struct packet_sock {
auxdata:1,
origdev:1,
has_vnet_hdr:1;
+ int pressure;
int ifindex; /* bound device */
__be16 num;
+ struct packet_rollover *rollover;
struct packet_mclist *mclist;
atomic_t mapped;
enum tpacket_versions tp_version;
diff --git a/kernel/net/phonet/af_phonet.c b/kernel/net/phonet/af_phonet.c
index 32ab87d34..f92575366 100644
--- a/kernel/net/phonet/af_phonet.c
+++ b/kernel/net/phonet/af_phonet.c
@@ -97,7 +97,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
goto out;
}
- sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot);
+ sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern);
if (sk == NULL) {
err = -ENOMEM;
goto out;
@@ -377,6 +377,10 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
struct sockaddr_pn sa;
u16 len;
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
/* check we have at least a full Phonet header */
if (!pskb_pull(skb, sizeof(struct phonethdr)))
goto out;
diff --git a/kernel/net/phonet/pep.c b/kernel/net/phonet/pep.c
index 6de2aeb98..850a86cde 100644
--- a/kernel/net/phonet/pep.c
+++ b/kernel/net/phonet/pep.c
@@ -845,7 +845,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
}
/* Create a new to-be-accepted sock */
- newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot);
+ newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0);
if (!newsk) {
pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
err = -ENOBUFS;
diff --git a/kernel/net/rds/af_rds.c b/kernel/net/rds/af_rds.c
index 10443377f..b5476aebd 100644
--- a/kernel/net/rds/af_rds.c
+++ b/kernel/net/rds/af_rds.c
@@ -40,15 +40,6 @@
#include "rds.h"
-char *rds_str_array(char **array, size_t elements, size_t index)
-{
- if ((index < elements) && array[index])
- return array[index];
- else
- return "unknown";
-}
-EXPORT_SYMBOL(rds_str_array);
-
/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
static unsigned long rds_sock_count;
@@ -81,13 +72,7 @@ static int rds_release(struct socket *sock)
rds_clear_recv_queue(rs);
rds_cong_remove_socket(rs);
- /*
- * the binding lookup hash uses rcu, we need to
- * make sure we synchronize_rcu before we free our
- * entry
- */
rds_remove_bound(rs);
- synchronize_rcu();
rds_send_drop_to(rs, NULL);
rds_rdma_drop_keys(rs);
@@ -270,6 +255,28 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
return ret;
}
+static int rds_set_transport(struct rds_sock *rs, char __user *optval,
+ int optlen)
+{
+ int t_type;
+
+ if (rs->rs_transport)
+ return -EOPNOTSUPP; /* previously attached to transport */
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type)))
+ return -EFAULT;
+
+ if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
+ return -EINVAL;
+
+ rs->rs_transport = rds_trans_get(t_type);
+
+ return rs->rs_transport ? 0 : -ENOPROTOOPT;
+}
+
static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@@ -300,6 +307,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
case RDS_CONG_MONITOR:
ret = rds_cong_monitor(rs, optval, optlen);
break;
+ case SO_RDS_TRANSPORT:
+ lock_sock(sock->sk);
+ ret = rds_set_transport(rs, optval, optlen);
+ release_sock(sock->sk);
+ break;
default:
ret = -ENOPROTOOPT;
}
@@ -312,6 +324,7 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret = -ENOPROTOOPT, len;
+ int trans;
if (level != SOL_RDS)
goto out;
@@ -337,6 +350,19 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
else
ret = 0;
break;
+ case SO_RDS_TRANSPORT:
+ if (len < sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ trans = (rs->rs_transport ? rs->rs_transport->t_type :
+ RDS_TRANS_NONE); /* unbound */
+ if (put_user(trans, (int __user *)optval) ||
+ put_user(sizeof(int), optlen))
+ ret = -EFAULT;
+ else
+ ret = 0;
+ break;
default:
break;
}
@@ -406,6 +432,14 @@ static const struct proto_ops rds_proto_ops = {
.sendpage = sock_no_sendpage,
};
+static void rds_sock_destruct(struct sock *sk)
+{
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+
+ WARN_ON((&rs->rs_item != rs->rs_item.next ||
+ &rs->rs_item != rs->rs_item.prev));
+}
+
static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
struct rds_sock *rs;
@@ -413,6 +447,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
sock_init_data(sock, sk);
sock->ops = &rds_proto_ops;
sk->sk_protocol = protocol;
+ sk->sk_destruct = rds_sock_destruct;
rs = rds_sk_to_rs(sk);
spin_lock_init(&rs->rs_lock);
@@ -440,7 +475,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_SEQPACKET || protocol)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
+ sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern);
if (!sk)
return -ENOMEM;
@@ -538,6 +573,7 @@ static void rds_exit(void)
rds_threads_exit();
rds_stats_exit();
rds_page_exit();
+ rds_bind_lock_destroy();
rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
}
@@ -547,9 +583,14 @@ static int rds_init(void)
{
int ret;
- ret = rds_conn_init();
+ ret = rds_bind_lock_init();
if (ret)
goto out;
+
+ ret = rds_conn_init();
+ if (ret)
+ goto out_bind;
+
ret = rds_threads_init();
if (ret)
goto out_conn;
@@ -583,6 +624,8 @@ out_conn:
rds_conn_exit();
rds_cong_exit();
rds_page_exit();
+out_bind:
+ rds_bind_lock_destroy();
out:
return ret;
}
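
The af_rds.c hunks above add a one-shot SO_RDS_TRANSPORT socket option: rds_set_transport() refuses to run twice, insists on an int-sized option, bounds-checks the requested transport type and attaches it, while getsockopt reports RDS_TRANS_NONE for a socket that has not been bound to a transport. As a rough user-space sketch of that validation order (all names below are invented for illustration; this is not the kernel code):

    /* Minimal user-space model of the one-shot SO_RDS_TRANSPORT option;
     * toy_sock and set_transport are illustrative names, not kernel API. */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    enum { TRANS_NONE = -1, TRANS_IB, TRANS_IWARP, TRANS_TCP, TRANS_COUNT };

    struct toy_sock { int transport; };        /* TRANS_NONE until chosen */

    static int set_transport(struct toy_sock *rs, const void *optval, int optlen)
    {
        int t;

        if (rs->transport != TRANS_NONE)
            return -EOPNOTSUPP;         /* may only be attached once */
        if (optlen != sizeof(int))
            return -EINVAL;
        memcpy(&t, optval, sizeof(t));  /* stands in for copy_from_user() */
        if (t < 0 || t >= TRANS_COUNT)
            return -EINVAL;
        rs->transport = t;
        return 0;
    }

    int main(void)
    {
        struct toy_sock rs = { .transport = TRANS_NONE };
        int t = TRANS_TCP;

        printf("first set:  %d\n", set_transport(&rs, &t, sizeof(t)));  /* 0 */
        printf("second set: %d\n", set_transport(&rs, &t, sizeof(t)));  /* -EOPNOTSUPP */
        return 0;
    }
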
diff --git a/kernel/net/rds/bind.c b/kernel/net/rds/bind.c
index a2e6562da..b22ea9565 100644
--- a/kernel/net/rds/bind.c
+++ b/kernel/net/rds/bind.c
@@ -38,51 +38,16 @@
#include <linux/ratelimit.h>
#include "rds.h"
-#define BIND_HASH_SIZE 1024
-static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
-static DEFINE_SPINLOCK(rds_bind_lock);
+static struct rhashtable bind_hash_table;
-static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
-{
- return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
- (BIND_HASH_SIZE - 1));
-}
-
-static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
- struct rds_sock *insert)
-{
- struct rds_sock *rs;
- struct hlist_head *head = hash_to_bucket(addr, port);
- u64 cmp;
- u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(rs, head, rs_bound_node) {
- cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
- be16_to_cpu(rs->rs_bound_port);
-
- if (cmp == needle) {
- rcu_read_unlock();
- return rs;
- }
- }
- rcu_read_unlock();
-
- if (insert) {
- /*
- * make sure our addr and port are set before
- * we are added to the list, other people
- * in rcu will find us as soon as the
- * hlist_add_head_rcu is done
- */
- insert->rs_bound_addr = addr;
- insert->rs_bound_port = port;
- rds_sock_addref(insert);
-
- hlist_add_head_rcu(&insert->rs_bound_node, head);
- }
- return NULL;
-}
+static struct rhashtable_params ht_parms = {
+ .nelem_hint = 768,
+ .key_len = sizeof(u64),
+ .key_offset = offsetof(struct rds_sock, rs_bound_key),
+ .head_offset = offsetof(struct rds_sock, rs_bound_node),
+ .max_size = 16384,
+ .min_size = 1024,
+};
/*
* Return the rds_sock bound at the given local address.
@@ -92,10 +57,10 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
*/
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{
+ u64 key = ((u64)addr << 32) | port;
struct rds_sock *rs;
- rs = rds_bind_lookup(addr, port, NULL);
-
+ rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
@@ -103,15 +68,16 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
ntohs(port));
+
return rs;
}
/* returns -ve errno or +ve port */
static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
{
- unsigned long flags;
int ret = -EADDRINUSE;
u16 rover, last;
+ u64 key;
if (*port != 0) {
rover = be16_to_cpu(*port);
@@ -121,42 +87,49 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
last = rover - 1;
}
- spin_lock_irqsave(&rds_bind_lock, flags);
-
do {
if (rover == 0)
rover++;
- if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
+
+ key = ((u64)addr << 32) | cpu_to_be16(rover);
+ if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms))
+ continue;
+
+ rs->rs_bound_key = key;
+ rs->rs_bound_addr = addr;
+ rs->rs_bound_port = cpu_to_be16(rover);
+ rs->rs_bound_node.next = NULL;
+ rds_sock_addref(rs);
+ if (!rhashtable_insert_fast(&bind_hash_table,
+ &rs->rs_bound_node, ht_parms)) {
*port = rs->rs_bound_port;
ret = 0;
rdsdebug("rs %p binding to %pI4:%d\n",
rs, &addr, (int)ntohs(*port));
break;
+ } else {
+ rds_sock_put(rs);
+ ret = -ENOMEM;
+ break;
}
} while (rover++ != last);
- spin_unlock_irqrestore(&rds_bind_lock, flags);
-
return ret;
}
void rds_remove_bound(struct rds_sock *rs)
{
- unsigned long flags;
-
- spin_lock_irqsave(&rds_bind_lock, flags);
- if (rs->rs_bound_addr) {
- rdsdebug("rs %p unbinding from %pI4:%d\n",
- rs, &rs->rs_bound_addr,
- ntohs(rs->rs_bound_port));
+ if (!rs->rs_bound_addr)
+ return;
- hlist_del_init_rcu(&rs->rs_bound_node);
- rds_sock_put(rs);
- rs->rs_bound_addr = 0;
- }
+ rdsdebug("rs %p unbinding from %pI4:%d\n",
+ rs, &rs->rs_bound_addr,
+ ntohs(rs->rs_bound_port));
- spin_unlock_irqrestore(&rds_bind_lock, flags);
+ rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
+ rds_sock_put(rs);
+ rs->rs_bound_addr = 0;
}
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
@@ -181,7 +154,19 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (ret)
goto out;
- trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+ if (rs->rs_transport) { /* previously bound */
+ trans = rs->rs_transport;
+ if (trans->laddr_check(sock_net(sock->sk),
+ sin->sin_addr.s_addr) != 0) {
+ ret = -ENOPROTOOPT;
+ rds_remove_bound(rs);
+ } else {
+ ret = 0;
+ }
+ goto out;
+ }
+ trans = rds_trans_get_preferred(sock_net(sock->sk),
+ sin->sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
@@ -195,9 +180,15 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
out:
release_sock(sk);
-
- /* we might have called rds_remove_bound on error */
- if (ret)
- synchronize_rcu();
return ret;
}
+
+void rds_bind_lock_destroy(void)
+{
+ rhashtable_destroy(&bind_hash_table);
+}
+
+int rds_bind_lock_init(void)
+{
+ return rhashtable_init(&bind_hash_table, &ht_parms);
+}
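
bind.c above replaces the open-coded hash table and its global rds_bind_lock with an rhashtable keyed on rs_bound_key, a u64 that packs the bound address into the upper 32 bits and the port into the lower bits. A minimal sketch of that key layout in plain user-space C (toy names; the big-endian address and port are treated as opaque integers, exactly as rds_find_bound() does):

    /* Sketch of the packed 64-bit bind key hashed by the rhashtable. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t bind_key(uint32_t be_addr, uint16_t be_port)
    {
        return ((uint64_t)be_addr << 32) | be_port;
    }

    int main(void)
    {
        /* example raw values; on a little-endian host these correspond
         * to 127.0.0.1 and port 12345 in network byte order */
        uint64_t k = bind_key(0x0100007f, 0x3930);

        printf("key = 0x%016llx\n", (unsigned long long)k);
        return 0;
    }
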
diff --git a/kernel/net/rds/connection.c b/kernel/net/rds/connection.c
index da6da57e5..e3b118cae 100644
--- a/kernel/net/rds/connection.c
+++ b/kernel/net/rds/connection.c
@@ -70,7 +70,8 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
} while (0)
/* rcu read lock must be held or the connection spinlock */
-static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
+static struct rds_connection *rds_conn_lookup(struct net *net,
+ struct hlist_head *head,
__be32 laddr, __be32 faddr,
struct rds_transport *trans)
{
@@ -78,7 +79,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
- conn->c_trans == trans) {
+ conn->c_trans == trans && net == rds_conn_net(conn)) {
ret = conn;
break;
}
@@ -117,7 +118,8 @@ static void rds_conn_reset(struct rds_connection *conn)
* For now they are not garbage collected once they're created. They
* are torn down as the module is removed, if ever.
*/
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp,
int is_outgoing)
{
@@ -126,12 +128,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *loop_trans;
unsigned long flags;
int ret;
- struct rds_transport *otrans = trans;
- if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
- goto new_conn;
rcu_read_lock();
- conn = rds_conn_lookup(head, laddr, faddr, trans);
+ conn = rds_conn_lookup(net, head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
laddr == faddr && !is_outgoing) {
/* This is a looped back IB connection, and we're
@@ -145,7 +144,6 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
if (conn)
goto out;
-new_conn:
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (!conn) {
conn = ERR_PTR(-ENOMEM);
@@ -157,6 +155,7 @@ new_conn:
conn->c_faddr = faddr;
spin_lock_init(&conn->c_lock);
conn->c_next_tx_seq = 1;
+ rds_conn_net_set(conn, net);
init_waitqueue_head(&conn->c_waitq);
INIT_LIST_HEAD(&conn->c_send_queue);
@@ -174,7 +173,7 @@ new_conn:
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
- loop_trans = rds_trans_get_preferred(faddr);
+ loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
@@ -198,6 +197,7 @@ new_conn:
atomic_set(&conn->c_state, RDS_CONN_DOWN);
conn->c_send_gen = 0;
+ conn->c_outgoing = (is_outgoing ? 1 : 0);
conn->c_reconnect_jiffies = 0;
INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
@@ -234,22 +234,13 @@ new_conn:
/* Creating normal conn */
struct rds_connection *found;
- if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
- found = NULL;
- else
- found = rds_conn_lookup(head, laddr, faddr, trans);
+ found = rds_conn_lookup(net, head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
- if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
- (otrans->t_type != RDS_TRANS_TCP)) {
- /* Only the active side should be added to
- * reconnect list for TCP.
- */
- hlist_add_head_rcu(&conn->c_hash_node, head);
- }
+ hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
}
@@ -260,17 +251,19 @@ out:
return conn;
}
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
- return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
}
EXPORT_SYMBOL_GPL(rds_conn_create);
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
- return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
@@ -297,6 +290,8 @@ void rds_conn_shutdown(struct rds_connection *conn)
wait_event(conn->c_waitq,
!test_bit(RDS_IN_XMIT, &conn->c_flags));
+ wait_event(conn->c_waitq,
+ !test_bit(RDS_RECV_REFILL, &conn->c_flags));
conn->c_trans->conn_shutdown(conn);
rds_conn_reset(conn);
@@ -324,7 +319,9 @@ void rds_conn_shutdown(struct rds_connection *conn)
rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock();
- rds_queue_reconnect(conn);
+ if (conn->c_trans->t_type != RDS_TRANS_TCP ||
+ conn->c_outgoing == 1)
+ rds_queue_reconnect(conn);
} else {
rcu_read_unlock();
}
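
connection.c above threads a struct net through creation and lookup, so two namespaces can hold otherwise-identical (laddr, faddr, transport) connections, and replaces the TCP-specific special cases with a c_outgoing flag consulted at shutdown. A toy user-space model of the namespace-aware match (toy_conn and toy_lookup are illustrative, not the kernel structures):

    /* A connection now only matches if its owning namespace also matches. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct toy_conn {
        uint32_t laddr, faddr;
        const void *trans;              /* transport identity */
        const void *net;                /* owning network namespace */
        struct toy_conn *next;
    };

    static struct toy_conn *toy_lookup(struct toy_conn *head, const void *net,
                                       uint32_t laddr, uint32_t faddr,
                                       const void *trans)
    {
        struct toy_conn *c;

        for (c = head; c; c = c->next)
            if (c->laddr == laddr && c->faddr == faddr &&
                c->trans == trans && c->net == net)
                return c;
        return NULL;
    }

    int main(void)
    {
        int net_a, net_b, tcp;
        struct toy_conn c = { 1, 2, &tcp, &net_a, NULL };

        printf("same ns:  %s\n", toy_lookup(&c, &net_a, 1, 2, &tcp) ? "hit" : "miss");
        printf("other ns: %s\n", toy_lookup(&c, &net_b, 1, 2, &tcp) ? "hit" : "miss");
        return 0;
    }
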
diff --git a/kernel/net/rds/ib.c b/kernel/net/rds/ib.c
index ba2dffeff..f222885ac 100644
--- a/kernel/net/rds/ib.c
+++ b/kernel/net/rds/ib.c
@@ -43,14 +43,14 @@
#include "rds.h"
#include "ib.h"
-static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
-unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
+unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
-module_param(fmr_pool_size, int, 0444);
-MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
-module_param(fmr_message_size, int, 0444);
-MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+module_param(rds_ib_fmr_1m_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA");
+module_param(rds_ib_fmr_8k_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
@@ -97,10 +97,10 @@ static void rds_ib_dev_free(struct work_struct *work)
struct rds_ib_device *rds_ibdev = container_of(work,
struct rds_ib_device, free_work);
- if (rds_ibdev->mr_pool)
- rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
- if (rds_ibdev->mr)
- ib_dereg_mr(rds_ibdev->mr);
+ if (rds_ibdev->mr_8k_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
+ if (rds_ibdev->mr_1m_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
if (rds_ibdev->pd)
ib_dealloc_pd(rds_ibdev->pd);
@@ -150,9 +150,13 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
- rds_ibdev->max_fmrs = dev_attr->max_fmr ?
- min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
- fmr_pool_size;
+ rds_ibdev->max_1m_fmrs = dev_attr->max_mr ?
+ min_t(unsigned int, (dev_attr->max_mr / 2),
+ rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
+
+ rds_ibdev->max_8k_fmrs = dev_attr->max_mr ?
+ min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE),
+ rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
@@ -164,18 +168,25 @@ static void rds_ib_add_one(struct ib_device *device)
goto put_dev;
}
- rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(rds_ibdev->mr)) {
- rds_ibdev->mr = NULL;
+ rds_ibdev->mr_1m_pool =
+ rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
+ if (IS_ERR(rds_ibdev->mr_1m_pool)) {
+ rds_ibdev->mr_1m_pool = NULL;
goto put_dev;
}
- rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
- if (IS_ERR(rds_ibdev->mr_pool)) {
- rds_ibdev->mr_pool = NULL;
+ rds_ibdev->mr_8k_pool =
+ rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL);
+ if (IS_ERR(rds_ibdev->mr_8k_pool)) {
+ rds_ibdev->mr_8k_pool = NULL;
goto put_dev;
}
+ rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
+ dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
+ rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
+ rds_ibdev->max_8k_fmrs);
+
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);
@@ -230,11 +241,10 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
*
* This can be called at any time and can be racing with any other RDS path.
*/
-static void rds_ib_remove_one(struct ib_device *device)
+static void rds_ib_remove_one(struct ib_device *device, void *client_data)
{
- struct rds_ib_device *rds_ibdev;
+ struct rds_ib_device *rds_ibdev = client_data;
- rds_ibdev = ib_get_client_data(device, &rds_ib_client);
if (!rds_ibdev)
return;
@@ -317,7 +327,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
@@ -326,7 +336,7 @@ static int rds_ib_laddr_check(__be32 addr)
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+ cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
@@ -366,6 +376,7 @@ void rds_ib_exit(void)
rds_ib_sysctl_exit();
rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport);
+ rds_ib_fmr_exit();
}
struct rds_transport rds_ib_transport = {
@@ -401,10 +412,14 @@ int rds_ib_init(void)
INIT_LIST_HEAD(&rds_ib_devices);
- ret = ib_register_client(&rds_ib_client);
+ ret = rds_ib_fmr_init();
if (ret)
goto out;
+ ret = ib_register_client(&rds_ib_client);
+ if (ret)
+ goto out_fmr_exit;
+
ret = rds_ib_sysctl_init();
if (ret)
goto out_ibreg;
@@ -427,6 +442,8 @@ out_sysctl:
rds_ib_sysctl_exit();
out_ibreg:
rds_ib_unregister_client();
+out_fmr_exit:
+ rds_ib_fmr_exit();
out:
return ret;
}
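
ib.c above splits the single per-HCA FMR pool into an 8K pool and a 1M pool and caps each against the device's max_mr, with the 8K pool scaled up by RDS_MR_8K_SCALE because its MRs are much smaller. A small sketch of that sizing arithmetic, assuming an example max_mr of 65536 (the constants mirror the ib.h additions; everything else is illustrative):

    #include <stdio.h>

    #define FMR_1M_POOL_SIZE    (8192 / 2)
    #define FMR_8K_MSG_SIZE     2
    #define MR_8K_SCALE         (256 / (FMR_8K_MSG_SIZE + 1))
    #define FMR_8K_POOL_SIZE    (MR_8K_SCALE * (8192 / 2))

    static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

    int main(void)
    {
        unsigned int dev_max_mr = 65536;        /* example HCA limit */
        unsigned int max_1m = dev_max_mr ?
            min_u(dev_max_mr / 2, FMR_1M_POOL_SIZE) : FMR_1M_POOL_SIZE;
        unsigned int max_8k = dev_max_mr ?
            min_u((dev_max_mr / 2) * MR_8K_SCALE, FMR_8K_POOL_SIZE) : FMR_8K_POOL_SIZE;

        printf("1M FMRs: %u, 8K FMRs: %u\n", max_1m, max_8k);
        return 0;
    }
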
diff --git a/kernel/net/rds/ib.h b/kernel/net/rds/ib.h
index c36d71322..b3fdebb57 100644
--- a/kernel/net/rds/ib.h
+++ b/kernel/net/rds/ib.h
@@ -9,8 +9,11 @@
#include "rds.h"
#include "rdma_transport.h"
-#define RDS_FMR_SIZE 256
-#define RDS_FMR_POOL_SIZE 8192
+#define RDS_FMR_1M_POOL_SIZE (8192 / 2)
+#define RDS_FMR_1M_MSG_SIZE 256
+#define RDS_FMR_8K_MSG_SIZE 2
+#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1))
+#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
@@ -24,6 +27,9 @@
#define RDS_IB_RECYCLE_BATCH_COUNT 32
+#define RDS_IB_WC_MAX 32
+#define RDS_IB_SEND_OP BIT_ULL(63)
+
extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
@@ -69,7 +75,11 @@ struct rds_ib_connect_private {
struct rds_ib_send_work {
void *s_op;
- struct ib_send_wr s_wr;
+ union {
+ struct ib_send_wr s_wr;
+ struct ib_rdma_wr s_rdma_wr;
+ struct ib_atomic_wr s_atomic_wr;
+ };
struct ib_sge s_sge[RDS_IB_MAX_SGE];
unsigned long s_queued;
};
@@ -89,6 +99,20 @@ struct rds_ib_work_ring {
atomic_t w_free_ctr;
};
+/* Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_ib_ack_state {
+ u64 ack_next;
+ u64 ack_recv;
+ unsigned int ack_required:1;
+ unsigned int ack_next_valid:1;
+ unsigned int ack_recv_valid:1;
+};
+
+
struct rds_ib_device;
struct rds_ib_connection {
@@ -100,9 +124,14 @@ struct rds_ib_connection {
/* alphabet soup, IBTA style */
struct rdma_cm_id *i_cm_id;
struct ib_pd *i_pd;
- struct ib_mr *i_mr;
struct ib_cq *i_send_cq;
struct ib_cq *i_recv_cq;
+ struct ib_wc i_send_wc[RDS_IB_WC_MAX];
+ struct ib_wc i_recv_wc[RDS_IB_WC_MAX];
+
+ /* interrupt handling */
+ struct tasklet_struct i_send_tasklet;
+ struct tasklet_struct i_recv_tasklet;
/* tx */
struct rds_ib_work_ring i_send_ring;
@@ -113,7 +142,6 @@ struct rds_ib_connection {
atomic_t i_signaled_sends;
/* rx */
- struct tasklet_struct i_recv_tasklet;
struct mutex i_recv_mutex;
struct rds_ib_work_ring i_recv_ring;
struct rds_ib_incoming *i_ibinc;
@@ -165,6 +193,12 @@ struct rds_ib_connection {
struct rds_ib_ipaddr {
struct list_head list;
__be32 ipaddr;
+ struct rcu_head rcu;
+};
+
+enum {
+ RDS_IB_MR_8K_POOL,
+ RDS_IB_MR_1M_POOL,
};
struct rds_ib_device {
@@ -173,10 +207,12 @@ struct rds_ib_device {
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
- struct ib_mr *mr;
- struct rds_ib_mr_pool *mr_pool;
- unsigned int fmr_max_remaps;
unsigned int max_fmrs;
+ struct rds_ib_mr_pool *mr_1m_pool;
+ struct rds_ib_mr_pool *mr_8k_pool;
+ unsigned int fmr_max_remaps;
+ unsigned int max_8k_fmrs;
+ unsigned int max_1m_fmrs;
int max_sge;
unsigned int max_wrs;
unsigned int max_initiator_depth;
@@ -199,14 +235,14 @@ struct rds_ib_device {
struct rds_ib_statistics {
uint64_t s_ib_connect_raced;
uint64_t s_ib_listen_closed_stale;
- uint64_t s_ib_tx_cq_call;
+ uint64_t s_ib_evt_handler_call;
+ uint64_t s_ib_tasklet_call;
uint64_t s_ib_tx_cq_event;
uint64_t s_ib_tx_ring_full;
uint64_t s_ib_tx_throttle;
uint64_t s_ib_tx_sg_mapping_failure;
uint64_t s_ib_tx_stalled;
uint64_t s_ib_tx_credit_updates;
- uint64_t s_ib_rx_cq_call;
uint64_t s_ib_rx_cq_event;
uint64_t s_ib_rx_ring_empty;
uint64_t s_ib_rx_refill_from_cq;
@@ -218,12 +254,18 @@ struct rds_ib_statistics {
uint64_t s_ib_ack_send_delayed;
uint64_t s_ib_ack_send_piggybacked;
uint64_t s_ib_ack_received;
- uint64_t s_ib_rdma_mr_alloc;
- uint64_t s_ib_rdma_mr_free;
- uint64_t s_ib_rdma_mr_used;
- uint64_t s_ib_rdma_mr_pool_flush;
- uint64_t s_ib_rdma_mr_pool_wait;
- uint64_t s_ib_rdma_mr_pool_depleted;
+ uint64_t s_ib_rdma_mr_8k_alloc;
+ uint64_t s_ib_rdma_mr_8k_free;
+ uint64_t s_ib_rdma_mr_8k_used;
+ uint64_t s_ib_rdma_mr_8k_pool_flush;
+ uint64_t s_ib_rdma_mr_8k_pool_wait;
+ uint64_t s_ib_rdma_mr_8k_pool_depleted;
+ uint64_t s_ib_rdma_mr_1m_alloc;
+ uint64_t s_ib_rdma_mr_1m_free;
+ uint64_t s_ib_rdma_mr_1m_used;
+ uint64_t s_ib_rdma_mr_1m_pool_flush;
+ uint64_t s_ib_rdma_mr_1m_pool_wait;
+ uint64_t s_ib_rdma_mr_1m_pool_depleted;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
};
@@ -235,28 +277,34 @@ extern struct workqueue_struct *rds_ib_wq;
* doesn't define it.
*/
static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
- struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+ struct scatterlist *sglist,
+ unsigned int sg_dma_len,
+ int direction)
{
+ struct scatterlist *sg;
unsigned int i;
- for (i = 0; i < sg_dma_len; ++i) {
+ for_each_sg(sglist, sg, sg_dma_len, i) {
ib_dma_sync_single_for_cpu(dev,
- ib_sg_dma_address(dev, &sg[i]),
- ib_sg_dma_len(dev, &sg[i]),
+ ib_sg_dma_address(dev, sg),
+ ib_sg_dma_len(dev, sg),
direction);
}
}
#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
- struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+ struct scatterlist *sglist,
+ unsigned int sg_dma_len,
+ int direction)
{
+ struct scatterlist *sg;
unsigned int i;
- for (i = 0; i < sg_dma_len; ++i) {
+ for_each_sg(sglist, sg, sg_dma_len, i) {
ib_dma_sync_single_for_device(dev,
- ib_sg_dma_address(dev, &sg[i]),
- ib_sg_dma_len(dev, &sg[i]),
+ ib_sg_dma_address(dev, sg),
+ ib_sg_dma_len(dev, sg),
direction);
}
}
@@ -269,7 +317,8 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client;
-extern unsigned int fmr_message_size;
+extern unsigned int rds_ib_fmr_1m_pool_size;
+extern unsigned int rds_ib_fmr_8k_pool_size;
extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
@@ -299,7 +348,8 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
+ int npages);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
@@ -307,6 +357,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
+int rds_ib_fmr_init(void);
+void rds_ib_fmr_exit(void);
/* ib_recv.c */
int rds_ib_recv_init(void);
@@ -314,10 +366,11 @@ void rds_ib_recv_exit(void);
int rds_ib_recv(struct rds_connection *conn);
int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
void rds_ib_inc_free(struct rds_incoming *inc);
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
-void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc,
+ struct rds_ib_ack_state *state);
void rds_ib_recv_tasklet_fn(unsigned long data);
void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
@@ -325,6 +378,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
void rds_ib_attempt_ack(struct rds_ib_connection *ic);
void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
+void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required);
/* ib_ring.c */
void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
@@ -339,11 +393,10 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
extern wait_queue_head_t rds_ib_ring_empty_wait;
/* ib_send.c */
-char *rds_ib_wc_status_str(enum ib_wc_status status);
void rds_ib_xmit_complete(struct rds_connection *conn);
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
-void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
void rds_ib_send_init_ring(struct rds_ib_connection *ic);
void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
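
ib.h above reserves bit 63 of the work-request id (RDS_IB_SEND_OP) so a single completion handler can tell send completions from receive completions on a shared poll path. A sketch of that tagging scheme in plain C (BIT_ULL is expanded by hand; the helper names are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define SEND_OP (1ULL << 63)

    static uint64_t tag_send(uint64_t ring_index) { return ring_index | SEND_OP; }
    static int      is_send(uint64_t wr_id)       { return (wr_id & SEND_OP) != 0; }
    static uint64_t ring_index(uint64_t wr_id)    { return wr_id & ~SEND_OP; }

    int main(void)
    {
        uint64_t wr_id = tag_send(42);

        printf("send? %d, index %llu\n", is_send(wr_id),
               (unsigned long long)ring_index(wr_id));
        printf("recv? %d\n", is_send(7));   /* receive wr_ids keep bit 63 clear */
        return 0;
    }
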
diff --git a/kernel/net/rds/ib_cm.c b/kernel/net/rds/ib_cm.c
index 8a09ee7db..da5a7fb98 100644
--- a/kernel/net/rds/ib_cm.c
+++ b/kernel/net/rds/ib_cm.c
@@ -39,36 +39,6 @@
#include "rds.h"
#include "ib.h"
-static char *rds_ib_event_type_strings[] = {
-#define RDS_IB_EVENT_STRING(foo) \
- [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
- RDS_IB_EVENT_STRING(CQ_ERR),
- RDS_IB_EVENT_STRING(QP_FATAL),
- RDS_IB_EVENT_STRING(QP_REQ_ERR),
- RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
- RDS_IB_EVENT_STRING(COMM_EST),
- RDS_IB_EVENT_STRING(SQ_DRAINED),
- RDS_IB_EVENT_STRING(PATH_MIG),
- RDS_IB_EVENT_STRING(PATH_MIG_ERR),
- RDS_IB_EVENT_STRING(DEVICE_FATAL),
- RDS_IB_EVENT_STRING(PORT_ACTIVE),
- RDS_IB_EVENT_STRING(PORT_ERR),
- RDS_IB_EVENT_STRING(LID_CHANGE),
- RDS_IB_EVENT_STRING(PKEY_CHANGE),
- RDS_IB_EVENT_STRING(SM_CHANGE),
- RDS_IB_EVENT_STRING(SRQ_ERR),
- RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
- RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
- RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
-#undef RDS_IB_EVENT_STRING
-};
-
-static char *rds_ib_event_str(enum ib_event_type type)
-{
- return rds_str_array(rds_ib_event_type_strings,
- ARRAY_SIZE(rds_ib_event_type_strings), type);
-};
-
/*
* Set the selected protocol version
*/
@@ -165,7 +135,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_ib_recv_init_ring(ic);
/* Post receive buffers - as a side effect, this will update
* the posted credit count. */
- rds_ib_recv_refill(conn, 1);
+ rds_ib_recv_refill(conn, 1, GFP_KERNEL);
/* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr);
@@ -243,7 +213,97 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
{
rdsdebug("event %u (%s) data %p\n",
- event->event, rds_ib_event_str(event->event), data);
+ event->event, ib_event_msg(event->event), data);
+}
+
+/* Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring. Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_ib_stats_inc(s_ib_evt_handler_call);
+
+ tasklet_schedule(&ic->i_recv_tasklet);
+}
+
+static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
+ struct ib_wc *wcs,
+ struct rds_ib_ack_state *ack_state)
+{
+ int nr;
+ int i;
+ struct ib_wc *wc;
+
+ while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
+ for (i = 0; i < nr; i++) {
+ wc = wcs + i;
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc->wr_id, wc->status,
+ wc->byte_len, be32_to_cpu(wc->ex.imm_data));
+
+ if (wc->wr_id & RDS_IB_SEND_OP)
+ rds_ib_send_cqe_handler(ic, wc);
+ else
+ rds_ib_recv_cqe_handler(ic, wc, ack_state);
+ }
+ }
+}
+
+static void rds_ib_tasklet_fn_send(unsigned long data)
+{
+ struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
+ struct rds_connection *conn = ic->conn;
+ struct rds_ib_ack_state state;
+
+ rds_ib_stats_inc(s_ib_tasklet_call);
+
+ memset(&state, 0, sizeof(state));
+ poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+ ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
+ poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+
+ if (rds_conn_up(conn) &&
+ (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+ test_bit(0, &conn->c_map_queued)))
+ rds_send_xmit(ic->conn);
+}
+
+static void rds_ib_tasklet_fn_recv(unsigned long data)
+{
+ struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
+ struct rds_connection *conn = ic->conn;
+ struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+ struct rds_ib_ack_state state;
+
+ if (!rds_ibdev)
+ rds_conn_drop(conn);
+
+ rds_ib_stats_inc(s_ib_tasklet_call);
+
+ memset(&state, 0, sizeof(state));
+ poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+ ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+ poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+
+ if (state.ack_next_valid)
+ rds_ib_set_ack(ic, state.ack_next, state.ack_required);
+ if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+ rds_send_drop_acked(conn, state.ack_recv, NULL);
+ ic->i_ack_recv = state.ack_recv;
+ }
+
+ if (rds_conn_up(conn))
+ rds_ib_attempt_ack(ic);
}
static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -252,7 +312,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
struct rds_ib_connection *ic = conn->c_transport_data;
rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
- rds_ib_event_str(event->event));
+ ib_event_msg(event->event));
switch (event->event) {
case IB_EVENT_COMM_EST:
@@ -261,13 +321,25 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
default:
rdsdebug("Fatal QP Event %u (%s) "
"- connection %pI4->%pI4, reconnecting\n",
- event->event, rds_ib_event_str(event->event),
+ event->event, ib_event_msg(event->event),
&conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
break;
}
}
+static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_ib_stats_inc(s_ib_evt_handler_call);
+
+ tasklet_schedule(&ic->i_send_tasklet);
+}
+
/*
* This needs to be very careful to not leave IS_ERR pointers around for
* cleanup to trip over.
@@ -277,6 +349,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct rds_ib_connection *ic = conn->c_transport_data;
struct ib_device *dev = ic->i_cm_id->device;
struct ib_qp_init_attr attr;
+ struct ib_cq_init_attr cq_attr = {};
struct rds_ib_device *rds_ibdev;
int ret;
@@ -298,11 +371,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
/* Protection domain and memory range */
ic->i_pd = rds_ibdev->pd;
- ic->i_mr = rds_ibdev->mr;
- ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
+ cq_attr.cqe = ic->i_send_ring.w_nr + 1;
+
+ ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
rds_ib_cq_event_handler, conn,
- ic->i_send_ring.w_nr + 1, 0);
+ &cq_attr);
if (IS_ERR(ic->i_send_cq)) {
ret = PTR_ERR(ic->i_send_cq);
ic->i_send_cq = NULL;
@@ -310,9 +384,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
goto out;
}
- ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
+ cq_attr.cqe = ic->i_recv_ring.w_nr;
+ ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
rds_ib_cq_event_handler, conn,
- ic->i_recv_ring.w_nr, 0);
+ &cq_attr);
if (IS_ERR(ic->i_recv_cq)) {
ret = PTR_ERR(ic->i_recv_cq);
ic->i_recv_cq = NULL;
@@ -402,7 +477,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
rds_ib_recv_init_ack(ic);
- rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+ rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
ic->i_send_cq, ic->i_recv_cq);
out:
@@ -475,8 +550,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
(unsigned long long)be64_to_cpu(lguid),
(unsigned long long)be64_to_cpu(fguid));
- conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
- GFP_KERNEL);
+ /* RDS/IB is not currently netns aware, thus init_net */
+ conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
+ &rds_ib_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
@@ -592,7 +668,7 @@ int rds_ib_conn_connect(struct rds_connection *conn)
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
- ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
@@ -664,8 +740,18 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_recv_ring) &&
(atomic_read(&ic->i_signaled_sends) == 0));
+ tasklet_kill(&ic->i_send_tasklet);
tasklet_kill(&ic->i_recv_tasklet);
+ /* first destroy the ib state that generates callbacks */
+ if (ic->i_cm_id->qp)
+ rdma_destroy_qp(ic->i_cm_id);
+ if (ic->i_send_cq)
+ ib_destroy_cq(ic->i_send_cq);
+ if (ic->i_recv_cq)
+ ib_destroy_cq(ic->i_recv_cq);
+
+ /* then free the resources that ib callbacks use */
if (ic->i_send_hdrs)
ib_dma_free_coherent(dev,
ic->i_send_ring.w_nr *
@@ -689,12 +775,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
if (ic->i_recvs)
rds_ib_recv_clear_ring(ic);
- if (ic->i_cm_id->qp)
- rdma_destroy_qp(ic->i_cm_id);
- if (ic->i_send_cq)
- ib_destroy_cq(ic->i_send_cq);
- if (ic->i_recv_cq)
- ib_destroy_cq(ic->i_recv_cq);
rdma_destroy_id(ic->i_cm_id);
/*
@@ -705,7 +785,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
ic->i_cm_id = NULL;
ic->i_pd = NULL;
- ic->i_mr = NULL;
ic->i_send_cq = NULL;
ic->i_recv_cq = NULL;
ic->i_send_hdrs = NULL;
@@ -768,8 +847,10 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
}
INIT_LIST_HEAD(&ic->ib_node);
- tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
- (unsigned long) ic);
+ tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send,
+ (unsigned long)ic);
+ tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv,
+ (unsigned long)ic);
mutex_init(&ic->i_recv_mutex);
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock);
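
ib_cm.c above moves completion handling out of the interrupt-context comp handlers into per-connection send and receive tasklets: each tasklet drains its CQ in RDS_IB_WC_MAX-sized batches, re-arms the notification, then drains once more so a completion that raced with the re-arm is not lost. A user-space model of that drain/re-arm/drain pattern (fake_cq and its helpers stand in for ib_poll_cq()/ib_req_notify_cq(); they are not verbs calls):

    #include <stdio.h>

    #define WC_MAX 32

    struct fake_cq { int pending; };

    /* Hands out at most 'budget' completions, like ib_poll_cq(). */
    static int fake_poll(struct fake_cq *cq, int budget)
    {
        int n = cq->pending < budget ? cq->pending : budget;

        cq->pending -= n;
        return n;
    }

    static void fake_rearm(struct fake_cq *cq)
    {
        cq->pending += 1;   /* pretend one completion arrived during re-arm */
    }

    static void drain(struct fake_cq *cq, int *handled)
    {
        int n;

        while ((n = fake_poll(cq, WC_MAX)) > 0)
            *handled += n;  /* dispatch each wc by its wr_id tag here */
    }

    int main(void)
    {
        struct fake_cq cq = { .pending = 70 };
        int handled = 0;

        drain(&cq, &handled);   /* first drain */
        fake_rearm(&cq);        /* re-arm notification; a straggler sneaks in */
        drain(&cq, &handled);   /* second drain catches it */
        printf("handled %d completions\n", handled);
        return 0;
    }
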
diff --git a/kernel/net/rds/ib_rdma.c b/kernel/net/rds/ib_rdma.c
index c8faaf364..19123a97b 100644
--- a/kernel/net/rds/ib_rdma.c
+++ b/kernel/net/rds/ib_rdma.c
@@ -66,6 +66,7 @@ struct rds_ib_mr {
* Our own little FMR pool
*/
struct rds_ib_mr_pool {
+ unsigned int pool_type;
struct mutex flush_lock; /* serialize fmr invalidate */
struct delayed_work flush_worker; /* flush worker */
@@ -84,6 +85,25 @@ struct rds_ib_mr_pool {
struct ib_fmr_attr fmr_attr;
};
+static struct workqueue_struct *rds_ib_fmr_wq;
+
+int rds_ib_fmr_init(void)
+{
+ rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
+ if (!rds_ib_fmr_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+/* By the time this is called all the IB devices should have been torn down and
+ * had their pools freed. As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
+ */
+void rds_ib_fmr_exit(void)
+{
+ destroy_workqueue(rds_ib_fmr_wq);
+}
+
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
@@ -141,10 +161,8 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
}
spin_unlock_irq(&rds_ibdev->spinlock);
- if (to_free) {
- synchronize_rcu();
- kfree(to_free);
- }
+ if (to_free)
+ kfree_rcu(to_free, rcu);
}
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
@@ -152,12 +170,17 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
struct rds_ib_device *rds_ibdev_old;
rds_ibdev_old = rds_ib_get_device(ipaddr);
- if (rds_ibdev_old) {
+ if (!rds_ibdev_old)
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+
+ if (rds_ibdev_old != rds_ibdev) {
rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
rds_ib_dev_put(rds_ibdev_old);
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
}
+ rds_ib_dev_put(rds_ibdev_old);
- return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+ return 0;
}
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -213,7 +236,8 @@ void rds_ib_destroy_nodev_conns(void)
rds_conn_destroy(ic->conn);
}
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
+ int pool_type)
{
struct rds_ib_mr_pool *pool;
@@ -221,6 +245,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
if (!pool)
return ERR_PTR(-ENOMEM);
+ pool->pool_type = pool_type;
init_llist_head(&pool->free_list);
init_llist_head(&pool->drop_list);
init_llist_head(&pool->clean_list);
@@ -228,28 +253,30 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
init_waitqueue_head(&pool->flush_wait);
INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
- pool->fmr_attr.max_pages = fmr_message_size;
+ if (pool_type == RDS_IB_MR_1M_POOL) {
+ /* +1 allows for unaligned MRs */
+ pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1;
+ pool->max_items = RDS_FMR_1M_POOL_SIZE;
+ } else {
+ /* pool_type == RDS_IB_MR_8K_POOL */
+ pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1;
+ pool->max_items = RDS_FMR_8K_POOL_SIZE;
+ }
+
+ pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
pool->fmr_attr.page_shift = PAGE_SHIFT;
- pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
-
- /* We never allow more than max_items MRs to be allocated.
- * When we exceed more than max_items_soft, we start freeing
- * items more aggressively.
- * Make sure that max_items > max_items_soft > max_items / 2
- */
pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
- pool->max_items = rds_ibdev->max_fmrs;
return pool;
}
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
{
- struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
- iinfo->rdma_mr_max = pool->max_items;
- iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
+ iinfo->rdma_mr_max = pool_1m->max_items;
+ iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
}
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
@@ -291,14 +318,28 @@ static inline void wait_clean_list_grace(void)
}
}
-static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
+static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
+ int npages)
{
- struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ struct rds_ib_mr_pool *pool;
struct rds_ib_mr *ibmr = NULL;
int err = 0, iter = 0;
+ if (npages <= RDS_FMR_8K_MSG_SIZE)
+ pool = rds_ibdev->mr_8k_pool;
+ else
+ pool = rds_ibdev->mr_1m_pool;
+
if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- schedule_delayed_work(&pool->flush_worker, 10);
+ queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+
+ /* Switch pools if one of the pools is reaching its upper limit */
+ if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) {
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ pool = rds_ibdev->mr_1m_pool;
+ else
+ pool = rds_ibdev->mr_8k_pool;
+ }
while (1) {
ibmr = rds_ib_reuse_fmr(pool);
@@ -320,12 +361,18 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
atomic_dec(&pool->item_count);
if (++iter > 2) {
- rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
return ERR_PTR(-EAGAIN);
}
/* We do have some empty MRs. Flush them out. */
- rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
rds_ib_flush_mr_pool(pool, 0, &ibmr);
if (ibmr)
return ibmr;
@@ -337,8 +384,6 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
goto out_no_cigar;
}
- memset(ibmr, 0, sizeof(*ibmr));
-
ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
(IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
@@ -352,7 +397,12 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
goto out_no_cigar;
}
- rds_ib_stats_inc(s_ib_rdma_mr_alloc);
+ ibmr->pool = pool;
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
return ibmr;
out_no_cigar:
@@ -408,7 +458,7 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
}
page_cnt += len >> PAGE_SHIFT;
- if (page_cnt > fmr_message_size)
+ if (page_cnt > ibmr->pool->fmr_attr.max_pages)
return -EINVAL;
dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
@@ -440,7 +490,10 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
ibmr->sg_dma_len = sg_dma_len;
ibmr->remap_count++;
- rds_ib_stats_inc(s_ib_rdma_mr_used);
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
ret = 0;
out:
@@ -486,7 +539,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
/* FIXME we need a way to tell a r/w MR
* from a r/o MR */
- BUG_ON(irqs_disabled());
+ WARN_ON(!page->mapping && irqs_disabled());
set_page_dirty(page);
put_page(page);
}
@@ -503,8 +556,7 @@ static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
__rds_ib_teardown_mr(ibmr);
if (pinned) {
- struct rds_ib_device *rds_ibdev = ibmr->device;
- struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ struct rds_ib_mr_pool *pool = ibmr->pool;
atomic_sub(pinned, &pool->free_pinned);
}
@@ -524,11 +576,13 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
/*
* given an llist of mrs, put them all into the list_head for more processing
*/
-static void llist_append_to_list(struct llist_head *llist, struct list_head *list)
+static unsigned int llist_append_to_list(struct llist_head *llist,
+ struct list_head *list)
{
struct rds_ib_mr *ibmr;
struct llist_node *node;
struct llist_node *next;
+ unsigned int count = 0;
node = llist_del_all(llist);
while (node) {
@@ -536,7 +590,9 @@ static void llist_append_to_list(struct llist_head *llist, struct list_head *lis
ibmr = llist_entry(node, struct rds_ib_mr, llnode);
list_add_tail(&ibmr->unmap_list, list);
node = next;
+ count++;
}
+ return count;
}
/*
@@ -569,7 +625,7 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
* to free as many MRs as needed to get back to this limit.
*/
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
- int free_all, struct rds_ib_mr **ibmr_ret)
+ int free_all, struct rds_ib_mr **ibmr_ret)
{
struct rds_ib_mr *ibmr, *next;
struct llist_node *clean_nodes;
@@ -577,14 +633,17 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
LIST_HEAD(unmap_list);
LIST_HEAD(fmr_list);
unsigned long unpinned = 0;
- unsigned int nfreed = 0, ncleaned = 0, free_goal;
+ unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;
int ret = 0;
- rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush);
if (ibmr_ret) {
DEFINE_WAIT(wait);
- while(!mutex_trylock(&pool->flush_lock)) {
+ while (!mutex_trylock(&pool->flush_lock)) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
@@ -619,8 +678,8 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
/* Get the list of all MRs to be dropped. Ordering matters -
* we want to put drop_list ahead of free_list.
*/
- llist_append_to_list(&pool->drop_list, &unmap_list);
- llist_append_to_list(&pool->free_list, &unmap_list);
+ dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
+ dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
if (free_all)
llist_append_to_list(&pool->clean_list, &unmap_list);
@@ -641,14 +700,17 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
unpinned += ibmr->sg_len;
__rds_ib_teardown_mr(ibmr);
- if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
- rds_ib_stats_inc(s_ib_rdma_mr_free);
+ if (nfreed < free_goal ||
+ ibmr->remap_count >= pool->fmr_attr.max_maps) {
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
list_del(&ibmr->unmap_list);
ib_dealloc_fmr(ibmr->fmr);
kfree(ibmr);
nfreed++;
}
- ncleaned++;
}
if (!list_empty(&unmap_list)) {
@@ -674,7 +736,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
}
atomic_sub(unpinned, &pool->free_pinned);
- atomic_sub(ncleaned, &pool->dirty_count);
+ atomic_sub(dirty_to_clean, &pool->dirty_count);
atomic_sub(nfreed, &pool->item_count);
out:
@@ -695,8 +757,8 @@ static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
void rds_ib_free_mr(void *trans_private, int invalidate)
{
struct rds_ib_mr *ibmr = trans_private;
+ struct rds_ib_mr_pool *pool = ibmr->pool;
struct rds_ib_device *rds_ibdev = ibmr->device;
- struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
@@ -711,16 +773,18 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
/* If we've pinned too many pages, request a flush */
if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
- atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- schedule_delayed_work(&pool->flush_worker, 10);
+ atomic_read(&pool->dirty_count) >= pool->max_items / 5)
+ queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
if (invalidate) {
if (likely(!in_interrupt())) {
rds_ib_flush_mr_pool(pool, 0, NULL);
} else {
/* We get here if the user created a MR marked
- * as use_once and invalidate at the same time. */
- schedule_delayed_work(&pool->flush_worker, 10);
+ * as use_once and invalidate at the same time.
+ */
+ queue_delayed_work(rds_ib_fmr_wq,
+ &pool->flush_worker, 10);
}
}
@@ -733,10 +797,11 @@ void rds_ib_flush_mrs(void)
down_read(&rds_ib_devices_lock);
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
- struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ if (rds_ibdev->mr_8k_pool)
+ rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL);
- if (pool)
- rds_ib_flush_mr_pool(pool, 0, NULL);
+ if (rds_ibdev->mr_1m_pool)
+ rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL);
}
up_read(&rds_ib_devices_lock);
}
@@ -754,12 +819,12 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
goto out;
}
- if (!rds_ibdev->mr_pool) {
+ if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) {
ret = -ENODEV;
goto out;
}
- ibmr = rds_ib_alloc_fmr(rds_ibdev);
+ ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
if (IS_ERR(ibmr)) {
rds_ib_dev_put(rds_ibdev);
return ibmr;
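
ib_rdma.c above teaches rds_ib_alloc_fmr() to pick the 8K or 1M pool from the mapping size and to spill over to the other pool when the preferred one is roughly 90% dirty, and it queues flush work on the dedicated rds_fmr_flushd workqueue. A sketch of the pool-selection heuristic (the thresholds mirror the patch; the structs are toy stand-ins):

    #include <stdio.h>

    #define FMR_8K_MSG_SIZE 2

    struct toy_pool {
        const char *name;
        unsigned int dirty_count;
        unsigned int max_items;
    };

    static struct toy_pool *pick_pool(struct toy_pool *p8k, struct toy_pool *p1m,
                                      unsigned int npages)
    {
        struct toy_pool *pool = (npages <= FMR_8K_MSG_SIZE) ? p8k : p1m;

        /* switch pools if the preferred one is reaching its upper limit */
        if (pool->dirty_count >= pool->max_items * 9 / 10)
            pool = (pool == p8k) ? p1m : p8k;
        return pool;
    }

    int main(void)
    {
        struct toy_pool p8k = { "8k", 3900, 4096 };   /* nearly full */
        struct toy_pool p1m = { "1m", 10, 4096 };

        printf("2 pages  -> %s pool\n", pick_pool(&p8k, &p1m, 2)->name);  /* spills to 1m */
        printf("64 pages -> %s pool\n", pick_pool(&p8k, &p1m, 64)->name); /* 1m */
        return 0;
    }
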
diff --git a/kernel/net/rds/ib_recv.c b/kernel/net/rds/ib_recv.c
index 1b981a4e4..977fb8606 100644
--- a/kernel/net/rds/ib_recv.c
+++ b/kernel/net/rds/ib_recv.c
@@ -62,12 +62,12 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
sge = &recv->r_sge[0];
sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
- sge->lkey = ic->i_mr->lkey;
+ sge->lkey = ic->i_pd->local_dma_lkey;
sge = &recv->r_sge[1];
sge->addr = 0;
sge->length = RDS_FRAG_SIZE;
- sge->lkey = ic->i_mr->lkey;
+ sge->lkey = ic->i_pd->local_dma_lkey;
}
}
@@ -297,7 +297,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
}
static int rds_ib_recv_refill_one(struct rds_connection *conn,
- struct rds_ib_recv_work *recv, int prefill)
+ struct rds_ib_recv_work *recv, gfp_t gfp)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct ib_sge *sge;
@@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
gfp_t slab_mask = GFP_NOWAIT;
gfp_t page_mask = GFP_NOWAIT;
- if (prefill) {
+ if (gfp & __GFP_DIRECT_RECLAIM) {
slab_mask = GFP_KERNEL;
page_mask = GFP_HIGHUSER;
}
@@ -347,6 +347,24 @@ out:
return ret;
}
+static int acquire_refill(struct rds_connection *conn)
+{
+ return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0;
+}
+
+static void release_refill(struct rds_connection *conn)
+{
+ clear_bit(RDS_RECV_REFILL, &conn->c_flags);
+
+ /* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+ * hot path and finding waiters is very rare. We don't want to walk
+ * the system-wide hashed waitqueue buckets in the fast path only to
+ * almost never find waiters.
+ */
+ if (waitqueue_active(&conn->c_waitq))
+ wake_up_all(&conn->c_waitq);
+}
+
/*
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
@@ -354,15 +372,23 @@ out:
*
* -1 is returned if posting fails due to temporary resource exhaustion.
*/
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_recv_work *recv;
struct ib_recv_wr *failed_wr;
unsigned int posted = 0;
int ret = 0;
+ bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
u32 pos;
+ /* the goal here is to just make sure that someone, somewhere
+ * is posting buffers. If we can't get the refill lock,
+ * let them do their thing
+ */
+ if (!acquire_refill(conn))
+ return;
+
while ((prefill || rds_conn_up(conn)) &&
rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
if (pos >= ic->i_recv_ring.w_nr) {
@@ -372,7 +398,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
}
recv = &ic->i_recvs[pos];
- ret = rds_ib_recv_refill_one(conn, recv, prefill);
+ ret = rds_ib_recv_refill_one(conn, recv, gfp);
if (ret) {
break;
}
@@ -402,6 +428,24 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
if (ret)
rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+
+ release_refill(conn);
+
+ /* if we're called from the softirq handler, we'll be GFP_NOWAIT.
+ * in this case the ring being low is going to lead to more interrupts
+ * and we can safely let the softirq code take care of it unless the
+ * ring is completely empty.
+ *
+ * if we're called from krdsd, we'll be GFP_KERNEL. In this case
+ * we might have raced with the softirq code while we had the refill
+ * lock held. Use rds_ib_ring_low() instead of ring_empty to decide
+ * if we should requeue.
+ */
+ if (rds_conn_up(conn) &&
+ ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
+ rds_ib_ring_empty(&ic->i_recv_ring))) {
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
+ }
}
/*
@@ -520,7 +564,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
sge->addr = ic->i_ack_dma;
sge->length = sizeof(struct rds_header);
- sge->lkey = ic->i_mr->lkey;
+ sge->lkey = ic->i_pd->local_dma_lkey;
wr->sg_list = sge;
wr->num_sge = 1;
@@ -552,8 +596,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
* wr_id and avoids working with the ring in that case.
*/
#ifndef KERNEL_HAS_ATOMIC64
-static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
- int ack_required)
+void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
{
unsigned long flags;
@@ -578,8 +621,7 @@ static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
return seq;
}
#else
-static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
- int ack_required)
+void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
{
atomic64_set(&ic->i_ack_next, seq);
if (ack_required) {
@@ -786,20 +828,6 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
rds_cong_map_updated(map, uncongested);
}
-/*
- * Rings are posted with all the allocations they'll need to queue the
- * incoming message to the receiving socket so this can't fail.
- * All fragments start with a header, so we can make sure we're not receiving
- * garbage, and we can tell a small 8 byte fragment from an ACK frame.
- */
-struct rds_ib_ack_state {
- u64 ack_next;
- u64 ack_recv;
- unsigned int ack_required:1;
- unsigned int ack_next_valid:1;
- unsigned int ack_recv_valid:1;
-};
-
static void rds_ib_process_recv(struct rds_connection *conn,
struct rds_ib_recv_work *recv, u32 data_len,
struct rds_ib_ack_state *state)
@@ -925,89 +953,50 @@ static void rds_ib_process_recv(struct rds_connection *conn,
}
}
-/*
- * Plucking the oldest entry from the ring can be done concurrently with
- * the thread refilling the ring. Each ring operation is protected by
- * spinlocks and the transient state of refilling doesn't change the
- * recording of which entry is oldest.
- *
- * This relies on IB only calling one cq comp_handler for each cq so that
- * there will only be one caller of rds_recv_incoming() per RDS connection.
- */
-void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
-{
- struct rds_connection *conn = context;
- struct rds_ib_connection *ic = conn->c_transport_data;
-
- rdsdebug("conn %p cq %p\n", conn, cq);
-
- rds_ib_stats_inc(s_ib_rx_cq_call);
-
- tasklet_schedule(&ic->i_recv_tasklet);
-}
-
-static inline void rds_poll_cq(struct rds_ib_connection *ic,
- struct rds_ib_ack_state *state)
+void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
+ struct ib_wc *wc,
+ struct rds_ib_ack_state *state)
{
struct rds_connection *conn = ic->conn;
- struct ib_wc wc;
struct rds_ib_recv_work *recv;
- while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status,
- rds_ib_wc_status_str(wc.status), wc.byte_len,
- be32_to_cpu(wc.ex.imm_data));
- rds_ib_stats_inc(s_ib_rx_cq_event);
-
- recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
+ rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+ (unsigned long long)wc->wr_id, wc->status,
+ ib_wc_status_msg(wc->status), wc->byte_len,
+ be32_to_cpu(wc->ex.imm_data));
- ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
-
- /*
- * Also process recvs in connecting state because it is possible
- * to get a recv completion _before_ the rdmacm ESTABLISHED
- * event is processed.
- */
- if (wc.status == IB_WC_SUCCESS) {
- rds_ib_process_recv(conn, recv, wc.byte_len, state);
- } else {
- /* We expect errors as the qp is drained during shutdown */
- if (rds_conn_up(conn) || rds_conn_connecting(conn))
- rds_ib_conn_error(conn, "recv completion on %pI4 had "
- "status %u (%s), disconnecting and "
- "reconnecting\n", &conn->c_faddr,
- wc.status,
- rds_ib_wc_status_str(wc.status));
- }
+ rds_ib_stats_inc(s_ib_rx_cq_event);
+ recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
+ ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1,
+ DMA_FROM_DEVICE);
- /*
- * It's very important that we only free this ring entry if we've truly
- * freed the resources allocated to the entry. The refilling path can
- * leak if we don't.
- */
- rds_ib_ring_free(&ic->i_recv_ring, 1);
+ /* Also process recvs in connecting state because it is possible
+ * to get a recv completion _before_ the rdmacm ESTABLISHED
+ * event is processed.
+ */
+ if (wc->status == IB_WC_SUCCESS) {
+ rds_ib_process_recv(conn, recv, wc->byte_len, state);
+ } else {
+ /* We expect errors as the qp is drained during shutdown */
+ if (rds_conn_up(conn) || rds_conn_connecting(conn))
+ rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
+ &conn->c_faddr,
+ wc->status,
+ ib_wc_status_msg(wc->status));
}
-}
-
-void rds_ib_recv_tasklet_fn(unsigned long data)
-{
- struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
- struct rds_connection *conn = ic->conn;
- struct rds_ib_ack_state state = { 0, };
-
- rds_poll_cq(ic, &state);
- ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- rds_poll_cq(ic, &state);
- if (state.ack_next_valid)
- rds_ib_set_ack(ic, state.ack_next, state.ack_required);
- if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
- rds_send_drop_acked(conn, state.ack_recv, NULL);
- ic->i_ack_recv = state.ack_recv;
+ /* rds_ib_process_recv() doesn't always consume the frag, and
+ * we might not have called it at all if the wc didn't indicate
+ * success. We already unmapped the frag's pages, though, and
+ * the following rds_ib_ring_free() call tells the refill path
+ * that it will not find an allocated frag here. Make sure we
+ * keep that promise by freeing a frag that's still on the ring.
+ */
+ if (recv->r_frag) {
+ rds_ib_frag_free(ic, recv->r_frag);
+ recv->r_frag = NULL;
}
- if (rds_conn_up(conn))
- rds_ib_attempt_ack(ic);
+ rds_ib_ring_free(&ic->i_recv_ring, 1);
/* If we ever end up with a really empty receive ring, we're
* in deep trouble, as the sender will definitely see RNR
@@ -1016,7 +1005,7 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
rds_ib_stats_inc(s_ib_rx_ring_empty);
if (rds_ib_ring_low(&ic->i_recv_ring))
- rds_ib_recv_refill(conn, 0);
+ rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
}
int rds_ib_recv(struct rds_connection *conn)
@@ -1025,8 +1014,10 @@ int rds_ib_recv(struct rds_connection *conn)
int ret = 0;
rdsdebug("conn %p\n", conn);
- if (rds_conn_up(conn))
+ if (rds_conn_up(conn)) {
rds_ib_attempt_ack(ic);
+ rds_ib_recv_refill(conn, 0, GFP_KERNEL);
+ }
return ret;
}
@@ -1049,9 +1040,10 @@ int rds_ib_recv_init(void)
rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
sizeof(struct rds_page_frag),
0, SLAB_HWCACHE_ALIGN, NULL);
- if (!rds_ib_frag_slab)
+ if (!rds_ib_frag_slab) {
kmem_cache_destroy(rds_ib_incoming_slab);
- else
+ rds_ib_incoming_slab = NULL;
+ } else
ret = 0;
out:
return ret;
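A hedged sketch of how a caller is now expected to drive the per-completion receive handler above: the CQ polling loop moves out of this file, and something like the recv tasklet polls the CQ and hands each work completion to rds_ib_recv_cqe_handler(). poll_rcq() and RDS_IB_WC_MAX are illustrative names here, not guaranteed by this hunk.

	static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
			     struct ib_wc *wcs,
			     struct rds_ib_ack_state *ack_state)
	{
		int nr, i;

		/* drain the CQ in batches, one handler call per completion */
		while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
			for (i = 0; i < nr; i++)
				rds_ib_recv_cqe_handler(ic, &wcs[i], ack_state);
		}
	}

After draining, the caller would apply the accumulated ack_state once via rds_ib_set_ack()/rds_send_drop_acked(), much as the removed rds_ib_recv_tasklet_fn() did.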
diff --git a/kernel/net/rds/ib_send.c b/kernel/net/rds/ib_send.c
index bd3825d38..eac30bf48 100644
--- a/kernel/net/rds/ib_send.c
+++ b/kernel/net/rds/ib_send.c
@@ -39,40 +39,6 @@
#include "rds.h"
#include "ib.h"
-static char *rds_ib_wc_status_strings[] = {
-#define RDS_IB_WC_STATUS_STR(foo) \
- [IB_WC_##foo] = __stringify(IB_WC_##foo)
- RDS_IB_WC_STATUS_STR(SUCCESS),
- RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
- RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
- RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
- RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
- RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
- RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
- RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
- RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
- RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
- RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
- RDS_IB_WC_STATUS_STR(REM_OP_ERR),
- RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
- RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
- RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
- RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
- RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
- RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
- RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
- RDS_IB_WC_STATUS_STR(FATAL_ERR),
- RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
- RDS_IB_WC_STATUS_STR(GENERAL_ERR),
-#undef RDS_IB_WC_STATUS_STR
-};
-
-char *rds_ib_wc_status_str(enum ib_wc_status status)
-{
- return rds_str_array(rds_ib_wc_status_strings,
- ARRAY_SIZE(rds_ib_wc_status_strings), status);
-}
-
/*
* Convert IB-specific error message to RDS error message and call core
* completion handler.
@@ -229,16 +195,16 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
send->s_op = NULL;
- send->s_wr.wr_id = i;
+ send->s_wr.wr_id = i | RDS_IB_SEND_OP;
send->s_wr.sg_list = send->s_sge;
send->s_wr.ex.imm_data = 0;
sge = &send->s_sge[0];
sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
- sge->lkey = ic->i_mr->lkey;
+ sge->lkey = ic->i_pd->local_dma_lkey;
- send->s_sge[1].lkey = ic->i_mr->lkey;
+ send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
}
}
@@ -271,81 +237,73 @@ static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
* unallocs the next free entry in the ring it doesn't alter which is
* the next to be freed, which is what this is concerned with.
*/
-void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
+void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
{
- struct rds_connection *conn = context;
- struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_message *rm = NULL;
- struct ib_wc wc;
+ struct rds_connection *conn = ic->conn;
struct rds_ib_send_work *send;
u32 completed;
u32 oldest;
u32 i = 0;
- int ret;
int nr_sig = 0;
- rdsdebug("cq %p conn %p\n", cq, conn);
- rds_ib_stats_inc(s_ib_tx_cq_call);
- ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- if (ret)
- rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
-
- while (ib_poll_cq(cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status,
- rds_ib_wc_status_str(wc.status), wc.byte_len,
- be32_to_cpu(wc.ex.imm_data));
- rds_ib_stats_inc(s_ib_tx_cq_event);
-
- if (wc.wr_id == RDS_IB_ACK_WR_ID) {
- if (time_after(jiffies, ic->i_ack_queued + HZ/2))
- rds_ib_stats_inc(s_ib_tx_stalled);
- rds_ib_ack_send_complete(ic);
- continue;
- }
- oldest = rds_ib_ring_oldest(&ic->i_send_ring);
+ rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+ (unsigned long long)wc->wr_id, wc->status,
+ ib_wc_status_msg(wc->status), wc->byte_len,
+ be32_to_cpu(wc->ex.imm_data));
+ rds_ib_stats_inc(s_ib_tx_cq_event);
- completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+ if (wc->wr_id == RDS_IB_ACK_WR_ID) {
+ if (time_after(jiffies, ic->i_ack_queued + HZ / 2))
+ rds_ib_stats_inc(s_ib_tx_stalled);
+ rds_ib_ack_send_complete(ic);
+ return;
+ }
- for (i = 0; i < completed; i++) {
- send = &ic->i_sends[oldest];
- if (send->s_wr.send_flags & IB_SEND_SIGNALED)
- nr_sig++;
+ oldest = rds_ib_ring_oldest(&ic->i_send_ring);
- rm = rds_ib_send_unmap_op(ic, send, wc.status);
+ completed = rds_ib_ring_completed(&ic->i_send_ring,
+ (wc->wr_id & ~RDS_IB_SEND_OP),
+ oldest);
- if (time_after(jiffies, send->s_queued + HZ/2))
- rds_ib_stats_inc(s_ib_tx_stalled);
+ for (i = 0; i < completed; i++) {
+ send = &ic->i_sends[oldest];
+ if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+ nr_sig++;
- if (send->s_op) {
- if (send->s_op == rm->m_final_op) {
- /* If anyone waited for this message to get flushed out, wake
- * them up now */
- rds_message_unmapped(rm);
- }
- rds_message_put(rm);
- send->s_op = NULL;
- }
+ rm = rds_ib_send_unmap_op(ic, send, wc->status);
- oldest = (oldest + 1) % ic->i_send_ring.w_nr;
- }
+ if (time_after(jiffies, send->s_queued + HZ / 2))
+ rds_ib_stats_inc(s_ib_tx_stalled);
- rds_ib_ring_free(&ic->i_send_ring, completed);
- rds_ib_sub_signaled(ic, nr_sig);
- nr_sig = 0;
-
- if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
- test_bit(0, &conn->c_map_queued))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
- /* We expect errors as the qp is drained during shutdown */
- if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn, "send completion on %pI4 had status "
- "%u (%s), disconnecting and reconnecting\n",
- &conn->c_faddr, wc.status,
- rds_ib_wc_status_str(wc.status));
+ if (send->s_op) {
+ if (send->s_op == rm->m_final_op) {
+ /* If anyone waited for this message to get
+ * flushed out, wake them up now
+ */
+ rds_message_unmapped(rm);
+ }
+ rds_message_put(rm);
+ send->s_op = NULL;
}
+
+ oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+ }
+
+ rds_ib_ring_free(&ic->i_send_ring, completed);
+ rds_ib_sub_signaled(ic, nr_sig);
+ nr_sig = 0;
+
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+ test_bit(0, &conn->c_map_queued))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+ rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
+ &conn->c_faddr, wc->status,
+ ib_wc_status_msg(wc->status));
}
}
@@ -605,6 +563,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
}
rds_message_addref(rm);
+ rm->data.op_dmasg = 0;
+ rm->data.op_dmaoff = 0;
ic->i_data_op = &rm->data;
/* Finalize the header */
@@ -658,7 +618,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
send = &ic->i_sends[pos];
first = send;
prev = NULL;
- scat = &ic->i_data_op->op_sg[sg];
+ scat = &ic->i_data_op->op_sg[rm->data.op_dmasg];
i = 0;
do {
unsigned int len = 0;
@@ -680,17 +640,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
/* Set up the data, if present */
if (i < work_alloc
&& scat != &rm->data.op_sg[rm->data.op_count]) {
- len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ len = min(RDS_FRAG_SIZE,
+ ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff);
send->s_wr.num_sge = 2;
- send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+ send->s_sge[1].addr = ib_sg_dma_address(dev, scat);
+ send->s_sge[1].addr += rm->data.op_dmaoff;
send->s_sge[1].length = len;
bytes_sent += len;
- off += len;
- if (off == ib_sg_dma_len(dev, scat)) {
+ rm->data.op_dmaoff += len;
+ if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) {
scat++;
- off = 0;
+ rm->data.op_dmasg++;
+ rm->data.op_dmaoff = 0;
}
}
@@ -738,6 +701,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
if (scat == &rm->data.op_sg[rm->data.op_count]) {
prev->s_op = ic->i_data_op;
prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+ if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ prev->s_wr.send_flags |= IB_SEND_SIGNALED;
+ nr_sig++;
+ }
ic->i_data_op = NULL;
}
@@ -809,23 +777,23 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
send->s_queued = jiffies;
if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
- send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
- send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
- send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
- send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
- send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+ send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+ send->s_atomic_wr.compare_add = op->op_m_cswp.compare;
+ send->s_atomic_wr.swap = op->op_m_cswp.swap;
+ send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask;
+ send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask;
} else { /* FADD */
- send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
- send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
- send->s_wr.wr.atomic.swap = 0;
- send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
- send->s_wr.wr.atomic.swap_mask = 0;
+ send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+ send->s_atomic_wr.compare_add = op->op_m_fadd.add;
+ send->s_atomic_wr.swap = 0;
+ send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask;
+ send->s_atomic_wr.swap_mask = 0;
}
nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
- send->s_wr.num_sge = 1;
- send->s_wr.next = NULL;
- send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
- send->s_wr.wr.atomic.rkey = op->op_rkey;
+ send->s_atomic_wr.wr.num_sge = 1;
+ send->s_atomic_wr.wr.next = NULL;
+ send->s_atomic_wr.remote_addr = op->op_remote_addr;
+ send->s_atomic_wr.rkey = op->op_rkey;
send->s_op = op;
rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
@@ -842,7 +810,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
/* Convert our struct scatterlist to struct ib_sge */
send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
- send->s_sge[0].lkey = ic->i_mr->lkey;
+ send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
send->s_sge[0].addr, send->s_sge[0].length);
@@ -850,11 +818,11 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
if (nr_sig)
atomic_add(nr_sig, &ic->i_signaled_sends);
- failed_wr = &send->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+ failed_wr = &send->s_atomic_wr.wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr);
rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
- send, &send->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &send->s_wr);
+ send, &send->s_atomic_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &send->s_atomic_wr.wr);
if (ret) {
printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
@@ -863,9 +831,9 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
goto out;
}
- if (unlikely(failed_wr != &send->s_wr)) {
+ if (unlikely(failed_wr != &send->s_atomic_wr.wr)) {
printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
- BUG_ON(failed_wr != &send->s_wr);
+ BUG_ON(failed_wr != &send->s_atomic_wr.wr);
}
out:
@@ -936,27 +904,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
- send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->op_rkey;
+ send->s_rdma_wr.remote_addr = remote_addr;
+ send->s_rdma_wr.rkey = op->op_rkey;
if (num_sge > max_sge) {
- send->s_wr.num_sge = max_sge;
+ send->s_rdma_wr.wr.num_sge = max_sge;
num_sge -= max_sge;
} else {
- send->s_wr.num_sge = num_sge;
+ send->s_rdma_wr.wr.num_sge = num_sge;
}
- send->s_wr.next = NULL;
+ send->s_rdma_wr.wr.next = NULL;
if (prev)
- prev->s_wr.next = &send->s_wr;
+ prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr;
- for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+ for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
+ scat != &op->op_sg[op->op_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
send->s_sge[j].addr =
ib_sg_dma_address(ic->i_cm_id->device, scat);
send->s_sge[j].length = len;
- send->s_sge[j].lkey = ic->i_mr->lkey;
+ send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
sent += len;
rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
@@ -966,7 +935,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
}
rdsdebug("send %p wr %p num_sge %u next %p\n", send,
- &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+ &send->s_rdma_wr.wr,
+ send->s_rdma_wr.wr.num_sge,
+ send->s_rdma_wr.wr.next);
prev = send;
if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
@@ -987,11 +958,11 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
if (nr_sig)
atomic_add(nr_sig, &ic->i_signaled_sends);
- failed_wr = &first->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ failed_wr = &first->s_rdma_wr.wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
- first, &first->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &first->s_wr);
+ first, &first->s_rdma_wr.wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_rdma_wr.wr);
if (ret) {
printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
@@ -1000,9 +971,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
goto out;
}
- if (unlikely(failed_wr != &first->s_wr)) {
+ if (unlikely(failed_wr != &first->s_rdma_wr.wr)) {
printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
- BUG_ON(failed_wr != &first->s_wr);
+ BUG_ON(failed_wr != &first->s_rdma_wr.wr);
}
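The send completion handler above strips a tag out of wc->wr_id before converting it back into a ring index; correspondingly, rds_ib_send_init_ring() now ORs RDS_IB_SEND_OP into each wr_id. A minimal sketch of the assumed encoding, where the tag is presumably a high bit (e.g. BIT_ULL(63) in ib.h) that can never collide with a ring index:

	/* Illustrative only: EXAMPLE_SEND_OP stands in for RDS_IB_SEND_OP. */
	#define EXAMPLE_SEND_OP	BIT_ULL(63)

	static inline u64 example_tag_send_wr_id(u32 ring_index)
	{
		return (u64)ring_index | EXAMPLE_SEND_OP;	/* at post time */
	}

	static inline u32 example_ring_index_from_wc(u64 wr_id)
	{
		return (u32)(wr_id & ~EXAMPLE_SEND_OP);		/* at completion */
	}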
diff --git a/kernel/net/rds/ib_stats.c b/kernel/net/rds/ib_stats.c
index 2d5965d6e..d77e04473 100644
--- a/kernel/net/rds/ib_stats.c
+++ b/kernel/net/rds/ib_stats.c
@@ -42,14 +42,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
static const char *const rds_ib_stat_names[] = {
"ib_connect_raced",
"ib_listen_closed_stale",
- "ib_tx_cq_call",
+ "s_ib_evt_handler_call",
+ "ib_tasklet_call",
"ib_tx_cq_event",
"ib_tx_ring_full",
"ib_tx_throttle",
"ib_tx_sg_mapping_failure",
"ib_tx_stalled",
"ib_tx_credit_updates",
- "ib_rx_cq_call",
"ib_rx_cq_event",
"ib_rx_ring_empty",
"ib_rx_refill_from_cq",
@@ -61,12 +61,18 @@ static const char *const rds_ib_stat_names[] = {
"ib_ack_send_delayed",
"ib_ack_send_piggybacked",
"ib_ack_received",
- "ib_rdma_mr_alloc",
- "ib_rdma_mr_free",
- "ib_rdma_mr_used",
- "ib_rdma_mr_pool_flush",
- "ib_rdma_mr_pool_wait",
- "ib_rdma_mr_pool_depleted",
+ "ib_rdma_mr_8k_alloc",
+ "ib_rdma_mr_8k_free",
+ "ib_rdma_mr_8k_used",
+ "ib_rdma_mr_8k_pool_flush",
+ "ib_rdma_mr_8k_pool_wait",
+ "ib_rdma_mr_8k_pool_depleted",
+ "ib_rdma_mr_1m_alloc",
+ "ib_rdma_mr_1m_free",
+ "ib_rdma_mr_1m_used",
+ "ib_rdma_mr_1m_pool_flush",
+ "ib_rdma_mr_1m_pool_wait",
+ "ib_rdma_mr_1m_pool_depleted",
"ib_atomic_cswp",
"ib_atomic_fadd",
};
diff --git a/kernel/net/rds/iw.c b/kernel/net/rds/iw.c
index 589935661..576f1825f 100644
--- a/kernel/net/rds/iw.c
+++ b/kernel/net/rds/iw.c
@@ -125,12 +125,11 @@ free_attr:
kfree(dev_attr);
}
-static void rds_iw_remove_one(struct ib_device *device)
+static void rds_iw_remove_one(struct ib_device *device, void *client_data)
{
- struct rds_iw_device *rds_iwdev;
+ struct rds_iw_device *rds_iwdev = client_data;
struct rds_iw_cm_id *i_cm_id, *next;
- rds_iwdev = ib_get_client_data(device, &rds_iw_client);
if (!rds_iwdev)
return;
@@ -149,10 +148,7 @@ static void rds_iw_remove_one(struct ib_device *device)
if (rds_iwdev->mr)
ib_dereg_mr(rds_iwdev->mr);
- while (ib_dealloc_pd(rds_iwdev->pd)) {
- rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
- msleep(1);
- }
+ ib_dealloc_pd(rds_iwdev->pd);
list_del(&rds_iwdev->list);
kfree(rds_iwdev);
@@ -218,7 +214,7 @@ static void rds_iw_ic_info(struct socket *sock, unsigned int len,
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
-static int rds_iw_laddr_check(__be32 addr)
+static int rds_iw_laddr_check(struct net *net, __be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
@@ -227,7 +223,7 @@ static int rds_iw_laddr_check(__be32 addr)
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+ cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
diff --git a/kernel/net/rds/iw.h b/kernel/net/rds/iw.h
index cbe6674e3..5af01d175 100644
--- a/kernel/net/rds/iw.h
+++ b/kernel/net/rds/iw.h
@@ -74,10 +74,13 @@ struct rds_iw_send_work {
struct rm_rdma_op *s_op;
struct rds_iw_mapping *s_mapping;
struct ib_mr *s_mr;
- struct ib_fast_reg_page_list *s_page_list;
unsigned char s_remap_count;
- struct ib_send_wr s_wr;
+ union {
+ struct ib_send_wr s_send_wr;
+ struct ib_rdma_wr s_rdma_wr;
+ struct ib_reg_wr s_reg_wr;
+ };
struct ib_sge s_sge[RDS_IW_MAX_SGE];
unsigned long s_queued;
};
@@ -195,7 +198,7 @@ struct rds_iw_device {
/* Magic WR_ID for ACKs */
#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
-#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
+#define RDS_IW_REG_WR_ID ((u64)0xefefefefefefefefULL)
#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
struct rds_iw_statistics {
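The union added above leans on the verbs rework of this kernel generation, in which the extended work-request types embed a struct ib_send_wr as their first member; &send->s_send_wr, &send->s_rdma_wr.wr and &send->s_reg_wr.wr therefore all name the same header, which is why the send and rdma paths can chain each other's requests. A trimmed paraphrase of the relevant definitions (see include/rdma/ib_verbs.h; only the fields used in this patch are shown):

	struct ib_rdma_wr {
		struct ib_send_wr	wr;	/* common header, first member */
		u64			remote_addr;
		u32			rkey;
	};

	struct ib_reg_wr {
		struct ib_send_wr	wr;	/* common header, first member */
		struct ib_mr		*mr;
		u32			key;
		int			access;
	};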
diff --git a/kernel/net/rds/iw_cm.c b/kernel/net/rds/iw_cm.c
index a6c2bea9f..aea4c911b 100644
--- a/kernel/net/rds/iw_cm.c
+++ b/kernel/net/rds/iw_cm.c
@@ -179,6 +179,7 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
void *context)
{
struct ib_device *dev = rds_iwdev->dev;
+ struct ib_cq_init_attr cq_attr = {};
unsigned int send_size, recv_size;
int ret;
@@ -198,9 +199,10 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
attr->sq_sig_type = IB_SIGNAL_REQ_WR;
attr->qp_type = IB_QPT_RC;
+ cq_attr.cqe = send_size;
attr->send_cq = ib_create_cq(dev, send_cq_handler,
rds_iw_cq_event_handler,
- context, send_size, 0);
+ context, &cq_attr);
if (IS_ERR(attr->send_cq)) {
ret = PTR_ERR(attr->send_cq);
attr->send_cq = NULL;
@@ -208,9 +210,10 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
goto out;
}
+ cq_attr.cqe = recv_size;
attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
rds_iw_cq_event_handler,
- context, recv_size, 0);
+ context, &cq_attr);
if (IS_ERR(attr->recv_cq)) {
ret = PTR_ERR(attr->recv_cq);
attr->recv_cq = NULL;
@@ -395,8 +398,9 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
&dp->dp_saddr, &dp->dp_daddr,
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
- conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
- GFP_KERNEL);
+ /* RDS/IW is not currently netns aware, thus init_net */
+ conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
+ &rds_iw_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
@@ -520,7 +524,7 @@ int rds_iw_conn_connect(struct rds_connection *conn)
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
- ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
diff --git a/kernel/net/rds/iw_rdma.c b/kernel/net/rds/iw_rdma.c
index dba8d0864..b09a40c1a 100644
--- a/kernel/net/rds/iw_rdma.c
+++ b/kernel/net/rds/iw_rdma.c
@@ -47,7 +47,6 @@ struct rds_iw_mr {
struct rdma_cm_id *cm_id;
struct ib_mr *mr;
- struct ib_fast_reg_page_list *page_list;
struct rds_iw_mapping mapping;
unsigned char remap_count;
@@ -75,10 +74,10 @@ struct rds_iw_mr_pool {
int max_pages;
};
-static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
+static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
-static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static int rds_iw_map_reg(struct rds_iw_mr_pool *pool,
struct rds_iw_mr *ibmr,
struct scatterlist *sg, unsigned int nents);
static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
@@ -258,19 +257,18 @@ static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
sg->bytes = 0;
}
-static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
- struct rds_iw_scatterlist *sg)
+static int rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
+ struct rds_iw_scatterlist *sg)
{
struct ib_device *dev = rds_iwdev->dev;
- u64 *dma_pages = NULL;
- int i, j, ret;
+ int i, ret;
WARN_ON(sg->dma_len);
sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
if (unlikely(!sg->dma_len)) {
printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
- return ERR_PTR(-EBUSY);
+ return -EBUSY;
}
sg->bytes = 0;
@@ -303,31 +301,14 @@ static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
if (sg->dma_npages > fastreg_message_size)
goto out_unmap;
- dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
- if (!dma_pages) {
- ret = -ENOMEM;
- goto out_unmap;
- }
-
- for (i = j = 0; i < sg->dma_len; ++i) {
- unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
- u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
- u64 end_addr;
- end_addr = dma_addr + dma_len;
- dma_addr &= ~PAGE_MASK;
- for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
- dma_pages[j++] = dma_addr;
- BUG_ON(j > sg->dma_npages);
- }
- return dma_pages;
+ return 0;
out_unmap:
ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
sg->dma_len = 0;
- kfree(dma_pages);
- return ERR_PTR(ret);
+ return ret;
}
@@ -440,7 +421,7 @@ static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
INIT_LIST_HEAD(&ibmr->mapping.m_list);
ibmr->mapping.m_mr = ibmr;
- err = rds_iw_init_fastreg(pool, ibmr);
+ err = rds_iw_init_reg(pool, ibmr);
if (err)
goto out_no_cigar;
@@ -479,14 +460,13 @@ void rds_iw_sync_mr(void *trans_private, int direction)
* If the number of MRs allocated exceeds the limit, we also try
* to free as many MRs as needed to get back to this limit.
*/
-static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
+static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
{
struct rds_iw_mr *ibmr, *next;
LIST_HEAD(unmap_list);
LIST_HEAD(kill_list);
unsigned long flags;
unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
- int ret = 0;
rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
@@ -538,7 +518,6 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
atomic_sub(nfreed, &pool->item_count);
mutex_unlock(&pool->flush_lock);
- return ret;
}
static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
@@ -622,7 +601,7 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
ibmr->cm_id = cm_id;
ibmr->device = rds_iwdev;
- ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
+ ret = rds_iw_map_reg(rds_iwdev->mr_pool, ibmr, sg, nents);
if (ret == 0)
*key_ret = ibmr->mr->rkey;
else
@@ -638,7 +617,7 @@ out:
}
/*
- * iWARP fastreg handling
+ * iWARP reg handling
*
* The life cycle of a fastreg registration is a bit different from
* FMRs.
@@ -650,7 +629,7 @@ out:
* This creates a bit of a problem for us, as we do not have the destination
* IP in GET_MR, so the connection must be setup prior to the GET_MR call for
* RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit
- * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
+ * will try to queue a LOCAL_INV (if needed) and a REG_MR work request
 * before queuing the SEND. When completions for these arrive, they are
 * dispatched to the MR, and a bit is set showing that RDMA can be performed.
*
@@ -659,71 +638,60 @@ out:
* The expectation there is that this invalidation step includes ALL
* PREVIOUSLY FREED MRs.
*/
-static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr)
+static int rds_iw_init_reg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
{
struct rds_iw_device *rds_iwdev = pool->device;
- struct ib_fast_reg_page_list *page_list = NULL;
struct ib_mr *mr;
int err;
- mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+ mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG,
+ pool->max_message_size);
if (IS_ERR(mr)) {
err = PTR_ERR(mr);
- printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
- return err;
- }
-
- /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
- * is not filled in.
- */
- page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
- if (IS_ERR(page_list)) {
- err = PTR_ERR(page_list);
-
- printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
- ib_dereg_mr(mr);
+ printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err);
return err;
}
- ibmr->page_list = page_list;
ibmr->mr = mr;
return 0;
}
-static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
+static int rds_iw_rdma_reg_mr(struct rds_iw_mapping *mapping)
{
struct rds_iw_mr *ibmr = mapping->m_mr;
- struct ib_send_wr f_wr, *failed_wr;
- int ret;
+ struct rds_iw_scatterlist *m_sg = &mapping->m_sg;
+ struct ib_reg_wr reg_wr;
+ struct ib_send_wr *failed_wr;
+ int ret, n;
+
+ n = ib_map_mr_sg_zbva(ibmr->mr, m_sg->list, m_sg->len, PAGE_SIZE);
+ if (unlikely(n != m_sg->len))
+ return n < 0 ? n : -EINVAL;
+
+ reg_wr.wr.next = NULL;
+ reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg_wr.wr.wr_id = RDS_IW_REG_WR_ID;
+ reg_wr.wr.num_sge = 0;
+ reg_wr.mr = ibmr->mr;
+ reg_wr.key = mapping->m_rkey;
+ reg_wr.access = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
/*
- * Perform a WR for the fast_reg_mr. Each individual page
+ * Perform a WR for the reg_mr. Each individual page
* in the sg list is added to the fast reg page list and placed
- * inside the fast_reg_mr WR. The key used is a rolling 8bit
+ * inside the reg_mr WR. The key used is a rolling 8bit
* counter, which should guarantee uniqueness.
*/
ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
mapping->m_rkey = ibmr->mr->rkey;
- memset(&f_wr, 0, sizeof(f_wr));
- f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
- f_wr.opcode = IB_WR_FAST_REG_MR;
- f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
- f_wr.wr.fast_reg.rkey = mapping->m_rkey;
- f_wr.wr.fast_reg.page_list = ibmr->page_list;
- f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
- f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE;
- f_wr.wr.fast_reg.iova_start = 0;
- f_wr.send_flags = IB_SEND_SIGNALED;
-
- failed_wr = &f_wr;
- ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
- BUG_ON(failed_wr != &f_wr);
+ failed_wr = &reg_wr.wr;
+ ret = ib_post_send(ibmr->cm_id->qp, &reg_wr.wr, &failed_wr);
+ BUG_ON(failed_wr != &reg_wr.wr);
if (ret)
printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
__func__, __LINE__, ret);
@@ -755,21 +723,20 @@ out:
return ret;
}
-static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr,
- struct scatterlist *sg,
- unsigned int sg_len)
+static int rds_iw_map_reg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr,
+ struct scatterlist *sg,
+ unsigned int sg_len)
{
struct rds_iw_device *rds_iwdev = pool->device;
struct rds_iw_mapping *mapping = &ibmr->mapping;
u64 *dma_pages;
- int i, ret = 0;
+ int ret = 0;
rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
- dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
- if (IS_ERR(dma_pages)) {
- ret = PTR_ERR(dma_pages);
+ ret = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
+ if (ret) {
dma_pages = NULL;
goto out;
}
@@ -779,10 +746,7 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
goto out;
}
- for (i = 0; i < mapping->m_sg.dma_npages; ++i)
- ibmr->page_list->page_list[i] = dma_pages[i];
-
- ret = rds_iw_rdma_build_fastreg(mapping);
+ ret = rds_iw_rdma_reg_mr(mapping);
if (ret)
goto out;
@@ -868,8 +832,6 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
struct rds_iw_mr *ibmr)
{
- if (ibmr->page_list)
- ib_free_fast_reg_page_list(ibmr->page_list);
if (ibmr->mr)
ib_dereg_mr(ibmr->mr);
}
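ib_map_mr_sg_zbva() above replaces the hand-built fast-reg page list: it walks the scatterlist, loads the MR's internal page table with a zero-based virtual address (hence "zbva", matching the old code's iova_start of 0), and returns how many SG entries it consumed, so anything short of sg_nents is treated as failure. A minimal sketch of that idiom, with a hypothetical wrapper name:

	static int example_map_mr(struct ib_mr *mr, struct scatterlist *sg,
				  int sg_nents)
	{
		int n = ib_map_mr_sg_zbva(mr, sg, sg_nents, PAGE_SIZE);

		/* short or negative return means the mapping failed */
		if (unlikely(n != sg_nents))
			return n < 0 ? n : -EINVAL;
		return 0;
	}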
diff --git a/kernel/net/rds/iw_send.c b/kernel/net/rds/iw_send.c
index 13834780a..e20bd503f 100644
--- a/kernel/net/rds/iw_send.c
+++ b/kernel/net/rds/iw_send.c
@@ -137,13 +137,13 @@ void rds_iw_send_init_ring(struct rds_iw_connection *ic)
send->s_op = NULL;
send->s_mapping = NULL;
- send->s_wr.next = NULL;
- send->s_wr.wr_id = i;
- send->s_wr.sg_list = send->s_sge;
- send->s_wr.num_sge = 1;
- send->s_wr.opcode = IB_WR_SEND;
- send->s_wr.send_flags = 0;
- send->s_wr.ex.imm_data = 0;
+ send->s_send_wr.next = NULL;
+ send->s_send_wr.wr_id = i;
+ send->s_send_wr.sg_list = send->s_sge;
+ send->s_send_wr.num_sge = 1;
+ send->s_send_wr.opcode = IB_WR_SEND;
+ send->s_send_wr.send_flags = 0;
+ send->s_send_wr.ex.imm_data = 0;
sge = rds_iw_data_sge(ic, send->s_sge);
sge->lkey = 0;
@@ -153,16 +153,10 @@ void rds_iw_send_init_ring(struct rds_iw_connection *ic)
sge->length = sizeof(struct rds_header);
sge->lkey = 0;
- send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
+ send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG,
+ fastreg_message_size);
if (IS_ERR(send->s_mr)) {
- printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
- break;
- }
-
- send->s_page_list = ib_alloc_fast_reg_page_list(
- ic->i_cm_id->device, fastreg_message_size);
- if (IS_ERR(send->s_page_list)) {
- printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
+ printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n");
break;
}
}
@@ -176,9 +170,7 @@ void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
BUG_ON(!send->s_mr);
ib_dereg_mr(send->s_mr);
- BUG_ON(!send->s_page_list);
- ib_free_fast_reg_page_list(send->s_page_list);
- if (send->s_wr.opcode == 0xdead)
+ if (send->s_send_wr.opcode == 0xdead)
continue;
if (send->s_rm)
rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
@@ -226,7 +218,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
continue;
}
- if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
+ if (wc.opcode == IB_WC_REG_MR && wc.wr_id == RDS_IW_REG_WR_ID) {
ic->i_fastreg_posted = 1;
continue;
}
@@ -246,12 +238,12 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
send = &ic->i_sends[oldest];
/* In the error case, wc.opcode sometimes contains garbage */
- switch (send->s_wr.opcode) {
+ switch (send->s_send_wr.opcode) {
case IB_WR_SEND:
if (send->s_rm)
rds_iw_send_unmap_rm(ic, send, wc.status);
break;
- case IB_WR_FAST_REG_MR:
+ case IB_WR_REG_MR:
case IB_WR_RDMA_WRITE:
case IB_WR_RDMA_READ:
case IB_WR_RDMA_READ_WITH_INV:
@@ -261,12 +253,12 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
default:
printk_ratelimited(KERN_NOTICE
"RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
- __func__, send->s_wr.opcode);
+ __func__, send->s_send_wr.opcode);
break;
}
- send->s_wr.opcode = 0xdead;
- send->s_wr.num_sge = 1;
+ send->s_send_wr.opcode = 0xdead;
+ send->s_send_wr.num_sge = 1;
if (time_after(jiffies, send->s_queued + HZ/2))
rds_iw_stats_inc(s_iw_tx_stalled);
@@ -454,10 +446,10 @@ rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
WARN_ON(pos != send - ic->i_sends);
- send->s_wr.send_flags = send_flags;
- send->s_wr.opcode = IB_WR_SEND;
- send->s_wr.num_sge = 2;
- send->s_wr.next = NULL;
+ send->s_send_wr.send_flags = send_flags;
+ send->s_send_wr.opcode = IB_WR_SEND;
+ send->s_send_wr.num_sge = 2;
+ send->s_send_wr.next = NULL;
send->s_queued = jiffies;
send->s_op = NULL;
@@ -471,7 +463,7 @@ rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
} else {
/* We're sending a packet with no payload. There is only
* one SGE */
- send->s_wr.num_sge = 1;
+ send->s_send_wr.num_sge = 1;
sge = &send->s_sge[0];
}
@@ -581,6 +573,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
rds_message_addref(rm);
+ rm->data.op_dmasg = 0;
+ rm->data.op_dmaoff = 0;
ic->i_rm = rm;
/* Finalize the header */
@@ -622,7 +616,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
send = &ic->i_sends[pos];
first = send;
prev = NULL;
- scat = &rm->data.op_sg[sg];
+ scat = &rm->data.op_sg[rm->data.op_dmasg];
sent = 0;
i = 0;
@@ -656,10 +650,11 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
send = &ic->i_sends[pos];
- len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ len = min(RDS_FRAG_SIZE,
+ ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff);
rds_iw_xmit_populate_wr(ic, send, pos,
- ib_sg_dma_address(dev, scat) + off, len,
- send_flags);
+ ib_sg_dma_address(dev, scat) + rm->data.op_dmaoff, len,
+ send_flags);
/*
* We want to delay signaling completions just enough to get
@@ -668,29 +663,30 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
*/
if (ic->i_unsignaled_wrs-- == 0) {
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
- send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}
ic->i_unsignaled_bytes -= len;
if (ic->i_unsignaled_bytes <= 0) {
ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
- send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}
/*
* Always signal the last one if we're stopping due to flow control.
*/
if (flow_controlled && i == (work_alloc-1))
- send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
rdsdebug("send %p wr %p num_sge %u next %p\n", send,
- &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+ &send->s_send_wr, send->s_send_wr.num_sge, send->s_send_wr.next);
sent += len;
- off += len;
- if (off == ib_sg_dma_len(dev, scat)) {
+ rm->data.op_dmaoff += len;
+ if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) {
scat++;
- off = 0;
+ rm->data.op_dmaoff = 0;
+ rm->data.op_dmasg++;
}
add_header:
@@ -717,7 +713,7 @@ add_header:
}
if (prev)
- prev->s_wr.next = &send->s_wr;
+ prev->s_send_wr.next = &send->s_send_wr;
prev = send;
pos = (pos + 1) % ic->i_send_ring.w_nr;
@@ -731,7 +727,7 @@ add_header:
/* if we finished the message then send completion owns it */
if (scat == &rm->data.op_sg[rm->data.op_count]) {
prev->s_rm = ic->i_rm;
- prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ prev->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
ic->i_rm = NULL;
}
@@ -743,11 +739,11 @@ add_header:
rds_iw_send_add_credits(conn, credit_alloc - i);
/* XXX need to worry about failed_wr and partial sends. */
- failed_wr = &first->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ failed_wr = &first->s_send_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_send_wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
- first, &first->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &first->s_wr);
+ first, &first->s_send_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_send_wr);
if (ret) {
printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
@@ -765,24 +761,26 @@ out:
return ret;
}
-static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
+static int rds_iw_build_send_reg(struct rds_iw_send_work *send,
+ struct scatterlist *sg,
+ int sg_nents)
{
- BUG_ON(nent > send->s_page_list->max_page_list_len);
- /*
- * Perform a WR for the fast_reg_mr. Each individual page
- * in the sg list is added to the fast reg page list and placed
- * inside the fast_reg_mr WR.
- */
- send->s_wr.opcode = IB_WR_FAST_REG_MR;
- send->s_wr.wr.fast_reg.length = len;
- send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
- send->s_wr.wr.fast_reg.page_list = send->s_page_list;
- send->s_wr.wr.fast_reg.page_list_len = nent;
- send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
- send->s_wr.wr.fast_reg.iova_start = sg_addr;
+ int n;
+
+ n = ib_map_mr_sg(send->s_mr, sg, sg_nents, PAGE_SIZE);
+ if (unlikely(n != sg_nents))
+ return n < 0 ? n : -EINVAL;
+
+ send->s_reg_wr.wr.opcode = IB_WR_REG_MR;
+ send->s_reg_wr.wr.wr_id = 0;
+ send->s_reg_wr.wr.num_sge = 0;
+ send->s_reg_wr.mr = send->s_mr;
+ send->s_reg_wr.key = send->s_mr->rkey;
+ send->s_reg_wr.access = IB_ACCESS_REMOTE_WRITE;
ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
+
+ return 0;
}
int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
@@ -803,6 +801,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
int sent;
int ret;
int num_sge;
+ int sg_nents;
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
@@ -856,9 +855,10 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
scat = &op->op_sg[0];
sent = 0;
num_sge = op->op_count;
+ sg_nents = 0;
for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
- send->s_wr.send_flags = 0;
+ send->s_rdma_wr.wr.send_flags = 0;
send->s_queued = jiffies;
/*
@@ -867,7 +867,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
*/
if (ic->i_unsignaled_wrs-- == 0) {
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
- send->s_wr.send_flags = IB_SEND_SIGNALED;
+ send->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED;
}
/* To avoid the need to have the plumbing to invalidate the fastreg_mr used
@@ -875,30 +875,31 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
* IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
*/
if (op->op_write)
- send->s_wr.opcode = IB_WR_RDMA_WRITE;
+ send->s_rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
else
- send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ send->s_rdma_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
- send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->op_rkey;
+ send->s_rdma_wr.remote_addr = remote_addr;
+ send->s_rdma_wr.rkey = op->op_rkey;
send->s_op = op;
if (num_sge > rds_iwdev->max_sge) {
- send->s_wr.num_sge = rds_iwdev->max_sge;
+ send->s_rdma_wr.wr.num_sge = rds_iwdev->max_sge;
num_sge -= rds_iwdev->max_sge;
} else
- send->s_wr.num_sge = num_sge;
+ send->s_rdma_wr.wr.num_sge = num_sge;
- send->s_wr.next = NULL;
+ send->s_rdma_wr.wr.next = NULL;
if (prev)
- prev->s_wr.next = &send->s_wr;
+ prev->s_send_wr.next = &send->s_rdma_wr.wr;
- for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+ for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
+ scat != &op->op_sg[op->op_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
- if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
- send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
+ if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV)
+ sg_nents++;
else {
send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
send->s_sge[j].length = len;
@@ -912,15 +913,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
scat++;
}
- if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
- send->s_wr.num_sge = 1;
+ if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
+ send->s_rdma_wr.wr.num_sge = 1;
send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
}
rdsdebug("send %p wr %p num_sge %u next %p\n", send,
- &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+ &send->s_rdma_wr,
+ send->s_rdma_wr.wr.num_sge,
+ send->s_rdma_wr.wr.next);
prev = send;
if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
@@ -929,7 +932,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
/* if we finished the message then send completion owns it */
if (scat == &op->op_sg[op->op_count])
- first->s_wr.send_flags = IB_SEND_SIGNALED;
+ first->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED;
if (i < work_alloc) {
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
@@ -943,16 +946,20 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
* fastreg_mr (or possibly a dma_mr)
*/
if (!op->op_write) {
- rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
- op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+ ret = rds_iw_build_send_reg(&ic->i_sends[fr_pos],
+ &op->op_sg[0], sg_nents);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IW: failed to reg send mem\n");
+ goto out;
+ }
work_alloc++;
}
- failed_wr = &first->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ failed_wr = &first->s_rdma_wr.wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
- first, &first->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &first->s_wr);
+ first, &first->s_rdma_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_rdma_wr.wr);
if (ret) {
printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
diff --git a/kernel/net/rds/rdma.c b/kernel/net/rds/rdma.c
index 40084d843..4c93badea 100644
--- a/kernel/net/rds/rdma.c
+++ b/kernel/net/rds/rdma.c
@@ -435,9 +435,10 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
/* If the MR was marked as invalidate, this will
* trigger an async flush. */
- if (zot_me)
+ if (zot_me) {
rds_destroy_mr(mr);
- rds_mr_put(mr);
+ rds_mr_put(mr);
+ }
}
void rds_rdma_free_op(struct rm_rdma_op *ro)
@@ -451,7 +452,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)
* is the case for a RDMA_READ which copies from remote
* to local memory */
if (!ro->op_write) {
- BUG_ON(irqs_disabled());
+ WARN_ON(!page->mapping && irqs_disabled());
set_page_dirty(page);
}
put_page(page);
@@ -658,6 +659,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
if (ret < 0)
goto out;
+ else
+ ret = 0;
rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
nr_bytes, nr, iov->bytes, iov->addr);
diff --git a/kernel/net/rds/rdma_transport.c b/kernel/net/rds/rdma_transport.c
index 6cd9d1dea..9c1fed81b 100644
--- a/kernel/net/rds/rdma_transport.c
+++ b/kernel/net/rds/rdma_transport.c
@@ -34,37 +34,10 @@
#include <rdma/rdma_cm.h>
#include "rdma_transport.h"
+#include "ib.h"
static struct rdma_cm_id *rds_rdma_listen_id;
-static char *rds_cm_event_strings[] = {
-#define RDS_CM_EVENT_STRING(foo) \
- [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
- RDS_CM_EVENT_STRING(ADDR_RESOLVED),
- RDS_CM_EVENT_STRING(ADDR_ERROR),
- RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
- RDS_CM_EVENT_STRING(ROUTE_ERROR),
- RDS_CM_EVENT_STRING(CONNECT_REQUEST),
- RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
- RDS_CM_EVENT_STRING(CONNECT_ERROR),
- RDS_CM_EVENT_STRING(UNREACHABLE),
- RDS_CM_EVENT_STRING(REJECTED),
- RDS_CM_EVENT_STRING(ESTABLISHED),
- RDS_CM_EVENT_STRING(DISCONNECTED),
- RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
- RDS_CM_EVENT_STRING(MULTICAST_JOIN),
- RDS_CM_EVENT_STRING(MULTICAST_ERROR),
- RDS_CM_EVENT_STRING(ADDR_CHANGE),
- RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
-#undef RDS_CM_EVENT_STRING
-};
-
-static char *rds_cm_event_str(enum rdma_cm_event_type type)
-{
- return rds_str_array(rds_cm_event_strings,
- ARRAY_SIZE(rds_cm_event_strings), type);
-};
-
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
@@ -74,7 +47,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
int ret = 0;
rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
- event->event, rds_cm_event_str(event->event));
+ event->event, rdma_event_msg(event->event));
if (cm_id->device->node_type == RDMA_NODE_RNIC)
trans = &rds_iw_transport;
@@ -110,8 +83,18 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
break;
case RDMA_CM_EVENT_ROUTE_RESOLVED:
- /* XXX worry about racing with listen acceptance */
- ret = trans->cm_initiate_connect(cm_id);
+ /* Connection could have been dropped so make sure the
+ * cm_id is valid before proceeding
+ */
+ if (conn) {
+ struct rds_ib_connection *ibic;
+
+ ibic = conn->c_transport_data;
+ if (ibic && ibic->i_cm_id == cm_id)
+ ret = trans->cm_initiate_connect(cm_id);
+ else
+ rds_conn_drop(conn);
+ }
break;
case RDMA_CM_EVENT_ESTABLISHED:
@@ -139,7 +122,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
default:
/* things like device disconnect? */
printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
- event->event, rds_cm_event_str(event->event));
+ event->event, rdma_event_msg(event->event));
break;
}
@@ -148,7 +131,7 @@ out:
mutex_unlock(&conn->c_cm_lock);
rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
- rds_cm_event_str(event->event), ret);
+ rdma_event_msg(event->event), ret);
return ret;
}
@@ -159,8 +142,8 @@ static int rds_rdma_listen_init(void)
struct rdma_cm_id *cm_id;
int ret;
- cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
- IB_QPT_RC);
+ cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
diff --git a/kernel/net/rds/rds.h b/kernel/net/rds/rds.h
index 0d41155a2..0e2797bdc 100644
--- a/kernel/net/rds/rds.h
+++ b/kernel/net/rds/rds.h
@@ -7,6 +7,7 @@
#include <rdma/rdma_cm.h>
#include <linux/mutex.h>
#include <linux/rds.h>
+#include <linux/rhashtable.h>
#include "info.h"
@@ -80,12 +81,15 @@ enum {
#define RDS_LL_SEND_FULL 0
#define RDS_RECONNECT_PENDING 1
#define RDS_IN_XMIT 2
+#define RDS_RECV_REFILL 3
struct rds_connection {
struct hlist_node c_hash_node;
__be32 c_laddr;
__be32 c_faddr;
- unsigned int c_loopback:1;
+ unsigned int c_loopback:1,
+ c_outgoing:1,
+ c_pad_to_32:30;
struct rds_connection *c_passive;
struct rds_cong_map *c_lcong;
@@ -128,8 +132,21 @@ struct rds_connection {
/* Protocol version */
unsigned int c_version;
+ possible_net_t c_net;
};
+static inline
+struct net *rds_conn_net(struct rds_connection *conn)
+{
+ return read_pnet(&conn->c_net);
+}
+
+static inline
+void rds_conn_net_set(struct rds_connection *conn, struct net *net)
+{
+ write_pnet(&conn->c_net, net);
+}
+
#define RDS_FLAG_CONG_BITMAP 0x01
#define RDS_FLAG_ACK_REQUIRED 0x02
#define RDS_FLAG_RETRANSMITTED 0x04
@@ -363,6 +380,8 @@ struct rds_message {
unsigned int op_active:1;
unsigned int op_nents;
unsigned int op_count;
+ unsigned int op_dmasg;
+ unsigned int op_dmaoff;
struct scatterlist *op_sg;
} data;
};
@@ -408,11 +427,6 @@ struct rds_notifier {
* should try hard not to block.
*/
-#define RDS_TRANS_IB 0
-#define RDS_TRANS_IWARP 1
-#define RDS_TRANS_TCP 2
-#define RDS_TRANS_COUNT 3
-
struct rds_transport {
char t_name[TRANSNAMSIZ];
struct list_head t_item;
@@ -420,7 +434,7 @@ struct rds_transport {
unsigned int t_prefer_loopback:1;
unsigned int t_type;
- int (*laddr_check)(__be32 addr);
+ int (*laddr_check)(struct net *net, __be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
int (*conn_connect)(struct rds_connection *conn);
@@ -461,7 +475,8 @@ struct rds_sock {
* bound_addr used for both incoming and outgoing, no INADDR_ANY
* support.
*/
- struct hlist_node rs_bound_node;
+ struct rhash_head rs_bound_node;
+ u64 rs_bound_key;
__be32 rs_bound_addr;
__be32 rs_conn_addr;
__be16 rs_bound_port;
@@ -575,7 +590,6 @@ struct rds_statistics {
};
/* af_rds.c */
-char *rds_str_array(char **array, size_t elements, size_t index);
void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs);
@@ -593,6 +607,8 @@ extern wait_queue_head_t rds_poll_waitq;
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
void rds_remove_bound(struct rds_sock *rs);
struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+int rds_bind_lock_init(void);
+void rds_bind_lock_destroy(void);
/* cong.c */
int rds_cong_get_maps(struct rds_connection *conn);
@@ -612,9 +628,11 @@ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
int rds_conn_init(void);
void rds_conn_exit(void);
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn);
@@ -799,10 +817,11 @@ void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
-struct rds_transport *rds_trans_get_preferred(__be32 addr);
+struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
+struct rds_transport *rds_trans_get(int t_type);
int rds_trans_init(void);
void rds_trans_exit(void);
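c_net above is a possible_net_t, so the accessors compile down to plain pointer reads and writes when CONFIG_NET_NS is enabled and collapse to &init_net when it is not; callers should always go through rds_conn_net()/rds_conn_net_set(). A hedged usage sketch (example_conn_laddr_check() is a hypothetical helper, not part of this patch):

	static int example_conn_laddr_check(struct rds_connection *conn)
	{
		/* read_pnet() under the hood; &init_net if netns is compiled out */
		struct net *net = rds_conn_net(conn);

		return conn->c_trans->laddr_check(net, conn->c_laddr);
	}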
diff --git a/kernel/net/rds/send.c b/kernel/net/rds/send.c
index e9430f537..c9cdb358e 100644
--- a/kernel/net/rds/send.c
+++ b/kernel/net/rds/send.c
@@ -38,6 +38,7 @@
#include <linux/list.h>
#include <linux/ratelimit.h>
#include <linux/export.h>
+#include <linux/sizes.h>
#include "rds.h"
@@ -51,7 +52,7 @@
* it to 0 will restore the old behavior (where we looped until we had
* drained the queue).
*/
-static int send_batch_count = 64;
+static int send_batch_count = SZ_1K;
module_param(send_batch_count, int, 0444);
MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
@@ -223,7 +224,7 @@ restart:
* through a lot of messages, lets back off and see
* if anyone else jumps in
*/
- if (batch_count >= 1024)
+ if (batch_count >= send_batch_count)
goto over_batch;
spin_lock_irqsave(&conn->c_lock, flags);
@@ -282,26 +283,34 @@ restart:
/* The transport either sends the whole rdma or none of it */
if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
rm->m_final_op = &rm->rdma;
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue
+ */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
- if (ret)
+ if (ret) {
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ wake_up_interruptible(&rm->m_flush_wait);
break;
+ }
conn->c_xmit_rdma_sent = 1;
- /* The transport owns the mapped memory for now.
- * You can't unmap it while it's on the send queue */
- set_bit(RDS_MSG_MAPPED, &rm->m_flags);
}
if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
rm->m_final_op = &rm->atomic;
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue
+ */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
- if (ret)
+ if (ret) {
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ wake_up_interruptible(&rm->m_flush_wait);
break;
+ }
conn->c_xmit_atomic_sent = 1;
- /* The transport owns the mapped memory for now.
- * You can't unmap it while it's on the send queue */
- set_bit(RDS_MSG_MAPPED, &rm->m_flags);
}
/*
@@ -411,15 +420,19 @@ over_batch:
*/
if (ret == 0) {
smp_mb();
- if (!list_empty(&conn->c_send_queue) &&
+ if ((test_bit(0, &conn->c_map_queued) ||
+ !list_empty(&conn->c_send_queue)) &&
send_gen == conn->c_send_gen) {
rds_stats_inc(s_send_lock_queue_raced);
- goto restart;
+ if (batch_count < send_batch_count)
+ goto restart;
+ queue_delayed_work(rds_wq, &conn->c_send_w, 1);
}
}
out:
return ret;
}
+EXPORT_SYMBOL_GPL(rds_send_xmit);
static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
{
@@ -769,8 +782,22 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
while (!list_empty(&list)) {
rm = list_entry(list.next, struct rds_message, m_sock_item);
list_del_init(&rm->m_sock_item);
-
rds_message_wait(rm);
+
+ /* just in case the code above skipped this message
+ * because RDS_MSG_ON_CONN wasn't set, run it again here.
+ * Taking m_rs_lock is the only thing that keeps us
+ * from racing with ack processing.
+ */
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+ spin_lock(&rs->rs_lock);
+ __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+ spin_unlock(&rs->rs_lock);
+
+ rm->m_rs = NULL;
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
rds_message_put(rm);
}
}
@@ -986,11 +1013,18 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
release_sock(sk);
}
- /* racing with another thread binding seems ok here */
+ lock_sock(sk);
if (daddr == 0 || rs->rs_bound_addr == 0) {
+ release_sock(sk);
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
+ release_sock(sk);
+
+ if (payload_len > rds_sk_sndbuf(rs)) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
/* size of rm including all sgs */
ret = rds_rm_size(msg, payload_len);
@@ -1023,7 +1057,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
conn = rs->rs_conn;
else {
- conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
+ conn = rds_conn_create_outgoing(sock_net(sock->sk),
+ rs->rs_bound_addr, daddr,
rs->rs_transport,
sock->sk->sk_allocation);
if (IS_ERR(conn)) {
@@ -1063,11 +1098,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
dport, &queued)) {
rds_stats_inc(s_send_queue_full);
- /* XXX make sure this is reasonable */
- if (payload_len > rds_sk_sndbuf(rs)) {
- ret = -EMSGSIZE;
- goto out;
- }
+
if (nonblock) {
ret = -EAGAIN;
goto out;
@@ -1095,8 +1126,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
*/
rds_stats_inc(s_send_queued);
- if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- rds_send_xmit(conn);
+ ret = rds_send_xmit(conn);
+ if (ret == -ENOMEM || ret == -EAGAIN)
+ queue_delayed_work(rds_wq, &conn->c_send_w, 1);
rds_message_put(rm);
return payload_len;
@@ -1152,8 +1184,8 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
rds_stats_inc(s_send_queued);
rds_stats_inc(s_send_pong);
- if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ /* schedule the send work on rds_wq */
+ queue_delayed_work(rds_wq, &conn->c_send_w, 1);
rds_message_put(rm);
return 0;
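Setting RDS_MSG_MAPPED before calling into the transport, and clearing it plus waking m_flush_wait when the transport refuses the op, matters because rds_message_wait() (used by rds_send_drop_to() above) sleeps until that bit clears; without the new error path a failed xmit could leave a waiter stuck. A sketch of the waiter this pairs with, assuming the usual definition in message.c:

	void rds_message_wait(struct rds_message *rm)
	{
		wait_event_interruptible(rm->m_flush_wait,
					 !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
	}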
diff --git a/kernel/net/rds/tcp.c b/kernel/net/rds/tcp.c
index edac9ef2b..9d6ddbacd 100644
--- a/kernel/net/rds/tcp.c
+++ b/kernel/net/rds/tcp.c
@@ -35,6 +35,9 @@
#include <linux/in.h>
#include <linux/module.h>
#include <net/tcp.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/tcp.h>
#include "rds.h"
#include "tcp.h"
@@ -64,21 +67,13 @@ void rds_tcp_nonagle(struct socket *sock)
set_fs(oldfs);
}
+/* All module specific customizations to the RDS-TCP socket should be done in
+ * rds_tcp_tune() and applied after socket creation. In general these
+ * customizations should be tunable via module_param()
+ */
void rds_tcp_tune(struct socket *sock)
{
- struct sock *sk = sock->sk;
-
rds_tcp_nonagle(sock);
-
- /*
- * We're trying to saturate gigabit with the default,
- * see svc_sock_setbufsize().
- */
- lock_sock(sk);
- sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
- sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
- release_sock(sk);
}
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
@@ -189,9 +184,9 @@ out:
spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}
-static int rds_tcp_laddr_check(__be32 addr)
+static int rds_tcp_laddr_check(struct net *net, __be32 addr)
{
- if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
+ if (inet_addr_type(net, addr) == RTN_LOCAL)
return 0;
return -EADDRNOTAVAIL;
}
@@ -250,16 +245,7 @@ static void rds_tcp_destroy_conns(void)
}
}
-static void rds_tcp_exit(void)
-{
- rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
- rds_tcp_listen_stop();
- rds_tcp_destroy_conns();
- rds_trans_unregister(&rds_tcp_transport);
- rds_tcp_recv_exit();
- kmem_cache_destroy(rds_tcp_conn_slab);
-}
-module_exit(rds_tcp_exit);
+static void rds_tcp_exit(void);
struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check,
@@ -281,6 +267,136 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback = 1,
};
+static int rds_tcp_netid;
+
+/* per-network namespace private data for this module */
+struct rds_tcp_net {
+ struct socket *rds_tcp_listen_sock;
+ struct work_struct rds_tcp_accept_w;
+};
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+ struct rds_tcp_net *rtn = container_of(work,
+ struct rds_tcp_net,
+ rds_tcp_accept_w);
+
+ while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
+ cond_resched();
+}
+
+void rds_tcp_accept_work(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ queue_work(rds_wq, &rtn->rds_tcp_accept_w);
+}
+
+static __net_init int rds_tcp_init_net(struct net *net)
+{
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
+ if (!rtn->rds_tcp_listen_sock) {
+ pr_warn("could not set up listen sock\n");
+ return -EAFNOSUPPORT;
+ }
+ INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
+ return 0;
+}
+
+static void __net_exit rds_tcp_exit_net(struct net *net)
+{
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ /* If rds_tcp_exit_net() is called as a result of netns deletion,
+ * the rds_tcp_kill_sock() device notifier would already have cleaned
+ * up the listen socket, thus there is no work to do in this function.
+ *
+ * If rds_tcp_exit_net() is called as a result of module unload,
+ * i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then
+ * we do need to clean up the listen socket here.
+ */
+ if (rtn->rds_tcp_listen_sock) {
+ rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+ rtn->rds_tcp_listen_sock = NULL;
+ flush_work(&rtn->rds_tcp_accept_w);
+ }
+}
+
+static struct pernet_operations rds_tcp_net_ops = {
+ .init = rds_tcp_init_net,
+ .exit = rds_tcp_exit_net,
+ .id = &rds_tcp_netid,
+ .size = sizeof(struct rds_tcp_net),
+};
+
+static void rds_tcp_kill_sock(struct net *net)
+{
+ struct rds_tcp_connection *tc, *_tc;
+ struct sock *sk;
+ LIST_HEAD(tmp_list);
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+ rtn->rds_tcp_listen_sock = NULL;
+ flush_work(&rtn->rds_tcp_accept_w);
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
+ struct net *c_net = read_pnet(&tc->conn->c_net);
+
+ if (net != c_net || !tc->t_sock)
+ continue;
+ list_move_tail(&tc->t_tcp_node, &tmp_list);
+ }
+ spin_unlock_irq(&rds_tcp_conn_lock);
+ list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
+ sk = tc->t_sock->sk;
+ sk->sk_prot->disconnect(sk, 0);
+ tcp_done(sk);
+ if (tc->conn->c_passive)
+ rds_conn_destroy(tc->conn->c_passive);
+ rds_conn_destroy(tc->conn);
+ }
+}
+
+static int rds_tcp_dev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ /* rds-tcp registers as a pernet subsys, so the ->exit will only
+ * get invoked after network activity has quiesced. We need to
+ * clean up all sockets to quiesce network activity, and use
+ * the unregistration of the per-net loopback device as a trigger
+ * to start that cleanup.
+ */
+ if (event == NETDEV_UNREGISTER_FINAL &&
+ dev->ifindex == LOOPBACK_IFINDEX)
+ rds_tcp_kill_sock(dev_net(dev));
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rds_tcp_dev_notifier = {
+ .notifier_call = rds_tcp_dev_event,
+ .priority = -10, /* must be called after other network notifiers */
+};
+
+static void rds_tcp_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+ unregister_pernet_subsys(&rds_tcp_net_ops);
+ if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
+ pr_warn("could not unregister rds_tcp_dev_notifier\n");
+ rds_tcp_destroy_conns();
+ rds_trans_unregister(&rds_tcp_transport);
+ rds_tcp_recv_exit();
+ kmem_cache_destroy(rds_tcp_conn_slab);
+}
+module_exit(rds_tcp_exit);
+
static int rds_tcp_init(void)
{
int ret;
@@ -293,6 +409,16 @@ static int rds_tcp_init(void)
goto out;
}
+ ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
+ if (ret) {
+ pr_warn("could not register rds_tcp_dev_notifier\n");
+ goto out;
+ }
+
+ ret = register_pernet_subsys(&rds_tcp_net_ops);
+ if (ret)
+ goto out_slab;
+
ret = rds_tcp_recv_init();
if (ret)
goto out_slab;
@@ -301,19 +427,14 @@ static int rds_tcp_init(void)
if (ret)
goto out_recv;
- ret = rds_tcp_listen_init();
- if (ret)
- goto out_register;
-
rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
goto out;
-out_register:
- rds_trans_unregister(&rds_tcp_transport);
out_recv:
rds_tcp_recv_exit();
out_slab:
+ unregister_pernet_subsys(&rds_tcp_net_ops);
kmem_cache_destroy(rds_tcp_conn_slab);
out:
return ret;
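The bulk of this hunk wires rds-tcp into the per-network-namespace machinery: a pernet_operations with an .id/.size pair so the core allocates per-netns storage, net_generic() to look that storage up, and register_pernet_subsys()/unregister_pernet_subsys() in module init/exit. A stripped-down sketch of the same pattern, with illustrative names only (not the RDS code), is:

```c
/*
 * Minimal pernet_operations sketch: .size makes the core allocate zeroed
 * per-namespace storage, .id records the slot so net_generic() can find it.
 */
#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static int example_netid;

struct example_net {
	int counter;			/* per-namespace private state */
};

static __net_init int example_init_net(struct net *net)
{
	struct example_net *en = net_generic(net, example_netid);

	en->counter = 0;		/* storage was kzalloc'ed by the core */
	return 0;
}

static void __net_exit example_exit_net(struct net *net)
{
	/* release any per-namespace resources (sockets, workers, ...) here */
}

static struct pernet_operations example_net_ops = {
	.init = example_init_net,
	.exit = example_exit_net,
	.id   = &example_netid,
	.size = sizeof(struct example_net),
};

static int __init example_init(void)
{
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
```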
diff --git a/kernel/net/rds/tcp.h b/kernel/net/rds/tcp.h
index 0dbdd3716..64f873c0c 100644
--- a/kernel/net/rds/tcp.h
+++ b/kernel/net/rds/tcp.h
@@ -52,6 +52,7 @@ u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
extern struct rds_transport rds_tcp_transport;
+void rds_tcp_accept_work(struct sock *sk);
/* tcp_connect.c */
int rds_tcp_conn_connect(struct rds_connection *conn);
@@ -59,9 +60,11 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
-int rds_tcp_listen_init(void);
-void rds_tcp_listen_stop(void);
+struct socket *rds_tcp_listen_init(struct net *);
+void rds_tcp_listen_stop(struct socket *);
void rds_tcp_listen_data_ready(struct sock *sk);
+int rds_tcp_accept_one(struct socket *sock);
+int rds_tcp_keepalive(struct socket *sock);
/* tcp_recv.c */
int rds_tcp_recv_init(void);
diff --git a/kernel/net/rds/tcp_connect.c b/kernel/net/rds/tcp_connect.c
index 973109c7b..5cb16875c 100644
--- a/kernel/net/rds/tcp_connect.c
+++ b/kernel/net/rds/tcp_connect.c
@@ -79,7 +79,8 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
struct sockaddr_in src, dest;
int ret;
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ ret = sock_create_kern(rds_conn_net(conn), PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0)
goto out;
@@ -111,10 +112,12 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
if (ret == -EINPROGRESS)
ret = 0;
- if (ret == 0)
+ if (ret == 0) {
+ rds_tcp_keepalive(sock);
sock = NULL;
- else
+ } else {
rds_tcp_restore_callbacks(sock, conn->c_transport_data);
+ }
out:
if (sock)
diff --git a/kernel/net/rds/tcp_listen.c b/kernel/net/rds/tcp_listen.c
index 0da49e344..0936a4a32 100644
--- a/kernel/net/rds/tcp_listen.c
+++ b/kernel/net/rds/tcp_listen.c
@@ -38,14 +38,7 @@
#include "rds.h"
#include "tcp.h"
-/*
- * cheesy, but simple..
- */
-static void rds_tcp_accept_worker(struct work_struct *work);
-static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
-static struct socket *rds_tcp_listen_sock;
-
-static int rds_tcp_keepalive(struct socket *sock)
+int rds_tcp_keepalive(struct socket *sock)
{
/* values below based on xs_udp_default_timeout */
int keepidle = 5; /* send a probe 'keepidle' secs after last data */
@@ -77,7 +70,7 @@ bail:
return ret;
}
-static int rds_tcp_accept_one(struct socket *sock)
+int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
struct rds_connection *conn;
@@ -85,8 +78,9 @@ static int rds_tcp_accept_one(struct socket *sock)
struct inet_sock *inet;
struct rds_tcp_connection *rs_tcp;
- ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
- sock->sk->sk_protocol, &new_sock);
+ ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
+ sock->sk->sk_type, sock->sk->sk_protocol,
+ &new_sock);
if (ret)
goto out;
@@ -108,35 +102,35 @@ static int rds_tcp_accept_one(struct socket *sock)
&inet->inet_saddr, ntohs(inet->inet_sport),
&inet->inet_daddr, ntohs(inet->inet_dport));
- conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
+ conn = rds_conn_create(sock_net(sock->sk),
+ inet->inet_saddr, inet->inet_daddr,
&rds_tcp_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
ret = PTR_ERR(conn);
goto out;
}
/* An incoming SYN request came in, and TCP just accepted it.
- * We always create a new conn for listen side of TCP, and do not
- * add it to the c_hash_list.
*
* If the client reboots, this conn will need to be cleaned up.
* rds_tcp_state_change() will do that cleanup
*/
rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
- WARN_ON(!rs_tcp || rs_tcp->t_sock);
-
- /*
- * see the comment above rds_queue_delayed_reconnect()
- */
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
- if (rds_conn_state(conn) == RDS_CONN_UP)
- rds_tcp_stats_inc(s_tcp_listen_closed_stale);
- else
- rds_tcp_stats_inc(s_tcp_connect_raced);
- rds_conn_drop(conn);
+ if (rs_tcp->t_sock &&
+ ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
+ struct sock *nsk = new_sock->sk;
+
+ nsk->sk_user_data = NULL;
+ nsk->sk_prot->disconnect(nsk, 0);
+ tcp_done(nsk);
+ new_sock = NULL;
ret = 0;
goto out;
+ } else if (rs_tcp->t_sock) {
+ rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
+ conn->c_outgoing = 0;
}
+ rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
rds_tcp_set_callbacks(new_sock, conn);
rds_connect_complete(conn);
new_sock = NULL;
@@ -148,12 +142,6 @@ out:
return ret;
}
-static void rds_tcp_accept_worker(struct work_struct *work)
-{
- while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
- cond_resched();
-}
-
void rds_tcp_listen_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
@@ -174,20 +162,20 @@ void rds_tcp_listen_data_ready(struct sock *sk)
* socket
*/
if (sk->sk_state == TCP_LISTEN)
- queue_work(rds_wq, &rds_tcp_listen_work);
+ rds_tcp_accept_work(sk);
out:
read_unlock(&sk->sk_callback_lock);
ready(sk);
}
-int rds_tcp_listen_init(void)
+struct socket *rds_tcp_listen_init(struct net *net)
{
struct sockaddr_in sin;
struct socket *sock = NULL;
int ret;
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0)
goto out;
@@ -211,17 +199,15 @@ int rds_tcp_listen_init(void)
if (ret < 0)
goto out;
- rds_tcp_listen_sock = sock;
- sock = NULL;
+ return sock;
out:
if (sock)
sock_release(sock);
- return ret;
+ return NULL;
}
-void rds_tcp_listen_stop(void)
+void rds_tcp_listen_stop(struct socket *sock)
{
- struct socket *sock = rds_tcp_listen_sock;
struct sock *sk;
if (!sock)
@@ -242,5 +228,4 @@ void rds_tcp_listen_stop(void)
/* wait for accepts to stop and close the socket */
flush_workqueue(rds_wq);
sock_release(sock);
- rds_tcp_listen_sock = NULL;
}
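rds_tcp_keepalive() is made non-static above so the active-connect path in tcp_connect.c can call it as well; only the 5-second keepidle value is visible in the hunk. As a rough user-space analogue of the same tuning (helper name, interval and probe count below are assumptions for the sake of a complete example):

```c
/* Enable TCP keepalive on a connected socket and shorten the probe timings. */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int enable_keepalive(int fd)
{
	int on = 1;
	int keepidle = 5;	/* start probing 5 s after the last data */
	int keepintvl = 5;	/* 5 s between probes */
	int keepcnt = 5;	/* give up after 5 unanswered probes */

	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
		return -1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &keepidle, sizeof(keepidle)) < 0)
		return -1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &keepintvl, sizeof(keepintvl)) < 0)
		return -1;
	return setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &keepcnt, sizeof(keepcnt));
}
```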
diff --git a/kernel/net/rds/tcp_recv.c b/kernel/net/rds/tcp_recv.c
index fbc5ef88b..27a992154 100644
--- a/kernel/net/rds/tcp_recv.c
+++ b/kernel/net/rds/tcp_recv.c
@@ -214,8 +214,15 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
}
to_copy = min(tc->t_tinc_data_rem, left);
- pskb_pull(clone, offset);
- pskb_trim(clone, to_copy);
+ if (!pskb_pull(clone, offset) ||
+ pskb_trim(clone, to_copy)) {
+ pr_warn("rds_tcp_data_recv: pull/trim failed "
+ "left %zu data_rem %zu skb_len %d\n",
+ left, tc->t_tinc_data_rem, skb->len);
+ kfree_skb(clone);
+ desc->error = -ENOMEM;
+ goto out;
+ }
skb_queue_tail(&tinc->ti_skb_list, clone);
rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
diff --git a/kernel/net/rds/tcp_send.c b/kernel/net/rds/tcp_send.c
index 53b17ca0d..2894e6095 100644
--- a/kernel/net/rds/tcp_send.c
+++ b/kernel/net/rds/tcp_send.c
@@ -83,6 +83,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
struct rds_tcp_connection *tc = conn->c_transport_data;
int done = 0;
int ret = 0;
+ int more;
if (hdr_off == 0) {
/*
@@ -116,12 +117,15 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
goto out;
}
+ more = rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0;
while (sg < rm->data.op_nents) {
+ int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
+
ret = tc->t_sock->ops->sendpage(tc->t_sock,
sg_page(&rm->data.op_sg[sg]),
rm->data.op_sg[sg].offset + off,
rm->data.op_sg[sg].length - off,
- MSG_DONTWAIT|MSG_NOSIGNAL);
+ flags);
rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
ret);
@@ -134,6 +138,8 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
off = 0;
sg++;
}
+ if (sg == rm->data.op_nents - 1)
+ more = 0;
}
out:
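The tcp_send.c hunk passes MSG_MORE (plus the kernel-internal MSG_SENDPAGE_NOTLAST) for every scatterlist fragment except the last, telling TCP that more data follows so it can coalesce the pieces into fewer segments. A user-space sketch of the same hint, with a hypothetical helper and no partial-write retry for brevity:

```c
/* Send several buffers, setting MSG_MORE on all but the last chunk. */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <stddef.h>

static ssize_t send_parts(int fd, const struct iovec *parts, size_t nparts)
{
	ssize_t total = 0;
	size_t i;

	for (i = 0; i < nparts; i++) {
		int flags = MSG_NOSIGNAL;
		ssize_t n;

		if (i + 1 < nparts)
			flags |= MSG_MORE;	/* more data follows; let TCP coalesce */

		n = send(fd, parts[i].iov_base, parts[i].iov_len, flags);
		if (n < 0)
			return -1;
		total += n;
	}
	return total;
}
```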
diff --git a/kernel/net/rds/threads.c b/kernel/net/rds/threads.c
index dc2402e87..454aa6d23 100644
--- a/kernel/net/rds/threads.c
+++ b/kernel/net/rds/threads.c
@@ -162,7 +162,9 @@ void rds_send_worker(struct work_struct *work)
int ret;
if (rds_conn_state(conn) == RDS_CONN_UP) {
+ clear_bit(RDS_LL_SEND_FULL, &conn->c_flags);
ret = rds_send_xmit(conn);
+ cond_resched();
rdsdebug("conn %p ret %d\n", conn, ret);
switch (ret) {
case -EAGAIN:
diff --git a/kernel/net/rds/transport.c b/kernel/net/rds/transport.c
index 7f2ac4fec..f3afd1d60 100644
--- a/kernel/net/rds/transport.c
+++ b/kernel/net/rds/transport.c
@@ -73,11 +73,11 @@ EXPORT_SYMBOL_GPL(rds_trans_unregister);
void rds_trans_put(struct rds_transport *trans)
{
- if (trans && trans->t_owner)
+ if (trans)
module_put(trans->t_owner);
}
-struct rds_transport *rds_trans_get_preferred(__be32 addr)
+struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr)
{
struct rds_transport *ret = NULL;
struct rds_transport *trans;
@@ -90,7 +90,28 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr)
for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
- if (trans && (trans->laddr_check(addr) == 0) &&
+ if (trans && (trans->laddr_check(net, addr) == 0) &&
+ (!trans->t_owner || try_module_get(trans->t_owner))) {
+ ret = trans;
+ break;
+ }
+ }
+ up_read(&rds_trans_sem);
+
+ return ret;
+}
+
+struct rds_transport *rds_trans_get(int t_type)
+{
+ struct rds_transport *ret = NULL;
+ struct rds_transport *trans;
+ unsigned int i;
+
+ down_read(&rds_trans_sem);
+ for (i = 0; i < RDS_TRANS_COUNT; i++) {
+ trans = transports[i];
+
+ if (trans && trans->t_type == t_type &&
(!trans->t_owner || try_module_get(trans->t_owner))) {
ret = trans;
break;
diff --git a/kernel/net/rfkill/Kconfig b/kernel/net/rfkill/Kconfig
index 4c10e7e6c..598d374f6 100644
--- a/kernel/net/rfkill/Kconfig
+++ b/kernel/net/rfkill/Kconfig
@@ -36,7 +36,8 @@ config RFKILL_REGULATOR
config RFKILL_GPIO
tristate "GPIO RFKILL driver"
- depends on RFKILL && GPIOLIB
+ depends on RFKILL
+ depends on GPIOLIB || COMPILE_TEST
default n
help
If you say yes here you get support of a generic gpio RFKILL
diff --git a/kernel/net/rfkill/core.c b/kernel/net/rfkill/core.c
index fa7cd7927..cf5b69ab1 100644
--- a/kernel/net/rfkill/core.c
+++ b/kernel/net/rfkill/core.c
@@ -49,7 +49,6 @@
struct rfkill {
spinlock_t lock;
- const char *name;
enum rfkill_type type;
unsigned long state;
@@ -73,6 +72,7 @@ struct rfkill {
struct delayed_work poll_work;
struct work_struct uevent_work;
struct work_struct sync_work;
+ char name[];
};
#define to_rfkill(d) container_of(d, struct rfkill, dev)
@@ -341,7 +341,15 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked)
{
struct rfkill *rfkill;
- rfkill_global_states[type].cur = blocked;
+ if (type == RFKILL_TYPE_ALL) {
+ int i;
+
+ for (i = 0; i < NUM_RFKILL_TYPES; i++)
+ rfkill_global_states[i].cur = blocked;
+ } else {
+ rfkill_global_states[type].cur = blocked;
+ }
+
list_for_each_entry(rfkill, &rfkill_list, node) {
if (rfkill->type != type && type != RFKILL_TYPE_ALL)
continue;
@@ -794,7 +802,8 @@ void rfkill_resume_polling(struct rfkill *rfkill)
}
EXPORT_SYMBOL(rfkill_resume_polling);
-static int rfkill_suspend(struct device *dev, pm_message_t state)
+#ifdef CONFIG_PM_SLEEP
+static int rfkill_suspend(struct device *dev)
{
struct rfkill *rfkill = to_rfkill(dev);
@@ -818,13 +827,18 @@ static int rfkill_resume(struct device *dev)
return 0;
}
+static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume);
+#define RFKILL_PM_OPS (&rfkill_pm_ops)
+#else
+#define RFKILL_PM_OPS NULL
+#endif
+
static struct class rfkill_class = {
.name = "rfkill",
.dev_release = rfkill_release,
.dev_groups = rfkill_dev_groups,
.dev_uevent = rfkill_dev_uevent,
- .suspend = rfkill_suspend,
- .resume = rfkill_resume,
+ .pm = RFKILL_PM_OPS,
};
bool rfkill_blocked(struct rfkill *rfkill)
@@ -862,14 +876,14 @@ struct rfkill * __must_check rfkill_alloc(const char *name,
if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES))
return NULL;
- rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL);
+ rfkill = kzalloc(sizeof(*rfkill) + strlen(name) + 1, GFP_KERNEL);
if (!rfkill)
return NULL;
spin_lock_init(&rfkill->lock);
INIT_LIST_HEAD(&rfkill->node);
rfkill->type = type;
- rfkill->name = name;
+ strcpy(rfkill->name, name);
rfkill->ops = ops;
rfkill->data = ops_data;
@@ -1081,17 +1095,6 @@ static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait)
return res;
}
-static bool rfkill_readable(struct rfkill_data *data)
-{
- bool r;
-
- mutex_lock(&data->mtx);
- r = !list_empty(&data->events);
- mutex_unlock(&data->mtx);
-
- return r;
-}
-
static ssize_t rfkill_fop_read(struct file *file, char __user *buf,
size_t count, loff_t *pos)
{
@@ -1108,8 +1111,11 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf,
goto out;
}
mutex_unlock(&data->mtx);
+ /* since we re-check and it just compares pointers,
+ * using !list_empty() without locking isn't a problem
+ */
ret = wait_event_interruptible(data->read_wait,
- rfkill_readable(data));
+ !list_empty(&data->events));
mutex_lock(&data->mtx);
if (ret)
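rfkill_alloc() now embeds the device name in the rfkill object as a flexible array member (char name[]) sized at allocation time, instead of keeping a pointer into the caller's string. The idiom itself, in a tiny user-space form with illustrative names:

```c
/* One allocation holds the struct and its string, so there is no separate
 * lifetime to track for the name.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct labeled {
	int type;
	char name[];		/* flexible array member; storage follows the struct */
};

static struct labeled *labeled_alloc(int type, const char *name)
{
	struct labeled *l = malloc(sizeof(*l) + strlen(name) + 1);

	if (!l)
		return NULL;
	l->type = type;
	strcpy(l->name, name);	/* copied, so the caller's buffer may go away */
	return l;
}

int main(void)
{
	struct labeled *l = labeled_alloc(1, "wlan0");

	if (l) {
		printf("%d: %s\n", l->type, l->name);
		free(l);
	}
	return 0;
}
```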
diff --git a/kernel/net/rfkill/rfkill-gpio.c b/kernel/net/rfkill/rfkill-gpio.c
index d978f2f46..93127220c 100644
--- a/kernel/net/rfkill/rfkill-gpio.c
+++ b/kernel/net/rfkill/rfkill-gpio.c
@@ -112,21 +112,17 @@ static int rfkill_gpio_probe(struct platform_device *pdev)
rfkill->clk = devm_clk_get(&pdev->dev, NULL);
- gpio = devm_gpiod_get(&pdev->dev, "reset");
- if (!IS_ERR(gpio)) {
- ret = gpiod_direction_output(gpio, 0);
- if (ret)
- return ret;
- rfkill->reset_gpio = gpio;
- }
+ gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_OUT_LOW);
+ if (IS_ERR(gpio))
+ return PTR_ERR(gpio);
- gpio = devm_gpiod_get(&pdev->dev, "shutdown");
- if (!IS_ERR(gpio)) {
- ret = gpiod_direction_output(gpio, 0);
- if (ret)
- return ret;
- rfkill->shutdown_gpio = gpio;
- }
+ rfkill->reset_gpio = gpio;
+
+ gpio = devm_gpiod_get_optional(&pdev->dev, "shutdown", GPIOD_OUT_LOW);
+ if (IS_ERR(gpio))
+ return PTR_ERR(gpio);
+
+ rfkill->shutdown_gpio = gpio;
/* Make sure at-least one of the GPIO is defined and that
* a name is specified for this instance
@@ -168,7 +164,6 @@ static int rfkill_gpio_remove(struct platform_device *pdev)
#ifdef CONFIG_ACPI
static const struct acpi_device_id rfkill_acpi_match[] = {
{ "BCM2E1A", RFKILL_TYPE_BLUETOOTH },
- { "BCM2E39", RFKILL_TYPE_BLUETOOTH },
{ "BCM2E3D", RFKILL_TYPE_BLUETOOTH },
{ "BCM2E40", RFKILL_TYPE_BLUETOOTH },
{ "BCM2E64", RFKILL_TYPE_BLUETOOTH },
diff --git a/kernel/net/rose/af_rose.c b/kernel/net/rose/af_rose.c
index 8ae603069..129d357d2 100644
--- a/kernel/net/rose/af_rose.c
+++ b/kernel/net/rose/af_rose.c
@@ -192,7 +192,8 @@ static void rose_kill_by_device(struct net_device *dev)
if (rose->device == dev) {
rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
- rose->neighbour->use--;
+ if (rose->neighbour)
+ rose->neighbour->use--;
rose->device = NULL;
}
}
@@ -520,7 +521,7 @@ static int rose_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_SEQPACKET || protocol != 0)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto);
+ sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern);
if (sk == NULL)
return -ENOMEM;
@@ -559,7 +560,7 @@ static struct sock *rose_make_new(struct sock *osk)
if (osk->sk_type != SOCK_SEQPACKET)
return NULL;
- sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto);
+ sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0);
if (sk == NULL)
return NULL;
diff --git a/kernel/net/rose/rose_link.c b/kernel/net/rose/rose_link.c
index e873d7d9f..c76638cc2 100644
--- a/kernel/net/rose/rose_link.c
+++ b/kernel/net/rose/rose_link.c
@@ -25,7 +25,6 @@
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
-#include <linux/netfilter.h>
#include <net/rose.h>
static void rose_ftimer_expiry(unsigned long);
diff --git a/kernel/net/rose/rose_route.c b/kernel/net/rose/rose_route.c
index 40148932c..0fc76d845 100644
--- a/kernel/net/rose/rose_route.c
+++ b/kernel/net/rose/rose_route.c
@@ -31,7 +31,6 @@
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
-#include <linux/netfilter.h>
#include <linux/init.h>
#include <net/rose.h>
#include <linux/seq_file.h>
diff --git a/kernel/net/rxrpc/af_rxrpc.c b/kernel/net/rxrpc/af_rxrpc.c
index 0095b9a0b..1f8a144a5 100644
--- a/kernel/net/rxrpc/af_rxrpc.c
+++ b/kernel/net/rxrpc/af_rxrpc.c
@@ -305,7 +305,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
if (!key)
key = rx->key;
- if (key && !key->payload.data)
+ if (key && !key->payload.data[0])
key = NULL; /* a no-security key */
bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp);
@@ -632,7 +632,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
sock->ops = &rxrpc_rpc_ops;
sock->state = SS_UNCONNECTED;
- sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto);
+ sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern);
if (!sk)
return -ENOMEM;
diff --git a/kernel/net/rxrpc/ar-ack.c b/kernel/net/rxrpc/ar-ack.c
index e0547f521..adc555e03 100644
--- a/kernel/net/rxrpc/ar-ack.c
+++ b/kernel/net/rxrpc/ar-ack.c
@@ -723,8 +723,10 @@ process_further:
if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY ||
call->state == RXRPC_CALL_SERVER_AWAIT_ACK) &&
- hard > tx)
+ hard > tx) {
+ call->acks_hard = tx;
goto all_acked;
+ }
smp_rmb();
rxrpc_rotate_tx_window(call, hard - 1);
diff --git a/kernel/net/rxrpc/ar-connection.c b/kernel/net/rxrpc/ar-connection.c
index 6631f4f1e..6c71ed1ca 100644
--- a/kernel/net/rxrpc/ar-connection.c
+++ b/kernel/net/rxrpc/ar-connection.c
@@ -500,7 +500,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
if (bundle->num_conns >= 20) {
_debug("too many conns");
- if (!(gfp & __GFP_WAIT)) {
+ if (!gfpflags_allow_blocking(gfp)) {
_leave(" = -EAGAIN");
return -EAGAIN;
}
@@ -808,7 +808,7 @@ void rxrpc_put_connection(struct rxrpc_connection *conn)
ASSERTCMP(atomic_read(&conn->usage), >, 0);
- conn->put_time = get_seconds();
+ conn->put_time = ktime_get_seconds();
if (atomic_dec_and_test(&conn->usage)) {
_debug("zombie");
rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
@@ -852,7 +852,7 @@ static void rxrpc_connection_reaper(struct work_struct *work)
_enter("");
- now = get_seconds();
+ now = ktime_get_seconds();
earliest = ULONG_MAX;
write_lock_bh(&rxrpc_connection_lock);
diff --git a/kernel/net/rxrpc/ar-internal.h b/kernel/net/rxrpc/ar-internal.h
index aef1bd294..2934a73a5 100644
--- a/kernel/net/rxrpc/ar-internal.h
+++ b/kernel/net/rxrpc/ar-internal.h
@@ -208,7 +208,7 @@ struct rxrpc_transport {
struct rb_root server_conns; /* server connections on this transport */
struct list_head link; /* link in master session list */
struct sk_buff_head error_queue; /* error packets awaiting processing */
- time_t put_time; /* time at which to reap */
+ unsigned long put_time; /* time at which to reap */
spinlock_t client_lock; /* client connection allocation lock */
rwlock_t conn_lock; /* lock for active/dead connections */
atomic_t usage;
@@ -256,7 +256,7 @@ struct rxrpc_connection {
struct rxrpc_crypt csum_iv; /* packet checksum base */
unsigned long events;
#define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */
- time_t put_time; /* time at which to reap */
+ unsigned long put_time; /* time at which to reap */
rwlock_t lock; /* access lock */
spinlock_t state_lock; /* state-change lock */
atomic_t usage;
diff --git a/kernel/net/rxrpc/ar-key.c b/kernel/net/rxrpc/ar-key.c
index db0f39f5e..da3cc09f6 100644
--- a/kernel/net/rxrpc/ar-key.c
+++ b/kernel/net/rxrpc/ar-key.c
@@ -148,10 +148,10 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep,
token->kad->ticket[6], token->kad->ticket[7]);
/* count the number of tokens attached */
- prep->type_data[0] = (void *)((unsigned long)prep->type_data[0] + 1);
+ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1);
/* attach the data */
- for (pptoken = (struct rxrpc_key_token **)&prep->payload[0];
+ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0];
*pptoken;
pptoken = &(*pptoken)->next)
continue;
@@ -522,7 +522,7 @@ static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep,
goto inval;
/* attach the payload */
- for (pptoken = (struct rxrpc_key_token **)&prep->payload[0];
+ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0];
*pptoken;
pptoken = &(*pptoken)->next)
continue;
@@ -764,10 +764,10 @@ static int rxrpc_preparse(struct key_preparsed_payload *prep)
memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length);
/* count the number of tokens attached */
- prep->type_data[0] = (void *)((unsigned long)prep->type_data[0] + 1);
+ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1);
/* attach the data */
- pp = (struct rxrpc_key_token **)&prep->payload[0];
+ pp = (struct rxrpc_key_token **)&prep->payload.data[0];
while (*pp)
pp = &(*pp)->next;
*pp = token;
@@ -814,7 +814,7 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token)
*/
static void rxrpc_free_preparse(struct key_preparsed_payload *prep)
{
- rxrpc_free_token_list(prep->payload[0]);
+ rxrpc_free_token_list(prep->payload.data[0]);
}
/*
@@ -831,7 +831,7 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
if (prep->datalen != 8)
return -EINVAL;
- memcpy(&prep->type_data, prep->data, 8);
+ memcpy(&prep->payload.data[2], prep->data, 8);
ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(ci)) {
@@ -842,7 +842,7 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
if (crypto_blkcipher_setkey(ci, prep->data, 8) < 0)
BUG();
- prep->payload[0] = ci;
+ prep->payload.data[0] = ci;
_leave(" = 0");
return 0;
}
@@ -852,8 +852,8 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
*/
static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep)
{
- if (prep->payload[0])
- crypto_free_blkcipher(prep->payload[0]);
+ if (prep->payload.data[0])
+ crypto_free_blkcipher(prep->payload.data[0]);
}
/*
@@ -861,7 +861,7 @@ static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep)
*/
static void rxrpc_destroy(struct key *key)
{
- rxrpc_free_token_list(key->payload.data);
+ rxrpc_free_token_list(key->payload.data[0]);
}
/*
@@ -869,9 +869,9 @@ static void rxrpc_destroy(struct key *key)
*/
static void rxrpc_destroy_s(struct key *key)
{
- if (key->payload.data) {
- crypto_free_blkcipher(key->payload.data);
- key->payload.data = NULL;
+ if (key->payload.data[0]) {
+ crypto_free_blkcipher(key->payload.data[0]);
+ key->payload.data[0] = NULL;
}
}
@@ -1070,7 +1070,7 @@ static long rxrpc_read(const struct key *key,
size += 1 * 4; /* token count */
ntoks = 0;
- for (token = key->payload.data; token; token = token->next) {
+ for (token = key->payload.data[0]; token; token = token->next) {
toksize = 4; /* sec index */
switch (token->security_index) {
@@ -1163,7 +1163,7 @@ static long rxrpc_read(const struct key *key,
ENCODE(ntoks);
tok = 0;
- for (token = key->payload.data; token; token = token->next) {
+ for (token = key->payload.data[0]; token; token = token->next) {
toksize = toksizes[tok++];
ENCODE(toksize);
oldxdr = xdr;
diff --git a/kernel/net/rxrpc/ar-local.c b/kernel/net/rxrpc/ar-local.c
index ca904ed54..78483b460 100644
--- a/kernel/net/rxrpc/ar-local.c
+++ b/kernel/net/rxrpc/ar-local.c
@@ -73,8 +73,8 @@ static int rxrpc_create_local(struct rxrpc_local *local)
_enter("%p{%d}", local, local->srx.transport_type);
/* create a socket to represent the local endpoint */
- ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP,
- &local->socket);
+ ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type,
+ IPPROTO_UDP, &local->socket);
if (ret < 0) {
_leave(" = %d [socket]", ret);
return ret;
diff --git a/kernel/net/rxrpc/ar-output.c b/kernel/net/rxrpc/ar-output.c
index c0042807b..14c4e12c4 100644
--- a/kernel/net/rxrpc/ar-output.c
+++ b/kernel/net/rxrpc/ar-output.c
@@ -158,7 +158,7 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans,
service_id = htons(srx->srx_service);
}
key = rx->key;
- if (key && !rx->key->payload.data)
+ if (key && !rx->key->payload.data[0])
key = NULL;
bundle = rxrpc_get_bundle(rx, trans, key, service_id,
GFP_KERNEL);
@@ -531,7 +531,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
/* this should be in poll */
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
return -EPIPE;
diff --git a/kernel/net/rxrpc/ar-security.c b/kernel/net/rxrpc/ar-security.c
index 49b3cc31e..8334474eb 100644
--- a/kernel/net/rxrpc/ar-security.c
+++ b/kernel/net/rxrpc/ar-security.c
@@ -137,9 +137,9 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
if (ret < 0)
return ret;
- if (!key->payload.data)
+ token = key->payload.data[0];
+ if (!token)
return -EKEYREJECTED;
- token = key->payload.data;
sec = rxrpc_security_lookup(token->security_index);
if (!sec)
diff --git a/kernel/net/rxrpc/ar-transport.c b/kernel/net/rxrpc/ar-transport.c
index 1976dec84..9946467f1 100644
--- a/kernel/net/rxrpc/ar-transport.c
+++ b/kernel/net/rxrpc/ar-transport.c
@@ -189,7 +189,7 @@ void rxrpc_put_transport(struct rxrpc_transport *trans)
ASSERTCMP(atomic_read(&trans->usage), >, 0);
- trans->put_time = get_seconds();
+ trans->put_time = ktime_get_seconds();
if (unlikely(atomic_dec_and_test(&trans->usage))) {
_debug("zombie");
/* let the reaper determine the timeout to avoid a race with
@@ -226,7 +226,7 @@ static void rxrpc_transport_reaper(struct work_struct *work)
_enter("");
- now = get_seconds();
+ now = ktime_get_seconds();
earliest = ULONG_MAX;
/* extract all the transports that have been dead too long */
diff --git a/kernel/net/rxrpc/rxkad.c b/kernel/net/rxrpc/rxkad.c
index f226709eb..d7a9ab5a9 100644
--- a/kernel/net/rxrpc/rxkad.c
+++ b/kernel/net/rxrpc/rxkad.c
@@ -67,7 +67,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
_enter("{%d},{%x}", conn->debug_id, key_serial(conn->key));
- token = conn->key->payload.data;
+ token = conn->key->payload.data[0];
conn->security_ix = token->security_index;
ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
@@ -125,7 +125,7 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
if (!conn->key)
return;
- token = conn->key->payload.data;
+ token = conn->key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
desc.tfm = conn->cipher;
@@ -221,7 +221,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
rxkhdr.checksum = 0;
/* encrypt from the session key */
- token = call->conn->key->payload.data;
+ token = call->conn->key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
desc.tfm = call->conn->cipher;
desc.info = iv.x;
@@ -433,7 +433,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
skb_to_sgvec(skb, sg, 0, skb->len);
/* decrypt from the session key */
- token = call->conn->key->payload.data;
+ token = call->conn->key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
desc.tfm = call->conn->cipher;
desc.info = iv.x;
@@ -780,7 +780,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
if (conn->security_level < min_level)
goto protocol_error;
- token = conn->key->payload.data;
+ token = conn->key->payload.data[0];
/* build the response packet */
memset(&resp, 0, sizeof(resp));
@@ -848,12 +848,12 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
}
}
- ASSERT(conn->server_key->payload.data != NULL);
+ ASSERT(conn->server_key->payload.data[0] != NULL);
ASSERTCMP((unsigned long) ticket & 7UL, ==, 0);
- memcpy(&iv, &conn->server_key->type_data, sizeof(iv));
+ memcpy(&iv, &conn->server_key->payload.data[2], sizeof(iv));
- desc.tfm = conn->server_key->payload.data;
+ desc.tfm = conn->server_key->payload.data[0];
desc.info = iv.x;
desc.flags = 0;
diff --git a/kernel/net/sched/Kconfig b/kernel/net/sched/Kconfig
index 2274e723a..daa33432b 100644
--- a/kernel/net/sched/Kconfig
+++ b/kernel/net/sched/Kconfig
@@ -312,6 +312,7 @@ config NET_SCH_PIE
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
+ select NET_INGRESS
---help---
Say Y here if you want to use classifiers for incoming packets.
If unsure, say Y.
@@ -477,6 +478,16 @@ config NET_CLS_BPF
To compile this code as a module, choose M here: the module will
be called cls_bpf.
+config NET_CLS_FLOWER
+ tristate "Flower classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ a configurable combination of packet keys and masks.
+
+ To compile this code as a module, choose M here: the module will
+ be called cls_flower.
+
config NET_EMATCH
bool "Extended Matches"
select NET_CLS
diff --git a/kernel/net/sched/Makefile b/kernel/net/sched/Makefile
index 7ca7f4c1b..690c1689e 100644
--- a/kernel/net/sched/Makefile
+++ b/kernel/net/sched/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o
+obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
diff --git a/kernel/net/sched/act_api.c b/kernel/net/sched/act_api.c
index f8d9c2a2c..06e7c4a37 100644
--- a/kernel/net/sched/act_api.c
+++ b/kernel/net/sched/act_api.c
@@ -27,7 +27,16 @@
#include <net/act_api.h>
#include <net/netlink.h>
-void tcf_hash_destroy(struct tc_action *a)
+static void free_tcf(struct rcu_head *head)
+{
+ struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu);
+
+ free_percpu(p->cpu_bstats);
+ free_percpu(p->cpu_qstats);
+ kfree(p);
+}
+
+static void tcf_hash_destroy(struct tc_action *a)
{
struct tcf_common *p = a->priv;
struct tcf_hashinfo *hinfo = a->ops->hinfo;
@@ -41,9 +50,8 @@ void tcf_hash_destroy(struct tc_action *a)
* gen_estimator est_timer() might access p->tcfc_lock
* or bstats, wait a RCU grace period before freeing p
*/
- kfree_rcu(p, tcfc_rcu);
+ call_rcu(&p->tcfc_rcu, free_tcf);
}
-EXPORT_SYMBOL(tcf_hash_destroy);
int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
{
@@ -231,15 +239,16 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
if (est)
gen_kill_estimator(&pc->tcfc_bstats,
&pc->tcfc_rate_est);
- kfree_rcu(pc, tcfc_rcu);
+ call_rcu(&pc->tcfc_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
- int size, int bind)
+ int size, int bind, bool cpustats)
{
struct tcf_hashinfo *hinfo = a->ops->hinfo;
struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+ int err = -ENOMEM;
if (unlikely(!p))
return -ENOMEM;
@@ -247,18 +256,32 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
if (bind)
p->tcfc_bindcnt = 1;
+ if (cpustats) {
+ p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!p->cpu_bstats) {
+err1:
+ kfree(p);
+ return err;
+ }
+ p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+ if (!p->cpu_qstats) {
+err2:
+ free_percpu(p->cpu_bstats);
+ goto err1;
+ }
+ }
spin_lock_init(&p->tcfc_lock);
INIT_HLIST_NODE(&p->tcfc_head);
p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
p->tcfc_tm.install = jiffies;
p->tcfc_tm.lastuse = jiffies;
if (est) {
- int err = gen_new_estimator(&p->tcfc_bstats, NULL,
- &p->tcfc_rate_est,
- &p->tcfc_lock, est);
+ err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
+ &p->tcfc_rate_est,
+ &p->tcfc_lock, est);
if (err) {
- kfree(p);
- return err;
+ free_percpu(p->cpu_qstats);
+ goto err2;
}
}
@@ -393,11 +416,6 @@ int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
list_for_each_entry(a, actions, list) {
repeat:
ret = a->ops->act(skb, a, res);
- if (TC_MUNGED & skb->tc_verd) {
- /* copied already, allow trampling */
- skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
- skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
- }
if (ret == TC_ACT_REPEAT)
goto repeat; /* we need a ttl - JHS */
if (ret != TC_ACT_PIPE)
@@ -621,10 +639,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
if (err < 0)
goto errout;
- if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 ||
+ if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
&p->tcfc_rate_est) < 0 ||
- gnet_stats_copy_queue(&d, NULL,
+ gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfc_qstats,
p->tcfc_qstats.qlen) < 0)
goto errout;
diff --git a/kernel/net/sched/act_bpf.c b/kernel/net/sched/act_bpf.c
index 521ffca91..0bc6f912f 100644
--- a/kernel/net/sched/act_bpf.c
+++ b/kernel/net/sched/act_bpf.c
@@ -37,19 +37,25 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
struct tcf_result *res)
{
struct tcf_bpf *prog = act->priv;
+ struct bpf_prog *filter;
int action, filter_res;
+ bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
if (unlikely(!skb_mac_header_was_set(skb)))
return TC_ACT_UNSPEC;
- spin_lock(&prog->tcf_lock);
-
- prog->tcf_tm.lastuse = jiffies;
- bstats_update(&prog->tcf_bstats, skb);
+ tcf_lastuse_update(&prog->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
- /* Needed here for accessing maps. */
rcu_read_lock();
- filter_res = BPF_PROG_RUN(prog->filter, skb);
+ filter = rcu_dereference(prog->filter);
+ if (at_ingress) {
+ __skb_push(skb, skb->mac_len);
+ filter_res = BPF_PROG_RUN(filter, skb);
+ __skb_pull(skb, skb->mac_len);
+ } else {
+ filter_res = BPF_PROG_RUN(filter, skb);
+ }
rcu_read_unlock();
/* A BPF program may overwrite the default action opcode.
@@ -66,11 +72,12 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
case TC_ACT_PIPE:
case TC_ACT_RECLASSIFY:
case TC_ACT_OK:
+ case TC_ACT_REDIRECT:
action = filter_res;
break;
case TC_ACT_SHOT:
action = filter_res;
- prog->tcf_qstats.drops++;
+ qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats));
break;
case TC_ACT_UNSPEC:
action = prog->tcf_action;
@@ -80,7 +87,6 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
break;
}
- spin_unlock(&prog->tcf_lock);
return action;
}
@@ -256,7 +262,10 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
struct tcf_bpf_cfg *cfg)
{
cfg->is_ebpf = tcf_bpf_is_ebpf(prog);
- cfg->filter = prog->filter;
+ /* updates to prog->filter are prevented, since it's called either
+ * with rtnl lock or during final cleanup in rcu callback
+ */
+ cfg->filter = rcu_dereference_protected(prog->filter, 1);
cfg->bpf_ops = prog->bpf_ops;
cfg->bpf_name = prog->bpf_name;
@@ -271,7 +280,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct tc_act_bpf *parm;
struct tcf_bpf *prog;
bool is_bpf, is_ebpf;
- int ret;
+ int ret, res = 0;
if (!nla)
return -EINVAL;
@@ -280,45 +289,47 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (ret < 0)
return ret;
- is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
- is_ebpf = tb[TCA_ACT_BPF_FD];
-
- if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) ||
- !tb[TCA_ACT_BPF_PARMS])
+ if (!tb[TCA_ACT_BPF_PARMS])
return -EINVAL;
parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
- memset(&cfg, 0, sizeof(cfg));
-
- ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
- tcf_bpf_init_from_efd(tb, &cfg);
- if (ret < 0)
- return ret;
-
if (!tcf_hash_check(parm->index, act, bind)) {
ret = tcf_hash_create(parm->index, est, act,
- sizeof(*prog), bind);
+ sizeof(*prog), bind, true);
if (ret < 0)
- goto destroy_fp;
+ return ret;
- ret = ACT_P_CREATED;
+ res = ACT_P_CREATED;
} else {
/* Don't override defaults. */
if (bind)
- goto destroy_fp;
+ return 0;
tcf_hash_release(act, bind);
- if (!replace) {
- ret = -EEXIST;
- goto destroy_fp;
- }
+ if (!replace)
+ return -EEXIST;
}
+ is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
+ is_ebpf = tb[TCA_ACT_BPF_FD];
+
+ if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ memset(&cfg, 0, sizeof(cfg));
+
+ ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
+ tcf_bpf_init_from_efd(tb, &cfg);
+ if (ret < 0)
+ goto out;
+
prog = to_bpf(act);
- spin_lock_bh(&prog->tcf_lock);
+ ASSERT_RTNL();
- if (ret != ACT_P_CREATED)
+ if (res != ACT_P_CREATED)
tcf_bpf_prog_fill_cfg(prog, &old);
prog->bpf_ops = cfg.bpf_ops;
@@ -330,19 +341,21 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
prog->bpf_fd = cfg.bpf_fd;
prog->tcf_action = parm->action;
- prog->filter = cfg.filter;
-
- spin_unlock_bh(&prog->tcf_lock);
+ rcu_assign_pointer(prog->filter, cfg.filter);
- if (ret == ACT_P_CREATED)
+ if (res == ACT_P_CREATED) {
tcf_hash_insert(act);
- else
+ } else {
+ /* make sure the program being replaced is no longer executing */
+ synchronize_rcu();
tcf_bpf_cfg_cleanup(&old);
+ }
- return ret;
+ return res;
+out:
+ if (res == ACT_P_CREATED)
+ tcf_hash_cleanup(act, est);
-destroy_fp:
- tcf_bpf_cfg_cleanup(&cfg);
return ret;
}
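act_bpf now treats prog->filter as an RCU-managed pointer: the datapath dereferences it under rcu_read_lock(), while the RTNL-serialized control path publishes a replacement with rcu_assign_pointer() and waits for synchronize_rcu() before tearing down the old program. A simplified kernel-style sketch of that replace-then-synchronize pattern, with made-up names (not the act_bpf code):

```c
/* Readers run locklessly under RCU; the writer is serialized by RTNL and
 * frees the old object only after a grace period.
 */
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>

struct example_cfg {
	int value;
};

static struct example_cfg __rcu *active_cfg;

static int example_read(void)
{
	struct example_cfg *cfg;
	int v = -1;

	rcu_read_lock();
	cfg = rcu_dereference(active_cfg);
	if (cfg)
		v = cfg->value;
	rcu_read_unlock();
	return v;
}

static int example_replace(int value)
{
	struct example_cfg *new_cfg, *old_cfg;

	new_cfg = kmalloc(sizeof(*new_cfg), GFP_KERNEL);
	if (!new_cfg)
		return -ENOMEM;
	new_cfg->value = value;

	ASSERT_RTNL();			/* writer side relies on RTNL, as act_bpf does */
	old_cfg = rtnl_dereference(active_cfg);
	rcu_assign_pointer(active_cfg, new_cfg);

	synchronize_rcu();		/* no reader can still see old_cfg after this */
	kfree(old_cfg);
	return 0;
}
```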
diff --git a/kernel/net/sched/act_connmark.c b/kernel/net/sched/act_connmark.c
index 295d14bd6..bb41699c6 100644
--- a/kernel/net/sched/act_connmark.c
+++ b/kernel/net/sched/act_connmark.c
@@ -37,6 +37,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
struct nf_conntrack_tuple tuple;
enum ip_conntrack_info ctinfo;
struct tcf_connmark_info *ca = a->priv;
+ struct nf_conntrack_zone zone;
struct nf_conn *c;
int proto;
@@ -67,10 +68,13 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
}
if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- proto, &tuple))
+ proto, ca->net, &tuple))
goto out;
- thash = nf_conntrack_find_get(dev_net(skb->dev), ca->zone, &tuple);
+ zone.id = ca->zone;
+ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+ thash = nf_conntrack_find_get(ca->net, &zone, &tuple);
if (!thash)
goto out;
@@ -108,12 +112,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*ci),
+ bind, false);
if (ret)
return ret;
ci = to_connmark(a);
ci->tcf_action = parm->action;
+ ci->net = net;
ci->zone = parm->zone;
tcf_hash_insert(a);
diff --git a/kernel/net/sched/act_csum.c b/kernel/net/sched/act_csum.c
index 4cd5cf1ae..b07c535ba 100644
--- a/kernel/net/sched/act_csum.c
+++ b/kernel/net/sched/act_csum.c
@@ -62,7 +62,8 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
parm = nla_data(tb[TCA_CSUM_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+ bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
diff --git a/kernel/net/sched/act_gact.c b/kernel/net/sched/act_gact.c
index 7fffc2272..5c1b05170 100644
--- a/kernel/net/sched/act_gact.c
+++ b/kernel/net/sched/act_gact.c
@@ -28,14 +28,18 @@
#ifdef CONFIG_GACT_PROB
static int gact_net_rand(struct tcf_gact *gact)
{
- if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)
+ smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
+ if (prandom_u32() % gact->tcfg_pval)
return gact->tcf_action;
return gact->tcfg_paction;
}
static int gact_determ(struct tcf_gact *gact)
{
- if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval)
+ u32 pack = atomic_inc_return(&gact->packets);
+
+ smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
+ if (pack % gact->tcfg_pval)
return gact->tcf_action;
return gact->tcfg_paction;
}
@@ -85,7 +89,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
#endif
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*gact),
+ bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -99,16 +104,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
gact = to_gact(a);
- spin_lock_bh(&gact->tcf_lock);
+ ASSERT_RTNL();
gact->tcf_action = parm->action;
#ifdef CONFIG_GACT_PROB
if (p_parm) {
gact->tcfg_paction = p_parm->paction;
- gact->tcfg_pval = p_parm->pval;
+ gact->tcfg_pval = max_t(u16, 1, p_parm->pval);
+ /* Make sure tcfg_pval is written before tcfg_ptype
+ * coupled with smp_rmb() in gact_net_rand() & gact_determ()
+ */
+ smp_wmb();
gact->tcfg_ptype = p_parm->ptype;
}
#endif
- spin_unlock_bh(&gact->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
@@ -118,23 +126,21 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_gact *gact = a->priv;
- int action = TC_ACT_SHOT;
+ int action = READ_ONCE(gact->tcf_action);
- spin_lock(&gact->tcf_lock);
#ifdef CONFIG_GACT_PROB
- if (gact->tcfg_ptype)
- action = gact_rand[gact->tcfg_ptype](gact);
- else
- action = gact->tcf_action;
-#else
- action = gact->tcf_action;
+ {
+ u32 ptype = READ_ONCE(gact->tcfg_ptype);
+
+ if (ptype)
+ action = gact_rand[ptype](gact);
+ }
#endif
- gact->tcf_bstats.bytes += qdisc_pkt_len(skb);
- gact->tcf_bstats.packets++;
+ bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
if (action == TC_ACT_SHOT)
- gact->tcf_qstats.drops++;
- gact->tcf_tm.lastuse = jiffies;
- spin_unlock(&gact->tcf_lock);
+ qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+
+ tcf_lastuse_update(&gact->tcf_tm);
return action;
}
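The act_gact hunks drop the per-action spinlock: tcfg_pval (forced to at least 1) is written before tcfg_ptype under an smp_wmb(), paired with smp_rmb() on the readers, so any reader that observes a non-zero ptype also observes a valid pval. Roughly the same publish-after-init ordering expressed with C11 acquire/release atomics (user-space, illustrative names; build with -pthread):

```c
/* The release store on the flag pairs with the acquire load, so a reader
 * that observes the flag also observes the value written before it.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static unsigned int pval;	/* plain data, published via the flag below */
static atomic_uint ptype;	/* 0 = not published, 1 = published */

static void *writer(void *arg)
{
	pval = 42;						/* write the data first ... */
	atomic_store_explicit(&ptype, 1, memory_order_release);	/* ... then publish */
	return NULL;
}

static void *reader(void *arg)
{
	if (atomic_load_explicit(&ptype, memory_order_acquire))
		printf("pval = %u\n", pval);	/* if the flag is seen, pval is 42 */
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}
```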
diff --git a/kernel/net/sched/act_ipt.c b/kernel/net/sched/act_ipt.c
index cbc8dd7dd..d05869646 100644
--- a/kernel/net/sched/act_ipt.c
+++ b/kernel/net/sched/act_ipt.c
@@ -114,7 +114,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
index = nla_get_u32(tb[TCA_IPT_INDEX]);
if (!tcf_hash_check(index, a, bind) ) {
- ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind);
+ ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -189,6 +189,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
* worry later - danger - this API seems to have changed
* from earlier kernels
*/
+ par.net = dev_net(skb->dev);
par.in = skb->dev;
par.out = NULL;
par.hooknum = ipt->tcfi_hook;
diff --git a/kernel/net/sched/act_mirred.c b/kernel/net/sched/act_mirred.c
index 3f63ceac8..32fcdecdb 100644
--- a/kernel/net/sched/act_mirred.c
+++ b/kernel/net/sched/act_mirred.c
@@ -31,13 +31,19 @@
#define MIRRED_TAB_MASK 7
static LIST_HEAD(mirred_list);
+static DEFINE_SPINLOCK(mirred_list_lock);
static void tcf_mirred_release(struct tc_action *a, int bind)
{
struct tcf_mirred *m = to_mirred(a);
+ struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1);
+
+ /* We could be called either in an RCU callback or with RTNL lock held. */
+ spin_lock_bh(&mirred_list_lock);
list_del(&m->tcfm_list);
- if (m->tcfm_dev)
- dev_put(m->tcfm_dev);
+ spin_unlock_bh(&mirred_list_lock);
+ if (dev)
+ dev_put(dev);
}
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -93,32 +99,37 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(parm->index, a, bind)) {
if (dev == NULL)
return -EINVAL;
- ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*m),
+ bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
- if (!ovr) {
- tcf_hash_release(a, bind);
+ if (bind)
+ return 0;
+
+ tcf_hash_release(a, bind);
+ if (!ovr)
return -EEXIST;
- }
}
m = to_mirred(a);
- spin_lock_bh(&m->tcf_lock);
+ ASSERT_RTNL();
m->tcf_action = parm->action;
m->tcfm_eaction = parm->eaction;
if (dev != NULL) {
m->tcfm_ifindex = parm->ifindex;
if (ret != ACT_P_CREATED)
- dev_put(m->tcfm_dev);
+ dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
dev_hold(dev);
- m->tcfm_dev = dev;
+ rcu_assign_pointer(m->tcfm_dev, dev);
m->tcfm_ok_push = ok_push;
}
- spin_unlock_bh(&m->tcf_lock);
+
if (ret == ACT_P_CREATED) {
+ spin_lock_bh(&mirred_list_lock);
list_add(&m->tcfm_list, &mirred_list);
+ spin_unlock_bh(&mirred_list_lock);
tcf_hash_insert(a);
}
@@ -131,28 +142,30 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
struct tcf_mirred *m = a->priv;
struct net_device *dev;
struct sk_buff *skb2;
+ int retval, err;
u32 at;
- int retval, err = 1;
- spin_lock(&m->tcf_lock);
- m->tcf_tm.lastuse = jiffies;
- bstats_update(&m->tcf_bstats, skb);
+ tcf_lastuse_update(&m->tcf_tm);
+
+ bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
- dev = m->tcfm_dev;
- if (!dev) {
- printk_once(KERN_NOTICE "tc mirred: target device is gone\n");
+ rcu_read_lock();
+ retval = READ_ONCE(m->tcf_action);
+ dev = rcu_dereference(m->tcfm_dev);
+ if (unlikely(!dev)) {
+ pr_notice_once("tc mirred: target device is gone\n");
goto out;
}
- if (!(dev->flags & IFF_UP)) {
+ if (unlikely(!(dev->flags & IFF_UP))) {
net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
dev->name);
goto out;
}
at = G_TC_AT(skb->tc_verd);
- skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action);
- if (skb2 == NULL)
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
goto out;
if (!(at & AT_EGRESS)) {
@@ -166,18 +179,16 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
skb2->skb_iif = skb->dev->ifindex;
skb2->dev = dev;
+ skb_sender_cpu_clear(skb2);
err = dev_queue_xmit(skb2);
-out:
if (err) {
- m->tcf_qstats.overlimits++;
+out:
+ qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
retval = TC_ACT_SHOT;
- else
- retval = m->tcf_action;
- } else
- retval = m->tcf_action;
- spin_unlock(&m->tcf_lock);
+ }
+ rcu_read_unlock();
return retval;
}
@@ -216,15 +227,20 @@ static int mirred_device_event(struct notifier_block *unused,
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tcf_mirred *m;
- if (event == NETDEV_UNREGISTER)
+ ASSERT_RTNL();
+ if (event == NETDEV_UNREGISTER) {
+ spin_lock_bh(&mirred_list_lock);
list_for_each_entry(m, &mirred_list, tcfm_list) {
- spin_lock_bh(&m->tcf_lock);
- if (m->tcfm_dev == dev) {
+ if (rcu_access_pointer(m->tcfm_dev) == dev) {
dev_put(dev);
- m->tcfm_dev = NULL;
+ /* Note : no rcu grace period necessary, as
+ * net_device objects are already rcu protected.
+ */
+ RCU_INIT_POINTER(m->tcfm_dev, NULL);
}
- spin_unlock_bh(&m->tcf_lock);
}
+ spin_unlock_bh(&mirred_list_lock);
+ }
return NOTIFY_DONE;
}
diff --git a/kernel/net/sched/act_nat.c b/kernel/net/sched/act_nat.c
index 270a030d5..b7c4ead8b 100644
--- a/kernel/net/sched/act_nat.c
+++ b/kernel/net/sched/act_nat.c
@@ -55,7 +55,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
parm = nla_data(tb[TCA_NAT_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+ bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -161,7 +162,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
goto drop;
tcph = (void *)(skb_network_header(skb) + ihl);
- inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1);
+ inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr,
+ true);
break;
}
case IPPROTO_UDP:
@@ -177,7 +179,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
udph = (void *)(skb_network_header(skb) + ihl);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace4(&udph->check, skb, addr,
- new_addr, 1);
+ new_addr, true);
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
@@ -230,7 +232,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
iph->saddr = new_addr;
inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
- 0);
+ false);
break;
}
default:
diff --git a/kernel/net/sched/act_pedit.c b/kernel/net/sched/act_pedit.c
index 59649d588..e38a7701f 100644
--- a/kernel/net/sched/act_pedit.c
+++ b/kernel/net/sched/act_pedit.c
@@ -57,7 +57,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(parm->index, a, bind)) {
if (!parm->nkeys)
return -EINVAL;
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+ bind, false);
if (ret)
return ret;
p = to_pedit(a);
@@ -68,13 +69,12 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
}
ret = ACT_P_CREATED;
} else {
- p = to_pedit(a);
- tcf_hash_release(a, bind);
if (bind)
return 0;
+ tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
-
+ p = to_pedit(a);
if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL)
@@ -108,7 +108,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_pedit *p = a->priv;
- int i, munged = 0;
+ int i;
unsigned int off;
if (skb_unclone(skb, GFP_ATOMIC))
@@ -156,11 +156,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
*ptr = ((*ptr & tkey->mask) ^ tkey->val);
if (ptr == &_data)
skb_store_bits(skb, off + offset, ptr, 4);
- munged++;
}
- if (munged)
- skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
goto done;
} else
WARN(1, "pedit BUG: index %d\n", p->tcf_index);
diff --git a/kernel/net/sched/act_simple.c b/kernel/net/sched/act_simple.c
index 6a8d94886..d6b708d6a 100644
--- a/kernel/net/sched/act_simple.c
+++ b/kernel/net/sched/act_simple.c
@@ -103,7 +103,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
defdata = nla_data(tb[TCA_DEF_DATA]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
+ bind, false);
if (ret)
return ret;
diff --git a/kernel/net/sched/act_skbedit.c b/kernel/net/sched/act_skbedit.c
index fcfeeaf83..6751b5f8c 100644
--- a/kernel/net/sched/act_skbedit.c
+++ b/kernel/net/sched/act_skbedit.c
@@ -99,7 +99,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
+ bind, false);
if (ret)
return ret;
diff --git a/kernel/net/sched/act_vlan.c b/kernel/net/sched/act_vlan.c
index d735ecf0b..796785e0b 100644
--- a/kernel/net/sched/act_vlan.c
+++ b/kernel/net/sched/act_vlan.c
@@ -116,7 +116,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
action = parm->v_action;
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*v),
+ bind, false);
if (ret)
return ret;
diff --git a/kernel/net/sched/cls_bpf.c b/kernel/net/sched/cls_bpf.c
index c0b86f2bf..5faaa5425 100644
--- a/kernel/net/sched/cls_bpf.c
+++ b/kernel/net/sched/cls_bpf.c
@@ -38,6 +38,7 @@ struct cls_bpf_prog {
struct bpf_prog *filter;
struct list_head link;
struct tcf_result res;
+ bool exts_integrated;
struct tcf_exts exts;
u32 handle;
union {
@@ -52,6 +53,7 @@ struct cls_bpf_prog {
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
[TCA_BPF_CLASSID] = { .type = NLA_U32 },
+ [TCA_BPF_FLAGS] = { .type = NLA_U32 },
[TCA_BPF_FD] = { .type = NLA_U32 },
[TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN },
[TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
@@ -59,11 +61,30 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
.len = sizeof(struct sock_filter) * BPF_MAXINSNS },
};
+static int cls_bpf_exec_opcode(int code)
+{
+ switch (code) {
+ case TC_ACT_OK:
+ case TC_ACT_SHOT:
+ case TC_ACT_STOLEN:
+ case TC_ACT_REDIRECT:
+ case TC_ACT_UNSPEC:
+ return code;
+ default:
+ return TC_ACT_UNSPEC;
+ }
+}
+
static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
struct cls_bpf_prog *prog;
+#ifdef CONFIG_NET_CLS_ACT
+ bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
+#else
+ bool at_ingress = false;
+#endif
int ret = -1;
if (unlikely(!skb_mac_header_was_set(skb)))
@@ -72,7 +93,28 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
/* Needed here for accessing maps. */
rcu_read_lock();
list_for_each_entry_rcu(prog, &head->plist, link) {
- int filter_res = BPF_PROG_RUN(prog->filter, skb);
+ int filter_res;
+
+ qdisc_skb_cb(skb)->tc_classid = prog->res.classid;
+
+ if (at_ingress) {
+ /* It is safe to push/pull even if skb_shared() */
+ __skb_push(skb, skb->mac_len);
+ filter_res = BPF_PROG_RUN(prog->filter, skb);
+ __skb_pull(skb, skb->mac_len);
+ } else {
+ filter_res = BPF_PROG_RUN(prog->filter, skb);
+ }
+
+ if (prog->exts_integrated) {
+ res->class = prog->res.class;
+ res->classid = qdisc_skb_cb(skb)->tc_classid;
+
+ ret = cls_bpf_exec_opcode(filter_res);
+ if (ret == TC_ACT_UNSPEC)
+ continue;
+ break;
+ }
if (filter_res == 0)
continue;
@@ -181,8 +223,7 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
return ret;
}
-static int cls_bpf_prog_from_ops(struct nlattr **tb,
- struct cls_bpf_prog *prog, u32 classid)
+static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
{
struct sock_filter *bpf_ops;
struct sock_fprog_kern fprog_tmp;
@@ -216,15 +257,13 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb,
prog->bpf_ops = bpf_ops;
prog->bpf_num_ops = bpf_num_ops;
prog->bpf_name = NULL;
-
prog->filter = fp;
- prog->res.classid = classid;
return 0;
}
-static int cls_bpf_prog_from_efd(struct nlattr **tb,
- struct cls_bpf_prog *prog, u32 classid)
+static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
+ const struct tcf_proto *tp)
{
struct bpf_prog *fp;
char *name = NULL;
@@ -254,9 +293,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb,
prog->bpf_ops = NULL;
prog->bpf_fd = bpf_fd;
prog->bpf_name = name;
-
prog->filter = fp;
- prog->res.classid = classid;
+
+ if (fp->dst_needed)
+ netif_keep_dst(qdisc_dev(tp->q));
return 0;
}
@@ -266,16 +306,13 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
unsigned long base, struct nlattr **tb,
struct nlattr *est, bool ovr)
{
+ bool is_bpf, is_ebpf, have_exts = false;
struct tcf_exts exts;
- bool is_bpf, is_ebpf;
- u32 classid;
int ret;
is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
is_ebpf = tb[TCA_BPF_FD];
-
- if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) ||
- !tb[TCA_BPF_CLASSID])
+ if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
return -EINVAL;
tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
@@ -283,18 +320,32 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
if (ret < 0)
return ret;
- classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+ if (tb[TCA_BPF_FLAGS]) {
+ u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);
+
+ if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) {
+ tcf_exts_destroy(&exts);
+ return -EINVAL;
+ }
+
+ have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
+ }
+
+ prog->exts_integrated = have_exts;
- ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) :
- cls_bpf_prog_from_efd(tb, prog, classid);
+ ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
+ cls_bpf_prog_from_efd(tb, prog, tp);
if (ret < 0) {
tcf_exts_destroy(&exts);
return ret;
}
- tcf_bind_filter(tp, &prog->res, base);
- tcf_exts_change(tp, &prog->exts, &exts);
+ if (tb[TCA_BPF_CLASSID]) {
+ prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+ tcf_bind_filter(tp, &prog->res, base);
+ }
+ tcf_exts_change(tp, &prog->exts, &exts);
return 0;
}
@@ -415,6 +466,7 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
{
struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
struct nlattr *nest;
+ u32 bpf_flags = 0;
int ret;
if (prog == NULL)
@@ -426,7 +478,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
+ if (prog->res.classid &&
+ nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
goto nla_put_failure;
if (cls_bpf_is_ebpf(prog))
@@ -439,6 +492,11 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
if (tcf_exts_dump(skb, &prog->exts) < 0)
goto nla_put_failure;
+ if (prog->exts_integrated)
+ bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
+ if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
+ goto nla_put_failure;
+
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
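The cls_bpf hunks above introduce a direct-action mode (TCA_BPF_FLAG_ACT_DIRECT): when prog->exts_integrated is set, the BPF program's return value is taken as the TC verdict itself and the classid is read back from qdisc_skb_cb(skb)->tc_classid, which is why TCA_BPF_CLASSID becomes optional in the dump path. A minimal userspace sketch of the verdict filtering done by cls_bpf_exec_opcode(); the numeric TC_ACT_* values are copied for illustration and assumed to match include/uapi/linux/pkt_cls.h:

/* Sketch: only a whitelist of TC_ACT_* codes is honoured in
 * direct-action mode; anything else degrades to TC_ACT_UNSPEC so the
 * next classifier in the chain is tried. */
#include <stdio.h>

enum {
	TC_ACT_UNSPEC	= -1,	/* assumed values, for illustration only */
	TC_ACT_OK	= 0,
	TC_ACT_SHOT	= 2,
	TC_ACT_STOLEN	= 4,
	TC_ACT_REDIRECT	= 7,
};

static int exec_opcode(int code)
{
	switch (code) {
	case TC_ACT_OK:
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
	case TC_ACT_REDIRECT:
	case TC_ACT_UNSPEC:
		return code;
	default:		/* unknown opcode: keep iterating filters */
		return TC_ACT_UNSPEC;
	}
}

int main(void)
{
	printf("%d %d\n", exec_opcode(TC_ACT_SHOT), exec_opcode(42));
	return 0;
}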
diff --git a/kernel/net/sched/cls_cgroup.c b/kernel/net/sched/cls_cgroup.c
index ea611b216..4c85bd3a7 100644
--- a/kernel/net/sched/cls_cgroup.c
+++ b/kernel/net/sched/cls_cgroup.c
@@ -30,35 +30,16 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
- u32 classid;
-
- classid = task_cls_state(current)->classid;
-
- /*
- * Due to the nature of the classifier it is required to ignore all
- * packets originating from softirq context as accessing `current'
- * would lead to false results.
- *
- * This test assumes that all callers of dev_queue_xmit() explicitely
- * disable bh. Knowing this, it is possible to detect softirq based
- * calls by looking at the number of nested bh disable calls because
- * softirqs always disables bh.
- */
- if (in_serving_softirq()) {
- /* If there is an sk_classid we'll use that. */
- if (!skb->sk)
- return -1;
- classid = skb->sk->sk_classid;
- }
+ u32 classid = task_get_classid(skb);
if (!classid)
return -1;
-
if (!tcf_em_tree_match(skb, &head->ematches, NULL))
return -1;
res->classid = classid;
res->class = 0;
+
return tcf_exts_exec(skb, &head->exts, res);
}
diff --git a/kernel/net/sched/cls_flow.c b/kernel/net/sched/cls_flow.c
index 75df923f5..fbfec6a18 100644
--- a/kernel/net/sched/cls_flow.c
+++ b/kernel/net/sched/cls_flow.c
@@ -22,11 +22,12 @@
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <net/inet_sock.h>
#include <net/pkt_cls.h>
#include <net/ip.h>
#include <net/route.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netfilter/nf_conntrack.h>
@@ -68,35 +69,41 @@ static inline u32 addr_fold(void *addr)
static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
- if (flow->src)
- return ntohl(flow->src);
+ __be32 src = flow_get_u32_src(flow);
+
+ if (src)
+ return ntohl(src);
+
return addr_fold(skb->sk);
}
static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
- if (flow->dst)
- return ntohl(flow->dst);
+ __be32 dst = flow_get_u32_dst(flow);
+
+ if (dst)
+ return ntohl(dst);
+
return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
}
static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
{
- return flow->ip_proto;
+ return flow->basic.ip_proto;
}
static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
- if (flow->ports)
- return ntohs(flow->port16[0]);
+ if (flow->ports.ports)
+ return ntohs(flow->ports.src);
return addr_fold(skb->sk);
}
static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
- if (flow->ports)
- return ntohs(flow->port16[1]);
+ if (flow->ports.ports)
+ return ntohs(flow->ports.dst);
return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
}
@@ -191,8 +198,11 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb)
static u32 flow_get_skuid(const struct sk_buff *skb)
{
- if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
- kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid;
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (sk && sk->sk_socket && sk->sk_socket->file) {
+ kuid_t skuid = sk->sk_socket->file->f_cred->fsuid;
+
return from_kuid(&init_user_ns, skuid);
}
return 0;
@@ -200,8 +210,11 @@ static u32 flow_get_skuid(const struct sk_buff *skb)
static u32 flow_get_skgid(const struct sk_buff *skb)
{
- if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
- kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid;
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (sk && sk->sk_socket && sk->sk_socket->file) {
+ kgid_t skgid = sk->sk_socket->file->f_cred->fsgid;
+
return from_kgid(&init_user_ns, skgid);
}
return 0;
@@ -295,7 +308,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
keymask = f->keymask;
if (keymask & FLOW_KEYS_NEEDED)
- skb_flow_dissect(skb, &flow_keys);
+ skb_flow_dissect_flow_keys(skb, &flow_keys, 0);
for (n = 0; n < f->nkeys; n++) {
key = ffs(keymask) - 1;
diff --git a/kernel/net/sched/cls_flower.c b/kernel/net/sched/cls_flower.c
new file mode 100644
index 000000000..95b021243
--- /dev/null
+++ b/kernel/net/sched/cls_flower.c
@@ -0,0 +1,697 @@
+/*
+ * net/sched/cls_flower.c Flower classifier
+ *
+ * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/rhashtable.h>
+
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+
+#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
+#include <net/ip.h>
+#include <net/flow_dissector.h>
+
+struct fl_flow_key {
+ int indev_ifindex;
+ struct flow_dissector_key_control control;
+ struct flow_dissector_key_basic basic;
+ struct flow_dissector_key_eth_addrs eth;
+ struct flow_dissector_key_addrs ipaddrs;
+ union {
+ struct flow_dissector_key_ipv4_addrs ipv4;
+ struct flow_dissector_key_ipv6_addrs ipv6;
+ };
+ struct flow_dissector_key_ports tp;
+} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
+
+struct fl_flow_mask_range {
+ unsigned short int start;
+ unsigned short int end;
+};
+
+struct fl_flow_mask {
+ struct fl_flow_key key;
+ struct fl_flow_mask_range range;
+ struct rcu_head rcu;
+};
+
+struct cls_fl_head {
+ struct rhashtable ht;
+ struct fl_flow_mask mask;
+ struct flow_dissector dissector;
+ u32 hgen;
+ bool mask_assigned;
+ struct list_head filters;
+ struct rhashtable_params ht_params;
+ struct rcu_head rcu;
+};
+
+struct cls_fl_filter {
+ struct rhash_head ht_node;
+ struct fl_flow_key mkey;
+ struct tcf_exts exts;
+ struct tcf_result res;
+ struct fl_flow_key key;
+ struct list_head list;
+ u32 handle;
+ struct rcu_head rcu;
+};
+
+static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
+{
+ return mask->range.end - mask->range.start;
+}
+
+static void fl_mask_update_range(struct fl_flow_mask *mask)
+{
+ const u8 *bytes = (const u8 *) &mask->key;
+ size_t size = sizeof(mask->key);
+ size_t i, first = 0, last = size - 1;
+
+ for (i = 0; i < sizeof(mask->key); i++) {
+ if (bytes[i]) {
+ if (!first && i)
+ first = i;
+ last = i;
+ }
+ }
+ mask->range.start = rounddown(first, sizeof(long));
+ mask->range.end = roundup(last + 1, sizeof(long));
+}
+
+static void *fl_key_get_start(struct fl_flow_key *key,
+ const struct fl_flow_mask *mask)
+{
+ return (u8 *) key + mask->range.start;
+}
+
+static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key,
+ struct fl_flow_mask *mask)
+{
+ const long *lkey = fl_key_get_start(key, mask);
+ const long *lmask = fl_key_get_start(&mask->key, mask);
+ long *lmkey = fl_key_get_start(mkey, mask);
+ int i;
+
+ for (i = 0; i < fl_mask_range(mask); i += sizeof(long))
+ *lmkey++ = *lkey++ & *lmask++;
+}
+
+static void fl_clear_masked_range(struct fl_flow_key *key,
+ struct fl_flow_mask *mask)
+{
+ memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
+}
+
+static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_fl_head *head = rcu_dereference_bh(tp->root);
+ struct cls_fl_filter *f;
+ struct fl_flow_key skb_key;
+ struct fl_flow_key skb_mkey;
+
+ fl_clear_masked_range(&skb_key, &head->mask);
+ skb_key.indev_ifindex = skb->skb_iif;
+	/* skb_flow_dissect() does not set n_proto in case of an unknown
+	 * protocol, so set it here instead.
+	 */
+ skb_key.basic.n_proto = skb->protocol;
+ skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
+
+ fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
+
+ f = rhashtable_lookup_fast(&head->ht,
+ fl_key_get_start(&skb_mkey, &head->mask),
+ head->ht_params);
+ if (f) {
+ *res = f->res;
+ return tcf_exts_exec(skb, &f->exts, res);
+ }
+ return -1;
+}
+
+static int fl_init(struct tcf_proto *tp)
+{
+ struct cls_fl_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ return -ENOBUFS;
+
+ INIT_LIST_HEAD_RCU(&head->filters);
+ rcu_assign_pointer(tp->root, head);
+
+ return 0;
+}
+
+static void fl_destroy_filter(struct rcu_head *head)
+{
+ struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu);
+
+ tcf_exts_destroy(&f->exts);
+ kfree(f);
+}
+
+static bool fl_destroy(struct tcf_proto *tp, bool force)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_filter *f, *next;
+
+ if (!force && !list_empty(&head->filters))
+ return false;
+
+ list_for_each_entry_safe(f, next, &head->filters, list) {
+ list_del_rcu(&f->list);
+ call_rcu(&f->rcu, fl_destroy_filter);
+ }
+ RCU_INIT_POINTER(tp->root, NULL);
+ if (head->mask_assigned)
+ rhashtable_destroy(&head->ht);
+ kfree_rcu(head, rcu);
+ return true;
+}
+
+static unsigned long fl_get(struct tcf_proto *tp, u32 handle)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_filter *f;
+
+ list_for_each_entry(f, &head->filters, list)
+ if (f->handle == handle)
+ return (unsigned long) f;
+ return 0;
+}
+
+static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
+ [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC },
+ [TCA_FLOWER_CLASSID] = { .type = NLA_U32 },
+ [TCA_FLOWER_INDEV] = { .type = NLA_STRING,
+ .len = IFNAMSIZ },
+ [TCA_FLOWER_KEY_ETH_DST] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ETH_DST_MASK] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ETH_SRC] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ETH_SRC_MASK] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ETH_TYPE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_IP_PROTO] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_IPV4_SRC] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_IPV4_SRC_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_IPV4_DST] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_IPV4_DST_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_IPV6_DST] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_TCP_SRC] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 },
+};
+
+static void fl_set_key_val(struct nlattr **tb,
+ void *val, int val_type,
+ void *mask, int mask_type, int len)
+{
+ if (!tb[val_type])
+ return;
+ memcpy(val, nla_data(tb[val_type]), len);
+ if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type])
+ memset(mask, 0xff, len);
+ else
+ memcpy(mask, nla_data(tb[mask_type]), len);
+}
+
+static int fl_set_key(struct net *net, struct nlattr **tb,
+ struct fl_flow_key *key, struct fl_flow_key *mask)
+{
+#ifdef CONFIG_NET_CLS_IND
+ if (tb[TCA_FLOWER_INDEV]) {
+ int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]);
+ if (err < 0)
+ return err;
+ key->indev_ifindex = err;
+ mask->indev_ifindex = 0xffffffff;
+ }
+#endif
+
+ fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
+ mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
+ sizeof(key->eth.dst));
+ fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC,
+ mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
+ sizeof(key->eth.src));
+
+ fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE,
+ &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.n_proto));
+
+ if (key->basic.n_proto == htons(ETH_P_IP) ||
+ key->basic.n_proto == htons(ETH_P_IPV6)) {
+ fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
+ &mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.ip_proto));
+ }
+
+ if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
+ &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+ sizeof(key->ipv4.src));
+ fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST,
+ &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK,
+ sizeof(key->ipv4.dst));
+ } else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) {
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
+ &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+ sizeof(key->ipv6.src));
+ fl_set_key_val(tb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST,
+ &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK,
+ sizeof(key->ipv6.dst));
+ }
+
+ if (key->basic.ip_proto == IPPROTO_TCP) {
+ fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
+ &mask->tp.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp.src));
+ fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
+ &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp.dst));
+ } else if (key->basic.ip_proto == IPPROTO_UDP) {
+ fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
+ &mask->tp.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp.src));
+ fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
+ &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp.dst));
+ }
+
+ return 0;
+}
+
+static bool fl_mask_eq(struct fl_flow_mask *mask1,
+ struct fl_flow_mask *mask2)
+{
+ const long *lmask1 = fl_key_get_start(&mask1->key, mask1);
+ const long *lmask2 = fl_key_get_start(&mask2->key, mask2);
+
+ return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) &&
+ !memcmp(lmask1, lmask2, fl_mask_range(mask1));
+}
+
+static const struct rhashtable_params fl_ht_params = {
+ .key_offset = offsetof(struct cls_fl_filter, mkey), /* base offset */
+ .head_offset = offsetof(struct cls_fl_filter, ht_node),
+ .automatic_shrinking = true,
+};
+
+static int fl_init_hashtable(struct cls_fl_head *head,
+ struct fl_flow_mask *mask)
+{
+ head->ht_params = fl_ht_params;
+ head->ht_params.key_len = fl_mask_range(mask);
+ head->ht_params.key_offset += mask->range.start;
+
+ return rhashtable_init(&head->ht, &head->ht_params);
+}
+
+#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
+#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member))
+#define FL_KEY_MEMBER_END_OFFSET(member) \
+ (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member))
+
+#define FL_KEY_IN_RANGE(mask, member) \
+ (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \
+ FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start)
+
+#define FL_KEY_SET(keys, cnt, id, member) \
+ do { \
+ keys[cnt].key_id = id; \
+ keys[cnt].offset = FL_KEY_MEMBER_OFFSET(member); \
+ cnt++; \
+ } while(0);
+
+#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \
+ do { \
+ if (FL_KEY_IN_RANGE(mask, member)) \
+ FL_KEY_SET(keys, cnt, id, member); \
+ } while(0);
+
+static void fl_init_dissector(struct cls_fl_head *head,
+ struct fl_flow_mask *mask)
+{
+ struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
+ size_t cnt = 0;
+
+ FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
+ FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
+ FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
+ FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
+ FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
+ FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_PORTS, tp);
+
+ skb_flow_dissector_init(&head->dissector, keys, cnt);
+}
+
+static int fl_check_assign_mask(struct cls_fl_head *head,
+ struct fl_flow_mask *mask)
+{
+ int err;
+
+ if (head->mask_assigned) {
+ if (!fl_mask_eq(&head->mask, mask))
+ return -EINVAL;
+ else
+ return 0;
+ }
+
+ /* Mask is not assigned yet. So assign it and init hashtable
+ * according to that.
+ */
+ err = fl_init_hashtable(head, mask);
+ if (err)
+ return err;
+ memcpy(&head->mask, mask, sizeof(head->mask));
+ head->mask_assigned = true;
+
+ fl_init_dissector(head, mask);
+
+ return 0;
+}
+
+static int fl_set_parms(struct net *net, struct tcf_proto *tp,
+ struct cls_fl_filter *f, struct fl_flow_mask *mask,
+ unsigned long base, struct nlattr **tb,
+ struct nlattr *est, bool ovr)
+{
+ struct tcf_exts e;
+ int err;
+
+ tcf_exts_init(&e, TCA_FLOWER_ACT, 0);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_FLOWER_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+ err = fl_set_key(net, tb, &f->key, &mask->key);
+ if (err)
+ goto errout;
+
+ fl_mask_update_range(mask);
+ fl_set_masked_key(&f->mkey, &f->key, mask);
+
+ tcf_exts_change(tp, &f->exts, &e);
+
+ return 0;
+errout:
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static u32 fl_grab_new_handle(struct tcf_proto *tp,
+ struct cls_fl_head *head)
+{
+ unsigned int i = 0x80000000;
+ u32 handle;
+
+ do {
+ if (++head->hgen == 0x7FFFFFFF)
+ head->hgen = 1;
+ } while (--i > 0 && fl_get(tp, head->hgen));
+
+ if (unlikely(i == 0)) {
+ pr_err("Insufficient number of handles\n");
+ handle = 0;
+ } else {
+ handle = head->hgen;
+ }
+
+ return handle;
+}
+
+static int fl_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg;
+ struct cls_fl_filter *fnew;
+ struct nlattr *tb[TCA_FLOWER_MAX + 1];
+ struct fl_flow_mask mask = {};
+ int err;
+
+ if (!tca[TCA_OPTIONS])
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy);
+ if (err < 0)
+ return err;
+
+ if (fold && handle && fold->handle != handle)
+ return -EINVAL;
+
+ fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+ if (!fnew)
+ return -ENOBUFS;
+
+ tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
+
+ if (!handle) {
+ handle = fl_grab_new_handle(tp, head);
+ if (!handle) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+ fnew->handle = handle;
+
+ err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
+ if (err)
+ goto errout;
+
+ err = fl_check_assign_mask(head, &mask);
+ if (err)
+ goto errout;
+
+ err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
+ head->ht_params);
+ if (err)
+ goto errout;
+ if (fold)
+ rhashtable_remove_fast(&head->ht, &fold->ht_node,
+ head->ht_params);
+
+ *arg = (unsigned long) fnew;
+
+ if (fold) {
+ list_replace_rcu(&fold->list, &fnew->list);
+ tcf_unbind_filter(tp, &fold->res);
+ call_rcu(&fold->rcu, fl_destroy_filter);
+ } else {
+ list_add_tail_rcu(&fnew->list, &head->filters);
+ }
+
+ return 0;
+
+errout:
+ kfree(fnew);
+ return err;
+}
+
+static int fl_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_filter *f = (struct cls_fl_filter *) arg;
+
+ rhashtable_remove_fast(&head->ht, &f->ht_node,
+ head->ht_params);
+ list_del_rcu(&f->list);
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, fl_destroy_filter);
+ return 0;
+}
+
+static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_filter *f;
+
+ list_for_each_entry_rcu(f, &head->filters, list) {
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, (unsigned long) f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static int fl_dump_key_val(struct sk_buff *skb,
+ void *val, int val_type,
+ void *mask, int mask_type, int len)
+{
+ int err;
+
+ if (!memchr_inv(mask, 0, len))
+ return 0;
+ err = nla_put(skb, val_type, len, val);
+ if (err)
+ return err;
+ if (mask_type != TCA_FLOWER_UNSPEC) {
+ err = nla_put(skb, mask_type, len, mask);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_filter *f = (struct cls_fl_filter *) fh;
+ struct nlattr *nest;
+ struct fl_flow_key *key, *mask;
+
+ if (!f)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
+ goto nla_put_failure;
+
+ key = &f->key;
+ mask = &head->mask.key;
+
+ if (mask->indev_ifindex) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_index(net, key->indev_ifindex);
+ if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name))
+ goto nla_put_failure;
+ }
+
+ if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
+ mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
+ sizeof(key->eth.dst)) ||
+ fl_dump_key_val(skb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC,
+ mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
+ sizeof(key->eth.src)) ||
+ fl_dump_key_val(skb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE,
+ &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.n_proto)))
+ goto nla_put_failure;
+ if ((key->basic.n_proto == htons(ETH_P_IP) ||
+ key->basic.n_proto == htons(ETH_P_IPV6)) &&
+ fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
+ &mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.ip_proto)))
+ goto nla_put_failure;
+
+ if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
+ (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
+ &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+ sizeof(key->ipv4.src)) ||
+ fl_dump_key_val(skb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST,
+ &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK,
+ sizeof(key->ipv4.dst))))
+ goto nla_put_failure;
+ else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS &&
+ (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
+ &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+ sizeof(key->ipv6.src)) ||
+ fl_dump_key_val(skb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST,
+ &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK,
+ sizeof(key->ipv6.dst))))
+ goto nla_put_failure;
+
+ if (key->basic.ip_proto == IPPROTO_TCP &&
+ (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
+ &mask->tp.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp.src)) ||
+ fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
+ &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp.dst))))
+ goto nla_put_failure;
+ else if (key->basic.ip_proto == IPPROTO_UDP &&
+ (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
+ &mask->tp.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp.src)) ||
+ fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
+ &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp.dst))))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_fl_ops __read_mostly = {
+ .kind = "flower",
+ .classify = fl_classify,
+ .init = fl_init,
+ .destroy = fl_destroy,
+ .get = fl_get,
+ .change = fl_change,
+ .delete = fl_delete,
+ .walk = fl_walk,
+ .dump = fl_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init cls_fl_init(void)
+{
+ return register_tcf_proto_ops(&cls_fl_ops);
+}
+
+static void __exit cls_fl_exit(void)
+{
+ unregister_tcf_proto_ops(&cls_fl_ops);
+}
+
+module_init(cls_fl_init);
+module_exit(cls_fl_exit);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
+MODULE_DESCRIPTION("Flower classifier");
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/net/sched/cls_rsvp.h b/kernel/net/sched/cls_rsvp.h
index 02fa82792..f9c9fc075 100644
--- a/kernel/net/sched/cls_rsvp.h
+++ b/kernel/net/sched/cls_rsvp.h
@@ -283,12 +283,22 @@ static int rsvp_init(struct tcf_proto *tp)
return -ENOBUFS;
}
-static void
-rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
+static void rsvp_delete_filter_rcu(struct rcu_head *head)
{
- tcf_unbind_filter(tp, &f->res);
+ struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu);
+
tcf_exts_destroy(&f->exts);
- kfree_rcu(f, rcu);
+ kfree(f);
+}
+
+static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
+{
+ tcf_unbind_filter(tp, &f->res);
+ /* all classifiers are required to call tcf_exts_destroy() after rcu
+ * grace period, since converted-to-rcu actions are relying on that
+ * in cleanup() callback
+ */
+ call_rcu(&f->rcu, rsvp_delete_filter_rcu);
}
static bool rsvp_destroy(struct tcf_proto *tp, bool force)
diff --git a/kernel/net/sched/cls_tcindex.c b/kernel/net/sched/cls_tcindex.c
index a557dbaf5..944c8ff45 100644
--- a/kernel/net/sched/cls_tcindex.c
+++ b/kernel/net/sched/cls_tcindex.c
@@ -27,6 +27,7 @@
struct tcindex_filter_result {
struct tcf_exts exts;
struct tcf_result res;
+ struct rcu_head rcu;
};
struct tcindex_filter {
@@ -133,8 +134,23 @@ static int tcindex_init(struct tcf_proto *tp)
return 0;
}
-static int
-tcindex_delete(struct tcf_proto *tp, unsigned long arg)
+static void tcindex_destroy_rexts(struct rcu_head *head)
+{
+ struct tcindex_filter_result *r;
+
+ r = container_of(head, struct tcindex_filter_result, rcu);
+ tcf_exts_destroy(&r->exts);
+}
+
+static void tcindex_destroy_fexts(struct rcu_head *head)
+{
+ struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu);
+
+ tcf_exts_destroy(&f->result.exts);
+ kfree(f);
+}
+
+static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
@@ -162,9 +178,14 @@ found:
rcu_assign_pointer(*walk, rtnl_dereference(f->next));
}
tcf_unbind_filter(tp, &r->res);
- tcf_exts_destroy(&r->exts);
+ /* all classifiers are required to call tcf_exts_destroy() after rcu
+ * grace period, since converted-to-rcu actions are relying on that
+ * in cleanup() callback
+ */
if (f)
- kfree_rcu(f, rcu);
+ call_rcu(&f->rcu, tcindex_destroy_fexts);
+ else
+ call_rcu(&r->rcu, tcindex_destroy_rexts);
return 0;
}
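Both the cls_rsvp and cls_tcindex changes above replace a direct kfree_rcu() with call_rcu() and a callback that also runs tcf_exts_destroy(), since actions converted to RCU expect their cleanup to run only after a grace period. A minimal sketch of that deferred-destroy pattern; my_filter, my_exts_destroy and my_filter_free_rcu are illustrative names, only call_rcu(), container_of() and kfree() are real kernel APIs:

/* Sketch: unlink the object from RCU-visible structures, then defer
 * both the extension teardown and the free to an RCU callback, as
 * rsvp_delete_filter_rcu()/tcindex_destroy_rexts() do above. */
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_filter {
	int handle;
	struct rcu_head rcu;
};

static void my_exts_destroy(struct my_filter *f)
{
	/* tear down f's actions here; safe, readers are gone by now */
}

static void my_filter_free_rcu(struct rcu_head *head)
{
	struct my_filter *f = container_of(head, struct my_filter, rcu);

	my_exts_destroy(f);
	kfree(f);
}

static void my_filter_delete(struct my_filter *f)
{
	/* caller has already removed f from lists/hashes under RTNL */
	call_rcu(&f->rcu, my_filter_free_rcu);
}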
diff --git a/kernel/net/sched/em_ipset.c b/kernel/net/sched/em_ipset.c
index a3d79c8bf..c66ca9400 100644
--- a/kernel/net/sched/em_ipset.c
+++ b/kernel/net/sched/em_ipset.c
@@ -92,9 +92,10 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
rcu_read_lock();
- if (dev && skb->skb_iif)
- indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif);
+ if (skb->skb_iif)
+ indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
+ acpar.net = em->net;
acpar.in = indev ? indev : dev;
acpar.out = dev;
diff --git a/kernel/net/sched/em_meta.c b/kernel/net/sched/em_meta.c
index b5294ce20..f2aabc008 100644
--- a/kernel/net/sched/em_meta.c
+++ b/kernel/net/sched/em_meta.c
@@ -343,119 +343,145 @@ META_COLLECTOR(int_sk_refcnt)
META_COLLECTOR(int_sk_rcvbuf)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_rcvbuf;
+ dst->value = sk->sk_rcvbuf;
}
META_COLLECTOR(int_sk_shutdown)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_shutdown;
+ dst->value = sk->sk_shutdown;
}
META_COLLECTOR(int_sk_proto)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_protocol;
+ dst->value = sk->sk_protocol;
}
META_COLLECTOR(int_sk_type)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_type;
+ dst->value = sk->sk_type;
}
META_COLLECTOR(int_sk_rmem_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = sk_rmem_alloc_get(skb->sk);
+ dst->value = sk_rmem_alloc_get(sk);
}
META_COLLECTOR(int_sk_wmem_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = sk_wmem_alloc_get(skb->sk);
+ dst->value = sk_wmem_alloc_get(sk);
}
META_COLLECTOR(int_sk_omem_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = atomic_read(&skb->sk->sk_omem_alloc);
+ dst->value = atomic_read(&sk->sk_omem_alloc);
}
META_COLLECTOR(int_sk_rcv_qlen)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_receive_queue.qlen;
+ dst->value = sk->sk_receive_queue.qlen;
}
META_COLLECTOR(int_sk_snd_qlen)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_write_queue.qlen;
+ dst->value = sk->sk_write_queue.qlen;
}
META_COLLECTOR(int_sk_wmem_queued)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_wmem_queued;
+ dst->value = sk->sk_wmem_queued;
}
META_COLLECTOR(int_sk_fwd_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_forward_alloc;
+ dst->value = sk->sk_forward_alloc;
}
META_COLLECTOR(int_sk_sndbuf)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_sndbuf;
+ dst->value = sk->sk_sndbuf;
}
META_COLLECTOR(int_sk_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = (__force int) skb->sk->sk_allocation;
+ dst->value = (__force int) sk->sk_allocation;
}
META_COLLECTOR(int_sk_hash)
@@ -469,92 +495,112 @@ META_COLLECTOR(int_sk_hash)
META_COLLECTOR(int_sk_lingertime)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_lingertime / HZ;
+ dst->value = sk->sk_lingertime / HZ;
}
META_COLLECTOR(int_sk_err_qlen)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_error_queue.qlen;
+ dst->value = sk->sk_error_queue.qlen;
}
META_COLLECTOR(int_sk_ack_bl)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_ack_backlog;
+ dst->value = sk->sk_ack_backlog;
}
META_COLLECTOR(int_sk_max_ack_bl)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_max_ack_backlog;
+ dst->value = sk->sk_max_ack_backlog;
}
META_COLLECTOR(int_sk_prio)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_priority;
+ dst->value = sk->sk_priority;
}
META_COLLECTOR(int_sk_rcvlowat)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_rcvlowat;
+ dst->value = sk->sk_rcvlowat;
}
META_COLLECTOR(int_sk_rcvtimeo)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_rcvtimeo / HZ;
+ dst->value = sk->sk_rcvtimeo / HZ;
}
META_COLLECTOR(int_sk_sndtimeo)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_sndtimeo / HZ;
+ dst->value = sk->sk_sndtimeo / HZ;
}
META_COLLECTOR(int_sk_sendmsg_off)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_frag.offset;
+ dst->value = sk->sk_frag.offset;
}
META_COLLECTOR(int_sk_write_pend)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_write_pending;
+ dst->value = sk->sk_write_pending;
}
/**************************************************************************
diff --git a/kernel/net/sched/sch_api.c b/kernel/net/sched/sch_api.c
index 1e1c89e51..af1acf009 100644
--- a/kernel/net/sched/sch_api.c
+++ b/kernel/net/sched/sch_api.c
@@ -253,7 +253,8 @@ int qdisc_set_default(const char *name)
}
/* We know handle. Find qdisc among all qdisc's attached to device
- (root qdisc, all its children, children of children etc.)
+ * (root qdisc, all its children, children of children etc.)
+ * Note: caller either uses rtnl or rcu_read_lock()
*/
static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
@@ -264,7 +265,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
root->handle == handle)
return root;
- list_for_each_entry(q, &root->list, list) {
+ list_for_each_entry_rcu(q, &root->list, list) {
if (q->handle == handle)
return q;
}
@@ -277,15 +278,18 @@ void qdisc_list_add(struct Qdisc *q)
struct Qdisc *root = qdisc_dev(q)->qdisc;
WARN_ON_ONCE(root == &noop_qdisc);
- list_add_tail(&q->list, &root->list);
+ ASSERT_RTNL();
+ list_add_tail_rcu(&q->list, &root->list);
}
}
EXPORT_SYMBOL(qdisc_list_add);
void qdisc_list_del(struct Qdisc *q)
{
- if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
- list_del(&q->list);
+ if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
+ ASSERT_RTNL();
+ list_del_rcu(&q->list);
+ }
}
EXPORT_SYMBOL(qdisc_list_del);
@@ -750,14 +754,18 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
if (n == 0)
return;
drops = max_t(int, n, 0);
+ rcu_read_lock();
while ((parentid = sch->parent)) {
if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
- return;
+ break;
+ if (sch->flags & TCQ_F_NOPARENT)
+ break;
+ /* TODO: perform the search on a per txq basis */
sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
if (sch == NULL) {
- WARN_ON(parentid != TC_H_ROOT);
- return;
+ WARN_ON_ONCE(parentid != TC_H_ROOT);
+ break;
}
cops = sch->ops->cl_ops;
if (cops->qlen_notify) {
@@ -768,6 +776,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
sch->q.qlen -= n;
__qdisc_qstats_drop(sch, drops);
}
+ rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
@@ -1806,57 +1815,46 @@ done:
* to this qdisc, (optionally) tests for protocol and asks
* specific classifiers.
*/
-int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res, bool compat_mode)
{
__be16 protocol = tc_skb_protocol(skb);
- int err;
+#ifdef CONFIG_NET_CLS_ACT
+ const struct tcf_proto *old_tp = tp;
+ int limit = 0;
+reclassify:
+#endif
for (; tp; tp = rcu_dereference_bh(tp->next)) {
+ int err;
+
if (tp->protocol != protocol &&
tp->protocol != htons(ETH_P_ALL))
continue;
- err = tp->classify(skb, tp, res);
- if (err >= 0) {
+ err = tp->classify(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
- if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
- skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
+ if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode))
+ goto reset;
#endif
+ if (err >= 0)
return err;
- }
}
- return -1;
-}
-EXPORT_SYMBOL(tc_classify_compat);
-
-int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
-{
- int err = 0;
-#ifdef CONFIG_NET_CLS_ACT
- const struct tcf_proto *otp = tp;
-reclassify:
-#endif
- err = tc_classify_compat(skb, tp, res);
+ return -1;
#ifdef CONFIG_NET_CLS_ACT
- if (err == TC_ACT_RECLASSIFY) {
- u32 verd = G_TC_VERD(skb->tc_verd);
- tp = otp;
-
- if (verd++ >= MAX_REC_LOOP) {
- net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
- tp->q->ops->id,
- tp->prio & 0xffff,
- ntohs(tp->protocol));
- return TC_ACT_SHOT;
- }
- skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
- goto reclassify;
+reset:
+ if (unlikely(limit++ >= MAX_REC_LOOP)) {
+ net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
+ tp->q->ops->id, tp->prio & 0xffff,
+ ntohs(tp->protocol));
+ return TC_ACT_SHOT;
}
+
+ tp = old_tp;
+ protocol = tc_skb_protocol(skb);
+ goto reclassify;
#endif
- return err;
}
EXPORT_SYMBOL(tc_classify);
@@ -1885,13 +1883,10 @@ EXPORT_SYMBOL(tcf_destroy_chain);
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
- struct timespec ts;
-
- hrtimer_get_res(CLOCK_MONOTONIC, &ts);
seq_printf(seq, "%08x %08x %08x %08x\n",
(u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1000000,
- (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
+ (u32)NSEC_PER_SEC / hrtimer_resolution);
return 0;
}
@@ -1956,6 +1951,7 @@ static int __init pktsched_init(void)
register_qdisc(&bfifo_qdisc_ops);
register_qdisc(&pfifo_head_drop_qdisc_ops);
register_qdisc(&mq_qdisc_ops);
+ register_qdisc(&noqueue_qdisc_ops);
rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
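tc_classify_compat() is folded into tc_classify() above; callers now pass a compat_mode flag, and reclassification is bounded by a local limit counter instead of the tc_verd bits that used to be carried in the skb. A userspace sketch of that bounded reclassify loop, with MAX_REC_LOOP assumed to be the kernel's value of 4 and the verdict codes reduced to the two that matter here:

/* Sketch of the bounded reclassify loop in the new tc_classify():
 * walk the filter chain; on a RECLASSIFY verdict restart from the
 * head, but give up after MAX_REC_LOOP restarts. */
#include <stdio.h>

#define MAX_REC_LOOP	4	/* assumed, mirrors the kernel macro */
#define ACT_RECLASSIFY	1
#define ACT_SHOT	2

struct filter {
	int (*classify)(int pkt);
	struct filter *next;
};

static int classify(struct filter *head, int pkt)
{
	struct filter *tp = head;
	int limit = 0;

reclassify:
	for (; tp; tp = tp->next) {
		int err = tp->classify(pkt);

		if (err == ACT_RECLASSIFY)
			goto reset;
		if (err >= 0)	/* a filter matched: return its verdict */
			return err;
	}
	return -1;		/* nothing matched */

reset:
	if (limit++ >= MAX_REC_LOOP) {
		fprintf(stderr, "reclassify loop, dropping\n");
		return ACT_SHOT;
	}
	tp = head;		/* restart from the first filter */
	goto reclassify;
}

static int never_matches(int pkt)  { (void)pkt; return -1; }
static int always_matches(int pkt) { (void)pkt; return 0x10; }

int main(void)
{
	struct filter f2 = { always_matches, NULL };
	struct filter f1 = { never_matches, &f2 };

	printf("verdict 0x%x\n", classify(&f1, 0));
	return 0;
}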
diff --git a/kernel/net/sched/sch_atm.c b/kernel/net/sched/sch_atm.c
index e3e2cc5fd..1911af3ca 100644
--- a/kernel/net/sched/sch_atm.c
+++ b/kernel/net/sched/sch_atm.c
@@ -375,7 +375,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
list_for_each_entry(flow, &p->flows, list) {
fl = rcu_dereference_bh(flow->filter_list);
if (fl) {
- result = tc_classify_compat(skb, fl, &res);
+ result = tc_classify(skb, fl, &res, true);
if (result < 0)
continue;
flow = (struct atm_flow_data *)res.class;
diff --git a/kernel/net/sched/sch_blackhole.c b/kernel/net/sched/sch_blackhole.c
index 094a874b4..3fee70d98 100644
--- a/kernel/net/sched/sch_blackhole.c
+++ b/kernel/net/sched/sch_blackhole.c
@@ -11,7 +11,7 @@
* Note: Quantum tunneling is not supported.
*/
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
@@ -37,17 +37,8 @@ static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = {
.owner = THIS_MODULE,
};
-static int __init blackhole_module_init(void)
+static int __init blackhole_init(void)
{
return register_qdisc(&blackhole_qdisc_ops);
}
-
-static void __exit blackhole_module_exit(void)
-{
- unregister_qdisc(&blackhole_qdisc_ops);
-}
-
-module_init(blackhole_module_init)
-module_exit(blackhole_module_exit)
-
-MODULE_LICENSE("GPL");
+device_initcall(blackhole_init)
diff --git a/kernel/net/sched/sch_cbq.c b/kernel/net/sched/sch_cbq.c
index beeb75f80..c538d9e4a 100644
--- a/kernel/net/sched/sch_cbq.c
+++ b/kernel/net/sched/sch_cbq.c
@@ -240,7 +240,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
/*
* Step 2+n. Apply classifier.
*/
- result = tc_classify_compat(skb, fl, &res);
+ result = tc_classify(skb, fl, &res, true);
if (!fl || result < 0)
goto fallback;
diff --git a/kernel/net/sched/sch_choke.c b/kernel/net/sched/sch_choke.c
index c009eb904..5ffb8b833 100644
--- a/kernel/net/sched/sch_choke.c
+++ b/kernel/net/sched/sch_choke.c
@@ -18,7 +18,7 @@
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
/*
CHOKe stateless AQM for fair bandwidth allocation
@@ -133,16 +133,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
--sch->q.qlen;
}
-/* private part of skb->cb[] that a qdisc is allowed to use
- * is limited to QDISC_CB_PRIV_LEN bytes.
- * As a flow key might be too large, we store a part of it only.
- */
-#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3)
-
struct choke_skb_cb {
u16 classid;
u8 keys_valid;
- u8 keys[QDISC_CB_PRIV_LEN - 3];
+ struct flow_keys_digest keys;
};
static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
@@ -176,19 +170,19 @@ static bool choke_match_flow(struct sk_buff *skb1,
if (!choke_skb_cb(skb1)->keys_valid) {
choke_skb_cb(skb1)->keys_valid = 1;
- skb_flow_dissect(skb1, &temp);
- memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN);
+ skb_flow_dissect_flow_keys(skb1, &temp, 0);
+ make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
}
if (!choke_skb_cb(skb2)->keys_valid) {
choke_skb_cb(skb2)->keys_valid = 1;
- skb_flow_dissect(skb2, &temp);
- memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN);
+ skb_flow_dissect_flow_keys(skb2, &temp, 0);
+ make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
}
return !memcmp(&choke_skb_cb(skb1)->keys,
&choke_skb_cb(skb2)->keys,
- CHOKE_K_LEN);
+ sizeof(choke_skb_cb(skb1)->keys));
}
/*
@@ -207,7 +201,7 @@ static bool choke_classify(struct sk_buff *skb,
int result;
fl = rcu_dereference_bh(q->filter_list);
- result = tc_classify(skb, fl, &res);
+ result = tc_classify(skb, fl, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -391,6 +385,19 @@ static void choke_reset(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
+ while (q->head != q->tail) {
+ struct sk_buff *skb = q->tab[q->head];
+
+ q->head = (q->head + 1) & q->tab_mask;
+ if (!skb)
+ continue;
+ qdisc_qstats_backlog_dec(sch, skb);
+ --sch->q.qlen;
+ qdisc_drop(skb, sch);
+ }
+
+ memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
+ q->head = q->tail = 0;
red_restart(&q->vars);
}
@@ -546,65 +553,6 @@ static void choke_destroy(struct Qdisc *sch)
choke_free(q->tab);
}
-static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
-{
- return NULL;
-}
-
-static unsigned long choke_get(struct Qdisc *sch, u32 classid)
-{
- return 0;
-}
-
-static void choke_put(struct Qdisc *q, unsigned long cl)
-{
-}
-
-static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
- u32 classid)
-{
- return 0;
-}
-
-static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch,
- unsigned long cl)
-{
- struct choke_sched_data *q = qdisc_priv(sch);
-
- if (cl)
- return NULL;
- return &q->filter_list;
-}
-
-static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
- struct sk_buff *skb, struct tcmsg *tcm)
-{
- tcm->tcm_handle |= TC_H_MIN(cl);
- return 0;
-}
-
-static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
-{
- if (!arg->stop) {
- if (arg->fn(sch, 1, arg) < 0) {
- arg->stop = 1;
- return;
- }
- arg->count++;
- }
-}
-
-static const struct Qdisc_class_ops choke_class_ops = {
- .leaf = choke_leaf,
- .get = choke_get,
- .put = choke_put,
- .tcf_chain = choke_find_tcf,
- .bind_tcf = choke_bind,
- .unbind_tcf = choke_put,
- .dump = choke_dump_class,
- .walk = choke_walk,
-};
-
static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
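The choke hunks above drop the truncated copy of struct flow_keys that used to be squeezed into the limited per-skb qdisc cb area and store a fixed-size flow_keys_digest instead, so flow equality becomes a plain memcmp() of two digests. A userspace sketch of that idea, with an illustrative digest layout rather than the kernel's:

/* Sketch: keep only a small digest of the dissected flow keys per
 * queued packet and compare digests to decide whether two packets
 * belong to the same flow. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct flow_digest {		/* small, fits in a constrained cb area */
	uint32_t addrs[2];
	uint16_t ports[2];
	uint8_t  ip_proto;
	uint8_t  pad[3];
};

static int same_flow(const struct flow_digest *a, const struct flow_digest *b)
{
	return !memcmp(a, b, sizeof(*a));
}

int main(void)
{
	struct flow_digest a = { {0x0a000001, 0x0a000002}, {1234, 80}, 6, {0} };
	struct flow_digest b = a;

	printf("same flow: %d\n", same_flow(&a, &b));
	return 0;
}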
diff --git a/kernel/net/sched/sch_codel.c b/kernel/net/sched/sch_codel.c
index 7a0bdb16a..535007d5f 100644
--- a/kernel/net/sched/sch_codel.c
+++ b/kernel/net/sched/sch_codel.c
@@ -6,7 +6,7 @@
*
* Implemented on linux by :
* Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
- * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -109,6 +109,7 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
[TCA_CODEL_LIMIT] = { .type = NLA_U32 },
[TCA_CODEL_INTERVAL] = { .type = NLA_U32 },
[TCA_CODEL_ECN] = { .type = NLA_U32 },
+ [TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 },
};
static int codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -133,6 +134,12 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
}
+ if (tb[TCA_CODEL_CE_THRESHOLD]) {
+ u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]);
+
+ q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
if (tb[TCA_CODEL_INTERVAL]) {
u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
@@ -201,7 +208,10 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_CODEL_ECN,
q->params.ecn))
goto nla_put_failure;
-
+ if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD &&
+ nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD,
+ codel_time_to_us(q->params.ce_threshold)))
+ goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
@@ -220,6 +230,7 @@ static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
.ldelay = codel_time_to_us(q->vars.ldelay),
.dropping = q->vars.dropping,
.ecn_mark = q->stats.ecn_mark,
+ .ce_mark = q->stats.ce_mark,
};
if (q->vars.dropping) {
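TCA_CODEL_CE_THRESHOLD (and TCA_FQ_CODEL_CE_THRESHOLD further down) is supplied in microseconds and stored in codel's internal time unit, nanoseconds shifted right by CODEL_SHIFT; codel_time_to_us() reverses that for the dump. A quick round-trip in plain C, with CODEL_SHIFT assumed to be 10 as in include/net/codel.h:

/* Round-trip of the CE threshold conversion used above; note the
 * small quantization introduced by the shift. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000ULL
#define CODEL_SHIFT	10	/* assumed from include/net/codel.h */

int main(void)
{
	uint64_t us = 250;					/* user supplied */
	uint64_t ticks = (us * NSEC_PER_USEC) >> CODEL_SHIFT;	/* stored */
	uint64_t back = (ticks << CODEL_SHIFT) / NSEC_PER_USEC;	/* dumped */

	printf("%llu us -> %llu ticks -> %llu us\n",
	       (unsigned long long)us, (unsigned long long)ticks,
	       (unsigned long long)back);
	return 0;
}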
diff --git a/kernel/net/sched/sch_drr.c b/kernel/net/sched/sch_drr.c
index 338706092..f26bdea87 100644
--- a/kernel/net/sched/sch_drr.c
+++ b/kernel/net/sched/sch_drr.c
@@ -331,7 +331,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
fl = rcu_dereference_bh(q->filter_list);
- result = tc_classify(skb, fl, &res);
+ result = tc_classify(skb, fl, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
diff --git a/kernel/net/sched/sch_dsmark.c b/kernel/net/sched/sch_dsmark.c
index 66700a611..f357f34d0 100644
--- a/kernel/net/sched/sch_dsmark.c
+++ b/kernel/net/sched/sch_dsmark.c
@@ -35,14 +35,20 @@
#define NO_DEFAULT_INDEX (1 << 16)
+struct mask_value {
+ u8 mask;
+ u8 value;
+};
+
struct dsmark_qdisc_data {
struct Qdisc *q;
struct tcf_proto __rcu *filter_list;
- u8 *mask; /* "owns" the array */
- u8 *value;
+ struct mask_value *mv;
u16 indices;
+ u8 set_tc_index;
u32 default_index; /* index range is 0...0xffff */
- int set_tc_index;
+#define DSMARK_EMBEDDED_SZ 16
+ struct mask_value embedded[DSMARK_EMBEDDED_SZ];
};
static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
@@ -116,7 +122,6 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_DSMARK_MAX + 1];
int err = -EINVAL;
- u8 mask = 0;
pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
__func__, sch, p, classid, parent, *arg);
@@ -133,14 +138,11 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
if (err < 0)
goto errout;
- if (tb[TCA_DSMARK_MASK])
- mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
-
if (tb[TCA_DSMARK_VALUE])
- p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
+ p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]);
if (tb[TCA_DSMARK_MASK])
- p->mask[*arg - 1] = mask;
+ p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
err = 0;
@@ -155,8 +157,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
if (!dsmark_valid_index(p, arg))
return -EINVAL;
- p->mask[arg - 1] = 0xff;
- p->value[arg - 1] = 0;
+ p->mv[arg - 1].mask = 0xff;
+ p->mv[arg - 1].value = 0;
return 0;
}
@@ -173,7 +175,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
return;
for (i = 0; i < p->indices; i++) {
- if (p->mask[i] == 0xff && !p->value[i])
+ if (p->mv[i].mask == 0xff && !p->mv[i].value)
goto ignore;
if (walker->count >= walker->skip) {
if (walker->fn(sch, i + 1, walker) < 0) {
@@ -230,7 +232,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
else {
struct tcf_result res;
struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
- int result = tc_classify(skb, fl, &res);
+ int result = tc_classify(skb, fl, &res, false);
pr_debug("result %d class 0x%04x\n", result, res.classid);
@@ -291,12 +293,12 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
- ipv4_change_dsfield(ip_hdr(skb), p->mask[index],
- p->value[index]);
+ ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask,
+ p->mv[index].value);
break;
case htons(ETH_P_IPV6):
- ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index],
- p->value[index]);
+ ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask,
+ p->mv[index].value);
break;
default:
/*
@@ -304,7 +306,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
* This way, we can send non-IP traffic through dsmark
* and don't need yet another qdisc as a bypass.
*/
- if (p->mask[index] != 0xff || p->value[index])
+ if (p->mv[index].mask != 0xff || p->mv[index].value)
pr_warn("%s: unsupported protocol %d\n",
__func__, ntohs(tc_skb_protocol(skb)));
break;
@@ -346,7 +348,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
int err = -EINVAL;
u32 default_index = NO_DEFAULT_INDEX;
u16 indices;
- u8 *mask;
+ int i;
pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);
@@ -366,18 +368,18 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_DSMARK_DEFAULT_INDEX])
default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);
- mask = kmalloc(indices * 2, GFP_KERNEL);
- if (mask == NULL) {
+ if (indices <= DSMARK_EMBEDDED_SZ)
+ p->mv = p->embedded;
+ else
+ p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL);
+ if (!p->mv) {
err = -ENOMEM;
goto errout;
}
-
- p->mask = mask;
- memset(p->mask, 0xff, indices);
-
- p->value = p->mask + indices;
- memset(p->value, 0, indices);
-
+ for (i = 0; i < indices; i++) {
+ p->mv[i].mask = 0xff;
+ p->mv[i].value = 0;
+ }
p->indices = indices;
p->default_index = default_index;
p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
@@ -410,7 +412,8 @@ static void dsmark_destroy(struct Qdisc *sch)
tcf_destroy_chain(&p->filter_list);
qdisc_destroy(p->q);
- kfree(p->mask);
+ if (p->mv != p->embedded)
+ kfree(p->mv);
}
static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
@@ -430,8 +433,8 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
- if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) ||
- nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]))
+ if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) ||
+ nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value))
goto nla_put_failure;
return nla_nest_end(skb, opts);
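dsmark now keeps its per-class (mask, value) pairs in a struct mask_value array that lives inside the qdisc private data when indices <= DSMARK_EMBEDDED_SZ and is heap-allocated with kmalloc_array() otherwise; dsmark_destroy() only frees the heap case. A userspace sketch of that small-buffer-or-heap pattern, using illustrative names:

/* Sketch: small tables use the embedded buffer, large ones fall back
 * to the heap, and only the heap case is freed on destroy. */
#include <stdlib.h>

#define EMBEDDED_SZ 16

struct mv {
	unsigned char mask;
	unsigned char value;
};

struct table {
	struct mv *mv;
	unsigned int indices;
	struct mv embedded[EMBEDDED_SZ];
};

static int table_init(struct table *t, unsigned int indices)
{
	unsigned int i;

	if (indices <= EMBEDDED_SZ)
		t->mv = t->embedded;		/* no allocation needed */
	else
		t->mv = malloc(indices * sizeof(*t->mv));
	if (!t->mv)
		return -1;

	for (i = 0; i < indices; i++) {
		t->mv[i].mask = 0xff;		/* pass the DS field through */
		t->mv[i].value = 0;
	}
	t->indices = indices;
	return 0;
}

static void table_destroy(struct table *t)
{
	if (t->mv != t->embedded)		/* only free the heap case */
		free(t->mv);
}

int main(void)
{
	struct table small, big;

	table_init(&small, 8);
	table_init(&big, 1024);
	table_destroy(&small);
	table_destroy(&big);
	return 0;
}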
diff --git a/kernel/net/sched/sch_fifo.c b/kernel/net/sched/sch_fifo.c
index 2e2398cfc..2177eac0a 100644
--- a/kernel/net/sched/sch_fifo.c
+++ b/kernel/net/sched/sch_fifo.c
@@ -54,7 +54,7 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
if (opt == NULL) {
- u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
+ u32 limit = qdisc_dev(sch)->tx_queue_len;
if (is_bfifo)
limit *= psched_mtu(qdisc_dev(sch));
diff --git a/kernel/net/sched/sch_fq.c b/kernel/net/sched/sch_fq.c
index f377702d4..109b23227 100644
--- a/kernel/net/sched/sch_fq.c
+++ b/kernel/net/sched/sch_fq.c
@@ -224,13 +224,16 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
return &q->internal;
- /* SYNACK messages are attached to a listener socket.
- * 1) They are not part of a 'flow' yet
- * 2) We do not want to rate limit them (eg SYNFLOOD attack),
+ /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
+ * or a listener (SYNCOOKIE mode)
+ * 1) request sockets are not full blown,
+ * they do not contain sk_pacing_rate
+ * 2) They are not part of a 'flow' yet
+ * 3) We do not want to rate limit them (eg SYNFLOOD attack),
* especially if the listener set SO_MAX_PACING_RATE
- * 3) We pretend they are orphaned
+ * 4) We pretend they are orphaned
*/
- if (!sk || sk->sk_state == TCP_LISTEN) {
+ if (!sk || sk_listener(sk)) {
unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
/* By forcing low order bit to 1, we make sure to not
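
Note on the sch_fq hunk above: the listener check is widened from TCP_LISTEN to sk_listener(), so SYNACKs sent on behalf of TCP_NEW_SYN_RECV request sockets are also treated as orphaned. For reference, sk_listener() in this kernel generation reads roughly as below; treat it as a sketch rather than the authoritative include/net/sock.h definition:

static inline bool sk_listener(const struct sock *sk)
{
	return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
}
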
diff --git a/kernel/net/sched/sch_fq_codel.c b/kernel/net/sched/sch_fq_codel.c
index 9291598b5..4c834e93d 100644
--- a/kernel/net/sched/sch_fq_codel.c
+++ b/kernel/net/sched/sch_fq_codel.c
@@ -6,7 +6,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
*/
#include <linux/module.h>
@@ -23,7 +23,6 @@
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-#include <net/flow_keys.h>
#include <net/codel.h>
/* Fair Queue CoDel.
@@ -68,15 +67,9 @@ struct fq_codel_sched_data {
};
static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
- const struct sk_buff *skb)
+ struct sk_buff *skb)
{
- struct flow_keys keys;
- unsigned int hash;
-
- skb_flow_dissect(skb, &keys);
- hash = jhash_3words((__force u32)keys.dst,
- (__force u32)keys.src ^ keys.ip_proto,
- (__force u32)keys.ports, q->perturbation);
+ u32 hash = skb_get_hash_perturb(skb, q->perturbation);
return reciprocal_scale(hash, q->flows_cnt);
}
@@ -99,7 +92,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
return fq_codel_hash(q, skb) + 1;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- result = tc_classify(skb, filter, &res);
+ result = tc_classify(skb, filter, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -170,6 +163,15 @@ static unsigned int fq_codel_drop(struct Qdisc *sch)
return idx;
}
+static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
+{
+ unsigned int prev_backlog;
+
+ prev_backlog = sch->qstats.backlog;
+ fq_codel_drop(sch);
+ return prev_backlog - sch->qstats.backlog;
+}
+
static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
@@ -286,10 +288,26 @@ begin:
static void fq_codel_reset(struct Qdisc *sch)
{
- struct sk_buff *skb;
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ int i;
- while ((skb = fq_codel_dequeue(sch)) != NULL)
- kfree_skb(skb);
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ for (i = 0; i < q->flows_cnt; i++) {
+ struct fq_codel_flow *flow = q->flows + i;
+
+ while (flow->head) {
+ struct sk_buff *skb = dequeue_head(flow);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ kfree_skb(skb);
+ }
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ codel_vars_init(&flow->cvars);
+ }
+ memset(q->backlogs, 0, q->flows_cnt * sizeof(u32));
+ sch->q.qlen = 0;
}
static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
@@ -299,6 +317,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
[TCA_FQ_CODEL_ECN] = { .type = NLA_U32 },
[TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },
[TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
};
static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -329,6 +348,12 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
}
+ if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) {
+ u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]);
+
+ q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
if (tb[TCA_FQ_CODEL_INTERVAL]) {
u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
@@ -448,6 +473,11 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
q->flows_cnt))
goto nla_put_failure;
+ if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD &&
+ nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD,
+ codel_time_to_us(q->cparams.ce_threshold)))
+ goto nla_put_failure;
+
return nla_nest_end(skb, opts);
nla_put_failure:
@@ -466,6 +496,7 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.qdisc_stats.drop_overlimit = q->drop_overlimit;
st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
st.qdisc_stats.new_flow_count = q->new_flow_count;
+ st.qdisc_stats.ce_mark = q->cstats.ce_mark;
list_for_each(pos, &q->new_flows)
st.qdisc_stats.new_flows_len++;
@@ -598,7 +629,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
.enqueue = fq_codel_enqueue,
.dequeue = fq_codel_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = fq_codel_drop,
+ .drop = fq_codel_qdisc_drop,
.init = fq_codel_init,
.reset = fq_codel_reset,
.destroy = fq_codel_destroy,
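
Note on the fq_codel hunks above: TCA_FQ_CODEL_CE_THRESHOLD is passed in microseconds and stored in codel time units of 2^CODEL_SHIFT nanoseconds (CODEL_SHIFT is assumed to be 10 here, i.e. one unit is 1024 ns), which is why the change path multiplies by NSEC_PER_USEC and shifts right while the dump path converts back with codel_time_to_us(). A small standalone sketch of that round trip with an arbitrary example value:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL
#define CODEL_SHIFT   10	/* assumed to match include/net/codel.h */

int main(void)
{
	uint64_t us = 4000;					/* requested ce_threshold, in microseconds */
	uint64_t units = (us * NSEC_PER_USEC) >> CODEL_SHIFT;	/* value stored in q->cparams */
	uint64_t back = (units << CODEL_SHIFT) / NSEC_PER_USEC;	/* what the dump reports */

	printf("%llu us -> %llu codel units -> %llu us\n",
	       (unsigned long long)us, (unsigned long long)units,
	       (unsigned long long)back);
	return 0;
}
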
diff --git a/kernel/net/sched/sch_generic.c b/kernel/net/sched/sch_generic.c
index 1e346523f..47ef1b11b 100644
--- a/kernel/net/sched/sch_generic.c
+++ b/kernel/net/sched/sch_generic.c
@@ -416,33 +416,25 @@ struct Qdisc noop_qdisc = {
};
EXPORT_SYMBOL(noop_qdisc);
-static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
+static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt)
+{
+ /* register_qdisc() assigns a default of noop_enqueue if unset,
+ * but __dev_queue_xmit() treats noqueue only as such
+ * if this is NULL - so clear it here. */
+ qdisc->enqueue = NULL;
+ return 0;
+}
+
+struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
.id = "noqueue",
.priv_size = 0,
+ .init = noqueue_init,
.enqueue = noop_enqueue,
.dequeue = noop_dequeue,
.peek = noop_dequeue,
.owner = THIS_MODULE,
};
-static struct Qdisc noqueue_qdisc;
-static struct netdev_queue noqueue_netdev_queue = {
- .qdisc = &noqueue_qdisc,
- .qdisc_sleeping = &noqueue_qdisc,
-};
-
-static struct Qdisc noqueue_qdisc = {
- .enqueue = NULL,
- .dequeue = noop_dequeue,
- .flags = TCQ_F_BUILTIN,
- .ops = &noqueue_qdisc_ops,
- .list = LIST_HEAD_INIT(noqueue_qdisc.list),
- .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
- .dev_queue = &noqueue_netdev_queue,
- .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
-};
-
-
static const u8 prio2band[TC_PRIO_MAX + 1] = {
1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};
@@ -666,8 +658,10 @@ static void qdisc_rcu_free(struct rcu_head *head)
{
struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
- if (qdisc_is_percpu_stats(qdisc))
+ if (qdisc_is_percpu_stats(qdisc)) {
free_percpu(qdisc->cpu_bstats);
+ free_percpu(qdisc->cpu_qstats);
+ }
kfree((char *) qdisc - qdisc->padded);
}
@@ -733,18 +727,19 @@ static void attach_one_default_qdisc(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_unused)
{
- struct Qdisc *qdisc = &noqueue_qdisc;
+ struct Qdisc *qdisc;
+ const struct Qdisc_ops *ops = default_qdisc_ops;
- if (dev->tx_queue_len) {
- qdisc = qdisc_create_dflt(dev_queue,
- default_qdisc_ops, TC_H_ROOT);
- if (!qdisc) {
- netdev_info(dev, "activation failed\n");
- return;
- }
- if (!netif_is_multiqueue(dev))
- qdisc->flags |= TCQ_F_ONETXQUEUE;
+ if (dev->priv_flags & IFF_NO_QUEUE)
+ ops = &noqueue_qdisc_ops;
+
+ qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT);
+ if (!qdisc) {
+ netdev_info(dev, "activation failed\n");
+ return;
}
+ if (!netif_is_multiqueue(dev))
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
dev_queue->qdisc_sleeping = qdisc;
}
@@ -755,7 +750,8 @@ static void attach_default_qdiscs(struct net_device *dev)
txq = netdev_get_tx_queue(dev, 0);
- if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
+ if (!netif_is_multiqueue(dev) ||
+ dev->priv_flags & IFF_NO_QUEUE) {
netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
dev->qdisc = txq->qdisc_sleeping;
atomic_inc(&dev->qdisc->refcnt);
@@ -779,7 +775,7 @@ static void transition_one_qdisc(struct net_device *dev,
clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
- if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
+ if (need_watchdog_p) {
dev_queue->trans_start = 0;
*need_watchdog_p = 1;
}
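
Note on the sch_generic hunks above: the static noqueue_qdisc singleton goes away; instead, devices flagged IFF_NO_QUEUE get a real qdisc created from noqueue_qdisc_ops, whose init callback clears ->enqueue so the transmit path still takes the direct-to-driver shortcut. A simplified sketch of the core check this relies on (paraphrased from __dev_queue_xmit(), not a literal copy):

	q = rcu_dereference_bh(txq->qdisc);
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);	/* normal qdisc path */
		goto out;
	}
	/* q->enqueue == NULL: the noqueue case, the driver is called directly */
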
diff --git a/kernel/net/sched/sch_gred.c b/kernel/net/sched/sch_gred.c
index 634529e0c..80105109f 100644
--- a/kernel/net/sched/sch_gred.c
+++ b/kernel/net/sched/sch_gred.c
@@ -165,7 +165,8 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
* if no default DP has been configured. This
* allows for DP flows to be left untouched.
*/
- if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len)
+ if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <=
+ sch->limit))
return qdisc_enqueue_tail(skb, sch);
else
goto drop;
@@ -397,7 +398,10 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
q->DP = dp;
q->prio = prio;
- q->limit = ctl->limit;
+ if (ctl->limit > sch->limit)
+ q->limit = sch->limit;
+ else
+ q->limit = ctl->limit;
if (q->backlog == 0)
red_end_of_idle_period(&q->vars);
@@ -414,6 +418,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
[TCA_GRED_STAB] = { .len = 256 },
[TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) },
[TCA_GRED_MAX_P] = { .type = NLA_U32 },
+ [TCA_GRED_LIMIT] = { .type = NLA_U32 },
};
static int gred_change(struct Qdisc *sch, struct nlattr *opt)
@@ -433,11 +438,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)
if (err < 0)
return err;
- if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL)
+ if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) {
+ if (tb[TCA_GRED_LIMIT] != NULL)
+ sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
return gred_change_table_def(sch, opt);
+ }
if (tb[TCA_GRED_PARMS] == NULL ||
- tb[TCA_GRED_STAB] == NULL)
+ tb[TCA_GRED_STAB] == NULL ||
+ tb[TCA_GRED_LIMIT] != NULL)
return -EINVAL;
max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
@@ -501,6 +510,12 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB])
return -EINVAL;
+ if (tb[TCA_GRED_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
+ else
+ sch->limit = qdisc_dev(sch)->tx_queue_len
+ * psched_mtu(qdisc_dev(sch));
+
return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
}
@@ -531,6 +546,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p))
goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit))
+ goto nla_put_failure;
+
parms = nla_nest_start(skb, TCA_GRED_PARMS);
if (parms == NULL)
goto nla_put_failure;
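
Note on the sch_gred hunks above: when no TCA_GRED_LIMIT is supplied, gred_init() now derives a byte limit from the device, and gred_enqueue() compares the backlog in bytes against it instead of counting packets. A rough worked example under assumed Ethernet-like numbers (not values from this patch):

#include <stdio.h>

int main(void)
{
	unsigned int tx_queue_len = 1000;		/* typical default txqueuelen, assumed */
	unsigned int psched_mtu = 1500 + 14;		/* dev->mtu + hard_header_len, assumed */
	unsigned int limit = tx_queue_len * psched_mtu;	/* default sch->limit in bytes */

	printf("default GRED limit: %u bytes\n", limit);	/* 1514000 */
	return 0;
}
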
diff --git a/kernel/net/sched/sch_hfsc.c b/kernel/net/sched/sch_hfsc.c
index e6c7416d0..b7ebe2c87 100644
--- a/kernel/net/sched/sch_hfsc.c
+++ b/kernel/net/sched/sch_hfsc.c
@@ -1165,7 +1165,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
head = &q->root;
tcf = rcu_dereference_bh(q->root.filter_list);
- while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+ while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_QUEUED:
diff --git a/kernel/net/sched/sch_hhf.c b/kernel/net/sched/sch_hhf.c
index 15d3aabfe..86b04e31e 100644
--- a/kernel/net/sched/sch_hhf.c
+++ b/kernel/net/sched/sch_hhf.c
@@ -9,7 +9,6 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
-#include <net/flow_keys.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
@@ -176,22 +175,6 @@ static u32 hhf_time_stamp(void)
return jiffies;
}
-static unsigned int skb_hash(const struct hhf_sched_data *q,
- const struct sk_buff *skb)
-{
- struct flow_keys keys;
- unsigned int hash;
-
- if (skb->sk && skb->sk->sk_hash)
- return skb->sk->sk_hash;
-
- skb_flow_dissect(skb, &keys);
- hash = jhash_3words((__force u32)keys.dst,
- (__force u32)keys.src ^ keys.ip_proto,
- (__force u32)keys.ports, q->perturbation);
- return hash;
-}
-
/* Looks up a heavy-hitter flow in a chaining list of table T. */
static struct hh_flow_state *seek_list(const u32 hash,
struct list_head *head,
@@ -280,7 +263,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
}
/* Get hashed flow-id of the skb. */
- hash = skb_hash(q, skb);
+ hash = skb_get_hash_perturb(skb, q->perturbation);
/* Check if this packet belongs to an already established HH flow. */
flow_pos = hash & HHF_BIT_MASK;
@@ -385,6 +368,15 @@ static unsigned int hhf_drop(struct Qdisc *sch)
return bucket - q->buckets;
}
+static unsigned int hhf_qdisc_drop(struct Qdisc *sch)
+{
+ unsigned int prev_backlog;
+
+ prev_backlog = sch->qstats.backlog;
+ hhf_drop(sch);
+ return prev_backlog - sch->qstats.backlog;
+}
+
static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct hhf_sched_data *q = qdisc_priv(sch);
@@ -713,7 +705,7 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
.enqueue = hhf_enqueue,
.dequeue = hhf_dequeue,
.peek = qdisc_peek_dequeued,
- .drop = hhf_drop,
+ .drop = hhf_qdisc_drop,
.init = hhf_init,
.reset = hhf_reset,
.destroy = hhf_destroy,
diff --git a/kernel/net/sched/sch_htb.c b/kernel/net/sched/sch_htb.c
index f1acb0f60..15ccd7f8f 100644
--- a/kernel/net/sched/sch_htb.c
+++ b/kernel/net/sched/sch_htb.c
@@ -229,7 +229,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
}
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+ while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_QUEUED:
@@ -1048,11 +1048,9 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_HTB_DIRECT_QLEN])
q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
- else {
+ else
q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
- if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */
- q->direct_qlen = 2;
- }
+
if ((q->rate2quantum = gopt->rate2quantum) < 1)
q->rate2quantum = 1;
q->defcls = gopt->defcls;
diff --git a/kernel/net/sched/sch_ingress.c b/kernel/net/sched/sch_ingress.c
index 4cdbfb856..e7c648fa9 100644
--- a/kernel/net/sched/sch_ingress.c
+++ b/kernel/net/sched/sch_ingress.c
@@ -12,16 +12,10 @@
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
+
#include <net/netlink.h>
#include <net/pkt_sched.h>
-
-struct ingress_qdisc_data {
- struct tcf_proto __rcu *filter_list;
-};
-
-/* ------------------------- Class/flow operations ------------------------- */
-
static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
{
return NULL;
@@ -49,57 +43,24 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
- struct ingress_qdisc_data *p = qdisc_priv(sch);
-
- return &p->filter_list;
-}
-
-/* --------------------------- Qdisc operations ---------------------------- */
+ struct net_device *dev = qdisc_dev(sch);
-static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
-{
- struct ingress_qdisc_data *p = qdisc_priv(sch);
- struct tcf_result res;
- struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
- int result;
-
- result = tc_classify(skb, fl, &res);
-
- qdisc_bstats_update(sch, skb);
- switch (result) {
- case TC_ACT_SHOT:
- result = TC_ACT_SHOT;
- qdisc_qstats_drop(sch);
- break;
- case TC_ACT_STOLEN:
- case TC_ACT_QUEUED:
- result = TC_ACT_STOLEN;
- break;
- case TC_ACT_RECLASSIFY:
- case TC_ACT_OK:
- skb->tc_index = TC_H_MIN(res.classid);
- default:
- result = TC_ACT_OK;
- break;
- }
-
- return result;
+ return &dev->ingress_cl_list;
}
-/* ------------------------------------------------------------- */
-
static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
{
net_inc_ingress_queue();
+ sch->flags |= TCQ_F_CPUSTATS;
return 0;
}
static void ingress_destroy(struct Qdisc *sch)
{
- struct ingress_qdisc_data *p = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
- tcf_destroy_chain(&p->filter_list);
+ tcf_destroy_chain(&dev->ingress_cl_list);
net_dec_ingress_queue();
}
@@ -110,6 +71,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
+
return nla_nest_end(skb, nest);
nla_put_failure:
@@ -130,8 +92,6 @@ static const struct Qdisc_class_ops ingress_class_ops = {
static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
.cl_ops = &ingress_class_ops,
.id = "ingress",
- .priv_size = sizeof(struct ingress_qdisc_data),
- .enqueue = ingress_enqueue,
.init = ingress_init,
.destroy = ingress_destroy,
.dump = ingress_dump,
@@ -148,6 +108,7 @@ static void __exit ingress_module_exit(void)
unregister_qdisc(&ingress_qdisc_ops);
}
-module_init(ingress_module_init)
-module_exit(ingress_module_exit)
+module_init(ingress_module_init);
+module_exit(ingress_module_exit);
+
MODULE_LICENSE("GPL");
diff --git a/kernel/net/sched/sch_mq.c b/kernel/net/sched/sch_mq.c
index f3cbaecd2..3e82f047c 100644
--- a/kernel/net/sched/sch_mq.c
+++ b/kernel/net/sched/sch_mq.c
@@ -63,7 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
if (qdisc == NULL)
goto err;
priv->qdiscs[ntx] = qdisc;
- qdisc->flags |= TCQ_F_ONETXQUEUE;
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
}
sch->flags |= TCQ_F_MQROOT;
@@ -156,7 +156,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
*old = dev_graft_qdisc(dev_queue, new);
if (new)
- new->flags |= TCQ_F_ONETXQUEUE;
+ new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
if (dev->flags & IFF_UP)
dev_activate(dev);
return 0;
diff --git a/kernel/net/sched/sch_mqprio.c b/kernel/net/sched/sch_mqprio.c
index 3811a7454..ad70ecf57 100644
--- a/kernel/net/sched/sch_mqprio.c
+++ b/kernel/net/sched/sch_mqprio.c
@@ -132,7 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
goto err;
}
priv->qdiscs[i] = qdisc;
- qdisc->flags |= TCQ_F_ONETXQUEUE;
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
}
/* If the mqprio options indicate that hardware should own
@@ -209,7 +209,7 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
*old = dev_graft_qdisc(dev_queue, new);
if (new)
- new->flags |= TCQ_F_ONETXQUEUE;
+ new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
if (dev->flags & IFF_UP)
dev_activate(dev);
diff --git a/kernel/net/sched/sch_multiq.c b/kernel/net/sched/sch_multiq.c
index 42dd21887..4e904ca0a 100644
--- a/kernel/net/sched/sch_multiq.c
+++ b/kernel/net/sched/sch_multiq.c
@@ -46,7 +46,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
int err;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- err = tc_classify(skb, fl, &res);
+ err = tc_classify(skb, fl, &res, false);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
diff --git a/kernel/net/sched/sch_netem.c b/kernel/net/sched/sch_netem.c
index 956ead2ca..5abd1d9de 100644
--- a/kernel/net/sched/sch_netem.c
+++ b/kernel/net/sched/sch_netem.c
@@ -440,9 +440,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
struct Qdisc *rootq = qdisc_root(sch);
u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
- q->duplicate = 0;
- qdisc_enqueue_root(skb2, rootq);
+ q->duplicate = 0;
+ rootq->enqueue(skb2, rootq);
q->duplicate = dupsave;
}
diff --git a/kernel/net/sched/sch_plug.c b/kernel/net/sched/sch_plug.c
index 89f8fcf73..5abfe4467 100644
--- a/kernel/net/sched/sch_plug.c
+++ b/kernel/net/sched/sch_plug.c
@@ -130,12 +130,8 @@ static int plug_init(struct Qdisc *sch, struct nlattr *opt)
q->unplug_indefinite = false;
if (opt == NULL) {
- /* We will set a default limit of 100 pkts (~150kB)
- * in case tx_queue_len is not available. The
- * default value is completely arbitrary.
- */
- u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100;
- q->limit = pkt_limit * psched_mtu(qdisc_dev(sch));
+ q->limit = qdisc_dev(sch)->tx_queue_len
+ * psched_mtu(qdisc_dev(sch));
} else {
struct tc_plug_qopt *ctl = nla_data(opt);
@@ -216,6 +212,7 @@ static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
.peek = qdisc_peek_head,
.init = plug_init,
.change = plug_change,
+ .reset = qdisc_reset_queue,
.owner = THIS_MODULE,
};
diff --git a/kernel/net/sched/sch_prio.c b/kernel/net/sched/sch_prio.c
index 8e5cd34aa..ba6487f27 100644
--- a/kernel/net/sched/sch_prio.c
+++ b/kernel/net/sched/sch_prio.c
@@ -42,7 +42,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
if (TC_H_MAJ(skb->priority) != sch->handle) {
fl = rcu_dereference_bh(q->filter_list);
- err = tc_classify(skb, fl, &res);
+ err = tc_classify(skb, fl, &res, false);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
diff --git a/kernel/net/sched/sch_qfq.c b/kernel/net/sched/sch_qfq.c
index 3ec7e88a4..3dc3a6e56 100644
--- a/kernel/net/sched/sch_qfq.c
+++ b/kernel/net/sched/sch_qfq.c
@@ -186,7 +186,6 @@ struct qfq_sched {
u64 oldV, V; /* Precise virtual times. */
struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
- u32 num_active_agg; /* Num. of active aggregates */
u32 wsum; /* weight sum */
u32 iwsum; /* inverse weight sum */
@@ -339,8 +338,7 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{
- if (!hlist_unhashed(&agg->nonfull_next))
- hlist_del_init(&agg->nonfull_next);
+ hlist_del_init(&agg->nonfull_next);
q->wsum -= agg->class_weight;
if (q->wsum != 0)
q->iwsum = ONE_FP / q->wsum;
@@ -719,7 +717,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
fl = rcu_dereference_bh(q->filter_list);
- result = tc_classify(skb, fl, &res);
+ result = tc_classify(skb, fl, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
diff --git a/kernel/net/sched/sch_sfb.c b/kernel/net/sched/sch_sfb.c
index 5819dd826..5bbb6332e 100644
--- a/kernel/net/sched/sch_sfb.c
+++ b/kernel/net/sched/sch_sfb.c
@@ -26,7 +26,6 @@
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
-#include <net/flow_keys.h>
/*
* SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
@@ -259,7 +258,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
struct tcf_result res;
int result;
- result = tc_classify(skb, fl, &res);
+ result = tc_classify(skb, fl, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -285,9 +284,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
int i;
u32 p_min = ~0;
u32 minqlen = ~0;
- u32 r, slot, salt, sfbhash;
+ u32 r, sfbhash;
+ u32 slot = q->slot;
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- struct flow_keys keys;
if (unlikely(sch->q.qlen >= q->limit)) {
qdisc_qstats_overlimit(sch);
@@ -309,22 +308,17 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
fl = rcu_dereference_bh(q->filter_list);
if (fl) {
+ u32 salt;
+
/* If using external classifiers, get result and record it. */
if (!sfb_classify(skb, fl, &ret, &salt))
goto other_drop;
- keys.src = salt;
- keys.dst = 0;
- keys.ports = 0;
+ sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
} else {
- skb_flow_dissect(skb, &keys);
+ sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation);
}
- slot = q->slot;
- sfbhash = jhash_3words((__force u32)keys.dst,
- (__force u32)keys.src,
- (__force u32)keys.ports,
- q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
@@ -356,10 +350,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (unlikely(p_min >= SFB_MAX_PROB)) {
/* Inelastic flow */
if (q->double_buffering) {
- sfbhash = jhash_3words((__force u32)keys.dst,
- (__force u32)keys.src,
- (__force u32)keys.ports,
- q->bins[slot].perturbation);
+ sfbhash = skb_get_hash_perturb(skb,
+ q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
@@ -510,7 +502,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
limit = ctl->limit;
if (limit == 0)
- limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
+ limit = qdisc_dev(sch)->tx_queue_len;
child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
if (IS_ERR(child))
diff --git a/kernel/net/sched/sch_sfq.c b/kernel/net/sched/sch_sfq.c
index b877140be..3abab534e 100644
--- a/kernel/net/sched/sch_sfq.c
+++ b/kernel/net/sched/sch_sfq.c
@@ -23,7 +23,6 @@
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-#include <net/flow_keys.h>
#include <net/red.h>
@@ -156,30 +155,10 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
return &q->dep[val - SFQ_MAX_FLOWS];
}
-/*
- * In order to be able to quickly rehash our queue when timer changes
- * q->perturbation, we store flow_keys in skb->cb[]
- */
-struct sfq_skb_cb {
- struct flow_keys keys;
-};
-
-static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
-{
- qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
- return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
-}
-
static unsigned int sfq_hash(const struct sfq_sched_data *q,
const struct sk_buff *skb)
{
- const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
- unsigned int hash;
-
- hash = jhash_3words((__force u32)keys->dst,
- (__force u32)keys->src ^ keys->ip_proto,
- (__force u32)keys->ports, q->perturbation);
- return hash & (q->divisor - 1);
+ return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
}
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -196,13 +175,11 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
return TC_H_MIN(skb->priority);
fl = rcu_dereference_bh(q->filter_list);
- if (!fl) {
- skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
+ if (!fl)
return sfq_hash(q, skb) + 1;
- }
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- result = tc_classify(skb, fl, &res);
+ result = tc_classify(skb, fl, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -329,10 +306,10 @@ drop:
len = qdisc_pkt_len(skb);
slot->backlog -= len;
sfq_dec(q, x);
- kfree_skb(skb);
sch->q.qlen--;
qdisc_qstats_drop(sch);
qdisc_qstats_backlog_dec(sch, skb);
+ kfree_skb(skb);
return len;
}
diff --git a/kernel/net/sctp/associola.c b/kernel/net/sctp/associola.c
index 197c3f59e..559afd0ee 100644
--- a/kernel/net/sctp/associola.c
+++ b/kernel/net/sctp/associola.c
@@ -1208,20 +1208,22 @@ void sctp_assoc_update(struct sctp_association *asoc,
* within this document.
*
* Our basic strategy is to round-robin transports in priorities
- * according to sctp_state_prio_map[] e.g., if no such
+ * according to sctp_trans_score() e.g., if no such
* transport with state SCTP_ACTIVE exists, round-robin through
* SCTP_UNKNOWN, etc. You get the picture.
*/
-static const u8 sctp_trans_state_to_prio_map[] = {
- [SCTP_ACTIVE] = 3, /* best case */
- [SCTP_UNKNOWN] = 2,
- [SCTP_PF] = 1,
- [SCTP_INACTIVE] = 0, /* worst case */
-};
-
static u8 sctp_trans_score(const struct sctp_transport *trans)
{
- return sctp_trans_state_to_prio_map[trans->state];
+ switch (trans->state) {
+ case SCTP_ACTIVE:
+ return 3; /* best case */
+ case SCTP_UNKNOWN:
+ return 2;
+ case SCTP_PF:
+ return 1;
+ default: /* case SCTP_INACTIVE */
+ return 0; /* worst case */
+ }
}
static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1,
@@ -1588,7 +1590,7 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
/* Set an association id for a given association */
int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp)
{
- bool preload = !!(gfp & __GFP_WAIT);
+ bool preload = gfpflags_allow_blocking(gfp);
int ret;
/* If the id is already assigned, keep it. */
diff --git a/kernel/net/sctp/auth.c b/kernel/net/sctp/auth.c
index 4f15b7d73..1543e39f4 100644
--- a/kernel/net/sctp/auth.c
+++ b/kernel/net/sctp/auth.c
@@ -809,8 +809,8 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
if (!has_sha1)
return -EINVAL;
- memcpy(ep->auth_hmacs_list->hmac_ids, &hmacs->shmac_idents[0],
- hmacs->shmac_num_idents * sizeof(__u16));
+ for (i = 0; i < hmacs->shmac_num_idents; i++)
+ ep->auth_hmacs_list->hmac_ids[i] = htons(hmacs->shmac_idents[i]);
ep->auth_hmacs_list->param_hdr.length = htons(sizeof(sctp_paramhdr_t) +
hmacs->shmac_num_idents * sizeof(__u16));
return 0;
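
Note on the auth.c hunk above: each requested HMAC identifier is now stored with htons() instead of memcpy()ing the raw host-order array, because hmac_ids[] sits inside a wire-format parameter and must be big-endian regardless of host byte order (the matching getsockopt path later in this series converts back with ntohs()). A tiny standalone illustration of the difference on a little-endian host:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

int main(void)
{
	uint16_t host_id = 1;			/* e.g. the SHA-1 HMAC identifier */
	uint16_t wire_id = htons(host_id);
	unsigned char raw[2], conv[2];

	memcpy(raw, &host_id, 2);		/* old behaviour: bytes as stored on this host */
	memcpy(conv, &wire_id, 2);		/* new behaviour: big-endian, as the peer expects */

	printf("memcpy of host order: %02x %02x\n", raw[0], raw[1]);	/* 01 00 on little-endian */
	printf("htons() per entry:    %02x %02x\n", conv[0], conv[1]);	/* always 00 01 */
	return 0;
}
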
diff --git a/kernel/net/sctp/ipv6.c b/kernel/net/sctp/ipv6.c
index 0e4198ee2..ec529121f 100644
--- a/kernel/net/sctp/ipv6.c
+++ b/kernel/net/sctp/ipv6.c
@@ -209,6 +209,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
struct sock *sk = skb->sk;
struct ipv6_pinfo *np = inet6_sk(sk);
struct flowi6 *fl6 = &transport->fl.u.ip6;
+ int res;
pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
skb->len, &fl6->saddr, &fl6->daddr);
@@ -220,7 +221,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
- return ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
+ rcu_read_lock();
+ res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass);
+ rcu_read_unlock();
+ return res;
}
/* Returns the dst cache entry for the given source and destination ip
@@ -262,7 +266,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
pr_debug("src=%pI6 - ", &fl6->saddr);
}
- final_p = fl6_update_dst(fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
+
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
if (!asoc || saddr)
goto out;
@@ -316,14 +323,13 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
}
}
}
- rcu_read_unlock();
-
if (baddr) {
fl6->saddr = baddr->v6.sin6_addr;
fl6->fl6_sport = baddr->v6.sin6_port;
- final_p = fl6_update_dst(fl6, np->opt, &final);
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
}
+ rcu_read_unlock();
out:
if (!IS_ERR_OR_NULL(dst)) {
@@ -331,8 +337,9 @@ out:
rt = (struct rt6_info *)dst;
t->dst = dst;
- t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- pr_debug("rt6_dst:%pI6 rt6_src:%pI6\n", &rt->rt6i_dst.addr,
+ t->dst_cookie = rt6_get_cookie(rt);
+ pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n",
+ &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
&fl6->saddr);
} else {
t->dst = NULL;
@@ -634,8 +641,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
struct sock *newsk;
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct sctp6_sock *newsctp6sk;
+ struct ipv6_txoptions *opt;
- newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot);
+ newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0);
if (!newsk)
goto out;
@@ -653,6 +661,13 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+ rcu_read_lock();
+ opt = rcu_dereference(np->opt);
+ if (opt)
+ opt = ipv6_dup_options(newsk, opt);
+ RCU_INIT_POINTER(newnp->opt, opt);
+ rcu_read_unlock();
+
/* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname()
* and getpeername().
*/
diff --git a/kernel/net/sctp/outqueue.c b/kernel/net/sctp/outqueue.c
index 7e8f0a117..c0380cfb1 100644
--- a/kernel/net/sctp/outqueue.c
+++ b/kernel/net/sctp/outqueue.c
@@ -324,6 +324,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
"illegal chunk");
+ sctp_chunk_hold(chunk);
sctp_outq_tail_data(q, chunk);
if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
@@ -1251,6 +1252,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
*/
sack_a_rwnd = ntohl(sack->a_rwnd);
+ asoc->peer.zero_window_announced = !sack_a_rwnd;
outstanding = q->outstanding_bytes;
if (outstanding < sack_a_rwnd)
diff --git a/kernel/net/sctp/protocol.c b/kernel/net/sctp/protocol.c
index e13c3c3ea..8b4ff3156 100644
--- a/kernel/net/sctp/protocol.c
+++ b/kernel/net/sctp/protocol.c
@@ -60,6 +60,8 @@
#include <net/inet_common.h>
#include <net/inet_ecn.h>
+#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024)
+
/* Global data structures. */
struct sctp_globals sctp_globals __read_mostly;
@@ -487,23 +489,43 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
*/
rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ struct net_device *odev;
+
if (!laddr->valid)
continue;
- if ((laddr->state == SCTP_ADDR_SRC) &&
- (AF_INET == laddr->a.sa.sa_family)) {
- fl4->fl4_sport = laddr->a.v4.sin_port;
- flowi4_update_output(fl4,
- asoc->base.sk->sk_bound_dev_if,
- RT_CONN_FLAGS(asoc->base.sk),
- daddr->v4.sin_addr.s_addr,
- laddr->a.v4.sin_addr.s_addr);
-
- rt = ip_route_output_key(sock_net(sk), fl4);
- if (!IS_ERR(rt)) {
- dst = &rt->dst;
- goto out_unlock;
- }
+ if (laddr->state != SCTP_ADDR_SRC ||
+ AF_INET != laddr->a.sa.sa_family)
+ continue;
+
+ fl4->fl4_sport = laddr->a.v4.sin_port;
+ flowi4_update_output(fl4,
+ asoc->base.sk->sk_bound_dev_if,
+ RT_CONN_FLAGS(asoc->base.sk),
+ daddr->v4.sin_addr.s_addr,
+ laddr->a.v4.sin_addr.s_addr);
+
+ rt = ip_route_output_key(sock_net(sk), fl4);
+ if (IS_ERR(rt))
+ continue;
+
+ if (!dst)
+ dst = &rt->dst;
+
+ /* Ensure the src address belongs to the output
+ * interface.
+ */
+ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
+ false);
+ if (!odev || odev->ifindex != fl4->flowi4_oif) {
+ if (&rt->dst != dst)
+ dst_release(&rt->dst);
+ continue;
}
+
+ if (dst != &rt->dst)
+ dst_release(dst);
+ dst = &rt->dst;
+ break;
}
out_unlock:
@@ -550,7 +572,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
struct sctp_association *asoc)
{
struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL,
- sk->sk_prot);
+ sk->sk_prot, 0);
struct inet_sock *newinet;
if (!newsk)
@@ -1332,6 +1354,8 @@ static __init int sctp_init(void)
unsigned long limit;
int max_share;
int order;
+ int num_entries;
+ int max_entry_order;
sock_skb_cb_check_size(sizeof(struct sctp_ulpevent));
@@ -1384,14 +1408,24 @@ static __init int sctp_init(void)
/* Size and allocate the association hash table.
* The methodology is similar to that of the tcp hash tables.
+ * Though not identical. Start by getting a goal size
*/
if (totalram_pages >= (128 * 1024))
goal = totalram_pages >> (22 - PAGE_SHIFT);
else
goal = totalram_pages >> (24 - PAGE_SHIFT);
- for (order = 0; (1UL << order) < goal; order++)
- ;
+ /* Then compute the page order for said goal */
+ order = get_order(goal);
+
+ /* Now compute the required page order for the maximum sized table we
+ * want to create
+ */
+ max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES *
+ sizeof(struct sctp_bind_hashbucket));
+
+ /* Limit the page order by that maximum hash table size */
+ order = min(order, max_entry_order);
do {
sctp_assoc_hashsize = (1UL << order) * PAGE_SIZE /
@@ -1425,20 +1459,35 @@ static __init int sctp_init(void)
INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain);
}
- /* Allocate and initialize the SCTP port hash table. */
+ /* Allocate and initialize the SCTP port hash table.
+ * Note that order is initialized to start at the max sized
+ * table we want to support. If we can't get that many pages
+ * reduce the order and try again
+ */
do {
- sctp_port_hashsize = (1UL << order) * PAGE_SIZE /
- sizeof(struct sctp_bind_hashbucket);
- if ((sctp_port_hashsize > (64 * 1024)) && order > 0)
- continue;
sctp_port_hashtable = (struct sctp_bind_hashbucket *)
__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order);
} while (!sctp_port_hashtable && --order > 0);
+
if (!sctp_port_hashtable) {
pr_err("Failed bind hash alloc\n");
status = -ENOMEM;
goto err_bhash_alloc;
}
+
+ /* Now compute the number of entries that will fit in the
+ * port hash space we allocated
+ */
+ num_entries = (1UL << order) * PAGE_SIZE /
+ sizeof(struct sctp_bind_hashbucket);
+
+ /* And finish by rounding it down to the nearest power of two
+ * this wastes some memory of course, but it's needed because
+ * the hash function operates based on the assumption that
+ * the number of entries is a power of two
+ */
+ sctp_port_hashsize = rounddown_pow_of_two(num_entries);
+
for (i = 0; i < sctp_port_hashsize; i++) {
spin_lock_init(&sctp_port_hashtable[i].lock);
INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
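
Note on the protocol.c hunks above: the port hash sizing now starts from a goal order, caps it at the order needed for MAX_SCTP_PORT_HASH_ENTRIES buckets, retries at smaller orders if the allocation fails, and finally rounds the bucket count that actually fits down to a power of two so the hash can mask on the table size. A rough worked example assuming a 4 KiB page and a hypothetical bucket size (sizeof(struct sctp_bind_hashbucket) is not spelled out in this diff):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long bucket_sz = 24;		/* hypothetical sizeof(struct sctp_bind_hashbucket) */
	unsigned long order = 5;		/* suppose the allocation ended up at order 5 */

	unsigned long bytes = (1UL << order) * page_size;	/* 131072 bytes */
	unsigned long num_entries = bytes / bucket_sz;		/* 5461 buckets fit */

	/* rounddown_pow_of_two(): keep only the highest set bit */
	unsigned long hashsize = num_entries;
	while (hashsize & (hashsize - 1))
		hashsize &= hashsize - 1;

	printf("order %lu -> %lu entries fit -> hashsize %lu\n",
	       order, num_entries, hashsize);			/* -> 4096 */
	return 0;
}
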
diff --git a/kernel/net/sctp/sm_make_chunk.c b/kernel/net/sctp/sm_make_chunk.c
index 06320c8c1..5d6a03fad 100644
--- a/kernel/net/sctp/sm_make_chunk.c
+++ b/kernel/net/sctp/sm_make_chunk.c
@@ -1652,7 +1652,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
/* Set an expiration time for the cookie. */
cookie->c.expiration = ktime_add(asoc->cookie_life,
- ktime_get());
+ ktime_get_real());
/* Copy the peer's init packet. */
memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr,
@@ -1780,7 +1780,7 @@ no_hmac:
if (sock_flag(ep->base.sk, SOCK_TIMESTAMP))
kt = skb_get_ktime(skb);
else
- kt = ktime_get();
+ kt = ktime_get_real();
if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
/*
@@ -2494,7 +2494,7 @@ static int sctp_process_param(struct sctp_association *asoc,
__u16 sat;
int retval = 1;
sctp_scope_t scope;
- time_t stale;
+ u32 stale;
struct sctp_af *af;
union sctp_addr_param *addr_param;
struct sctp_transport *t;
@@ -3090,8 +3090,19 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
sctp_assoc_set_primary(asoc, asconf->transport);
sctp_assoc_del_nonprimary_peers(asoc,
asconf->transport);
- } else
- sctp_assoc_del_peer(asoc, &addr);
+ return SCTP_ERROR_NO_ERROR;
+ }
+
+ /* If the address is not part of the association, the
+ * ASCONF-ACK with Error Cause Indication Parameter
+ * which including cause of Unresolvable Address should
+ * be sent.
+ */
+ peer = sctp_assoc_lookup_paddr(asoc, &addr);
+ if (!peer)
+ return SCTP_ERROR_DNS_FAILED;
+
+ sctp_assoc_rm_peer(asoc, peer);
break;
case SCTP_PARAM_SET_PRIMARY:
/* ADDIP Section 4.2.4
@@ -3132,11 +3143,18 @@ bool sctp_verify_asconf(const struct sctp_association *asoc,
case SCTP_PARAM_IPV4_ADDRESS:
if (length != sizeof(sctp_ipv4addr_param_t))
return false;
+ /* ensure there is only one addr param and it's in the
+ * beginning of addip_hdr params, or we reject it.
+ */
+ if (param.v != addip->addip_hdr.params)
+ return false;
addr_param_seen = true;
break;
case SCTP_PARAM_IPV6_ADDRESS:
if (length != sizeof(sctp_ipv6addr_param_t))
return false;
+ if (param.v != addip->addip_hdr.params)
+ return false;
addr_param_seen = true;
break;
case SCTP_PARAM_ADD_IP:
diff --git a/kernel/net/sctp/sm_sideeffect.c b/kernel/net/sctp/sm_sideeffect.c
index fef2acdf4..6098d4c42 100644
--- a/kernel/net/sctp/sm_sideeffect.c
+++ b/kernel/net/sctp/sm_sideeffect.c
@@ -244,12 +244,13 @@ void sctp_generate_t3_rtx_event(unsigned long peer)
int error;
struct sctp_transport *transport = (struct sctp_transport *) peer;
struct sctp_association *asoc = transport->asoc;
- struct net *net = sock_net(asoc->base.sk);
+ struct sock *sk = asoc->base.sk;
+ struct net *net = sock_net(sk);
/* Check whether a task is in the sock. */
- bh_lock_sock(asoc->base.sk);
- if (sock_owned_by_user(asoc->base.sk)) {
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
pr_debug("%s: sock is busy\n", __func__);
/* Try again later. */
@@ -272,10 +273,10 @@ void sctp_generate_t3_rtx_event(unsigned long peer)
transport, GFP_ATOMIC);
if (error)
- asoc->base.sk->sk_err = -error;
+ sk->sk_err = -error;
out_unlock:
- bh_unlock_sock(asoc->base.sk);
+ bh_unlock_sock(sk);
sctp_transport_put(transport);
}
@@ -285,11 +286,12 @@ out_unlock:
static void sctp_generate_timeout_event(struct sctp_association *asoc,
sctp_event_timeout_t timeout_type)
{
- struct net *net = sock_net(asoc->base.sk);
+ struct sock *sk = asoc->base.sk;
+ struct net *net = sock_net(sk);
int error = 0;
- bh_lock_sock(asoc->base.sk);
- if (sock_owned_by_user(asoc->base.sk)) {
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
pr_debug("%s: sock is busy: timer %d\n", __func__,
timeout_type);
@@ -312,10 +314,10 @@ static void sctp_generate_timeout_event(struct sctp_association *asoc,
(void *)timeout_type, GFP_ATOMIC);
if (error)
- asoc->base.sk->sk_err = -error;
+ sk->sk_err = -error;
out_unlock:
- bh_unlock_sock(asoc->base.sk);
+ bh_unlock_sock(sk);
sctp_association_put(asoc);
}
@@ -365,10 +367,11 @@ void sctp_generate_heartbeat_event(unsigned long data)
int error = 0;
struct sctp_transport *transport = (struct sctp_transport *) data;
struct sctp_association *asoc = transport->asoc;
- struct net *net = sock_net(asoc->base.sk);
+ struct sock *sk = asoc->base.sk;
+ struct net *net = sock_net(sk);
- bh_lock_sock(asoc->base.sk);
- if (sock_owned_by_user(asoc->base.sk)) {
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
pr_debug("%s: sock is busy\n", __func__);
/* Try again later. */
@@ -388,11 +391,11 @@ void sctp_generate_heartbeat_event(unsigned long data)
asoc->state, asoc->ep, asoc,
transport, GFP_ATOMIC);
- if (error)
- asoc->base.sk->sk_err = -error;
+ if (error)
+ sk->sk_err = -error;
out_unlock:
- bh_unlock_sock(asoc->base.sk);
+ bh_unlock_sock(sk);
sctp_transport_put(transport);
}
@@ -403,10 +406,11 @@ void sctp_generate_proto_unreach_event(unsigned long data)
{
struct sctp_transport *transport = (struct sctp_transport *) data;
struct sctp_association *asoc = transport->asoc;
- struct net *net = sock_net(asoc->base.sk);
+ struct sock *sk = asoc->base.sk;
+ struct net *net = sock_net(sk);
- bh_lock_sock(asoc->base.sk);
- if (sock_owned_by_user(asoc->base.sk)) {
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
pr_debug("%s: sock is busy\n", __func__);
/* Try again later. */
@@ -427,7 +431,7 @@ void sctp_generate_proto_unreach_event(unsigned long data)
asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC);
out_unlock:
- bh_unlock_sock(asoc->base.sk);
+ bh_unlock_sock(sk);
sctp_association_put(asoc);
}
@@ -702,7 +706,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
* outstanding data and rely on the retransmission limit be reached
* to shutdown the association.
*/
- if (t->asoc->state != SCTP_STATE_SHUTDOWN_PENDING)
+ if (t->asoc->state < SCTP_STATE_SHUTDOWN_PENDING)
t->asoc->overall_error_count = 0;
/* Clear the hb_sent flag to signal that we had a good
@@ -954,7 +958,7 @@ static void sctp_cmd_del_non_primary(struct sctp_association *asoc)
t = list_entry(pos, struct sctp_transport, transports);
if (!sctp_cmp_addr_exact(&t->ipaddr,
&asoc->peer.primary_addr)) {
- sctp_assoc_del_peer(asoc, &t->ipaddr);
+ sctp_assoc_rm_peer(asoc, t);
}
}
}
diff --git a/kernel/net/sctp/sm_statefuns.c b/kernel/net/sctp/sm_statefuns.c
index 3ee27b770..22c2bf367 100644
--- a/kernel/net/sctp/sm_statefuns.c
+++ b/kernel/net/sctp/sm_statefuns.c
@@ -853,7 +853,7 @@ nomem:
/*
* Respond to a normal COOKIE ACK chunk.
- * We are the side that is being asked for an association.
+ * We are the side that is asking for an association.
*
* RFC 2960 5.1 Normal Establishment of an Association
*
@@ -2306,7 +2306,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net,
sctp_cmd_seq_t *commands)
{
struct sctp_chunk *chunk = arg;
- time_t stale;
+ u32 stale;
sctp_cookie_preserve_param_t bht;
sctp_errhdr_t *err;
struct sctp_chunk *reply;
@@ -4829,7 +4829,8 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort(
retval = SCTP_DISPOSITION_CONSUME;
- sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+ if (abort)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
/* Even if we can't send the ABORT due to low memory delete the
* TCB. This is a departure from our typical NOMEM handling.
@@ -4966,7 +4967,8 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort(
SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
retval = SCTP_DISPOSITION_CONSUME;
- sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+ if (abort)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
SCTP_STATE(SCTP_STATE_CLOSED));
@@ -5412,7 +5414,8 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net,
SCTP_INC_STATS(net, SCTP_MIB_T3_RTX_EXPIREDS);
if (asoc->overall_error_count >= asoc->max_retrans) {
- if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) {
+ if (asoc->peer.zero_window_announced &&
+ asoc->state == SCTP_STATE_SHUTDOWN_PENDING) {
/*
* We are here likely because the receiver had its rwnd
* closed for a while and we have not been able to
diff --git a/kernel/net/sctp/socket.c b/kernel/net/sctp/socket.c
index 5f6c4e613..be1489fc3 100644
--- a/kernel/net/sctp/socket.c
+++ b/kernel/net/sctp/socket.c
@@ -972,7 +972,7 @@ static int sctp_setsockopt_bindx(struct sock *sk,
return -EFAULT;
/* Alloc space for the address array in kernel memory. */
- kaddrs = kmalloc(addrs_size, GFP_KERNEL);
+ kaddrs = kmalloc(addrs_size, GFP_USER | __GFP_NOWARN);
if (unlikely(!kaddrs))
return -ENOMEM;
@@ -1301,8 +1301,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
int addrs_size,
sctp_assoc_t *assoc_id)
{
- int err = 0;
struct sockaddr *kaddrs;
+ gfp_t gfp = GFP_KERNEL;
+ int err = 0;
pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
__func__, sk, addrs, addrs_size);
@@ -1315,7 +1316,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
return -EFAULT;
/* Alloc space for the address array in kernel memory. */
- kaddrs = kmalloc(addrs_size, GFP_KERNEL);
+ if (sk->sk_socket->file)
+ gfp = GFP_USER | __GFP_NOWARN;
+ kaddrs = kmalloc(addrs_size, gfp);
if (unlikely(!kaddrs))
return -ENOMEM;
@@ -1513,8 +1516,7 @@ static void sctp_close(struct sock *sk, long timeout)
struct sctp_chunk *chunk;
chunk = sctp_make_abort_user(asoc, NULL, 0);
- if (chunk)
- sctp_primitive_ABORT(net, asoc, chunk);
+ sctp_primitive_ABORT(net, asoc, chunk);
} else
sctp_primitive_SHUTDOWN(net, asoc, NULL);
}
@@ -1952,8 +1954,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
/* Now send the (possibly) fragmented message. */
list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
- sctp_chunk_hold(chunk);
-
/* Do accounting for the write space. */
sctp_set_owner_w(chunk);
@@ -1966,15 +1966,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
* breaks.
*/
err = sctp_primitive_SEND(net, asoc, datamsg);
+ sctp_datamsg_put(datamsg);
/* Did the lower layer accept the chunk? */
- if (err) {
- sctp_datamsg_free(datamsg);
+ if (err)
goto out_free;
- }
pr_debug("%s: we sent primitively\n", __func__);
- sctp_datamsg_put(datamsg);
err = msg_len;
if (unlikely(wait_connect)) {
@@ -2121,12 +2119,6 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (sp->subscribe.sctp_data_io_event)
sctp_ulpevent_read_sndrcvinfo(event, msg);
-#if 0
- /* FIXME: we should be calling IP/IPv6 layers. */
- if (sk->sk_protinfo.af_inet.cmsg_flags)
- ip_cmsg_recv(msg, skb);
-#endif
-
err = copied;
/* If skb's length exceeds the user's buffer, update the skb and
@@ -2206,12 +2198,6 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen))
return -EFAULT;
- if (sctp_sk(sk)->subscribe.sctp_data_io_event)
- pr_warn_ratelimited(DEPRECATED "%s (pid %d) "
- "Requested SCTP_SNDRCVINFO event.\n"
- "Use SCTP_RCVINFO through SCTP_RECVRCVINFO option instead.\n",
- current->comm, task_pid_nr(current));
-
/* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
* if there is no data to be sent or retransmit, the stack will
* immediately send up this notification.
@@ -4487,7 +4473,7 @@ static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval
}
newfile = sock_alloc_file(newsock, 0, NULL);
- if (unlikely(IS_ERR(newfile))) {
+ if (IS_ERR(newfile)) {
put_unused_fd(retval);
sock_release(newsock);
return PTR_ERR(newfile);
@@ -4940,7 +4926,7 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
to = optval + offsetof(struct sctp_getaddrs, addrs);
space_left = len - offsetof(struct sctp_getaddrs, addrs);
- addrs = kmalloc(space_left, GFP_KERNEL);
+ addrs = kmalloc(space_left, GFP_USER | __GFP_NOWARN);
if (!addrs)
return -ENOMEM;
@@ -5556,6 +5542,7 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
struct sctp_hmac_algo_param *hmacs;
__u16 data_len = 0;
u32 num_idents;
+ int i;
if (!ep->auth_enable)
return -EACCES;
@@ -5573,8 +5560,12 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
return -EFAULT;
if (put_user(num_idents, &p->shmac_num_idents))
return -EFAULT;
- if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len))
- return -EFAULT;
+ for (i = 0; i < num_idents; i++) {
+ __u16 hmacid = ntohs(hmacs->hmac_ids[i]);
+
+ if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16)))
+ return -EFAULT;
+ }
return 0;
}
@@ -5789,7 +5780,7 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num;
- ids = kmalloc(len, GFP_KERNEL);
+ ids = kmalloc(len, GFP_USER | __GFP_NOWARN);
if (unlikely(!ids))
return -ENOMEM;
@@ -6470,7 +6461,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sctp_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else {
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
/*
* Since the socket is not locked, the buffer
* might be made available after the writeable check and
@@ -6654,6 +6645,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
if (cmsgs->srinfo->sinfo_flags &
~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+ SCTP_SACK_IMMEDIATELY |
SCTP_ABORT | SCTP_EOF))
return -EINVAL;
break;
@@ -6677,6 +6669,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
if (cmsgs->sinfo->snd_flags &
~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+ SCTP_SACK_IMMEDIATELY |
SCTP_ABORT | SCTP_EOF))
return -EINVAL;
break;
@@ -6813,26 +6806,30 @@ no_packet:
static void __sctp_write_space(struct sctp_association *asoc)
{
struct sock *sk = asoc->base.sk;
- struct socket *sock = sk->sk_socket;
- if ((sctp_wspace(asoc) > 0) && sock) {
- if (waitqueue_active(&asoc->wait))
- wake_up_interruptible(&asoc->wait);
+ if (sctp_wspace(asoc) <= 0)
+ return;
- if (sctp_writeable(sk)) {
- wait_queue_head_t *wq = sk_sleep(sk);
+ if (waitqueue_active(&asoc->wait))
+ wake_up_interruptible(&asoc->wait);
- if (wq && waitqueue_active(wq))
- wake_up_interruptible(wq);
+ if (sctp_writeable(sk)) {
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq) {
+ if (waitqueue_active(&wq->wait))
+ wake_up_interruptible(&wq->wait);
/* Note that we try to include the Async I/O support
* here by modeling from the current TCP/UDP code.
* We have not tested with it yet.
*/
if (!(sk->sk_shutdown & SEND_SHUTDOWN))
- sock_wake_async(sock,
- SOCK_WAKE_SPACE, POLL_OUT);
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
}
+ rcu_read_unlock();
}
}
@@ -7175,6 +7172,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newsk->sk_type = sk->sk_type;
newsk->sk_bound_dev_if = sk->sk_bound_dev_if;
newsk->sk_flags = sk->sk_flags;
+ newsk->sk_tsflags = sk->sk_tsflags;
newsk->sk_no_check_tx = sk->sk_no_check_tx;
newsk->sk_no_check_rx = sk->sk_no_check_rx;
newsk->sk_reuse = sk->sk_reuse;
@@ -7207,6 +7205,11 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newinet->mc_ttl = 1;
newinet->mc_index = 0;
newinet->mc_list = NULL;
+
+ if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
+
+ security_sk_clone(sk, newsk);
}
static inline void sctp_copy_descendant(struct sock *sk_to,
@@ -7387,6 +7390,13 @@ struct proto sctp_prot = {
#if IS_ENABLED(CONFIG_IPV6)
+#include <net/transp_v6.h>
+static void sctp_v6_destroy_sock(struct sock *sk)
+{
+ sctp_destroy_sock(sk);
+ inet6_destroy_sock(sk);
+}
+
struct proto sctpv6_prot = {
.name = "SCTPv6",
.owner = THIS_MODULE,
@@ -7396,7 +7406,7 @@ struct proto sctpv6_prot = {
.accept = sctp_accept,
.ioctl = sctp_ioctl,
.init = sctp_init_sock,
- .destroy = sctp_destroy_sock,
+ .destroy = sctp_v6_destroy_sock,
.shutdown = sctp_shutdown,
.setsockopt = sctp_setsockopt,
.getsockopt = sctp_getsockopt,
diff --git a/kernel/net/sctp/sysctl.c b/kernel/net/sctp/sysctl.c
index 26d50c565..3e0fc5127 100644
--- a/kernel/net/sctp/sysctl.c
+++ b/kernel/net/sctp/sysctl.c
@@ -320,7 +320,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write,
struct ctl_table tbl;
bool changed = false;
char *none = "none";
- char tmp[8];
+ char tmp[8] = {0};
int ret;
memset(&tbl, 0, sizeof(struct ctl_table));
diff --git a/kernel/net/sctp/transport.c b/kernel/net/sctp/transport.c
index a0a431824..aab9e3f29 100644
--- a/kernel/net/sctp/transport.c
+++ b/kernel/net/sctp/transport.c
@@ -331,7 +331,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt)
* 1/8, rto_alpha would be expressed as 3.
*/
tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta)
- + (((__u32)abs64((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta);
+ + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta);
tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha)
+ (rtt >> net->sctp.rto_alpha);
} else {
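
Note on the transport.c hunk above: abs64() is swapped for abs(), presumably because abs() in this kernel handles 64-bit arguments, making the helper redundant; the arithmetic itself stays the usual shift-based RFC 6298 smoothing (rto_beta = 2 meaning 1/4, rto_alpha = 3 meaning 1/8). A standalone sketch of one update step with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t srtt = 100000, rttvar = 20000;	/* prior state, arbitrary units */
	uint32_t rtt = 120000;			/* new measurement */
	uint32_t rto_alpha = 3, rto_beta = 2;	/* defaults: 1/8 and 1/4 expressed as shifts */

	uint64_t diff = srtt > rtt ? srtt - rtt : rtt - srtt;	/* |srtt - rtt| */

	rttvar = rttvar - (rttvar >> rto_beta) + (uint32_t)(diff >> rto_beta);
	srtt   = srtt   - (srtt   >> rto_alpha) + (rtt >> rto_alpha);

	printf("srtt=%u rttvar=%u\n", srtt, rttvar);	/* srtt=102500 rttvar=20000 */
	return 0;
}
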
diff --git a/kernel/net/socket.c b/kernel/net/socket.c
index 884e32997..d730ef9df 100644
--- a/kernel/net/socket.c
+++ b/kernel/net/socket.c
@@ -257,6 +257,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
}
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
+ wq->flags = 0;
RCU_INIT_POINTER(ei->socket.wq, wq);
ei->socket.state = SS_UNCONNECTED;
@@ -373,7 +374,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
&socket_file_ops);
- if (unlikely(IS_ERR(file))) {
+ if (IS_ERR(file)) {
/* drop dentry, keep inode */
ihold(d_inode(path.dentry));
path_put(&path);
@@ -576,9 +577,6 @@ void sock_release(struct socket *sock)
if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);
- if (test_bit(SOCK_EXTERNALLY_ALLOCATED, &sock->flags))
- return;
-
this_cpu_sub(sockets_in_use, 1);
if (!sock->file) {
iput(SOCK_INODE(sock));
@@ -1059,27 +1057,20 @@ static int sock_fasync(int fd, struct file *filp, int on)
return 0;
}
-/* This function may be called only under socket lock or callback_lock or rcu_lock */
+/* This function may be called only under rcu_lock */
-int sock_wake_async(struct socket *sock, int how, int band)
+int sock_wake_async(struct socket_wq *wq, int how, int band)
{
- struct socket_wq *wq;
-
- if (!sock)
+ if (!wq || !wq->fasync_list)
return -1;
- rcu_read_lock();
- wq = rcu_dereference(sock->wq);
- if (!wq || !wq->fasync_list) {
- rcu_read_unlock();
- return -1;
- }
+
switch (how) {
case SOCK_WAKE_WAITD:
- if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
+ if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
break;
goto call_kill;
case SOCK_WAKE_SPACE:
- if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
+ if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
break;
/* fall through */
case SOCK_WAKE_IO:
@@ -1089,7 +1080,7 @@ call_kill:
case SOCK_WAKE_URG:
kill_fasync(&wq->fasync_list, SIGURG, band);
}
- rcu_read_unlock();
+
return 0;
}
EXPORT_SYMBOL(sock_wake_async);
@@ -1213,9 +1204,9 @@ int sock_create(int family, int type, int protocol, struct socket **res)
}
EXPORT_SYMBOL(sock_create);
-int sock_create_kern(int family, int type, int protocol, struct socket **res)
+int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res)
{
- return __sock_create(&init_net, family, type, protocol, res, 1);
+ return __sock_create(net, family, type, protocol, res, 1);
}
EXPORT_SYMBOL(sock_create_kern);
@@ -1306,7 +1297,7 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
}
newfile1 = sock_alloc_file(sock1, flags, NULL);
- if (unlikely(IS_ERR(newfile1))) {
+ if (IS_ERR(newfile1)) {
err = PTR_ERR(newfile1);
goto out_put_unused_both;
}
@@ -1470,7 +1461,7 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
goto out_put;
}
newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
- if (unlikely(IS_ERR(newfile))) {
+ if (IS_ERR(newfile)) {
err = PTR_ERR(newfile);
put_unused_fd(newfd);
sock_release(newsock);
@@ -1705,6 +1696,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
/* We assume all kernel code knows the size of sockaddr_storage */
msg.msg_namelen = 0;
+ msg.msg_iocb = NULL;
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags);
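
Note on the socket.c hunks above: two exported interfaces change shape here. sock_wake_async() now takes the struct socket_wq directly, with callers (such as the SCTP write-space hunk earlier) resolving sk->sk_wq under rcu_read_lock() themselves, and sock_create_kern() gains an explicit network-namespace argument instead of hard-coding init_net. A minimal, illustrative sketch of an in-kernel caller after this change (error handling trimmed):

	struct socket *sock;
	int err;

	err = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
	if (err < 0)
		return err;
	/* ... use sock ..., then: */
	sock_release(sock);
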
diff --git a/kernel/net/sunrpc/Kconfig b/kernel/net/sunrpc/Kconfig
index 9068e72aa..04ce2c0b6 100644
--- a/kernel/net/sunrpc/Kconfig
+++ b/kernel/net/sunrpc/Kconfig
@@ -48,28 +48,16 @@ config SUNRPC_DEBUG
If unsure, say Y.
-config SUNRPC_XPRT_RDMA_CLIENT
- tristate "RPC over RDMA Client Support"
+config SUNRPC_XPRT_RDMA
+ tristate "RPC-over-RDMA transport"
depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
default SUNRPC && INFINIBAND
help
- This option allows the NFS client to support an RDMA-enabled
- transport.
+ This option allows the NFS client and server to use RDMA
+ transports (InfiniBand, iWARP, or RoCE).
- To compile RPC client RDMA transport support as a module,
- choose M here: the module will be called xprtrdma.
+ To compile this support as a module, choose M. The module
+ will be called rpcrdma.ko.
- If unsure, say N.
-
-config SUNRPC_XPRT_RDMA_SERVER
- tristate "RPC over RDMA Server Support"
- depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
- default SUNRPC && INFINIBAND
- help
- This option allows the NFS server to support an RDMA-enabled
- transport.
-
- To compile RPC server RDMA transport support as a module,
- choose M here: the module will be called svcrdma.
-
- If unsure, say N.
+ If unsure, or you know there is no RDMA capability on your
+ hardware platform, say N.
diff --git a/kernel/net/sunrpc/Makefile b/kernel/net/sunrpc/Makefile
index 15e6f6c23..b512fbd9d 100644
--- a/kernel/net/sunrpc/Makefile
+++ b/kernel/net/sunrpc/Makefile
@@ -5,8 +5,7 @@
obj-$(CONFIG_SUNRPC) += sunrpc.o
obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
-
-obj-y += xprtrdma/
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
auth.o auth_null.o auth_unix.o auth_generic.o \
@@ -15,6 +14,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
sunrpc_syms.o cache.o rpc_pipe.o \
svc_xprt.o
sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
-sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o
+sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o
sunrpc-$(CONFIG_PROC_FS) += stats.o
sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/kernel/net/sunrpc/auth.c b/kernel/net/sunrpc/auth.c
index 47f38be41..02f53674d 100644
--- a/kernel/net/sunrpc/auth.c
+++ b/kernel/net/sunrpc/auth.c
@@ -72,7 +72,7 @@ static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp)
#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int);
-static struct kernel_param_ops param_ops_hashtbl_sz = {
+static const struct kernel_param_ops param_ops_hashtbl_sz = {
.set = param_set_hashtbl_sz,
.get = param_get_hashtbl_sz,
};
diff --git a/kernel/net/sunrpc/auth_gss/auth_gss.c b/kernel/net/sunrpc/auth_gss/auth_gss.c
index dace13d76..799e65b94 100644
--- a/kernel/net/sunrpc/auth_gss/auth_gss.c
+++ b/kernel/net/sunrpc/auth_gss/auth_gss.c
@@ -1411,17 +1411,16 @@ gss_key_timeout(struct rpc_cred *rc)
{
struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
struct gss_cl_ctx *ctx;
- unsigned long now = jiffies;
- unsigned long expire;
+ unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ);
+ int ret = 0;
rcu_read_lock();
ctx = rcu_dereference(gss_cred->gc_ctx);
- if (ctx)
- expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+ if (!ctx || time_after(timeout, ctx->gc_expiry))
+ ret = -EACCES;
rcu_read_unlock();
- if (!ctx || time_after(now, expire))
- return -EACCES;
- return 0;
+
+ return ret;
}
static int
diff --git a/kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c b/kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c
index b5408e8a3..fee3c15a4 100644
--- a/kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/kernel/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -881,9 +881,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
if (err)
goto out_err;
- sg_init_table(sg, 1);
- sg_set_buf(sg, &zeroconstant, 4);
-
+ sg_init_one(sg, &zeroconstant, 4);
err = crypto_hash_digest(&desc, sg, 4, Kseq);
if (err)
goto out_err;
@@ -951,9 +949,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
if (err)
goto out_err;
- sg_init_table(sg, 1);
- sg_set_buf(sg, zeroconstant, 4);
-
+ sg_init_one(sg, zeroconstant, 4);
err = crypto_hash_digest(&desc, sg, 4, Kcrypt);
if (err)
goto out_err;
diff --git a/kernel/net/sunrpc/auth_unix.c b/kernel/net/sunrpc/auth_unix.c
index 4feda2d0a..548240dd1 100644
--- a/kernel/net/sunrpc/auth_unix.c
+++ b/kernel/net/sunrpc/auth_unix.c
@@ -23,7 +23,7 @@ struct unx_cred {
};
#define uc_uid uc_base.cr_uid
-#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2))
+#define UNX_WRITESLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME))
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
diff --git a/kernel/net/sunrpc/backchannel_rqst.c b/kernel/net/sunrpc/backchannel_rqst.c
index 28504dfd3..229956bf8 100644
--- a/kernel/net/sunrpc/backchannel_rqst.c
+++ b/kernel/net/sunrpc/backchannel_rqst.c
@@ -37,16 +37,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
{
- return xprt->bc_alloc_count > 0;
+ return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
}
static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
{
+ atomic_add(n, &xprt->bc_free_slots);
xprt->bc_alloc_count += n;
}
static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
{
+ atomic_sub(n, &xprt->bc_free_slots);
return xprt->bc_alloc_count -= n;
}
@@ -67,6 +69,55 @@ static void xprt_free_allocation(struct rpc_rqst *req)
kfree(req);
}
+static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
+{
+ struct page *page;
+ /* Preallocate one XDR receive buffer */
+ page = alloc_page(gfp_flags);
+ if (page == NULL)
+ return -ENOMEM;
+ buf->head[0].iov_base = page_address(page);
+ buf->head[0].iov_len = PAGE_SIZE;
+ buf->tail[0].iov_base = NULL;
+ buf->tail[0].iov_len = 0;
+ buf->page_len = 0;
+ buf->len = 0;
+ buf->buflen = PAGE_SIZE;
+ return 0;
+}
+
+static
+struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
+{
+ struct rpc_rqst *req;
+
+ /* Pre-allocate one backchannel rpc_rqst */
+ req = kzalloc(sizeof(*req), gfp_flags);
+ if (req == NULL)
+ return NULL;
+
+ req->rq_xprt = xprt;
+ INIT_LIST_HEAD(&req->rq_list);
+ INIT_LIST_HEAD(&req->rq_bc_list);
+
+ /* Preallocate one XDR receive buffer */
+ if (xprt_alloc_xdr_buf(&req->rq_rcv_buf, gfp_flags) < 0) {
+ printk(KERN_ERR "Failed to create bc receive xbuf\n");
+ goto out_free;
+ }
+ req->rq_rcv_buf.len = PAGE_SIZE;
+
+ /* Preallocate one XDR send buffer */
+ if (xprt_alloc_xdr_buf(&req->rq_snd_buf, gfp_flags) < 0) {
+ printk(KERN_ERR "Failed to create bc snd xbuf\n");
+ goto out_free;
+ }
+ return req;
+out_free:
+ xprt_free_allocation(req);
+ return NULL;
+}
+
/*
* Preallocate up to min_reqs structures and related buffers for use
* by the backchannel. This function can be called multiple times
@@ -87,9 +138,15 @@ static void xprt_free_allocation(struct rpc_rqst *req)
*/
int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
{
- struct page *page_rcv = NULL, *page_snd = NULL;
- struct xdr_buf *xbufp = NULL;
- struct rpc_rqst *req, *tmp;
+ if (!xprt->ops->bc_setup)
+ return 0;
+ return xprt->ops->bc_setup(xprt, min_reqs);
+}
+EXPORT_SYMBOL_GPL(xprt_setup_backchannel);
+
+int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
+{
+ struct rpc_rqst *req;
struct list_head tmp_list;
int i;
@@ -106,7 +163,7 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
INIT_LIST_HEAD(&tmp_list);
for (i = 0; i < min_reqs; i++) {
/* Pre-allocate one backchannel rpc_rqst */
- req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+ req = xprt_alloc_bc_req(xprt, GFP_KERNEL);
if (req == NULL) {
printk(KERN_ERR "Failed to create bc rpc_rqst\n");
goto out_free;
@@ -115,41 +172,6 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
/* Add the allocated buffer to the tmp list */
dprintk("RPC: adding req= %p\n", req);
list_add(&req->rq_bc_pa_list, &tmp_list);
-
- req->rq_xprt = xprt;
- INIT_LIST_HEAD(&req->rq_list);
- INIT_LIST_HEAD(&req->rq_bc_list);
-
- /* Preallocate one XDR receive buffer */
- page_rcv = alloc_page(GFP_KERNEL);
- if (page_rcv == NULL) {
- printk(KERN_ERR "Failed to create bc receive xbuf\n");
- goto out_free;
- }
- xbufp = &req->rq_rcv_buf;
- xbufp->head[0].iov_base = page_address(page_rcv);
- xbufp->head[0].iov_len = PAGE_SIZE;
- xbufp->tail[0].iov_base = NULL;
- xbufp->tail[0].iov_len = 0;
- xbufp->page_len = 0;
- xbufp->len = PAGE_SIZE;
- xbufp->buflen = PAGE_SIZE;
-
- /* Preallocate one XDR send buffer */
- page_snd = alloc_page(GFP_KERNEL);
- if (page_snd == NULL) {
- printk(KERN_ERR "Failed to create bc snd xbuf\n");
- goto out_free;
- }
-
- xbufp = &req->rq_snd_buf;
- xbufp->head[0].iov_base = page_address(page_snd);
- xbufp->head[0].iov_len = 0;
- xbufp->tail[0].iov_base = NULL;
- xbufp->tail[0].iov_len = 0;
- xbufp->page_len = 0;
- xbufp->len = 0;
- xbufp->buflen = PAGE_SIZE;
}
/*
@@ -167,7 +189,10 @@ out_free:
/*
* Memory allocation failed, free the temporary list
*/
- list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) {
+ while (!list_empty(&tmp_list)) {
+ req = list_first_entry(&tmp_list,
+ struct rpc_rqst,
+ rq_bc_pa_list);
list_del(&req->rq_bc_pa_list);
xprt_free_allocation(req);
}
@@ -175,7 +200,6 @@ out_free:
dprintk("RPC: setup backchannel transport failed\n");
return -ENOMEM;
}
-EXPORT_SYMBOL_GPL(xprt_setup_backchannel);
/**
* xprt_destroy_backchannel - Destroys the backchannel preallocated structures.
@@ -188,6 +212,13 @@ EXPORT_SYMBOL_GPL(xprt_setup_backchannel);
*/
void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs)
{
+ if (xprt->ops->bc_destroy)
+ xprt->ops->bc_destroy(xprt, max_reqs);
+}
+EXPORT_SYMBOL_GPL(xprt_destroy_backchannel);
+
+void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs)
+{
struct rpc_rqst *req = NULL, *tmp = NULL;
dprintk("RPC: destroy backchannel transport\n");
@@ -210,16 +241,21 @@ out:
dprintk("RPC: backchannel list empty= %s\n",
list_empty(&xprt->bc_pa_list) ? "true" : "false");
}
-EXPORT_SYMBOL_GPL(xprt_destroy_backchannel);
static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
{
struct rpc_rqst *req = NULL;
dprintk("RPC: allocate a backchannel request\n");
- if (list_empty(&xprt->bc_pa_list))
+ if (atomic_read(&xprt->bc_free_slots) <= 0)
goto not_found;
-
+ if (list_empty(&xprt->bc_pa_list)) {
+ req = xprt_alloc_bc_req(xprt, GFP_ATOMIC);
+ if (!req)
+ goto not_found;
+ list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+ xprt->bc_alloc_count++;
+ }
req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
rq_bc_pa_list);
req->rq_reply_bytes_recvd = 0;
@@ -241,15 +277,32 @@ void xprt_free_bc_request(struct rpc_rqst *req)
{
struct rpc_xprt *xprt = req->rq_xprt;
+ xprt->ops->bc_free_rqst(req);
+}
+
+void xprt_free_bc_rqst(struct rpc_rqst *req)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+
dprintk("RPC: free backchannel req=%p\n", req);
req->rq_connect_cookie = xprt->connect_cookie - 1;
smp_mb__before_atomic();
- WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
smp_mb__after_atomic();
- if (!xprt_need_to_requeue(xprt)) {
+ /*
+ * Return it to the list of preallocations so that it
+ * may be reused by a new callback request.
+ */
+ spin_lock_bh(&xprt->bc_pa_lock);
+ if (xprt_need_to_requeue(xprt)) {
+ list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+ xprt->bc_alloc_count++;
+ req = NULL;
+ }
+ spin_unlock_bh(&xprt->bc_pa_lock);
+ if (req != NULL) {
/*
* The last remaining session was destroyed while this
* entry was in use. Free the entry and don't attempt
@@ -260,14 +313,6 @@ void xprt_free_bc_request(struct rpc_rqst *req)
xprt_free_allocation(req);
return;
}
-
- /*
- * Return it to the list of preallocations so that it
- * may be reused by a new callback request.
- */
- spin_lock_bh(&xprt->bc_pa_lock);
- list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
- spin_unlock_bh(&xprt->bc_pa_lock);
}
/*
@@ -311,6 +356,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
spin_lock(&xprt->bc_pa_lock);
list_del(&req->rq_bc_pa_list);
+ xprt_dec_alloc_count(xprt, 1);
spin_unlock(&xprt->bc_pa_lock);
req->rq_private_buf.len = copied;
diff --git a/kernel/net/sunrpc/bc_svc.c b/kernel/net/sunrpc/bc_svc.c
deleted file mode 100644
index 15c7a8a1c..000000000
--- a/kernel/net/sunrpc/bc_svc.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/******************************************************************************
-
-(c) 2007 Network Appliance, Inc. All Rights Reserved.
-(c) 2009 NetApp. All Rights Reserved.
-
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-******************************************************************************/
-
-/*
- * The NFSv4.1 callback service helper routines.
- * They implement the transport level processing required to send the
- * reply over an existing open connection previously established by the client.
- */
-
-#include <linux/module.h>
-
-#include <linux/sunrpc/xprt.h>
-#include <linux/sunrpc/sched.h>
-#include <linux/sunrpc/bc_xprt.h>
-
-#define RPCDBG_FACILITY RPCDBG_SVCDSP
-
-/* Empty callback ops */
-static const struct rpc_call_ops nfs41_callback_ops = {
-};
-
-
-/*
- * Send the callback reply
- */
-int bc_send(struct rpc_rqst *req)
-{
- struct rpc_task *task;
- int ret;
-
- dprintk("RPC: bc_send req= %p\n", req);
- task = rpc_run_bc_task(req, &nfs41_callback_ops);
- if (IS_ERR(task))
- ret = PTR_ERR(task);
- else {
- WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
- ret = task->tk_status;
- rpc_put_task(task);
- }
- dprintk("RPC: bc_send ret= %d\n", ret);
- return ret;
-}
-
diff --git a/kernel/net/sunrpc/cache.c b/kernel/net/sunrpc/cache.c
index 2928afffb..21e203531 100644
--- a/kernel/net/sunrpc/cache.c
+++ b/kernel/net/sunrpc/cache.c
@@ -41,28 +41,30 @@
static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
static void cache_revisit_request(struct cache_head *item);
-static void cache_init(struct cache_head *h)
+static void cache_init(struct cache_head *h, struct cache_detail *detail)
{
time_t now = seconds_since_boot();
- h->next = NULL;
+ INIT_HLIST_NODE(&h->cache_list);
h->flags = 0;
kref_init(&h->ref);
h->expiry_time = now + CACHE_NEW_EXPIRY;
+ if (now <= detail->flush_time)
+ /* ensure it isn't already expired */
+ now = detail->flush_time + 1;
h->last_refresh = now;
}
struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
struct cache_head *key, int hash)
{
- struct cache_head **head, **hp;
- struct cache_head *new = NULL, *freeme = NULL;
+ struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL;
+ struct hlist_head *head;
head = &detail->hash_table[hash];
read_lock(&detail->hash_lock);
- for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
- struct cache_head *tmp = *hp;
+ hlist_for_each_entry(tmp, head, cache_list) {
if (detail->match(tmp, key)) {
if (cache_is_expired(detail, tmp))
/* This entry is expired, we will discard it. */
@@ -82,18 +84,16 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
* we might get lost if we need to
* cache_put it soon.
*/
- cache_init(new);
+ cache_init(new, detail);
detail->init(new, key);
write_lock(&detail->hash_lock);
/* check if entry appeared while we slept */
- for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
- struct cache_head *tmp = *hp;
+ hlist_for_each_entry(tmp, head, cache_list) {
if (detail->match(tmp, key)) {
if (cache_is_expired(detail, tmp)) {
- *hp = tmp->next;
- tmp->next = NULL;
+ hlist_del_init(&tmp->cache_list);
detail->entries --;
freeme = tmp;
break;
@@ -104,8 +104,8 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
return tmp;
}
}
- new->next = *head;
- *head = new;
+
+ hlist_add_head(&new->cache_list, head);
detail->entries++;
cache_get(new);
write_unlock(&detail->hash_lock);
@@ -119,10 +119,15 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
-static void cache_fresh_locked(struct cache_head *head, time_t expiry)
+static void cache_fresh_locked(struct cache_head *head, time_t expiry,
+ struct cache_detail *detail)
{
+ time_t now = seconds_since_boot();
+ if (now <= detail->flush_time)
+ /* ensure it isn't immediately treated as expired */
+ now = detail->flush_time + 1;
head->expiry_time = expiry;
- head->last_refresh = seconds_since_boot();
+ head->last_refresh = now;
smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */
set_bit(CACHE_VALID, &head->flags);
}
@@ -143,7 +148,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
* If 'old' is not VALID, we update it directly,
* otherwise we need to replace it
*/
- struct cache_head **head;
struct cache_head *tmp;
if (!test_bit(CACHE_VALID, &old->flags)) {
@@ -153,7 +157,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
set_bit(CACHE_NEGATIVE, &old->flags);
else
detail->update(old, new);
- cache_fresh_locked(old, new->expiry_time);
+ cache_fresh_locked(old, new->expiry_time, detail);
write_unlock(&detail->hash_lock);
cache_fresh_unlocked(old, detail);
return old;
@@ -166,21 +170,19 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
cache_put(old, detail);
return NULL;
}
- cache_init(tmp);
+ cache_init(tmp, detail);
detail->init(tmp, old);
- head = &detail->hash_table[hash];
write_lock(&detail->hash_lock);
if (test_bit(CACHE_NEGATIVE, &new->flags))
set_bit(CACHE_NEGATIVE, &tmp->flags);
else
detail->update(tmp, new);
- tmp->next = *head;
- *head = tmp;
+ hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]);
detail->entries++;
cache_get(tmp);
- cache_fresh_locked(tmp, new->expiry_time);
- cache_fresh_locked(old, 0);
+ cache_fresh_locked(tmp, new->expiry_time, detail);
+ cache_fresh_locked(old, 0, detail);
write_unlock(&detail->hash_lock);
cache_fresh_unlocked(tmp, detail);
cache_fresh_unlocked(old, detail);
@@ -225,7 +227,8 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
rv = cache_is_valid(h);
if (rv == -EAGAIN) {
set_bit(CACHE_NEGATIVE, &h->flags);
- cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
+ cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY,
+ detail);
rv = -ENOENT;
}
write_unlock(&detail->hash_lock);
@@ -416,28 +419,29 @@ static int cache_clean(void)
/* find a non-empty bucket in the table */
while (current_detail &&
current_index < current_detail->hash_size &&
- current_detail->hash_table[current_index] == NULL)
+ hlist_empty(&current_detail->hash_table[current_index]))
current_index++;
/* find a cleanable entry in the bucket and clean it, or set to next bucket */
if (current_detail && current_index < current_detail->hash_size) {
- struct cache_head *ch, **cp;
+ struct cache_head *ch = NULL;
struct cache_detail *d;
+ struct hlist_head *head;
+ struct hlist_node *tmp;
write_lock(&current_detail->hash_lock);
/* Ok, now to clean this strand */
- cp = & current_detail->hash_table[current_index];
- for (ch = *cp ; ch ; cp = & ch->next, ch = *cp) {
+ head = &current_detail->hash_table[current_index];
+ hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
if (current_detail->nextcheck > ch->expiry_time)
current_detail->nextcheck = ch->expiry_time+1;
if (!cache_is_expired(current_detail, ch))
continue;
- *cp = ch->next;
- ch->next = NULL;
+ hlist_del_init(&ch->cache_list);
current_detail->entries--;
rv = 1;
break;
@@ -492,10 +496,13 @@ EXPORT_SYMBOL_GPL(cache_flush);
void cache_purge(struct cache_detail *detail)
{
- detail->flush_time = LONG_MAX;
+ time_t now = seconds_since_boot();
+ if (detail->flush_time >= now)
+ now = detail->flush_time + 1;
+ /* 'now' is the maximum value any 'last_refresh' can have */
+ detail->flush_time = now;
detail->nextcheck = seconds_since_boot();
cache_flush();
- detail->flush_time = 1;
}
EXPORT_SYMBOL_GPL(cache_purge);
@@ -1218,7 +1225,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
if (bp[0] == '\\' && bp[1] == 'x') {
/* HEX STRING */
bp += 2;
- while (len < bufsize) {
+ while (len < bufsize - 1) {
int h, l;
h = hex_to_bin(bp[0]);
@@ -1270,18 +1277,13 @@ EXPORT_SYMBOL_GPL(qword_get);
* get a header, then pass each real item in the cache
*/
-struct handle {
- struct cache_detail *cd;
-};
-
-static void *c_start(struct seq_file *m, loff_t *pos)
+void *cache_seq_start(struct seq_file *m, loff_t *pos)
__acquires(cd->hash_lock)
{
loff_t n = *pos;
unsigned int hash, entry;
struct cache_head *ch;
- struct cache_detail *cd = ((struct handle*)m->private)->cd;
-
+ struct cache_detail *cd = m->private;
read_lock(&cd->hash_lock);
if (!n--)
@@ -1289,7 +1291,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
hash = n >> 32;
entry = n & ((1LL<<32) - 1);
- for (ch=cd->hash_table[hash]; ch; ch=ch->next)
+ hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list)
if (!entry--)
return ch;
n &= ~((1LL<<32) - 1);
@@ -1297,51 +1299,57 @@ static void *c_start(struct seq_file *m, loff_t *pos)
hash++;
n += 1LL<<32;
} while(hash < cd->hash_size &&
- cd->hash_table[hash]==NULL);
+ hlist_empty(&cd->hash_table[hash]));
if (hash >= cd->hash_size)
return NULL;
*pos = n+1;
- return cd->hash_table[hash];
+ return hlist_entry_safe(cd->hash_table[hash].first,
+ struct cache_head, cache_list);
}
+EXPORT_SYMBOL_GPL(cache_seq_start);
-static void *c_next(struct seq_file *m, void *p, loff_t *pos)
+void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
{
struct cache_head *ch = p;
int hash = (*pos >> 32);
- struct cache_detail *cd = ((struct handle*)m->private)->cd;
+ struct cache_detail *cd = m->private;
if (p == SEQ_START_TOKEN)
hash = 0;
- else if (ch->next == NULL) {
+ else if (ch->cache_list.next == NULL) {
hash++;
*pos += 1LL<<32;
} else {
++*pos;
- return ch->next;
+ return hlist_entry_safe(ch->cache_list.next,
+ struct cache_head, cache_list);
}
*pos &= ~((1LL<<32) - 1);
while (hash < cd->hash_size &&
- cd->hash_table[hash] == NULL) {
+ hlist_empty(&cd->hash_table[hash])) {
hash++;
*pos += 1LL<<32;
}
if (hash >= cd->hash_size)
return NULL;
++*pos;
- return cd->hash_table[hash];
+ return hlist_entry_safe(cd->hash_table[hash].first,
+ struct cache_head, cache_list);
}
+EXPORT_SYMBOL_GPL(cache_seq_next);
-static void c_stop(struct seq_file *m, void *p)
+void cache_seq_stop(struct seq_file *m, void *p)
__releases(cd->hash_lock)
{
- struct cache_detail *cd = ((struct handle*)m->private)->cd;
+ struct cache_detail *cd = m->private;
read_unlock(&cd->hash_lock);
}
+EXPORT_SYMBOL_GPL(cache_seq_stop);
static int c_show(struct seq_file *m, void *p)
{
struct cache_head *cp = p;
- struct cache_detail *cd = ((struct handle*)m->private)->cd;
+ struct cache_detail *cd = m->private;
if (p == SEQ_START_TOKEN)
return cd->cache_show(m, cd, NULL);
@@ -1364,33 +1372,36 @@ static int c_show(struct seq_file *m, void *p)
}
static const struct seq_operations cache_content_op = {
- .start = c_start,
- .next = c_next,
- .stop = c_stop,
+ .start = cache_seq_start,
+ .next = cache_seq_next,
+ .stop = cache_seq_stop,
.show = c_show,
};
static int content_open(struct inode *inode, struct file *file,
struct cache_detail *cd)
{
- struct handle *han;
+ struct seq_file *seq;
+ int err;
if (!cd || !try_module_get(cd->owner))
return -EACCES;
- han = __seq_open_private(file, &cache_content_op, sizeof(*han));
- if (han == NULL) {
+
+ err = seq_open(file, &cache_content_op);
+ if (err) {
module_put(cd->owner);
- return -ENOMEM;
+ return err;
}
- han->cd = cd;
+ seq = file->private_data;
+ seq->private = cd;
return 0;
}
static int content_release(struct inode *inode, struct file *file,
struct cache_detail *cd)
{
- int ret = seq_release_private(inode, file);
+ int ret = seq_release(inode, file);
module_put(cd->owner);
return ret;
}
@@ -1437,6 +1448,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
{
char tbuf[20];
char *bp, *ep;
+ time_t then, now;
if (*ppos || count > sizeof(tbuf)-1)
return -EINVAL;
@@ -1448,8 +1460,22 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
return -EINVAL;
bp = tbuf;
- cd->flush_time = get_expiry(&bp);
- cd->nextcheck = seconds_since_boot();
+ then = get_expiry(&bp);
+ now = seconds_since_boot();
+ cd->nextcheck = now;
+ /* Can only set flush_time to 1 second beyond "now", or
+ * possibly 1 second beyond flushtime. This is because
+ * flush_time never goes backwards so it mustn't get too far
+ * ahead of time.
+ */
+ if (then >= now) {
+ /* Want to flush everything, so behave like cache_purge() */
+ if (cd->flush_time >= now)
+ now = cd->flush_time + 1;
+ then = now;
+ }
+
+ cd->flush_time = then;
cache_flush();
*ppos += count;
@@ -1665,17 +1691,21 @@ EXPORT_SYMBOL_GPL(cache_unregister_net);
struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net)
{
struct cache_detail *cd;
+ int i;
cd = kmemdup(tmpl, sizeof(struct cache_detail), GFP_KERNEL);
if (cd == NULL)
return ERR_PTR(-ENOMEM);
- cd->hash_table = kzalloc(cd->hash_size * sizeof(struct cache_head *),
+ cd->hash_table = kzalloc(cd->hash_size * sizeof(struct hlist_head),
GFP_KERNEL);
if (cd->hash_table == NULL) {
kfree(cd);
return ERR_PTR(-ENOMEM);
}
+
+ for (i = 0; i < cd->hash_size; i++)
+ INIT_HLIST_HEAD(&cd->hash_table[i]);
cd->net = net;
return cd;
}
diff --git a/kernel/net/sunrpc/clnt.c b/kernel/net/sunrpc/clnt.c
index e6ce15173..23608eb0d 100644
--- a/kernel/net/sunrpc/clnt.c
+++ b/kernel/net/sunrpc/clnt.c
@@ -891,15 +891,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
task->tk_flags |= RPC_TASK_SOFT;
if (clnt->cl_noretranstimeo)
task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
- if (sk_memalloc_socks()) {
- struct rpc_xprt *xprt;
-
- rcu_read_lock();
- xprt = rcu_dereference(clnt->cl_xprt);
- if (xprt->swapper)
- task->tk_flags |= RPC_TASK_SWAPPER;
- rcu_read_unlock();
- }
+ if (atomic_read(&clnt->cl_swapper))
+ task->tk_flags |= RPC_TASK_SWAPPER;
/* Add to the client's list of all tasks */
spin_lock(&clnt->cl_lock);
list_add_tail(&task->tk_task, &clnt->cl_tasks);
@@ -1031,15 +1024,14 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
* rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
* rpc_execute against it
* @req: RPC request
- * @tk_ops: RPC call ops
*/
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
- const struct rpc_call_ops *tk_ops)
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
{
struct rpc_task *task;
struct xdr_buf *xbufp = &req->rq_snd_buf;
struct rpc_task_setup task_setup_data = {
- .callback_ops = tk_ops,
+ .callback_ops = &rpc_default_ops,
+ .flags = RPC_TASK_SOFTCONN,
};
dprintk("RPC: rpc_run_bc_task req= %p\n", req);
@@ -1614,6 +1606,7 @@ call_allocate(struct rpc_task *task)
req->rq_callsize + req->rq_rcvsize);
if (req->rq_buffer != NULL)
return;
+ xprt_inject_disconnect(xprt);
dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
@@ -1909,6 +1902,7 @@ call_transmit_status(struct rpc_task *task)
switch (task->tk_status) {
case -EAGAIN:
+ case -ENOBUFS:
break;
default:
dprint_status(task);
@@ -1935,7 +1929,6 @@ call_transmit_status(struct rpc_task *task)
case -ECONNABORTED:
case -EADDRINUSE:
case -ENOTCONN:
- case -ENOBUFS:
case -EPIPE:
rpc_task_force_reencode(task);
}
@@ -1951,33 +1944,36 @@ call_bc_transmit(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
- if (!xprt_prepare_transmit(task)) {
- /*
- * Could not reserve the transport. Try again after the
- * transport is released.
- */
- task->tk_status = 0;
- task->tk_action = call_bc_transmit;
- return;
- }
+ if (!xprt_prepare_transmit(task))
+ goto out_retry;
- task->tk_action = rpc_exit_task;
if (task->tk_status < 0) {
printk(KERN_NOTICE "RPC: Could not send backchannel reply "
"error: %d\n", task->tk_status);
- return;
+ goto out_done;
}
+ if (req->rq_connect_cookie != req->rq_xprt->connect_cookie)
+ req->rq_bytes_sent = 0;
xprt_transmit(task);
+
+ if (task->tk_status == -EAGAIN)
+ goto out_nospace;
+
xprt_end_transmit(task);
dprint_status(task);
switch (task->tk_status) {
case 0:
/* Success */
- break;
case -EHOSTDOWN:
case -EHOSTUNREACH:
case -ENETUNREACH:
+ case -ECONNRESET:
+ case -ECONNREFUSED:
+ case -EADDRINUSE:
+ case -ENOTCONN:
+ case -EPIPE:
+ break;
case -ETIMEDOUT:
/*
* Problem reaching the server. Disconnect and let the
@@ -2002,6 +1998,13 @@ call_bc_transmit(struct rpc_task *task)
break;
}
rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
+out_done:
+ task->tk_action = rpc_exit_task;
+ return;
+out_nospace:
+ req->rq_connect_cookie = req->rq_xprt->connect_cookie;
+out_retry:
+ task->tk_status = 0;
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
@@ -2054,12 +2057,13 @@ call_status(struct rpc_task *task)
case -ECONNABORTED:
rpc_force_rebind(clnt);
case -EADDRINUSE:
- case -ENOBUFS:
rpc_delay(task, 3*HZ);
case -EPIPE:
case -ENOTCONN:
task->tk_action = call_bind;
break;
+ case -ENOBUFS:
+ rpc_delay(task, HZ>>2);
case -EAGAIN:
task->tk_action = call_transmit;
break;
@@ -2476,3 +2480,59 @@ void rpc_show_tasks(struct net *net)
spin_unlock(&sn->rpc_client_lock);
}
#endif
+
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+ int ret = 0;
+ struct rpc_xprt *xprt;
+
+ if (atomic_inc_return(&clnt->cl_swapper) == 1) {
+retry:
+ rcu_read_lock();
+ xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+ rcu_read_unlock();
+ if (!xprt) {
+ /*
+ * If we didn't get a reference, then we likely are
+ * racing with a migration event. Wait for a grace
+ * period and try again.
+ */
+ synchronize_rcu();
+ goto retry;
+ }
+
+ ret = xprt_enable_swap(xprt);
+ xprt_put(xprt);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate);
+
+void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+
+ if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) {
+retry:
+ rcu_read_lock();
+ xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+ rcu_read_unlock();
+ if (!xprt) {
+ /*
+ * If we didn't get a reference, then we likely are
+ * racing with a migration event. Wait for a grace
+ * period and try again.
+ */
+ synchronize_rcu();
+ goto retry;
+ }
+
+ xprt_disable_swap(xprt);
+ xprt_put(xprt);
+ }
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate);
+#endif /* CONFIG_SUNRPC_SWAP */
diff --git a/kernel/net/sunrpc/debugfs.c b/kernel/net/sunrpc/debugfs.c
index 82962f7e6..e7b4d9356 100644
--- a/kernel/net/sunrpc/debugfs.c
+++ b/kernel/net/sunrpc/debugfs.c
@@ -10,9 +10,12 @@
#include "netns.h"
static struct dentry *topdir;
+static struct dentry *rpc_fault_dir;
static struct dentry *rpc_clnt_dir;
static struct dentry *rpc_xprt_dir;
+unsigned int rpc_inject_disconnect;
+
struct rpc_clnt_iter {
struct rpc_clnt *clnt;
loff_t pos;
@@ -257,6 +260,8 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
debugfs_remove_recursive(xprt->debugfs);
xprt->debugfs = NULL;
}
+
+ atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
}
void
@@ -266,11 +271,79 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
xprt->debugfs = NULL;
}
+static int
+fault_open(struct inode *inode, struct file *filp)
+{
+ filp->private_data = kmalloc(128, GFP_KERNEL);
+ if (!filp->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+static int
+fault_release(struct inode *inode, struct file *filp)
+{
+ kfree(filp->private_data);
+ return 0;
+}
+
+static ssize_t
+fault_disconnect_read(struct file *filp, char __user *user_buf,
+ size_t len, loff_t *offset)
+{
+ char *buffer = (char *)filp->private_data;
+ size_t size;
+
+ size = sprintf(buffer, "%u\n", rpc_inject_disconnect);
+ return simple_read_from_buffer(user_buf, len, offset, buffer, size);
+}
+
+static ssize_t
+fault_disconnect_write(struct file *filp, const char __user *user_buf,
+ size_t len, loff_t *offset)
+{
+ char buffer[16];
+
+ if (len >= sizeof(buffer))
+ len = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, user_buf, len))
+ return -EFAULT;
+ buffer[len] = '\0';
+ if (kstrtouint(buffer, 10, &rpc_inject_disconnect))
+ return -EINVAL;
+ return len;
+}
+
+static const struct file_operations fault_disconnect_fops = {
+ .owner = THIS_MODULE,
+ .open = fault_open,
+ .read = fault_disconnect_read,
+ .write = fault_disconnect_write,
+ .release = fault_release,
+};
+
+static struct dentry *
+inject_fault_dir(struct dentry *topdir)
+{
+ struct dentry *faultdir;
+
+ faultdir = debugfs_create_dir("inject_fault", topdir);
+ if (!faultdir)
+ return NULL;
+
+ if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir,
+ NULL, &fault_disconnect_fops))
+ return NULL;
+
+ return faultdir;
+}
+
void __exit
sunrpc_debugfs_exit(void)
{
debugfs_remove_recursive(topdir);
topdir = NULL;
+ rpc_fault_dir = NULL;
rpc_clnt_dir = NULL;
rpc_xprt_dir = NULL;
}
@@ -282,6 +355,10 @@ sunrpc_debugfs_init(void)
if (!topdir)
return;
+ rpc_fault_dir = inject_fault_dir(topdir);
+ if (!rpc_fault_dir)
+ goto out_remove;
+
rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
if (!rpc_clnt_dir)
goto out_remove;
@@ -294,5 +371,6 @@ sunrpc_debugfs_init(void)
out_remove:
debugfs_remove_recursive(topdir);
topdir = NULL;
+ rpc_fault_dir = NULL;
rpc_clnt_dir = NULL;
}
diff --git a/kernel/net/sunrpc/sched.c b/kernel/net/sunrpc/sched.c
index 337ca851a..73ad57a59 100644
--- a/kernel/net/sunrpc/sched.c
+++ b/kernel/net/sunrpc/sched.c
@@ -250,11 +250,11 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
}
EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
-static int rpc_wait_bit_killable(struct wait_bit_key *key)
+static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
freezable_schedule_unsafe();
+ if (signal_pending_state(mode, current))
+ return -ERESTARTSYS;
return 0;
}
@@ -1092,14 +1092,10 @@ void
rpc_destroy_mempool(void)
{
rpciod_stop();
- if (rpc_buffer_mempool)
- mempool_destroy(rpc_buffer_mempool);
- if (rpc_task_mempool)
- mempool_destroy(rpc_task_mempool);
- if (rpc_task_slabp)
- kmem_cache_destroy(rpc_task_slabp);
- if (rpc_buffer_slabp)
- kmem_cache_destroy(rpc_buffer_slabp);
+ mempool_destroy(rpc_buffer_mempool);
+ mempool_destroy(rpc_task_mempool);
+ kmem_cache_destroy(rpc_task_slabp);
+ kmem_cache_destroy(rpc_buffer_slabp);
rpc_destroy_wait_queue(&delay_queue);
}
diff --git a/kernel/net/sunrpc/svc.c b/kernel/net/sunrpc/svc.c
index 78974e4d9..cc9852897 100644
--- a/kernel/net/sunrpc/svc.c
+++ b/kernel/net/sunrpc/svc.c
@@ -34,36 +34,19 @@
static void svc_unregister(const struct svc_serv *serv, struct net *net);
-#define svc_serv_is_pooled(serv) ((serv)->sv_function)
+#define svc_serv_is_pooled(serv) ((serv)->sv_ops->svo_function)
-/*
- * Mode for mapping cpus to pools.
- */
-enum {
- SVC_POOL_AUTO = -1, /* choose one of the others */
- SVC_POOL_GLOBAL, /* no mapping, just a single global pool
- * (legacy & UP mode) */
- SVC_POOL_PERCPU, /* one pool per cpu */
- SVC_POOL_PERNODE /* one pool per numa node */
-};
#define SVC_POOL_DEFAULT SVC_POOL_GLOBAL
/*
* Structure for mapping cpus to pools and vice versa.
* Setup once during sunrpc initialisation.
*/
-static struct svc_pool_map {
- int count; /* How many svc_servs use us */
- int mode; /* Note: int not enum to avoid
- * warnings about "enumeration value
- * not handled in switch" */
- unsigned int npools;
- unsigned int *pool_to; /* maps pool id to cpu or node */
- unsigned int *to_pool; /* maps cpu or node to pool id */
-} svc_pool_map = {
- .count = 0,
+struct svc_pool_map svc_pool_map = {
.mode = SVC_POOL_DEFAULT
};
+EXPORT_SYMBOL_GPL(svc_pool_map);
+
static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
static int
@@ -236,7 +219,7 @@ svc_pool_map_init_pernode(struct svc_pool_map *m)
* vice versa). Initialise the map if we're the first user.
* Returns the number of pools.
*/
-static unsigned int
+unsigned int
svc_pool_map_get(void)
{
struct svc_pool_map *m = &svc_pool_map;
@@ -271,7 +254,7 @@ svc_pool_map_get(void)
mutex_unlock(&svc_pool_map_mutex);
return m->npools;
}
-
+EXPORT_SYMBOL_GPL(svc_pool_map_get);
/*
* Drop a reference to the global map of cpus to pools.
@@ -280,7 +263,7 @@ svc_pool_map_get(void)
* mode using the pool_mode module option without
* rebooting or re-loading sunrpc.ko.
*/
-static void
+void
svc_pool_map_put(void)
{
struct svc_pool_map *m = &svc_pool_map;
@@ -297,7 +280,7 @@ svc_pool_map_put(void)
mutex_unlock(&svc_pool_map_mutex);
}
-
+EXPORT_SYMBOL_GPL(svc_pool_map_put);
static int svc_pool_map_get_node(unsigned int pidx)
{
@@ -423,7 +406,7 @@ EXPORT_SYMBOL_GPL(svc_bind);
*/
static struct svc_serv *
__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
- void (*shutdown)(struct svc_serv *serv, struct net *net))
+ struct svc_serv_ops *ops)
{
struct svc_serv *serv;
unsigned int vers;
@@ -440,7 +423,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
bufsize = RPCSVC_MAXPAYLOAD;
serv->sv_max_payload = bufsize? bufsize : 4096;
serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE);
- serv->sv_shutdown = shutdown;
+ serv->sv_ops = ops;
xdrsize = 0;
while (prog) {
prog->pg_lovers = prog->pg_nvers-1;
@@ -486,26 +469,22 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
struct svc_serv *
svc_create(struct svc_program *prog, unsigned int bufsize,
- void (*shutdown)(struct svc_serv *serv, struct net *net))
+ struct svc_serv_ops *ops)
{
- return __svc_create(prog, bufsize, /*npools*/1, shutdown);
+ return __svc_create(prog, bufsize, /*npools*/1, ops);
}
EXPORT_SYMBOL_GPL(svc_create);
struct svc_serv *
svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
- void (*shutdown)(struct svc_serv *serv, struct net *net),
- svc_thread_fn func, struct module *mod)
+ struct svc_serv_ops *ops)
{
struct svc_serv *serv;
unsigned int npools = svc_pool_map_get();
- serv = __svc_create(prog, bufsize, npools, shutdown);
+ serv = __svc_create(prog, bufsize, npools, ops);
if (!serv)
goto out_err;
-
- serv->sv_function = func;
- serv->sv_module = mod;
return serv;
out_err:
svc_pool_map_put();
@@ -517,8 +496,8 @@ void svc_shutdown_net(struct svc_serv *serv, struct net *net)
{
svc_close_net(serv, net);
- if (serv->sv_shutdown)
- serv->sv_shutdown(serv, net);
+ if (serv->sv_ops->svo_shutdown)
+ serv->sv_ops->svo_shutdown(serv, net);
}
EXPORT_SYMBOL_GPL(svc_shutdown_net);
@@ -604,40 +583,52 @@ svc_release_buffer(struct svc_rqst *rqstp)
}
struct svc_rqst *
-svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
+svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
{
struct svc_rqst *rqstp;
rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node);
if (!rqstp)
- goto out_enomem;
+ return rqstp;
- serv->sv_nrthreads++;
__set_bit(RQ_BUSY, &rqstp->rq_flags);
spin_lock_init(&rqstp->rq_lock);
rqstp->rq_server = serv;
rqstp->rq_pool = pool;
- spin_lock_bh(&pool->sp_lock);
- pool->sp_nrthreads++;
- list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
- spin_unlock_bh(&pool->sp_lock);
rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
if (!rqstp->rq_argp)
- goto out_thread;
+ goto out_enomem;
rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
if (!rqstp->rq_resp)
- goto out_thread;
+ goto out_enomem;
if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node))
- goto out_thread;
+ goto out_enomem;
return rqstp;
-out_thread:
- svc_exit_thread(rqstp);
out_enomem:
- return ERR_PTR(-ENOMEM);
+ svc_rqst_free(rqstp);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(svc_rqst_alloc);
+
+struct svc_rqst *
+svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
+{
+ struct svc_rqst *rqstp;
+
+ rqstp = svc_rqst_alloc(serv, pool, node);
+ if (!rqstp)
+ return ERR_PTR(-ENOMEM);
+
+ serv->sv_nrthreads++;
+ spin_lock_bh(&pool->sp_lock);
+ pool->sp_nrthreads++;
+ list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
+ spin_unlock_bh(&pool->sp_lock);
+ return rqstp;
}
EXPORT_SYMBOL_GPL(svc_prepare_thread);
@@ -739,12 +730,12 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
break;
}
- __module_get(serv->sv_module);
- task = kthread_create_on_node(serv->sv_function, rqstp,
+ __module_get(serv->sv_ops->svo_module);
+ task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
node, "%s", serv->sv_name);
if (IS_ERR(task)) {
error = PTR_ERR(task);
- module_put(serv->sv_module);
+ module_put(serv->sv_ops->svo_module);
svc_exit_thread(rqstp);
break;
}
@@ -772,15 +763,21 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads);
* mutex" for the service.
*/
void
-svc_exit_thread(struct svc_rqst *rqstp)
+svc_rqst_free(struct svc_rqst *rqstp)
{
- struct svc_serv *serv = rqstp->rq_server;
- struct svc_pool *pool = rqstp->rq_pool;
-
svc_release_buffer(rqstp);
kfree(rqstp->rq_resp);
kfree(rqstp->rq_argp);
kfree(rqstp->rq_auth_data);
+ kfree_rcu(rqstp, rq_rcu_head);
+}
+EXPORT_SYMBOL_GPL(svc_rqst_free);
+
+void
+svc_exit_thread(struct svc_rqst *rqstp)
+{
+ struct svc_serv *serv = rqstp->rq_server;
+ struct svc_pool *pool = rqstp->rq_pool;
spin_lock_bh(&pool->sp_lock);
pool->sp_nrthreads--;
@@ -788,7 +785,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
list_del_rcu(&rqstp->rq_all);
spin_unlock_bh(&pool->sp_lock);
- kfree_rcu(rqstp, rq_rcu_head);
+ svc_rqst_free(rqstp);
/* Release the server */
if (serv)
@@ -1290,7 +1287,6 @@ err_bad:
svc_putnl(resv, ntohl(rpc_stat));
goto sendit;
}
-EXPORT_SYMBOL_GPL(svc_process);
/*
* Process the RPC request.
@@ -1338,6 +1334,7 @@ out_drop:
svc_drop(rqstp);
return 0;
}
+EXPORT_SYMBOL_GPL(svc_process);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/*
@@ -1350,6 +1347,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
{
struct kvec *argv = &rqstp->rq_arg.head[0];
struct kvec *resv = &rqstp->rq_res.head[0];
+ struct rpc_task *task;
+ int proc_error;
+ int error;
+
+ dprintk("svc: %s(%p)\n", __func__, req);
/* Build the svc_rqst used by the common processing routine */
rqstp->rq_xprt = serv->sv_bc_xprt;
@@ -1362,31 +1364,54 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg));
memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res));
+ /* Adjust the argument buffer length */
+ rqstp->rq_arg.len = req->rq_private_buf.len;
+ if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
+ rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len;
+ rqstp->rq_arg.page_len = 0;
+ } else if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len +
+ rqstp->rq_arg.page_len)
+ rqstp->rq_arg.page_len = rqstp->rq_arg.len -
+ rqstp->rq_arg.head[0].iov_len;
+ else
+ rqstp->rq_arg.len = rqstp->rq_arg.head[0].iov_len +
+ rqstp->rq_arg.page_len;
+
/* reset result send buffer "put" position */
resv->iov_len = 0;
- if (rqstp->rq_prot != IPPROTO_TCP) {
- printk(KERN_ERR "No support for Non-TCP transports!\n");
- BUG();
- }
-
/*
* Skip the next two words because they've already been
- * processed in the trasport
+ * processed in the transport
*/
svc_getu32(argv); /* XID */
svc_getnl(argv); /* CALLDIR */
- /* Returns 1 for send, 0 for drop */
- if (svc_process_common(rqstp, argv, resv)) {
- memcpy(&req->rq_snd_buf, &rqstp->rq_res,
- sizeof(req->rq_snd_buf));
- return bc_send(req);
- } else {
- /* drop request */
+ /* Parse and execute the bc call */
+ proc_error = svc_process_common(rqstp, argv, resv);
+
+ atomic_inc(&req->rq_xprt->bc_free_slots);
+ if (!proc_error) {
+ /* Processing error: drop the request */
xprt_free_bc_request(req);
return 0;
}
+
+ /* Finally, send the reply synchronously */
+ memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
+ task = rpc_run_bc_task(req);
+ if (IS_ERR(task)) {
+ error = PTR_ERR(task);
+ goto out;
+ }
+
+ WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
+ error = task->tk_status;
+ rpc_put_task(task);
+
+out:
+ dprintk("svc: %s(), error=%d\n", __func__, error);
+ return error;
}
EXPORT_SYMBOL_GPL(bc_svc_process);
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
diff --git a/kernel/net/sunrpc/svc_xprt.c b/kernel/net/sunrpc/svc_xprt.c
index ba2313cd4..5b69bb580 100644
--- a/kernel/net/sunrpc/svc_xprt.c
+++ b/kernel/net/sunrpc/svc_xprt.c
@@ -24,7 +24,6 @@ static int svc_deferred_recv(struct svc_rqst *rqstp);
static struct cache_deferred_req *svc_defer(struct cache_req *req);
static void svc_age_temp_xprts(unsigned long closure);
static void svc_delete_xprt(struct svc_xprt *xprt);
-static void svc_xprt_do_enqueue(struct svc_xprt *xprt);
/* apparently the "standard" is that clients close
* idle connections after 5 minutes, servers after
@@ -225,12 +224,12 @@ static void svc_xprt_received(struct svc_xprt *xprt)
}
/* As soon as we clear busy, the xprt could be closed and
- * 'put', so we need a reference to call svc_xprt_do_enqueue with:
+ * 'put', so we need a reference to call svc_enqueue_xprt with:
*/
svc_xprt_get(xprt);
smp_mb__before_atomic();
clear_bit(XPT_BUSY, &xprt->xpt_flags);
- svc_xprt_do_enqueue(xprt);
+ xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
svc_xprt_put(xprt);
}
@@ -320,7 +319,7 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
return false;
}
-static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
+void svc_xprt_do_enqueue(struct svc_xprt *xprt)
{
struct svc_pool *pool;
struct svc_rqst *rqstp = NULL;
@@ -402,6 +401,7 @@ redo_search:
out:
trace_svc_xprt_do_enqueue(xprt, rqstp);
}
+EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue);
/*
* Queue up a transport with data pending. If there are idle nfsd
@@ -412,7 +412,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
{
if (test_bit(XPT_BUSY, &xprt->xpt_flags))
return;
- svc_xprt_do_enqueue(xprt);
+ xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
}
EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
diff --git a/kernel/net/sunrpc/svcsock.c b/kernel/net/sunrpc/svcsock.c
index 0c8120229..1413cdcc1 100644
--- a/kernel/net/sunrpc/svcsock.c
+++ b/kernel/net/sunrpc/svcsock.c
@@ -181,7 +181,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
struct page **ppage = xdr->pages;
size_t base = xdr->page_base;
unsigned int pglen = xdr->page_len;
- unsigned int flags = MSG_MORE;
+ unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
int slen;
int len = 0;
@@ -399,6 +399,31 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp)
return svc_port_is_privileged(svc_addr(rqstp));
}
+static bool sunrpc_waitqueue_active(wait_queue_head_t *wq)
+{
+ if (!wq)
+ return false;
+ /*
+ * There should normally be a memory barrier here--see
+ * wq_has_sleeper().
+ *
+ * It appears that isn't currently necessary, though, basically
+ * because callers all appear to have sufficient memory barriers
+ * between the time the relevant change is made and the
+ * time they call these callbacks.
+ *
+ * The nfsd code itself doesn't actually explicitly wait on
+ * these waitqueues, but it may wait on them for example in
+ * sendpage() or sendmsg() calls. (And those may be the only
+ * places, since it uses nonblocking reads.)
+ *
+ * Maybe we should add the memory barriers anyway, but these are
+ * hot paths so we'd need to be convinced there's no significant
+ * penalty.
+ */
+ return waitqueue_active(wq);
+}
+
/*
* INET callback when data has been received on the socket.
*/
@@ -414,7 +439,7 @@ static void svc_udp_data_ready(struct sock *sk)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible(wq);
}
@@ -432,7 +457,7 @@ static void svc_write_space(struct sock *sk)
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (wq && waitqueue_active(wq)) {
+ if (sunrpc_waitqueue_active(wq)) {
dprintk("RPC svc_write_space: someone sleeping on %p\n",
svsk);
wake_up_interruptible(wq);
@@ -787,7 +812,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
}
wq = sk_sleep(sk);
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible_all(wq);
}
@@ -808,7 +833,7 @@ static void svc_tcp_state_change(struct sock *sk)
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible_all(wq);
}
@@ -823,7 +848,7 @@ static void svc_tcp_data_ready(struct sock *sk)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible(wq);
}
@@ -1367,7 +1392,6 @@ EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
/*
* Initialize socket for RPC use and create svc_sock struct
- * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
*/
static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
struct socket *sock,
@@ -1594,7 +1618,7 @@ static void svc_sock_detach(struct svc_xprt *xprt)
sk->sk_write_space = svsk->sk_owspace;
wq = sk_sleep(sk);
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible(wq);
}
diff --git a/kernel/net/sunrpc/sysctl.c b/kernel/net/sunrpc/sysctl.c
index 887f0183b..c88d9bc06 100644
--- a/kernel/net/sunrpc/sysctl.c
+++ b/kernel/net/sunrpc/sysctl.c
@@ -76,7 +76,7 @@ static int
proc_dodebug(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- char tmpbuf[20], c, *s;
+ char tmpbuf[20], c, *s = NULL;
char __user *p;
unsigned int value;
size_t left, len;
@@ -103,23 +103,24 @@ proc_dodebug(struct ctl_table *table, int write,
return -EFAULT;
tmpbuf[left] = '\0';
- for (s = tmpbuf, value = 0; '0' <= *s && *s <= '9'; s++, left--)
- value = 10 * value + (*s - '0');
- if (*s && !isspace(*s))
- return -EINVAL;
- while (left && isspace(*s))
- left--, s++;
+ value = simple_strtol(tmpbuf, &s, 0);
+ if (s) {
+ left -= (s - tmpbuf);
+ if (left && !isspace(*s))
+ return -EINVAL;
+ while (left && isspace(*s))
+ left--, s++;
+ } else
+ left = 0;
*(unsigned int *) table->data = value;
/* Display the RPC tasks on writing to rpc_debug */
if (strcmp(table->procname, "rpc_debug") == 0)
rpc_show_tasks(&init_net);
} else {
- if (!access_ok(VERIFY_WRITE, buffer, left))
- return -EFAULT;
- len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data);
+ len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data);
if (len > left)
len = left;
- if (__copy_to_user(buffer, tmpbuf, len))
+ if (copy_to_user(buffer, tmpbuf, len))
return -EFAULT;
if ((left -= len) > 0) {
if (put_user('\n', (char __user *)buffer + len))
diff --git a/kernel/net/sunrpc/xprt.c b/kernel/net/sunrpc/xprt.c
index d109d308e..2e98f4a24 100644
--- a/kernel/net/sunrpc/xprt.c
+++ b/kernel/net/sunrpc/xprt.c
@@ -68,6 +68,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
static void xprt_connect_status(struct rpc_task *task);
static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
+static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
static void xprt_destroy(struct rpc_xprt *xprt);
static DEFINE_SPINLOCK(xprt_list_lock);
@@ -250,6 +251,8 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
}
xprt_clear_locked(xprt);
out_sleep:
+ if (req)
+ __xprt_put_cong(xprt, req);
dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
task->tk_timeout = 0;
task->tk_status = -EAGAIN;
@@ -608,8 +611,8 @@ static void xprt_autoclose(struct work_struct *work)
struct rpc_xprt *xprt =
container_of(work, struct rpc_xprt, task_cleanup);
- xprt->ops->close(xprt);
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+ xprt->ops->close(xprt);
xprt_release_write(xprt, NULL);
wake_up_bit(&xprt->state, XPRT_LOCKED);
}
@@ -969,6 +972,7 @@ void xprt_transmit(struct rpc_task *task)
task->tk_status = status;
return;
}
+ xprt_inject_disconnect(xprt);
dprintk("RPC: %5u xmit complete\n", task->tk_pid);
task->tk_flags |= RPC_TASK_SENT;
@@ -1287,6 +1291,7 @@ void xprt_release(struct rpc_task *task)
spin_unlock_bh(&xprt->transport_lock);
if (req->rq_buffer)
xprt->ops->buf_free(req->rq_buffer);
+ xprt_inject_disconnect(xprt);
if (req->rq_cred != NULL)
put_rpccred(req->rq_cred);
task->tk_rqstp = NULL;
diff --git a/kernel/net/sunrpc/xprtrdma/Makefile b/kernel/net/sunrpc/xprtrdma/Makefile
index 579f72bbc..33f99d300 100644
--- a/kernel/net/sunrpc/xprtrdma/Makefile
+++ b/kernel/net/sunrpc/xprtrdma/Makefile
@@ -1,9 +1,8 @@
-obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
-xprtrdma-y := transport.o rpc_rdma.o verbs.o \
- fmr_ops.o frwr_ops.o physical_ops.o
-
-obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
-
-svcrdma-y := svc_rdma.o svc_rdma_transport.o \
- svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
+rpcrdma-y := transport.o rpc_rdma.o verbs.o \
+ fmr_ops.o frwr_ops.o physical_ops.o \
+ svc_rdma.o svc_rdma_transport.o \
+ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
+ module.o
+rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/kernel/net/sunrpc/xprtrdma/backchannel.c b/kernel/net/sunrpc/xprtrdma/backchannel.c
new file mode 100644
index 000000000..2dcb44f69
--- /dev/null
+++ b/kernel/net/sunrpc/xprtrdma/backchannel.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ *
+ * Support for backward direction RPCs on RPC/RDMA.
+ */
+
+#include <linux/module.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+#define RPCRDMA_BACKCHANNEL_DEBUG
+
+static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+
+ spin_lock(&buf->rb_reqslock);
+ list_del(&req->rl_all);
+ spin_unlock(&buf->rb_reqslock);
+
+ rpcrdma_destroy_req(&r_xprt->rx_ia, req);
+
+ kfree(rqst);
+}
+
+static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_regbuf *rb;
+ struct rpcrdma_req *req;
+ struct xdr_buf *buf;
+ size_t size;
+
+ req = rpcrdma_create_req(r_xprt);
+ if (!req)
+ return -ENOMEM;
+ req->rl_backchannel = true;
+
+ size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+ rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+ if (IS_ERR(rb))
+ goto out_fail;
+ req->rl_rdmabuf = rb;
+
+ size += RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+ rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+ if (IS_ERR(rb))
+ goto out_fail;
+ rb->rg_owner = req;
+ req->rl_sendbuf = rb;
+ /* so that rpcr_to_rdmar works when receiving a request */
+ rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base;
+
+ buf = &rqst->rq_snd_buf;
+ buf->head[0].iov_base = rqst->rq_buffer;
+ buf->head[0].iov_len = 0;
+ buf->tail[0].iov_base = NULL;
+ buf->tail[0].iov_len = 0;
+ buf->page_len = 0;
+ buf->len = 0;
+ buf->buflen = size;
+
+ return 0;
+
+out_fail:
+ rpcrdma_bc_free_rqst(r_xprt, rqst);
+ return -ENOMEM;
+}
+
+/* Allocate and add receive buffers to the rpcrdma_buffer's
+ * existing list of rep's. These are released when the
+ * transport is destroyed.
+ */
+static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
+ unsigned int count)
+{
+ struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
+ struct rpcrdma_rep *rep;
+ unsigned long flags;
+ int rc = 0;
+
+ while (count--) {
+ rep = rpcrdma_create_rep(r_xprt);
+ if (IS_ERR(rep)) {
+ pr_err("RPC: %s: reply buffer alloc failed\n",
+ __func__);
+ rc = PTR_ERR(rep);
+ break;
+ }
+
+ spin_lock_irqsave(&buffers->rb_lock, flags);
+ list_add(&rep->rr_list, &buffers->rb_recv_bufs);
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ }
+
+ return rc;
+}
+
+/**
+ * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests
+ * @xprt: transport associated with these backchannel resources
+ * @reqs: number of concurrent incoming requests to expect
+ *
+ * Returns 0 on success; otherwise a negative errno
+ */
+int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
+ struct rpc_rqst *rqst;
+ unsigned int i;
+ int rc;
+
+ /* The backchannel reply path returns each rpc_rqst to the
+ * bc_pa_list _after_ the reply is sent. If the server is
+ * faster than the client, it can send another backward
+ * direction request before the rpc_rqst is returned to the
+ * list. The client rejects the request in this case.
+ *
+ * Twice as many rpc_rqsts are prepared to ensure there is
+ * always an rpc_rqst available as soon as a reply is sent.
+ */
+ if (reqs > RPCRDMA_BACKWARD_WRS >> 1)
+ goto out_err;
+
+ for (i = 0; i < (reqs << 1); i++) {
+ rqst = kzalloc(sizeof(*rqst), GFP_KERNEL);
+ if (!rqst) {
+ pr_err("RPC: %s: Failed to create bc rpc_rqst\n",
+ __func__);
+ goto out_free;
+ }
+
+ rqst->rq_xprt = &r_xprt->rx_xprt;
+ INIT_LIST_HEAD(&rqst->rq_list);
+ INIT_LIST_HEAD(&rqst->rq_bc_list);
+
+ if (rpcrdma_bc_setup_rqst(r_xprt, rqst))
+ goto out_free;
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+ }
+
+ rc = rpcrdma_bc_setup_reps(r_xprt, reqs);
+ if (rc)
+ goto out_free;
+
+ rc = rpcrdma_ep_post_extra_recv(r_xprt, reqs);
+ if (rc)
+ goto out_free;
+
+ buffer->rb_bc_srv_max_requests = reqs;
+ request_module("svcrdma");
+
+ return 0;
+
+out_free:
+ xprt_rdma_bc_destroy(xprt, reqs);
+
+out_err:
+ pr_err("RPC: %s: setup backchannel transport failed\n", __func__);
+ return -ENOMEM;
+}
+
+/**
+ * xprt_rdma_bc_up - Create transport endpoint for backchannel service
+ * @serv: server endpoint
+ * @net: network namespace
+ *
+ * The "xprt" is an implied argument: it supplies the name of the
+ * backchannel transport class.
+ *
+ * Returns zero on success, negative errno on failure
+ */
+int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
+{
+ int ret;
+
+ ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * rpcrdma_bc_marshal_reply - Send backwards direction reply
+ * @rqst: buffer containing RPC reply data
+ *
+ * Returns zero on success.
+ */
+int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
+{
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_msg *headerp;
+ size_t rpclen;
+
+ headerp = rdmab_to_msg(req->rl_rdmabuf);
+ headerp->rm_xid = rqst->rq_xid;
+ headerp->rm_vers = rpcrdma_version;
+ headerp->rm_credit =
+ cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests);
+ headerp->rm_type = rdma_msg;
+ headerp->rm_body.rm_chunks[0] = xdr_zero;
+ headerp->rm_body.rm_chunks[1] = xdr_zero;
+ headerp->rm_body.rm_chunks[2] = xdr_zero;
+
+ rpclen = rqst->rq_svec[0].iov_len;
+
+ pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
+ __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
+ pr_info("RPC: %s: RPC/RDMA: %*ph\n",
+ __func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
+ pr_info("RPC: %s: RPC: %*ph\n",
+ __func__, (int)rpclen, rqst->rq_svec[0].iov_base);
+
+ req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
+ req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
+ req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
+
+ req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
+ req->rl_send_iov[1].length = rpclen;
+ req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
+
+ req->rl_niovs = 2;
+ return 0;
+}
+
+/**
+ * xprt_rdma_bc_destroy - Release resources for handling backchannel requests
+ * @xprt: transport associated with these backchannel resources
+ * @reqs: number of incoming requests to destroy; ignored
+ */
+void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpc_rqst *rqst, *tmp;
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
+ list_del(&rqst->rq_bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+
+ rpcrdma_bc_free_rqst(r_xprt, rqst);
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ }
+ spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+/**
+ * xprt_rdma_bc_free_rqst - Release a backchannel rqst
+ * @rqst: request to release
+ */
+void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
+{
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+
+ smp_mb__before_atomic();
+ WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state));
+ clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
+ smp_mb__after_atomic();
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+/**
+ * rpcrdma_bc_receive_call - Handle a backward direction call
+ * @r_xprt: transport receiving the call
+ * @rep: receive buffer containing the call
+ *
+ * Called in the RPC reply handler, which runs in a tasklet.
+ * Be quick about it.
+ *
+ * Operational assumptions:
+ * o Backchannel credits are ignored, just as the NFS server
+ * forechannel currently does
+ * o The ULP manages a replay cache (eg, NFSv4.1 sessions).
+ * No replay detection is done at the transport level
+ */
+void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_rep *rep)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct rpcrdma_msg *headerp;
+ struct svc_serv *bc_serv;
+ struct rpcrdma_req *req;
+ struct rpc_rqst *rqst;
+ struct xdr_buf *buf;
+ size_t size;
+ __be32 *p;
+
+ headerp = rdmab_to_msg(rep->rr_rdmabuf);
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+ pr_info("RPC: %s: callback XID %08x, length=%u\n",
+ __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len);
+ pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp);
+#endif
+
+ /* Sanity check:
+ * Need at least enough bytes for RPC/RDMA header, as code
+ * here references the header fields by array offset. Also,
+ * backward calls are always inline, so ensure there
+ * are some bytes beyond the RPC/RDMA header.
+ */
+ if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24)
+ goto out_short;
+ p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN);
+ size = rep->rr_len - RPCRDMA_HDRLEN_MIN;
+
+ /* Grab a free bc rqst */
+ spin_lock(&xprt->bc_pa_lock);
+ if (list_empty(&xprt->bc_pa_list)) {
+ spin_unlock(&xprt->bc_pa_lock);
+ goto out_overflow;
+ }
+ rqst = list_first_entry(&xprt->bc_pa_list,
+ struct rpc_rqst, rq_bc_pa_list);
+ list_del(&rqst->rq_bc_pa_list);
+ spin_unlock(&xprt->bc_pa_lock);
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+ pr_info("RPC: %s: using rqst %p\n", __func__, rqst);
+#endif
+
+ /* Prepare rqst */
+ rqst->rq_reply_bytes_recvd = 0;
+ rqst->rq_bytes_sent = 0;
+ rqst->rq_xid = headerp->rm_xid;
+ set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
+
+ buf = &rqst->rq_rcv_buf;
+ memset(buf, 0, sizeof(*buf));
+ buf->head[0].iov_base = p;
+ buf->head[0].iov_len = size;
+ buf->len = size;
+
+ /* The receive buffer has to be hooked to the rpcrdma_req
+ * so that it can be reposted after the server is done
+ * parsing it but just before sending the backward
+ * direction reply.
+ */
+ req = rpcr_to_rdmar(rqst);
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+ pr_info("RPC: %s: attaching rep %p to req %p\n",
+ __func__, rep, req);
+#endif
+ req->rl_reply = rep;
+
+ /* Defeat the retransmit detection logic in send_request */
+ req->rl_connect_cookie = 0;
+
+ /* Queue rqst for ULP's callback service */
+ bc_serv = xprt->bc_serv;
+ spin_lock(&bc_serv->sv_cb_lock);
+ list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
+ spin_unlock(&bc_serv->sv_cb_lock);
+
+ wake_up(&bc_serv->sv_cb_waitq);
+
+ r_xprt->rx_stats.bcall_count++;
+ return;
+
+out_overflow:
+ pr_warn("RPC/RDMA backchannel overflow\n");
+ xprt_disconnect_done(xprt);
+ /* This receive buffer gets reposted automatically
+ * when the connection is re-established.
+ */
+ return;
+
+out_short:
+ pr_warn("RPC/RDMA short backward direction call\n");
+
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ xprt_disconnect_done(xprt);
+ else
+ pr_warn("RPC: %s: reposting rep %p\n",
+ __func__, rep);
+}
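The rqst pre-allocation in xprt_rdma_bc_setup above caps the number of concurrent backchannel calls at half of RPCRDMA_BACKWARD_WRS and then prepares twice as many rpc_rqsts as requested, so a fresh rqst is always available while the previous one is still being returned to bc_pa_list. A minimal userspace sketch of that sizing rule; the constant value below is an assumed example, not taken from this patch:

    #include <stdio.h>

    #define RPCRDMA_BACKWARD_WRS 8          /* assumed example value */

    static int bc_rqst_pool_size(unsigned int reqs)
    {
            /* Reject a request count the provisioned WRs cannot cover */
            if (reqs > RPCRDMA_BACKWARD_WRS >> 1)
                    return -1;
            /* Two rpc_rqsts per expected concurrent call */
            return (int)(reqs << 1);
    }

    int main(void)
    {
            printf("reqs=2 -> pool=%d\n", bc_rqst_pool_size(2));  /* 4 */
            printf("reqs=8 -> pool=%d\n", bc_rqst_pool_size(8));  /* -1: too many */
            return 0;
    }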
diff --git a/kernel/net/sunrpc/xprtrdma/fmr_ops.c b/kernel/net/sunrpc/xprtrdma/fmr_ops.c
index 302d4ebf6..f1e8dafbd 100644
--- a/kernel/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/kernel/net/sunrpc/xprtrdma/fmr_ops.c
@@ -11,6 +11,21 @@
* can take tens of usecs to complete.
*/
+/* Normal operation
+ *
+ * A Memory Region is prepared for RDMA READ or WRITE using the
+ * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
+ * finished, the Memory Region is unmapped using the ib_unmap_fmr
+ * verb (fmr_op_unmap).
+ */
+
+/* Transport recovery
+ *
+ * After a transport reconnect, fmr_op_map re-uses the MR already
+ * allocated for the RPC, but generates a fresh rkey then maps the
+ * MR again. This process is synchronous.
+ */
+
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -50,19 +65,28 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_mw *r;
int i, rc;
+ spin_lock_init(&buf->rb_mwlock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
- i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
- dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
+ i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
+ i += 2; /* head + tail */
+ i *= buf->rb_max_requests; /* one set for each RPC slot */
+ dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
+ rc = -ENOMEM;
while (i--) {
r = kzalloc(sizeof(*r), GFP_KERNEL);
if (!r)
- return -ENOMEM;
+ goto out;
- r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
- if (IS_ERR(r->r.fmr))
+ r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
+ sizeof(u64), GFP_KERNEL);
+ if (!r->r.fmr.physaddrs)
+ goto out_free;
+
+ r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+ if (IS_ERR(r->r.fmr.fmr))
goto out_fmr_err;
list_add(&r->mw_list, &buf->rb_mws);
@@ -71,12 +95,24 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
return 0;
out_fmr_err:
- rc = PTR_ERR(r->r.fmr);
+ rc = PTR_ERR(r->r.fmr.fmr);
dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
+ kfree(r->r.fmr.physaddrs);
+out_free:
kfree(r);
+out:
return rc;
}
+static int
+__fmr_unmap(struct rpcrdma_mw *r)
+{
+ LIST_HEAD(l);
+
+ list_add(&r->r.fmr.fmr->list, &l);
+ return ib_unmap_fmr(&l);
+}
+
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
@@ -85,12 +121,24 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_id->device;
+ struct ib_device *device = ia->ri_device;
enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
int len, pageoff, i, rc;
+ struct rpcrdma_mw *mw;
+
+ mw = seg1->rl_mw;
+ seg1->rl_mw = NULL;
+ if (!mw) {
+ mw = rpcrdma_get_mw(r_xprt);
+ if (!mw)
+ return -ENOMEM;
+ } else {
+ /* this is a retransmit; generate a fresh rkey */
+ rc = __fmr_unmap(mw);
+ if (rc)
+ return rc;
+ }
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -100,7 +148,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) {
rpcrdma_map_one(device, seg, direction);
- physaddrs[i] = seg->mr_dma;
+ mw->r.fmr.physaddrs[i] = seg->mr_dma;
len += seg->mr_len;
++seg;
++i;
@@ -110,11 +158,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
break;
}
- rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
+ rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs,
+ i, seg1->mr_dma);
if (rc)
goto out_maperr;
- seg1->mr_rkey = mw->r.fmr->rkey;
+ seg1->rl_mw = mw;
+ seg1->mr_rkey = mw->r.fmr.fmr->rkey;
seg1->mr_base = seg1->mr_dma + pageoff;
seg1->mr_nsegs = i;
seg1->mr_len = len;
@@ -137,48 +187,28 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_mr_seg *seg1 = seg;
- struct ib_device *device;
+ struct rpcrdma_mw *mw = seg1->rl_mw;
int rc, nsegs = seg->mr_nsegs;
- LIST_HEAD(l);
- list_add(&seg1->rl_mw->r.fmr->list, &l);
- rc = ib_unmap_fmr(&l);
- read_lock(&ia->ri_qplock);
- device = ia->ri_id->device;
+ dprintk("RPC: %s: FMR %p\n", __func__, mw);
+
+ seg1->rl_mw = NULL;
while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(device, seg++);
- read_unlock(&ia->ri_qplock);
+ rpcrdma_unmap_one(ia->ri_device, seg++);
+ rc = __fmr_unmap(mw);
if (rc)
goto out_err;
+ rpcrdma_put_mw(r_xprt, mw);
return nsegs;
out_err:
+ /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
+ * will attempt to release it when the transport is destroyed.
+ */
dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
return nsegs;
}
-/* After a disconnect, unmap all FMRs.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_fmr_external().
- */
-static void
-fmr_op_reset(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_mw *r;
- LIST_HEAD(list);
- int rc;
-
- list_for_each_entry(r, &buf->rb_all, mw_all)
- list_add(&r->r.fmr->list, &list);
-
- rc = ib_unmap_fmr(&list);
- if (rc)
- dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
- __func__, rc);
-}
-
static void
fmr_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -188,10 +218,13 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
while (!list_empty(&buf->rb_all)) {
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&r->mw_all);
- rc = ib_dealloc_fmr(r->r.fmr);
+ kfree(r->r.fmr.physaddrs);
+
+ rc = ib_dealloc_fmr(r->r.fmr.fmr);
if (rc)
dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
__func__, rc);
+
kfree(r);
}
}
@@ -202,7 +235,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
.ro_init = fmr_op_init,
- .ro_reset = fmr_op_reset,
.ro_destroy = fmr_op_destroy,
.ro_displayname = "fmr",
};
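The reworked fmr_op_init above sizes the FMR pool per RPC slot rather than per data segment: enough MWs to cover the largest payload plus the head and tail iovecs, multiplied by rb_max_requests. A standalone sketch of that arithmetic; the two constants are assumptions for illustration, not taken from this patch:

    #include <stdio.h>

    #define RPCRDMA_MAX_DATA_SEGS  64       /* assumed */
    #define RPCRDMA_MAX_FMR_SGES   64       /* assumed */

    static int fmr_pool_size(int rb_max_requests)
    {
            int i = RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES;

            if (i < 1)
                    i = 1;
            i += 2;                         /* head + tail */
            return i * rb_max_requests;     /* one set for each RPC slot */
    }

    int main(void)
    {
            printf("%d FMRs for 32 credits\n", fmr_pool_size(32));  /* 96 */
            return 0;
    }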
diff --git a/kernel/net/sunrpc/xprtrdma/frwr_ops.c b/kernel/net/sunrpc/xprtrdma/frwr_ops.c
index dff0481db..88cf9e726 100644
--- a/kernel/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/kernel/net/sunrpc/xprtrdma/frwr_ops.c
@@ -11,12 +11,136 @@
* but most complex memory registration mode.
*/
+/* Normal operation
+ *
+ * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
+ * Work Request (frmr_op_map). When the RDMA operation is finished, this
+ * Memory Region is invalidated using a LOCAL_INV Work Request
+ * (frmr_op_unmap).
+ *
+ * Typically these Work Requests are not signaled, and neither are RDMA
+ * SEND Work Requests (with the exception of signaling occasionally to
+ * prevent provider work queue overflows). This greatly reduces HCA
+ * interrupt workload.
+ *
+ * As an optimization, frwr_op_unmap marks MRs INVALID before the
+ * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
+ * rb_mws immediately so that no work (like managing a linked list
+ * under a spinlock) is needed in the completion upcall.
+ *
+ * But this means that frwr_op_map() can occasionally encounter an MR
+ * that is INVALID but the LOCAL_INV WR has not completed. Work Queue
+ * ordering prevents a subsequent FAST_REG WR from executing against
+ * that MR while it is still being invalidated.
+ */
+
+/* Transport recovery
+ *
+ * ->op_map and the transport connect worker cannot run at the same
+ * time, but ->op_unmap can fire while the transport connect worker
+ * is running. Thus MR recovery is handled in ->op_map, to guarantee
+ * that recovered MRs are owned by a sending RPC, and not one where
+ * ->op_unmap could fire at the same time transport reconnect is
+ * being done.
+ *
+ * When the underlying transport disconnects, MRs are left in one of
+ * three states:
+ *
+ * INVALID: The MR was not in use before the QP entered ERROR state.
+ * (Or, the LOCAL_INV WR has not completed or flushed yet).
+ *
+ * STALE: The MR was being registered or unregistered when the QP
+ * entered ERROR state, and the pending WR was flushed.
+ *
+ * VALID: The MR was registered before the QP entered ERROR state.
+ *
+ * When frwr_op_map encounters STALE and VALID MRs, they are recovered
+ * with ib_dereg_mr and then are re-initialized. Because MR recovery
+ * allocates fresh resources, it is deferred to a workqueue, and the
+ * recovered MRs are placed back on the rb_mws list when recovery is
+ * complete. frwr_op_map allocates another MR for the current RPC while
+ * the broken MR is reset.
+ *
+ * To ensure that frwr_op_map doesn't encounter an MR that is marked
+ * INVALID but that is about to be flushed due to a previous transport
+ * disconnect, the transport connect worker attempts to drain all
+ * pending send queue WRs before the transport is reconnected.
+ */
+
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
+static struct workqueue_struct *frwr_recovery_wq;
+
+#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM)
+
+int
+frwr_alloc_recovery_wq(void)
+{
+ frwr_recovery_wq = alloc_workqueue("frwr_recovery",
+ FRWR_RECOVERY_WQ_FLAGS, 0);
+ return !frwr_recovery_wq ? -ENOMEM : 0;
+}
+
+void
+frwr_destroy_recovery_wq(void)
+{
+ struct workqueue_struct *wq;
+
+ if (!frwr_recovery_wq)
+ return;
+
+ wq = frwr_recovery_wq;
+ frwr_recovery_wq = NULL;
+ destroy_workqueue(wq);
+}
+
+/* Deferred reset of a single FRMR. Generate a fresh rkey by
+ * replacing the MR.
+ *
+ * There's no recovery if this fails. The FRMR is abandoned, but
+ * remains in rb_all. It will be cleaned up when the transport is
+ * destroyed.
+ */
+static void
+__frwr_recovery_worker(struct work_struct *work)
+{
+ struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
+ r.frmr.fr_work);
+ struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt;
+ unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+ struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+
+ if (ib_dereg_mr(r->r.frmr.fr_mr))
+ goto out_fail;
+
+ r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
+ if (IS_ERR(r->r.frmr.fr_mr))
+ goto out_fail;
+
+ dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
+ r->r.frmr.fr_state = FRMR_IS_INVALID;
+ rpcrdma_put_mw(r_xprt, r);
+ return;
+
+out_fail:
+ pr_warn("RPC: %s: FRMR %p unrecovered\n",
+ __func__, r);
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ */
+static void
+__frwr_queue_recovery(struct rpcrdma_mw *r)
+{
+ INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker);
+ queue_work(frwr_recovery_wq, &r->r.frmr.fr_work);
+}
+
static int
__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
unsigned int depth)
@@ -24,24 +148,28 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
struct rpcrdma_frmr *f = &r->r.frmr;
int rc;
- f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+ f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
if (IS_ERR(f->fr_mr))
goto out_mr_err;
- f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
- if (IS_ERR(f->fr_pgl))
+
+ f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
+ if (!f->sg)
goto out_list_err;
+
+ sg_init_table(f->sg, depth);
+
return 0;
out_mr_err:
rc = PTR_ERR(f->fr_mr);
- dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n",
+ dprintk("RPC: %s: ib_alloc_mr status %i\n",
__func__, rc);
return rc;
out_list_err:
- rc = PTR_ERR(f->fr_pgl);
- dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n",
- __func__, rc);
+ rc = -ENOMEM;
+ dprintk("RPC: %s: sg allocation failure\n",
+ __func__);
ib_dereg_mr(f->fr_mr);
return rc;
}
@@ -55,7 +183,7 @@ __frwr_release(struct rpcrdma_mw *r)
if (rc)
dprintk("RPC: %s: ib_dereg_mr status %i\n",
__func__, rc);
- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+ kfree(r->r.frmr.sg);
}
static int
@@ -128,8 +256,11 @@ frwr_sendcompletion(struct ib_wc *wc)
/* WARNING: Only wr_id and status are reliable at this point */
r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
- dprintk("RPC: %s: frmr %p (stale), status %d\n",
- __func__, r, wc->status);
+ if (wc->status == IB_WC_WR_FLUSH_ERR)
+ dprintk("RPC: %s: frmr %p flushed\n", __func__, r);
+ else
+ pr_warn("RPC: %s: frmr %p error, status %s (%d)\n",
+ __func__, r, ib_wc_status_msg(wc->status), wc->status);
r->r.frmr.fr_state = FRMR_IS_STALE;
}
@@ -137,16 +268,19 @@ static int
frwr_op_init(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ struct ib_device *device = r_xprt->rx_ia.ri_device;
unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
int i;
+ spin_lock_init(&buf->rb_mwlock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
- i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
- dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
+ i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
+ i += 2; /* head + tail */
+ i *= buf->rb_max_requests; /* one set for each RPC slot */
+ dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
while (i--) {
struct rpcrdma_mw *r;
@@ -165,6 +299,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
r->mw_sendcompletion = frwr_sendcompletion;
+ r->r.frmr.fr_xprt = r_xprt;
}
return 0;
@@ -178,78 +313,103 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_id->device;
+ struct ib_device *device = ia->ri_device;
enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- struct rpcrdma_frmr *frmr = &mw->r.frmr;
- struct ib_mr *mr = frmr->fr_mr;
- struct ib_send_wr fastreg_wr, *bad_wr;
+ struct rpcrdma_mw *mw;
+ struct rpcrdma_frmr *frmr;
+ struct ib_mr *mr;
+ struct ib_reg_wr reg_wr;
+ struct ib_send_wr *bad_wr;
+ int rc, i, n, dma_nents;
u8 key;
- int len, pageoff;
- int i, rc;
- int seg_len;
- u64 pa;
- int page_no;
-
- pageoff = offset_in_page(seg1->mr_offset);
- seg1->mr_offset -= pageoff; /* start of page */
- seg1->mr_len += pageoff;
- len = -pageoff;
+
+ mw = seg1->rl_mw;
+ seg1->rl_mw = NULL;
+ do {
+ if (mw)
+ __frwr_queue_recovery(mw);
+ mw = rpcrdma_get_mw(r_xprt);
+ if (!mw)
+ return -ENOMEM;
+ } while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
+ frmr = &mw->r.frmr;
+ frmr->fr_state = FRMR_IS_VALID;
+ mr = frmr->fr_mr;
+
if (nsegs > ia->ri_max_frmr_depth)
nsegs = ia->ri_max_frmr_depth;
- for (page_no = i = 0; i < nsegs;) {
- rpcrdma_map_one(device, seg, direction);
- pa = seg->mr_dma;
- for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
- frmr->fr_pgl->page_list[page_no++] = pa;
- pa += PAGE_SIZE;
- }
- len += seg->mr_len;
+
+ for (i = 0; i < nsegs;) {
+ if (seg->mr_page)
+ sg_set_page(&frmr->sg[i],
+ seg->mr_page,
+ seg->mr_len,
+ offset_in_page(seg->mr_offset));
+ else
+ sg_set_buf(&frmr->sg[i], seg->mr_offset,
+ seg->mr_len);
+
++seg;
++i;
+
/* Check for holes */
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
- __func__, mw, i, len);
+ frmr->sg_nents = i;
- frmr->fr_state = FRMR_IS_VALID;
+ dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+ if (!dma_nents) {
+ pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
+ __func__, frmr->sg, frmr->sg_nents);
+ return -ENOMEM;
+ }
+
+ n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+ if (unlikely(n != frmr->sg_nents)) {
+ pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
+ __func__, frmr->fr_mr, n, frmr->sg_nents);
+ rc = n < 0 ? n : -EINVAL;
+ goto out_senderr;
+ }
+
+ dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
+ __func__, mw, frmr->sg_nents, mr->length);
- memset(&fastreg_wr, 0, sizeof(fastreg_wr));
- fastreg_wr.wr_id = (unsigned long)(void *)mw;
- fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
- fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
- fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fastreg_wr.wr.fast_reg.page_list_len = page_no;
- fastreg_wr.wr.fast_reg.length = len;
- fastreg_wr.wr.fast_reg.access_flags = writing ?
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
- IB_ACCESS_REMOTE_READ;
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
- fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+
+ reg_wr.wr.next = NULL;
+ reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg_wr.wr.wr_id = (uintptr_t)mw;
+ reg_wr.wr.num_sge = 0;
+ reg_wr.wr.send_flags = 0;
+ reg_wr.mr = mr;
+ reg_wr.key = mr->rkey;
+ reg_wr.access = writing ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_REMOTE_READ;
DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
+ rc = ib_post_send(ia->ri_id->qp, &reg_wr.wr, &bad_wr);
if (rc)
goto out_senderr;
+ seg1->mr_dir = direction;
+ seg1->rl_mw = mw;
seg1->mr_rkey = mr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- return i;
+ seg1->mr_base = mr->iova;
+ seg1->mr_nsegs = frmr->sg_nents;
+ seg1->mr_len = mr->length;
+
+ return frmr->sg_nents;
out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- ib_update_fast_reg_key(mr, --key);
- frmr->fr_state = FRMR_IS_INVALID;
- while (i--)
- rpcrdma_unmap_one(device, --seg);
+ ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
+ __frwr_queue_recovery(mw);
return rc;
}
@@ -261,78 +421,46 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_mw *mw = seg1->rl_mw;
+ struct rpcrdma_frmr *frmr = &mw->r.frmr;
struct ib_send_wr invalidate_wr, *bad_wr;
int rc, nsegs = seg->mr_nsegs;
- struct ib_device *device;
- seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+ dprintk("RPC: %s: FRMR %p\n", __func__, mw);
+
+ seg1->rl_mw = NULL;
+ frmr->fr_state = FRMR_IS_INVALID;
memset(&invalidate_wr, 0, sizeof(invalidate_wr));
- invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+ invalidate_wr.wr_id = (unsigned long)(void *)mw;
invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+ invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
+ ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
read_lock(&ia->ri_qplock);
- device = ia->ri_id->device;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(device, seg++);
rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
read_unlock(&ia->ri_qplock);
if (rc)
goto out_err;
+
+ rpcrdma_put_mw(r_xprt, mw);
return nsegs;
out_err:
- /* Force rpcrdma_buffer_get() to retry */
- seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
+ __frwr_queue_recovery(mw);
return nsegs;
}
-/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
- * an unusable state. Find FRMRs in this state and dereg / reg
- * each. FRMRs that are VALID and attached to an rpcrdma_req are
- * also torn down.
- *
- * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_frmr_external().
- */
-static void
-frwr_op_reset(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct ib_device *device = r_xprt->rx_ia.ri_id->device;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- struct rpcrdma_mw *r;
- int rc;
-
- list_for_each_entry(r, &buf->rb_all, mw_all) {
- if (r->r.frmr.fr_state == FRMR_IS_INVALID)
- continue;
-
- __frwr_release(r);
- rc = __frwr_init(r, pd, device, depth);
- if (rc) {
- dprintk("RPC: %s: mw %p left %s\n",
- __func__, r,
- (r->r.frmr.fr_state == FRMR_IS_STALE ?
- "stale" : "valid"));
- continue;
- }
-
- r->r.frmr.fr_state = FRMR_IS_INVALID;
- }
-}
-
static void
frwr_op_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_mw *r;
+ /* Ensure stale MWs for "buf" are no longer in flight */
+ flush_workqueue(frwr_recovery_wq);
+
while (!list_empty(&buf->rb_all)) {
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&r->mw_all);
@@ -347,7 +475,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
.ro_init = frwr_op_init,
- .ro_reset = frwr_op_reset,
.ro_destroy = frwr_op_destroy,
.ro_displayname = "frwr",
};
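Taken together, the comments and code above mean frwr_op_map now loops until it finds an MR in the INVALID state, handing anything STALE or still VALID off to the recovery workqueue. A self-contained sketch of that selection loop, with toy stand-ins for rpcrdma_get_mw and __frwr_queue_recovery (none of the names below are the kernel's):

    #include <stdio.h>
    #include <stddef.h>

    enum frmr_state { FRMR_IS_INVALID, FRMR_IS_VALID, FRMR_IS_STALE };

    struct mw { enum frmr_state state; };

    /* A toy pool standing in for rb_mws */
    static struct mw pool[] = {
            { FRMR_IS_STALE },      /* WR flushed by a disconnect */
            { FRMR_IS_VALID },      /* registered when the QP failed */
            { FRMR_IS_INVALID },    /* safe to reuse */
    };
    static size_t next_mw;

    static struct mw *get_mw(void)             /* stand-in for rpcrdma_get_mw */
    {
            size_t n = sizeof(pool) / sizeof(pool[0]);

            return next_mw < n ? &pool[next_mw++] : NULL;
    }

    static void queue_recovery(struct mw *mw)  /* stand-in for __frwr_queue_recovery */
    {
            printf("MR %ld queued for deferred reset\n", (long)(mw - pool));
    }

    /* Mirrors the do/while in the new frwr_op_map: recover and skip
     * anything that is not INVALID. */
    static struct mw *select_usable_mw(struct mw *mw)
    {
            do {
                    if (mw)
                            queue_recovery(mw);
                    mw = get_mw();
                    if (!mw)
                            return NULL;       /* caller returns -ENOMEM */
            } while (mw->state != FRMR_IS_INVALID);
            return mw;
    }

    int main(void)
    {
            struct mw *mw = select_usable_mw(NULL);

            if (mw)
                    printf("using MR %ld\n", (long)(mw - pool));
            return 0;
    }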
diff --git a/kernel/net/sunrpc/xprtrdma/module.c b/kernel/net/sunrpc/xprtrdma/module.c
new file mode 100644
index 000000000..560712bd9
--- /dev/null
+++ b/kernel/net/sunrpc/xprtrdma/module.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ */
+
+/* rpcrdma.ko module initialization
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc.");
+MODULE_DESCRIPTION("RPC/RDMA Transport");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("svcrdma");
+MODULE_ALIAS("xprtrdma");
+
+static void __exit rpc_rdma_cleanup(void)
+{
+ xprt_rdma_cleanup();
+ svc_rdma_cleanup();
+}
+
+static int __init rpc_rdma_init(void)
+{
+ int rc;
+
+ rc = svc_rdma_init();
+ if (rc)
+ goto out;
+
+ rc = xprt_rdma_init();
+ if (rc)
+ svc_rdma_cleanup();
+
+out:
+ return rc;
+}
+
+module_init(rpc_rdma_init);
+module_exit(rpc_rdma_cleanup);
diff --git a/kernel/net/sunrpc/xprtrdma/physical_ops.c b/kernel/net/sunrpc/xprtrdma/physical_ops.c
index ba518af16..617b76f22 100644
--- a/kernel/net/sunrpc/xprtrdma/physical_ops.c
+++ b/kernel/net/sunrpc/xprtrdma/physical_ops.c
@@ -23,6 +23,21 @@ static int
physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ struct ib_mr *mr;
+
+ /* Obtain an rkey to use for RPC data payloads.
+ */
+ mr = ib_get_dma_mr(ia->ri_pd,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ);
+ if (IS_ERR(mr)) {
+ pr_err("%s: ib_get_dma_mr for failed with %lX\n",
+ __func__, PTR_ERR(mr));
+ return -ENOMEM;
+ }
+
+ ia->ri_dma_mr = mr;
return 0;
}
@@ -50,9 +65,8 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- rpcrdma_map_one(ia->ri_id->device, seg,
- rpcrdma_data_dir(writing));
- seg->mr_rkey = ia->ri_bind_mem->rkey;
+ rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
+ seg->mr_rkey = ia->ri_dma_mr->rkey;
seg->mr_base = seg->mr_dma;
seg->mr_nsegs = 1;
return 1;
@@ -65,19 +79,11 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- read_lock(&ia->ri_qplock);
- rpcrdma_unmap_one(ia->ri_id->device, seg);
- read_unlock(&ia->ri_qplock);
-
+ rpcrdma_unmap_one(ia->ri_device, seg);
return 1;
}
static void
-physical_op_reset(struct rpcrdma_xprt *r_xprt)
-{
-}
-
-static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
}
@@ -88,7 +94,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
.ro_init = physical_op_init,
- .ro_reset = physical_op_reset,
.ro_destroy = physical_op_destroy,
.ro_displayname = "physical",
};
diff --git a/kernel/net/sunrpc/xprtrdma/rpc_rdma.c b/kernel/net/sunrpc/xprtrdma/rpc_rdma.c
index 2c53ea9e1..c10d96994 100644
--- a/kernel/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/kernel/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -71,6 +71,67 @@ static const char transfertypes[][12] = {
};
#endif
+/* The client can send a request inline as long as the RPCRDMA header
+ * plus the RPC call fit under the transport's inline limit. If the
+ * combined call message size exceeds that limit, the client must use
+ * the read chunk list for this operation.
+ */
+static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+{
+ unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+
+ return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+}
+
+/* The client can't know how large the actual reply will be. Thus it
+ * plans for the largest possible reply for that particular ULP
+ * operation. If the maximum combined reply message size exceeds that
+ * limit, the client must provide a write list or a reply chunk for
+ * this request.
+ */
+static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+{
+ unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+
+ return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+}
+
+static int
+rpcrdma_tail_pullup(struct xdr_buf *buf)
+{
+ size_t tlen = buf->tail[0].iov_len;
+ size_t skip = tlen & 3;
+
+ /* Do not include the tail if it is only an XDR pad */
+ if (tlen < 4)
+ return 0;
+
+ /* xdr_write_pages() adds a pad at the beginning of the tail
+ * if the content in "buf->pages" is unaligned. Force the
+ * tail's actual content to land at the next XDR position
+ * after the head instead.
+ */
+ if (skip) {
+ unsigned char *src, *dst;
+ unsigned int count;
+
+ src = buf->tail[0].iov_base;
+ dst = buf->head[0].iov_base;
+ dst += buf->head[0].iov_len;
+
+ src += skip;
+ tlen -= skip;
+
+ dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n",
+ __func__, skip, dst, src, tlen);
+
+ for (count = tlen; count; count--)
+ *dst++ = *src++;
+ }
+
+ return tlen;
+}
+
/*
* Chunk assembly from upper layer xdr_buf.
*
@@ -122,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
if (len && n == nsegs)
return -EIO;
+ /* When encoding the read list, the tail is always sent inline */
+ if (type == rpcrdma_readch)
+ return n;
+
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
* xdr pad bytes, saving the server an RDMA operation. */
@@ -284,9 +349,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
return (unsigned char *)iptr - (unsigned char *)headerp;
out:
- if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
- return n;
-
for (pos = 0; nchunks--;)
pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
&req->rl_segments[pos]);
@@ -300,8 +362,7 @@ out:
* pre-registered memory buffer for this request. For small amounts
* of data, this is efficient. The cutoff value is tunable.
*/
-static int
-rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
{
int i, npages, curlen;
int copy_len;
@@ -313,16 +374,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
destp = rqst->rq_svec[0].iov_base;
curlen = rqst->rq_svec[0].iov_len;
destp += curlen;
- /*
- * Do optional padding where it makes sense. Alignment of write
- * payload can help the server, if our setting is accurate.
- */
- pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
- if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
- pad = 0; /* don't pad this request */
- dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
- __func__, pad, destp, rqst->rq_slen, curlen);
+ dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
+ __func__, destp, rqst->rq_slen, curlen);
copy_len = rqst->rq_snd_buf.page_len;
@@ -358,7 +412,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
page_base = 0;
}
/* header now contains entire send message */
- return pad;
}
/*
@@ -383,11 +436,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
char *base;
- size_t rpclen, padlen;
+ size_t rpclen;
ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
+ return rpcrdma_bc_marshal_reply(rqst);
+#endif
+
/*
* rpclen gets amount of data in first buffer, which is the
* pre-registered buffer.
@@ -405,28 +463,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/*
* Chunks needed for results?
*
+ * o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
- * return as inline (but see later).
+ * return as inline.
* o Large non-read ops return as a single reply chunk.
- * o Large read ops return data as write chunk(s), header as inline.
- *
- * Note: the NFS code sending down multiple result segments implies
- * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
- */
-
- /*
- * This code can handle read chunks, write chunks OR reply
- * chunks -- only one type. If the request is too big to fit
- * inline, then we will choose read chunks. If the request is
- * a READ, then use write chunks to separate the file data
- * into pages; otherwise use reply chunks.
*/
- if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
- wtype = rpcrdma_noch;
- else if (rqst->rq_rcv_buf.page_len == 0)
- wtype = rpcrdma_replych;
- else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech;
+ else if (rpcrdma_results_inline(rqst))
+ wtype = rpcrdma_noch;
else
wtype = rpcrdma_replych;
@@ -435,21 +480,25 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*
* o If the total request is under the inline threshold, all ops
* are sent as inline.
- * o Large non-write ops are sent with the entire message as a
- * single read chunk (protocol 0-position special case).
* o Large write ops transmit data as read chunk(s), header as
* inline.
+ * o Large non-write ops are sent with the entire message as a
+ * single read chunk (protocol 0-position special case).
*
- * Note: the NFS code sending down multiple argument segments
- * implies the op is a write.
- * TBD check NFSv4 setacl
+ * This assumes that the upper layer does not present a request
+ * that both has a data payload, and whose non-data arguments
+ * by themselves are larger than the inline threshold.
*/
- if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ if (rpcrdma_args_inline(rqst)) {
rtype = rpcrdma_noch;
- else if (rqst->rq_snd_buf.page_len == 0)
- rtype = rpcrdma_areadch;
- else
+ } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
+ } else {
+ r_xprt->rx_stats.nomsg_call_count++;
+ headerp->rm_type = htonl(RDMA_NOMSG);
+ rtype = rpcrdma_areadch;
+ rpclen = 0;
+ }
/* The following simplification is not true forever */
if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
@@ -461,7 +510,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
}
hdrlen = RPCRDMA_HDRLEN_MIN;
- padlen = 0;
/*
* Pull up any extra send data into the preregistered buffer.
@@ -470,45 +518,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rtype == rpcrdma_noch) {
- padlen = rpcrdma_inline_pullup(rqst,
- RPCRDMA_INLINE_PAD_VALUE(rqst));
-
- if (padlen) {
- headerp->rm_type = rdma_msgp;
- headerp->rm_body.rm_padded.rm_align =
- cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
- headerp->rm_body.rm_padded.rm_thresh =
- cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
- headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
- headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
- headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
- hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
- if (wtype != rpcrdma_noch) {
- dprintk("RPC: %s: invalid chunk list\n",
- __func__);
- return -EIO;
- }
- } else {
- headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
- /* new length after pullup */
- rpclen = rqst->rq_svec[0].iov_len;
- /*
- * Currently we try to not actually use read inline.
- * Reply chunks have the desirable property that
- * they land, packed, directly in the target buffers
- * without headers, so they require no fixup. The
- * additional RDMA Write op sends the same amount
- * of data, streams on-the-wire and adds no overhead
- * on receive. Therefore, we request a reply chunk
- * for non-writes wherever feasible and efficient.
- */
- if (wtype == rpcrdma_noch)
- wtype = rpcrdma_replych;
- }
- }
+ rpcrdma_inline_pullup(rqst);
+ headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+ /* new length after pullup */
+ rpclen = rqst->rq_svec[0].iov_len;
+ } else if (rtype == rpcrdma_readch)
+ rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
if (rtype != rpcrdma_noch) {
hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
headerp, rtype);
@@ -521,9 +539,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
if (hdrlen < 0)
return hdrlen;
- dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+ dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+ __func__, transfertypes[wtype], hdrlen, rpclen,
headerp, base, rdmab_lkey(req->rl_rdmabuf));
/*
@@ -537,26 +555,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
req->rl_send_iov[0].length = hdrlen;
req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
+ req->rl_niovs = 1;
+ if (rtype == rpcrdma_areadch)
+ return 0;
+
req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
req->rl_send_iov[1].length = rpclen;
req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
req->rl_niovs = 2;
-
- if (padlen) {
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
- req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
- req->rl_send_iov[2].length = padlen;
- req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
-
- req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
- req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
- req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
-
- req->rl_niovs = 4;
- }
-
return 0;
}
@@ -709,6 +716,37 @@ rpcrdma_connect_worker(struct work_struct *work)
spin_unlock_bh(&xprt->transport_lock);
}
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+/* By convention, backchannel calls arrive via rdma_msg type
+ * messages, and never populate the chunk lists. This makes
+ * the RPC/RDMA header small and fixed in size, so it is
+ * straightforward to check the RPC header's direction field.
+ */
+static bool
+rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
+{
+ __be32 *p = (__be32 *)headerp;
+
+ if (headerp->rm_type != rdma_msg)
+ return false;
+ if (headerp->rm_body.rm_chunks[0] != xdr_zero)
+ return false;
+ if (headerp->rm_body.rm_chunks[1] != xdr_zero)
+ return false;
+ if (headerp->rm_body.rm_chunks[2] != xdr_zero)
+ return false;
+
+ /* sanity */
+ if (p[7] != headerp->rm_xid)
+ return false;
+ /* call direction */
+ if (p[8] != cpu_to_be32(RPC_CALL))
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
/*
* This function is called when an async event is posted to
* the connection which changes the connection state. All it
@@ -721,8 +759,8 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
schedule_delayed_work(&ep->rep_connect_worker, 0);
}
-/*
- * Called as a tasklet to do req/reply match and complete a request
+/* Process received RPC/RDMA messages.
+ *
* Errors must result in the RPC task either being awakened, or
* allowed to timeout, to discover the errors at that time.
*/
@@ -732,60 +770,39 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
struct rpcrdma_msg *headerp;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
- struct rpc_xprt *xprt = rep->rr_xprt;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
__be32 *iptr;
int rdmalen, status;
unsigned long cwnd;
u32 credits;
- /* Check status. If bad, signal disconnect and return rep to pool */
- if (rep->rr_len == ~0U) {
- rpcrdma_recv_buffer_put(rep);
- if (r_xprt->rx_ep.rep_connected == 1) {
- r_xprt->rx_ep.rep_connected = -EIO;
- rpcrdma_conn_func(&r_xprt->rx_ep);
- }
- return;
- }
- if (rep->rr_len < RPCRDMA_HDRLEN_MIN) {
- dprintk("RPC: %s: short/invalid reply\n", __func__);
- goto repost;
- }
+ dprintk("RPC: %s: incoming rep %p\n", __func__, rep);
+
+ if (rep->rr_len == RPCRDMA_BAD_LEN)
+ goto out_badstatus;
+ if (rep->rr_len < RPCRDMA_HDRLEN_MIN)
+ goto out_shortreply;
+
headerp = rdmab_to_msg(rep->rr_rdmabuf);
- if (headerp->rm_vers != rpcrdma_version) {
- dprintk("RPC: %s: invalid version %d\n",
- __func__, be32_to_cpu(headerp->rm_vers));
- goto repost;
- }
+ if (headerp->rm_vers != rpcrdma_version)
+ goto out_badversion;
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ if (rpcrdma_is_bcall(headerp))
+ goto out_bcall;
+#endif
- /* Get XID and try for a match. */
- spin_lock(&xprt->transport_lock);
+ /* Match incoming rpcrdma_rep to an rpcrdma_req to
+ * get context for handling any incoming chunks.
+ */
+ spin_lock_bh(&xprt->transport_lock);
rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
- if (rqst == NULL) {
- spin_unlock(&xprt->transport_lock);
- dprintk("RPC: %s: reply 0x%p failed "
- "to match any request xid 0x%08x len %d\n",
- __func__, rep, be32_to_cpu(headerp->rm_xid),
- rep->rr_len);
-repost:
- r_xprt->rx_stats.bad_reply_count++;
- rep->rr_func = rpcrdma_reply_handler;
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
- rpcrdma_recv_buffer_put(rep);
-
- return;
- }
+ if (!rqst)
+ goto out_nomatch;
- /* get request object */
req = rpcr_to_rdmar(rqst);
- if (req->rl_reply) {
- spin_unlock(&xprt->transport_lock);
- dprintk("RPC: %s: duplicate reply 0x%p to RPC "
- "request 0x%p: xid 0x%08x\n", __func__, rep, req,
- be32_to_cpu(headerp->rm_xid));
- goto repost;
- }
+ if (req->rl_reply)
+ goto out_duplicate;
dprintk("RPC: %s: reply 0x%p completes request 0x%p\n"
" RPC request 0x%p xid 0x%08x\n",
@@ -882,8 +899,50 @@ badheader:
if (xprt->cwnd > cwnd)
xprt_release_rqst_cong(rqst->rq_task);
+ xprt_complete_rqst(rqst->rq_task, status);
+ spin_unlock_bh(&xprt->transport_lock);
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
__func__, xprt, rqst, status);
- xprt_complete_rqst(rqst->rq_task, status);
- spin_unlock(&xprt->transport_lock);
+ return;
+
+out_badstatus:
+ rpcrdma_recv_buffer_put(rep);
+ if (r_xprt->rx_ep.rep_connected == 1) {
+ r_xprt->rx_ep.rep_connected = -EIO;
+ rpcrdma_conn_func(&r_xprt->rx_ep);
+ }
+ return;
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+out_bcall:
+ rpcrdma_bc_receive_call(r_xprt, rep);
+ return;
+#endif
+
+out_shortreply:
+ dprintk("RPC: %s: short/invalid reply\n", __func__);
+ goto repost;
+
+out_badversion:
+ dprintk("RPC: %s: invalid version %d\n",
+ __func__, be32_to_cpu(headerp->rm_vers));
+ goto repost;
+
+out_nomatch:
+ spin_unlock_bh(&xprt->transport_lock);
+ dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n",
+ __func__, be32_to_cpu(headerp->rm_xid),
+ rep->rr_len);
+ goto repost;
+
+out_duplicate:
+ spin_unlock_bh(&xprt->transport_lock);
+ dprintk("RPC: %s: "
+ "duplicate reply %p to RPC request %p: xid 0x%08x\n",
+ __func__, rep, req, be32_to_cpu(headerp->rm_xid));
+
+repost:
+ r_xprt->rx_stats.bad_reply_count++;
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ rpcrdma_recv_buffer_put(rep);
}
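The two new helpers at the top of this file reduce the chunk-type decision to a single size comparison: a call goes inline when the fixed RPC/RDMA header plus the send buffer fits under the write threshold, and a reply is expected inline when the header plus the receive buffer fits under the read threshold. A small sketch with assumed example values; the real limits come from the mount's inline settings via RPCRDMA_INLINE_WRITE_THRESHOLD/READ_THRESHOLD:

    #include <stdio.h>
    #include <stdbool.h>

    #define HDRLEN_MIN        28     /* assumed fixed RPC/RDMA header size */
    #define INLINE_THRESHOLD  1024   /* assumed inline limit */

    static bool args_inline(unsigned int snd_len)
    {
            return HDRLEN_MIN + snd_len <= INLINE_THRESHOLD;
    }

    static bool results_inline(unsigned int rcv_buflen)
    {
            return HDRLEN_MIN + rcv_buflen <= INLINE_THRESHOLD;
    }

    int main(void)
    {
            printf("small call inline?  %d\n", args_inline(200));      /* 1 */
            printf("8k write inline?    %d\n", args_inline(8192));     /* 0: read chunks */
            printf("large reply inline? %d\n", results_inline(32768)); /* 0: write list or reply chunk */
            return 0;
    }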
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma.c b/kernel/net/sunrpc/xprtrdma/svc_rdma.c
index c1b627026..1b7051bdb 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma.c
@@ -38,8 +38,7 @@
*
* Author: Tom Tucker <tom@opengridcomputing.com>
*/
-#include <linux/module.h>
-#include <linux/init.h>
+
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
@@ -240,6 +239,9 @@ void svc_rdma_cleanup(void)
unregister_sysctl_table(svcrdma_table_header);
svcrdma_table_header = NULL;
}
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ svc_unreg_xprt_class(&svc_rdma_bc_class);
+#endif
svc_unreg_xprt_class(&svc_rdma_class);
kmem_cache_destroy(svc_rdma_map_cachep);
kmem_cache_destroy(svc_rdma_ctxt_cachep);
@@ -287,6 +289,9 @@ int svc_rdma_init(void)
/* Register RDMA with the SVC transport switch */
svc_reg_xprt_class(&svc_rdma_class);
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ svc_reg_xprt_class(&svc_rdma_bc_class);
+#endif
return 0;
err1:
kmem_cache_destroy(svc_rdma_map_cachep);
@@ -295,8 +300,3 @@ int svc_rdma_init(void)
destroy_workqueue(svc_rdma_wq);
return -ENOMEM;
}
-MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
-MODULE_DESCRIPTION("SVC RDMA Transport");
-MODULE_LICENSE("Dual BSD/GPL");
-module_init(svc_rdma_init);
-module_exit(svc_rdma_cleanup);
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index b681855cf..e2fca7617 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -50,12 +50,12 @@
/*
* Decodes a read chunk list. The expected format is as follows:
* descrim : xdr_one
- * position : u32 offset into XDR stream
- * handle : u32 RKEY
+ * position : __be32 offset into XDR stream
+ * handle : __be32 RKEY
* . . .
* end-of-list: xdr_zero
*/
-static u32 *decode_read_list(u32 *va, u32 *vaend)
+static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
{
struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
@@ -67,20 +67,20 @@ static u32 *decode_read_list(u32 *va, u32 *vaend)
}
ch++;
}
- return (u32 *)&ch->rc_position;
+ return &ch->rc_position;
}
/*
* Decodes a write chunk list. The expected format is as follows:
* descrim : xdr_one
* nchunks : <count>
- * handle : u32 RKEY ---+
- * length : u32 <len of segment> |
+ * handle : __be32 RKEY ---+
+ * length : __be32 <len of segment> |
* offset : remote va + <count>
* . . . |
* ---+
*/
-static u32 *decode_write_list(u32 *va, u32 *vaend)
+static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
{
unsigned long start, end;
int nchunks;
@@ -90,14 +90,14 @@ static u32 *decode_write_list(u32 *va, u32 *vaend)
/* Check for not write-array */
if (ary->wc_discrim == xdr_zero)
- return (u32 *)&ary->wc_nchunks;
+ return &ary->wc_nchunks;
if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
(unsigned long)vaend) {
dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
return NULL;
}
- nchunks = ntohl(ary->wc_nchunks);
+ nchunks = be32_to_cpu(ary->wc_nchunks);
start = (unsigned long)&ary->wc_array[0];
end = (unsigned long)vaend;
@@ -112,10 +112,10 @@ static u32 *decode_write_list(u32 *va, u32 *vaend)
* rs_length is the 2nd 4B field in wc_target and taking its
* address skips the list terminator
*/
- return (u32 *)&ary->wc_array[nchunks].wc_target.rs_length;
+ return &ary->wc_array[nchunks].wc_target.rs_length;
}
-static u32 *decode_reply_array(u32 *va, u32 *vaend)
+static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
{
unsigned long start, end;
int nchunks;
@@ -124,14 +124,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend)
/* Check for no reply-array */
if (ary->wc_discrim == xdr_zero)
- return (u32 *)&ary->wc_nchunks;
+ return &ary->wc_nchunks;
if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
(unsigned long)vaend) {
dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
return NULL;
}
- nchunks = ntohl(ary->wc_nchunks);
+ nchunks = be32_to_cpu(ary->wc_nchunks);
start = (unsigned long)&ary->wc_array[0];
end = (unsigned long)vaend;
@@ -142,15 +142,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend)
ary, nchunks, vaend);
return NULL;
}
- return (u32 *)&ary->wc_array[nchunks];
+ return (__be32 *)&ary->wc_array[nchunks];
}
int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
struct svc_rqst *rqstp)
{
struct rpcrdma_msg *rmsgp = NULL;
- u32 *va;
- u32 *vaend;
+ __be32 *va, *vaend;
u32 hdr_len;
rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
@@ -162,22 +161,17 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
return -EINVAL;
}
- /* Decode the header */
- rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
- rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
- rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
- rmsgp->rm_type = ntohl(rmsgp->rm_type);
-
- if (rmsgp->rm_vers != RPCRDMA_VERSION)
+ if (rmsgp->rm_vers != rpcrdma_version)
return -ENOSYS;
/* Pull in the extra for the padded case and bump our pointer */
- if (rmsgp->rm_type == RDMA_MSGP) {
+ if (rmsgp->rm_type == rdma_msgp) {
int hdrlen;
+
rmsgp->rm_body.rm_padded.rm_align =
- ntohl(rmsgp->rm_body.rm_padded.rm_align);
+ be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
rmsgp->rm_body.rm_padded.rm_thresh =
- ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
+ be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
rqstp->rq_arg.head[0].iov_base = va;
@@ -192,7 +186,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
* chunk list and a reply chunk list.
*/
va = &rmsgp->rm_body.rm_chunks[0];
- vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+ vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
va = decode_read_list(va, vaend);
if (!va)
return -EINVAL;
@@ -211,76 +205,20 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
return hdr_len;
}
-int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
-{
- struct rpcrdma_msg *rmsgp = NULL;
- struct rpcrdma_read_chunk *ch;
- struct rpcrdma_write_array *ary;
- u32 *va;
- u32 hdrlen;
-
- dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
- rqstp);
- rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
-
- /* Pull in the extra for the padded case and bump our pointer */
- if (rmsgp->rm_type == RDMA_MSGP) {
- va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
- rqstp->rq_arg.head[0].iov_base = va;
- hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
- rqstp->rq_arg.head[0].iov_len -= hdrlen;
- return hdrlen;
- }
-
- /*
- * Skip all chunks to find RPC msg. These were previously processed
- */
- va = &rmsgp->rm_body.rm_chunks[0];
-
- /* Skip read-list */
- for (ch = (struct rpcrdma_read_chunk *)va;
- ch->rc_discrim != xdr_zero; ch++);
- va = (u32 *)&ch->rc_position;
-
- /* Skip write-list */
- ary = (struct rpcrdma_write_array *)va;
- if (ary->wc_discrim == xdr_zero)
- va = (u32 *)&ary->wc_nchunks;
- else
- /*
- * rs_length is the 2nd 4B field in wc_target and taking its
- * address skips the list terminator
- */
- va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
-
- /* Skip reply-array */
- ary = (struct rpcrdma_write_array *)va;
- if (ary->wc_discrim == xdr_zero)
- va = (u32 *)&ary->wc_nchunks;
- else
- va = (u32 *)&ary->wc_array[ary->wc_nchunks];
-
- rqstp->rq_arg.head[0].iov_base = va;
- hdrlen = (unsigned long)va - (unsigned long)rmsgp;
- rqstp->rq_arg.head[0].iov_len -= hdrlen;
-
- return hdrlen;
-}
-
int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rmsgp,
- enum rpcrdma_errcode err, u32 *va)
+ enum rpcrdma_errcode err, __be32 *va)
{
- u32 *startp = va;
+ __be32 *startp = va;
- *va++ = htonl(rmsgp->rm_xid);
- *va++ = htonl(rmsgp->rm_vers);
- *va++ = htonl(xprt->sc_max_requests);
- *va++ = htonl(RDMA_ERROR);
- *va++ = htonl(err);
+ *va++ = rmsgp->rm_xid;
+ *va++ = rmsgp->rm_vers;
+ *va++ = cpu_to_be32(xprt->sc_max_requests);
+ *va++ = rdma_error;
+ *va++ = cpu_to_be32(err);
if (err == ERR_VERS) {
- *va++ = htonl(RPCRDMA_VERSION);
- *va++ = htonl(RPCRDMA_VERSION);
+ *va++ = rpcrdma_version;
+ *va++ = rpcrdma_version;
}
return (int)((unsigned long)va - (unsigned long)startp);
@@ -297,7 +235,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
&rmsgp->rm_body.rm_chunks[1];
if (wr_ary->wc_discrim)
wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
+ &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
wc_target.rs_length;
else
wr_ary = (struct rpcrdma_write_array *)
@@ -306,7 +244,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
/* skip reply array */
if (wr_ary->wc_discrim)
wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
+ &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
else
wr_ary = (struct rpcrdma_write_array *)
&wr_ary->wc_nchunks;
@@ -325,7 +263,7 @@ void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
ary = (struct rpcrdma_write_array *)
&rmsgp->rm_body.rm_chunks[1];
ary->wc_discrim = xdr_one;
- ary->wc_nchunks = htonl(chunks);
+ ary->wc_nchunks = cpu_to_be32(chunks);
/* write-list terminator */
ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
@@ -338,7 +276,7 @@ void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
int chunks)
{
ary->wc_discrim = xdr_one;
- ary->wc_nchunks = htonl(chunks);
+ ary->wc_nchunks = cpu_to_be32(chunks);
}
void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
@@ -350,7 +288,7 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
seg->rs_handle = rs_handle;
seg->rs_offset = rs_offset;
- seg->rs_length = htonl(write_len);
+ seg->rs_length = cpu_to_be32(write_len);
}
void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
@@ -358,10 +296,10 @@ void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rdma_resp,
enum rpcrdma_proc rdma_type)
{
- rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
- rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
- rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
- rdma_resp->rm_type = htonl(rdma_type);
+ rdma_resp->rm_xid = rdma_argp->rm_xid;
+ rdma_resp->rm_vers = rdma_argp->rm_vers;
+ rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
+ rdma_resp->rm_type = cpu_to_be32(rdma_type);
/* Encode <nul> chunks lists */
rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
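With the decode helpers above now operating on __be32 throughout, chunk counts and segment fields are byte-swapped only at the point of use. A userspace sketch of walking a write array the same way; the wire layout shown (discrim, nchunks, then handle/length/64-bit offset per segment) follows the comments above, and ntohl stands in for be32_to_cpu:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    int main(void)
    {
            /* A two-segment write list as it might appear on the wire:
             * discrim, nchunks, then (handle, length, offset hi, offset lo)
             * for each segment. */
            uint32_t wire[] = {
                    htonl(1), htonl(2),
                    htonl(0x1234), htonl(4096), 0, 0,
                    htonl(0x5678), htonl(8192), 0, 0,
            };
            uint32_t i, nchunks = ntohl(wire[1]);

            for (i = 0; i < nchunks; i++) {
                    const uint32_t *seg = &wire[2 + i * 4];

                    printf("chunk %u: rkey 0x%x, %u bytes\n",
                           i, ntohl(seg[0]), ntohl(seg[1]));
            }
            return 0;
    }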
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f9f13a32d..ff4f01e52 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -85,7 +85,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
- if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG)
+ if (rmsgp->rm_type == rdma_nomsg)
rqstp->rq_arg.pages = &rqstp->rq_pages[0];
else
rqstp->rq_arg.pages = &rqstp->rq_pages[1];
@@ -115,15 +115,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
rqstp->rq_arg.tail[0].iov_len = 0;
}
-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
-{
- if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
- RDMA_TRANSPORT_IWARP)
- return 1;
- else
- return min_t(int, sge_count, xprt->sc_max_sge);
-}
-
/* Issue an RDMA_READ using the local lkey to map the data sink */
int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
struct svc_rqst *rqstp,
@@ -135,7 +126,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
u64 rs_offset,
bool last)
{
- struct ib_send_wr read_wr;
+ struct ib_rdma_wr read_wr;
int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
int ret, read, pno;
@@ -144,9 +135,9 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
ctxt->direction = DMA_FROM_DEVICE;
ctxt->read_hdr = head;
- pages_needed =
- min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
- read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+ pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd);
+ read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset,
+ rs_length);
for (pno = 0; pno < pages_needed; pno++) {
int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
@@ -189,16 +180,16 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
memset(&read_wr, 0, sizeof(read_wr));
- read_wr.wr_id = (unsigned long)ctxt;
- read_wr.opcode = IB_WR_RDMA_READ;
- ctxt->wr_op = read_wr.opcode;
- read_wr.send_flags = IB_SEND_SIGNALED;
- read_wr.wr.rdma.rkey = rs_handle;
- read_wr.wr.rdma.remote_addr = rs_offset;
- read_wr.sg_list = ctxt->sge;
- read_wr.num_sge = pages_needed;
-
- ret = svc_rdma_send(xprt, &read_wr);
+ read_wr.wr.wr_id = (unsigned long)ctxt;
+ read_wr.wr.opcode = IB_WR_RDMA_READ;
+ ctxt->wr_op = read_wr.wr.opcode;
+ read_wr.wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.rkey = rs_handle;
+ read_wr.remote_addr = rs_offset;
+ read_wr.wr.sg_list = ctxt->sge;
+ read_wr.wr.num_sge = pages_needed;
+
+ ret = svc_rdma_send(xprt, &read_wr.wr);
if (ret) {
pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -228,14 +219,14 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
u64 rs_offset,
bool last)
{
- struct ib_send_wr read_wr;
+ struct ib_rdma_wr read_wr;
struct ib_send_wr inv_wr;
- struct ib_send_wr fastreg_wr;
+ struct ib_reg_wr reg_wr;
u8 key;
- int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
+ int nents = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt);
- int ret, read, pno;
+ int ret, read, pno, dma_nents, n;
u32 pg_off = *page_offset;
u32 pg_no = *page_no;
@@ -244,16 +235,14 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
ctxt->direction = DMA_FROM_DEVICE;
ctxt->frmr = frmr;
- pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len);
- read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+ nents = min_t(unsigned int, nents, xprt->sc_frmr_pg_list_len);
+ read = min_t(int, (nents << PAGE_SHIFT) - *page_offset, rs_length);
- frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);
frmr->direction = DMA_FROM_DEVICE;
frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
- frmr->map_len = pages_needed << PAGE_SHIFT;
- frmr->page_list_len = pages_needed;
+ frmr->sg_nents = nents;
- for (pno = 0; pno < pages_needed; pno++) {
+ for (pno = 0; pno < nents; pno++) {
int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
@@ -261,17 +250,12 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
head->arg.len += len;
if (!pg_off)
head->count++;
+
+ sg_set_page(&frmr->sg[pno], rqstp->rq_arg.pages[pg_no],
+ len, pg_off);
+
rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
rqstp->rq_next_page = rqstp->rq_respages + 1;
- frmr->page_list->page_list[pno] =
- ib_dma_map_page(xprt->sc_cm_id->device,
- head->arg.pages[pg_no], 0,
- PAGE_SIZE, DMA_FROM_DEVICE);
- ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
- frmr->page_list->page_list[pno]);
- if (ret)
- goto err;
- atomic_inc(&xprt->sc_dma_used);
/* adjust offset and wrap to next page if needed */
pg_off += len;
@@ -287,43 +271,57 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
else
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ dma_nents = ib_dma_map_sg(xprt->sc_cm_id->device,
+ frmr->sg, frmr->sg_nents,
+ frmr->direction);
+ if (!dma_nents) {
+ pr_err("svcrdma: failed to dma map sg %p\n",
+ frmr->sg);
+ return -ENOMEM;
+ }
+ atomic_inc(&xprt->sc_dma_used);
+
+ n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+ if (unlikely(n != frmr->sg_nents)) {
+ pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n",
+ frmr->mr, n, frmr->sg_nents);
+ return n < 0 ? n : -EINVAL;
+ }
+
/* Bump the key */
key = (u8)(frmr->mr->lkey & 0x000000FF);
ib_update_fast_reg_key(frmr->mr, ++key);
- ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset;
+ ctxt->sge[0].addr = frmr->mr->iova;
ctxt->sge[0].lkey = frmr->mr->lkey;
- ctxt->sge[0].length = read;
+ ctxt->sge[0].length = frmr->mr->length;
ctxt->count = 1;
ctxt->read_hdr = head;
- /* Prepare FASTREG WR */
- memset(&fastreg_wr, 0, sizeof(fastreg_wr));
- fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- fastreg_wr.send_flags = IB_SEND_SIGNALED;
- fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
- fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
- fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
- fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fastreg_wr.wr.fast_reg.length = frmr->map_len;
- fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
- fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
- fastreg_wr.next = &read_wr;
+ /* Prepare REG WR */
+ reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg_wr.wr.wr_id = 0;
+ reg_wr.wr.send_flags = IB_SEND_SIGNALED;
+ reg_wr.wr.num_sge = 0;
+ reg_wr.mr = frmr->mr;
+ reg_wr.key = frmr->mr->lkey;
+ reg_wr.access = frmr->access_flags;
+ reg_wr.wr.next = &read_wr.wr;
/* Prepare RDMA_READ */
memset(&read_wr, 0, sizeof(read_wr));
- read_wr.send_flags = IB_SEND_SIGNALED;
- read_wr.wr.rdma.rkey = rs_handle;
- read_wr.wr.rdma.remote_addr = rs_offset;
- read_wr.sg_list = ctxt->sge;
- read_wr.num_sge = 1;
+ read_wr.wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.rkey = rs_handle;
+ read_wr.remote_addr = rs_offset;
+ read_wr.wr.sg_list = ctxt->sge;
+ read_wr.wr.num_sge = 1;
if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) {
- read_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
- read_wr.wr_id = (unsigned long)ctxt;
- read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
+ read_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ read_wr.wr.wr_id = (unsigned long)ctxt;
+ read_wr.wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
} else {
- read_wr.opcode = IB_WR_RDMA_READ;
- read_wr.next = &inv_wr;
+ read_wr.wr.opcode = IB_WR_RDMA_READ;
+ read_wr.wr.next = &inv_wr;
/* Prepare invalidate */
memset(&inv_wr, 0, sizeof(inv_wr));
inv_wr.wr_id = (unsigned long)ctxt;
@@ -331,10 +329,10 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
}
- ctxt->wr_op = read_wr.opcode;
+ ctxt->wr_op = read_wr.wr.opcode;
/* Post the chain */
- ret = svc_rdma_send(xprt, &fastreg_wr);
+ ret = svc_rdma_send(xprt, &reg_wr.wr);
if (ret) {
pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -348,7 +346,8 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
atomic_inc(&rdma_stat_read);
return ret;
err:
- svc_rdma_unmap_dma(ctxt);
+ ib_dma_unmap_sg(xprt->sc_cm_id->device,
+ frmr->sg, frmr->sg_nents, frmr->direction);
svc_rdma_put_context(ctxt, 0);
svc_rdma_put_frmr(xprt, frmr);
return ret;
@@ -541,7 +540,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.page_base = head->arg.page_base;
/* rq_respages starts after the last arg page */
- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+ rqstp->rq_respages = &rqstp->rq_pages[page_no];
rqstp->rq_next_page = rqstp->rq_respages + 1;
/* Rebuild rq_arg head and tail. */
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 7de33d1af..969a1ab75 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -136,6 +136,79 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
return dma_addr;
}
+/* Returns the address of the first read chunk or <nul> if no read chunk
+ * is present
+ */
+struct rpcrdma_read_chunk *
+svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *ch =
+ (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+
+ if (ch->rc_discrim == xdr_zero)
+ return NULL;
+ return ch;
+}
+
+/* Returns the address of the first read write array element or <nul>
+ * if no write array list is present
+ */
+static struct rpcrdma_write_array *
+svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
+{
+ if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
+ rmsgp->rm_body.rm_chunks[1] == xdr_zero)
+ return NULL;
+ return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+}
+
+/* Returns the address of the first reply array element or <nul> if no
+ * reply array is present
+ */
+static struct rpcrdma_write_array *
+svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *rch;
+ struct rpcrdma_write_array *wr_ary;
+ struct rpcrdma_write_array *rp_ary;
+
+ /* XXX: Need to fix when reply chunk may occur with read list
+ * and/or write list.
+ */
+ if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
+ rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+ return NULL;
+
+ rch = svc_rdma_get_read_chunk(rmsgp);
+ if (rch) {
+ while (rch->rc_discrim != xdr_zero)
+ rch++;
+
+ /* The reply chunk follows an empty write array located
+ * at 'rc_position' here. The reply array is at rc_target.
+ */
+ rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
+ goto found_it;
+ }
+
+ wr_ary = svc_rdma_get_write_array(rmsgp);
+ if (wr_ary) {
+ int chunk = be32_to_cpu(wr_ary->wc_nchunks);
+
+ rp_ary = (struct rpcrdma_write_array *)
+ &wr_ary->wc_array[chunk].wc_target.rs_length;
+ goto found_it;
+ }
+
+ /* No read list, no write list */
+ rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2];
+
+ found_it:
+ if (rp_ary->wc_discrim == xdr_zero)
+ return NULL;
+ return rp_ary;
+}
+
/* Assumptions:
* - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
*/
@@ -144,7 +217,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
u32 xdr_off, int write_len,
struct svc_rdma_req_map *vec)
{
- struct ib_send_wr write_wr;
+ struct ib_rdma_wr write_wr;
struct ib_sge *sge;
int xdr_sge_no;
int sge_no;
@@ -209,17 +282,17 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
/* Prepare WRITE WR */
memset(&write_wr, 0, sizeof write_wr);
ctxt->wr_op = IB_WR_RDMA_WRITE;
- write_wr.wr_id = (unsigned long)ctxt;
- write_wr.sg_list = &sge[0];
- write_wr.num_sge = sge_no;
- write_wr.opcode = IB_WR_RDMA_WRITE;
- write_wr.send_flags = IB_SEND_SIGNALED;
- write_wr.wr.rdma.rkey = rmr;
- write_wr.wr.rdma.remote_addr = to;
+ write_wr.wr.wr_id = (unsigned long)ctxt;
+ write_wr.wr.sg_list = &sge[0];
+ write_wr.wr.num_sge = sge_no;
+ write_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ write_wr.wr.send_flags = IB_SEND_SIGNALED;
+ write_wr.rkey = rmr;
+ write_wr.remote_addr = to;
/* Post It */
atomic_inc(&rdma_stat_write);
- if (svc_rdma_send(xprt, &write_wr))
+ if (svc_rdma_send(xprt, &write_wr.wr))
goto err;
return write_len - bc;
err:
@@ -240,6 +313,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
u32 xdr_off;
int chunk_off;
int chunk_no;
+ int nchunks;
struct rpcrdma_write_array *arg_ary;
struct rpcrdma_write_array *res_ary;
int ret;
@@ -251,14 +325,15 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
&rdma_resp->rm_body.rm_chunks[1];
/* Write chunks start at the pagelist */
+ nchunks = be32_to_cpu(arg_ary->wc_nchunks);
for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
- xfer_len && chunk_no < arg_ary->wc_nchunks;
+ xfer_len && chunk_no < nchunks;
chunk_no++) {
struct rpcrdma_segment *arg_ch;
u64 rs_offset;
arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, ntohl(arg_ch->rs_length));
+ write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length));
/* Prepare the response chunk given the length actually
* written */
@@ -270,7 +345,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
chunk_off = 0;
while (write_len) {
ret = send_write(xprt, rqstp,
- ntohl(arg_ch->rs_handle),
+ be32_to_cpu(arg_ch->rs_handle),
rs_offset + chunk_off,
xdr_off,
write_len,
@@ -318,13 +393,13 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
&rdma_resp->rm_body.rm_chunks[2];
/* xdr offset starts at RPC message */
- nchunks = ntohl(arg_ary->wc_nchunks);
+ nchunks = be32_to_cpu(arg_ary->wc_nchunks);
for (xdr_off = 0, chunk_no = 0;
xfer_len && chunk_no < nchunks;
chunk_no++) {
u64 rs_offset;
ch = &arg_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, htonl(ch->rs_length));
+ write_len = min(xfer_len, be32_to_cpu(ch->rs_length));
/* Prepare the reply chunk given the length actually
* written */
@@ -335,7 +410,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
chunk_off = 0;
while (write_len) {
ret = send_write(xprt, rqstp,
- ntohl(ch->rs_handle),
+ be32_to_cpu(ch->rs_handle),
rs_offset + chunk_off,
xdr_off,
write_len,
@@ -382,6 +457,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
int byte_count)
{
struct ib_send_wr send_wr;
+ u32 xdr_off;
int sge_no;
int sge_bytes;
int page_no;
@@ -416,8 +492,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->direction = DMA_TO_DEVICE;
/* Map the payload indicated by 'byte_count' */
+ xdr_off = 0;
for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
- int xdr_off = 0;
sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
byte_count -= sge_bytes;
ctxt->sge[sge_no].addr =
@@ -455,6 +531,13 @@ static int send_reply(struct svcxprt_rdma *rdma,
}
rqstp->rq_next_page = rqstp->rq_respages + 1;
+ /* The loop above bumps sc_dma_used for each sge. The
+ * xdr_buf.tail gets a separate sge, but resides in the
+ * same page as xdr_buf.head. Don't count it twice.
+ */
+ if (sge_no > ctxt->count)
+ atomic_dec(&rdma->sc_dma_used);
+
if (sge_no > rdma->sc_max_sge) {
pr_err("svcrdma: Too many sges (%d)\n", sge_no);
goto err;
@@ -515,7 +598,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
inline_bytes = rqstp->rq_res.len;
/* Create the RDMA response header */
- res_page = svc_rdma_get_page();
+ res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
rdma_resp = page_address(res_page);
reply_ary = svc_rdma_get_reply_array(rdma_argp);
if (reply_ary)
diff --git a/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c b/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c
index f609c1c2d..b348b4ade 100644
--- a/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/kernel/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -56,6 +56,7 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int);
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
@@ -95,16 +96,69 @@ struct svc_xprt_class svc_rdma_class = {
.xcl_ident = XPRT_TRANSPORT_RDMA,
};
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *,
+ struct sockaddr *, int, int);
+static void svc_rdma_bc_detach(struct svc_xprt *);
+static void svc_rdma_bc_free(struct svc_xprt *);
+
+static struct svc_xprt_ops svc_rdma_bc_ops = {
+ .xpo_create = svc_rdma_bc_create,
+ .xpo_detach = svc_rdma_bc_detach,
+ .xpo_free = svc_rdma_bc_free,
+ .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
+ .xpo_secure_port = svc_rdma_secure_port,
+};
+
+struct svc_xprt_class svc_rdma_bc_class = {
+ .xcl_name = "rdma-bc",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_rdma_bc_ops,
+ .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN)
+};
+
+static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
+ struct net *net,
+ struct sockaddr *sa, int salen,
+ int flags)
+{
+ struct svcxprt_rdma *cma_xprt;
+ struct svc_xprt *xprt;
+
+ cma_xprt = rdma_create_xprt(serv, 0);
+ if (!cma_xprt)
+ return ERR_PTR(-ENOMEM);
+ xprt = &cma_xprt->sc_xprt;
+
+ svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
+ serv->sv_bc_xprt = xprt;
+
+ dprintk("svcrdma: %s(%p)\n", __func__, xprt);
+ return xprt;
+}
+
+static void svc_rdma_bc_detach(struct svc_xprt *xprt)
+{
+ dprintk("svcrdma: %s(%p)\n", __func__, xprt);
+}
+
+static void svc_rdma_bc_free(struct svc_xprt *xprt)
+{
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+ dprintk("svcrdma: %s(%p)\n", __func__, xprt);
+ if (xprt)
+ kfree(rdma);
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
{
struct svc_rdma_op_ctxt *ctxt;
- while (1) {
- ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
- if (ctxt)
- break;
- schedule_timeout_uninterruptible(msecs_to_jiffies(500));
- }
+ ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep,
+ GFP_KERNEL | __GFP_NOFAIL);
ctxt->xprt = xprt;
INIT_LIST_HEAD(&ctxt->dto_q);
ctxt->count = 0;
@@ -156,12 +210,8 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
struct svc_rdma_req_map *svc_rdma_get_req_map(void)
{
struct svc_rdma_req_map *map;
- while (1) {
- map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL);
- if (map)
- break;
- schedule_timeout_uninterruptible(msecs_to_jiffies(500));
- }
+ map = kmem_cache_alloc(svc_rdma_map_cachep,
+ GFP_KERNEL | __GFP_NOFAIL);
map->count = 0;
return map;
}
@@ -175,8 +225,8 @@ void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
static void cq_event_handler(struct ib_event *event, void *context)
{
struct svc_xprt *xprt = context;
- dprintk("svcrdma: received CQ event id=%d, context=%p\n",
- event->event, context);
+ dprintk("svcrdma: received CQ event %s (%d), context=%p\n",
+ ib_event_msg(event->event), event->event, context);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
}
@@ -191,8 +241,9 @@ static void qp_event_handler(struct ib_event *event, void *context)
case IB_EVENT_COMM_EST:
case IB_EVENT_SQ_DRAINED:
case IB_EVENT_QP_LAST_WQE_REACHED:
- dprintk("svcrdma: QP event %d received for QP=%p\n",
- event->event, event->element.qp);
+ dprintk("svcrdma: QP event %s (%d) received for QP=%p\n",
+ ib_event_msg(event->event), event->event,
+ event->element.qp);
break;
/* These are considered fatal events */
case IB_EVENT_PATH_MIG_ERR:
@@ -201,9 +252,10 @@ static void qp_event_handler(struct ib_event *event, void *context)
case IB_EVENT_QP_ACCESS_ERR:
case IB_EVENT_DEVICE_FATAL:
default:
- dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
+ dprintk("svcrdma: QP ERROR event %s (%d) received for QP=%p, "
"closing transport\n",
- event->event, event->element.qp);
+ ib_event_msg(event->event), event->event,
+ event->element.qp);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
break;
}
@@ -402,7 +454,8 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
for (i = 0; i < ret; i++) {
wc = &wc_a[i];
if (wc->status != IB_WC_SUCCESS) {
- dprintk("svcrdma: sq wc err status %d\n",
+ dprintk("svcrdma: sq wc err status %s (%d)\n",
+ ib_wc_status_msg(wc->status),
wc->status);
/* Close the transport */
@@ -490,18 +543,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
return cma_xprt;
}
-struct page *svc_rdma_get_page(void)
-{
- struct page *page;
-
- while ((page = alloc_page(GFP_KERNEL)) == NULL) {
- /* If we can't get memory, wait a bit and try again */
- printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
- schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
- }
- return page;
-}
-
int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
{
struct ib_recv_wr recv_wr, *bad_recv_wr;
@@ -520,7 +561,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
pr_err("svcrdma: Too many sges (%d)\n", sge_no);
goto err_put_ctxt;
}
- page = svc_rdma_get_page();
+ page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
ctxt->pages[sge_no] = page;
pa = ib_dma_map_page(xprt->sc_cm_id->device,
page, 0, PAGE_SIZE,
@@ -616,7 +657,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
- "event=%d\n", cma_id, cma_id->context, event->event);
+ "event = %s (%d)\n", cma_id, cma_id->context,
+ rdma_event_msg(event->event), event->event);
handle_connect_req(cma_id,
event->param.conn.initiator_depth);
break;
@@ -636,7 +678,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
default:
dprintk("svcrdma: Unexpected event on listening endpoint %p, "
- "event=%d\n", cma_id, event->event);
+ "event = %s (%d)\n", cma_id,
+ rdma_event_msg(event->event), event->event);
break;
}
@@ -669,15 +712,18 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id,
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
- "event=%d\n", cma_id, xprt, event->event);
+ "event = %s (%d)\n", cma_id, xprt,
+ rdma_event_msg(event->event), event->event);
if (xprt) {
set_bit(XPT_CLOSE, &xprt->xpt_flags);
svc_xprt_enqueue(xprt);
+ svc_xprt_put(xprt);
}
break;
default:
dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
- "event=%d\n", cma_id, event->event);
+ "event = %s (%d)\n", cma_id,
+ rdma_event_msg(event->event), event->event);
break;
}
return 0;
@@ -704,8 +750,8 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
if (!cma_xprt)
return ERR_PTR(-ENOMEM);
- listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
- IB_QPT_RC);
+ listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(listen_id)) {
ret = PTR_ERR(listen_id);
dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
@@ -744,24 +790,27 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
{
struct ib_mr *mr;
- struct ib_fast_reg_page_list *pl;
+ struct scatterlist *sg;
struct svc_rdma_fastreg_mr *frmr;
+ u32 num_sg;
frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
if (!frmr)
goto err;
- mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+ num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len);
+ mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg);
if (IS_ERR(mr))
goto err_free_frmr;
- pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
- RPCSVC_MAXPAGES);
- if (IS_ERR(pl))
+ sg = kcalloc(RPCSVC_MAXPAGES, sizeof(*sg), GFP_KERNEL);
+ if (!sg)
goto err_free_mr;
+ sg_init_table(sg, RPCSVC_MAXPAGES);
+
frmr->mr = mr;
- frmr->page_list = pl;
+ frmr->sg = sg;
INIT_LIST_HEAD(&frmr->frmr_list);
return frmr;
@@ -781,8 +830,8 @@ static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
frmr = list_entry(xprt->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
+ kfree(frmr->sg);
ib_dereg_mr(frmr->mr);
- ib_free_fast_reg_page_list(frmr->page_list);
kfree(frmr);
}
}
@@ -796,8 +845,7 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
frmr = list_entry(rdma->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
- frmr->map_len = 0;
- frmr->page_list_len = 0;
+ frmr->sg_nents = 0;
}
spin_unlock_bh(&rdma->sc_frmr_q_lock);
if (frmr)
@@ -806,25 +854,13 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
return rdma_alloc_frmr(rdma);
}
-static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
- struct svc_rdma_fastreg_mr *frmr)
-{
- int page_no;
- for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
- dma_addr_t addr = frmr->page_list->page_list[page_no];
- if (ib_dma_mapping_error(frmr->mr->device, addr))
- continue;
- atomic_dec(&xprt->sc_dma_used);
- ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE,
- frmr->direction);
- }
-}
-
void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
struct svc_rdma_fastreg_mr *frmr)
{
if (frmr) {
- frmr_unmap_dma(rdma, frmr);
+ ib_dma_unmap_sg(rdma->sc_cm_id->device,
+ frmr->sg, frmr->sg_nents, frmr->direction);
+ atomic_dec(&rdma->sc_dma_used);
spin_lock_bh(&rdma->sc_frmr_q_lock);
WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
@@ -848,10 +884,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct svcxprt_rdma *listen_rdma;
struct svcxprt_rdma *newxprt = NULL;
struct rdma_conn_param conn_param;
+ struct ib_cq_init_attr cq_attr = {};
struct ib_qp_init_attr qp_attr;
struct ib_device_attr devattr;
int uninitialized_var(dma_mr_acc);
- int need_dma_mr;
+ int need_dma_mr = 0;
int ret;
int i;
@@ -884,6 +921,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
* capabilities of this particular device */
newxprt->sc_max_sge = min((size_t)devattr.max_sge,
(size_t)RPCSVC_MAXPAGES);
+ newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+ RPCSVC_MAXPAGES);
newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
(size_t)svcrdma_max_requests);
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
@@ -900,22 +939,22 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk("svcrdma: error creating PD for connect request\n");
goto errout;
}
+ cq_attr.cqe = newxprt->sc_sq_depth;
newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
sq_comp_handler,
cq_event_handler,
newxprt,
- newxprt->sc_sq_depth,
- 0);
+ &cq_attr);
if (IS_ERR(newxprt->sc_sq_cq)) {
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
+ cq_attr.cqe = newxprt->sc_max_requests;
newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
rq_comp_handler,
cq_event_handler,
newxprt,
- newxprt->sc_max_requests,
- 0);
+ &cq_attr);
if (IS_ERR(newxprt->sc_rq_cq)) {
dprintk("svcrdma: error creating RQ CQ for connect request\n");
goto errout;
@@ -985,35 +1024,26 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/*
* Determine if a DMA MR is required and if so, what privs are required
*/
- switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
- case RDMA_TRANSPORT_IWARP:
- newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
- if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
- need_dma_mr = 1;
- dma_mr_acc =
- (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE);
- } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
- need_dma_mr = 1;
- dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
- } else
- need_dma_mr = 0;
- break;
- case RDMA_TRANSPORT_IB:
- if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
- need_dma_mr = 1;
- dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
- } else if (!(devattr.device_cap_flags &
- IB_DEVICE_LOCAL_DMA_LKEY)) {
- need_dma_mr = 1;
- dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
- } else
- need_dma_mr = 0;
- break;
- default:
+ if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device,
+ newxprt->sc_cm_id->port_num) &&
+ !rdma_ib_or_roce(newxprt->sc_cm_id->device,
+ newxprt->sc_cm_id->port_num))
goto errout;
+
+ if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) ||
+ !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
+ newxprt->sc_cm_id->port_num) &&
+ !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG))
+ dma_mr_acc |= IB_ACCESS_REMOTE_WRITE;
}
+ if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
+ newxprt->sc_cm_id->port_num))
+ newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
+
/* Create the DMA MR if needed, otherwise, use the DMA LKEY */
if (need_dma_mr) {
/* Register all of physical memory */
@@ -1067,6 +1097,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
" remote_ip : %pI4\n"
" remote_port : %d\n"
" max_sge : %d\n"
+ " max_sge_rd : %d\n"
" sq_depth : %d\n"
" max_requests : %d\n"
" ord : %d\n",
@@ -1080,6 +1111,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
route.addr.dst_addr)->sin_port),
newxprt->sc_max_sge,
+ newxprt->sc_max_sge_rd,
newxprt->sc_sq_depth,
newxprt->sc_max_requests,
newxprt->sc_ord);
@@ -1222,40 +1254,6 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp)
return 1;
}
-/*
- * Attempt to register the kvec representing the RPC memory with the
- * device.
- *
- * Returns:
- * NULL : The device does not support fastreg or there were no more
- * fastreg mr.
- * frmr : The kvec register request was successfully posted.
- * <0 : An error was encountered attempting to register the kvec.
- */
-int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
- struct svc_rdma_fastreg_mr *frmr)
-{
- struct ib_send_wr fastreg_wr;
- u8 key;
-
- /* Bump the key */
- key = (u8)(frmr->mr->lkey & 0x000000FF);
- ib_update_fast_reg_key(frmr->mr, ++key);
-
- /* Prepare FASTREG WR */
- memset(&fastreg_wr, 0, sizeof fastreg_wr);
- fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- fastreg_wr.send_flags = IB_SEND_SIGNALED;
- fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
- fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
- fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
- fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fastreg_wr.wr.fast_reg.length = frmr->map_len;
- fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
- fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
- return svc_rdma_send(xprt, &fastreg_wr);
-}
-
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
struct ib_send_wr *bad_wr, *n_wr;
@@ -1319,11 +1317,11 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
struct ib_send_wr err_wr;
struct page *p;
struct svc_rdma_op_ctxt *ctxt;
- u32 *va;
+ __be32 *va;
int length;
int ret;
- p = svc_rdma_get_page();
+ p = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
va = page_address(p);
/* XDR encode error */
diff --git a/kernel/net/sunrpc/xprtrdma/transport.c b/kernel/net/sunrpc/xprtrdma/transport.c
index 54f23b1be..8c545f7d7 100644
--- a/kernel/net/sunrpc/xprtrdma/transport.c
+++ b/kernel/net/sunrpc/xprtrdma/transport.c
@@ -48,7 +48,6 @@
*/
#include <linux/module.h>
-#include <linux/init.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/sunrpc/addr.h>
@@ -59,11 +58,6 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-MODULE_LICENSE("Dual BSD/GPL");
-
-MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
-MODULE_AUTHOR("Network Appliance, Inc.");
-
/*
* tunables
*/
@@ -181,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
}
static void
-xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
{
- struct sockaddr *sap = (struct sockaddr *)
- &rpcx_to_rdmad(xprt).addr;
char buf[128];
switch (sap->sa_family) {
@@ -246,6 +238,16 @@ xprt_rdma_connect_worker(struct work_struct *work)
xprt_clear_connecting(xprt);
}
+static void
+xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
+ rx_xprt);
+
+ pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt);
+ rdma_disconnect(r_xprt->rx_ia.ri_id);
+}
+
/*
* xprt_rdma_destroy
*
@@ -268,8 +270,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
xprt_clear_connected(xprt);
- rpcrdma_buffer_destroy(&r_xprt->rx_buf);
rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_buffer_destroy(&r_xprt->rx_buf);
rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
@@ -298,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args)
struct rpc_xprt *xprt;
struct rpcrdma_xprt *new_xprt;
struct rpcrdma_ep *new_ep;
- struct sockaddr_in *sin;
+ struct sockaddr *sap;
int rc;
if (args->addrlen > sizeof(xprt->addr)) {
@@ -329,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args)
* Set up RDMA-specific connect data.
*/
- /* Put server RDMA address in local cdata */
- memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+ sap = (struct sockaddr *)&cdata.addr;
+ memcpy(sap, args->dstaddr, args->addrlen);
/* Ensure xprt->addr holds valid server TCP (not RDMA)
* address, for any side protocols which peek at it */
xprt->prot = IPPROTO_TCP;
xprt->addrlen = args->addrlen;
- memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+ memcpy(&xprt->addr, sap, xprt->addrlen);
- sin = (struct sockaddr_in *)&cdata.addr;
- if (ntohs(sin->sin_port) != 0)
+ if (rpc_get_port(sap))
xprt_set_bound(xprt);
- dprintk("RPC: %s: %pI4:%u\n",
- __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
-
- /* Set max requests */
cdata.max_requests = xprt->max_reqs;
- /* Set some length limits */
cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
@@ -371,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args)
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
- xprt_rdma_memreg_strategy);
+ rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
if (rc)
goto out1;
@@ -405,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args)
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
- xprt_rdma_format_addresses(xprt);
+ xprt_rdma_format_addresses(xprt, sap);
xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
if (xprt->max_payload == 0)
goto out4;
@@ -416,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args)
if (!try_module_get(THIS_MODULE))
goto out4;
+ dprintk("RPC: %s: %s:%s\n", __func__,
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT]);
return xprt;
out4:
@@ -618,12 +616,6 @@ xprt_rdma_send_request(struct rpc_task *task)
if (req->rl_reply == NULL) /* e.g. reconnection */
rpcrdma_recv_buffer_get(req);
- if (req->rl_reply) {
- req->rl_reply->rr_func = rpcrdma_reply_handler;
- /* this need only be done once, but... */
- req->rl_reply->rr_xprt = xprt;
- }
-
/* Must suppress retransmit to maintain credits */
if (req->rl_connect_cookie == xprt->connect_cookie)
goto drop_connection;
@@ -655,31 +647,41 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
if (xprt_connected(xprt))
idle_time = (long)(jiffies - xprt->last_used) / HZ;
- seq_printf(seq,
- "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
- "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
-
- 0, /* need a local port? */
- xprt->stat.bind_count,
- xprt->stat.connect_count,
- xprt->stat.connect_time,
- idle_time,
- xprt->stat.sends,
- xprt->stat.recvs,
- xprt->stat.bad_xids,
- xprt->stat.req_u,
- xprt->stat.bklog_u,
-
- r_xprt->rx_stats.read_chunk_count,
- r_xprt->rx_stats.write_chunk_count,
- r_xprt->rx_stats.reply_chunk_count,
- r_xprt->rx_stats.total_rdma_request,
- r_xprt->rx_stats.total_rdma_reply,
- r_xprt->rx_stats.pullup_copy_count,
- r_xprt->rx_stats.fixup_copy_count,
- r_xprt->rx_stats.hardway_register_count,
- r_xprt->rx_stats.failed_marshal_count,
- r_xprt->rx_stats.bad_reply_count);
+ seq_puts(seq, "\txprt:\trdma ");
+ seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ",
+ 0, /* need a local port? */
+ xprt->stat.bind_count,
+ xprt->stat.connect_count,
+ xprt->stat.connect_time,
+ idle_time,
+ xprt->stat.sends,
+ xprt->stat.recvs,
+ xprt->stat.bad_xids,
+ xprt->stat.req_u,
+ xprt->stat.bklog_u);
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+ r_xprt->rx_stats.read_chunk_count,
+ r_xprt->rx_stats.write_chunk_count,
+ r_xprt->rx_stats.reply_chunk_count,
+ r_xprt->rx_stats.total_rdma_request,
+ r_xprt->rx_stats.total_rdma_reply,
+ r_xprt->rx_stats.pullup_copy_count,
+ r_xprt->rx_stats.fixup_copy_count,
+ r_xprt->rx_stats.hardway_register_count,
+ r_xprt->rx_stats.failed_marshal_count,
+ r_xprt->rx_stats.bad_reply_count,
+ r_xprt->rx_stats.nomsg_call_count);
+}
+
+static int
+xprt_rdma_enable_swap(struct rpc_xprt *xprt)
+{
+ return 0;
+}
+
+static void
+xprt_rdma_disable_swap(struct rpc_xprt *xprt)
+{
}
/*
@@ -700,7 +702,16 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
.send_request = xprt_rdma_send_request,
.close = xprt_rdma_close,
.destroy = xprt_rdma_destroy,
- .print_stats = xprt_rdma_print_stats
+ .print_stats = xprt_rdma_print_stats,
+ .enable_swap = xprt_rdma_enable_swap,
+ .disable_swap = xprt_rdma_disable_swap,
+ .inject_disconnect = xprt_rdma_inject_disconnect,
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ .bc_setup = xprt_rdma_bc_setup,
+ .bc_up = xprt_rdma_bc_up,
+ .bc_free_rqst = xprt_rdma_bc_free_rqst,
+ .bc_destroy = xprt_rdma_bc_destroy,
+#endif
};
static struct xprt_class xprt_rdma = {
@@ -711,7 +722,7 @@ static struct xprt_class xprt_rdma = {
.setup = xprt_setup_rdma,
};
-static void __exit xprt_rdma_cleanup(void)
+void xprt_rdma_cleanup(void)
{
int rc;
@@ -726,17 +737,32 @@ static void __exit xprt_rdma_cleanup(void)
if (rc)
dprintk("RPC: %s: xprt_unregister returned %i\n",
__func__, rc);
+
+ rpcrdma_destroy_wq();
+ frwr_destroy_recovery_wq();
}
-static int __init xprt_rdma_init(void)
+int xprt_rdma_init(void)
{
int rc;
- rc = xprt_register_transport(&xprt_rdma);
-
+ rc = frwr_alloc_recovery_wq();
if (rc)
return rc;
+ rc = rpcrdma_alloc_wq();
+ if (rc) {
+ frwr_destroy_recovery_wq();
+ return rc;
+ }
+
+ rc = xprt_register_transport(&xprt_rdma);
+ if (rc) {
+ rpcrdma_destroy_wq();
+ frwr_destroy_recovery_wq();
+ return rc;
+ }
+
dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
dprintk("Defaults:\n");
@@ -753,6 +779,3 @@ static int __init xprt_rdma_init(void)
#endif
return 0;
}
-
-module_init(xprt_rdma_init);
-module_exit(xprt_rdma_cleanup);
diff --git a/kernel/net/sunrpc/xprtrdma/verbs.c b/kernel/net/sunrpc/xprtrdma/verbs.c
index 4870d272e..eadd16551 100644
--- a/kernel/net/sunrpc/xprtrdma/verbs.c
+++ b/kernel/net/sunrpc/xprtrdma/verbs.c
@@ -52,6 +52,7 @@
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>
+#include <linux/module.h> /* try_module_get()/module_put() */
#include "xprt_rdma.h"
@@ -67,79 +68,33 @@
* internal functions
*/
-/*
- * handle replies in tasklet context, using a single, global list
- * rdma tasklet function -- just turn around and call the func
- * for all replies on the list
- */
-
-static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
-static LIST_HEAD(rpcrdma_tasklets_g);
+static struct workqueue_struct *rpcrdma_receive_wq;
-static void
-rpcrdma_run_tasklet(unsigned long data)
+int
+rpcrdma_alloc_wq(void)
{
- struct rpcrdma_rep *rep;
- void (*func)(struct rpcrdma_rep *);
- unsigned long flags;
+ struct workqueue_struct *recv_wq;
- data = data;
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- while (!list_empty(&rpcrdma_tasklets_g)) {
- rep = list_entry(rpcrdma_tasklets_g.next,
- struct rpcrdma_rep, rr_list);
- list_del(&rep->rr_list);
- func = rep->rr_func;
- rep->rr_func = NULL;
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
-
- if (func)
- func(rep);
- else
- rpcrdma_recv_buffer_put(rep);
-
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- }
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
-}
+ recv_wq = alloc_workqueue("xprtrdma_receive",
+ WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI,
+ 0);
+ if (!recv_wq)
+ return -ENOMEM;
-static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
-
-static const char * const async_event[] = {
- "CQ error",
- "QP fatal error",
- "QP request error",
- "QP access error",
- "communication established",
- "send queue drained",
- "path migration successful",
- "path mig error",
- "device fatal error",
- "port active",
- "port error",
- "LID change",
- "P_key change",
- "SM change",
- "SRQ error",
- "SRQ limit reached",
- "last WQE reached",
- "client reregister",
- "GID change",
-};
-
-#define ASYNC_MSG(status) \
- ((status) < ARRAY_SIZE(async_event) ? \
- async_event[(status)] : "unknown async error")
+ rpcrdma_receive_wq = recv_wq;
+ return 0;
+}
-static void
-rpcrdma_schedule_tasklet(struct list_head *sched_list)
+void
+rpcrdma_destroy_wq(void)
{
- unsigned long flags;
+ struct workqueue_struct *wq;
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- list_splice_tail(sched_list, &rpcrdma_tasklets_g);
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
- tasklet_schedule(&rpcrdma_tasklet_g);
+ if (rpcrdma_receive_wq) {
+ wq = rpcrdma_receive_wq;
+ rpcrdma_receive_wq = NULL;
+ destroy_workqueue(wq);
+ }
}
static void
@@ -148,7 +103,7 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
struct rpcrdma_ep *ep = context;
pr_err("RPC: %s: %s on device %s ep %p\n",
- __func__, ASYNC_MSG(event->event),
+ __func__, ib_event_msg(event->event),
event->device->name, context);
if (ep->rep_connected == 1) {
ep->rep_connected = -EIO;
@@ -163,7 +118,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
struct rpcrdma_ep *ep = context;
pr_err("RPC: %s: %s on device %s ep %p\n",
- __func__, ASYNC_MSG(event->event),
+ __func__, ib_event_msg(event->event),
event->device->name, context);
if (ep->rep_connected == 1) {
ep->rep_connected = -EIO;
@@ -172,35 +127,6 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
}
}
-static const char * const wc_status[] = {
- "success",
- "local length error",
- "local QP operation error",
- "local EE context operation error",
- "local protection error",
- "WR flushed",
- "memory management operation error",
- "bad response error",
- "local access error",
- "remote invalid request error",
- "remote access error",
- "remote operation error",
- "transport retry counter exceeded",
- "RNR retry counter exceeded",
- "local RDD violation error",
- "remove invalid RD request",
- "operation aborted",
- "invalid EE context number",
- "invalid EE context state",
- "fatal error",
- "response timeout error",
- "general error",
-};
-
-#define COMPLETION_MSG(status) \
- ((status) < ARRAY_SIZE(wc_status) ? \
- wc_status[(status)] : "unexpected completion error")
-
static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
@@ -209,7 +135,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
if (wc->status != IB_WC_SUCCESS &&
wc->status != IB_WC_WR_FLUSH_ERR)
pr_err("RPC: %s: SEND: %s\n",
- __func__, COMPLETION_MSG(wc->status));
+ __func__, ib_wc_status_msg(wc->status));
} else {
struct rpcrdma_mw *r;
@@ -218,63 +144,54 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
}
}
-static int
-rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+/* The common case is a single send completion is waiting. By
+ * passing two WC entries to ib_poll_cq, a return code of 1
+ * means there is exactly one WC waiting and no more. We don't
+ * have to invoke ib_poll_cq again to know that the CQ has been
+ * properly drained.
+ */
+static void
+rpcrdma_sendcq_poll(struct ib_cq *cq)
{
- struct ib_wc *wcs;
- int budget, count, rc;
+ struct ib_wc *pos, wcs[2];
+ int count, rc;
- budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
do {
- wcs = ep->rep_send_wcs;
+ pos = wcs;
- rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
- if (rc <= 0)
- return rc;
+ rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos);
+ if (rc < 0)
+ break;
count = rc;
while (count-- > 0)
- rpcrdma_sendcq_process_wc(wcs++);
- } while (rc == RPCRDMA_POLLSIZE && --budget);
- return 0;
+ rpcrdma_sendcq_process_wc(pos++);
+ } while (rc == ARRAY_SIZE(wcs));
+ return;
}
-/*
- * Handle send, fast_reg_mr, and local_inv completions.
- *
- * Send events are typically suppressed and thus do not result
- * in an upcall. Occasionally one is signaled, however. This
- * prevents the provider's completion queue from wrapping and
- * losing a completion.
+/* Handle provider send completion upcalls.
*/
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
- struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
- int rc;
-
- rc = rpcrdma_sendcq_poll(cq, ep);
- if (rc) {
- dprintk("RPC: %s: ib_poll_cq failed: %i\n",
- __func__, rc);
- return;
- }
+ do {
+ rpcrdma_sendcq_poll(cq);
+ } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS) > 0);
+}
- rc = ib_req_notify_cq(cq,
- IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
- if (rc == 0)
- return;
- if (rc < 0) {
- dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
- __func__, rc);
- return;
- }
+static void
+rpcrdma_receive_worker(struct work_struct *work)
+{
+ struct rpcrdma_rep *rep =
+ container_of(work, struct rpcrdma_rep, rr_work);
- rpcrdma_sendcq_poll(cq, ep);
+ rpcrdma_reply_handler(rep);
}
static void
-rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
+rpcrdma_recvcq_process_wc(struct ib_wc *wc)
{
struct rpcrdma_rep *rep =
(struct rpcrdma_rep *)(unsigned long)wc->wr_id;
@@ -291,126 +208,70 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
__func__, rep, wc->byte_len);
rep->rr_len = wc->byte_len;
- ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+ ib_dma_sync_single_for_cpu(rep->rr_device,
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
prefetch(rdmab_to_msg(rep->rr_rdmabuf));
out_schedule:
- list_add_tail(&rep->rr_list, sched_list);
+ queue_work(rpcrdma_receive_wq, &rep->rr_work);
return;
+
out_fail:
if (wc->status != IB_WC_WR_FLUSH_ERR)
pr_err("RPC: %s: rep %p: %s\n",
- __func__, rep, COMPLETION_MSG(wc->status));
- rep->rr_len = ~0U;
+ __func__, rep, ib_wc_status_msg(wc->status));
+ rep->rr_len = RPCRDMA_BAD_LEN;
goto out_schedule;
}
-static int
-rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+/* The wc array is on stack: automatic memory is always CPU-local.
+ *
+ * struct ib_wc is 64 bytes, making the poll array potentially
+ * large. But this is at the bottom of the call chain. Further
+ * substantial work is done in another thread.
+ */
+static void
+rpcrdma_recvcq_poll(struct ib_cq *cq)
{
- struct list_head sched_list;
- struct ib_wc *wcs;
- int budget, count, rc;
+ struct ib_wc *pos, wcs[4];
+ int count, rc;
- INIT_LIST_HEAD(&sched_list);
- budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
do {
- wcs = ep->rep_recv_wcs;
+ pos = wcs;
- rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
- if (rc <= 0)
- goto out_schedule;
+ rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos);
+ if (rc < 0)
+ break;
count = rc;
while (count-- > 0)
- rpcrdma_recvcq_process_wc(wcs++, &sched_list);
- } while (rc == RPCRDMA_POLLSIZE && --budget);
- rc = 0;
-
-out_schedule:
- rpcrdma_schedule_tasklet(&sched_list);
- return rc;
+ rpcrdma_recvcq_process_wc(pos++);
+ } while (rc == ARRAY_SIZE(wcs));
}
-/*
- * Handle receive completions.
- *
- * It is reentrant but processes single events in order to maintain
- * ordering of receives to keep server credits.
- *
- * It is the responsibility of the scheduled tasklet to return
- * recv buffers to the pool. NOTE: this affects synchronization of
- * connection shutdown. That is, the structures required for
- * the completion of the reply handler must remain intact until
- * all memory has been reclaimed.
+/* Handle provider receive completion upcalls.
*/
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
- struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
- int rc;
-
- rc = rpcrdma_recvcq_poll(cq, ep);
- if (rc) {
- dprintk("RPC: %s: ib_poll_cq failed: %i\n",
- __func__, rc);
- return;
- }
-
- rc = ib_req_notify_cq(cq,
- IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
- if (rc == 0)
- return;
- if (rc < 0) {
- dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
- __func__, rc);
- return;
- }
-
- rpcrdma_recvcq_poll(cq, ep);
+ do {
+ rpcrdma_recvcq_poll(cq);
+ } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS) > 0);
}
static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
struct ib_wc wc;
- LIST_HEAD(sched_list);
while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
- rpcrdma_recvcq_process_wc(&wc, &sched_list);
- if (!list_empty(&sched_list))
- rpcrdma_schedule_tasklet(&sched_list);
+ rpcrdma_recvcq_process_wc(&wc);
while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
rpcrdma_sendcq_process_wc(&wc);
}
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-static const char * const conn[] = {
- "address resolved",
- "address error",
- "route resolved",
- "route error",
- "connect request",
- "connect response",
- "connect error",
- "unreachable",
- "rejected",
- "established",
- "disconnected",
- "device removal",
- "multicast join",
- "multicast error",
- "address change",
- "timewait exit",
-};
-
-#define CONNECTION_MSG(status) \
- ((status) < ARRAY_SIZE(conn) ? \
- conn[(status)] : "unrecognized connection error")
-#endif
-
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
@@ -476,7 +337,7 @@ connected:
default:
dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
__func__, sap, rpc_get_port(sap), ep,
- CONNECTION_MSG(event->event));
+ rdma_event_msg(event->event));
break;
}
@@ -487,7 +348,7 @@ connected:
pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
sap, rpc_get_port(sap),
- ia->ri_id->device->name,
+ ia->ri_device->name,
ia->ri_ops->ro_displayname,
xprt->rx_buf.rb_max_requests,
ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
@@ -500,6 +361,14 @@ connected:
return 0;
}
+static void rpcrdma_destroy_id(struct rdma_cm_id *id)
+{
+ if (id) {
+ module_put(id->device->owner);
+ rdma_destroy_id(id);
+ }
+}
+
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
@@ -509,7 +378,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
init_completion(&ia->ri_done);
- id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
+ id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
+ IB_QPT_RC);
if (IS_ERR(id)) {
rc = PTR_ERR(id);
dprintk("RPC: %s: rdma_create_id() failed %i\n",
@@ -526,6 +396,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
}
wait_for_completion_interruptible_timeout(&ia->ri_done,
msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+
+ /* FIXME:
+ * Until xprtrdma supports DEVICE_REMOVAL, the provider must
+ * be pinned while there are active NFS/RDMA mounts to prevent
+ * hangs and crashes at umount time.
+ */
+ if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
+ dprintk("RPC: %s: Failed to get device module\n",
+ __func__);
+ ia->ri_async_rc = -ENODEV;
+ }
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -535,16 +416,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
if (rc) {
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
__func__, rc);
- goto out;
+ goto put;
}
wait_for_completion_interruptible_timeout(&ia->ri_done,
msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
rc = ia->ri_async_rc;
if (rc)
- goto out;
+ goto put;
return id;
-
+put:
+ module_put(id->device->owner);
out:
rdma_destroy_id(id);
return ERR_PTR(rc);
@@ -579,17 +461,20 @@ rpcrdma_clean_cq(struct ib_cq *cq)
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
- int rc, mem_priv;
struct rpcrdma_ia *ia = &xprt->rx_ia;
struct ib_device_attr *devattr = &ia->ri_devattr;
+ int rc;
+
+ ia->ri_dma_mr = NULL;
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
goto out1;
}
+ ia->ri_device = ia->ri_id->device;
- ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+ ia->ri_pd = ib_alloc_pd(ia->ri_device);
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
@@ -597,69 +482,39 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
goto out2;
}
- rc = ib_query_device(ia->ri_id->device, devattr);
+ rc = ib_query_device(ia->ri_device, devattr);
if (rc) {
dprintk("RPC: %s: ib_query_device failed %d\n",
__func__, rc);
goto out3;
}
- if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
- ia->ri_have_dma_lkey = 1;
- ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
- }
-
if (memreg == RPCRDMA_FRMR) {
- /* Requires both frmr reg and local dma lkey */
- if (((devattr->device_cap_flags &
- (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
- (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
- (devattr->max_fast_reg_page_list_len == 0)) {
+ if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+ (devattr->max_fast_reg_page_list_len == 0)) {
dprintk("RPC: %s: FRMR registration "
"not supported by HCA\n", __func__);
memreg = RPCRDMA_MTHCAFMR;
}
}
if (memreg == RPCRDMA_MTHCAFMR) {
- if (!ia->ri_id->device->alloc_fmr) {
+ if (!ia->ri_device->alloc_fmr) {
dprintk("RPC: %s: MTHCAFMR registration "
"not supported by HCA\n", __func__);
- memreg = RPCRDMA_ALLPHYSICAL;
+ rc = -EINVAL;
+ goto out3;
}
}
- /*
- * Optionally obtain an underlying physical identity mapping in
- * order to do a memory window-based bind. This base registration
- * is protected from remote access - that is enabled only by binding
- * for the specific bytes targeted during each RPC operation, and
- * revoked after the corresponding completion similar to a storage
- * adapter.
- */
switch (memreg) {
case RPCRDMA_FRMR:
ia->ri_ops = &rpcrdma_frwr_memreg_ops;
break;
case RPCRDMA_ALLPHYSICAL:
ia->ri_ops = &rpcrdma_physical_memreg_ops;
- mem_priv = IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ;
- goto register_setup;
+ break;
case RPCRDMA_MTHCAFMR:
ia->ri_ops = &rpcrdma_fmr_memreg_ops;
- if (ia->ri_have_dma_lkey)
- break;
- mem_priv = IB_ACCESS_LOCAL_WRITE;
- register_setup:
- ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
- if (IS_ERR(ia->ri_bind_mem)) {
- printk(KERN_ALERT "%s: ib_get_dma_mr for "
- "phys register failed with %lX\n",
- __func__, PTR_ERR(ia->ri_bind_mem));
- rc = -ENOMEM;
- goto out3;
- }
break;
default:
printk(KERN_ERR "RPC: Unsupported memory "
@@ -670,9 +525,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
dprintk("RPC: %s: memory registration strategy is '%s'\n",
__func__, ia->ri_ops->ro_displayname);
- /* Else will do memory reg/dereg for each chunk */
- ia->ri_memreg_strategy = memreg;
-
rwlock_init(&ia->ri_qplock);
return 0;
@@ -680,7 +532,7 @@ out3:
ib_dealloc_pd(ia->ri_pd);
ia->ri_pd = NULL;
out2:
- rdma_destroy_id(ia->ri_id);
+ rpcrdma_destroy_id(ia->ri_id);
ia->ri_id = NULL;
out1:
return rc;
@@ -694,25 +546,17 @@ out1:
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
- int rc;
-
dprintk("RPC: %s: entering\n", __func__);
- if (ia->ri_bind_mem != NULL) {
- rc = ib_dereg_mr(ia->ri_bind_mem);
- dprintk("RPC: %s: ib_dereg_mr returned %i\n",
- __func__, rc);
- }
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
if (ia->ri_id->qp)
rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
+ rpcrdma_destroy_id(ia->ri_id);
ia->ri_id = NULL;
}
- if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
- rc = ib_dealloc_pd(ia->ri_pd);
- dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
- __func__, rc);
- }
+
+ /* If the pd is still busy, xprtrdma missed freeing a resource */
+ if (ia->ri_pd && !IS_ERR(ia->ri_pd))
+ ib_dealloc_pd(ia->ri_pd);
}
/*
@@ -724,35 +568,44 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
{
struct ib_device_attr *devattr = &ia->ri_devattr;
struct ib_cq *sendcq, *recvcq;
+ struct ib_cq_init_attr cq_attr = {};
+ unsigned int max_qp_wr;
int rc, err;
+ if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+ dprintk("RPC: %s: insufficient sge's available\n",
+ __func__);
+ return -ENOMEM;
+ }
+
+ if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
+ dprintk("RPC: %s: insufficient wqe's available\n",
+ __func__);
+ return -ENOMEM;
+ }
+ max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS;
+
/* check provider's send/recv wr limits */
- if (cdata->max_requests > devattr->max_qp_wr)
- cdata->max_requests = devattr->max_qp_wr;
+ if (cdata->max_requests > max_qp_wr)
+ cdata->max_requests = max_qp_wr;
ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
ep->rep_attr.qp_context = ep;
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+ ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
rc = ia->ri_ops->ro_open(ia, ep, cdata);
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
- ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+ ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
ep->rep_attr.qp_type = IB_QPT_RC;
ep->rep_attr.port_num = ~0;
- if (cdata->padding) {
- ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
- GFP_KERNEL);
- if (IS_ERR(ep->rep_padbuf))
- return PTR_ERR(ep->rep_padbuf);
- } else
- ep->rep_padbuf = NULL;
-
dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
"iovs: send %d recv %d\n",
__func__,
@@ -771,9 +624,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
- sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
- rpcrdma_cq_async_error_upcall, ep,
- ep->rep_attr.cap.max_send_wr + 1, 0);
+ cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
+ sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
+ rpcrdma_cq_async_error_upcall, NULL, &cq_attr);
if (IS_ERR(sendcq)) {
rc = PTR_ERR(sendcq);
dprintk("RPC: %s: failed to create send CQ: %i\n",
@@ -788,9 +641,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
goto out2;
}
- recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
- rpcrdma_cq_async_error_upcall, ep,
- ep->rep_attr.cap.max_recv_wr + 1, 0);
+ cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
+ recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
+ rpcrdma_cq_async_error_upcall, NULL, &cq_attr);
if (IS_ERR(recvcq)) {
rc = PTR_ERR(recvcq);
dprintk("RPC: %s: failed to create recv CQ: %i\n",
@@ -835,7 +688,8 @@ out2:
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, err);
out1:
- rpcrdma_free_regbuf(ia, ep->rep_padbuf);
+ if (ia->ri_dma_mr)
+ ib_dereg_mr(ia->ri_dma_mr);
return rc;
}
@@ -856,25 +710,32 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
cancel_delayed_work_sync(&ep->rep_connect_worker);
- if (ia->ri_id->qp) {
+ if (ia->ri_id->qp)
rpcrdma_ep_disconnect(ep, ia);
+
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
+
+ if (ia->ri_id->qp) {
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
- rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
rc = ib_destroy_cq(ep->rep_attr.recv_cq);
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
rc = ib_destroy_cq(ep->rep_attr.send_cq);
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
+
+ if (ia->ri_dma_mr) {
+ rc = ib_dereg_mr(ia->ri_dma_mr);
+ dprintk("RPC: %s: ib_dereg_mr returned %i\n",
+ __func__, rc);
+ }
}
/*
@@ -896,8 +757,6 @@ retry:
rpcrdma_flush_cqs(ep);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
- ia->ri_ops->ro_reset(xprt);
-
id = rpcrdma_create_id(xprt, ia,
(struct sockaddr *)&xprt->rx_data.addr);
if (IS_ERR(id)) {
@@ -911,10 +770,10 @@ retry:
* More stuff I haven't thought of!
* Rrrgh!
*/
- if (ia->ri_id->device != id->device) {
+ if (ia->ri_device != id->device) {
printk("RPC: %s: can't reconnect on "
"different device!\n", __func__);
- rdma_destroy_id(id);
+ rpcrdma_destroy_id(id);
rc = -ENETUNREACH;
goto out;
}
@@ -923,7 +782,7 @@ retry:
if (rc) {
dprintk("RPC: %s: rdma_create_qp failed %i\n",
__func__, rc);
- rdma_destroy_id(id);
+ rpcrdma_destroy_id(id);
rc = -ENETUNREACH;
goto out;
}
@@ -934,7 +793,7 @@ retry:
write_unlock(&ia->ri_qplock);
rdma_destroy_qp(old);
- rdma_destroy_id(old);
+ rpcrdma_destroy_id(old);
} else {
dprintk("RPC: %s: connecting...\n", __func__);
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -983,7 +842,21 @@ retry:
}
rc = ep->rep_connected;
} else {
+ struct rpcrdma_xprt *r_xprt;
+ unsigned int extras;
+
dprintk("RPC: %s: connected\n", __func__);
+
+ r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+ extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
+
+ if (extras) {
+ rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
+ if (rc)
+ pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
+ __func__, rc);
+ rc = 0;
+ }
}
out:
@@ -1020,20 +893,25 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
}
}
-static struct rpcrdma_req *
+struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
+ struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
struct rpcrdma_req *req;
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (req == NULL)
return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&req->rl_free);
+ spin_lock(&buffer->rb_reqslock);
+ list_add(&req->rl_all, &buffer->rb_allreqs);
+ spin_unlock(&buffer->rb_reqslock);
req->rl_buffer = &r_xprt->rx_buf;
return req;
}
-static struct rpcrdma_rep *
+struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
@@ -1053,7 +931,9 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
goto out_free;
}
- rep->rr_buffer = &r_xprt->rx_buf;
+ rep->rr_device = ia->ri_device;
+ rep->rr_rxprt = r_xprt;
+ INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
return rep;
out_free:
@@ -1067,44 +947,21 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- char *p;
- size_t len;
int i, rc;
- buf->rb_max_requests = cdata->max_requests;
+ buf->rb_max_requests = r_xprt->rx_data.max_requests;
+ buf->rb_bc_srv_max_requests = 0;
spin_lock_init(&buf->rb_lock);
- /* Need to allocate:
- * 1. arrays for send and recv pointers
- * 2. arrays of struct rpcrdma_req to fill in pointers
- * 3. array of struct rpcrdma_rep for replies
- * Send/recv buffers in req/rep need to be registered
- */
- len = buf->rb_max_requests *
- (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
-
- p = kzalloc(len, GFP_KERNEL);
- if (p == NULL) {
- dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
- __func__, len);
- rc = -ENOMEM;
- goto out;
- }
- buf->rb_pool = p; /* for freeing it later */
-
- buf->rb_send_bufs = (struct rpcrdma_req **) p;
- p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
- buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
- p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
-
rc = ia->ri_ops->ro_init(r_xprt);
if (rc)
goto out;
+ INIT_LIST_HEAD(&buf->rb_send_bufs);
+ INIT_LIST_HEAD(&buf->rb_allreqs);
+ spin_lock_init(&buf->rb_reqslock);
for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_req *req;
- struct rpcrdma_rep *rep;
req = rpcrdma_create_req(r_xprt);
if (IS_ERR(req)) {
@@ -1113,7 +970,13 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
rc = PTR_ERR(req);
goto out;
}
- buf->rb_send_bufs[i] = req;
+ req->rl_backchannel = false;
+ list_add(&req->rl_free, &buf->rb_send_bufs);
+ }
+
+ INIT_LIST_HEAD(&buf->rb_recv_bufs);
+ for (i = 0; i < buf->rb_max_requests + 2; i++) {
+ struct rpcrdma_rep *rep;
rep = rpcrdma_create_rep(r_xprt);
if (IS_ERR(rep)) {
@@ -1122,7 +985,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
rc = PTR_ERR(rep);
goto out;
}
- buf->rb_recv_bufs[i] = rep;
+ list_add(&rep->rr_list, &buf->rb_recv_bufs);
}
return 0;
@@ -1131,22 +994,38 @@ out:
return rc;
}
+static struct rpcrdma_req *
+rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_req *req;
+
+ req = list_first_entry(&buf->rb_send_bufs,
+ struct rpcrdma_req, rl_free);
+ list_del(&req->rl_free);
+ return req;
+}
+
+static struct rpcrdma_rep *
+rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_rep *rep;
+
+ rep = list_first_entry(&buf->rb_recv_bufs,
+ struct rpcrdma_rep, rr_list);
+ list_del(&rep->rr_list);
+ return rep;
+}
+
static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
- if (!rep)
- return;
-
rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
kfree(rep);
}
-static void
+void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
- if (!req)
- return;
-
rpcrdma_free_regbuf(ia, req->rl_sendbuf);
rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
kfree(req);
@@ -1156,220 +1035,88 @@ void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
- int i;
-
- /* clean up in reverse order from create
- * 1. recv mr memory (mr free, then kfree)
- * 2. send mr memory (mr free, then kfree)
- * 3. MWs
- */
- dprintk("RPC: %s: entering\n", __func__);
-
- for (i = 0; i < buf->rb_max_requests; i++) {
- if (buf->rb_recv_bufs)
- rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
- if (buf->rb_send_bufs)
- rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
- }
-
- ia->ri_ops->ro_destroy(buf);
- kfree(buf->rb_pool);
-}
+ while (!list_empty(&buf->rb_recv_bufs)) {
+ struct rpcrdma_rep *rep;
-/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
- * some req segments uninitialized.
- */
-static void
-rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
-{
- if (*mw) {
- list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
- *mw = NULL;
+ rep = rpcrdma_buffer_get_rep_locked(buf);
+ rpcrdma_destroy_rep(ia, rep);
}
-}
-/* Cycle mw's back in reverse order, and "spin" them.
- * This delays and scrambles reuse as much as possible.
- */
-static void
-rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mr_seg *seg = req->rl_segments;
- struct rpcrdma_mr_seg *seg1 = seg;
- int i;
+ spin_lock(&buf->rb_reqslock);
+ while (!list_empty(&buf->rb_allreqs)) {
+ struct rpcrdma_req *req;
- for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
- rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
- rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
-}
+ req = list_first_entry(&buf->rb_allreqs,
+ struct rpcrdma_req, rl_all);
+ list_del(&req->rl_all);
-static void
-rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
-{
- buf->rb_send_bufs[--buf->rb_send_index] = req;
- req->rl_niovs = 0;
- if (req->rl_reply) {
- buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
- req->rl_reply->rr_func = NULL;
- req->rl_reply = NULL;
+ spin_unlock(&buf->rb_reqslock);
+ rpcrdma_destroy_req(ia, req);
+ spin_lock(&buf->rb_reqslock);
}
-}
-
-/* rpcrdma_unmap_one() was already done during deregistration.
- * Redo only the ib_post_send().
- */
-static void
-rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
-{
- struct rpcrdma_xprt *r_xprt =
- container_of(ia, struct rpcrdma_xprt, rx_ia);
- struct ib_send_wr invalidate_wr, *bad_wr;
- int rc;
-
- dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
-
- /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
- r->r.frmr.fr_state = FRMR_IS_INVALID;
-
- memset(&invalidate_wr, 0, sizeof(invalidate_wr));
- invalidate_wr.wr_id = (unsigned long)(void *)r;
- invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
-
- dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
- __func__, r, r->r.frmr.fr_mr->rkey);
+ spin_unlock(&buf->rb_reqslock);
- read_lock(&ia->ri_qplock);
- rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
- read_unlock(&ia->ri_qplock);
- if (rc) {
- /* Force rpcrdma_buffer_get() to retry */
- r->r.frmr.fr_state = FRMR_IS_STALE;
- dprintk("RPC: %s: ib_post_send failed, %i\n",
- __func__, rc);
- }
+ ia->ri_ops->ro_destroy(buf);
}
-static void
-rpcrdma_retry_flushed_linv(struct list_head *stale,
- struct rpcrdma_buffer *buf)
+struct rpcrdma_mw *
+rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ia *ia = rdmab_to_ia(buf);
- struct list_head *pos;
- struct rpcrdma_mw *r;
- unsigned long flags;
-
- list_for_each(pos, stale) {
- r = list_entry(pos, struct rpcrdma_mw, mw_list);
- rpcrdma_retry_local_inv(r, ia);
- }
-
- spin_lock_irqsave(&buf->rb_lock, flags);
- list_splice_tail(stale, &buf->rb_mws);
- spin_unlock_irqrestore(&buf->rb_lock, flags);
-}
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_mw *mw = NULL;
-static struct rpcrdma_req *
-rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
- struct list_head *stale)
-{
- struct rpcrdma_mw *r;
- int i;
-
- i = RPCRDMA_MAX_SEGS - 1;
- while (!list_empty(&buf->rb_mws)) {
- r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- if (r->r.frmr.fr_state == FRMR_IS_STALE) {
- list_add(&r->mw_list, stale);
- continue;
- }
- req->rl_segments[i].rl_mw = r;
- if (unlikely(i-- == 0))
- return req; /* Success */
+ spin_lock(&buf->rb_mwlock);
+ if (!list_empty(&buf->rb_mws)) {
+ mw = list_first_entry(&buf->rb_mws,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
}
+ spin_unlock(&buf->rb_mwlock);
- /* Not enough entries on rb_mws for this req */
- rpcrdma_buffer_put_sendbuf(req, buf);
- rpcrdma_buffer_put_mrs(req, buf);
- return NULL;
+ if (!mw)
+ pr_err("RPC: %s: no MWs available\n", __func__);
+ return mw;
}
-static struct rpcrdma_req *
-rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+void
+rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
- struct rpcrdma_mw *r;
- int i;
-
- i = RPCRDMA_MAX_SEGS - 1;
- while (!list_empty(&buf->rb_mws)) {
- r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- req->rl_segments[i].rl_mw = r;
- if (unlikely(i-- == 0))
- return req; /* Success */
- }
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- /* Not enough entries on rb_mws for this req */
- rpcrdma_buffer_put_sendbuf(req, buf);
- rpcrdma_buffer_put_mrs(req, buf);
- return NULL;
+ spin_lock(&buf->rb_mwlock);
+ list_add_tail(&mw->mw_list, &buf->rb_mws);
+ spin_unlock(&buf->rb_mwlock);
}
/*
* Get a set of request/reply buffers.
*
- * Reply buffer (if needed) is attached to send buffer upon return.
- * Rule:
- * rb_send_index and rb_recv_index MUST always be pointing to the
- * *next* available buffer (non-NULL). They are incremented after
- * removing buffers, and decremented *before* returning them.
+ * Reply buffer (if available) is attached to send buffer upon return.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
- struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
- struct list_head stale;
struct rpcrdma_req *req;
- unsigned long flags;
-
- spin_lock_irqsave(&buffers->rb_lock, flags);
- if (buffers->rb_send_index == buffers->rb_max_requests) {
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
- dprintk("RPC: %s: out of request buffers\n", __func__);
- return ((struct rpcrdma_req *)NULL);
- }
- req = buffers->rb_send_bufs[buffers->rb_send_index];
- if (buffers->rb_send_index < buffers->rb_recv_index) {
- dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
- __func__,
- buffers->rb_recv_index - buffers->rb_send_index);
- req->rl_reply = NULL;
- } else {
- req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
- buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
- }
- buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+ spin_lock(&buffers->rb_lock);
+ if (list_empty(&buffers->rb_send_bufs))
+ goto out_reqbuf;
+ req = rpcrdma_buffer_get_req_locked(buffers);
+ if (list_empty(&buffers->rb_recv_bufs))
+ goto out_repbuf;
+ req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ spin_unlock(&buffers->rb_lock);
+ return req;
- INIT_LIST_HEAD(&stale);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
- break;
- case RPCRDMA_MTHCAFMR:
- req = rpcrdma_buffer_get_fmrs(req, buffers);
- break;
- default:
- break;
- }
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
- if (!list_empty(&stale))
- rpcrdma_retry_flushed_linv(&stale, buffers);
+out_reqbuf:
+ spin_unlock(&buffers->rb_lock);
+ pr_warn("RPC: %s: out of request buffers\n", __func__);
+ return NULL;
+out_repbuf:
+ spin_unlock(&buffers->rb_lock);
+ pr_warn("RPC: %s: out of reply buffers\n", __func__);
+ req->rl_reply = NULL;
return req;
}
@@ -1381,39 +1128,31 @@ void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
- struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
- unsigned long flags;
+ struct rpcrdma_rep *rep = req->rl_reply;
- spin_lock_irqsave(&buffers->rb_lock, flags);
- rpcrdma_buffer_put_sendbuf(req, buffers);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- case RPCRDMA_MTHCAFMR:
- rpcrdma_buffer_put_mrs(req, buffers);
- break;
- default:
- break;
- }
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ req->rl_niovs = 0;
+ req->rl_reply = NULL;
+
+ spin_lock(&buffers->rb_lock);
+ list_add_tail(&req->rl_free, &buffers->rb_send_bufs);
+ if (rep)
+ list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ spin_unlock(&buffers->rb_lock);
}
/*
* Recover reply buffers from pool.
- * This happens when recovering from error conditions.
- * Post-increment counter/array index.
+ * This happens when recovering from disconnect.
*/
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
- unsigned long flags;
- spin_lock_irqsave(&buffers->rb_lock, flags);
- if (buffers->rb_recv_index < buffers->rb_max_requests) {
- req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
- buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
- }
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ spin_lock(&buffers->rb_lock);
+ if (!list_empty(&buffers->rb_recv_bufs))
+ req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ spin_unlock(&buffers->rb_lock);
}
/*
@@ -1423,13 +1162,11 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
- struct rpcrdma_buffer *buffers = rep->rr_buffer;
- unsigned long flags;
+ struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
- rep->rr_func = NULL;
- spin_lock_irqsave(&buffers->rb_lock, flags);
- buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ spin_lock(&buffers->rb_lock);
+ list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ spin_unlock(&buffers->rb_lock);
}
/*
@@ -1444,75 +1181,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
(unsigned long long)seg->mr_dma, seg->mr_dmalen);
}
-static int
-rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
- struct ib_mr **mrp, struct ib_sge *iov)
-{
- struct ib_phys_buf ipb;
- struct ib_mr *mr;
- int rc;
-
- /*
- * All memory passed here was kmalloc'ed, therefore phys-contiguous.
- */
- iov->addr = ib_dma_map_single(ia->ri_id->device,
- va, len, DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
- return -ENOMEM;
-
- iov->length = len;
-
- if (ia->ri_have_dma_lkey) {
- *mrp = NULL;
- iov->lkey = ia->ri_dma_lkey;
- return 0;
- } else if (ia->ri_bind_mem != NULL) {
- *mrp = NULL;
- iov->lkey = ia->ri_bind_mem->lkey;
- return 0;
- }
-
- ipb.addr = iov->addr;
- ipb.size = iov->length;
- mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
- IB_ACCESS_LOCAL_WRITE, &iov->addr);
-
- dprintk("RPC: %s: phys convert: 0x%llx "
- "registered 0x%llx length %d\n",
- __func__, (unsigned long long)ipb.addr,
- (unsigned long long)iov->addr, len);
-
- if (IS_ERR(mr)) {
- *mrp = NULL;
- rc = PTR_ERR(mr);
- dprintk("RPC: %s: failed with %i\n", __func__, rc);
- } else {
- *mrp = mr;
- iov->lkey = mr->lkey;
- rc = 0;
- }
-
- return rc;
-}
-
-static int
-rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
- struct ib_mr *mr, struct ib_sge *iov)
-{
- int rc;
-
- ib_dma_unmap_single(ia->ri_id->device,
- iov->addr, iov->length, DMA_BIDIRECTIONAL);
-
- if (NULL == mr)
- return 0;
-
- rc = ib_dereg_mr(mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
- return rc;
-}
-
/**
* rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
* @ia: controlling rpcrdma_ia
@@ -1532,26 +1200,29 @@ struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
struct rpcrdma_regbuf *rb;
- int rc;
+ struct ib_sge *iov;
- rc = -ENOMEM;
rb = kmalloc(sizeof(*rb) + size, flags);
if (rb == NULL)
goto out;
- rb->rg_size = size;
- rb->rg_owner = NULL;
- rc = rpcrdma_register_internal(ia, rb->rg_base, size,
- &rb->rg_mr, &rb->rg_iov);
- if (rc)
+ iov = &rb->rg_iov;
+ iov->addr = ib_dma_map_single(ia->ri_device,
+ (void *)rb->rg_base, size,
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(ia->ri_device, iov->addr))
goto out_free;
+ iov->length = size;
+ iov->lkey = ia->ri_pd->local_dma_lkey;
+ rb->rg_size = size;
+ rb->rg_owner = NULL;
return rb;
out_free:
kfree(rb);
out:
- return ERR_PTR(rc);
+ return ERR_PTR(-ENOMEM);
}
/**
@@ -1562,10 +1233,15 @@ out:
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
- if (rb) {
- rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
- kfree(rb);
- }
+ struct ib_sge *iov;
+
+ if (!rb)
+ return;
+
+ iov = &rb->rg_iov;
+ ib_dma_unmap_single(ia->ri_device,
+ iov->addr, iov->length, DMA_BIDIRECTIONAL);
+ kfree(rb);
}
/*
@@ -1578,9 +1254,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct rpcrdma_ep *ep,
struct rpcrdma_req *req)
{
+ struct ib_device *device = ia->ri_device;
struct ib_send_wr send_wr, *send_wr_fail;
struct rpcrdma_rep *rep = req->rl_reply;
- int rc;
+ struct ib_sge *iov = req->rl_send_iov;
+ int i, rc;
if (rep) {
rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1591,19 +1269,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
send_wr.next = NULL;
send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
- send_wr.sg_list = req->rl_send_iov;
+ send_wr.sg_list = iov;
send_wr.num_sge = req->rl_niovs;
send_wr.opcode = IB_WR_SEND;
- if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
- ib_dma_sync_single_for_device(ia->ri_id->device,
- req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_id->device,
- req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_id->device,
- req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
- DMA_TO_DEVICE);
+
+ for (i = 0; i < send_wr.num_sge; i++)
+ ib_dma_sync_single_for_device(device, iov[i].addr,
+ iov[i].length, DMA_TO_DEVICE);
+ dprintk("RPC: %s: posting %d s/g entries\n",
+ __func__, send_wr.num_sge);
if (DECR_CQCOUNT(ep) > 0)
send_wr.send_flags = 0;
@@ -1636,7 +1310,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
recv_wr.num_sge = 1;
- ib_dma_sync_single_for_cpu(ia->ri_id->device,
+ ib_dma_sync_single_for_cpu(ia->ri_device,
rdmab_addr(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf),
DMA_BIDIRECTIONAL);
@@ -1649,6 +1323,47 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
return rc;
}
+/**
+ * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests
+ * @r_xprt: transport associated with these backchannel resources
+ * @count: minimum number of incoming requests expected
+ *
+ * Returns zero if all requested buffers were posted, or a negative errno.
+ */
+int
+rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
+{
+ struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_rep *rep;
+ unsigned long flags;
+ int rc;
+
+ while (count--) {
+ spin_lock_irqsave(&buffers->rb_lock, flags);
+ if (list_empty(&buffers->rb_recv_bufs))
+ goto out_reqbuf;
+ rep = rpcrdma_buffer_get_rep_locked(buffers);
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+
+ rc = rpcrdma_ep_post_recv(ia, ep, rep);
+ if (rc)
+ goto out_rc;
+ }
+
+ return 0;
+
+out_reqbuf:
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ pr_warn("%s: no extra receive buffers\n", __func__);
+ return -ENOMEM;
+
+out_rc:
+ rpcrdma_recv_buffer_put(rep);
+ return rc;
+}
+
/* How many chunk list items fit within our inline buffers?
*/
unsigned int
diff --git a/kernel/net/sunrpc/xprtrdma/xprt_rdma.h b/kernel/net/sunrpc/xprtrdma/xprt_rdma.h
index 78e0b8bea..ac7f8d4f6 100644
--- a/kernel/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/kernel/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -51,7 +51,6 @@
#include <linux/sunrpc/clnt.h> /* rpc_xprt */
#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
-#include <linux/sunrpc/svc.h> /* RPCSVC_MAXPAYLOAD */
#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
@@ -62,14 +61,12 @@
struct rpcrdma_ia {
const struct rpcrdma_memreg_ops *ri_ops;
rwlock_t ri_qplock;
+ struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
- struct ib_mr *ri_bind_mem;
- u32 ri_dma_lkey;
- int ri_have_dma_lkey;
+ struct ib_mr *ri_dma_mr;
struct completion ri_done;
int ri_async_rc;
- enum rpcrdma_memreg ri_memreg_strategy;
unsigned int ri_max_frmr_depth;
struct ib_device_attr ri_devattr;
struct ib_qp_attr ri_qp_attr;
@@ -80,21 +77,15 @@ struct rpcrdma_ia {
* RDMA Endpoint -- one per transport instance
*/
-#define RPCRDMA_WC_BUDGET (128)
-#define RPCRDMA_POLLSIZE (16)
-
struct rpcrdma_ep {
atomic_t rep_cqcount;
int rep_cqinit;
int rep_connected;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
- struct rpcrdma_regbuf *rep_padbuf;
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
struct delayed_work rep_connect_worker;
- struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE];
- struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE];
};
/*
@@ -110,6 +101,16 @@ struct rpcrdma_ep {
*/
#define RPCRDMA_IGNORE_COMPLETION (0ULL)
+/* Pre-allocate extra Work Requests for handling backward receives
+ * and sends. This is a fixed value because the Work Queues are
+ * allocated when the forward channel is set up.
+ */
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+#define RPCRDMA_BACKWARD_WRS (8)
+#else
+#define RPCRDMA_BACKWARD_WRS (0)
+#endif
+
/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
*
* The below structure appears at the front of a large region of kmalloc'd
@@ -119,7 +120,6 @@ struct rpcrdma_ep {
struct rpcrdma_regbuf {
size_t rg_size;
struct rpcrdma_req *rg_owner;
- struct ib_mr *rg_mr;
struct ib_sge rg_iov;
__be32 rg_base[0] __attribute__ ((aligned(256)));
};
@@ -165,21 +165,22 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* struct rpcrdma_buffer. N is the max number of outstanding requests.
*/
-/* temporary static scatter/gather max */
-#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
+#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
struct rpcrdma_buffer;
struct rpcrdma_rep {
unsigned int rr_len;
- struct rpcrdma_buffer *rr_buffer;
- struct rpc_xprt *rr_xprt;
- void (*rr_func)(struct rpcrdma_rep *);
+ struct ib_device *rr_device;
+ struct rpcrdma_xprt *rr_rxprt;
+ struct work_struct rr_work;
struct list_head rr_list;
struct rpcrdma_regbuf *rr_rdmabuf;
};
+#define RPCRDMA_BAD_LEN (~0U)
+
/*
* struct rpcrdma_mw - external memory region metadata
*
@@ -200,14 +201,22 @@ enum rpcrdma_frmr_state {
};
struct rpcrdma_frmr {
- struct ib_fast_reg_page_list *fr_pgl;
+ struct scatterlist *sg;
+ int sg_nents;
struct ib_mr *fr_mr;
enum rpcrdma_frmr_state fr_state;
+ struct work_struct fr_work;
+ struct rpcrdma_xprt *fr_xprt;
+};
+
+struct rpcrdma_fmr {
+ struct ib_fmr *fmr;
+ u64 *physaddrs;
};
struct rpcrdma_mw {
union {
- struct ib_fmr *fmr;
+ struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
} r;
void (*mw_sendcompletion)(struct ib_wc *);
@@ -252,16 +261,22 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
char *mr_offset; /* kva if no page, else offset */
};
+#define RPCRDMA_MAX_IOVS (2)
+
struct rpcrdma_req {
- unsigned int rl_niovs; /* 0, 2 or 4 */
- unsigned int rl_nchunks; /* non-zero if chunks */
- unsigned int rl_connect_cookie; /* retry detection */
- struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+ struct list_head rl_free;
+ unsigned int rl_niovs;
+ unsigned int rl_nchunks;
+ unsigned int rl_connect_cookie;
+ struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
- struct ib_sge rl_send_iov[4]; /* for active requests */
- struct rpcrdma_regbuf *rl_rdmabuf;
- struct rpcrdma_regbuf *rl_sendbuf;
- struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
+ struct rpcrdma_regbuf *rl_rdmabuf;
+ struct rpcrdma_regbuf *rl_sendbuf;
+ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+
+ struct list_head rl_all;
+ bool rl_backchannel;
};
static inline struct rpcrdma_req *
@@ -281,15 +296,19 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
* One of these is associated with a transport instance
*/
struct rpcrdma_buffer {
- spinlock_t rb_lock; /* protects indexes */
- u32 rb_max_requests;/* client max requests */
- struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
- struct list_head rb_all;
- int rb_send_index;
- struct rpcrdma_req **rb_send_bufs;
- int rb_recv_index;
- struct rpcrdma_rep **rb_recv_bufs;
- char *rb_pool;
+ spinlock_t rb_mwlock; /* protect rb_mws list */
+ struct list_head rb_mws;
+ struct list_head rb_all;
+ char *rb_pool;
+
+ spinlock_t rb_lock; /* protect buf lists */
+ struct list_head rb_send_bufs;
+ struct list_head rb_recv_bufs;
+ u32 rb_max_requests;
+
+ u32 rb_bc_srv_max_requests;
+ spinlock_t rb_reqslock; /* protect rb_allreqs */
+ struct list_head rb_allreqs;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -334,6 +353,8 @@ struct rpcrdma_stats {
unsigned long hardway_register_count;
unsigned long failed_marshal_count;
unsigned long bad_reply_count;
+ unsigned long nomsg_call_count;
+ unsigned long bcall_count;
};
/*
@@ -350,7 +371,6 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_create_data_internal *);
size_t (*ro_maxpages)(struct rpcrdma_xprt *);
int (*ro_init)(struct rpcrdma_xprt *);
- void (*ro_reset)(struct rpcrdma_xprt *);
void (*ro_destroy)(struct rpcrdma_buffer *);
const char *ro_displayname;
};
@@ -410,9 +430,14 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
/*
* Buffer calls - xprtrdma/verbs.c
*/
+struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
+struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
+void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
+struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
+void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
@@ -424,6 +449,13 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
struct rpcrdma_regbuf *);
unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
+
+int frwr_alloc_recovery_wq(void);
+void frwr_destroy_recovery_wq(void);
+
+int rpcrdma_alloc_wq(void);
+void rpcrdma_destroy_wq(void);
/*
* Wrappers for chunk registration, shared by read/write chunk code.
@@ -480,6 +512,23 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
*/
int rpcrdma_marshal_req(struct rpc_rqst *);
+/* RPC/RDMA module init - xprtrdma/transport.c
+ */
+int xprt_rdma_init(void);
+void xprt_rdma_cleanup(void);
+
+/* Backchannel calls - xprtrdma/backchannel.c
+ */
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
+int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
+void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
+int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
+void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
+void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
/* Temporary NFS request map cache. Created in svc_rdma.c */
extern struct kmem_cache *svc_rdma_map_cachep;
/* WR context cache. Created in svc_rdma.c */
@@ -487,10 +536,4 @@ extern struct kmem_cache *svc_rdma_ctxt_cachep;
/* Workqueue created in svc_rdma.c */
extern struct workqueue_struct *svc_rdma_wq;
-#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
-#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD
-#else
-#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
-#endif
-
#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */
diff --git a/kernel/net/sunrpc/xprtsock.c b/kernel/net/sunrpc/xprtsock.c
index 5e3ad598d..027c9ef8a 100644
--- a/kernel/net/sunrpc/xprtsock.c
+++ b/kernel/net/sunrpc/xprtsock.c
@@ -360,8 +360,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
int flags = XS_SENDMSG_FLAGS;
remainder -= len;
- if (remainder != 0 || more)
+ if (more)
flags |= MSG_MORE;
+ if (remainder != 0)
+ flags |= MSG_SENDPAGE_NOTLAST | MSG_MORE;
err = do_sendpage(sock, *ppage, base, len, flags);
if (remainder == 0 || err != len)
break;
@@ -396,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
if (unlikely(!sock))
return -ENOTSOCK;
- clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
if (base != 0) {
addr = NULL;
addrlen = 0;
@@ -440,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task)
struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
transport->inet->sk_write_pending--;
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
}
/**
@@ -465,20 +465,11 @@ static int xs_nospace(struct rpc_task *task)
/* Don't race with disconnect */
if (xprt_connected(xprt)) {
- if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
- /*
- * Notify TCP that we're limited by the application
- * window size
- */
- set_bit(SOCK_NOSPACE, &transport->sock->flags);
- sk->sk_write_pending++;
- /* ...and wait for more buffer space */
- xprt_wait_for_buffer_space(task, xs_nospace_callback);
- }
- } else {
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+ /* wait for more buffer space */
+ sk->sk_write_pending++;
+ xprt_wait_for_buffer_space(task, xs_nospace_callback);
+ } else
ret = -ENOTCONN;
- }
spin_unlock_bh(&xprt->transport_lock);
@@ -527,6 +518,10 @@ static int xs_local_send_request(struct rpc_task *task)
true, &sent);
dprintk("RPC: %s(%u) = %d\n",
__func__, xdr->len - req->rq_bytes_sent, status);
+
+ if (status == -EAGAIN && sock_writeable(transport->inet))
+ status = -ENOBUFS;
+
if (likely(sent > 0) || status == 0) {
req->rq_bytes_sent += sent;
req->rq_xmit_bytes_sent += sent;
@@ -539,6 +534,7 @@ static int xs_local_send_request(struct rpc_task *task)
switch (status) {
case -ENOBUFS:
+ break;
case -EAGAIN:
status = xs_nospace(task);
break;
@@ -589,6 +585,9 @@ static int xs_udp_send_request(struct rpc_task *task)
if (status == -EPERM)
goto process_status;
+ if (status == -EAGAIN && sock_writeable(transport->inet))
+ status = -ENOBUFS;
+
if (sent > 0 || status == 0) {
req->rq_xmit_bytes_sent += sent;
if (sent >= req->rq_slen)
@@ -606,9 +605,6 @@ process_status:
case -EAGAIN:
status = xs_nospace(task);
break;
- default:
- dprintk("RPC: sendmsg returned unrecognized error %d\n",
- -status);
case -ENETUNREACH:
case -ENOBUFS:
case -EPIPE:
@@ -616,31 +612,16 @@ process_status:
case -EPERM:
/* When the server has died, an ICMP port unreachable message
* prompts ECONNREFUSED. */
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+ break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
}
return status;
}
/**
- * xs_tcp_shutdown - gracefully shut down a TCP socket
- * @xprt: transport
- *
- * Initiates a graceful shutdown of the TCP socket by calling the
- * equivalent of shutdown(SHUT_RDWR);
- */
-static void xs_tcp_shutdown(struct rpc_xprt *xprt)
-{
- struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
- struct socket *sock = transport->sock;
-
- if (sock != NULL) {
- kernel_sock_shutdown(sock, SHUT_RDWR);
- trace_rpc_socket_shutdown(xprt, sock);
- }
-}
-
-/**
* xs_tcp_send_request - write an RPC request to a TCP socket
* @task: address of RPC task that manages the state of an RPC request
*
@@ -687,9 +668,6 @@ static int xs_tcp_send_request(struct rpc_task *task)
dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
xdr->len - req->rq_bytes_sent, status);
- if (unlikely(sent == 0 && status < 0))
- break;
-
/* If we've sent the entire packet, immediately
* reset the count of bytes sent. */
req->rq_bytes_sent += sent;
@@ -699,30 +677,34 @@ static int xs_tcp_send_request(struct rpc_task *task)
return 0;
}
- if (sent != 0)
- continue;
- status = -EAGAIN;
- break;
+ if (status < 0)
+ break;
+ if (sent == 0) {
+ status = -EAGAIN;
+ break;
+ }
}
+ if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
+ status = -ENOBUFS;
switch (status) {
case -ENOTSOCK:
status = -ENOTCONN;
/* Should we call xs_close() here? */
break;
- case -ENOBUFS:
case -EAGAIN:
status = xs_nospace(task);
break;
- default:
- dprintk("RPC: sendmsg returned unrecognized error %d\n",
- -status);
case -ECONNRESET:
case -ECONNREFUSED:
case -ENOTCONN:
case -EADDRINUSE:
+ case -ENOBUFS:
case -EPIPE:
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+ break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
}
return status;
@@ -827,6 +809,12 @@ static void xs_reset_transport(struct sock_xprt *transport)
if (sk == NULL)
return;
+ if (atomic_read(&transport->xprt.swapper))
+ sk_clear_memalloc(sk);
+
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+
+ mutex_lock(&transport->recv_mutex);
write_lock_bh(&sk->sk_callback_lock);
transport->inet = NULL;
transport->sock = NULL;
@@ -837,6 +825,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
xprt_clear_connected(xprt);
write_unlock_bh(&sk->sk_callback_lock);
xs_sock_reset_connection_flags(xprt);
+ mutex_unlock(&transport->recv_mutex);
trace_rpc_socket_close(xprt, sock);
sock_release(sock);
@@ -864,6 +853,13 @@ static void xs_close(struct rpc_xprt *xprt)
xprt_disconnect_done(xprt);
}
+static void xs_inject_disconnect(struct rpc_xprt *xprt)
+{
+ dprintk("RPC: injecting transport disconnect on xprt=%p\n",
+ xprt);
+ xprt_disconnect_done(xprt);
+}
+
static void xs_xprt_free(struct rpc_xprt *xprt)
{
xs_free_peer_addresses(xprt);
@@ -877,9 +873,13 @@ static void xs_xprt_free(struct rpc_xprt *xprt)
*/
static void xs_destroy(struct rpc_xprt *xprt)
{
+ struct sock_xprt *transport = container_of(xprt,
+ struct sock_xprt, xprt);
dprintk("RPC: xs_destroy xprt %p\n", xprt);
+ cancel_delayed_work_sync(&transport->connect_worker);
xs_close(xprt);
+ cancel_work_sync(&transport->recv_worker);
xs_xprt_free(xprt);
module_put(THIS_MODULE);
}
@@ -900,45 +900,36 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
}
/**
- * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets
- * @sk: socket with data to read
- * @len: how much data to read
+ * xs_local_data_read_skb
+ * @xprt: transport
+ * @sk: socket
+ * @skb: skbuff
*
* Currently this assumes we can read the whole reply in a single gulp.
*/
-static void xs_local_data_ready(struct sock *sk)
+static void xs_local_data_read_skb(struct rpc_xprt *xprt,
+ struct sock *sk,
+ struct sk_buff *skb)
{
struct rpc_task *task;
- struct rpc_xprt *xprt;
struct rpc_rqst *rovr;
- struct sk_buff *skb;
- int err, repsize, copied;
+ int repsize, copied;
u32 _xid;
__be32 *xp;
- read_lock_bh(&sk->sk_callback_lock);
- dprintk("RPC: %s...\n", __func__);
- xprt = xprt_from_sock(sk);
- if (xprt == NULL)
- goto out;
-
- skb = skb_recv_datagram(sk, 0, 1, &err);
- if (skb == NULL)
- goto out;
-
repsize = skb->len - sizeof(rpc_fraghdr);
if (repsize < 4) {
dprintk("RPC: impossible RPC reply size %d\n", repsize);
- goto dropit;
+ return;
}
/* Copy the XID from the skb... */
xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid);
if (xp == NULL)
- goto dropit;
+ return;
/* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
rovr = xprt_lookup_rqst(xprt, *xp);
if (!rovr)
goto out_unlock;
@@ -956,51 +947,68 @@ static void xs_local_data_ready(struct sock *sk)
xprt_complete_rqst(task, copied);
out_unlock:
- spin_unlock(&xprt->transport_lock);
- dropit:
- skb_free_datagram(sk, skb);
- out:
- read_unlock_bh(&sk->sk_callback_lock);
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+static void xs_local_data_receive(struct sock_xprt *transport)
+{
+ struct sk_buff *skb;
+ struct sock *sk;
+ int err;
+
+ mutex_lock(&transport->recv_mutex);
+ sk = transport->inet;
+ if (sk == NULL)
+ goto out;
+ for (;;) {
+ skb = skb_recv_datagram(sk, 0, 1, &err);
+ if (skb == NULL)
+ break;
+ xs_local_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram(sk, skb);
+ }
+out:
+ mutex_unlock(&transport->recv_mutex);
+}
+
+static void xs_local_data_receive_workfn(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, recv_worker);
+ xs_local_data_receive(transport);
}
/**
- * xs_udp_data_ready - "data ready" callback for UDP sockets
- * @sk: socket with data to read
- * @len: how much data to read
+ * xs_udp_data_read_skb - receive callback for UDP sockets
+ * @xprt: transport
+ * @sk: socket
+ * @skb: skbuff
*
*/
-static void xs_udp_data_ready(struct sock *sk)
+static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
+ struct sock *sk,
+ struct sk_buff *skb)
{
struct rpc_task *task;
- struct rpc_xprt *xprt;
struct rpc_rqst *rovr;
- struct sk_buff *skb;
- int err, repsize, copied;
+ int repsize, copied;
u32 _xid;
__be32 *xp;
- read_lock_bh(&sk->sk_callback_lock);
- dprintk("RPC: xs_udp_data_ready...\n");
- if (!(xprt = xprt_from_sock(sk)))
- goto out;
-
- if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
- goto out;
-
repsize = skb->len - sizeof(struct udphdr);
if (repsize < 4) {
dprintk("RPC: impossible RPC reply size %d!\n", repsize);
- goto dropit;
+ return;
}
/* Copy the XID from the skb... */
xp = skb_header_pointer(skb, sizeof(struct udphdr),
sizeof(_xid), &_xid);
if (xp == NULL)
- goto dropit;
+ return;
/* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
rovr = xprt_lookup_rqst(xprt, *xp);
if (!rovr)
goto out_unlock;
@@ -1021,10 +1029,54 @@ static void xs_udp_data_ready(struct sock *sk)
xprt_complete_rqst(task, copied);
out_unlock:
- spin_unlock(&xprt->transport_lock);
- dropit:
- skb_free_datagram(sk, skb);
- out:
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+static void xs_udp_data_receive(struct sock_xprt *transport)
+{
+ struct sk_buff *skb;
+ struct sock *sk;
+ int err;
+
+ mutex_lock(&transport->recv_mutex);
+ sk = transport->inet;
+ if (sk == NULL)
+ goto out;
+ for (;;) {
+ skb = skb_recv_datagram(sk, 0, 1, &err);
+ if (skb == NULL)
+ break;
+ xs_udp_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram(sk, skb);
+ }
+out:
+ mutex_unlock(&transport->recv_mutex);
+}
+
+static void xs_udp_data_receive_workfn(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, recv_worker);
+ xs_udp_data_receive(transport);
+}
+
+/**
+ * xs_data_ready - "data ready" callback for AF_LOCAL and UDP sockets
+ * @sk: socket with data to read
+ *
+ */
+static void xs_data_ready(struct sock *sk)
+{
+ struct rpc_xprt *xprt;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ dprintk("RPC: xs_data_ready...\n");
+ xprt = xprt_from_sock(sk);
+ if (xprt != NULL) {
+ struct sock_xprt *transport = container_of(xprt,
+ struct sock_xprt, xprt);
+ queue_work(rpciod_workqueue, &transport->recv_worker);
+ }
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1239,12 +1291,12 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
/* Find and lock the request corresponding to this xid */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
req = xprt_lookup_rqst(xprt, transport->tcp_xid);
if (!req) {
dprintk("RPC: XID %08x request not found!\n",
ntohl(transport->tcp_xid));
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return -1;
}
@@ -1253,7 +1305,7 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_rqst(req->rq_task, transport->tcp_copied);
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return 0;
}
@@ -1273,10 +1325,10 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
struct rpc_rqst *req;
/* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
if (req == NULL) {
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
printk(KERN_WARNING "Callback slot table overflowed\n");
xprt_force_disconnect(xprt);
return -1;
@@ -1287,7 +1339,7 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_bc_request(req, transport->tcp_copied);
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return 0;
}
@@ -1302,6 +1354,17 @@ static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
xs_tcp_read_reply(xprt, desc) :
xs_tcp_read_callback(xprt, desc);
}
+
+static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
+{
+ int ret;
+
+ ret = svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
+ SVC_SOCK_ANONYMOUS);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
#else
static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
struct xdr_skb_reader *desc)
@@ -1387,42 +1450,69 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
return len - desc.count;
}
+static void xs_tcp_data_receive(struct sock_xprt *transport)
+{
+ struct rpc_xprt *xprt = &transport->xprt;
+ struct sock *sk;
+ read_descriptor_t rd_desc = {
+ .count = 2*1024*1024,
+ .arg.data = xprt,
+ };
+ unsigned long total = 0;
+ int read = 0;
+
+ mutex_lock(&transport->recv_mutex);
+ sk = transport->inet;
+ if (sk == NULL)
+ goto out;
+
+ /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
+ for (;;) {
+ lock_sock(sk);
+ read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+ release_sock(sk);
+ if (read <= 0)
+ break;
+ total += read;
+ rd_desc.count = 65536;
+ }
+out:
+ mutex_unlock(&transport->recv_mutex);
+ trace_xs_tcp_data_ready(xprt, read, total);
+}
+
+static void xs_tcp_data_receive_workfn(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, recv_worker);
+ xs_tcp_data_receive(transport);
+}
+
/**
* xs_tcp_data_ready - "data ready" callback for TCP sockets
* @sk: socket with data to read
- * @bytes: how much data to read
*
*/
static void xs_tcp_data_ready(struct sock *sk)
{
+ struct sock_xprt *transport;
struct rpc_xprt *xprt;
- read_descriptor_t rd_desc;
- int read;
- unsigned long total = 0;
dprintk("RPC: xs_tcp_data_ready...\n");
read_lock_bh(&sk->sk_callback_lock);
- if (!(xprt = xprt_from_sock(sk))) {
- read = 0;
+ if (!(xprt = xprt_from_sock(sk)))
goto out;
- }
+ transport = container_of(xprt, struct sock_xprt, xprt);
+
/* Any data means we had a useful conversation, so
* we don't need to delay the next reconnect
*/
if (xprt->reestablish_timeout)
xprt->reestablish_timeout = 0;
+ queue_work(rpciod_workqueue, &transport->recv_worker);
- /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
- rd_desc.arg.data = xprt;
- do {
- rd_desc.count = 65536;
- read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
- if (read > 0)
- total += read;
- } while (read > 0);
out:
- trace_xs_tcp_data_ready(xprt, read, total);
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1508,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk)
static void xs_write_space(struct sock *sk)
{
- struct socket *sock;
+ struct socket_wq *wq;
struct rpc_xprt *xprt;
- if (unlikely(!(sock = sk->sk_socket)))
+ if (!sk->sk_socket)
return;
- clear_bit(SOCK_NOSPACE, &sock->flags);
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
if (unlikely(!(xprt = xprt_from_sock(sk))))
return;
- if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0)
- return;
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
+ goto out;
xprt_write_space(xprt);
+out:
+ rcu_read_unlock();
}
/**
@@ -1870,10 +1964,10 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt;
- sk->sk_data_ready = xs_local_data_ready;
+ sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
sk->sk_error_report = xs_error_report;
- sk->sk_allocation = GFP_ATOMIC;
+ sk->sk_allocation = GFP_NOIO;
xprt_clear_connected(xprt);
@@ -1892,9 +1986,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
/**
* xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint
- * @xprt: RPC transport to connect
* @transport: socket transport to connect
- * @create_sock: function to create a socket of the correct type
*/
static int xs_local_setup_socket(struct sock_xprt *transport)
{
@@ -1966,43 +2058,84 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
msleep_interruptible(15000);
}
-#ifdef CONFIG_SUNRPC_SWAP
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+/*
+ * Note that this should be called with XPRT_LOCKED held (or when we otherwise
+ * know that we have exclusive access to the socket), to guard against
+ * races with xs_reset_transport.
+ */
static void xs_set_memalloc(struct rpc_xprt *xprt)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
xprt);
- if (xprt->swapper)
+ /*
+ * If there's no sock, then we have nothing to set. The
+ * reconnecting process will get it for us.
+ */
+ if (!transport->inet)
+ return;
+ if (atomic_read(&xprt->swapper))
sk_set_memalloc(transport->inet);
}
/**
- * xs_swapper - Tag this transport as being used for swap.
+ * xs_enable_swap - Tag this transport as being used for swap.
* @xprt: transport to tag
- * @enable: enable/disable
*
+ * Take a reference to this transport on behalf of the rpc_clnt, and
+ * optionally mark it for swapping if it wasn't already.
*/
-int xs_swapper(struct rpc_xprt *xprt, int enable)
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
{
- struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
- xprt);
- int err = 0;
+ struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
- if (enable) {
- xprt->swapper++;
- xs_set_memalloc(xprt);
- } else if (xprt->swapper) {
- xprt->swapper--;
- sk_clear_memalloc(transport->inet);
- }
+ if (atomic_inc_return(&xprt->swapper) != 1)
+ return 0;
+ if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
+ return -ERESTARTSYS;
+ if (xs->inet)
+ sk_set_memalloc(xs->inet);
+ xprt_release_xprt(xprt, NULL);
+ return 0;
+}
- return err;
+/**
+ * xs_disable_swap - Untag this transport as being used for swap.
+ * @xprt: transport to tag
+ *
+ * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the
+ * swapper refcount goes to 0, untag the socket as a memalloc socket.
+ */
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
+
+ if (!atomic_dec_and_test(&xprt->swapper))
+ return;
+ if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
+ return;
+ if (xs->inet)
+ sk_clear_memalloc(xs->inet);
+ xprt_release_xprt(xprt, NULL);
}
-EXPORT_SYMBOL_GPL(xs_swapper);
#else
static void xs_set_memalloc(struct rpc_xprt *xprt)
{
}
+
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
+{
+ return -EINVAL;
+}
+
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
+{
+}
#endif
static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
@@ -2017,9 +2150,9 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt;
- sk->sk_data_ready = xs_udp_data_ready;
+ sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
- sk->sk_allocation = GFP_ATOMIC;
+ sk->sk_allocation = GFP_NOIO;
xprt_set_connected(xprt);
@@ -2063,6 +2196,27 @@ out:
xprt_wake_pending_tasks(xprt, status);
}
+/**
+ * xs_tcp_shutdown - gracefully shut down a TCP socket
+ * @xprt: transport
+ *
+ * Initiates a graceful shutdown of the TCP socket by calling the
+ * equivalent of shutdown(SHUT_RDWR);
+ */
+static void xs_tcp_shutdown(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct socket *sock = transport->sock;
+
+ if (sock == NULL)
+ return;
+ if (xprt_connected(xprt)) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ trace_rpc_socket_shutdown(xprt, sock);
+ } else
+ xs_reset_transport(transport);
+}
+
static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -2073,6 +2227,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
unsigned int keepidle = xprt->timeout->to_initval / HZ;
unsigned int keepcnt = xprt->timeout->to_retries + 1;
unsigned int opt_on = 1;
+ unsigned int timeo;
/* TCP Keepalive options */
kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2084,6 +2239,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
(char *)&keepcnt, sizeof(keepcnt));
+ /* TCP user timeout (see RFC5482) */
+ timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
+ (xprt->timeout->to_retries + 1);
+ kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+ (char *)&timeo, sizeof(timeo));
+
write_lock_bh(&sk->sk_callback_lock);
xs_save_old_callbacks(transport, sk);
@@ -2093,7 +2254,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
sk->sk_error_report = xs_error_report;
- sk->sk_allocation = GFP_ATOMIC;
+ sk->sk_allocation = GFP_NOIO;
/* socket options */
sock_reset_flag(sk, SOCK_LINGER);
@@ -2132,9 +2293,6 @@ out:
/**
* xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
- * @xprt: RPC transport to connect
- * @transport: socket transport to connect
- * @create_sock: function to create a socket of the correct type
*
* Invoked by a work queue tasklet.
*/
@@ -2405,7 +2563,7 @@ static int bc_send_request(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct svc_xprt *xprt;
- u32 len;
+ int len;
dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
/*
@@ -2470,6 +2628,8 @@ static struct rpc_xprt_ops xs_local_ops = {
.close = xs_close,
.destroy = xs_destroy,
.print_stats = xs_local_print_stats,
+ .enable_swap = xs_enable_swap,
+ .disable_swap = xs_disable_swap,
};
static struct rpc_xprt_ops xs_udp_ops = {
@@ -2489,6 +2649,9 @@ static struct rpc_xprt_ops xs_udp_ops = {
.close = xs_close,
.destroy = xs_destroy,
.print_stats = xs_udp_print_stats,
+ .enable_swap = xs_enable_swap,
+ .disable_swap = xs_disable_swap,
+ .inject_disconnect = xs_inject_disconnect,
};
static struct rpc_xprt_ops xs_tcp_ops = {
@@ -2505,6 +2668,15 @@ static struct rpc_xprt_ops xs_tcp_ops = {
.close = xs_tcp_shutdown,
.destroy = xs_destroy,
.print_stats = xs_tcp_print_stats,
+ .enable_swap = xs_enable_swap,
+ .disable_swap = xs_disable_swap,
+ .inject_disconnect = xs_inject_disconnect,
+#ifdef CONFIG_SUNRPC_BACKCHANNEL
+ .bc_setup = xprt_setup_bc,
+ .bc_up = xs_tcp_bc_up,
+ .bc_free_rqst = xprt_free_bc_rqst,
+ .bc_destroy = xprt_destroy_bc,
+#endif
};
/*
@@ -2522,6 +2694,9 @@ static struct rpc_xprt_ops bc_tcp_ops = {
.close = bc_close,
.destroy = bc_destroy,
.print_stats = xs_tcp_print_stats,
+ .enable_swap = xs_enable_swap,
+ .disable_swap = xs_disable_swap,
+ .inject_disconnect = xs_inject_disconnect,
};
static int xs_init_anyaddr(const int family, struct sockaddr *sap)
@@ -2572,6 +2747,7 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
}
new = container_of(xprt, struct sock_xprt, xprt);
+ mutex_init(&new->recv_mutex);
memcpy(&xprt->addr, args->dstaddr, args->addrlen);
xprt->addrlen = args->addrlen;
if (args->srcaddr)
@@ -2625,6 +2801,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
xprt->ops = &xs_local_ops;
xprt->timeout = &xs_local_default_timeout;
+ INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn);
INIT_DELAYED_WORK(&transport->connect_worker,
xs_dummy_setup_socket);
@@ -2696,21 +2873,20 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
xprt->timeout = &xs_udp_default_timeout;
+ INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn);
+ INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket);
+
switch (addr->sa_family) {
case AF_INET:
if (((struct sockaddr_in *)addr)->sin_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_udp_setup_socket);
xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP);
break;
case AF_INET6:
if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_udp_setup_socket);
xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6);
break;
default:
@@ -2775,21 +2951,20 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
xprt->ops = &xs_tcp_ops;
xprt->timeout = &xs_tcp_default_timeout;
+ INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
+ INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
+
switch (addr->sa_family) {
case AF_INET:
if (((struct sockaddr_in *)addr)->sin_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_tcp_setup_socket);
xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
break;
case AF_INET6:
if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_tcp_setup_socket);
xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
break;
default:
@@ -2989,7 +3164,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp)
RPC_MAX_RESVPORT);
}
-static struct kernel_param_ops param_ops_portnr = {
+static const struct kernel_param_ops param_ops_portnr = {
.set = param_set_portnr,
.get = param_get_uint,
};
@@ -3008,7 +3183,7 @@ static int param_set_slot_table_size(const char *val,
RPC_MAX_SLOT_TABLE);
}
-static struct kernel_param_ops param_ops_slot_table_size = {
+static const struct kernel_param_ops param_ops_slot_table_size = {
.set = param_set_slot_table_size,
.get = param_get_uint,
};
@@ -3024,7 +3199,7 @@ static int param_set_max_slot_table_size(const char *val,
RPC_MAX_SLOT_TABLE_LIMIT);
}
-static struct kernel_param_ops param_ops_max_slot_table_size = {
+static const struct kernel_param_ops param_ops_max_slot_table_size = {
.set = param_set_max_slot_table_size,
.get = param_get_uint,
};
diff --git a/kernel/net/switchdev/switchdev.c b/kernel/net/switchdev/switchdev.c
index 055453d48..d5d7132ac 100644
--- a/kernel/net/switchdev/switchdev.c
+++ b/kernel/net/switchdev/switchdev.c
@@ -1,6 +1,6 @@
/*
* net/switchdev/switchdev.c - Switch device API
- * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us>
+ * Copyright (c) 2014-2015 Jiri Pirko <jiri@resnulli.us>
* Copyright (c) 2014-2015 Scott Feldman <sfeldma@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
@@ -15,97 +15,598 @@
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/if_vlan.h>
+#include <linux/rtnetlink.h>
#include <net/ip_fib.h>
#include <net/switchdev.h>
/**
- * netdev_switch_parent_id_get - Get ID of a switch
+ * switchdev_trans_item_enqueue - Enqueue data item to transaction queue
+ *
+ * @trans: transaction
+ * @data: pointer to data being queued
+ * @destructor: data destructor
+ * @tritem: transaction item being queued
+ *
+ * Enqueue data item to transaction queue. tritem is typically placed in
+ * a container pointed at by the data pointer. Destructor is called on
+ * transaction abort and after successful commit phase in case
+ * the caller did not dequeue the item before.
+ */
+void switchdev_trans_item_enqueue(struct switchdev_trans *trans,
+ void *data, void (*destructor)(void const *),
+ struct switchdev_trans_item *tritem)
+{
+ tritem->data = data;
+ tritem->destructor = destructor;
+ list_add_tail(&tritem->list, &trans->item_list);
+}
+EXPORT_SYMBOL_GPL(switchdev_trans_item_enqueue);
+
+static struct switchdev_trans_item *
+__switchdev_trans_item_dequeue(struct switchdev_trans *trans)
+{
+ struct switchdev_trans_item *tritem;
+
+ if (list_empty(&trans->item_list))
+ return NULL;
+ tritem = list_first_entry(&trans->item_list,
+ struct switchdev_trans_item, list);
+ list_del(&tritem->list);
+ return tritem;
+}
+
+/**
+ * switchdev_trans_item_dequeue - Dequeue data item from transaction queue
+ *
+ * @trans: transaction
+ */
+void *switchdev_trans_item_dequeue(struct switchdev_trans *trans)
+{
+ struct switchdev_trans_item *tritem;
+
+ tritem = __switchdev_trans_item_dequeue(trans);
+ BUG_ON(!tritem);
+ return tritem->data;
+}
+EXPORT_SYMBOL_GPL(switchdev_trans_item_dequeue);
+
+static void switchdev_trans_init(struct switchdev_trans *trans)
+{
+ INIT_LIST_HEAD(&trans->item_list);
+}
+
+static void switchdev_trans_items_destroy(struct switchdev_trans *trans)
+{
+ struct switchdev_trans_item *tritem;
+
+ while ((tritem = __switchdev_trans_item_dequeue(trans)))
+ tritem->destructor(tritem->data);
+}
+
+static void switchdev_trans_items_warn_destroy(struct net_device *dev,
+ struct switchdev_trans *trans)
+{
+ WARN(!list_empty(&trans->item_list), "%s: transaction item queue is not empty.\n",
+ dev->name);
+ switchdev_trans_items_destroy(trans);
+}
+
+static LIST_HEAD(deferred);
+static DEFINE_SPINLOCK(deferred_lock);
+
+typedef void switchdev_deferred_func_t(struct net_device *dev,
+ const void *data);
+
+struct switchdev_deferred_item {
+ struct list_head list;
+ struct net_device *dev;
+ switchdev_deferred_func_t *func;
+ unsigned long data[0];
+};
+
+static struct switchdev_deferred_item *switchdev_deferred_dequeue(void)
+{
+ struct switchdev_deferred_item *dfitem;
+
+ spin_lock_bh(&deferred_lock);
+ if (list_empty(&deferred)) {
+ dfitem = NULL;
+ goto unlock;
+ }
+ dfitem = list_first_entry(&deferred,
+ struct switchdev_deferred_item, list);
+ list_del(&dfitem->list);
+unlock:
+ spin_unlock_bh(&deferred_lock);
+ return dfitem;
+}
+
+/**
+ * switchdev_deferred_process - Process ops in deferred queue
+ *
+ * Called to flush the ops currently queued in deferred ops queue.
+ * rtnl_lock must be held.
+ */
+void switchdev_deferred_process(void)
+{
+ struct switchdev_deferred_item *dfitem;
+
+ ASSERT_RTNL();
+
+ while ((dfitem = switchdev_deferred_dequeue())) {
+ dfitem->func(dfitem->dev, dfitem->data);
+ dev_put(dfitem->dev);
+ kfree(dfitem);
+ }
+}
+EXPORT_SYMBOL_GPL(switchdev_deferred_process);
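A minimal usage sketch (not part of this patch): a caller that needs any pending deferred ops applied before continuing, for example before tearing a port down, can flush the queue directly; the only requirement is that rtnl_lock is held around the call:
	rtnl_lock();
	switchdev_deferred_process();	/* run any queued attr/obj ops now */
	rtnl_unlock();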
+
+static void switchdev_deferred_process_work(struct work_struct *work)
+{
+ rtnl_lock();
+ switchdev_deferred_process();
+ rtnl_unlock();
+}
+
+static DECLARE_WORK(deferred_process_work, switchdev_deferred_process_work);
+
+static int switchdev_deferred_enqueue(struct net_device *dev,
+ const void *data, size_t data_len,
+ switchdev_deferred_func_t *func)
+{
+ struct switchdev_deferred_item *dfitem;
+
+ dfitem = kmalloc(sizeof(*dfitem) + data_len, GFP_ATOMIC);
+ if (!dfitem)
+ return -ENOMEM;
+ dfitem->dev = dev;
+ dfitem->func = func;
+ memcpy(dfitem->data, data, data_len);
+ dev_hold(dev);
+ spin_lock_bh(&deferred_lock);
+ list_add_tail(&dfitem->list, &deferred);
+ spin_unlock_bh(&deferred_lock);
+ schedule_work(&deferred_process_work);
+ return 0;
+}
+
+/**
+ * switchdev_port_attr_get - Get port attribute
+ *
+ * @dev: port device
+ * @attr: attribute to get
+ */
+int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr)
+{
+ const struct switchdev_ops *ops = dev->switchdev_ops;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ struct switchdev_attr first = {
+ .id = SWITCHDEV_ATTR_ID_UNDEFINED
+ };
+ int err = -EOPNOTSUPP;
+
+ if (ops && ops->switchdev_port_attr_get)
+ return ops->switchdev_port_attr_get(dev, attr);
+
+ if (attr->flags & SWITCHDEV_F_NO_RECURSE)
+ return err;
+
+ /* Switch device port(s) may be stacked under
+ * bond/team/vlan dev, so recurse down to get attr on
+ * each port. Return -ENODATA if attr values don't
+ * compare across ports.
+ */
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = switchdev_port_attr_get(lower_dev, attr);
+ if (err)
+ break;
+ if (first.id == SWITCHDEV_ATTR_ID_UNDEFINED)
+ first = *attr;
+ else if (memcmp(&first, attr, sizeof(*attr)))
+ return -ENODATA;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_attr_get);
+
+static int __switchdev_port_attr_set(struct net_device *dev,
+ const struct switchdev_attr *attr,
+ struct switchdev_trans *trans)
+{
+ const struct switchdev_ops *ops = dev->switchdev_ops;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int err = -EOPNOTSUPP;
+
+ if (ops && ops->switchdev_port_attr_set) {
+ err = ops->switchdev_port_attr_set(dev, attr, trans);
+ goto done;
+ }
+
+ if (attr->flags & SWITCHDEV_F_NO_RECURSE)
+ goto done;
+
+ /* Switch device port(s) may be stacked under
+ * bond/team/vlan dev, so recurse down to set attr on
+ * each port.
+ */
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = __switchdev_port_attr_set(lower_dev, attr, trans);
+ if (err)
+ break;
+ }
+
+done:
+ if (err == -EOPNOTSUPP && attr->flags & SWITCHDEV_F_SKIP_EOPNOTSUPP)
+ err = 0;
+
+ return err;
+}
+
+static int switchdev_port_attr_set_now(struct net_device *dev,
+ const struct switchdev_attr *attr)
+{
+ struct switchdev_trans trans;
+ int err;
+
+ switchdev_trans_init(&trans);
+
+ /* Phase I: prepare for attr set. Driver/device should fail
+ * here if there are going to be issues in the commit phase,
+ * such as lack of resources or support. The driver/device
+ * should reserve resources needed for the commit phase here,
+ * but should not commit the attr.
+ */
+
+ trans.ph_prepare = true;
+ err = __switchdev_port_attr_set(dev, attr, &trans);
+ if (err) {
+ /* Prepare phase failed: abort the transaction. Any
+ * resources reserved in the prepare phase are
+ * released.
+ */
+
+ if (err != -EOPNOTSUPP)
+ switchdev_trans_items_destroy(&trans);
+
+ return err;
+ }
+
+ /* Phase II: commit attr set. This cannot fail as a fault
+ * of driver/device. If it does, it's a bug in the driver/device
+ * because the driver said everything was OK in phase I.
+ */
+
+ trans.ph_prepare = false;
+ err = __switchdev_port_attr_set(dev, attr, &trans);
+ WARN(err, "%s: Commit of attribute (id=%d) failed.\n",
+ dev->name, attr->id);
+ switchdev_trans_items_warn_destroy(dev, &trans);
+
+ return err;
+}
+
+static void switchdev_port_attr_set_deferred(struct net_device *dev,
+ const void *data)
+{
+ const struct switchdev_attr *attr = data;
+ int err;
+
+ err = switchdev_port_attr_set_now(dev, attr);
+ if (err && err != -EOPNOTSUPP)
+ netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n",
+ err, attr->id);
+}
+
+static int switchdev_port_attr_set_defer(struct net_device *dev,
+ const struct switchdev_attr *attr)
+{
+ return switchdev_deferred_enqueue(dev, attr, sizeof(*attr),
+ switchdev_port_attr_set_deferred);
+}
+
+/**
+ * switchdev_port_attr_set - Set port attribute
+ *
* @dev: port device
- * @psid: switch ID
+ * @attr: attribute to set
+ *
+ * Use a 2-phase prepare-commit transaction model to ensure
+ * system is not left in a partially updated state due to
+ * failure from driver/device.
*
- * Get ID of a switch this port is part of.
+ * rtnl_lock must be held and must not be in atomic section,
+ * in case SWITCHDEV_F_DEFER flag is not set.
*/
-int netdev_switch_parent_id_get(struct net_device *dev,
- struct netdev_phys_item_id *psid)
+int switchdev_port_attr_set(struct net_device *dev,
+ const struct switchdev_attr *attr)
+{
+ if (attr->flags & SWITCHDEV_F_DEFER)
+ return switchdev_port_attr_set_defer(dev, attr);
+ ASSERT_RTNL();
+ return switchdev_port_attr_set_now(dev, attr);
+}
+EXPORT_SYMBOL_GPL(switchdev_port_attr_set);
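To show the prepare/commit contract from the driver side, here is a hedged sketch of what a .switchdev_port_attr_set callback handling bridge port flags could look like; the my_hw_*() helpers are hypothetical placeholders for device-specific code and are not part of this patch:
static int my_port_attr_set(struct net_device *dev,
			    const struct switchdev_attr *attr,
			    struct switchdev_trans *trans)
{
	switch (attr->id) {
	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
		if (switchdev_trans_ph_prepare(trans))
			/* phase I: validate/reserve only, no HW writes */
			return my_hw_check_brport_flags(dev,
							attr->u.brport_flags);
		/* phase II: commit; must not fail after a successful prepare */
		my_hw_write_brport_flags(dev, attr->u.brport_flags);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}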
+
+static size_t switchdev_obj_size(const struct switchdev_obj *obj)
+{
+ switch (obj->id) {
+ case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ return sizeof(struct switchdev_obj_port_vlan);
+ case SWITCHDEV_OBJ_ID_IPV4_FIB:
+ return sizeof(struct switchdev_obj_ipv4_fib);
+ case SWITCHDEV_OBJ_ID_PORT_FDB:
+ return sizeof(struct switchdev_obj_port_fdb);
+ default:
+ BUG();
+ }
+ return 0;
+}
+
+static int __switchdev_port_obj_add(struct net_device *dev,
+ const struct switchdev_obj *obj,
+ struct switchdev_trans *trans)
+{
+ const struct switchdev_ops *ops = dev->switchdev_ops;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int err = -EOPNOTSUPP;
+
+ if (ops && ops->switchdev_port_obj_add)
+ return ops->switchdev_port_obj_add(dev, obj, trans);
+
+ /* Switch device port(s) may be stacked under
+ * bond/team/vlan dev, so recurse down to add object on
+ * each port.
+ */
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = __switchdev_port_obj_add(lower_dev, obj, trans);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+static int switchdev_port_obj_add_now(struct net_device *dev,
+ const struct switchdev_obj *obj)
{
- const struct swdev_ops *ops = dev->swdev_ops;
+ struct switchdev_trans trans;
+ int err;
+
+ ASSERT_RTNL();
+
+ switchdev_trans_init(&trans);
- if (!ops || !ops->swdev_parent_id_get)
- return -EOPNOTSUPP;
- return ops->swdev_parent_id_get(dev, psid);
+ /* Phase I: prepare for obj add. Driver/device should fail
+ * here if there are going to be issues in the commit phase,
+ * such as lack of resources or support. The driver/device
+ * should reserve resources needed for the commit phase here,
+ * but should not commit the obj.
+ */
+
+ trans.ph_prepare = true;
+ err = __switchdev_port_obj_add(dev, obj, &trans);
+ if (err) {
+ /* Prepare phase failed: abort the transaction. Any
+ * resources reserved in the prepare phase are
+ * released.
+ */
+
+ if (err != -EOPNOTSUPP)
+ switchdev_trans_items_destroy(&trans);
+
+ return err;
+ }
+
+ /* Phase II: commit obj add. This cannot fail as a fault
+ * of driver/device. If it does, it's a bug in the driver/device
+ * because the driver said everything was OK in phase I.
+ */
+
+ trans.ph_prepare = false;
+ err = __switchdev_port_obj_add(dev, obj, &trans);
+ WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id);
+ switchdev_trans_items_warn_destroy(dev, &trans);
+
+ return err;
+}
+
+static void switchdev_port_obj_add_deferred(struct net_device *dev,
+ const void *data)
+{
+ const struct switchdev_obj *obj = data;
+ int err;
+
+ err = switchdev_port_obj_add_now(dev, obj);
+ if (err && err != -EOPNOTSUPP)
+ netdev_err(dev, "failed (err=%d) to add object (id=%d)\n",
+ err, obj->id);
+}
+
+static int switchdev_port_obj_add_defer(struct net_device *dev,
+ const struct switchdev_obj *obj)
+{
+ return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj),
+ switchdev_port_obj_add_deferred);
}
-EXPORT_SYMBOL_GPL(netdev_switch_parent_id_get);
/**
- * netdev_switch_port_stp_update - Notify switch device port of STP
- * state change
+ * switchdev_port_obj_add - Add port object
+ *
* @dev: port device
- * @state: port STP state
+ * @obj: object to add
+ *
+ * Use a 2-phase prepare-commit transaction model to ensure
+ * system is not left in a partially updated state due to
+ * failure from driver/device.
*
- * Notify switch device port of bridge port STP state change.
+ * rtnl_lock must be held and must not be in atomic section,
+ * in case SWITCHDEV_F_DEFER flag is not set.
*/
-int netdev_switch_port_stp_update(struct net_device *dev, u8 state)
+int switchdev_port_obj_add(struct net_device *dev,
+ const struct switchdev_obj *obj)
{
- const struct swdev_ops *ops = dev->swdev_ops;
+ if (obj->flags & SWITCHDEV_F_DEFER)
+ return switchdev_port_obj_add_defer(dev, obj);
+ ASSERT_RTNL();
+ return switchdev_port_obj_add_now(dev, obj);
+}
+EXPORT_SYMBOL_GPL(switchdev_port_obj_add);
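As an illustration of the object API only (arbitrary values, rtnl held, SWITCHDEV_F_DEFER not set), adding a VLAN range to a port could look roughly like:
	struct switchdev_obj_port_vlan vlan = {
		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
		.flags = BRIDGE_VLAN_INFO_UNTAGGED,
		.vid_begin = 100,
		.vid_end = 110,
	};
	int err;

	err = switchdev_port_obj_add(dev, &vlan.obj);
	if (err && err != -EOPNOTSUPP)
		netdev_err(dev, "failed to add VLANs: %d\n", err);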
+
+static int switchdev_port_obj_del_now(struct net_device *dev,
+ const struct switchdev_obj *obj)
+{
+ const struct switchdev_ops *ops = dev->switchdev_ops;
struct net_device *lower_dev;
struct list_head *iter;
int err = -EOPNOTSUPP;
- if (ops && ops->swdev_port_stp_update)
- return ops->swdev_port_stp_update(dev, state);
+ if (ops && ops->switchdev_port_obj_del)
+ return ops->switchdev_port_obj_del(dev, obj);
+
+ /* Switch device port(s) may be stacked under
+ * bond/team/vlan dev, so recurse down to delete object on
+ * each port.
+ */
netdev_for_each_lower_dev(dev, lower_dev, iter) {
- err = netdev_switch_port_stp_update(lower_dev, state);
- if (err && err != -EOPNOTSUPP)
- return err;
+ err = switchdev_port_obj_del_now(lower_dev, obj);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+static void switchdev_port_obj_del_deferred(struct net_device *dev,
+ const void *data)
+{
+ const struct switchdev_obj *obj = data;
+ int err;
+
+ err = switchdev_port_obj_del_now(dev, obj);
+ if (err && err != -EOPNOTSUPP)
+ netdev_err(dev, "failed (err=%d) to del object (id=%d)\n",
+ err, obj->id);
+}
+
+static int switchdev_port_obj_del_defer(struct net_device *dev,
+ const struct switchdev_obj *obj)
+{
+ return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj),
+ switchdev_port_obj_del_deferred);
+}
+
+/**
+ * switchdev_port_obj_del - Delete port object
+ *
+ * @dev: port device
+ * @obj: object to delete
+ *
+ * rtnl_lock must be held and must not be in atomic section,
+ * in case SWITCHDEV_F_DEFER flag is not set.
+ */
+int switchdev_port_obj_del(struct net_device *dev,
+ const struct switchdev_obj *obj)
+{
+ if (obj->flags & SWITCHDEV_F_DEFER)
+ return switchdev_port_obj_del_defer(dev, obj);
+ ASSERT_RTNL();
+ return switchdev_port_obj_del_now(dev, obj);
+}
+EXPORT_SYMBOL_GPL(switchdev_port_obj_del);
+
+/**
+ * switchdev_port_obj_dump - Dump port objects
+ *
+ * @dev: port device
+ * @obj: object to dump
+ * @cb: function to call with a filled object
+ *
+ * rtnl_lock must be held.
+ */
+int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj,
+ switchdev_obj_dump_cb_t *cb)
+{
+ const struct switchdev_ops *ops = dev->switchdev_ops;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int err = -EOPNOTSUPP;
+
+ ASSERT_RTNL();
+
+ if (ops && ops->switchdev_port_obj_dump)
+ return ops->switchdev_port_obj_dump(dev, obj, cb);
+
+ /* Switch device port(s) may be stacked under
+ * bond/team/vlan dev, so recurse down to dump objects on
+ * first port at bottom of stack.
+ */
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = switchdev_port_obj_dump(lower_dev, obj, cb);
+ break;
}
return err;
}
-EXPORT_SYMBOL_GPL(netdev_switch_port_stp_update);
+EXPORT_SYMBOL_GPL(switchdev_port_obj_dump);
-static DEFINE_MUTEX(netdev_switch_mutex);
-static RAW_NOTIFIER_HEAD(netdev_switch_notif_chain);
+static RAW_NOTIFIER_HEAD(switchdev_notif_chain);
/**
- * register_netdev_switch_notifier - Register notifier
+ * register_switchdev_notifier - Register notifier
* @nb: notifier_block
*
* Register switch device notifier. This should be used by code
* which needs to monitor events happening in particular device.
* Return values are same as for atomic_notifier_chain_register().
*/
-int register_netdev_switch_notifier(struct notifier_block *nb)
+int register_switchdev_notifier(struct notifier_block *nb)
{
int err;
- mutex_lock(&netdev_switch_mutex);
- err = raw_notifier_chain_register(&netdev_switch_notif_chain, nb);
- mutex_unlock(&netdev_switch_mutex);
+ rtnl_lock();
+ err = raw_notifier_chain_register(&switchdev_notif_chain, nb);
+ rtnl_unlock();
return err;
}
-EXPORT_SYMBOL_GPL(register_netdev_switch_notifier);
+EXPORT_SYMBOL_GPL(register_switchdev_notifier);
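A hedged sketch of a consumer of this notifier chain; the handler below only logs the event, and the my_* names are illustrative rather than taken from any real driver:
static int my_switchdev_event(struct notifier_block *nb,
			      unsigned long event, void *ptr)
{
	struct switchdev_notifier_info *info = ptr;

	netdev_dbg(info->dev, "switchdev event %lu\n", event);
	return NOTIFY_DONE;
}

static struct notifier_block my_switchdev_nb = {
	.notifier_call = my_switchdev_event,
};

static int __init my_module_init(void)
{
	/* rtnl must not already be held; it is taken internally */
	return register_switchdev_notifier(&my_switchdev_nb);
}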
/**
- * unregister_netdev_switch_notifier - Unregister notifier
+ * unregister_switchdev_notifier - Unregister notifier
* @nb: notifier_block
*
* Unregister switch device notifier.
* Return values are same as for atomic_notifier_chain_unregister().
*/
-int unregister_netdev_switch_notifier(struct notifier_block *nb)
+int unregister_switchdev_notifier(struct notifier_block *nb)
{
int err;
- mutex_lock(&netdev_switch_mutex);
- err = raw_notifier_chain_unregister(&netdev_switch_notif_chain, nb);
- mutex_unlock(&netdev_switch_mutex);
+ rtnl_lock();
+ err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb);
+ rtnl_unlock();
return err;
}
-EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier);
+EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
/**
- * call_netdev_switch_notifiers - Call notifiers
+ * call_switchdev_notifiers - Call notifiers
* @val: value passed unmodified to notifier function
* @dev: port device
* @info: notifier information data
@@ -113,147 +614,498 @@ EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier);
* Call all network notifier blocks. This should be called by driver
* when it needs to propagate hardware event.
* Return values are same as for atomic_notifier_call_chain().
+ * rtnl_lock must be held.
*/
-int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev,
- struct netdev_switch_notifier_info *info)
+int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
+ struct switchdev_notifier_info *info)
{
int err;
+ ASSERT_RTNL();
+
info->dev = dev;
- mutex_lock(&netdev_switch_mutex);
- err = raw_notifier_call_chain(&netdev_switch_notif_chain, val, info);
- mutex_unlock(&netdev_switch_mutex);
+ err = raw_notifier_call_chain(&switchdev_notif_chain, val, info);
+ return err;
+}
+EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
+
+struct switchdev_vlan_dump {
+ struct switchdev_obj_port_vlan vlan;
+ struct sk_buff *skb;
+ u32 filter_mask;
+ u16 flags;
+ u16 begin;
+ u16 end;
+};
+
+static int switchdev_port_vlan_dump_put(struct switchdev_vlan_dump *dump)
+{
+ struct bridge_vlan_info vinfo;
+
+ vinfo.flags = dump->flags;
+
+ if (dump->begin == 0 && dump->end == 0) {
+ return 0;
+ } else if (dump->begin == dump->end) {
+ vinfo.vid = dump->begin;
+ if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ return -EMSGSIZE;
+ } else {
+ vinfo.vid = dump->begin;
+ vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN;
+ if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ return -EMSGSIZE;
+ vinfo.vid = dump->end;
+ vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN;
+ vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_END;
+ if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+
+static int switchdev_port_vlan_dump_cb(struct switchdev_obj *obj)
+{
+ struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+ struct switchdev_vlan_dump *dump =
+ container_of(vlan, struct switchdev_vlan_dump, vlan);
+ int err = 0;
+
+ if (vlan->vid_begin > vlan->vid_end)
+ return -EINVAL;
+
+ if (dump->filter_mask & RTEXT_FILTER_BRVLAN) {
+ dump->flags = vlan->flags;
+ for (dump->begin = dump->end = vlan->vid_begin;
+ dump->begin <= vlan->vid_end;
+ dump->begin++, dump->end++) {
+ err = switchdev_port_vlan_dump_put(dump);
+ if (err)
+ return err;
+ }
+ } else if (dump->filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) {
+ if (dump->begin > vlan->vid_begin &&
+ dump->begin >= vlan->vid_end) {
+ if ((dump->begin - 1) == vlan->vid_end &&
+ dump->flags == vlan->flags) {
+ /* prepend */
+ dump->begin = vlan->vid_begin;
+ } else {
+ err = switchdev_port_vlan_dump_put(dump);
+ dump->flags = vlan->flags;
+ dump->begin = vlan->vid_begin;
+ dump->end = vlan->vid_end;
+ }
+ } else if (dump->end <= vlan->vid_begin &&
+ dump->end < vlan->vid_end) {
+ if ((dump->end + 1) == vlan->vid_begin &&
+ dump->flags == vlan->flags) {
+ /* append */
+ dump->end = vlan->vid_end;
+ } else {
+ err = switchdev_port_vlan_dump_put(dump);
+ dump->flags = vlan->flags;
+ dump->begin = vlan->vid_begin;
+ dump->end = vlan->vid_end;
+ }
+ } else {
+ err = -EINVAL;
+ }
+ }
+
return err;
}
-EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers);
+
+static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev,
+ u32 filter_mask)
+{
+ struct switchdev_vlan_dump dump = {
+ .vlan.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .skb = skb,
+ .filter_mask = filter_mask,
+ };
+ int err = 0;
+
+ if ((filter_mask & RTEXT_FILTER_BRVLAN) ||
+ (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) {
+ err = switchdev_port_obj_dump(dev, &dump.vlan.obj,
+ switchdev_port_vlan_dump_cb);
+ if (err)
+ goto err_out;
+ if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)
+ /* last one */
+ err = switchdev_port_vlan_dump_put(&dump);
+ }
+
+err_out:
+ return err == -EOPNOTSUPP ? 0 : err;
+}
/**
- * netdev_switch_port_bridge_setlink - Notify switch device port of bridge
- * port attributes
+ * switchdev_port_bridge_getlink - Get bridge port attributes
*
* @dev: port device
- * @nlh: netlink msg with bridge port attributes
- * @flags: bridge setlink flags
*
- * Notify switch device port of bridge port attributes
+ * Called for SELF on rtnl_bridge_getlink to get bridge port
+ * attributes.
*/
-int netdev_switch_port_bridge_setlink(struct net_device *dev,
- struct nlmsghdr *nlh, u16 flags)
+int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u32 filter_mask,
+ int nlflags)
{
- const struct net_device_ops *ops = dev->netdev_ops;
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
+ };
+ u16 mode = BRIDGE_MODE_UNDEF;
+ u32 mask = BR_LEARNING | BR_LEARNING_SYNC | BR_FLOOD;
+ int err;
- if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
- return 0;
+ err = switchdev_port_attr_get(dev, &attr);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode,
+ attr.u.brport_flags, mask, nlflags,
+ filter_mask, switchdev_port_vlan_fill);
+}
+EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink);
+
+static int switchdev_port_br_setflag(struct net_device *dev,
+ struct nlattr *nlattr,
+ unsigned long brport_flag)
+{
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
+ };
+ u8 flag = nla_get_u8(nlattr);
+ int err;
- if (!ops->ndo_bridge_setlink)
- return -EOPNOTSUPP;
+ err = switchdev_port_attr_get(dev, &attr);
+ if (err)
+ return err;
- return ops->ndo_bridge_setlink(dev, nlh, flags);
+ if (flag)
+ attr.u.brport_flags |= brport_flag;
+ else
+ attr.u.brport_flags &= ~brport_flag;
+
+ return switchdev_port_attr_set(dev, &attr);
+}
+
+static const struct nla_policy
+switchdev_port_bridge_policy[IFLA_BRPORT_MAX + 1] = {
+ [IFLA_BRPORT_STATE] = { .type = NLA_U8 },
+ [IFLA_BRPORT_COST] = { .type = NLA_U32 },
+ [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 },
+ [IFLA_BRPORT_MODE] = { .type = NLA_U8 },
+ [IFLA_BRPORT_GUARD] = { .type = NLA_U8 },
+ [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 },
+ [IFLA_BRPORT_FAST_LEAVE] = { .type = NLA_U8 },
+ [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 },
+ [IFLA_BRPORT_LEARNING_SYNC] = { .type = NLA_U8 },
+ [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 },
+};
+
+static int switchdev_port_br_setlink_protinfo(struct net_device *dev,
+ struct nlattr *protinfo)
+{
+ struct nlattr *attr;
+ int rem;
+ int err;
+
+ err = nla_validate_nested(protinfo, IFLA_BRPORT_MAX,
+ switchdev_port_bridge_policy);
+ if (err)
+ return err;
+
+ nla_for_each_nested(attr, protinfo, rem) {
+ switch (nla_type(attr)) {
+ case IFLA_BRPORT_LEARNING:
+ err = switchdev_port_br_setflag(dev, attr,
+ BR_LEARNING);
+ break;
+ case IFLA_BRPORT_LEARNING_SYNC:
+ err = switchdev_port_br_setflag(dev, attr,
+ BR_LEARNING_SYNC);
+ break;
+ case IFLA_BRPORT_UNICAST_FLOOD:
+ err = switchdev_port_br_setflag(dev, attr, BR_FLOOD);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int switchdev_port_br_afspec(struct net_device *dev,
+ struct nlattr *afspec,
+ int (*f)(struct net_device *dev,
+ const struct switchdev_obj *obj))
+{
+ struct nlattr *attr;
+ struct bridge_vlan_info *vinfo;
+ struct switchdev_obj_port_vlan vlan = {
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ };
+ int rem;
+ int err;
+
+ nla_for_each_nested(attr, afspec, rem) {
+ if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO)
+ continue;
+ if (nla_len(attr) != sizeof(struct bridge_vlan_info))
+ return -EINVAL;
+ vinfo = nla_data(attr);
+ if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK)
+ return -EINVAL;
+ vlan.flags = vinfo->flags;
+ if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ if (vlan.vid_begin)
+ return -EINVAL;
+ vlan.vid_begin = vinfo->vid;
+ /* don't allow range of pvids */
+ if (vlan.flags & BRIDGE_VLAN_INFO_PVID)
+ return -EINVAL;
+ } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) {
+ if (!vlan.vid_begin)
+ return -EINVAL;
+ vlan.vid_end = vinfo->vid;
+ if (vlan.vid_end <= vlan.vid_begin)
+ return -EINVAL;
+ err = f(dev, &vlan.obj);
+ if (err)
+ return err;
+ vlan.vid_begin = 0;
+ } else {
+ if (vlan.vid_begin)
+ return -EINVAL;
+ vlan.vid_begin = vinfo->vid;
+ vlan.vid_end = vinfo->vid;
+ err = f(dev, &vlan.obj);
+ if (err)
+ return err;
+ vlan.vid_begin = 0;
+ }
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink);
/**
- * netdev_switch_port_bridge_dellink - Notify switch device port of bridge
- * port attribute delete
+ * switchdev_port_bridge_setlink - Set bridge port attributes
*
* @dev: port device
- * @nlh: netlink msg with bridge port attributes
- * @flags: bridge setlink flags
+ * @nlh: netlink header
+ * @flags: netlink flags
*
- * Notify switch device port of bridge port attribute delete
+ * Called for SELF on rtnl_bridge_setlink to set bridge port
+ * attributes.
*/
-int netdev_switch_port_bridge_dellink(struct net_device *dev,
- struct nlmsghdr *nlh, u16 flags)
+int switchdev_port_bridge_setlink(struct net_device *dev,
+ struct nlmsghdr *nlh, u16 flags)
{
- const struct net_device_ops *ops = dev->netdev_ops;
+ struct nlattr *protinfo;
+ struct nlattr *afspec;
+ int err = 0;
- if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
- return 0;
+ protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
+ IFLA_PROTINFO);
+ if (protinfo) {
+ err = switchdev_port_br_setlink_protinfo(dev, protinfo);
+ if (err)
+ return err;
+ }
- if (!ops->ndo_bridge_dellink)
- return -EOPNOTSUPP;
+ afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
+ IFLA_AF_SPEC);
+ if (afspec)
+ err = switchdev_port_br_afspec(dev, afspec,
+ switchdev_port_obj_add);
- return ops->ndo_bridge_dellink(dev, nlh, flags);
+ return err;
}
-EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink);
+EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink);
/**
- * ndo_dflt_netdev_switch_port_bridge_setlink - default ndo bridge setlink
- * op for master devices
+ * switchdev_port_bridge_dellink - Delete bridge port attributes
*
* @dev: port device
- * @nlh: netlink msg with bridge port attributes
- * @flags: bridge setlink flags
+ * @nlh: netlink header
+ * @flags: netlink flags
*
- * Notify master device slaves of bridge port attributes
+ * Called for SELF on rtnl_bridge_dellink to delete bridge port
+ * attributes.
*/
-int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev,
- struct nlmsghdr *nlh, u16 flags)
+int switchdev_port_bridge_dellink(struct net_device *dev,
+ struct nlmsghdr *nlh, u16 flags)
{
- struct net_device *lower_dev;
- struct list_head *iter;
- int ret = 0, err = 0;
+ struct nlattr *afspec;
- if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
- return ret;
+ afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
+ IFLA_AF_SPEC);
+ if (afspec)
+ return switchdev_port_br_afspec(dev, afspec,
+ switchdev_port_obj_del);
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- err = netdev_switch_port_bridge_setlink(lower_dev, nlh, flags);
- if (err && err != -EOPNOTSUPP)
- ret = err;
- }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink);
+
+/**
+ * switchdev_port_fdb_add - Add FDB (MAC/VLAN) entry to port
+ *
+ * @ndmsg: netlink hdr
+ * @nlattr: netlink attributes
+ * @dev: port device
+ * @addr: MAC address to add
+ * @vid: VLAN to add
+ *
+ * Add FDB entry to switch device.
+ */
+int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev, const unsigned char *addr,
+ u16 vid, u16 nlm_flags)
+{
+ struct switchdev_obj_port_fdb fdb = {
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB,
+ .vid = vid,
+ };
- return ret;
+ ether_addr_copy(fdb.addr, addr);
+ return switchdev_port_obj_add(dev, &fdb.obj);
}
-EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink);
+EXPORT_SYMBOL_GPL(switchdev_port_fdb_add);
/**
- * ndo_dflt_netdev_switch_port_bridge_dellink - default ndo bridge dellink
- * op for master devices
+ * switchdev_port_fdb_del - Delete FDB (MAC/VLAN) entry from port
*
+ * @ndmsg: netlink hdr
+ * @nlattr: netlink attributes
* @dev: port device
- * @nlh: netlink msg with bridge port attributes
- * @flags: bridge dellink flags
+ * @addr: MAC address to delete
+ * @vid: VLAN to delete
*
- * Notify master device slaves of bridge port attribute deletes
+ * Delete FDB entry from switch device.
*/
-int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
- struct nlmsghdr *nlh, u16 flags)
+int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev, const unsigned char *addr,
+ u16 vid)
{
- struct net_device *lower_dev;
- struct list_head *iter;
- int ret = 0, err = 0;
+ struct switchdev_obj_port_fdb fdb = {
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB,
+ .vid = vid,
+ };
- if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
- return ret;
+ ether_addr_copy(fdb.addr, addr);
+ return switchdev_port_obj_del(dev, &fdb.obj);
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fdb_del);
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- err = netdev_switch_port_bridge_dellink(lower_dev, nlh, flags);
- if (err && err != -EOPNOTSUPP)
- ret = err;
- }
+struct switchdev_fdb_dump {
+ struct switchdev_obj_port_fdb fdb;
+ struct net_device *dev;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+ int idx;
+};
- return ret;
+static int switchdev_port_fdb_dump_cb(struct switchdev_obj *obj)
+{
+ struct switchdev_obj_port_fdb *fdb = SWITCHDEV_OBJ_PORT_FDB(obj);
+ struct switchdev_fdb_dump *dump =
+ container_of(fdb, struct switchdev_fdb_dump, fdb);
+ u32 portid = NETLINK_CB(dump->cb->skb).portid;
+ u32 seq = dump->cb->nlh->nlmsg_seq;
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ if (dump->idx < dump->cb->args[0])
+ goto skip;
+
+ nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
+ sizeof(*ndm), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = NTF_SELF;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dump->dev->ifindex;
+ ndm->ndm_state = fdb->ndm_state;
+
+ if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, fdb->addr))
+ goto nla_put_failure;
+
+ if (fdb->vid && nla_put_u16(dump->skb, NDA_VLAN, fdb->vid))
+ goto nla_put_failure;
+
+ nlmsg_end(dump->skb, nlh);
+
+skip:
+ dump->idx++;
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(dump->skb, nlh);
+ return -EMSGSIZE;
}
-EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_dellink);
-static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev)
+/**
+ * switchdev_port_fdb_dump - Dump port FDB (MAC/VLAN) entries
+ *
+ * @skb: netlink skb
+ * @cb: netlink callback
+ * @dev: port device
+ * @filter_dev: filter device
+ * @idx: index to start dumping from
+ *
+ * Dump FDB entries from switch device.
+ */
+int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev,
+ struct net_device *filter_dev, int idx)
+{
+ struct switchdev_fdb_dump dump = {
+ .fdb.obj.id = SWITCHDEV_OBJ_ID_PORT_FDB,
+ .dev = dev,
+ .skb = skb,
+ .cb = cb,
+ .idx = idx,
+ };
+
+ switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb);
+ return dump.idx;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
+
+static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
{
- const struct swdev_ops *ops = dev->swdev_ops;
+ const struct switchdev_ops *ops = dev->switchdev_ops;
struct net_device *lower_dev;
struct net_device *port_dev;
struct list_head *iter;
/* Recursively search down until we find a sw port dev.
- * (A sw port dev supports swdev_parent_id_get).
+ * (A sw port dev supports switchdev_port_attr_get).
*/
- if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD &&
- ops && ops->swdev_parent_id_get)
+ if (ops && ops->switchdev_port_attr_get)
return dev;
netdev_for_each_lower_dev(dev, lower_dev, iter) {
- port_dev = netdev_switch_get_lowest_dev(lower_dev);
+ port_dev = switchdev_get_lowest_dev(lower_dev);
if (port_dev)
return port_dev;
}
@@ -261,13 +1113,17 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev)
return NULL;
}
-static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi)
+static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
{
- struct netdev_phys_item_id psid;
- struct netdev_phys_item_id prev_psid;
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ };
+ struct switchdev_attr prev_attr;
struct net_device *dev = NULL;
int nhsel;
+ ASSERT_RTNL();
+
/* For this route, all nexthop devs must be on the same switch. */
for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
@@ -276,28 +1132,25 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi)
if (!nh->nh_dev)
return NULL;
- dev = netdev_switch_get_lowest_dev(nh->nh_dev);
+ dev = switchdev_get_lowest_dev(nh->nh_dev);
if (!dev)
return NULL;
- if (netdev_switch_parent_id_get(dev, &psid))
+ if (switchdev_port_attr_get(dev, &attr))
return NULL;
- if (nhsel > 0) {
- if (prev_psid.id_len != psid.id_len)
+ if (nhsel > 0 &&
+ !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
return NULL;
- if (memcmp(prev_psid.id, psid.id, psid.id_len))
- return NULL;
- }
- prev_psid = psid;
+ prev_attr = attr;
}
return dev;
}
/**
- * netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch
+ * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry
*
* @dst: route's IPv4 destination address
* @dst_len: destination address length (prefix length)
@@ -307,15 +1160,25 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi)
* @nlflags: netlink flags passed in (NLM_F_*)
* @tb_id: route table ID
*
- * Add IPv4 route entry to switch device.
+ * Add/modify switch IPv4 route entry.
*/
-int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 nlflags, u32 tb_id)
+int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 nlflags, u32 tb_id)
{
+ struct switchdev_obj_ipv4_fib ipv4_fib = {
+ .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
+ .dst = dst,
+ .dst_len = dst_len,
+ .tos = tos,
+ .type = type,
+ .nlflags = nlflags,
+ .tb_id = tb_id,
+ };
struct net_device *dev;
- const struct swdev_ops *ops;
int err = 0;
+ memcpy(&ipv4_fib.fi, fi, sizeof(ipv4_fib.fi));
+
/* Don't offload route if using custom ip rules or if
* IPv4 FIB offloading has been disabled completely.
*/
@@ -328,25 +1191,20 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
if (fi->fib_net->ipv4.fib_offload_disabled)
return 0;
- dev = netdev_switch_get_dev_by_nhs(fi);
+ dev = switchdev_get_dev_by_nhs(fi);
if (!dev)
return 0;
- ops = dev->swdev_ops;
-
- if (ops->swdev_fib_ipv4_add) {
- err = ops->swdev_fib_ipv4_add(dev, htonl(dst), dst_len,
- fi, tos, type, nlflags,
- tb_id);
- if (!err)
- fi->fib_flags |= RTNH_F_OFFLOAD;
- }
- return err;
+ err = switchdev_port_obj_add(dev, &ipv4_fib.obj);
+ if (!err)
+ fi->fib_flags |= RTNH_F_OFFLOAD;
+
+ return err == -EOPNOTSUPP ? 0 : err;
}
-EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add);
+EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add);
/**
- * netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch
+ * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch
*
* @dst: route's IPv4 destination address
* @dst_len: destination address length (prefix length)
@@ -357,38 +1215,44 @@ EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add);
*
* Delete IPv4 route entry from switch device.
*/
-int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id)
+int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id)
{
+ struct switchdev_obj_ipv4_fib ipv4_fib = {
+ .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
+ .dst = dst,
+ .dst_len = dst_len,
+ .tos = tos,
+ .type = type,
+ .nlflags = 0,
+ .tb_id = tb_id,
+ };
struct net_device *dev;
- const struct swdev_ops *ops;
int err = 0;
+ memcpy(&ipv4_fib.fi, fi, sizeof(ipv4_fib.fi));
+
if (!(fi->fib_flags & RTNH_F_OFFLOAD))
return 0;
- dev = netdev_switch_get_dev_by_nhs(fi);
+ dev = switchdev_get_dev_by_nhs(fi);
if (!dev)
return 0;
- ops = dev->swdev_ops;
- if (ops->swdev_fib_ipv4_del) {
- err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len,
- fi, tos, type, tb_id);
- if (!err)
- fi->fib_flags &= ~RTNH_F_OFFLOAD;
- }
+ err = switchdev_port_obj_del(dev, &ipv4_fib.obj);
+ if (!err)
+ fi->fib_flags &= ~RTNH_F_OFFLOAD;
- return err;
+ return err == -EOPNOTSUPP ? 0 : err;
}
-EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_del);
+EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del);
/**
- * netdev_switch_fib_ipv4_abort - Abort an IPv4 FIB operation
+ * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation
*
* @fi: route FIB info structure
*/
-void netdev_switch_fib_ipv4_abort(struct fib_info *fi)
+void switchdev_fib_ipv4_abort(struct fib_info *fi)
{
/* There was a problem installing this route to the offload
* device. For now, until we come up with more refined
@@ -401,4 +1265,108 @@ void netdev_switch_fib_ipv4_abort(struct fib_info *fi)
fib_flush_external(fi->fib_net);
fi->fib_net->ipv4.fib_offload_disabled = true;
}
-EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_abort);
+EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
+
+static bool switchdev_port_same_parent_id(struct net_device *a,
+ struct net_device *b)
+{
+ struct switchdev_attr a_attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+ struct switchdev_attr b_attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+
+ if (switchdev_port_attr_get(a, &a_attr) ||
+ switchdev_port_attr_get(b, &b_attr))
+ return false;
+
+ return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid);
+}
+
+static u32 switchdev_port_fwd_mark_get(struct net_device *dev,
+ struct net_device *group_dev)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+ if (lower_dev == dev)
+ continue;
+ if (switchdev_port_same_parent_id(dev, lower_dev))
+ return lower_dev->offload_fwd_mark;
+ return switchdev_port_fwd_mark_get(dev, lower_dev);
+ }
+
+ return dev->ifindex;
+}
+
+static void switchdev_port_fwd_mark_reset(struct net_device *group_dev,
+ u32 old_mark, u32 *reset_mark)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+ if (lower_dev->offload_fwd_mark == old_mark) {
+ if (!*reset_mark)
+ *reset_mark = lower_dev->ifindex;
+ lower_dev->offload_fwd_mark = *reset_mark;
+ }
+ switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark);
+ }
+}
+
+/**
+ * switchdev_port_fwd_mark_set - Set port offload forwarding mark
+ *
+ * @dev: port device
+ * @group_dev: containing device
+ * @joining: true if dev is joining group; false if leaving group
+ *
+ * An ungrouped port's offload mark is just its ifindex. A grouped
+ * port's (member of a bridge, for example) offload mark is the ifindex
+ * of one of the ports in the group with the same parent (switch) ID.
+ * Ports on the same device in the same group will have the same mark.
+ *
+ * Example:
+ *
+ * br0 ifindex=9
+ * sw1p1 ifindex=2 mark=2
+ * sw1p2 ifindex=3 mark=2
+ * sw2p1 ifindex=4 mark=5
+ * sw2p2 ifindex=5 mark=5
+ *
+ * If sw2p2 leaves the bridge, we'll have:
+ *
+ * br0 ifindex=9
+ * sw1p1 ifindex=2 mark=2
+ * sw1p2 ifindex=3 mark=2
+ * sw2p1 ifindex=4 mark=4
+ * sw2p2 ifindex=5 mark=5
+ */
+void switchdev_port_fwd_mark_set(struct net_device *dev,
+ struct net_device *group_dev,
+ bool joining)
+{
+ u32 mark = dev->ifindex;
+ u32 reset_mark = 0;
+
+ if (group_dev) {
+ ASSERT_RTNL();
+ if (joining)
+ mark = switchdev_port_fwd_mark_get(dev, group_dev);
+ else if (dev->offload_fwd_mark == mark)
+ /* Ohoh, this port was the mark reference port,
+ * but it's leaving the group, so reset the
+ * mark for the remaining ports in the group.
+ */
+ switchdev_port_fwd_mark_reset(group_dev, mark,
+ &reset_mark);
+ }
+
+ dev->offload_fwd_mark = mark;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set);
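For illustration, a port driver would typically invoke this from its NETDEV_CHANGEUPPER handling when the port joins or leaves a bridge; the handler below is a hypothetical sketch, not part of this patch:
static int my_port_changeupper(struct net_device *dev,
			       struct netdev_notifier_changeupper_info *info)
{
	if (netif_is_bridge_master(info->upper_dev))
		switchdev_port_fwd_mark_set(dev, info->upper_dev,
					    info->linking);
	return 0;
}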
diff --git a/kernel/net/sysctl_net.c b/kernel/net/sysctl_net.c
index e7000be32..ed98c1fc3 100644
--- a/kernel/net/sysctl_net.c
+++ b/kernel/net/sysctl_net.c
@@ -94,10 +94,14 @@ __init int net_sysctl_init(void)
goto out;
ret = register_pernet_subsys(&sysctl_pernet_ops);
if (ret)
- goto out;
+ goto out1;
register_sysctl_root(&net_sysctl_root);
out:
return ret;
+out1:
+ unregister_sysctl_table(net_header);
+ net_header = NULL;
+ goto out;
}
struct ctl_table_header *register_net_sysctl(struct net *net,
diff --git a/kernel/net/tipc/addr.c b/kernel/net/tipc/addr.c
index ba7daa864..48fd3b5a7 100644
--- a/kernel/net/tipc/addr.c
+++ b/kernel/net/tipc/addr.c
@@ -38,13 +38,6 @@
#include "addr.h"
#include "core.h"
-u32 tipc_own_addr(struct net *net)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
-
- return tn->own_addr;
-}
-
/**
* in_own_cluster - test for cluster inclusion; <0.0.0> always matches
*/
diff --git a/kernel/net/tipc/addr.h b/kernel/net/tipc/addr.h
index 7ba6d5c8a..93f7c983b 100644
--- a/kernel/net/tipc/addr.h
+++ b/kernel/net/tipc/addr.h
@@ -41,10 +41,18 @@
#include <linux/tipc.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include "core.h"
#define TIPC_ZONE_MASK 0xff000000u
#define TIPC_CLUSTER_MASK 0xfffff000u
+static inline u32 tipc_own_addr(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ return tn->own_addr;
+}
+
static inline u32 tipc_zone_mask(u32 addr)
{
return addr & TIPC_ZONE_MASK;
diff --git a/kernel/net/tipc/bcast.c b/kernel/net/tipc/bcast.c
index c5cbdcb1f..92e367a0a 100644
--- a/kernel/net/tipc/bcast.c
+++ b/kernel/net/tipc/bcast.c
@@ -35,690 +35,301 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
+#include <linux/tipc_config.h>
#include "socket.h"
#include "msg.h"
#include "bcast.h"
#include "name_distr.h"
-#include "core.h"
+#include "link.h"
+#include "node.h"
-#define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */
-#define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */
+#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */
+#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */
const char tipc_bclink_name[] = "broadcast-link";
-static void tipc_nmap_diff(struct tipc_node_map *nm_a,
- struct tipc_node_map *nm_b,
- struct tipc_node_map *nm_diff);
-static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node);
-static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node);
-
-static void tipc_bclink_lock(struct net *net)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
-
- spin_lock_bh(&tn->bclink->lock);
-}
-
-static void tipc_bclink_unlock(struct net *net)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
-
- spin_unlock_bh(&tn->bclink->lock);
-}
-
-void tipc_bclink_input(struct net *net)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
-
- tipc_sk_mcast_rcv(net, &tn->bclink->arrvq, &tn->bclink->inputq);
-}
-
-uint tipc_bclink_get_mtu(void)
-{
- return MAX_PKT_DEFAULT_MCAST;
-}
-
-static u32 bcbuf_acks(struct sk_buff *buf)
-{
- return (u32)(unsigned long)TIPC_SKB_CB(buf)->handle;
-}
-
-static void bcbuf_set_acks(struct sk_buff *buf, u32 acks)
-{
- TIPC_SKB_CB(buf)->handle = (void *)(unsigned long)acks;
-}
-
-static void bcbuf_decr_acks(struct sk_buff *buf)
-{
- bcbuf_set_acks(buf, bcbuf_acks(buf) - 1);
-}
+/**
+ * struct tipc_bc_base - base structure for keeping broadcast send state
+ * @link: broadcast send link structure
+ * @inputq: data input queue; will only carry SOCK_WAKEUP messages
+ * @dest: array keeping number of reachable destinations per bearer
+ * @primary_bearer: a bearer having links to all broadcast destinations, if any
+ */
+struct tipc_bc_base {
+ struct tipc_link *link;
+ struct sk_buff_head inputq;
+ int dests[MAX_BEARERS];
+ int primary_bearer;
+};
-void tipc_bclink_add_node(struct net *net, u32 addr)
+static struct tipc_bc_base *tipc_bc_base(struct net *net)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
-
- tipc_bclink_lock(net);
- tipc_nmap_add(&tn->bclink->bcast_nodes, addr);
- tipc_bclink_unlock(net);
+ return tipc_net(net)->bcbase;
}
-void tipc_bclink_remove_node(struct net *net, u32 addr)
+int tipc_bcast_get_mtu(struct net *net)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
-
- tipc_bclink_lock(net);
- tipc_nmap_remove(&tn->bclink->bcast_nodes, addr);
- tipc_bclink_unlock(net);
+ return tipc_link_mtu(tipc_bc_sndlink(net));
}
-static void bclink_set_last_sent(struct net *net)
+/* tipc_bcbase_select_primary(): find a bearer with links to all destinations,
+ * if any, and make it primary bearer
+ */
+static void tipc_bcbase_select_primary(struct net *net)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *bcl = tn->bcl;
- struct sk_buff *skb = skb_peek(&bcl->backlogq);
+ struct tipc_bc_base *bb = tipc_bc_base(net);
+ int all_dests = tipc_link_bc_peers(bb->link);
+ int i, mtu;
- if (skb)
- bcl->fsm_msg_cnt = mod(buf_seqno(skb) - 1);
- else
- bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1);
-}
-
-u32 tipc_bclink_get_last_sent(struct net *net)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
+ bb->primary_bearer = INVALID_BEARER_ID;
- return tn->bcl->fsm_msg_cnt;
-}
+ if (!all_dests)
+ return;
-static void bclink_update_last_sent(struct tipc_node *node, u32 seqno)
-{
- node->bclink.last_sent = less_eq(node->bclink.last_sent, seqno) ?
- seqno : node->bclink.last_sent;
-}
+ for (i = 0; i < MAX_BEARERS; i++) {
+ if (!bb->dests[i])
+ continue;
-/**
- * tipc_bclink_retransmit_to - get most recent node to request retransmission
- *
- * Called with bclink_lock locked
- */
-struct tipc_node *tipc_bclink_retransmit_to(struct net *net)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
+ mtu = tipc_bearer_mtu(net, i);
+ if (mtu < tipc_link_mtu(bb->link))
+ tipc_link_set_mtu(bb->link, mtu);
- return tn->bclink->retransmit_to;
-}
+ if (bb->dests[i] < all_dests)
+ continue;
-/**
- * bclink_retransmit_pkt - retransmit broadcast packets
- * @after: sequence number of last packet to *not* retransmit
- * @to: sequence number of last packet to retransmit
- *
- * Called with bclink_lock locked
- */
-static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to)
-{
- struct sk_buff *skb;
- struct tipc_link *bcl = tn->bcl;
+ bb->primary_bearer = i;
- skb_queue_walk(&bcl->transmq, skb) {
- if (more(buf_seqno(skb), after)) {
- tipc_link_retransmit(bcl, skb, mod(to - after));
+ /* Reduce risk that all nodes select same primary */
+ if ((i ^ tipc_own_addr(net)) & 1)
break;
- }
}
}
-/**
- * tipc_bclink_wakeup_users - wake up pending users
- *
- * Called with no locks taken
- */
-void tipc_bclink_wakeup_users(struct net *net)
+void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bc_base *bb = tipc_bc_base(net);
- tipc_sk_rcv(net, &tn->bclink->link.wakeupq);
+ tipc_bcast_lock(net);
+ bb->dests[bearer_id]++;
+ tipc_bcbase_select_primary(net);
+ tipc_bcast_unlock(net);
}
-/**
- * tipc_bclink_acknowledge - handle acknowledgement of broadcast packets
- * @n_ptr: node that sent acknowledgement info
- * @acked: broadcast sequence # that has been acknowledged
- *
- * Node is locked, bclink_lock unlocked.
- */
-void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked)
+void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id)
{
- struct sk_buff *skb, *tmp;
- unsigned int released = 0;
- struct net *net = n_ptr->net;
- struct tipc_net *tn = net_generic(net, tipc_net_id);
-
- if (unlikely(!n_ptr->bclink.recv_permitted))
- return;
-
- tipc_bclink_lock(net);
-
- /* Bail out if tx queue is empty (no clean up is required) */
- skb = skb_peek(&tn->bcl->transmq);
- if (!skb)
- goto exit;
-
- /* Determine which messages need to be acknowledged */
- if (acked == INVALID_LINK_SEQ) {
- /*
- * Contact with specified node has been lost, so need to
- * acknowledge sent messages only (if other nodes still exist)
- * or both sent and unsent messages (otherwise)
- */
- if (tn->bclink->bcast_nodes.count)
- acked = tn->bcl->fsm_msg_cnt;
- else
- acked = tn->bcl->next_out_no;
- } else {
- /*
- * Bail out if specified sequence number does not correspond
- * to a message that has been sent and not yet acknowledged
- */
- if (less(acked, buf_seqno(skb)) ||
- less(tn->bcl->fsm_msg_cnt, acked) ||
- less_eq(acked, n_ptr->bclink.acked))
- goto exit;
- }
-
- /* Skip over packets that node has previously acknowledged */
- skb_queue_walk(&tn->bcl->transmq, skb) {
- if (more(buf_seqno(skb), n_ptr->bclink.acked))
- break;
- }
+ struct tipc_bc_base *bb = tipc_bc_base(net);
- /* Update packets that node is now acknowledging */
- skb_queue_walk_from_safe(&tn->bcl->transmq, skb, tmp) {
- if (more(buf_seqno(skb), acked))
- break;
- bcbuf_decr_acks(skb);
- bclink_set_last_sent(net);
- if (bcbuf_acks(skb) == 0) {
- __skb_unlink(skb, &tn->bcl->transmq);
- kfree_skb(skb);
- released = 1;
- }
- }
- n_ptr->bclink.acked = acked;
-
- /* Try resolving broadcast link congestion, if necessary */
- if (unlikely(skb_peek(&tn->bcl->backlogq))) {
- tipc_link_push_packets(tn->bcl);
- bclink_set_last_sent(net);
- }
- if (unlikely(released && !skb_queue_empty(&tn->bcl->wakeupq)))
- n_ptr->action_flags |= TIPC_WAKEUP_BCAST_USERS;
-exit:
- tipc_bclink_unlock(net);
+ tipc_bcast_lock(net);
+ bb->dests[bearer_id]--;
+ tipc_bcbase_select_primary(net);
+ tipc_bcast_unlock(net);
}
-/**
- * tipc_bclink_update_link_state - update broadcast link state
+/* tipc_bcbase_xmit - broadcast a packet queue across one or more bearers
*
- * RCU and node lock set
+ * Note that number of reachable destinations, as indicated in the dests[]
+ * array, may transitionally differ from the number of destinations indicated
+ * in each sent buffer. We can sustain this. Excess destination nodes will
+ * drop and never acknowledge the unexpected packets, and missing destinations
+ * will either require retransmission (if they are just about to be added to
+ * the bearer), or be removed from the buffer's 'ackers' counter (if they
+ * just went down)
*/
-void tipc_bclink_update_link_state(struct tipc_node *n_ptr,
- u32 last_sent)
+static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
{
- struct sk_buff *buf;
- struct net *net = n_ptr->net;
- struct tipc_net *tn = net_generic(net, tipc_net_id);
+ int bearer_id;
+ struct tipc_bc_base *bb = tipc_bc_base(net);
+ struct sk_buff *skb, *_skb;
+ struct sk_buff_head _xmitq;
- /* Ignore "stale" link state info */
- if (less_eq(last_sent, n_ptr->bclink.last_in))
+ if (skb_queue_empty(xmitq))
return;
- /* Update link synchronization state; quit if in sync */
- bclink_update_last_sent(n_ptr, last_sent);
-
- if (n_ptr->bclink.last_sent == n_ptr->bclink.last_in)
+ /* The typical case: at least one bearer has links to all nodes */
+ bearer_id = bb->primary_bearer;
+ if (bearer_id >= 0) {
+ tipc_bearer_bc_xmit(net, bearer_id, xmitq);
return;
-
- /* Update out-of-sync state; quit if loss is still unconfirmed */
- if ((++n_ptr->bclink.oos_state) == 1) {
- if (n_ptr->bclink.deferred_size < (TIPC_MIN_LINK_WIN / 2))
- return;
- n_ptr->bclink.oos_state++;
}
- /* Don't NACK if one has been recently sent (or seen) */
- if (n_ptr->bclink.oos_state & 0x1)
- return;
+ /* We have to transmit across all bearers */
+ skb_queue_head_init(&_xmitq);
+ for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+ if (!bb->dests[bearer_id])
+ continue;
- /* Send NACK */
- buf = tipc_buf_acquire(INT_H_SIZE);
- if (buf) {
- struct tipc_msg *msg = buf_msg(buf);
- struct sk_buff *skb = skb_peek(&n_ptr->bclink.deferdq);
- u32 to = skb ? buf_seqno(skb) - 1 : n_ptr->bclink.last_sent;
-
- tipc_msg_init(tn->own_addr, msg, BCAST_PROTOCOL, STATE_MSG,
- INT_H_SIZE, n_ptr->addr);
- msg_set_non_seq(msg, 1);
- msg_set_mc_netid(msg, tn->net_id);
- msg_set_bcast_ack(msg, n_ptr->bclink.last_in);
- msg_set_bcgap_after(msg, n_ptr->bclink.last_in);
- msg_set_bcgap_to(msg, to);
-
- tipc_bclink_lock(net);
- tipc_bearer_send(net, MAX_BEARERS, buf, NULL);
- tn->bcl->stats.sent_nacks++;
- tipc_bclink_unlock(net);
- kfree_skb(buf);
-
- n_ptr->bclink.oos_state++;
+ skb_queue_walk(xmitq, skb) {
+ _skb = pskb_copy_for_clone(skb, GFP_ATOMIC);
+ if (!_skb)
+ break;
+ __skb_queue_tail(&_xmitq, _skb);
+ }
+ tipc_bearer_bc_xmit(net, bearer_id, &_xmitq);
}
+ __skb_queue_purge(xmitq);
+ __skb_queue_purge(&_xmitq);
}
-/**
- * bclink_peek_nack - monitor retransmission requests sent by other nodes
- *
- * Delay any upcoming NACK by this node if another node has already
- * requested the first message this node is going to ask for.
- */
-static void bclink_peek_nack(struct net *net, struct tipc_msg *msg)
-{
- struct tipc_node *n_ptr = tipc_node_find(net, msg_destnode(msg));
-
- if (unlikely(!n_ptr))
- return;
-
- tipc_node_lock(n_ptr);
- if (n_ptr->bclink.recv_permitted &&
- (n_ptr->bclink.last_in != n_ptr->bclink.last_sent) &&
- (n_ptr->bclink.last_in == msg_bcgap_after(msg)))
- n_ptr->bclink.oos_state = 2;
- tipc_node_unlock(n_ptr);
- tipc_node_put(n_ptr);
-}
-
-/* tipc_bclink_xmit - deliver buffer chain to all nodes in cluster
+/* tipc_bcast_xmit - deliver buffer chain to all nodes in cluster
* and to identified node local sockets
* @net: the applicable net namespace
* @list: chain of buffers containing message
* Consumes the buffer chain, except when returning -ELINKCONG
* Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
*/
-int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list)
+int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *bcl = tn->bcl;
- struct tipc_bclink *bclink = tn->bclink;
+ struct tipc_link *l = tipc_bc_sndlink(net);
+ struct sk_buff_head xmitq, inputq, rcvq;
int rc = 0;
- int bc = 0;
- struct sk_buff *skb;
- struct sk_buff_head arrvq;
- struct sk_buff_head inputq;
- /* Prepare clone of message for local node */
- skb = tipc_msg_reassemble(list);
- if (unlikely(!skb)) {
- __skb_queue_purge(list);
+ __skb_queue_head_init(&rcvq);
+ __skb_queue_head_init(&xmitq);
+ skb_queue_head_init(&inputq);
+
+ /* Prepare message clone for local node */
+ if (unlikely(!tipc_msg_reassemble(list, &rcvq)))
return -EHOSTUNREACH;
- }
- /* Broadcast to all nodes */
- if (likely(bclink)) {
- tipc_bclink_lock(net);
- if (likely(bclink->bcast_nodes.count)) {
- rc = __tipc_link_xmit(net, bcl, list);
- if (likely(!rc)) {
- u32 len = skb_queue_len(&bcl->transmq);
-
- bclink_set_last_sent(net);
- bcl->stats.queue_sz_counts++;
- bcl->stats.accu_queue_sz += len;
- }
- bc = 1;
- }
- tipc_bclink_unlock(net);
- }
- if (unlikely(!bc))
- __skb_queue_purge(list);
+ tipc_bcast_lock(net);
+ if (tipc_link_bc_peers(l))
+ rc = tipc_link_xmit(l, list, &xmitq);
+ tipc_bcast_unlock(net);
+ /* Don't send to local node if adding to link failed */
if (unlikely(rc)) {
- kfree_skb(skb);
+ __skb_queue_purge(&rcvq);
return rc;
}
- /* Deliver message clone */
- __skb_queue_head_init(&arrvq);
- skb_queue_head_init(&inputq);
- __skb_queue_tail(&arrvq, skb);
- tipc_sk_mcast_rcv(net, &arrvq, &inputq);
- return rc;
-}
-/**
- * bclink_accept_pkt - accept an incoming, in-sequence broadcast packet
- *
- * Called with both sending node's lock and bclink_lock taken.
- */
-static void bclink_accept_pkt(struct tipc_node *node, u32 seqno)
-{
- struct tipc_net *tn = net_generic(node->net, tipc_net_id);
-
- bclink_update_last_sent(node, seqno);
- node->bclink.last_in = seqno;
- node->bclink.oos_state = 0;
- tn->bcl->stats.recv_info++;
-
- /*
- * Unicast an ACK periodically, ensuring that
- * all nodes in the cluster don't ACK at the same time
- */
- if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) {
- tipc_link_proto_xmit(node->active_links[node->addr & 1],
- STATE_MSG, 0, 0, 0, 0);
- tn->bcl->stats.sent_acks++;
- }
+ /* Broadcast to all nodes, including local node */
+ tipc_bcbase_xmit(net, &xmitq);
+ tipc_sk_mcast_rcv(net, &rcvq, &inputq);
+ __skb_queue_purge(list);
+ return 0;
}
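
The ordering in tipc_bcast_xmit() above matters: a clone of the chain is reassembled for local delivery first, the original chain is offered to the broadcast send link only if it has peers, and the clone is dropped before local delivery if the link rejected the chain. A simplified userspace model of that ordering follows; the names and plain malloc'd buffers are illustrative, not the kernel sk_buff API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct chain { char data[32]; };

static struct chain *clone_chain(const struct chain *c)
{
	struct chain *cp = malloc(sizeof(*cp));

	if (cp)
		memcpy(cp, c, sizeof(*cp));
	return cp;
}

/* Stand-in for tipc_link_xmit() on the broadcast send link: nothing is
 * queued when there are no peers, and 0 means the chain was accepted.
 */
static int link_xmit(struct chain *c, int peers)
{
	if (!peers)
		return 0;
	printf("broadcast link: queued \"%s\"\n", c->data);
	return 0;
}

static int bcast_xmit(struct chain *c, int peers)
{
	struct chain *local = clone_chain(c);  /* local copy prepared first */
	int rc;

	if (!local)
		return -1;                     /* -EHOSTUNREACH in the kernel code */
	rc = link_xmit(c, peers);
	if (rc) {
		free(local);                   /* no local delivery on failure */
		return rc;
	}
	printf("local delivery: \"%s\"\n", local->data);
	free(local);
	return 0;
}

int main(void)
{
	struct chain c = { "multicast message" };

	return bcast_xmit(&c, 2);
}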
-/**
- * tipc_bclink_rcv - receive a broadcast packet, and deliver upwards
+/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link
*
* RCU is locked, no other locks set
*/
-void tipc_bclink_rcv(struct net *net, struct sk_buff *buf)
+int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *bcl = tn->bcl;
- struct tipc_msg *msg = buf_msg(buf);
- struct tipc_node *node;
- u32 next_in;
- u32 seqno;
- int deferred = 0;
- int pos = 0;
- struct sk_buff *iskb;
- struct sk_buff_head *arrvq, *inputq;
-
- /* Screen out unwanted broadcast messages */
- if (msg_mc_netid(msg) != tn->net_id)
- goto exit;
-
- node = tipc_node_find(net, msg_prevnode(msg));
- if (unlikely(!node))
- goto exit;
-
- tipc_node_lock(node);
- if (unlikely(!node->bclink.recv_permitted))
- goto unlock;
-
- /* Handle broadcast protocol message */
- if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) {
- if (msg_type(msg) != STATE_MSG)
- goto unlock;
- if (msg_destnode(msg) == tn->own_addr) {
- tipc_bclink_acknowledge(node, msg_bcast_ack(msg));
- tipc_bclink_lock(net);
- bcl->stats.recv_nacks++;
- tn->bclink->retransmit_to = node;
- bclink_retransmit_pkt(tn, msg_bcgap_after(msg),
- msg_bcgap_to(msg));
- tipc_bclink_unlock(net);
- tipc_node_unlock(node);
- } else {
- tipc_node_unlock(node);
- bclink_peek_nack(net, msg);
- }
- tipc_node_put(node);
- goto exit;
- }
-
- /* Handle in-sequence broadcast message */
- seqno = msg_seqno(msg);
- next_in = mod(node->bclink.last_in + 1);
- arrvq = &tn->bclink->arrvq;
- inputq = &tn->bclink->inputq;
-
- if (likely(seqno == next_in)) {
-receive:
- /* Deliver message to destination */
- if (likely(msg_isdata(msg))) {
- tipc_bclink_lock(net);
- bclink_accept_pkt(node, seqno);
- spin_lock_bh(&inputq->lock);
- __skb_queue_tail(arrvq, buf);
- spin_unlock_bh(&inputq->lock);
- node->action_flags |= TIPC_BCAST_MSG_EVT;
- tipc_bclink_unlock(net);
- tipc_node_unlock(node);
- } else if (msg_user(msg) == MSG_BUNDLER) {
- tipc_bclink_lock(net);
- bclink_accept_pkt(node, seqno);
- bcl->stats.recv_bundles++;
- bcl->stats.recv_bundled += msg_msgcnt(msg);
- pos = 0;
- while (tipc_msg_extract(buf, &iskb, &pos)) {
- spin_lock_bh(&inputq->lock);
- __skb_queue_tail(arrvq, iskb);
- spin_unlock_bh(&inputq->lock);
- }
- node->action_flags |= TIPC_BCAST_MSG_EVT;
- tipc_bclink_unlock(net);
- tipc_node_unlock(node);
- } else if (msg_user(msg) == MSG_FRAGMENTER) {
- tipc_bclink_lock(net);
- bclink_accept_pkt(node, seqno);
- tipc_buf_append(&node->bclink.reasm_buf, &buf);
- if (unlikely(!buf && !node->bclink.reasm_buf)) {
- tipc_bclink_unlock(net);
- goto unlock;
- }
- bcl->stats.recv_fragments++;
- if (buf) {
- bcl->stats.recv_fragmented++;
- msg = buf_msg(buf);
- tipc_bclink_unlock(net);
- goto receive;
- }
- tipc_bclink_unlock(net);
- tipc_node_unlock(node);
- } else {
- tipc_bclink_lock(net);
- bclink_accept_pkt(node, seqno);
- tipc_bclink_unlock(net);
- tipc_node_unlock(node);
- kfree_skb(buf);
- }
- buf = NULL;
-
- /* Determine new synchronization state */
- tipc_node_lock(node);
- if (unlikely(!tipc_node_is_up(node)))
- goto unlock;
-
- if (node->bclink.last_in == node->bclink.last_sent)
- goto unlock;
+ struct tipc_msg *hdr = buf_msg(skb);
+ struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq;
+ struct sk_buff_head xmitq;
+ int rc;
- if (skb_queue_empty(&node->bclink.deferdq)) {
- node->bclink.oos_state = 1;
- goto unlock;
- }
-
- msg = buf_msg(skb_peek(&node->bclink.deferdq));
- seqno = msg_seqno(msg);
- next_in = mod(next_in + 1);
- if (seqno != next_in)
- goto unlock;
-
- /* Take in-sequence message from deferred queue & deliver it */
- buf = __skb_dequeue(&node->bclink.deferdq);
- goto receive;
- }
+ __skb_queue_head_init(&xmitq);
- /* Handle out-of-sequence broadcast message */
- if (less(next_in, seqno)) {
- deferred = tipc_link_defer_pkt(&node->bclink.deferdq,
- buf);
- bclink_update_last_sent(node, seqno);
- buf = NULL;
+ if (msg_mc_netid(hdr) != tipc_netid(net) || !tipc_link_is_up(l)) {
+ kfree_skb(skb);
+ return 0;
}
- tipc_bclink_lock(net);
-
- if (deferred)
- bcl->stats.deferred_recv++;
+ tipc_bcast_lock(net);
+ if (msg_user(hdr) == BCAST_PROTOCOL)
+ rc = tipc_link_bc_nack_rcv(l, skb, &xmitq);
else
- bcl->stats.duplicates++;
+ rc = tipc_link_rcv(l, skb, NULL);
+ tipc_bcast_unlock(net);
- tipc_bclink_unlock(net);
+ tipc_bcbase_xmit(net, &xmitq);
-unlock:
- tipc_node_unlock(node);
- tipc_node_put(node);
-exit:
- kfree_skb(buf);
-}
+ /* Any socket wakeup messages ? */
+ if (!skb_queue_empty(inputq))
+ tipc_sk_rcv(net, inputq);
-u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr)
-{
- return (n_ptr->bclink.recv_permitted &&
- (tipc_bclink_get_last_sent(n_ptr->net) != n_ptr->bclink.acked));
+ return rc;
}
-
-/**
- * tipc_bcbearer_send - send a packet through the broadcast pseudo-bearer
- *
- * Send packet over as many bearers as necessary to reach all nodes
- * that have joined the broadcast link.
+/* tipc_bcast_ack_rcv - receive and handle a broadcast acknowledge
*
- * Returns 0 (packet sent successfully) under all circumstances,
- * since the broadcast link's pseudo-bearer never blocks
+ * RCU is locked, no other locks set
*/
-static int tipc_bcbearer_send(struct net *net, struct sk_buff *buf,
- struct tipc_bearer *unused1,
- struct tipc_media_addr *unused2)
+void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked)
{
- int bp_index;
- struct tipc_msg *msg = buf_msg(buf);
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_bcbearer *bcbearer = tn->bcbearer;
- struct tipc_bclink *bclink = tn->bclink;
-
- /* Prepare broadcast link message for reliable transmission,
- * if first time trying to send it;
- * preparation is skipped for broadcast link protocol messages
- * since they are sent in an unreliable manner and don't need it
- */
- if (likely(!msg_non_seq(buf_msg(buf)))) {
- bcbuf_set_acks(buf, bclink->bcast_nodes.count);
- msg_set_non_seq(msg, 1);
- msg_set_mc_netid(msg, tn->net_id);
- tn->bcl->stats.sent_info++;
- if (WARN_ON(!bclink->bcast_nodes.count)) {
- dump_stack();
- return 0;
- }
- }
+ struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq;
+ struct sk_buff_head xmitq;
- /* Send buffer over bearers until all targets reached */
- bcbearer->remains = bclink->bcast_nodes;
-
- for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) {
- struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary;
- struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary;
- struct tipc_bearer *bp[2] = {p, s};
- struct tipc_bearer *b = bp[msg_link_selector(msg)];
- struct sk_buff *tbuf;
-
- if (!p)
- break; /* No more bearers to try */
- if (!b)
- b = p;
- tipc_nmap_diff(&bcbearer->remains, &b->nodes,
- &bcbearer->remains_new);
- if (bcbearer->remains_new.count == bcbearer->remains.count)
- continue; /* Nothing added by bearer pair */
-
- if (bp_index == 0) {
- /* Use original buffer for first bearer */
- tipc_bearer_send(net, b->identity, buf, &b->bcast_addr);
- } else {
- /* Avoid concurrent buffer access */
- tbuf = pskb_copy_for_clone(buf, GFP_ATOMIC);
- if (!tbuf)
- break;
- tipc_bearer_send(net, b->identity, tbuf,
- &b->bcast_addr);
- kfree_skb(tbuf); /* Bearer keeps a clone */
- }
- if (bcbearer->remains_new.count == 0)
- break; /* All targets reached */
+ __skb_queue_head_init(&xmitq);
- bcbearer->remains = bcbearer->remains_new;
- }
+ tipc_bcast_lock(net);
+ tipc_link_bc_ack_rcv(l, acked, &xmitq);
+ tipc_bcast_unlock(net);
- return 0;
+ tipc_bcbase_xmit(net, &xmitq);
+
+ /* Any socket wakeup messages ? */
+ if (!skb_queue_empty(inputq))
+ tipc_sk_rcv(net, inputq);
}
-/**
- * tipc_bcbearer_sort - create sets of bearer pairs used by broadcast bearer
+/* tipc_bcast_sync_rcv - check and update rcv link with peer's send state
+ *
+ * RCU is locked, no other locks set
*/
-void tipc_bcbearer_sort(struct net *net, struct tipc_node_map *nm_ptr,
- u32 node, bool action)
+void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
+ struct tipc_msg *hdr)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_bcbearer *bcbearer = tn->bcbearer;
- struct tipc_bcbearer_pair *bp_temp = bcbearer->bpairs_temp;
- struct tipc_bcbearer_pair *bp_curr;
- struct tipc_bearer *b;
- int b_index;
- int pri;
-
- tipc_bclink_lock(net);
+ struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq;
+ struct sk_buff_head xmitq;
- if (action)
- tipc_nmap_add(nm_ptr, node);
- else
- tipc_nmap_remove(nm_ptr, node);
+ __skb_queue_head_init(&xmitq);
- /* Group bearers by priority (can assume max of two per priority) */
- memset(bp_temp, 0, sizeof(bcbearer->bpairs_temp));
+ tipc_bcast_lock(net);
+ if (msg_type(hdr) == STATE_MSG) {
+ tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq);
+ tipc_link_bc_sync_rcv(l, hdr, &xmitq);
+ } else {
+ tipc_link_bc_init_rcv(l, hdr);
+ }
+ tipc_bcast_unlock(net);
- rcu_read_lock();
- for (b_index = 0; b_index < MAX_BEARERS; b_index++) {
- b = rcu_dereference_rtnl(tn->bearer_list[b_index]);
- if (!b || !b->nodes.count)
- continue;
+ tipc_bcbase_xmit(net, &xmitq);
- if (!bp_temp[b->priority].primary)
- bp_temp[b->priority].primary = b;
- else
- bp_temp[b->priority].secondary = b;
- }
- rcu_read_unlock();
+ /* Any socket wakeup messages ? */
+ if (!skb_queue_empty(inputq))
+ tipc_sk_rcv(net, inputq);
+}
- /* Create array of bearer pairs for broadcasting */
- bp_curr = bcbearer->bpairs;
- memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs));
+/* tipc_bcast_add_peer - add a peer node to broadcast link and bearer
+ *
+ * RCU is locked, node lock is set
+ */
+void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l,
+ struct sk_buff_head *xmitq)
+{
+ struct tipc_link *snd_l = tipc_bc_sndlink(net);
- for (pri = TIPC_MAX_LINK_PRI; pri >= 0; pri--) {
+ tipc_bcast_lock(net);
+ tipc_link_add_bc_peer(snd_l, uc_l, xmitq);
+ tipc_bcbase_select_primary(net);
+ tipc_bcast_unlock(net);
+}
- if (!bp_temp[pri].primary)
- continue;
+/* tipc_bcast_remove_peer - remove a peer node from broadcast link and bearer
+ *
+ * RCU is locked, node lock is set
+ */
+void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l)
+{
+ struct tipc_link *snd_l = tipc_bc_sndlink(net);
+ struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq;
+ struct sk_buff_head xmitq;
- bp_curr->primary = bp_temp[pri].primary;
+ __skb_queue_head_init(&xmitq);
- if (bp_temp[pri].secondary) {
- if (tipc_nmap_equal(&bp_temp[pri].primary->nodes,
- &bp_temp[pri].secondary->nodes)) {
- bp_curr->secondary = bp_temp[pri].secondary;
- } else {
- bp_curr++;
- bp_curr->primary = bp_temp[pri].secondary;
- }
- }
+ tipc_bcast_lock(net);
+ tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq);
+ tipc_bcbase_select_primary(net);
+ tipc_bcast_unlock(net);
- bp_curr++;
- }
+ tipc_bcbase_xmit(net, &xmitq);
- tipc_bclink_unlock(net);
+ /* Any socket wakeup messages ? */
+ if (!skb_queue_empty(inputq))
+ tipc_sk_rcv(net, inputq);
}
static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb,
@@ -784,12 +395,14 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
if (!bcl)
return 0;
- tipc_bclink_lock(net);
+ tipc_bcast_lock(net);
hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
NLM_F_MULTI, TIPC_NL_LINK_GET);
- if (!hdr)
+ if (!hdr) {
+ tipc_bcast_unlock(net);
return -EMSGSIZE;
+ }
attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK);
if (!attrs)
@@ -803,9 +416,9 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
goto attr_msg_full;
if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->next_in_no))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->next_out_no))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt))
goto attr_msg_full;
prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP);
@@ -819,7 +432,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
if (err)
goto attr_msg_full;
- tipc_bclink_unlock(net);
+ tipc_bcast_unlock(net);
nla_nest_end(msg->skb, attrs);
genlmsg_end(msg->skb, hdr);
@@ -830,7 +443,7 @@ prop_msg_full:
attr_msg_full:
nla_nest_cancel(msg->skb, attrs);
msg_full:
- tipc_bclink_unlock(net);
+ tipc_bcast_unlock(net);
genlmsg_cancel(msg->skb, hdr);
return -EMSGSIZE;
@@ -844,143 +457,91 @@ int tipc_bclink_reset_stats(struct net *net)
if (!bcl)
return -ENOPROTOOPT;
- tipc_bclink_lock(net);
+ tipc_bcast_lock(net);
memset(&bcl->stats, 0, sizeof(bcl->stats));
- tipc_bclink_unlock(net);
+ tipc_bcast_unlock(net);
return 0;
}
-int tipc_bclink_set_queue_limits(struct net *net, u32 limit)
+static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *bcl = tn->bcl;
+ struct tipc_link *l = tipc_bc_sndlink(net);
- if (!bcl)
+ if (!l)
return -ENOPROTOOPT;
- if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN))
+ if (limit < BCLINK_WIN_MIN)
+ limit = BCLINK_WIN_MIN;
+ if (limit > TIPC_MAX_LINK_WIN)
return -EINVAL;
-
- tipc_bclink_lock(net);
- tipc_link_set_queue_limits(bcl, limit);
- tipc_bclink_unlock(net);
+ tipc_bcast_lock(net);
+ tipc_link_set_queue_limits(l, limit);
+ tipc_bcast_unlock(net);
return 0;
}
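
Note the asymmetry in the rewritten limit setter: an undersized window is silently raised to BCLINK_WIN_MIN, while an oversized one is still rejected with -EINVAL. A standalone sketch of that clamping is shown below; the numeric values are illustrative, the real constants are defined elsewhere in the TIPC headers.

#include <stdio.h>
#include <errno.h>

#define BCLINK_WIN_MIN    32     /* illustrative */
#define TIPC_MAX_LINK_WIN 8191   /* illustrative */

static unsigned int bc_window = 50;

/* Same shape as tipc_bc_link_set_queue_limits(): undersized requests are
 * raised to the minimum, oversized ones are rejected outright.
 */
static int set_bc_queue_limit(unsigned int limit)
{
	if (limit < BCLINK_WIN_MIN)
		limit = BCLINK_WIN_MIN;
	if (limit > TIPC_MAX_LINK_WIN)
		return -EINVAL;
	bc_window = limit;
	return 0;
}

int main(void)
{
	printf("set 10   -> rc %d, window %u\n", set_bc_queue_limit(10), bc_window);
	printf("set 100  -> rc %d, window %u\n", set_bc_queue_limit(100), bc_window);
	printf("set 9000 -> rc %d, window %u\n", set_bc_queue_limit(9000), bc_window);
	return 0;
}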
-int tipc_bclink_init(struct net *net)
+int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[])
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_bcbearer *bcbearer;
- struct tipc_bclink *bclink;
- struct tipc_link *bcl;
-
- bcbearer = kzalloc(sizeof(*bcbearer), GFP_ATOMIC);
- if (!bcbearer)
- return -ENOMEM;
-
- bclink = kzalloc(sizeof(*bclink), GFP_ATOMIC);
- if (!bclink) {
- kfree(bcbearer);
- return -ENOMEM;
- }
+ int err;
+ u32 win;
+ struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
- bcl = &bclink->link;
- bcbearer->bearer.media = &bcbearer->media;
- bcbearer->media.send_msg = tipc_bcbearer_send;
- sprintf(bcbearer->media.name, "tipc-broadcast");
-
- spin_lock_init(&bclink->lock);
- __skb_queue_head_init(&bcl->transmq);
- __skb_queue_head_init(&bcl->backlogq);
- __skb_queue_head_init(&bcl->deferdq);
- skb_queue_head_init(&bcl->wakeupq);
- bcl->next_out_no = 1;
- spin_lock_init(&bclink->node.lock);
- __skb_queue_head_init(&bclink->arrvq);
- skb_queue_head_init(&bclink->inputq);
- bcl->owner = &bclink->node;
- bcl->owner->net = net;
- bcl->mtu = MAX_PKT_DEFAULT_MCAST;
- tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
- bcl->bearer_id = MAX_BEARERS;
- rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer);
- bcl->state = WORKING_WORKING;
- bcl->pmsg = (struct tipc_msg *)&bcl->proto_msg;
- msg_set_prevnode(bcl->pmsg, tn->own_addr);
- strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
- tn->bcbearer = bcbearer;
- tn->bclink = bclink;
- tn->bcl = bcl;
- return 0;
-}
+ if (!attrs[TIPC_NLA_LINK_PROP])
+ return -EINVAL;
-void tipc_bclink_stop(struct net *net)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
+ err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props);
+ if (err)
+ return err;
- tipc_bclink_lock(net);
- tipc_link_purge_queues(tn->bcl);
- tipc_bclink_unlock(net);
+ if (!props[TIPC_NLA_PROP_WIN])
+ return -EOPNOTSUPP;
- RCU_INIT_POINTER(tn->bearer_list[BCBEARER], NULL);
- synchronize_net();
- kfree(tn->bcbearer);
- kfree(tn->bclink);
+ win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+
+ return tipc_bc_link_set_queue_limits(net, win);
}
-/**
- * tipc_nmap_add - add a node to a node map
- */
-static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node)
+int tipc_bcast_init(struct net *net)
{
- int n = tipc_node(node);
- int w = n / WSIZE;
- u32 mask = (1 << (n % WSIZE));
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_bc_base *bb = NULL;
+ struct tipc_link *l = NULL;
- if ((nm_ptr->map[w] & mask) == 0) {
- nm_ptr->count++;
- nm_ptr->map[w] |= mask;
- }
+ bb = kzalloc(sizeof(*bb), GFP_ATOMIC);
+ if (!bb)
+ goto enomem;
+ tn->bcbase = bb;
+ spin_lock_init(&tipc_net(net)->bclock);
+
+ if (!tipc_link_bc_create(net, 0, 0,
+ U16_MAX,
+ BCLINK_WIN_DEFAULT,
+ 0,
+ &bb->inputq,
+ NULL,
+ NULL,
+ &l))
+ goto enomem;
+ bb->link = l;
+ tn->bcl = l;
+ return 0;
+enomem:
+ kfree(bb);
+ kfree(l);
+ return -ENOMEM;
}
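
tipc_bcast_init() above funnels both failure points through a single 'enomem' label; that works because kfree(NULL) is a no-op, so the link pointer can be freed even when the link was never created. A userspace sketch of the same single-exit error path, using malloc/free and hypothetical structure names:

#include <stdlib.h>

struct bc_base { void *link; };
struct bc_link { int window; };

/* Hypothetical constructor mirroring tipc_bcast_init(): one cleanup label
 * covers both allocations, since free(NULL) is harmless.
 */
static int bcast_init(struct bc_base **basep)
{
	struct bc_base *bb = NULL;
	struct bc_link *l = NULL;

	bb = calloc(1, sizeof(*bb));
	if (!bb)
		goto enomem;
	l = calloc(1, sizeof(*l));
	if (!l)
		goto enomem;
	l->window = 50;        /* BCLINK_WIN_DEFAULT plays this role in the kernel */
	bb->link = l;
	*basep = bb;
	return 0;
enomem:
	free(bb);
	free(l);
	return -1;             /* -ENOMEM in the kernel code */
}

int main(void)
{
	struct bc_base *bb = NULL;
	int rc = bcast_init(&bb);

	if (!rc) {
		free(bb->link);
		free(bb);
	}
	return rc;
}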
-/**
- * tipc_nmap_remove - remove a node from a node map
- */
-static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node)
+void tipc_bcast_reinit(struct net *net)
{
- int n = tipc_node(node);
- int w = n / WSIZE;
- u32 mask = (1 << (n % WSIZE));
+ struct tipc_bc_base *b = tipc_bc_base(net);
- if ((nm_ptr->map[w] & mask) != 0) {
- nm_ptr->map[w] &= ~mask;
- nm_ptr->count--;
- }
+ msg_set_prevnode(b->link->pmsg, tipc_own_addr(net));
}
-/**
- * tipc_nmap_diff - find differences between node maps
- * @nm_a: input node map A
- * @nm_b: input node map B
- * @nm_diff: output node map A-B (i.e. nodes of A that are not in B)
- */
-static void tipc_nmap_diff(struct tipc_node_map *nm_a,
- struct tipc_node_map *nm_b,
- struct tipc_node_map *nm_diff)
+void tipc_bcast_stop(struct net *net)
{
- int stop = ARRAY_SIZE(nm_a->map);
- int w;
- int b;
- u32 map;
-
- memset(nm_diff, 0, sizeof(*nm_diff));
- for (w = 0; w < stop; w++) {
- map = nm_a->map[w] ^ (nm_a->map[w] & nm_b->map[w]);
- nm_diff->map[w] = map;
- if (map != 0) {
- for (b = 0 ; b < WSIZE; b++) {
- if (map & (1 << b))
- nm_diff->count++;
- }
- }
- }
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ synchronize_net();
+ kfree(tn->bcbase);
+ kfree(tn->bcl);
}
diff --git a/kernel/net/tipc/bcast.h b/kernel/net/tipc/bcast.h
index 4bdc12277..2855b9356 100644
--- a/kernel/net/tipc/bcast.h
+++ b/kernel/net/tipc/bcast.h
@@ -37,100 +37,44 @@
#ifndef _TIPC_BCAST_H
#define _TIPC_BCAST_H
-#include <linux/tipc_config.h>
-#include "link.h"
-#include "node.h"
+#include "core.h"
-/**
- * struct tipc_bcbearer_pair - a pair of bearers used by broadcast link
- * @primary: pointer to primary bearer
- * @secondary: pointer to secondary bearer
- *
- * Bearers must have same priority and same set of reachable destinations
- * to be paired.
- */
-
-struct tipc_bcbearer_pair {
- struct tipc_bearer *primary;
- struct tipc_bearer *secondary;
-};
-
-#define BCBEARER MAX_BEARERS
-
-/**
- * struct tipc_bcbearer - bearer used by broadcast link
- * @bearer: (non-standard) broadcast bearer structure
- * @media: (non-standard) broadcast media structure
- * @bpairs: array of bearer pairs
- * @bpairs_temp: temporary array of bearer pairs used by tipc_bcbearer_sort()
- * @remains: temporary node map used by tipc_bcbearer_send()
- * @remains_new: temporary node map used tipc_bcbearer_send()
- *
- * Note: The fields labelled "temporary" are incorporated into the bearer
- * to avoid consuming potentially limited stack space through the use of
- * large local variables within multicast routines. Concurrent access is
- * prevented through use of the spinlock "bclink_lock".
- */
-struct tipc_bcbearer {
- struct tipc_bearer bearer;
- struct tipc_media media;
- struct tipc_bcbearer_pair bpairs[MAX_BEARERS];
- struct tipc_bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1];
- struct tipc_node_map remains;
- struct tipc_node_map remains_new;
-};
+struct tipc_node;
+struct tipc_msg;
+struct tipc_nl_msg;
+struct tipc_node_map;
-/**
- * struct tipc_bclink - link used for broadcast messages
- * @lock: spinlock governing access to structure
- * @link: (non-standard) broadcast link structure
- * @node: (non-standard) node structure representing b'cast link's peer node
- * @bcast_nodes: map of broadcast-capable nodes
- * @retransmit_to: node that most recently requested a retransmit
- *
- * Handles sequence numbering, fragmentation, bundling, etc.
- */
-struct tipc_bclink {
- spinlock_t lock;
- struct tipc_link link;
- struct tipc_node node;
- struct sk_buff_head arrvq;
- struct sk_buff_head inputq;
- struct tipc_node_map bcast_nodes;
- struct tipc_node *retransmit_to;
-};
+int tipc_bcast_init(struct net *net);
+void tipc_bcast_reinit(struct net *net);
+void tipc_bcast_stop(struct net *net);
+void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
+ struct sk_buff_head *xmitq);
+void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
+void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
+void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
+int tipc_bcast_get_mtu(struct net *net);
+int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list);
+int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
+void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked);
+void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
+ struct tipc_msg *hdr);
+int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg);
+int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]);
+int tipc_bclink_reset_stats(struct net *net);
-struct tipc_node;
-extern const char tipc_bclink_name[];
+static inline void tipc_bcast_lock(struct net *net)
+{
+ spin_lock_bh(&tipc_net(net)->bclock);
+}
-/**
- * tipc_nmap_equal - test for equality of node maps
- */
-static inline int tipc_nmap_equal(struct tipc_node_map *nm_a,
- struct tipc_node_map *nm_b)
+static inline void tipc_bcast_unlock(struct net *net)
{
- return !memcmp(nm_a, nm_b, sizeof(*nm_a));
+ spin_unlock_bh(&tipc_net(net)->bclock);
}
-int tipc_bclink_init(struct net *net);
-void tipc_bclink_stop(struct net *net);
-void tipc_bclink_add_node(struct net *net, u32 addr);
-void tipc_bclink_remove_node(struct net *net, u32 addr);
-struct tipc_node *tipc_bclink_retransmit_to(struct net *tn);
-void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked);
-void tipc_bclink_rcv(struct net *net, struct sk_buff *buf);
-u32 tipc_bclink_get_last_sent(struct net *net);
-u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr);
-void tipc_bclink_update_link_state(struct tipc_node *node,
- u32 last_sent);
-int tipc_bclink_reset_stats(struct net *net);
-int tipc_bclink_set_queue_limits(struct net *net, u32 limit);
-void tipc_bcbearer_sort(struct net *net, struct tipc_node_map *nm_ptr,
- u32 node, bool action);
-uint tipc_bclink_get_mtu(void);
-int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list);
-void tipc_bclink_wakeup_users(struct net *net);
-int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg);
-void tipc_bclink_input(struct net *net);
+static inline struct tipc_link *tipc_bc_sndlink(struct net *net)
+{
+ return tipc_net(net)->bcl;
+}
#endif
diff --git a/kernel/net/tipc/bearer.c b/kernel/net/tipc/bearer.c
index 70e3dacbf..648f2a67f 100644
--- a/kernel/net/tipc/bearer.c
+++ b/kernel/net/tipc/bearer.c
@@ -71,8 +71,7 @@ static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = {
[TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED }
};
-static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr,
- bool shutting_down);
+static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr);
/**
* tipc_media_find - locates specified media object by name
@@ -194,10 +193,8 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest)
rcu_read_lock();
b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
- if (b_ptr) {
- tipc_bcbearer_sort(net, &b_ptr->nodes, dest, true);
+ if (b_ptr)
tipc_disc_add_dest(b_ptr->link_req);
- }
rcu_read_unlock();
}
@@ -208,10 +205,8 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest)
rcu_read_lock();
b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
- if (b_ptr) {
- tipc_bcbearer_sort(net, &b_ptr->nodes, dest, false);
+ if (b_ptr)
tipc_disc_remove_dest(b_ptr->link_req);
- }
rcu_read_unlock();
}
@@ -324,7 +319,7 @@ restart:
res = tipc_disc_create(net, b_ptr, &b_ptr->bcast_addr);
if (res) {
- bearer_disable(net, b_ptr, false);
+ bearer_disable(net, b_ptr);
pr_warn("Bearer <%s> rejected, discovery object creation failed\n",
name);
return -EINVAL;
@@ -344,7 +339,7 @@ restart:
static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr)
{
pr_info("Resetting bearer <%s>\n", b_ptr->name);
- tipc_link_reset_list(net, b_ptr->identity);
+ tipc_node_delete_links(net, b_ptr->identity);
tipc_disc_reset(net, b_ptr);
return 0;
}
@@ -354,8 +349,7 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr)
*
* Note: This routine assumes caller holds RTNL lock.
*/
-static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr,
- bool shutting_down)
+static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
u32 i;
@@ -363,7 +357,8 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr,
pr_info("Disabling bearer <%s>\n", b_ptr->name);
b_ptr->media->disable_media(b_ptr);
- tipc_link_delete_list(net, b_ptr->identity, shutting_down);
+ tipc_node_delete_links(net, b_ptr->identity);
+ RCU_INIT_POINTER(b_ptr->media_ptr, NULL);
if (b_ptr->link_req)
tipc_disc_delete(b_ptr->link_req);
@@ -401,16 +396,13 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
/* tipc_disable_l2_media - detach TIPC bearer from an L2 interface
*
- * Mark L2 bearer as inactive so that incoming buffers are thrown away,
- * then get worker thread to complete bearer cleanup. (Can't do cleanup
- * here because cleanup code needs to sleep and caller holds spinlocks.)
+ * Mark L2 bearer as inactive so that incoming buffers are thrown away
*/
void tipc_disable_l2_media(struct tipc_bearer *b)
{
struct net_device *dev;
dev = (struct net_device *)rtnl_dereference(b->media_ptr);
- RCU_INIT_POINTER(b->media_ptr, NULL);
RCU_INIT_POINTER(dev->tipc_ptr, NULL);
synchronize_net();
dev_put(dev);
@@ -422,10 +414,9 @@ void tipc_disable_l2_media(struct tipc_bearer *b)
* @b_ptr: the bearer through which the packet is to be sent
* @dest: peer destination address
*/
-int tipc_l2_send_msg(struct net *net, struct sk_buff *buf,
+int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
struct tipc_bearer *b, struct tipc_media_addr *dest)
{
- struct sk_buff *clone;
struct net_device *dev;
int delta;
@@ -433,42 +424,97 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *buf,
if (!dev)
return 0;
- clone = skb_clone(buf, GFP_ATOMIC);
- if (!clone)
- return 0;
-
- delta = dev->hard_header_len - skb_headroom(buf);
+ delta = dev->hard_header_len - skb_headroom(skb);
if ((delta > 0) &&
- pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
- kfree_skb(clone);
+ pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+ kfree_skb(skb);
return 0;
}
- skb_reset_network_header(clone);
- clone->dev = dev;
- clone->protocol = htons(ETH_P_TIPC);
- dev_hard_header(clone, dev, ETH_P_TIPC, dest->value,
- dev->dev_addr, clone->len);
- dev_queue_xmit(clone);
+ skb_reset_network_header(skb);
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_TIPC);
+ dev_hard_header(skb, dev, ETH_P_TIPC, dest->value,
+ dev->dev_addr, skb->len);
+ dev_queue_xmit(skb);
return 0;
}
-/* tipc_bearer_send- sends buffer to destination over bearer
- *
- * IMPORTANT:
- * The media send routine must not alter the buffer being passed in
- * as it may be needed for later retransmission!
+int tipc_bearer_mtu(struct net *net, u32 bearer_id)
+{
+ int mtu = 0;
+ struct tipc_bearer *b;
+
+ rcu_read_lock();
+ b = rcu_dereference_rtnl(tipc_net(net)->bearer_list[bearer_id]);
+ if (b)
+ mtu = b->mtu;
+ rcu_read_unlock();
+ return mtu;
+}
+
+/* tipc_bearer_xmit_skb - sends buffer to destination over bearer
+ */
+void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
+ struct sk_buff *skb,
+ struct tipc_media_addr *dest)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_bearer *b;
+
+ rcu_read_lock();
+ b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (likely(b))
+ b->media->send_msg(net, skb, b, dest);
+ rcu_read_unlock();
+}
+
+/* tipc_bearer_xmit() - send buffers to destination over bearer
*/
-void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf,
- struct tipc_media_addr *dest)
+void tipc_bearer_xmit(struct net *net, u32 bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr *dst)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_bearer *b_ptr;
+ struct tipc_bearer *b;
+ struct sk_buff *skb, *tmp;
+
+ if (skb_queue_empty(xmitq))
+ return;
rcu_read_lock();
- b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
- if (likely(b_ptr))
- b_ptr->media->send_msg(net, buf, b_ptr, dest);
+ b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (likely(b)) {
+ skb_queue_walk_safe(xmitq, skb, tmp) {
+ __skb_dequeue(xmitq);
+ b->media->send_msg(net, skb, b, dst);
+ }
+ }
+ rcu_read_unlock();
+}
+
+/* tipc_bearer_bc_xmit() - broadcast buffers to all destinations
+ */
+void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
+ struct sk_buff_head *xmitq)
+{
+ struct tipc_net *tn = tipc_net(net);
+ int net_id = tn->net_id;
+ struct tipc_bearer *b;
+ struct sk_buff *skb, *tmp;
+ struct tipc_msg *hdr;
+
+ rcu_read_lock();
+ b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (likely(b)) {
+ skb_queue_walk_safe(xmitq, skb, tmp) {
+ hdr = buf_msg(skb);
+ msg_set_non_seq(hdr, 1);
+ msg_set_mc_netid(hdr, net_id);
+ __skb_dequeue(xmitq);
+ b->media->send_msg(net, skb, b, &b->bcast_addr);
+ }
+ }
rcu_read_unlock();
}
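
Unlike the unicast tipc_bearer_xmit(), the broadcast variant above stamps every header as non-sequenced and tags it with the network id while draining the queue, before handing each buffer to the media layer on the bearer's broadcast address. A compact sketch of that stamp-then-send loop, with plain structs standing in for sk_buffs and hypothetical names:

#include <stdio.h>

#define NPKTS 3

struct msg_hdr {
	int non_seq;
	int mc_netid;
};

static void media_send(int bearer_id, const struct msg_hdr *h)
{
	printf("bearer %d bcast: non_seq=%d netid=%d\n",
	       bearer_id, h->non_seq, h->mc_netid);
}

/* Mirrors the loop in tipc_bearer_bc_xmit(): every packet leaving on the
 * broadcast address is marked non-sequenced and tagged with the net id.
 */
static void bearer_bc_xmit(int bearer_id, int net_id,
			   struct msg_hdr *q, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		q[i].non_seq = 1;
		q[i].mc_netid = net_id;
		media_send(bearer_id, &q[i]);
	}
}

int main(void)
{
	struct msg_hdr q[NPKTS] = { 0 };

	bearer_bc_xmit(0, 4711, q, NPKTS);
	return 0;
}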
@@ -530,7 +576,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
case NETDEV_CHANGE:
if (netif_carrier_ok(dev))
break;
- case NETDEV_DOWN:
+ case NETDEV_GOING_DOWN:
case NETDEV_CHANGEMTU:
tipc_reset_bearer(net, b_ptr);
break;
@@ -541,7 +587,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
break;
case NETDEV_UNREGISTER:
case NETDEV_CHANGENAME:
- bearer_disable(dev_net(dev), b_ptr, false);
+ bearer_disable(dev_net(dev), b_ptr);
break;
}
return NOTIFY_OK;
@@ -583,7 +629,7 @@ void tipc_bearer_stop(struct net *net)
for (i = 0; i < MAX_BEARERS; i++) {
b_ptr = rtnl_dereference(tn->bearer_list[i]);
if (b_ptr) {
- bearer_disable(net, b_ptr, true);
+ bearer_disable(net, b_ptr);
tn->bearer_list[i] = NULL;
}
}
@@ -747,7 +793,7 @@ int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
- bearer_disable(net, bearer, false);
+ bearer_disable(net, bearer);
rtnl_unlock();
return 0;
@@ -812,7 +858,7 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
char *name;
struct tipc_bearer *b;
struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
- struct net *net = genl_info_net(info);
+ struct net *net = sock_net(skb->sk);
if (!info->attrs[TIPC_NLA_BEARER])
return -EINVAL;
diff --git a/kernel/net/tipc/bearer.h b/kernel/net/tipc/bearer.h
index 5cad243ee..552185bc4 100644
--- a/kernel/net/tipc/bearer.h
+++ b/kernel/net/tipc/bearer.h
@@ -38,9 +38,9 @@
#define _TIPC_BEARER_H
#include "netlink.h"
+#include "core.h"
#include <net/genetlink.h>
-#define MAX_BEARERS 2
#define MAX_MEDIA 3
#define MAX_NODES 4096
#define WSIZE 32
@@ -163,6 +163,7 @@ struct tipc_bearer {
u32 identity;
struct tipc_link_req *link_req;
char net_plane;
+ int node_cnt;
struct tipc_node_map nodes;
};
@@ -215,7 +216,14 @@ struct tipc_media *tipc_media_find(const char *name);
int tipc_bearer_setup(void);
void tipc_bearer_cleanup(void);
void tipc_bearer_stop(struct net *net);
-void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf,
- struct tipc_media_addr *dest);
+int tipc_bearer_mtu(struct net *net, u32 bearer_id);
+void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
+ struct sk_buff *skb,
+ struct tipc_media_addr *dest);
+void tipc_bearer_xmit(struct net *net, u32 bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr *dst);
+void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
+ struct sk_buff_head *xmitq);
#endif /* _TIPC_BEARER_H */
diff --git a/kernel/net/tipc/core.c b/kernel/net/tipc/core.c
index be1c9fa60..03a842870 100644
--- a/kernel/net/tipc/core.c
+++ b/kernel/net/tipc/core.c
@@ -42,6 +42,7 @@
#include "bearer.h"
#include "net.h"
#include "socket.h"
+#include "bcast.h"
#include <linux/module.h>
@@ -68,11 +69,18 @@ static int __net_init tipc_init_net(struct net *net)
if (err)
goto out_nametbl;
- err = tipc_subscr_start(net);
+ err = tipc_topsrv_start(net);
if (err)
goto out_subscr;
+
+ err = tipc_bcast_init(net);
+ if (err)
+ goto out_bclink;
+
return 0;
+out_bclink:
+ tipc_bcast_stop(net);
out_subscr:
tipc_nametbl_stop(net);
out_nametbl:
@@ -83,8 +91,9 @@ out_sk_rht:
static void __net_exit tipc_exit_net(struct net *net)
{
- tipc_subscr_stop(net);
+ tipc_topsrv_stop(net);
tipc_net_stop(net);
+ tipc_bcast_stop(net);
tipc_nametbl_stop(net);
tipc_sk_rht_destroy(net);
}
diff --git a/kernel/net/tipc/core.h b/kernel/net/tipc/core.h
index 3dc68c7a9..18e95a802 100644
--- a/kernel/net/tipc/core.h
+++ b/kernel/net/tipc/core.h
@@ -60,16 +60,18 @@
#include <net/netns/generic.h>
#include <linux/rhashtable.h>
-#include "node.h"
-#include "bearer.h"
-#include "bcast.h"
-#include "netlink.h"
-#include "link.h"
-#include "node.h"
-#include "msg.h"
+struct tipc_node;
+struct tipc_bearer;
+struct tipc_bc_base;
+struct tipc_link;
+struct tipc_name_table;
+struct tipc_server;
#define TIPC_MOD_VER "2.0.0"
+#define NODE_HTABLE_SIZE 512
+#define MAX_BEARERS 3
+
extern int tipc_net_id __read_mostly;
extern int sysctl_tipc_rmem[3] __read_mostly;
extern int sysctl_tipc_named_timeout __read_mostly;
@@ -90,8 +92,8 @@ struct tipc_net {
struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1];
/* Broadcast link */
- struct tipc_bcbearer *bcbearer;
- struct tipc_bclink *bclink;
+ spinlock_t bclock;
+ struct tipc_bc_base *bcbase;
struct tipc_link *bcl;
/* Socket hash table */
@@ -106,6 +108,41 @@ struct tipc_net {
atomic_t subscription_count;
};
+static inline struct tipc_net *tipc_net(struct net *net)
+{
+ return net_generic(net, tipc_net_id);
+}
+
+static inline int tipc_netid(struct net *net)
+{
+ return tipc_net(net)->net_id;
+}
+
+static inline u16 mod(u16 x)
+{
+ return x & 0xffffu;
+}
+
+static inline int less_eq(u16 left, u16 right)
+{
+ return mod(right - left) < 32768u;
+}
+
+static inline int more(u16 left, u16 right)
+{
+ return !less_eq(left, right);
+}
+
+static inline int less(u16 left, u16 right)
+{
+ return less_eq(left, right) && (mod(right) != mod(left));
+}
+
+static inline int in_range(u16 val, u16 min, u16 max)
+{
+ return !less(val, min) && !more(val, max);
+}
+
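
The u16 helpers moved into core.h implement circular sequence-number comparison: two numbers compare as "less or equal" when their unsigned 16-bit difference is below half the number space, which keeps the ordering correct across wraparound. The following standalone check exercises exactly that arithmetic (the helper bodies are copied from the diff; the test harness is the only addition):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the inline helpers added to net/tipc/core.h */
static uint16_t mod(uint16_t x)            { return x & 0xffffu; }
static int less_eq(uint16_t l, uint16_t r) { return mod(r - l) < 32768u; }
static int more(uint16_t l, uint16_t r)    { return !less_eq(l, r); }
static int less(uint16_t l, uint16_t r)    { return less_eq(l, r) && (mod(r) != mod(l)); }

static int in_range(uint16_t v, uint16_t lo, uint16_t hi)
{
	return !less(v, lo) && !more(v, hi);
}

int main(void)
{
	assert(less(10, 11));
	assert(less_eq(10, 10) && !less(10, 10));
	assert(less(65530, 5));          /* comparison survives wraparound */
	assert(more(5, 65530));
	assert(in_range(2, 65534, 10));  /* window straddling the wrap point */
	printf("all sequence-number checks passed\n");
	return 0;
}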
#ifdef CONFIG_SYSCTL
int tipc_register_sysctl(void);
void tipc_unregister_sysctl(void);
diff --git a/kernel/net/tipc/discover.c b/kernel/net/tipc/discover.c
index 967e292f5..afe8c47c4 100644
--- a/kernel/net/tipc/discover.c
+++ b/kernel/net/tipc/discover.c
@@ -35,7 +35,7 @@
*/
#include "core.h"
-#include "link.h"
+#include "node.h"
#include "discover.h"
/* min delay during bearer start up */
@@ -89,7 +89,7 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type,
MAX_H_SIZE, dest_domain);
msg_set_non_seq(msg, 1);
msg_set_node_sig(msg, tn->random);
- msg_set_node_capabilities(msg, 0);
+ msg_set_node_capabilities(msg, TIPC_NODE_CAPABILITIES);
msg_set_dest_domain(msg, dest_domain);
msg_set_bc_netid(msg, tn->net_id);
b_ptr->media->addr2msg(msg_media_addr(msg), &b_ptr->addr);
@@ -120,30 +120,24 @@ static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr,
* @buf: buffer containing message
* @bearer: bearer that message arrived on
*/
-void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
+void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
struct tipc_bearer *bearer)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_node *node;
- struct tipc_link *link;
struct tipc_media_addr maddr;
- struct sk_buff *rbuf;
- struct tipc_msg *msg = buf_msg(buf);
- u32 ddom = msg_dest_domain(msg);
- u32 onode = msg_prevnode(msg);
- u32 net_id = msg_bc_netid(msg);
- u32 mtyp = msg_type(msg);
- u32 signature = msg_node_sig(msg);
- u16 caps = msg_node_capabilities(msg);
- bool addr_match = false;
- bool sign_match = false;
- bool link_up = false;
- bool accept_addr = false;
- bool accept_sign = false;
+ struct sk_buff *rskb;
+ struct tipc_msg *hdr = buf_msg(skb);
+ u32 ddom = msg_dest_domain(hdr);
+ u32 onode = msg_prevnode(hdr);
+ u32 net_id = msg_bc_netid(hdr);
+ u32 mtyp = msg_type(hdr);
+ u32 signature = msg_node_sig(hdr);
+ u16 caps = msg_node_capabilities(hdr);
bool respond = false;
+ bool dupl_addr = false;
- bearer->media->msg2addr(bearer, &maddr, msg_media_addr(msg));
- kfree_skb(buf);
+ bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr));
+ kfree_skb(skb);
/* Ensure message from node is valid and communication is permitted */
if (net_id != tn->net_id)
@@ -165,102 +159,19 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
if (!tipc_in_scope(bearer->domain, onode))
return;
- node = tipc_node_create(net, onode);
- if (!node)
- return;
- tipc_node_lock(node);
- node->capabilities = caps;
- link = node->links[bearer->identity];
-
- /* Prepare to validate requesting node's signature and media address */
- sign_match = (signature == node->signature);
- addr_match = link && !memcmp(&link->media_addr, &maddr, sizeof(maddr));
- link_up = link && tipc_link_is_up(link);
-
-
- /* These three flags give us eight permutations: */
-
- if (sign_match && addr_match && link_up) {
- /* All is fine. Do nothing. */
- } else if (sign_match && addr_match && !link_up) {
- /* Respond. The link will come up in due time */
- respond = true;
- } else if (sign_match && !addr_match && link_up) {
- /* Peer has changed i/f address without rebooting.
- * If so, the link will reset soon, and the next
- * discovery will be accepted. So we can ignore it.
- * It may also be an cloned or malicious peer having
- * chosen the same node address and signature as an
- * existing one.
- * Ignore requests until the link goes down, if ever.
- */
- disc_dupl_alert(bearer, onode, &maddr);
- } else if (sign_match && !addr_match && !link_up) {
- /* Peer link has changed i/f address without rebooting.
- * It may also be a cloned or malicious peer; we can't
- * distinguish between the two.
- * The signature is correct, so we must accept.
- */
- accept_addr = true;
- respond = true;
- } else if (!sign_match && addr_match && link_up) {
- /* Peer node rebooted. Two possibilities:
- * - Delayed re-discovery; this link endpoint has already
- * reset and re-established contact with the peer, before
- * receiving a discovery message from that node.
- * (The peer happened to receive one from this node first).
- * - The peer came back so fast that our side has not
- * discovered it yet. Probing from this side will soon
- * reset the link, since there can be no working link
- * endpoint at the peer end, and the link will re-establish.
- * Accept the signature, since it comes from a known peer.
- */
- accept_sign = true;
- } else if (!sign_match && addr_match && !link_up) {
- /* The peer node has rebooted.
- * Accept signature, since it is a known peer.
- */
- accept_sign = true;
- respond = true;
- } else if (!sign_match && !addr_match && link_up) {
- /* Peer rebooted with new address, or a new/duplicate peer.
- * Ignore until the link goes down, if ever.
- */
+ tipc_node_check_dest(net, onode, bearer, caps, signature,
+ &maddr, &respond, &dupl_addr);
+ if (dupl_addr)
disc_dupl_alert(bearer, onode, &maddr);
- } else if (!sign_match && !addr_match && !link_up) {
- /* Peer rebooted with new address, or it is a new peer.
- * Accept signature and address.
- */
- accept_sign = true;
- accept_addr = true;
- respond = true;
- }
-
- if (accept_sign)
- node->signature = signature;
-
- if (accept_addr) {
- if (!link)
- link = tipc_link_create(node, bearer, &maddr);
- if (link) {
- memcpy(&link->media_addr, &maddr, sizeof(maddr));
- tipc_link_reset(link);
- } else {
- respond = false;
- }
- }
/* Send response, if necessary */
if (respond && (mtyp == DSC_REQ_MSG)) {
- rbuf = tipc_buf_acquire(MAX_H_SIZE);
- if (rbuf) {
- tipc_disc_init_msg(net, rbuf, DSC_RESP_MSG, bearer);
- tipc_bearer_send(net, bearer->identity, rbuf, &maddr);
- kfree_skb(rbuf);
- }
+ rskb = tipc_buf_acquire(MAX_H_SIZE);
+ if (!rskb)
+ return;
+ tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer);
+ tipc_bearer_xmit_skb(net, bearer->identity, rskb, &maddr);
}
- tipc_node_unlock(node);
- tipc_node_put(node);
}
/**
@@ -313,6 +224,7 @@ void tipc_disc_remove_dest(struct tipc_link_req *req)
static void disc_timeout(unsigned long data)
{
struct tipc_link_req *req = (struct tipc_link_req *)data;
+ struct sk_buff *skb;
int max_delay;
spin_lock_bh(&req->lock);
@@ -330,9 +242,9 @@ static void disc_timeout(unsigned long data)
* hold at fast polling rate if don't have any associated nodes,
* otherwise hold at slow polling rate
*/
- tipc_bearer_send(req->net, req->bearer_id, req->buf, &req->dest);
-
-
+ skb = skb_clone(req->buf, GFP_ATOMIC);
+ if (skb)
+ tipc_bearer_xmit_skb(req->net, req->bearer_id, skb, &req->dest);
req->timer_intv *= 2;
if (req->num_nodes)
max_delay = TIPC_LINK_REQ_SLOW;
@@ -359,6 +271,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr,
struct tipc_media_addr *dest)
{
struct tipc_link_req *req;
+ struct sk_buff *skb;
req = kmalloc(sizeof(*req), GFP_ATOMIC);
if (!req)
@@ -380,7 +293,9 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr,
setup_timer(&req->timer, disc_timeout, (unsigned long)req);
mod_timer(&req->timer, jiffies + req->timer_intv);
b_ptr->link_req = req;
- tipc_bearer_send(net, req->bearer_id, req->buf, &req->dest);
+ skb = skb_clone(req->buf, GFP_ATOMIC);
+ if (skb)
+ tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest);
return 0;
}
@@ -404,6 +319,7 @@ void tipc_disc_delete(struct tipc_link_req *req)
void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr)
{
struct tipc_link_req *req = b_ptr->link_req;
+ struct sk_buff *skb;
spin_lock_bh(&req->lock);
tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b_ptr);
@@ -413,6 +329,8 @@ void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr)
req->num_nodes = 0;
req->timer_intv = TIPC_LINK_REQ_INIT;
mod_timer(&req->timer, jiffies + req->timer_intv);
- tipc_bearer_send(net, req->bearer_id, req->buf, &req->dest);
+ skb = skb_clone(req->buf, GFP_ATOMIC);
+ if (skb)
+ tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest);
spin_unlock_bh(&req->lock);
}
diff --git a/kernel/net/tipc/link.c b/kernel/net/tipc/link.c
index 43a515dc9..91aea071a 100644
--- a/kernel/net/tipc/link.c
+++ b/kernel/net/tipc/link.c
@@ -48,9 +48,9 @@
/*
* Error message prefixes
*/
-static const char *link_co_err = "Link changeover error, ";
+static const char *link_co_err = "Link tunneling error, ";
static const char *link_rst_msg = "Resetting link ";
-static const char *link_unk_evt = "Unknown link event ";
+static const char tipc_bclink_name[] = "broadcast-link";
static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
[TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC },
@@ -76,257 +76,530 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
[TIPC_NLA_PROP_WIN] = { .type = NLA_U32 }
};
+/* Send states for broadcast NACKs
+ */
+enum {
+ BC_NACK_SND_CONDITIONAL,
+ BC_NACK_SND_UNCONDITIONAL,
+ BC_NACK_SND_SUPPRESS,
+};
+
+/*
+ * Interval between NACKs when packets arrive out of order
+ */
+#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
/*
* Out-of-range value for link session numbers
*/
-#define INVALID_SESSION 0x10000
+#define WILDCARD_SESSION 0x10000
-/*
- * Link state events:
+/* Link FSM states:
*/
-#define STARTING_EVT 856384768 /* link processing trigger */
-#define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */
-#define TIMEOUT_EVT 560817u /* link timer expired */
+enum {
+ LINK_ESTABLISHED = 0xe,
+ LINK_ESTABLISHING = 0xe << 4,
+ LINK_RESET = 0x1 << 8,
+ LINK_RESETTING = 0x2 << 12,
+ LINK_PEER_RESET = 0xd << 16,
+ LINK_FAILINGOVER = 0xf << 20,
+ LINK_SYNCHING = 0xc << 24
+};
-/*
- * State value stored in 'failover_pkts'
+/* Link FSM state checking routines
*/
-#define FIRST_FAILOVER 0xffffu
-
-static void link_handle_out_of_seq_msg(struct tipc_link *link,
- struct sk_buff *skb);
-static void tipc_link_proto_rcv(struct tipc_link *link,
- struct sk_buff *skb);
-static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol);
-static void link_state_event(struct tipc_link *l_ptr, u32 event);
+static int link_is_up(struct tipc_link *l)
+{
+ return l->state & (LINK_ESTABLISHED | LINK_SYNCHING);
+}
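
The new FSM states are deliberately encoded in disjoint bit positions, so groups of states collapse to a single mask test, as link_is_up() above and tipc_link_is_blocked() further down show. A standalone illustration follows; the enum values are copied from the diff, the helper names and test harness are hypothetical.

#include <assert.h>
#include <stdio.h>

/* State values as in the new enum in link.c: each state occupies its own
 * bit pattern, so sets of states can be tested with one bitwise AND.
 */
enum {
	LINK_ESTABLISHED  = 0xe,
	LINK_ESTABLISHING = 0xe << 4,
	LINK_RESET        = 0x1 << 8,
	LINK_RESETTING    = 0x2 << 12,
	LINK_PEER_RESET   = 0xd << 16,
	LINK_FAILINGOVER  = 0xf << 20,
	LINK_SYNCHING     = 0xc << 24
};

static int is_up(int state)
{
	return state & (LINK_ESTABLISHED | LINK_SYNCHING);
}

static int is_blocked(int state)
{
	return state & (LINK_RESETTING | LINK_PEER_RESET | LINK_FAILINGOVER);
}

int main(void)
{
	assert(is_up(LINK_SYNCHING));
	assert(!is_up(LINK_RESET));
	assert(is_blocked(LINK_FAILINGOVER));
	assert(!is_blocked(LINK_ESTABLISHED));
	printf("state-mask checks passed\n");
	return 0;
}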
+
+static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq);
+static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
+ u16 rcvgap, int tolerance, int priority,
+ struct sk_buff_head *xmitq);
static void link_reset_statistics(struct tipc_link *l_ptr);
static void link_print(struct tipc_link *l_ptr, const char *str);
-static void tipc_link_sync_xmit(struct tipc_link *l);
-static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf);
-static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb);
-static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb);
-static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb);
+static void tipc_link_build_nack_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq);
+static void tipc_link_build_bc_init_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq);
+static bool tipc_link_release_pkts(struct tipc_link *l, u16 to);
+
/*
- * Simple link routines
+ * Simple non-static link routines (i.e. referenced outside this file)
*/
-static unsigned int align(unsigned int i)
+bool tipc_link_is_up(struct tipc_link *l)
{
- return (i + 3) & ~3u;
+ return link_is_up(l);
}
-static void tipc_link_release(struct kref *kref)
+bool tipc_link_peer_is_down(struct tipc_link *l)
{
- kfree(container_of(kref, struct tipc_link, ref));
+ return l->state == LINK_PEER_RESET;
}
-static void tipc_link_get(struct tipc_link *l_ptr)
+bool tipc_link_is_reset(struct tipc_link *l)
{
- kref_get(&l_ptr->ref);
+ return l->state & (LINK_RESET | LINK_FAILINGOVER | LINK_ESTABLISHING);
}
-static void tipc_link_put(struct tipc_link *l_ptr)
+bool tipc_link_is_establishing(struct tipc_link *l)
{
- kref_put(&l_ptr->ref, tipc_link_release);
+ return l->state == LINK_ESTABLISHING;
}
-static struct tipc_link *tipc_parallel_link(struct tipc_link *l)
+bool tipc_link_is_synching(struct tipc_link *l)
{
- if (l->owner->active_links[0] != l)
- return l->owner->active_links[0];
- return l->owner->active_links[1];
+ return l->state == LINK_SYNCHING;
}
-/*
- * Simple non-static link routines (i.e. referenced outside this file)
- */
-int tipc_link_is_up(struct tipc_link *l_ptr)
+bool tipc_link_is_failingover(struct tipc_link *l)
{
- if (!l_ptr)
- return 0;
- return link_working_working(l_ptr) || link_working_unknown(l_ptr);
+ return l->state == LINK_FAILINGOVER;
}
-int tipc_link_is_active(struct tipc_link *l_ptr)
+bool tipc_link_is_blocked(struct tipc_link *l)
{
- return (l_ptr->owner->active_links[0] == l_ptr) ||
- (l_ptr->owner->active_links[1] == l_ptr);
+ return l->state & (LINK_RESETTING | LINK_PEER_RESET | LINK_FAILINGOVER);
}
-/**
- * link_timeout - handle expiration of link timer
- * @l_ptr: pointer to link
- */
-static void link_timeout(unsigned long data)
+static bool link_is_bc_sndlink(struct tipc_link *l)
{
- struct tipc_link *l_ptr = (struct tipc_link *)data;
- struct sk_buff *skb;
+ return !l->bc_sndlink;
+}
- tipc_node_lock(l_ptr->owner);
+static bool link_is_bc_rcvlink(struct tipc_link *l)
+{
+ return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l));
+}
- /* update counters used in statistical profiling of send traffic */
- l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->transmq);
- l_ptr->stats.queue_sz_counts++;
+int tipc_link_is_active(struct tipc_link *l)
+{
+ return l->active;
+}
- skb = skb_peek(&l_ptr->transmq);
- if (skb) {
- struct tipc_msg *msg = buf_msg(skb);
- u32 length = msg_size(msg);
+void tipc_link_set_active(struct tipc_link *l, bool active)
+{
+ l->active = active;
+}
- if ((msg_user(msg) == MSG_FRAGMENTER) &&
- (msg_type(msg) == FIRST_FRAGMENT)) {
- length = msg_size(msg_get_wrapped(msg));
- }
- if (length) {
- l_ptr->stats.msg_lengths_total += length;
- l_ptr->stats.msg_length_counts++;
- if (length <= 64)
- l_ptr->stats.msg_length_profile[0]++;
- else if (length <= 256)
- l_ptr->stats.msg_length_profile[1]++;
- else if (length <= 1024)
- l_ptr->stats.msg_length_profile[2]++;
- else if (length <= 4096)
- l_ptr->stats.msg_length_profile[3]++;
- else if (length <= 16384)
- l_ptr->stats.msg_length_profile[4]++;
- else if (length <= 32768)
- l_ptr->stats.msg_length_profile[5]++;
- else
- l_ptr->stats.msg_length_profile[6]++;
- }
+void tipc_link_add_bc_peer(struct tipc_link *snd_l,
+ struct tipc_link *uc_l,
+ struct sk_buff_head *xmitq)
+{
+ struct tipc_link *rcv_l = uc_l->bc_rcvlink;
+
+ snd_l->ackers++;
+ rcv_l->acked = snd_l->snd_nxt - 1;
+ snd_l->state = LINK_ESTABLISHED;
+ tipc_link_build_bc_init_msg(uc_l, xmitq);
+}
+
+void tipc_link_remove_bc_peer(struct tipc_link *snd_l,
+ struct tipc_link *rcv_l,
+ struct sk_buff_head *xmitq)
+{
+ u16 ack = snd_l->snd_nxt - 1;
+
+ snd_l->ackers--;
+ tipc_link_bc_ack_rcv(rcv_l, ack, xmitq);
+ tipc_link_reset(rcv_l);
+ rcv_l->state = LINK_RESET;
+ if (!snd_l->ackers) {
+ tipc_link_reset(snd_l);
+ snd_l->state = LINK_RESET;
+ __skb_queue_purge(xmitq);
}
+}
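
tipc_link_add_bc_peer()/tipc_link_remove_bc_peer() keep a reference count ('ackers') of broadcast receivers on the send link: a newly added peer is assumed to have acknowledged everything up to snd_nxt - 1, and once the last peer disappears the send link itself is reset. A compact userspace model of that bookkeeping, with hypothetical structure names and no real link state:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct snd_link {
	int      ackers;   /* peers expected to acknowledge broadcast traffic */
	uint16_t snd_nxt;  /* next sequence number to send */
	int      up;
};

struct rcv_link {
	uint16_t acked;    /* last sequence number this peer has acked */
	int      up;
};

/* Model of tipc_link_add_bc_peer(): the new receiver starts acked at
 * (snd_nxt - 1) and the send link is considered established.
 */
static void add_bc_peer(struct snd_link *s, struct rcv_link *r)
{
	s->ackers++;
	r->acked = s->snd_nxt - 1;
	r->up = 1;
	s->up = 1;
}

/* Model of tipc_link_remove_bc_peer(): drop the receiver and reset the
 * send link once nobody is left to acknowledge its traffic.
 */
static void remove_bc_peer(struct snd_link *s, struct rcv_link *r)
{
	s->ackers--;
	r->up = 0;
	if (!s->ackers)
		s->up = 0;
}

int main(void)
{
	struct snd_link s = { .snd_nxt = 100 };
	struct rcv_link a = { 0 }, b = { 0 };

	add_bc_peer(&s, &a);
	add_bc_peer(&s, &b);
	assert(s.ackers == 2 && a.acked == 99 && s.up);
	remove_bc_peer(&s, &a);
	assert(s.up);
	remove_bc_peer(&s, &b);
	assert(!s.up && s.ackers == 0);
	printf("acker bookkeeping checks passed\n");
	return 0;
}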
- /* do all other link processing performed on a periodic basis */
- link_state_event(l_ptr, TIMEOUT_EVT);
+int tipc_link_bc_peers(struct tipc_link *l)
+{
+ return l->ackers;
+}
- if (skb_queue_len(&l_ptr->backlogq))
- tipc_link_push_packets(l_ptr);
+void tipc_link_set_mtu(struct tipc_link *l, int mtu)
+{
+ l->mtu = mtu;
+}
- tipc_node_unlock(l_ptr->owner);
- tipc_link_put(l_ptr);
+int tipc_link_mtu(struct tipc_link *l)
+{
+ return l->mtu;
}
-static void link_set_timer(struct tipc_link *link, unsigned long time)
+static u32 link_own_addr(struct tipc_link *l)
{
- if (!mod_timer(&link->timer, jiffies + time))
- tipc_link_get(link);
+ return msg_prevnode(l->pmsg);
}
/**
* tipc_link_create - create a new link
- * @n_ptr: pointer to associated node
- * @b_ptr: pointer to associated bearer
- * @media_addr: media address to use when sending messages over link
+ * @n: pointer to associated node
+ * @if_name: associated interface name
+ * @bearer_id: id (index) of associated bearer
+ * @tolerance: link tolerance to be used by link
+ * @net_plane: network plane (A,B,c..) this link belongs to
+ * @mtu: mtu to be advertised by link
+ * @priority: priority to be used by link
+ * @window: send window to be used by link
+ * @session: session to be used by link
+ * @ownnode: identity of own node
+ * @peer: node id of peer node
+ * @peer_caps: bitmap describing peer node capabilities
+ * @bc_sndlink: the namespace global link used for broadcast sending
+ * @bc_rcvlink: the peer specific link used for broadcast reception
+ * @inputq: queue to put messages ready for delivery
+ * @namedq: queue to put binding table update messages ready for delivery
+ * @link: return value, pointer to put the created link
*
- * Returns pointer to link.
+ * Returns true if link was created, otherwise false
*/
-struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
- struct tipc_bearer *b_ptr,
- const struct tipc_media_addr *media_addr)
+bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
+ int tolerance, char net_plane, u32 mtu, int priority,
+ int window, u32 session, u32 ownnode, u32 peer,
+ u16 peer_caps,
+ struct tipc_link *bc_sndlink,
+ struct tipc_link *bc_rcvlink,
+ struct sk_buff_head *inputq,
+ struct sk_buff_head *namedq,
+ struct tipc_link **link)
{
- struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
- struct tipc_link *l_ptr;
- struct tipc_msg *msg;
- char *if_name;
- char addr_string[16];
- u32 peer = n_ptr->addr;
-
- if (n_ptr->link_cnt >= MAX_BEARERS) {
- tipc_addr_string_fill(addr_string, n_ptr->addr);
- pr_err("Attempt to establish %uth link to %s. Max %u allowed.\n",
- n_ptr->link_cnt, addr_string, MAX_BEARERS);
- return NULL;
- }
-
- if (n_ptr->links[b_ptr->identity]) {
- tipc_addr_string_fill(addr_string, n_ptr->addr);
- pr_err("Attempt to establish second link on <%s> to %s\n",
- b_ptr->name, addr_string);
- return NULL;
- }
+ struct tipc_link *l;
+ struct tipc_msg *hdr;
- l_ptr = kzalloc(sizeof(*l_ptr), GFP_ATOMIC);
- if (!l_ptr) {
- pr_warn("Link creation failed, no memory\n");
- return NULL;
- }
- kref_init(&l_ptr->ref);
- l_ptr->addr = peer;
- if_name = strchr(b_ptr->name, ':') + 1;
- sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
- tipc_zone(tn->own_addr), tipc_cluster(tn->own_addr),
- tipc_node(tn->own_addr),
- if_name,
- tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
- /* note: peer i/f name is updated by reset/activate message */
- memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr));
- l_ptr->owner = n_ptr;
- l_ptr->checkpoint = 1;
- l_ptr->peer_session = INVALID_SESSION;
- l_ptr->bearer_id = b_ptr->identity;
- link_set_supervision_props(l_ptr, b_ptr->tolerance);
- l_ptr->state = RESET_UNKNOWN;
-
- l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg;
- msg = l_ptr->pmsg;
- tipc_msg_init(tn->own_addr, msg, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE,
- l_ptr->addr);
- msg_set_size(msg, sizeof(l_ptr->proto_msg));
- msg_set_session(msg, (tn->random & 0xffff));
- msg_set_bearer_id(msg, b_ptr->identity);
- strcpy((char *)msg_data(msg), if_name);
- l_ptr->net_plane = b_ptr->net_plane;
- l_ptr->advertised_mtu = b_ptr->mtu;
- l_ptr->mtu = l_ptr->advertised_mtu;
- l_ptr->priority = b_ptr->priority;
- tipc_link_set_queue_limits(l_ptr, b_ptr->window);
- l_ptr->next_out_no = 1;
- __skb_queue_head_init(&l_ptr->transmq);
- __skb_queue_head_init(&l_ptr->backlogq);
- __skb_queue_head_init(&l_ptr->deferdq);
- skb_queue_head_init(&l_ptr->wakeupq);
- skb_queue_head_init(&l_ptr->inputq);
- skb_queue_head_init(&l_ptr->namedq);
- link_reset_statistics(l_ptr);
- tipc_node_attach_link(n_ptr, l_ptr);
- setup_timer(&l_ptr->timer, link_timeout, (unsigned long)l_ptr);
- link_state_event(l_ptr, STARTING_EVT);
-
- return l_ptr;
+ l = kzalloc(sizeof(*l), GFP_ATOMIC);
+ if (!l)
+ return false;
+ *link = l;
+ l->pmsg = (struct tipc_msg *)&l->proto_msg;
+ hdr = l->pmsg;
+ tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer);
+ msg_set_size(hdr, sizeof(l->proto_msg));
+ msg_set_session(hdr, session);
+ msg_set_bearer_id(hdr, l->bearer_id);
+
+ /* Note: peer i/f name is completed by reset/activate message */
+ sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
+ tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode),
+ if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
+ strcpy((char *)msg_data(hdr), if_name);
+
+ l->addr = peer;
+ l->peer_caps = peer_caps;
+ l->net = net;
+ l->peer_session = WILDCARD_SESSION;
+ l->bearer_id = bearer_id;
+ l->tolerance = tolerance;
+ l->net_plane = net_plane;
+ l->advertised_mtu = mtu;
+ l->mtu = mtu;
+ l->priority = priority;
+ tipc_link_set_queue_limits(l, window);
+ l->ackers = 1;
+ l->bc_sndlink = bc_sndlink;
+ l->bc_rcvlink = bc_rcvlink;
+ l->inputq = inputq;
+ l->namedq = namedq;
+ l->state = LINK_RESETTING;
+ __skb_queue_head_init(&l->transmq);
+ __skb_queue_head_init(&l->backlogq);
+ __skb_queue_head_init(&l->deferdq);
+ skb_queue_head_init(&l->wakeupq);
+ skb_queue_head_init(l->inputq);
+ return true;
}
/**
- * tipc_link_delete - Delete a link
- * @l: link to be deleted
+ * tipc_link_bc_create - create new link to be used for broadcast
+ * @net: pointer to associated network namespace
+ * @ownnode: identity of own node
+ * @peer: node id of peer node
+ * @mtu: mtu to be used
+ * @window: send window to be used
+ * @peer_caps: bitmap describing peer node capabilities
+ * @inputq: queue to put messages ready for delivery
+ * @namedq: queue to put binding table update messages ready for delivery
+ * @bc_sndlink: the namespace global link used for broadcast sending
+ * @link: return value, pointer to put the created link
+ *
+ * Returns true if link was created, otherwise false
*/
-void tipc_link_delete(struct tipc_link *l)
+bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
+ int mtu, int window, u16 peer_caps,
+ struct sk_buff_head *inputq,
+ struct sk_buff_head *namedq,
+ struct tipc_link *bc_sndlink,
+ struct tipc_link **link)
{
+ struct tipc_link *l;
+
+ if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window,
+ 0, ownnode, peer, peer_caps, bc_sndlink,
+ NULL, inputq, namedq, link))
+ return false;
+
+ l = *link;
+ strcpy(l->name, tipc_bclink_name);
tipc_link_reset(l);
- if (del_timer(&l->timer))
- tipc_link_put(l);
- l->flags |= LINK_STOPPED;
- /* Delete link now, or when timer is finished: */
- tipc_link_reset_fragments(l);
- tipc_node_detach_link(l->owner, l);
- tipc_link_put(l);
+ l->state = LINK_RESET;
+ l->ackers = 0;
+ l->bc_rcvlink = l;
+
+ /* Broadcast send link is always up */
+ if (link_is_bc_sndlink(l))
+ l->state = LINK_ESTABLISHED;
+
+ return true;
}
-void tipc_link_delete_list(struct net *net, unsigned int bearer_id,
- bool shutting_down)
+/**
+ * tipc_link_fsm_evt - link finite state machine
+ * @l: pointer to link
+ * @evt: state machine event to be processed
+ */
+int tipc_link_fsm_evt(struct tipc_link *l, int evt)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *link;
- struct tipc_node *node;
+ int rc = 0;
- rcu_read_lock();
- list_for_each_entry_rcu(node, &tn->node_list, list) {
- tipc_node_lock(node);
- link = node->links[bearer_id];
- if (link)
- tipc_link_delete(link);
- tipc_node_unlock(node);
+ switch (l->state) {
+ case LINK_RESETTING:
+ switch (evt) {
+ case LINK_PEER_RESET_EVT:
+ l->state = LINK_PEER_RESET;
+ break;
+ case LINK_RESET_EVT:
+ l->state = LINK_RESET;
+ break;
+ case LINK_FAILURE_EVT:
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_ESTABLISH_EVT:
+ case LINK_FAILOVER_END_EVT:
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_RESET:
+ switch (evt) {
+ case LINK_PEER_RESET_EVT:
+ l->state = LINK_ESTABLISHING;
+ break;
+ case LINK_FAILOVER_BEGIN_EVT:
+ l->state = LINK_FAILINGOVER;
+ case LINK_FAILURE_EVT:
+ case LINK_RESET_EVT:
+ case LINK_ESTABLISH_EVT:
+ case LINK_FAILOVER_END_EVT:
+ break;
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_PEER_RESET:
+ switch (evt) {
+ case LINK_RESET_EVT:
+ l->state = LINK_ESTABLISHING;
+ break;
+ case LINK_PEER_RESET_EVT:
+ case LINK_ESTABLISH_EVT:
+ case LINK_FAILURE_EVT:
+ break;
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_SYNCH_END_EVT:
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_FAILINGOVER:
+ switch (evt) {
+ case LINK_FAILOVER_END_EVT:
+ l->state = LINK_RESET;
+ break;
+ case LINK_PEER_RESET_EVT:
+ case LINK_RESET_EVT:
+ case LINK_ESTABLISH_EVT:
+ case LINK_FAILURE_EVT:
+ break;
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_ESTABLISHING:
+ switch (evt) {
+ case LINK_ESTABLISH_EVT:
+ l->state = LINK_ESTABLISHED;
+ break;
+ case LINK_FAILOVER_BEGIN_EVT:
+ l->state = LINK_FAILINGOVER;
+ break;
+ case LINK_RESET_EVT:
+ l->state = LINK_RESET;
+ break;
+ case LINK_FAILURE_EVT:
+ case LINK_PEER_RESET_EVT:
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_FAILOVER_END_EVT:
+ break;
+ case LINK_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_ESTABLISHED:
+ switch (evt) {
+ case LINK_PEER_RESET_EVT:
+ l->state = LINK_PEER_RESET;
+ rc |= TIPC_LINK_DOWN_EVT;
+ break;
+ case LINK_FAILURE_EVT:
+ l->state = LINK_RESETTING;
+ rc |= TIPC_LINK_DOWN_EVT;
+ break;
+ case LINK_RESET_EVT:
+ l->state = LINK_RESET;
+ break;
+ case LINK_ESTABLISH_EVT:
+ case LINK_SYNCH_END_EVT:
+ break;
+ case LINK_SYNCH_BEGIN_EVT:
+ l->state = LINK_SYNCHING;
+ break;
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_SYNCHING:
+ switch (evt) {
+ case LINK_PEER_RESET_EVT:
+ l->state = LINK_PEER_RESET;
+ rc |= TIPC_LINK_DOWN_EVT;
+ break;
+ case LINK_FAILURE_EVT:
+ l->state = LINK_RESETTING;
+ rc |= TIPC_LINK_DOWN_EVT;
+ break;
+ case LINK_RESET_EVT:
+ l->state = LINK_RESET;
+ break;
+ case LINK_ESTABLISH_EVT:
+ case LINK_SYNCH_BEGIN_EVT:
+ break;
+ case LINK_SYNCH_END_EVT:
+ l->state = LINK_ESTABLISHED;
+ break;
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ default:
+ pr_err("Unknown FSM state %x in %s\n", l->state, l->name);
}
- rcu_read_unlock();
+ return rc;
+illegal_evt:
+ pr_err("Illegal FSM event %x in state %x on link %s\n",
+ evt, l->state, l->name);
+ return rc;
+}
+
+/* link_profile_stats - update statistical profiling of traffic
+ */
+static void link_profile_stats(struct tipc_link *l)
+{
+ struct sk_buff *skb;
+ struct tipc_msg *msg;
+ int length;
+
+ /* Update counters used in statistical profiling of send traffic */
+ l->stats.accu_queue_sz += skb_queue_len(&l->transmq);
+ l->stats.queue_sz_counts++;
+
+ skb = skb_peek(&l->transmq);
+ if (!skb)
+ return;
+ msg = buf_msg(skb);
+ length = msg_size(msg);
+
+ if (msg_user(msg) == MSG_FRAGMENTER) {
+ if (msg_type(msg) != FIRST_FRAGMENT)
+ return;
+ length = msg_size(msg_get_wrapped(msg));
+ }
+ l->stats.msg_lengths_total += length;
+ l->stats.msg_length_counts++;
+ if (length <= 64)
+ l->stats.msg_length_profile[0]++;
+ else if (length <= 256)
+ l->stats.msg_length_profile[1]++;
+ else if (length <= 1024)
+ l->stats.msg_length_profile[2]++;
+ else if (length <= 4096)
+ l->stats.msg_length_profile[3]++;
+ else if (length <= 16384)
+ l->stats.msg_length_profile[4]++;
+ else if (length <= 32768)
+ l->stats.msg_length_profile[5]++;
+ else
+ l->stats.msg_length_profile[6]++;
+}
+
+/* tipc_link_timeout - perform periodic task as instructed from node timeout
+ */
+int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
+{
+ int rc = 0;
+ int mtyp = STATE_MSG;
+ bool xmit = false;
+ bool prb = false;
+ u16 bc_snt = l->bc_sndlink->snd_nxt - 1;
+ u16 bc_acked = l->bc_rcvlink->acked;
+ bool bc_up = link_is_up(l->bc_rcvlink);
+
+ link_profile_stats(l);
+
+ switch (l->state) {
+ case LINK_ESTABLISHED:
+ case LINK_SYNCHING:
+ if (!l->silent_intv_cnt) {
+ if (bc_up && (bc_acked != bc_snt))
+ xmit = true;
+ } else if (l->silent_intv_cnt <= l->abort_limit) {
+ xmit = true;
+ prb = true;
+ } else {
+ rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+ }
+ l->silent_intv_cnt++;
+ break;
+ case LINK_RESET:
+ xmit = true;
+ mtyp = RESET_MSG;
+ break;
+ case LINK_ESTABLISHING:
+ xmit = true;
+ mtyp = ACTIVATE_MSG;
+ break;
+ case LINK_PEER_RESET:
+ case LINK_RESETTING:
+ case LINK_FAILINGOVER:
+ break;
+ default:
+ break;
+ }
+
+ if (xmit)
+ tipc_link_build_proto_msg(l, mtyp, prb, 0, 0, 0, xmitq);
+
+ return rc;
}
/**
@@ -334,7 +607,7 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id,
* @link: congested link
* @list: message that was attempted sent
* Create pseudo msg to send back to user when congestion abates
- * Only consumes message if there is an error
+ * Does not consume buffer list
*/
static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
{
@@ -347,8 +620,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
/* This really cannot happen... */
if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
- tipc_link_reset(link);
- goto err;
+ return -ENOBUFS;
}
/* Non-blocking sender: */
if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
@@ -358,15 +630,12 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
addr, addr, oport, 0, 0);
if (!skb)
- goto err;
+ return -ENOBUFS;
TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
TIPC_SKB_CB(skb)->chain_imp = imp;
skb_queue_tail(&link->wakeupq, skb);
link->stats.link_congs++;
return -ELINKCONG;
-err:
- __skb_queue_purge(list);
- return -ENOBUFS;
}
/**
@@ -388,782 +657,202 @@ void link_prepare_wakeup(struct tipc_link *l)
if ((pnd[imp] + l->backlog[imp].len) >= lim)
break;
skb_unlink(skb, &l->wakeupq);
- skb_queue_tail(&l->inputq, skb);
- l->owner->inputq = &l->inputq;
- l->owner->action_flags |= TIPC_MSG_EVT;
+ skb_queue_tail(l->inputq, skb);
}
}
-/**
- * tipc_link_reset_fragments - purge link's inbound message fragments queue
- * @l_ptr: pointer to link
- */
-void tipc_link_reset_fragments(struct tipc_link *l_ptr)
+void tipc_link_reset(struct tipc_link *l)
{
- kfree_skb(l_ptr->reasm_buf);
- l_ptr->reasm_buf = NULL;
-}
+ /* Link is down, accept any session */
+ l->peer_session = WILDCARD_SESSION;
-static void tipc_link_purge_backlog(struct tipc_link *l)
-{
+ /* If peer is up, it only accepts an incremented session number */
+ msg_set_session(l->pmsg, msg_session(l->pmsg) + 1);
+
+ /* Prepare for renewed mtu size negotiation */
+ l->mtu = l->advertised_mtu;
+
+ /* Clean up all queues and counters: */
+ __skb_queue_purge(&l->transmq);
+ __skb_queue_purge(&l->deferdq);
+ skb_queue_splice_init(&l->wakeupq, l->inputq);
__skb_queue_purge(&l->backlogq);
l->backlog[TIPC_LOW_IMPORTANCE].len = 0;
l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0;
l->backlog[TIPC_HIGH_IMPORTANCE].len = 0;
l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0;
l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0;
+ kfree_skb(l->reasm_buf);
+ kfree_skb(l->failover_reasm_skb);
+ l->reasm_buf = NULL;
+ l->failover_reasm_skb = NULL;
+ l->rcv_unacked = 0;
+ l->snd_nxt = 1;
+ l->rcv_nxt = 1;
+ l->acked = 0;
+ l->silent_intv_cnt = 0;
+ l->stats.recv_info = 0;
+ l->stale_count = 0;
+ l->bc_peer_is_up = false;
+ link_reset_statistics(l);
}
/**
- * tipc_link_purge_queues - purge all pkt queues associated with link
- * @l_ptr: pointer to link
- */
-void tipc_link_purge_queues(struct tipc_link *l_ptr)
-{
- __skb_queue_purge(&l_ptr->deferdq);
- __skb_queue_purge(&l_ptr->transmq);
- tipc_link_purge_backlog(l_ptr);
- tipc_link_reset_fragments(l_ptr);
-}
-
-void tipc_link_reset(struct tipc_link *l_ptr)
-{
- u32 prev_state = l_ptr->state;
- int was_active_link = tipc_link_is_active(l_ptr);
- struct tipc_node *owner = l_ptr->owner;
- struct tipc_link *pl = tipc_parallel_link(l_ptr);
-
- msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff));
-
- /* Link is down, accept any session */
- l_ptr->peer_session = INVALID_SESSION;
-
- /* Prepare for renewed mtu size negotiation */
- l_ptr->mtu = l_ptr->advertised_mtu;
-
- l_ptr->state = RESET_UNKNOWN;
-
- if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET))
- return;
-
- tipc_node_link_down(l_ptr->owner, l_ptr);
- tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr);
-
- if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) {
- l_ptr->flags |= LINK_FAILINGOVER;
- l_ptr->failover_checkpt = l_ptr->next_in_no;
- pl->failover_pkts = FIRST_FAILOVER;
- pl->failover_checkpt = l_ptr->next_in_no;
- pl->failover_skb = l_ptr->reasm_buf;
- } else {
- kfree_skb(l_ptr->reasm_buf);
- }
- /* Clean up all queues, except inputq: */
- __skb_queue_purge(&l_ptr->transmq);
- __skb_queue_purge(&l_ptr->deferdq);
- if (!owner->inputq)
- owner->inputq = &l_ptr->inputq;
- skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq);
- if (!skb_queue_empty(owner->inputq))
- owner->action_flags |= TIPC_MSG_EVT;
- tipc_link_purge_backlog(l_ptr);
- l_ptr->reasm_buf = NULL;
- l_ptr->rcv_unacked = 0;
- l_ptr->checkpoint = 1;
- l_ptr->next_out_no = 1;
- l_ptr->fsm_msg_cnt = 0;
- l_ptr->stale_count = 0;
- link_reset_statistics(l_ptr);
-}
-
-void tipc_link_reset_list(struct net *net, unsigned int bearer_id)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *l_ptr;
- struct tipc_node *n_ptr;
-
- rcu_read_lock();
- list_for_each_entry_rcu(n_ptr, &tn->node_list, list) {
- tipc_node_lock(n_ptr);
- l_ptr = n_ptr->links[bearer_id];
- if (l_ptr)
- tipc_link_reset(l_ptr);
- tipc_node_unlock(n_ptr);
- }
- rcu_read_unlock();
-}
-
-static void link_activate(struct tipc_link *link)
-{
- struct tipc_node *node = link->owner;
-
- link->next_in_no = 1;
- link->stats.recv_info = 1;
- tipc_node_link_up(node, link);
- tipc_bearer_add_dest(node->net, link->bearer_id, link->addr);
-}
-
-/**
- * link_state_event - link finite state machine
- * @l_ptr: pointer to link
- * @event: state machine event to process
- */
-static void link_state_event(struct tipc_link *l_ptr, unsigned int event)
-{
- struct tipc_link *other;
- unsigned long cont_intv = l_ptr->cont_intv;
-
- if (l_ptr->flags & LINK_STOPPED)
- return;
-
- if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT))
- return; /* Not yet. */
-
- if (l_ptr->flags & LINK_FAILINGOVER) {
- if (event == TIMEOUT_EVT)
- link_set_timer(l_ptr, cont_intv);
- return;
- }
-
- switch (l_ptr->state) {
- case WORKING_WORKING:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- case ACTIVATE_MSG:
- break;
- case TIMEOUT_EVT:
- if (l_ptr->next_in_no != l_ptr->checkpoint) {
- l_ptr->checkpoint = l_ptr->next_in_no;
- if (tipc_bclink_acks_missing(l_ptr->owner)) {
- tipc_link_proto_xmit(l_ptr, STATE_MSG,
- 0, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- }
- link_set_timer(l_ptr, cont_intv);
- break;
- }
- l_ptr->state = WORKING_UNKNOWN;
- l_ptr->fsm_msg_cnt = 0;
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- link_set_timer(l_ptr, cont_intv / 4);
- break;
- case RESET_MSG:
- pr_debug("%s<%s>, requested by peer\n",
- link_rst_msg, l_ptr->name);
- tipc_link_reset(l_ptr);
- l_ptr->state = RESET_RESET;
- l_ptr->fsm_msg_cnt = 0;
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 0, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- link_set_timer(l_ptr, cont_intv);
- break;
- default:
- pr_debug("%s%u in WW state\n", link_unk_evt, event);
- }
- break;
- case WORKING_UNKNOWN:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- case ACTIVATE_MSG:
- l_ptr->state = WORKING_WORKING;
- l_ptr->fsm_msg_cnt = 0;
- link_set_timer(l_ptr, cont_intv);
- break;
- case RESET_MSG:
- pr_debug("%s<%s>, requested by peer while probing\n",
- link_rst_msg, l_ptr->name);
- tipc_link_reset(l_ptr);
- l_ptr->state = RESET_RESET;
- l_ptr->fsm_msg_cnt = 0;
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 0, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- link_set_timer(l_ptr, cont_intv);
- break;
- case TIMEOUT_EVT:
- if (l_ptr->next_in_no != l_ptr->checkpoint) {
- l_ptr->state = WORKING_WORKING;
- l_ptr->fsm_msg_cnt = 0;
- l_ptr->checkpoint = l_ptr->next_in_no;
- if (tipc_bclink_acks_missing(l_ptr->owner)) {
- tipc_link_proto_xmit(l_ptr, STATE_MSG,
- 0, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- }
- link_set_timer(l_ptr, cont_intv);
- } else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) {
- tipc_link_proto_xmit(l_ptr, STATE_MSG,
- 1, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- link_set_timer(l_ptr, cont_intv / 4);
- } else { /* Link has failed */
- pr_debug("%s<%s>, peer not responding\n",
- link_rst_msg, l_ptr->name);
- tipc_link_reset(l_ptr);
- l_ptr->state = RESET_UNKNOWN;
- l_ptr->fsm_msg_cnt = 0;
- tipc_link_proto_xmit(l_ptr, RESET_MSG,
- 0, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- link_set_timer(l_ptr, cont_intv);
- }
- break;
- default:
- pr_err("%s%u in WU state\n", link_unk_evt, event);
- }
- break;
- case RESET_UNKNOWN:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- break;
- case ACTIVATE_MSG:
- other = l_ptr->owner->active_links[0];
- if (other && link_working_unknown(other))
- break;
- l_ptr->state = WORKING_WORKING;
- l_ptr->fsm_msg_cnt = 0;
- link_activate(l_ptr);
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- if (l_ptr->owner->working_links == 1)
- tipc_link_sync_xmit(l_ptr);
- link_set_timer(l_ptr, cont_intv);
- break;
- case RESET_MSG:
- l_ptr->state = RESET_RESET;
- l_ptr->fsm_msg_cnt = 0;
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 1, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- link_set_timer(l_ptr, cont_intv);
- break;
- case STARTING_EVT:
- l_ptr->flags |= LINK_STARTED;
- l_ptr->fsm_msg_cnt++;
- link_set_timer(l_ptr, cont_intv);
- break;
- case TIMEOUT_EVT:
- tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- link_set_timer(l_ptr, cont_intv);
- break;
- default:
- pr_err("%s%u in RU state\n", link_unk_evt, event);
- }
- break;
- case RESET_RESET:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- case ACTIVATE_MSG:
- other = l_ptr->owner->active_links[0];
- if (other && link_working_unknown(other))
- break;
- l_ptr->state = WORKING_WORKING;
- l_ptr->fsm_msg_cnt = 0;
- link_activate(l_ptr);
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- if (l_ptr->owner->working_links == 1)
- tipc_link_sync_xmit(l_ptr);
- link_set_timer(l_ptr, cont_intv);
- break;
- case RESET_MSG:
- break;
- case TIMEOUT_EVT:
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 0, 0, 0, 0);
- l_ptr->fsm_msg_cnt++;
- link_set_timer(l_ptr, cont_intv);
- break;
- default:
- pr_err("%s%u in RR state\n", link_unk_evt, event);
- }
- break;
- default:
- pr_err("Unknown link state %u/%u\n", l_ptr->state, event);
- }
-}
-
-/**
- * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked
+ * tipc_link_xmit(): enqueue buffer list according to queue situation
* @link: link to use
* @list: chain of buffers containing message
+ * @xmitq: returned list of packets to be sent by caller
*
* Consumes the buffer chain, except when returning -ELINKCONG,
* since the caller then may want to make more send attempts.
* Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
* Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
*/
-int __tipc_link_xmit(struct net *net, struct tipc_link *link,
- struct sk_buff_head *list)
+int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
+ struct sk_buff_head *xmitq)
{
- struct tipc_msg *msg = buf_msg(skb_peek(list));
- unsigned int maxwin = link->window;
- unsigned int imp = msg_importance(msg);
- uint mtu = link->mtu;
- uint ack = mod(link->next_in_no - 1);
- uint seqno = link->next_out_no;
- uint bc_last_in = link->owner->bclink.last_in;
- struct tipc_media_addr *addr = &link->media_addr;
- struct sk_buff_head *transmq = &link->transmq;
- struct sk_buff_head *backlogq = &link->backlogq;
- struct sk_buff *skb, *tmp;
-
- /* Match backlog limit against msg importance: */
- if (unlikely(link->backlog[imp].len >= link->backlog[imp].limit))
- return link_schedule_user(link, list);
-
- if (unlikely(msg_size(msg) > mtu)) {
- __skb_queue_purge(list);
- return -EMSGSIZE;
+ struct tipc_msg *hdr = buf_msg(skb_peek(list));
+ unsigned int maxwin = l->window;
+ unsigned int i, imp = msg_importance(hdr);
+ unsigned int mtu = l->mtu;
+ u16 ack = l->rcv_nxt - 1;
+ u16 seqno = l->snd_nxt;
+ u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
+ struct sk_buff_head *transmq = &l->transmq;
+ struct sk_buff_head *backlogq = &l->backlogq;
+ struct sk_buff *skb, *_skb, *bskb;
+
+ /* Match msg importance against this and all higher backlog limits: */
+ for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
+ if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
+ return link_schedule_user(l, list);
}
+ if (unlikely(msg_size(hdr) > mtu))
+ return -EMSGSIZE;
+
/* Prepare each packet for sending, and add to relevant queue: */
- skb_queue_walk_safe(list, skb, tmp) {
- __skb_unlink(skb, list);
- msg = buf_msg(skb);
- msg_set_seqno(msg, seqno);
- msg_set_ack(msg, ack);
- msg_set_bcast_ack(msg, bc_last_in);
+ while (skb_queue_len(list)) {
+ skb = skb_peek(list);
+ hdr = buf_msg(skb);
+ msg_set_seqno(hdr, seqno);
+ msg_set_ack(hdr, ack);
+ msg_set_bcast_ack(hdr, bc_ack);
if (likely(skb_queue_len(transmq) < maxwin)) {
+ _skb = skb_clone(skb, GFP_ATOMIC);
+ if (!_skb)
+ return -ENOBUFS;
+ __skb_dequeue(list);
__skb_queue_tail(transmq, skb);
- tipc_bearer_send(net, link->bearer_id, skb, addr);
- link->rcv_unacked = 0;
+ __skb_queue_tail(xmitq, _skb);
+ TIPC_SKB_CB(skb)->ackers = l->ackers;
+ l->rcv_unacked = 0;
seqno++;
continue;
}
- if (tipc_msg_bundle(skb_peek_tail(backlogq), skb, mtu)) {
- link->stats.sent_bundled++;
+ if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) {
+ kfree_skb(__skb_dequeue(list));
+ l->stats.sent_bundled++;
continue;
}
- if (tipc_msg_make_bundle(&skb, mtu, link->addr)) {
- link->stats.sent_bundled++;
- link->stats.sent_bundles++;
- imp = msg_importance(buf_msg(skb));
+ if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) {
+ kfree_skb(__skb_dequeue(list));
+ __skb_queue_tail(backlogq, bskb);
+ l->backlog[msg_importance(buf_msg(bskb))].len++;
+ l->stats.sent_bundled++;
+ l->stats.sent_bundles++;
+ continue;
}
- __skb_queue_tail(backlogq, skb);
- link->backlog[imp].len++;
- seqno++;
+ l->backlog[imp].len += skb_queue_len(list);
+ skb_queue_splice_tail_init(list, backlogq);
}
- link->next_out_no = seqno;
+ l->snd_nxt = seqno;
return 0;
}
-static void skb2list(struct sk_buff *skb, struct sk_buff_head *list)
-{
- skb_queue_head_init(list);
- __skb_queue_tail(list, skb);
-}
-
-static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb)
+void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
{
- struct sk_buff_head head;
-
- skb2list(skb, &head);
- return __tipc_link_xmit(link->owner->net, link, &head);
-}
-
-/* tipc_link_xmit_skb(): send single buffer to destination
- * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE
- * messages, which will not be rejected
- * The only exception is datagram messages rerouted after secondary
- * lookup, which are rare and safe to dispose of anyway.
- * TODO: Return real return value, and let callers use
- * tipc_wait_for_sendpkt() where applicable
- */
-int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
- u32 selector)
-{
- struct sk_buff_head head;
- int rc;
-
- skb2list(skb, &head);
- rc = tipc_link_xmit(net, &head, dnode, selector);
- if (rc == -ELINKCONG)
- kfree_skb(skb);
- return 0;
-}
-
-/**
- * tipc_link_xmit() is the general link level function for message sending
- * @net: the applicable net namespace
- * @list: chain of buffers containing message
- * @dsz: amount of user data to be sent
- * @dnode: address of destination node
- * @selector: a number used for deterministic link selection
- * Consumes the buffer chain, except when returning -ELINKCONG
- * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
- */
-int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
- u32 selector)
-{
- struct tipc_link *link = NULL;
- struct tipc_node *node;
- int rc = -EHOSTUNREACH;
-
- node = tipc_node_find(net, dnode);
- if (node) {
- tipc_node_lock(node);
- link = node->active_links[selector & 1];
- if (link)
- rc = __tipc_link_xmit(net, link, list);
- tipc_node_unlock(node);
- tipc_node_put(node);
- }
- if (link)
- return rc;
-
- if (likely(in_own_node(net, dnode))) {
- tipc_sk_rcv(net, list);
- return 0;
- }
-
- __skb_queue_purge(list);
- return rc;
-}
-
-/*
- * tipc_link_sync_xmit - synchronize broadcast link endpoints.
- *
- * Give a newly added peer node the sequence number where it should
- * start receiving and acking broadcast packets.
- *
- * Called with node locked
- */
-static void tipc_link_sync_xmit(struct tipc_link *link)
-{
- struct sk_buff *skb;
- struct tipc_msg *msg;
-
- skb = tipc_buf_acquire(INT_H_SIZE);
- if (!skb)
- return;
-
- msg = buf_msg(skb);
- tipc_msg_init(link_own_addr(link), msg, BCAST_PROTOCOL, STATE_MSG,
- INT_H_SIZE, link->addr);
- msg_set_last_bcast(msg, link->owner->bclink.acked);
- __tipc_link_xmit_skb(link, skb);
-}
-
-/*
- * tipc_link_sync_rcv - synchronize broadcast link endpoints.
- * Receive the sequence number where we should start receiving and
- * acking broadcast packets from a newly added peer node, and open
- * up for reception of such packets.
- *
- * Called with node locked
- */
-static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf)
-{
- struct tipc_msg *msg = buf_msg(buf);
-
- n->bclink.last_sent = n->bclink.last_in = msg_last_bcast(msg);
- n->bclink.recv_permitted = true;
- kfree_skb(buf);
-}
-
-/*
- * tipc_link_push_packets - push unsent packets to bearer
- *
- * Push out the unsent messages of a link where congestion
- * has abated. Node is locked.
- *
- * Called with node locked
- */
-void tipc_link_push_packets(struct tipc_link *link)
-{
- struct sk_buff *skb;
- struct tipc_msg *msg;
- unsigned int ack = mod(link->next_in_no - 1);
-
- while (skb_queue_len(&link->transmq) < link->window) {
- skb = __skb_dequeue(&link->backlogq);
+ struct sk_buff *skb, *_skb;
+ struct tipc_msg *hdr;
+ u16 seqno = l->snd_nxt;
+ u16 ack = l->rcv_nxt - 1;
+ u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
+
+ while (skb_queue_len(&l->transmq) < l->window) {
+ skb = skb_peek(&l->backlogq);
if (!skb)
break;
- msg = buf_msg(skb);
- link->backlog[msg_importance(msg)].len--;
- msg_set_ack(msg, ack);
- msg_set_bcast_ack(msg, link->owner->bclink.last_in);
- link->rcv_unacked = 0;
- __skb_queue_tail(&link->transmq, skb);
- tipc_bearer_send(link->owner->net, link->bearer_id,
- skb, &link->media_addr);
- }
-}
-
-void tipc_link_reset_all(struct tipc_node *node)
-{
- char addr_string[16];
- u32 i;
-
- tipc_node_lock(node);
-
- pr_warn("Resetting all links to %s\n",
- tipc_addr_string_fill(addr_string, node->addr));
-
- for (i = 0; i < MAX_BEARERS; i++) {
- if (node->links[i]) {
- link_print(node->links[i], "Resetting link\n");
- tipc_link_reset(node->links[i]);
- }
+ _skb = skb_clone(skb, GFP_ATOMIC);
+ if (!_skb)
+ break;
+ __skb_dequeue(&l->backlogq);
+ hdr = buf_msg(skb);
+ l->backlog[msg_importance(hdr)].len--;
+ __skb_queue_tail(&l->transmq, skb);
+ __skb_queue_tail(xmitq, _skb);
+ TIPC_SKB_CB(skb)->ackers = l->ackers;
+ msg_set_seqno(hdr, seqno);
+ msg_set_ack(hdr, ack);
+ msg_set_bcast_ack(hdr, bc_ack);
+ l->rcv_unacked = 0;
+ seqno++;
}
-
- tipc_node_unlock(node);
+ l->snd_nxt = seqno;
}
-static void link_retransmit_failure(struct tipc_link *l_ptr,
- struct sk_buff *buf)
+static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb)
{
- struct tipc_msg *msg = buf_msg(buf);
- struct net *net = l_ptr->owner->net;
-
- pr_warn("Retransmission failure on link <%s>\n", l_ptr->name);
-
- if (l_ptr->addr) {
- /* Handle failure on standard link */
- link_print(l_ptr, "Resetting link\n");
- tipc_link_reset(l_ptr);
-
- } else {
- /* Handle failure on broadcast link */
- struct tipc_node *n_ptr;
- char addr_string[16];
-
- pr_info("Msg seq number: %u, ", msg_seqno(msg));
- pr_cont("Outstanding acks: %lu\n",
- (unsigned long) TIPC_SKB_CB(buf)->handle);
-
- n_ptr = tipc_bclink_retransmit_to(net);
-
- tipc_addr_string_fill(addr_string, n_ptr->addr);
- pr_info("Broadcast link info for %s\n", addr_string);
- pr_info("Reception permitted: %d, Acked: %u\n",
- n_ptr->bclink.recv_permitted,
- n_ptr->bclink.acked);
- pr_info("Last in: %u, Oos state: %u, Last sent: %u\n",
- n_ptr->bclink.last_in,
- n_ptr->bclink.oos_state,
- n_ptr->bclink.last_sent);
-
- n_ptr->action_flags |= TIPC_BCAST_RESET;
- l_ptr->stale_count = 0;
- }
+ struct tipc_msg *hdr = buf_msg(skb);
+
+ pr_warn("Retransmission failure on link <%s>\n", l->name);
+ link_print(l, "Resetting link ");
+ pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
+ msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr));
+ pr_info("sqno %u, prev: %x, src: %x\n",
+ msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
}
-void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb,
- u32 retransmits)
+int tipc_link_retrans(struct tipc_link *l, u16 from, u16 to,
+ struct sk_buff_head *xmitq)
{
- struct tipc_msg *msg;
+ struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
+ struct tipc_msg *hdr;
+ u16 ack = l->rcv_nxt - 1;
+ u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
if (!skb)
- return;
-
- msg = buf_msg(skb);
+ return 0;
- /* Detect repeated retransmit failures */
- if (l_ptr->last_retransmitted == msg_seqno(msg)) {
- if (++l_ptr->stale_count > 100) {
- link_retransmit_failure(l_ptr, skb);
- return;
- }
- } else {
- l_ptr->last_retransmitted = msg_seqno(msg);
- l_ptr->stale_count = 1;
+ /* Detect repeated retransmit failures on same packet */
+ if (likely(l->last_retransm != buf_seqno(skb))) {
+ l->last_retransm = buf_seqno(skb);
+ l->stale_count = 1;
+ } else if (++l->stale_count > 100) {
+ link_retransmit_failure(l, skb);
+ return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
}
- skb_queue_walk_from(&l_ptr->transmq, skb) {
- if (!retransmits)
+ /* Move forward to where retransmission should start */
+ skb_queue_walk(&l->transmq, skb) {
+ if (!less(buf_seqno(skb), from))
break;
- msg = buf_msg(skb);
- msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
- msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
- tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, skb,
- &l_ptr->media_addr);
- retransmits--;
- l_ptr->stats.retransmitted++;
}
-}
-
-/* link_synch(): check if all packets arrived before the synch
- * point have been consumed
- * Returns true if the parallel links are synched, otherwise false
- */
-static bool link_synch(struct tipc_link *l)
-{
- unsigned int post_synch;
- struct tipc_link *pl;
-
- pl = tipc_parallel_link(l);
- if (pl == l)
- goto synched;
-
- /* Was last pre-synch packet added to input queue ? */
- if (less_eq(pl->next_in_no, l->synch_point))
- return false;
-
- /* Is it still in the input queue ? */
- post_synch = mod(pl->next_in_no - l->synch_point) - 1;
- if (skb_queue_len(&pl->inputq) > post_synch)
- return false;
-synched:
- l->flags &= ~LINK_SYNCHING;
- return true;
-}
-
-static void link_retrieve_defq(struct tipc_link *link,
- struct sk_buff_head *list)
-{
- u32 seq_no;
- if (skb_queue_empty(&link->deferdq))
- return;
-
- seq_no = buf_seqno(skb_peek(&link->deferdq));
- if (seq_no == mod(link->next_in_no))
- skb_queue_splice_tail_init(&link->deferdq, list);
-}
-
-/**
- * tipc_rcv - process TIPC packets/messages arriving from off-node
- * @net: the applicable net namespace
- * @skb: TIPC packet
- * @b_ptr: pointer to bearer message arrived on
- *
- * Invoked with no locks held. Bearer pointer must point to a valid bearer
- * structure (i.e. cannot be NULL), but bearer can be inactive.
- */
-void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct sk_buff_head head;
- struct tipc_node *n_ptr;
- struct tipc_link *l_ptr;
- struct sk_buff *skb1, *tmp;
- struct tipc_msg *msg;
- u32 seq_no;
- u32 ackd;
- u32 released;
-
- skb2list(skb, &head);
-
- while ((skb = __skb_dequeue(&head))) {
- /* Ensure message is well-formed */
- if (unlikely(!tipc_msg_validate(skb)))
- goto discard;
-
- /* Handle arrival of a non-unicast link message */
- msg = buf_msg(skb);
- if (unlikely(msg_non_seq(msg))) {
- if (msg_user(msg) == LINK_CONFIG)
- tipc_disc_rcv(net, skb, b_ptr);
- else
- tipc_bclink_rcv(net, skb);
- continue;
- }
-
- /* Discard unicast link messages destined for another node */
- if (unlikely(!msg_short(msg) &&
- (msg_destnode(msg) != tn->own_addr)))
- goto discard;
-
- /* Locate neighboring node that sent message */
- n_ptr = tipc_node_find(net, msg_prevnode(msg));
- if (unlikely(!n_ptr))
- goto discard;
-
- tipc_node_lock(n_ptr);
- /* Locate unicast link endpoint that should handle message */
- l_ptr = n_ptr->links[b_ptr->identity];
- if (unlikely(!l_ptr))
- goto unlock;
-
- /* Verify that communication with node is currently allowed */
- if ((n_ptr->action_flags & TIPC_WAIT_PEER_LINKS_DOWN) &&
- msg_user(msg) == LINK_PROTOCOL &&
- (msg_type(msg) == RESET_MSG ||
- msg_type(msg) == ACTIVATE_MSG) &&
- !msg_redundant_link(msg))
- n_ptr->action_flags &= ~TIPC_WAIT_PEER_LINKS_DOWN;
-
- if (tipc_node_blocked(n_ptr))
- goto unlock;
-
- /* Validate message sequence number info */
- seq_no = msg_seqno(msg);
- ackd = msg_ack(msg);
-
- /* Release acked messages */
- if (unlikely(n_ptr->bclink.acked != msg_bcast_ack(msg)))
- tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg));
-
- released = 0;
- skb_queue_walk_safe(&l_ptr->transmq, skb1, tmp) {
- if (more(buf_seqno(skb1), ackd))
- break;
- __skb_unlink(skb1, &l_ptr->transmq);
- kfree_skb(skb1);
- released = 1;
- }
-
- /* Try sending any messages link endpoint has pending */
- if (unlikely(skb_queue_len(&l_ptr->backlogq)))
- tipc_link_push_packets(l_ptr);
-
- if (released && !skb_queue_empty(&l_ptr->wakeupq))
- link_prepare_wakeup(l_ptr);
-
- /* Process the incoming packet */
- if (unlikely(!link_working_working(l_ptr))) {
- if (msg_user(msg) == LINK_PROTOCOL) {
- tipc_link_proto_rcv(l_ptr, skb);
- link_retrieve_defq(l_ptr, &head);
- skb = NULL;
- goto unlock;
- }
-
- /* Traffic message. Conditionally activate link */
- link_state_event(l_ptr, TRAFFIC_MSG_EVT);
-
- if (link_working_working(l_ptr)) {
- /* Re-insert buffer in front of queue */
- __skb_queue_head(&head, skb);
- skb = NULL;
- goto unlock;
- }
- goto unlock;
- }
-
- /* Link is now in state WORKING_WORKING */
- if (unlikely(seq_no != mod(l_ptr->next_in_no))) {
- link_handle_out_of_seq_msg(l_ptr, skb);
- link_retrieve_defq(l_ptr, &head);
- skb = NULL;
- goto unlock;
- }
- /* Synchronize with parallel link if applicable */
- if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) {
- if (!link_synch(l_ptr))
- goto unlock;
- }
- l_ptr->next_in_no++;
- if (unlikely(!skb_queue_empty(&l_ptr->deferdq)))
- link_retrieve_defq(l_ptr, &head);
- if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) {
- l_ptr->stats.sent_acks++;
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0);
- }
- tipc_link_input(l_ptr, skb);
- skb = NULL;
-unlock:
- tipc_node_unlock(n_ptr);
- tipc_node_put(n_ptr);
-discard:
- if (unlikely(skb))
- kfree_skb(skb);
+ skb_queue_walk_from(&l->transmq, skb) {
+ if (more(buf_seqno(skb), to))
+ break;
+ hdr = buf_msg(skb);
+ _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
+ if (!_skb)
+ return 0;
+ hdr = buf_msg(_skb);
+ msg_set_ack(hdr, ack);
+ msg_set_bcast_ack(hdr, bc_ack);
+ _skb->priority = TC_PRIO_CONTROL;
+ __skb_queue_tail(xmitq, _skb);
+ l->stats.retransmitted++;
}
+ return 0;
}
/* tipc_data_input - deliver data and name distr msgs to upper layer
@@ -1171,29 +860,20 @@ discard:
* Consumes buffer if message is of right type
* Node lock must be held
*/
-static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb)
+static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *inputq)
{
- struct tipc_node *node = link->owner;
- struct tipc_msg *msg = buf_msg(skb);
- u32 dport = msg_destport(msg);
-
- switch (msg_user(msg)) {
+ switch (msg_user(buf_msg(skb))) {
case TIPC_LOW_IMPORTANCE:
case TIPC_MEDIUM_IMPORTANCE:
case TIPC_HIGH_IMPORTANCE:
case TIPC_CRITICAL_IMPORTANCE:
case CONN_MANAGER:
- if (tipc_skb_queue_tail(&link->inputq, skb, dport)) {
- node->inputq = &link->inputq;
- node->action_flags |= TIPC_MSG_EVT;
- }
+ skb_queue_tail(inputq, skb);
return true;
case NAME_DISTRIBUTOR:
- node->bclink.recv_permitted = true;
- node->namedq = &link->namedq;
- skb_queue_tail(&link->namedq, skb);
- if (skb_queue_len(&link->namedq) == 1)
- node->action_flags |= TIPC_NAMED_MSG_EVT;
+ l->bc_rcvlink->state = LINK_ESTABLISHED;
+ skb_queue_tail(l->namedq, skb);
return true;
case MSG_BUNDLER:
case TUNNEL_PROTOCOL:
@@ -1210,540 +890,629 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb)
/* tipc_link_input - process packet that has passed link protocol check
*
* Consumes buffer
- * Node lock must be held
*/
-static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb)
+static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *inputq)
{
- struct tipc_node *node = link->owner;
- struct tipc_msg *msg = buf_msg(skb);
+ struct tipc_msg *hdr = buf_msg(skb);
+ struct sk_buff **reasm_skb = &l->reasm_buf;
struct sk_buff *iskb;
+ struct sk_buff_head tmpq;
+ int usr = msg_user(hdr);
+ int rc = 0;
int pos = 0;
+ int ipos = 0;
- if (likely(tipc_data_input(link, skb)))
- return;
-
- switch (msg_user(msg)) {
- case TUNNEL_PROTOCOL:
- if (msg_dup(msg)) {
- link->flags |= LINK_SYNCHING;
- link->synch_point = msg_seqno(msg_get_wrapped(msg));
- kfree_skb(skb);
- break;
- }
- if (!tipc_link_failover_rcv(link, &skb))
- break;
- if (msg_user(buf_msg(skb)) != MSG_BUNDLER) {
- tipc_data_input(link, skb);
- break;
+ if (unlikely(usr == TUNNEL_PROTOCOL)) {
+ if (msg_type(hdr) == SYNCH_MSG) {
+ __skb_queue_purge(&l->deferdq);
+ goto drop;
}
- case MSG_BUNDLER:
- link->stats.recv_bundles++;
- link->stats.recv_bundled += msg_msgcnt(msg);
+ if (!tipc_msg_extract(skb, &iskb, &ipos))
+ return rc;
+ kfree_skb(skb);
+ skb = iskb;
+ hdr = buf_msg(skb);
+ if (less(msg_seqno(hdr), l->drop_point))
+ goto drop;
+ if (tipc_data_input(l, skb, inputq))
+ return rc;
+ usr = msg_user(hdr);
+ reasm_skb = &l->failover_reasm_skb;
+ }
+ if (usr == MSG_BUNDLER) {
+ skb_queue_head_init(&tmpq);
+ l->stats.recv_bundles++;
+ l->stats.recv_bundled += msg_msgcnt(hdr);
while (tipc_msg_extract(skb, &iskb, &pos))
- tipc_data_input(link, iskb);
- break;
- case MSG_FRAGMENTER:
- link->stats.recv_fragments++;
- if (tipc_buf_append(&link->reasm_buf, &skb)) {
- link->stats.recv_fragmented++;
- tipc_data_input(link, skb);
- } else if (!link->reasm_buf) {
- tipc_link_reset(link);
+ tipc_data_input(l, iskb, &tmpq);
+ tipc_skb_queue_splice_tail(&tmpq, inputq);
+ return 0;
+ } else if (usr == MSG_FRAGMENTER) {
+ l->stats.recv_fragments++;
+ if (tipc_buf_append(reasm_skb, &skb)) {
+ l->stats.recv_fragmented++;
+ tipc_data_input(l, skb, inputq);
+ } else if (!*reasm_skb && !link_is_bc_rcvlink(l)) {
+ pr_warn_ratelimited("Unable to build fragment list\n");
+ return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
}
- break;
- case BCAST_PROTOCOL:
- tipc_link_sync_rcv(node, skb);
- break;
- default:
- break;
- };
+ return 0;
+ } else if (usr == BCAST_PROTOCOL) {
+ tipc_bcast_lock(l->net);
+ tipc_link_bc_init_rcv(l->bc_rcvlink, hdr);
+ tipc_bcast_unlock(l->net);
+ }
+drop:
+ kfree_skb(skb);
+ return 0;
}
-/**
- * tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue
- *
- * Returns increase in queue length (i.e. 0 or 1)
- */
-u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb)
+static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked)
{
- struct sk_buff *skb1;
- u32 seq_no = buf_seqno(skb);
+ bool released = false;
+ struct sk_buff *skb, *tmp;
- /* Empty queue ? */
- if (skb_queue_empty(list)) {
- __skb_queue_tail(list, skb);
- return 1;
+ skb_queue_walk_safe(&l->transmq, skb, tmp) {
+ if (more(buf_seqno(skb), acked))
+ break;
+ __skb_unlink(skb, &l->transmq);
+ kfree_skb(skb);
+ released = true;
}
+ return released;
+}
- /* Last ? */
- if (less(buf_seqno(skb_peek_tail(list)), seq_no)) {
- __skb_queue_tail(list, skb);
- return 1;
+/* tipc_link_build_ack_msg: prepare link acknowledge message for transmission
+ *
+ * Note that sending of broadcast ack is coordinated among nodes, to reduce
+ * risk of ack storms towards the sender
+ */
+int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
+{
+ if (!l)
+ return 0;
+
+ /* Broadcast ACK must be sent via a unicast link => defer to caller */
+ if (link_is_bc_rcvlink(l)) {
+ if (((l->rcv_nxt ^ link_own_addr(l)) & 0xf) != 0xf)
+ return 0;
+ l->rcv_unacked = 0;
+ return TIPC_LINK_SND_BC_ACK;
}
- /* Locate insertion point in queue, then insert; discard if duplicate */
- skb_queue_walk(list, skb1) {
- u32 curr_seqno = buf_seqno(skb1);
+ /* Unicast ACK */
+ l->rcv_unacked = 0;
+ l->stats.sent_acks++;
+ tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq);
+ return 0;
+}
- if (seq_no == curr_seqno) {
- kfree_skb(skb);
- return 0;
- }
+/* tipc_link_build_reset_msg: prepare link RESET or ACTIVATE message
+ */
+void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
+{
+ int mtyp = RESET_MSG;
- if (less(seq_no, curr_seqno))
- break;
- }
+ if (l->state == LINK_ESTABLISHING)
+ mtyp = ACTIVATE_MSG;
- __skb_queue_before(list, skb1, skb);
- return 1;
+ tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, xmitq);
}
-/*
- * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet
+/* tipc_link_build_nack_msg: prepare link nack message for transmission
*/
-static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr,
- struct sk_buff *buf)
+static void tipc_link_build_nack_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq)
{
- u32 seq_no = buf_seqno(buf);
+ u32 def_cnt = ++l->stats.deferred_recv;
- if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) {
- tipc_link_proto_rcv(l_ptr, buf);
+ if (link_is_bc_rcvlink(l))
return;
- }
- /* Record OOS packet arrival (force mismatch on next timeout) */
- l_ptr->checkpoint--;
+ if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV))
+ tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq);
+}
- /*
- * Discard packet if a duplicate; otherwise add it to deferred queue
- * and notify peer of gap as per protocol specification
- */
- if (less(seq_no, mod(l_ptr->next_in_no))) {
- l_ptr->stats.duplicates++;
- kfree_skb(buf);
- return;
- }
+/* tipc_link_rcv - process TIPC packets/messages arriving from off-node
+ * @l: the link that should handle the message
+ * @skb: TIPC packet
+ * @xmitq: queue to place packets to be sent after this call
+ */
+int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff_head *defq = &l->deferdq;
+ struct tipc_msg *hdr;
+ u16 seqno, rcv_nxt, win_lim;
+ int rc = 0;
+
+ do {
+ hdr = buf_msg(skb);
+ seqno = msg_seqno(hdr);
+ rcv_nxt = l->rcv_nxt;
+ win_lim = rcv_nxt + TIPC_MAX_LINK_WIN;
+
+ /* Verify and update link state */
+ if (unlikely(msg_user(hdr) == LINK_PROTOCOL))
+ return tipc_link_proto_rcv(l, skb, xmitq);
+
+ if (unlikely(!link_is_up(l))) {
+ if (l->state == LINK_ESTABLISHING)
+ rc = TIPC_LINK_UP_EVT;
+ goto drop;
+ }
- if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) {
- l_ptr->stats.deferred_recv++;
- if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1)
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0);
- } else {
- l_ptr->stats.duplicates++;
- }
+ /* Don't send probe at next timeout expiration */
+ l->silent_intv_cnt = 0;
+
+ /* Drop if outside receive window */
+ if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) {
+ l->stats.duplicates++;
+ goto drop;
+ }
+
+ /* Forward queues and wake up waiting users */
+ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) {
+ tipc_link_advance_backlog(l, xmitq);
+ if (unlikely(!skb_queue_empty(&l->wakeupq)))
+ link_prepare_wakeup(l);
+ }
+
+ /* Defer delivery if sequence gap */
+ if (unlikely(seqno != rcv_nxt)) {
+ __tipc_skb_queue_sorted(defq, seqno, skb);
+ tipc_link_build_nack_msg(l, xmitq);
+ break;
+ }
+
+ /* Deliver packet */
+ l->rcv_nxt++;
+ l->stats.recv_info++;
+ if (!tipc_data_input(l, skb, l->inputq))
+ rc |= tipc_link_input(l, skb, l->inputq);
+ if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN))
+ rc |= tipc_link_build_ack_msg(l, xmitq);
+ if (unlikely(rc & ~TIPC_LINK_SND_BC_ACK))
+ break;
+ } while ((skb = __skb_dequeue(defq)));
+
+ return rc;
+drop:
+ kfree_skb(skb);
+ return rc;
}
/*
* Send protocol message to the other endpoint.
*/
-void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg,
+void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg,
u32 gap, u32 tolerance, u32 priority)
{
- struct sk_buff *buf = NULL;
- struct tipc_msg *msg = l_ptr->pmsg;
- u32 msg_size = sizeof(l_ptr->proto_msg);
- int r_flag;
-
- /* Don't send protocol message during link failover */
- if (l_ptr->flags & LINK_FAILINGOVER)
- return;
+ struct sk_buff *skb = NULL;
+ struct sk_buff_head xmitq;
- /* Abort non-RESET send if communication with node is prohibited */
- if ((tipc_node_blocked(l_ptr->owner)) && (msg_typ != RESET_MSG))
+ __skb_queue_head_init(&xmitq);
+ tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap,
+ tolerance, priority, &xmitq);
+ skb = __skb_dequeue(&xmitq);
+ if (!skb)
return;
+ tipc_bearer_xmit_skb(l->net, l->bearer_id, skb, l->media_addr);
+ l->rcv_unacked = 0;
+}
- /* Create protocol message with "out-of-sequence" sequence number */
- msg_set_type(msg, msg_typ);
- msg_set_net_plane(msg, l_ptr->net_plane);
- msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
- msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net));
+static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
+ u16 rcvgap, int tolerance, int priority,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff *skb = NULL;
+ struct tipc_msg *hdr = l->pmsg;
+ bool node_up = link_is_up(l->bc_rcvlink);
- if (msg_typ == STATE_MSG) {
- u32 next_sent = mod(l_ptr->next_out_no);
+ /* Don't send protocol message during reset or link failover */
+ if (tipc_link_is_blocked(l))
+ return;
- if (!tipc_link_is_up(l_ptr))
+ msg_set_type(hdr, mtyp);
+ msg_set_net_plane(hdr, l->net_plane);
+ msg_set_next_sent(hdr, l->snd_nxt);
+ msg_set_ack(hdr, l->rcv_nxt - 1);
+ msg_set_bcast_ack(hdr, l->bc_rcvlink->rcv_nxt - 1);
+ msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1);
+ msg_set_link_tolerance(hdr, tolerance);
+ msg_set_linkprio(hdr, priority);
+ msg_set_redundant_link(hdr, node_up);
+ msg_set_seq_gap(hdr, 0);
+
+ /* Compatibility: created msg must not be in sequence with pkt flow */
+ msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2);
+
+ if (mtyp == STATE_MSG) {
+ if (!tipc_link_is_up(l))
return;
- if (skb_queue_len(&l_ptr->backlogq))
- next_sent = buf_seqno(skb_peek(&l_ptr->backlogq));
- msg_set_next_sent(msg, next_sent);
- if (!skb_queue_empty(&l_ptr->deferdq)) {
- u32 rec = buf_seqno(skb_peek(&l_ptr->deferdq));
- gap = mod(rec - mod(l_ptr->next_in_no));
+
+ /* Override rcvgap if there are packets in deferred queue */
+ if (!skb_queue_empty(&l->deferdq))
+ rcvgap = buf_seqno(skb_peek(&l->deferdq)) - l->rcv_nxt;
+ if (rcvgap) {
+ msg_set_seq_gap(hdr, rcvgap);
+ l->stats.sent_nacks++;
}
- msg_set_seq_gap(msg, gap);
- if (gap)
- l_ptr->stats.sent_nacks++;
- msg_set_link_tolerance(msg, tolerance);
- msg_set_linkprio(msg, priority);
- msg_set_max_pkt(msg, l_ptr->mtu);
- msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
- msg_set_probe(msg, probe_msg != 0);
- if (probe_msg)
- l_ptr->stats.sent_probes++;
- l_ptr->stats.sent_states++;
- } else { /* RESET_MSG or ACTIVATE_MSG */
- msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1));
- msg_set_seq_gap(msg, 0);
- msg_set_next_sent(msg, 1);
- msg_set_probe(msg, 0);
- msg_set_link_tolerance(msg, l_ptr->tolerance);
- msg_set_linkprio(msg, l_ptr->priority);
- msg_set_max_pkt(msg, l_ptr->advertised_mtu);
+ msg_set_probe(hdr, probe);
+ if (probe)
+ l->stats.sent_probes++;
+ l->stats.sent_states++;
+ l->rcv_unacked = 0;
+ } else {
+ /* RESET_MSG or ACTIVATE_MSG */
+ msg_set_max_pkt(hdr, l->advertised_mtu);
+ msg_set_ack(hdr, l->rcv_nxt - 1);
+ msg_set_next_sent(hdr, 1);
}
+ skb = tipc_buf_acquire(msg_size(hdr));
+ if (!skb)
+ return;
+ skb_copy_to_linear_data(skb, hdr, msg_size(hdr));
+ skb->priority = TC_PRIO_CONTROL;
+ __skb_queue_tail(xmitq, skb);
+}
- r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr));
- msg_set_redundant_link(msg, r_flag);
- msg_set_linkprio(msg, l_ptr->priority);
- msg_set_size(msg, msg_size);
+/* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets
+ * with contents of the link's transmit and backlog queues.
+ */
+void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
+ int mtyp, struct sk_buff_head *xmitq)
+{
+ struct sk_buff *skb, *tnlskb;
+ struct tipc_msg *hdr, tnlhdr;
+ struct sk_buff_head *queue = &l->transmq;
+ struct sk_buff_head tmpxq, tnlq;
+ u16 pktlen, pktcnt, seqno = l->snd_nxt;
- msg_set_seqno(msg, mod(l_ptr->next_out_no + (0xffff/2)));
+ if (!tnl)
+ return;
- buf = tipc_buf_acquire(msg_size);
- if (!buf)
+ skb_queue_head_init(&tnlq);
+ skb_queue_head_init(&tmpxq);
+
+ /* At least one packet required for safe algorithm => add dummy */
+ skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
+ BASIC_H_SIZE, 0, l->addr, link_own_addr(l),
+ 0, 0, TIPC_ERR_NO_PORT);
+ if (!skb) {
+ pr_warn("%sunable to create tunnel packet\n", link_co_err);
return;
+ }
+ skb_queue_tail(&tnlq, skb);
+ tipc_link_xmit(l, &tnlq, &tmpxq);
+ __skb_queue_purge(&tmpxq);
+
+ /* Initialize reusable tunnel packet header */
+ tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL,
+ mtyp, INT_H_SIZE, l->addr);
+ pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq);
+ msg_set_msgcnt(&tnlhdr, pktcnt);
+ msg_set_bearer_id(&tnlhdr, l->peer_bearer_id);
+tnl:
+ /* Wrap each packet into a tunnel packet */
+ skb_queue_walk(queue, skb) {
+ hdr = buf_msg(skb);
+ if (queue == &l->backlogq)
+ msg_set_seqno(hdr, seqno++);
+ pktlen = msg_size(hdr);
+ msg_set_size(&tnlhdr, pktlen + INT_H_SIZE);
+ tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE);
+ if (!tnlskb) {
+ pr_warn("%sunable to send packet\n", link_co_err);
+ return;
+ }
+ skb_copy_to_linear_data(tnlskb, &tnlhdr, INT_H_SIZE);
+ skb_copy_to_linear_data_offset(tnlskb, INT_H_SIZE, hdr, pktlen);
+ __skb_queue_tail(&tnlq, tnlskb);
+ }
+ if (queue != &l->backlogq) {
+ queue = &l->backlogq;
+ goto tnl;
+ }
+
+ tipc_link_xmit(tnl, &tnlq, xmitq);
- skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg));
- buf->priority = TC_PRIO_CONTROL;
- tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, buf,
- &l_ptr->media_addr);
- l_ptr->rcv_unacked = 0;
- kfree_skb(buf);
+ if (mtyp == FAILOVER_MSG) {
+ tnl->drop_point = l->rcv_nxt;
+ tnl->failover_reasm_skb = l->reasm_buf;
+ l->reasm_buf = NULL;
+ }
}
-/*
- * Receive protocol message :
+/* tipc_link_proto_rcv(): receive link level protocol message :
* Note that network plane id propagates through the network, and may
- * change at any time. The node with lowest address rules
+ * change at any time. The node with lowest numerical id determines
+ * network plane
*/
-static void tipc_link_proto_rcv(struct tipc_link *l_ptr,
- struct sk_buff *buf)
+static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
- u32 rec_gap = 0;
- u32 msg_tol;
- struct tipc_msg *msg = buf_msg(buf);
+ struct tipc_msg *hdr = buf_msg(skb);
+ u16 rcvgap = 0;
+ u16 ack = msg_ack(hdr);
+ u16 gap = msg_seq_gap(hdr);
+ u16 peers_snd_nxt = msg_next_sent(hdr);
+ u16 peers_tol = msg_link_tolerance(hdr);
+ u16 peers_prio = msg_linkprio(hdr);
+ u16 rcv_nxt = l->rcv_nxt;
+ int mtyp = msg_type(hdr);
+ char *if_name;
+ int rc = 0;
- if (l_ptr->flags & LINK_FAILINGOVER)
+ if (tipc_link_is_blocked(l) || !xmitq)
goto exit;
- if (l_ptr->net_plane != msg_net_plane(msg))
- if (link_own_addr(l_ptr) > msg_prevnode(msg))
- l_ptr->net_plane = msg_net_plane(msg);
-
- switch (msg_type(msg)) {
+ if (link_own_addr(l) > msg_prevnode(hdr))
+ l->net_plane = msg_net_plane(hdr);
+ switch (mtyp) {
case RESET_MSG:
- if (!link_working_unknown(l_ptr) &&
- (l_ptr->peer_session != INVALID_SESSION)) {
- if (less_eq(msg_session(msg), l_ptr->peer_session))
- break; /* duplicate or old reset: ignore */
- }
-
- if (!msg_redundant_link(msg) && (link_working_working(l_ptr) ||
- link_working_unknown(l_ptr))) {
- /*
- * peer has lost contact -- don't allow peer's links
- * to reactivate before we recognize loss & clean up
- */
- l_ptr->owner->action_flags |= TIPC_WAIT_OWN_LINKS_DOWN;
- }
-
- link_state_event(l_ptr, RESET_MSG);
+ /* Ignore duplicate RESET with old session number */
+ if ((less_eq(msg_session(hdr), l->peer_session)) &&
+ (l->peer_session != WILDCARD_SESSION))
+ break;
/* fall thru' */
+
case ACTIVATE_MSG:
- /* Update link settings according other endpoint's values */
- strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg));
- msg_tol = msg_link_tolerance(msg);
- if (msg_tol > l_ptr->tolerance)
- link_set_supervision_props(l_ptr, msg_tol);
+ /* Complete own link name with peer's interface name */
+ if_name = strrchr(l->name, ':') + 1;
+ if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME)
+ break;
+ if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME)
+ break;
+ strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME);
- if (msg_linkprio(msg) > l_ptr->priority)
- l_ptr->priority = msg_linkprio(msg);
+ /* Update own tolerance if peer indicates a non-zero value */
+ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+ l->tolerance = peers_tol;
- if (l_ptr->mtu > msg_max_pkt(msg))
- l_ptr->mtu = msg_max_pkt(msg);
+ /* Update own priority if peer's priority is higher */
+ if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI))
+ l->priority = peers_prio;
- /* Synchronize broadcast link info, if not done previously */
- if (!tipc_node_is_up(l_ptr->owner)) {
- l_ptr->owner->bclink.last_sent =
- l_ptr->owner->bclink.last_in =
- msg_last_bcast(msg);
- l_ptr->owner->bclink.oos_state = 0;
- }
+ /* ACTIVATE_MSG serves as PEER_RESET if link is already down */
+ if ((mtyp == RESET_MSG) || !link_is_up(l))
+ rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT);
- l_ptr->peer_session = msg_session(msg);
- l_ptr->peer_bearer_id = msg_bearer_id(msg);
+ /* ACTIVATE_MSG takes up link if it was already locally reset */
+ if ((mtyp == ACTIVATE_MSG) && (l->state == LINK_ESTABLISHING))
+ rc = TIPC_LINK_UP_EVT;
- if (msg_type(msg) == ACTIVATE_MSG)
- link_state_event(l_ptr, ACTIVATE_MSG);
+ l->peer_session = msg_session(hdr);
+ l->peer_bearer_id = msg_bearer_id(hdr);
+ if (l->mtu > msg_max_pkt(hdr))
+ l->mtu = msg_max_pkt(hdr);
break;
+
case STATE_MSG:
- msg_tol = msg_link_tolerance(msg);
- if (msg_tol)
- link_set_supervision_props(l_ptr, msg_tol);
-
- if (msg_linkprio(msg) &&
- (msg_linkprio(msg) != l_ptr->priority)) {
- pr_debug("%s<%s>, priority change %u->%u\n",
- link_rst_msg, l_ptr->name,
- l_ptr->priority, msg_linkprio(msg));
- l_ptr->priority = msg_linkprio(msg);
- tipc_link_reset(l_ptr); /* Enforce change to take effect */
- break;
- }
+ /* Update own tolerance if peer indicates a non-zero value */
+ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+ l->tolerance = peers_tol;
- /* Record reception; force mismatch at next timeout: */
- l_ptr->checkpoint--;
+ l->silent_intv_cnt = 0;
+ l->stats.recv_states++;
+ if (msg_probe(hdr))
+ l->stats.recv_probes++;
- link_state_event(l_ptr, TRAFFIC_MSG_EVT);
- l_ptr->stats.recv_states++;
- if (link_reset_unknown(l_ptr))
+ if (!link_is_up(l)) {
+ if (l->state == LINK_ESTABLISHING)
+ rc = TIPC_LINK_UP_EVT;
break;
-
- if (less_eq(mod(l_ptr->next_in_no), msg_next_sent(msg))) {
- rec_gap = mod(msg_next_sent(msg) -
- mod(l_ptr->next_in_no));
}
- if (msg_probe(msg))
- l_ptr->stats.recv_probes++;
-
- /* Protocol message before retransmits, reduce loss risk */
- if (l_ptr->owner->bclink.recv_permitted)
- tipc_bclink_update_link_state(l_ptr->owner,
- msg_last_bcast(msg));
-
- if (rec_gap || (msg_probe(msg))) {
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 0,
- rec_gap, 0, 0);
+ /* Send NACK if peer has sent pkts we haven't received yet */
+ if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l))
+ rcvgap = peers_snd_nxt - l->rcv_nxt;
+ if (rcvgap || (msg_probe(hdr)))
+ tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap,
+ 0, 0, xmitq);
+ tipc_link_release_pkts(l, ack);
+
+ /* If NACK, retransmit will now start at right position */
+ if (gap) {
+ rc = tipc_link_retrans(l, ack + 1, ack + gap, xmitq);
+ l->stats.recv_nacks++;
}
- if (msg_seq_gap(msg)) {
- l_ptr->stats.recv_nacks++;
- tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq),
- msg_seq_gap(msg));
- }
- break;
+
+ tipc_link_advance_backlog(l, xmitq);
+ if (unlikely(!skb_queue_empty(&l->wakeupq)))
+ link_prepare_wakeup(l);
}
exit:
- kfree_skb(buf);
+ kfree_skb(skb);
+ return rc;
}
-
-/* tipc_link_tunnel_xmit(): Tunnel one packet via a link belonging to
- * a different bearer. Owner node is locked.
+/* tipc_link_build_bc_proto_msg() - create broadcast protocol message
*/
-static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr,
- struct tipc_msg *tunnel_hdr,
- struct tipc_msg *msg,
- u32 selector)
+static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast,
+ u16 peers_snd_nxt,
+ struct sk_buff_head *xmitq)
{
- struct tipc_link *tunnel;
struct sk_buff *skb;
- u32 length = msg_size(msg);
+ struct tipc_msg *hdr;
+ struct sk_buff *dfrd_skb = skb_peek(&l->deferdq);
+ u16 ack = l->rcv_nxt - 1;
+ u16 gap_to = peers_snd_nxt - 1;
- tunnel = l_ptr->owner->active_links[selector & 1];
- if (!tipc_link_is_up(tunnel)) {
- pr_warn("%stunnel link no longer available\n", link_co_err);
- return;
- }
- msg_set_size(tunnel_hdr, length + INT_H_SIZE);
- skb = tipc_buf_acquire(length + INT_H_SIZE);
- if (!skb) {
- pr_warn("%sunable to send tunnel msg\n", link_co_err);
- return;
- }
- skb_copy_to_linear_data(skb, tunnel_hdr, INT_H_SIZE);
- skb_copy_to_linear_data_offset(skb, INT_H_SIZE, msg, length);
- __tipc_link_xmit_skb(tunnel, skb);
+ skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE,
+ 0, l->addr, link_own_addr(l), 0, 0, 0);
+ if (!skb)
+ return false;
+ hdr = buf_msg(skb);
+ msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1);
+ msg_set_bcast_ack(hdr, ack);
+ msg_set_bcgap_after(hdr, ack);
+ if (dfrd_skb)
+ gap_to = buf_seqno(dfrd_skb) - 1;
+ msg_set_bcgap_to(hdr, gap_to);
+ msg_set_non_seq(hdr, bcast);
+ __skb_queue_tail(xmitq, skb);
+ return true;
}
+/* tipc_link_build_bc_init_msg() - synchronize broadcast link endpoints.
+ *
+ * Give a newly added peer node the sequence number where it should
+ * start receiving and acking broadcast packets.
+ */
+static void tipc_link_build_bc_init_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff_head list;
+
+ __skb_queue_head_init(&list);
+ if (!tipc_link_build_bc_proto_msg(l->bc_rcvlink, false, 0, &list))
+ return;
+ tipc_link_xmit(l, &list, xmitq);
+}
-/* tipc_link_failover_send_queue(): A link has gone down, but a second
- * link is still active. We can do failover. Tunnel the failing link's
- * whole send queue via the remaining link. This way, we don't lose
- * any packets, and sequence order is preserved for subsequent traffic
- * sent over the remaining link. Owner node is locked.
+/* tipc_link_bc_init_rcv - receive initial broadcast synch data from peer
*/
-void tipc_link_failover_send_queue(struct tipc_link *l_ptr)
+void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr)
{
- int msgcount;
- struct tipc_link *tunnel = l_ptr->owner->active_links[0];
- struct tipc_msg tunnel_hdr;
- struct sk_buff *skb;
- int split_bundles;
+ int mtyp = msg_type(hdr);
+ u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
- if (!tunnel)
+ if (link_is_up(l))
return;
- tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, TUNNEL_PROTOCOL,
- FAILOVER_MSG, INT_H_SIZE, l_ptr->addr);
- skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq);
- tipc_link_purge_backlog(l_ptr);
- msgcount = skb_queue_len(&l_ptr->transmq);
- msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id);
- msg_set_msgcnt(&tunnel_hdr, msgcount);
-
- if (skb_queue_empty(&l_ptr->transmq)) {
- skb = tipc_buf_acquire(INT_H_SIZE);
- if (skb) {
- skb_copy_to_linear_data(skb, &tunnel_hdr, INT_H_SIZE);
- msg_set_size(&tunnel_hdr, INT_H_SIZE);
- __tipc_link_xmit_skb(tunnel, skb);
- } else {
- pr_warn("%sunable to send changeover msg\n",
- link_co_err);
- }
+ if (msg_user(hdr) == BCAST_PROTOCOL) {
+ l->rcv_nxt = peers_snd_nxt;
+ l->state = LINK_ESTABLISHED;
return;
}
- split_bundles = (l_ptr->owner->active_links[0] !=
- l_ptr->owner->active_links[1]);
-
- skb_queue_walk(&l_ptr->transmq, skb) {
- struct tipc_msg *msg = buf_msg(skb);
+ if (l->peer_caps & TIPC_BCAST_SYNCH)
+ return;
- if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) {
- struct tipc_msg *m = msg_get_wrapped(msg);
- unchar *pos = (unchar *)m;
+ if (msg_peer_node_is_up(hdr))
+ return;
- msgcount = msg_msgcnt(msg);
- while (msgcount--) {
- msg_set_seqno(m, msg_seqno(msg));
- tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, m,
- msg_link_selector(m));
- pos += align(msg_size(m));
- m = (struct tipc_msg *)pos;
- }
- } else {
- tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, msg,
- msg_link_selector(msg));
- }
- }
+ /* Compatibility: accept older, less safe initial synch data */
+ if ((mtyp == RESET_MSG) || (mtyp == ACTIVATE_MSG))
+ l->rcv_nxt = peers_snd_nxt;
}
-/* tipc_link_dup_queue_xmit(): A second link has become active. Tunnel a
- * duplicate of the first link's send queue via the new link. This way, we
- * are guaranteed that currently queued packets from a socket are delivered
- * before future traffic from the same socket, even if this is using the
- * new link. The last arriving copy of each duplicate packet is dropped at
- * the receiving end by the regular protocol check, so packet cardinality
- * and sequence order is preserved per sender/receiver socket pair.
- * Owner node is locked.
+/* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
*/
-void tipc_link_dup_queue_xmit(struct tipc_link *link,
- struct tipc_link *tnl)
+void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
+ struct sk_buff_head *xmitq)
{
- struct sk_buff *skb;
- struct tipc_msg tnl_hdr;
- struct sk_buff_head *queue = &link->transmq;
- int mcnt;
+ u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
- tipc_msg_init(link_own_addr(link), &tnl_hdr, TUNNEL_PROTOCOL,
- SYNCH_MSG, INT_H_SIZE, link->addr);
- mcnt = skb_queue_len(&link->transmq) + skb_queue_len(&link->backlogq);
- msg_set_msgcnt(&tnl_hdr, mcnt);
- msg_set_bearer_id(&tnl_hdr, link->peer_bearer_id);
+ if (!link_is_up(l))
+ return;
-tunnel_queue:
- skb_queue_walk(queue, skb) {
- struct sk_buff *outskb;
- struct tipc_msg *msg = buf_msg(skb);
- u32 len = msg_size(msg);
-
- msg_set_ack(msg, mod(link->next_in_no - 1));
- msg_set_bcast_ack(msg, link->owner->bclink.last_in);
- msg_set_size(&tnl_hdr, len + INT_H_SIZE);
- outskb = tipc_buf_acquire(len + INT_H_SIZE);
- if (outskb == NULL) {
- pr_warn("%sunable to send duplicate msg\n",
- link_co_err);
- return;
- }
- skb_copy_to_linear_data(outskb, &tnl_hdr, INT_H_SIZE);
- skb_copy_to_linear_data_offset(outskb, INT_H_SIZE,
- skb->data, len);
- __tipc_link_xmit_skb(tnl, outskb);
- if (!tipc_link_is_up(link))
- return;
- }
- if (queue == &link->backlogq)
+ if (!msg_peer_node_is_up(hdr))
return;
- queue = &link->backlogq;
- goto tunnel_queue;
-}
-/* tipc_link_failover_rcv(): Receive a tunnelled FAILOVER_MSG packet
- * Owner node is locked.
- */
-static bool tipc_link_failover_rcv(struct tipc_link *link,
- struct sk_buff **skb)
-{
- struct tipc_msg *msg = buf_msg(*skb);
- struct sk_buff *iskb = NULL;
- struct tipc_link *pl = NULL;
- int bearer_id = msg_bearer_id(msg);
- int pos = 0;
+ l->bc_peer_is_up = true;
- if (msg_type(msg) != FAILOVER_MSG) {
- pr_warn("%sunknown tunnel pkt received\n", link_co_err);
- goto exit;
+ /* Ignore if peers_snd_nxt goes beyond receive window */
+ if (more(peers_snd_nxt, l->rcv_nxt + l->window))
+ return;
+
+ if (!more(peers_snd_nxt, l->rcv_nxt)) {
+ l->nack_state = BC_NACK_SND_CONDITIONAL;
+ return;
}
- if (bearer_id >= MAX_BEARERS)
- goto exit;
- if (bearer_id == link->bearer_id)
- goto exit;
+ /* Don't NACK if one was recently sent or peeked */
+ if (l->nack_state == BC_NACK_SND_SUPPRESS) {
+ l->nack_state = BC_NACK_SND_UNCONDITIONAL;
+ return;
+ }
- pl = link->owner->links[bearer_id];
- if (pl && tipc_link_is_up(pl))
- tipc_link_reset(pl);
+ /* Conditionally delay NACK sending until next synch rcv */
+ if (l->nack_state == BC_NACK_SND_CONDITIONAL) {
+ l->nack_state = BC_NACK_SND_UNCONDITIONAL;
+ if ((peers_snd_nxt - l->rcv_nxt) < TIPC_MIN_LINK_WIN)
+ return;
+ }
- if (link->failover_pkts == FIRST_FAILOVER)
- link->failover_pkts = msg_msgcnt(msg);
+ /* Send NACK now but suppress next one */
+ tipc_link_build_bc_proto_msg(l, true, peers_snd_nxt, xmitq);
+ l->nack_state = BC_NACK_SND_SUPPRESS;
+}
- /* Should we expect an inner packet? */
- if (!link->failover_pkts)
- goto exit;
+void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff *skb, *tmp;
+ struct tipc_link *snd_l = l->bc_sndlink;
- if (!tipc_msg_extract(*skb, &iskb, &pos)) {
- pr_warn("%sno inner failover pkt\n", link_co_err);
- *skb = NULL;
- goto exit;
- }
- link->failover_pkts--;
- *skb = NULL;
+ if (!link_is_up(l) || !l->bc_peer_is_up)
+ return;
- /* Was this packet already delivered? */
- if (less(buf_seqno(iskb), link->failover_checkpt)) {
- kfree_skb(iskb);
- iskb = NULL;
- goto exit;
+ if (!more(acked, l->acked))
+ return;
+
+ /* Skip over packets peer has already acked */
+ skb_queue_walk(&snd_l->transmq, skb) {
+ if (more(buf_seqno(skb), l->acked))
+ break;
}
- if (msg_user(buf_msg(iskb)) == MSG_FRAGMENTER) {
- link->stats.recv_fragments++;
- tipc_buf_append(&link->failover_skb, &iskb);
+
+ /* Update/release the packets peer is acking now */
+ skb_queue_walk_from_safe(&snd_l->transmq, skb, tmp) {
+ if (more(buf_seqno(skb), acked))
+ break;
+ if (!--TIPC_SKB_CB(skb)->ackers) {
+ __skb_unlink(skb, &snd_l->transmq);
+ kfree_skb(skb);
+ }
}
-exit:
- if (!link->failover_pkts && pl)
- pl->flags &= ~LINK_FAILINGOVER;
- kfree_skb(*skb);
- *skb = iskb;
- return *skb;
+ l->acked = acked;
+ tipc_link_advance_backlog(snd_l, xmitq);
+ if (unlikely(!skb_queue_empty(&snd_l->wakeupq)))
+ link_prepare_wakeup(snd_l);
}
-static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol)
+/* tipc_link_bc_nack_rcv(): receive broadcast nack message
+ */
+int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
- unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
+ struct tipc_msg *hdr = buf_msg(skb);
+ u32 dnode = msg_destnode(hdr);
+ int mtyp = msg_type(hdr);
+ u16 acked = msg_bcast_ack(hdr);
+ u16 from = acked + 1;
+ u16 to = msg_bcgap_to(hdr);
+ u16 peers_snd_nxt = to + 1;
+ int rc = 0;
+
+ kfree_skb(skb);
+
+ if (!tipc_link_is_up(l) || !l->bc_peer_is_up)
+ return 0;
- if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL))
- return;
+ if (mtyp != STATE_MSG)
+ return 0;
+
+ if (dnode == link_own_addr(l)) {
+ tipc_link_bc_ack_rcv(l, acked, xmitq);
+ rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq);
+ l->stats.recv_nacks++;
+ return rc;
+ }
+
+ /* Msg for other node => suppress own NACK at next sync if applicable */
+ if (more(peers_snd_nxt, l->rcv_nxt) && !less(l->rcv_nxt, from))
+ l->nack_state = BC_NACK_SND_SUPPRESS;
- l_ptr->tolerance = tol;
- l_ptr->cont_intv = msecs_to_jiffies(intv);
- l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->cont_intv) / 4);
+ return 0;
}
void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
@@ -1780,7 +1549,7 @@ static struct tipc_node *tipc_link_find_owner(struct net *net,
list_for_each_entry_rcu(n_ptr, &tn->node_list, list) {
tipc_node_lock(n_ptr);
for (i = 0; i < MAX_BEARERS; i++) {
- l_ptr = n_ptr->links[i];
+ l_ptr = n_ptr->links[i].link;
if (l_ptr && !strcmp(l_ptr->name, link_name)) {
*bearer_id = i;
found_node = n_ptr;
@@ -1803,31 +1572,20 @@ static struct tipc_node *tipc_link_find_owner(struct net *net,
static void link_reset_statistics(struct tipc_link *l_ptr)
{
memset(&l_ptr->stats, 0, sizeof(l_ptr->stats));
- l_ptr->stats.sent_info = l_ptr->next_out_no;
- l_ptr->stats.recv_info = l_ptr->next_in_no;
+ l_ptr->stats.sent_info = l_ptr->snd_nxt;
+ l_ptr->stats.recv_info = l_ptr->rcv_nxt;
}
-static void link_print(struct tipc_link *l_ptr, const char *str)
+static void link_print(struct tipc_link *l, const char *str)
{
- struct tipc_net *tn = net_generic(l_ptr->owner->net, tipc_net_id);
- struct tipc_bearer *b_ptr;
-
- rcu_read_lock();
- b_ptr = rcu_dereference_rtnl(tn->bearer_list[l_ptr->bearer_id]);
- if (b_ptr)
- pr_info("%s Link %x<%s>:", str, l_ptr->addr, b_ptr->name);
- rcu_read_unlock();
-
- if (link_working_unknown(l_ptr))
- pr_cont(":WU\n");
- else if (link_reset_reset(l_ptr))
- pr_cont(":RR\n");
- else if (link_reset_unknown(l_ptr))
- pr_cont(":RU\n");
- else if (link_working_working(l_ptr))
- pr_cont(":WW\n");
- else
- pr_cont("\n");
+ struct sk_buff *hskb = skb_peek(&l->transmq);
+ u16 head = hskb ? msg_seqno(buf_msg(hskb)) : l->snd_nxt - 1;
+ u16 tail = l->snd_nxt - 1;
+
+ pr_info("%s Link <%s> state %x\n", str, l->name, l->state);
+ pr_info("XMTQ: %u [%u-%u], BKLGQ: %u, SNDNX: %u, RCVNX: %u\n",
+ skb_queue_len(&l->transmq), head, tail,
+ skb_queue_len(&l->backlogq), l->snd_nxt, l->rcv_nxt);
}
/* Parse and validate nested (link) properties valid for media, bearer and link
@@ -1893,13 +1651,16 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info)
name = nla_data(attrs[TIPC_NLA_LINK_NAME]);
+ if (strcmp(name, tipc_bclink_name) == 0)
+ return tipc_nl_bc_link_set(net, attrs);
+
node = tipc_link_find_owner(net, name, &bearer_id);
if (!node)
return -EINVAL;
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (!link) {
res = -EINVAL;
goto out;
@@ -1919,7 +1680,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info)
u32 tol;
tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
- link_set_supervision_props(link, tol);
+ link->tolerance = tol;
tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0);
}
if (props[TIPC_NLA_PROP_PRIO]) {
@@ -2034,15 +1795,15 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
goto attr_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->next_in_no))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->rcv_nxt))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->next_out_no))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->snd_nxt))
goto attr_msg_full;
if (tipc_link_is_up(link))
if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP))
goto attr_msg_full;
- if (tipc_link_is_active(link))
+ if (link->active)
if (nla_put_flag(msg->skb, TIPC_NLA_LINK_ACTIVE))
goto attr_msg_full;
@@ -2089,10 +1850,11 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
for (i = *prev_link; i < MAX_BEARERS; i++) {
*prev_link = i;
- if (!node->links[i])
+ if (!node->links[i].link)
continue;
- err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI);
+ err = __tipc_nl_add_link(net, msg,
+ node->links[i].link, NLM_F_MULTI);
if (err)
return err;
}
@@ -2175,50 +1937,53 @@ out:
int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
- struct sk_buff *ans_skb;
struct tipc_nl_msg msg;
- struct tipc_link *link;
- struct tipc_node *node;
char *name;
- int bearer_id;
int err;
+ msg.portid = info->snd_portid;
+ msg.seq = info->snd_seq;
+
if (!info->attrs[TIPC_NLA_LINK_NAME])
return -EINVAL;
-
name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]);
- node = tipc_link_find_owner(net, name, &bearer_id);
- if (!node)
- return -EINVAL;
- ans_skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!ans_skb)
+ msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!msg.skb)
return -ENOMEM;
- msg.skb = ans_skb;
- msg.portid = info->snd_portid;
- msg.seq = info->snd_seq;
-
- tipc_node_lock(node);
- link = node->links[bearer_id];
- if (!link) {
- err = -EINVAL;
- goto err_out;
- }
-
- err = __tipc_nl_add_link(net, &msg, link, 0);
- if (err)
- goto err_out;
+ if (strcmp(name, tipc_bclink_name) == 0) {
+ err = tipc_nl_add_bc_link(net, &msg);
+ if (err) {
+ nlmsg_free(msg.skb);
+ return err;
+ }
+ } else {
+ int bearer_id;
+ struct tipc_node *node;
+ struct tipc_link *link;
- tipc_node_unlock(node);
+ node = tipc_link_find_owner(net, name, &bearer_id);
+ if (!node)
+ return -EINVAL;
- return genlmsg_reply(ans_skb, info);
+ tipc_node_lock(node);
+ link = node->links[bearer_id].link;
+ if (!link) {
+ tipc_node_unlock(node);
+ nlmsg_free(msg.skb);
+ return -EINVAL;
+ }
-err_out:
- tipc_node_unlock(node);
- nlmsg_free(ans_skb);
+ err = __tipc_nl_add_link(net, &msg, link, 0);
+ tipc_node_unlock(node);
+ if (err) {
+ nlmsg_free(msg.skb);
+ return err;
+ }
+ }
- return err;
+ return genlmsg_reply(msg.skb, info);
}
int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info)
@@ -2258,7 +2023,7 @@ int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info)
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (!link) {
tipc_node_unlock(node);
return -EINVAL;
diff --git a/kernel/net/tipc/link.h b/kernel/net/tipc/link.h
index b5b4e3554..66d859b66 100644
--- a/kernel/net/tipc/link.h
+++ b/kernel/net/tipc/link.h
@@ -49,19 +49,26 @@
*/
#define INVALID_LINK_SEQ 0x10000
-/* Link working states
+/* Link FSM events:
*/
-#define WORKING_WORKING 560810u
-#define WORKING_UNKNOWN 560811u
-#define RESET_UNKNOWN 560812u
-#define RESET_RESET 560813u
+enum {
+ LINK_ESTABLISH_EVT = 0xec1ab1e,
+ LINK_PEER_RESET_EVT = 0x9eed0e,
+ LINK_FAILURE_EVT = 0xfa110e,
+ LINK_RESET_EVT = 0x10ca1d0e,
+ LINK_FAILOVER_BEGIN_EVT = 0xfa110bee,
+ LINK_FAILOVER_END_EVT = 0xfa110ede,
+ LINK_SYNCH_BEGIN_EVT = 0xc1ccbee,
+ LINK_SYNCH_END_EVT = 0xc1ccede
+};
-/* Link endpoint execution states
+/* Events returned from link at packet reception or at timeout
*/
-#define LINK_STARTED 0x0001
-#define LINK_STOPPED 0x0002
-#define LINK_SYNCHING 0x0004
-#define LINK_FAILINGOVER 0x0008
+enum {
+ TIPC_LINK_UP_EVT = 1,
+ TIPC_LINK_DOWN_EVT = (1 << 1),
+ TIPC_LINK_SND_BC_ACK = (1 << 2)
+};
/* Starting value for maximum packet size negotiation on unicast links
* (unless bearer MTU is less)
@@ -104,33 +111,34 @@ struct tipc_stats {
* @name: link name character string
* @media_addr: media address to use when sending messages over link
* @timer: link timer
- * @owner: pointer to peer node
+ * @net: pointer to namespace struct
* @refcnt: reference counter for permanent references (owner node & timer)
- * @flags: execution state flags for link endpoint instance
- * @checkpoint: reference point for triggering link continuity checking
* @peer_session: link session # being used by peer end of link
* @peer_bearer_id: bearer id used by link's peer endpoint
* @bearer_id: local bearer id used by link
* @tolerance: minimum link continuity loss needed to reset link [in ms]
- * @cont_intv: link continuity testing interval
+ * @keepalive_intv: link keepalive timer interval
* @abort_limit: # of unacknowledged continuity probes needed to reset link
* @state: current state of link FSM
- * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state
+ * @peer_caps: bitmap describing capabilities of peer node
+ * @silent_intv_cnt: # of timer intervals without any reception from peer
* @proto_msg: template for control messages generated by link
* @pmsg: convenience pointer to "proto_msg" field
* @priority: current link priority
* @net_plane: current link network plane ('A' through 'H')
* @backlog_limit: backlog queue congestion thresholds (indexed by importance)
* @exp_msg_count: # of tunnelled messages expected during link changeover
- * @reset_checkpoint: seq # of last acknowledged message at time of link reset
+ * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset
* @mtu: current maximum packet size for this link
* @advertised_mtu: advertised own mtu when link is being established
* @transmitq: queue for sent, non-acked messages
* @backlogq: queue for messages waiting to be sent
- * @next_out_no: next sequence number to use for outbound messages
+ * @snd_nxt: next sequence number to use for outbound messages
* @last_retransmitted: sequence number of most recently retransmitted message
* @stale_count: # of identical retransmit requests made by peer
- * @next_in_no: next sequence number to expect for inbound messages
+ * @ackers: # of peers that need to ack each packet before it can be released
+ * @acked: # last packet acked by a certain peer. Used for broadcast.
+ * @rcv_nxt: next sequence number to expect for inbound messages
* @deferred_queue: deferred queue saved OOS b'cast message received from node
* @unacked_window: # of inbound messages rx'd without ack'ing back to peer
* @inputq: buffer queue for messages to be delivered upwards
@@ -139,27 +147,26 @@ struct tipc_stats {
* @wakeupq: linked list of wakeup msgs waiting for link congestion to abate
* @long_msg_seq_no: next identifier to use for outbound fragmented messages
* @reasm_buf: head of partially reassembled inbound message fragments
+ * @bc_rcvr: marks that this is a broadcast receiver link
* @stats: collects statistics regarding link activity
*/
struct tipc_link {
u32 addr;
char name[TIPC_MAX_LINK_NAME];
- struct tipc_media_addr media_addr;
- struct timer_list timer;
- struct tipc_node *owner;
- struct kref ref;
+ struct tipc_media_addr *media_addr;
+ struct net *net;
/* Management and link supervision data */
- unsigned int flags;
- u32 checkpoint;
u32 peer_session;
u32 peer_bearer_id;
u32 bearer_id;
u32 tolerance;
- unsigned long cont_intv;
+ unsigned long keepalive_intv;
u32 abort_limit;
- int state;
- u32 fsm_msg_cnt;
+ u32 state;
+ u16 peer_caps;
+ bool active;
+ u32 silent_intv_cnt;
struct {
unchar hdr[INT_H_SIZE];
unchar body[TIPC_MAX_IF_NAME];
@@ -167,12 +174,10 @@ struct tipc_link {
struct tipc_msg *pmsg;
u32 priority;
char net_plane;
- u16 synch_point;
- /* Failover */
- u16 failover_pkts;
- u16 failover_checkpt;
- struct sk_buff *failover_skb;
+ /* Failover/synch */
+ u16 drop_point;
+ struct sk_buff *failover_reasm_skb;
/* Max packet negotiation */
u16 mtu;
@@ -185,17 +190,17 @@ struct tipc_link {
u16 len;
u16 limit;
} backlog[5];
- u32 next_out_no;
- u32 window;
- u32 last_retransmitted;
+ u16 snd_nxt;
+ u16 last_retransm;
+ u16 window;
u32 stale_count;
/* Reception */
- u32 next_in_no;
+ u16 rcv_nxt;
u32 rcv_unacked;
struct sk_buff_head deferdq;
- struct sk_buff_head inputq;
- struct sk_buff_head namedq;
+ struct sk_buff_head *inputq;
+ struct sk_buff_head *namedq;
/* Congestion handling */
struct sk_buff_head wakeupq;
@@ -203,109 +208,76 @@ struct tipc_link {
/* Fragmentation/reassembly */
struct sk_buff *reasm_buf;
+ /* Broadcast */
+ u16 ackers;
+ u16 acked;
+ struct tipc_link *bc_rcvlink;
+ struct tipc_link *bc_sndlink;
+ int nack_state;
+ bool bc_peer_is_up;
+
/* Statistics */
struct tipc_stats stats;
};
-struct tipc_port;
-
-struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
- struct tipc_bearer *b_ptr,
- const struct tipc_media_addr *media_addr);
-void tipc_link_delete(struct tipc_link *link);
-void tipc_link_delete_list(struct net *net, unsigned int bearer_id,
- bool shutting_down);
-void tipc_link_failover_send_queue(struct tipc_link *l_ptr);
-void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, struct tipc_link *dest);
+bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
+ int tolerance, char net_plane, u32 mtu, int priority,
+ int window, u32 session, u32 ownnode, u32 peer,
+ u16 peer_caps,
+ struct tipc_link *bc_sndlink,
+ struct tipc_link *bc_rcvlink,
+ struct sk_buff_head *inputq,
+ struct sk_buff_head *namedq,
+ struct tipc_link **link);
+bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
+ int mtu, int window, u16 peer_caps,
+ struct sk_buff_head *inputq,
+ struct sk_buff_head *namedq,
+ struct tipc_link *bc_sndlink,
+ struct tipc_link **link);
+void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
+ int mtyp, struct sk_buff_head *xmitq);
+void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq);
+int tipc_link_fsm_evt(struct tipc_link *l, int evt);
void tipc_link_reset_fragments(struct tipc_link *l_ptr);
-int tipc_link_is_up(struct tipc_link *l_ptr);
-int tipc_link_is_active(struct tipc_link *l_ptr);
-void tipc_link_purge_queues(struct tipc_link *l_ptr);
-void tipc_link_reset_all(struct tipc_node *node);
+bool tipc_link_is_up(struct tipc_link *l);
+bool tipc_link_peer_is_down(struct tipc_link *l);
+bool tipc_link_is_reset(struct tipc_link *l);
+bool tipc_link_is_establishing(struct tipc_link *l);
+bool tipc_link_is_synching(struct tipc_link *l);
+bool tipc_link_is_failingover(struct tipc_link *l);
+bool tipc_link_is_blocked(struct tipc_link *l);
+void tipc_link_set_active(struct tipc_link *l, bool active);
void tipc_link_reset(struct tipc_link *l_ptr);
-void tipc_link_reset_list(struct net *net, unsigned int bearer_id);
-int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest,
- u32 selector);
-int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest,
- u32 selector);
-int __tipc_link_xmit(struct net *net, struct tipc_link *link,
- struct sk_buff_head *list);
-void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob,
- u32 gap, u32 tolerance, u32 priority);
-void tipc_link_push_packets(struct tipc_link *l_ptr);
-u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *buf);
-void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window);
-void tipc_link_retransmit(struct tipc_link *l_ptr,
- struct sk_buff *start, u32 retransmits);
-struct sk_buff *tipc_skb_queue_next(const struct sk_buff_head *list,
- const struct sk_buff *skb);
+int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list,
+ struct sk_buff_head *xmitq);
+void tipc_link_set_queue_limits(struct tipc_link *l, u32 window);
int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]);
-void link_prepare_wakeup(struct tipc_link *l);
-
-/*
- * Link sequence number manipulation routines (uses modulo 2**16 arithmetic)
- */
-static inline u32 buf_seqno(struct sk_buff *buf)
-{
- return msg_seqno(buf_msg(buf));
-}
-
-static inline u32 mod(u32 x)
-{
- return x & 0xffffu;
-}
-
-static inline int less_eq(u32 left, u32 right)
-{
- return mod(right - left) < 32768u;
-}
-
-static inline int more(u32 left, u32 right)
-{
- return !less_eq(left, right);
-}
-
-static inline int less(u32 left, u32 right)
-{
- return less_eq(left, right) && (mod(right) != mod(left));
-}
-
-static inline u32 lesser(u32 left, u32 right)
-{
- return less_eq(left, right) ? left : right;
-}
-
-static inline u32 link_own_addr(struct tipc_link *l)
-{
- return msg_prevnode(l->pmsg);
-}
-
-/*
- * Link status checking routines
- */
-static inline int link_working_working(struct tipc_link *l_ptr)
-{
- return l_ptr->state == WORKING_WORKING;
-}
-
-static inline int link_working_unknown(struct tipc_link *l_ptr)
-{
- return l_ptr->state == WORKING_UNKNOWN;
-}
-
-static inline int link_reset_unknown(struct tipc_link *l_ptr)
-{
- return l_ptr->state == RESET_UNKNOWN;
-}
-
-static inline int link_reset_reset(struct tipc_link *l_ptr)
-{
- return l_ptr->state == RESET_RESET;
-}
-
+int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq);
+int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq);
+int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq);
+void tipc_link_add_bc_peer(struct tipc_link *snd_l,
+ struct tipc_link *uc_l,
+ struct sk_buff_head *xmitq);
+void tipc_link_remove_bc_peer(struct tipc_link *snd_l,
+ struct tipc_link *rcv_l,
+ struct sk_buff_head *xmitq);
+int tipc_link_bc_peers(struct tipc_link *l);
+void tipc_link_set_mtu(struct tipc_link *l, int mtu);
+int tipc_link_mtu(struct tipc_link *l);
+void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
+ struct sk_buff_head *xmitq);
+void tipc_link_build_bc_sync_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq);
+void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr);
+void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
+ struct sk_buff_head *xmitq);
+int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq);
#endif
diff --git a/kernel/net/tipc/msg.c b/kernel/net/tipc/msg.c
index c3e96e815..8740930f0 100644
--- a/kernel/net/tipc/msg.c
+++ b/kernel/net/tipc/msg.c
@@ -121,7 +121,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
{
struct sk_buff *head = *headbuf;
struct sk_buff *frag = *buf;
- struct sk_buff *tail;
+ struct sk_buff *tail = NULL;
struct tipc_msg *msg;
u32 fragid;
int delta;
@@ -141,9 +141,15 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
if (unlikely(skb_unclone(frag, GFP_ATOMIC)))
goto err;
head = *headbuf = frag;
- skb_frag_list_init(head);
- TIPC_SKB_CB(head)->tail = NULL;
*buf = NULL;
+ TIPC_SKB_CB(head)->tail = NULL;
+ if (skb_is_nonlinear(head)) {
+ skb_walk_frags(head, tail) {
+ TIPC_SKB_CB(head)->tail = tail;
+ }
+ } else {
+ skb_frag_list_init(head);
+ }
return 0;
}
@@ -176,7 +182,6 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
*buf = NULL;
return 0;
err:
- pr_warn_ratelimited("Unable to build fragment list\n");
kfree_skb(*buf);
kfree_skb(*headbuf);
*buf = *headbuf = NULL;
@@ -331,16 +336,15 @@ error:
/**
* tipc_msg_bundle(): Append contents of a buffer to tail of an existing one
- * @bskb: the buffer to append to ("bundle")
- * @skb: buffer to be appended
+ * @skb: the buffer to append to ("bundle")
+ * @msg: message to be appended
* @mtu: max allowable size for the bundle buffer
* Consumes buffer if successful
* Returns true if bundling could be performed, otherwise false
*/
-bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu)
+bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu)
{
struct tipc_msg *bmsg;
- struct tipc_msg *msg = buf_msg(skb);
unsigned int bsz;
unsigned int msz = msg_size(msg);
u32 start, pad;
@@ -348,9 +352,9 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu)
if (likely(msg_user(msg) == MSG_FRAGMENTER))
return false;
- if (!bskb)
+ if (!skb)
return false;
- bmsg = buf_msg(bskb);
+ bmsg = buf_msg(skb);
bsz = msg_size(bmsg);
start = align(bsz);
pad = start - bsz;
@@ -359,18 +363,20 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu)
return false;
if (unlikely(msg_user(msg) == BCAST_PROTOCOL))
return false;
- if (likely(msg_user(bmsg) != MSG_BUNDLER))
+ if (unlikely(msg_user(bmsg) != MSG_BUNDLER))
return false;
- if (unlikely(skb_tailroom(bskb) < (pad + msz)))
+ if (unlikely(skb_tailroom(skb) < (pad + msz)))
return false;
if (unlikely(max < (start + msz)))
return false;
+ if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) &&
+ (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE))
+ return false;
- skb_put(bskb, pad + msz);
- skb_copy_to_linear_data_offset(bskb, start, skb->data, msz);
+ skb_put(skb, pad + msz);
+ skb_copy_to_linear_data_offset(skb, start, msg, msz);
msg_set_size(bmsg, start + msz);
msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1);
- kfree_skb(skb);
return true;
}
@@ -416,18 +422,18 @@ none:
/**
* tipc_msg_make_bundle(): Create bundle buf and append message to its tail
- * @list: the buffer chain
- * @skb: buffer to be appended and replaced
+ * @list: the buffer chain, where head is the buffer to replace/append
+ * @skb: buffer to be created, appended to and returned in case of success
+ * @msg: message to be appended
* @mtu: max allowable size for the bundle buffer, inclusive header
* @dnode: destination node for message. (Not always present in header)
- * Replaces buffer if successful
* Returns true if success, otherwise false
*/
-bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode)
+bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
+ u32 mtu, u32 dnode)
{
- struct sk_buff *bskb;
+ struct sk_buff *_skb;
struct tipc_msg *bmsg;
- struct tipc_msg *msg = buf_msg(*skb);
u32 msz = msg_size(msg);
u32 max = mtu - INT_H_SIZE;
@@ -440,78 +446,94 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode)
if (msz > (max / 2))
return false;
- bskb = tipc_buf_acquire(max);
- if (!bskb)
+ _skb = tipc_buf_acquire(max);
+ if (!_skb)
return false;
- skb_trim(bskb, INT_H_SIZE);
- bmsg = buf_msg(bskb);
+ skb_trim(_skb, INT_H_SIZE);
+ bmsg = buf_msg(_skb);
tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0,
INT_H_SIZE, dnode);
+ if (msg_isdata(msg))
+ msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE);
+ else
+ msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE);
msg_set_seqno(bmsg, msg_seqno(msg));
msg_set_ack(bmsg, msg_ack(msg));
msg_set_bcast_ack(bmsg, msg_bcast_ack(msg));
- tipc_msg_bundle(bskb, *skb, mtu);
- *skb = bskb;
+ tipc_msg_bundle(_skb, msg, mtu);
+ *skb = _skb;
return true;
}
/**
* tipc_msg_reverse(): swap source and destination addresses and add error code
- * @buf: buffer containing message to be reversed
- * @dnode: return value: node where to send message after reversal
- * @err: error code to be set in message
- * Consumes buffer if failure
+ * @own_node: originating node id for reversed message
+ * @skb: buffer containing message to be reversed; may be replaced.
+ * @err: error code to be set in message, if any
+ * Consumes buffer at failure
* Returns true if success, otherwise false
*/
-bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode,
- int err)
+bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
{
- struct tipc_msg *msg = buf_msg(buf);
+ struct sk_buff *_skb = *skb;
+ struct tipc_msg *hdr = buf_msg(_skb);
struct tipc_msg ohdr;
- uint rdsz = min_t(uint, msg_data_sz(msg), MAX_FORWARD_SIZE);
+ int dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE);
- if (skb_linearize(buf))
+ if (skb_linearize(_skb))
goto exit;
- msg = buf_msg(buf);
- if (msg_dest_droppable(msg))
+ hdr = buf_msg(_skb);
+ if (msg_dest_droppable(hdr))
goto exit;
- if (msg_errcode(msg))
+ if (msg_errcode(hdr))
goto exit;
- memcpy(&ohdr, msg, msg_hdr_sz(msg));
- msg_set_errcode(msg, err);
- msg_set_origport(msg, msg_destport(&ohdr));
- msg_set_destport(msg, msg_origport(&ohdr));
- msg_set_prevnode(msg, own_addr);
- if (!msg_short(msg)) {
- msg_set_orignode(msg, msg_destnode(&ohdr));
- msg_set_destnode(msg, msg_orignode(&ohdr));
+
+ /* Take a copy of original header before altering message */
+ memcpy(&ohdr, hdr, msg_hdr_sz(hdr));
+
+ /* Never return SHORT header; expand by replacing buffer if necessary */
+ if (msg_short(hdr)) {
+ *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen);
+ if (!*skb)
+ goto exit;
+ memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen);
+ kfree_skb(_skb);
+ _skb = *skb;
+ hdr = buf_msg(_skb);
+ memcpy(hdr, &ohdr, BASIC_H_SIZE);
+ msg_set_hdr_sz(hdr, BASIC_H_SIZE);
}
- msg_set_size(msg, msg_hdr_sz(msg) + rdsz);
- skb_trim(buf, msg_size(msg));
- skb_orphan(buf);
- *dnode = msg_orignode(&ohdr);
+
+ /* Now reverse the concerned fields */
+ msg_set_errcode(hdr, err);
+ msg_set_origport(hdr, msg_destport(&ohdr));
+ msg_set_destport(hdr, msg_origport(&ohdr));
+ msg_set_destnode(hdr, msg_prevnode(&ohdr));
+ msg_set_prevnode(hdr, own_node);
+ msg_set_orignode(hdr, own_node);
+ msg_set_size(hdr, msg_hdr_sz(hdr) + dlen);
+ skb_trim(_skb, msg_size(hdr));
+ skb_orphan(_skb);
return true;
exit:
- kfree_skb(buf);
- *dnode = 0;
+ kfree_skb(_skb);
+ *skb = NULL;
return false;
}
/**
* tipc_msg_lookup_dest(): try to find new destination for named message
* @skb: the buffer containing the message.
- * @dnode: return value: next-hop node, if destination found
- * @err: return value: error code to use, if message to be rejected
+ * @err: error code to be used by caller if lookup fails
* Does not consume buffer
* Returns true if a destination is found, false otherwise
*/
-bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb,
- u32 *dnode, int *err)
+bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
{
struct tipc_msg *msg = buf_msg(skb);
- u32 dport;
- u32 own_addr = tipc_own_addr(net);
+ u32 dport, dnode;
+ u32 onode = tipc_own_addr(net);
if (!msg_isdata(msg))
return false;
@@ -522,17 +544,18 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb,
*err = -TIPC_ERR_NO_NAME;
if (skb_linearize(skb))
return false;
+ msg = buf_msg(skb);
if (msg_reroute_cnt(msg))
return false;
- *dnode = addr_domain(net, msg_lookup_scope(msg));
+ dnode = addr_domain(net, msg_lookup_scope(msg));
dport = tipc_nametbl_translate(net, msg_nametype(msg),
- msg_nameinst(msg), dnode);
+ msg_nameinst(msg), &dnode);
if (!dport)
return false;
msg_incr_reroute_cnt(msg);
- if (*dnode != own_addr)
- msg_set_prevnode(msg, own_addr);
- msg_set_destnode(msg, *dnode);
+ if (dnode != onode)
+ msg_set_prevnode(msg, onode);
+ msg_set_destnode(msg, dnode);
msg_set_destport(msg, dport);
*err = TIPC_OK;
return true;
@@ -541,18 +564,22 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb,
/* tipc_msg_reassemble() - clone a buffer chain of fragments and
* reassemble the clones into one message
*/
-struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list)
+bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *_skb;
struct sk_buff *frag = NULL;
struct sk_buff *head = NULL;
- int hdr_sz;
+ int hdr_len;
/* Copy header if single buffer */
if (skb_queue_len(list) == 1) {
skb = skb_peek(list);
- hdr_sz = skb_headroom(skb) + msg_hdr_sz(buf_msg(skb));
- return __pskb_copy(skb, hdr_sz, GFP_ATOMIC);
+ hdr_len = skb_headroom(skb) + msg_hdr_sz(buf_msg(skb));
+ _skb = __pskb_copy(skb, hdr_len, GFP_ATOMIC);
+ if (!_skb)
+ return false;
+ __skb_queue_tail(rcvq, _skb);
+ return true;
}
/* Clone all fragments and reassemble */
@@ -566,9 +593,41 @@ struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list)
if (!head)
goto error;
}
- return frag;
+ __skb_queue_tail(rcvq, frag);
+ return true;
error:
pr_warn("Failed do clone local mcast rcv buffer\n");
kfree_skb(head);
- return NULL;
+ return false;
+}
+
+/* __tipc_skb_queue_sorted(): sort pkt into list according to sequence number
+ * @list: list to be appended to
+ * @seqno: sequence number of buffer to add
+ * @skb: buffer to add
+ */
+void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
+ struct sk_buff *skb)
+{
+ struct sk_buff *_skb, *tmp;
+
+ if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) {
+ __skb_queue_head(list, skb);
+ return;
+ }
+
+ if (more(seqno, buf_seqno(skb_peek_tail(list)))) {
+ __skb_queue_tail(list, skb);
+ return;
+ }
+
+ skb_queue_walk_safe(list, _skb, tmp) {
+ if (more(seqno, buf_seqno(_skb)))
+ continue;
+ if (seqno == buf_seqno(_skb))
+ break;
+ __skb_queue_before(list, _skb, skb);
+ return;
+ }
+ kfree_skb(skb);
}
diff --git a/kernel/net/tipc/msg.h b/kernel/net/tipc/msg.h
index e1d3595e2..55778a0ae 100644
--- a/kernel/net/tipc/msg.h
+++ b/kernel/net/tipc/msg.h
@@ -38,6 +38,7 @@
#define _TIPC_MSG_H
#include <linux/tipc.h>
+#include "core.h"
/*
* Constants and routines used to read and write TIPC payload message headers
@@ -109,9 +110,9 @@ struct tipc_skb_cb {
struct sk_buff *tail;
bool validated;
bool wakeup_pending;
- bool bundling;
u16 chain_sz;
u16 chain_imp;
+ u16 ackers;
};
#define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0]))
@@ -313,12 +314,12 @@ static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n)
msg_set_bits(m, 1, 19, 0x3, n);
}
-static inline u32 msg_bcast_ack(struct tipc_msg *m)
+static inline u16 msg_bcast_ack(struct tipc_msg *m)
{
return msg_bits(m, 1, 0, 0xffff);
}
-static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n)
+static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 1, 0, 0xffff, n);
}
@@ -327,22 +328,22 @@ static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n)
/*
* Word 2
*/
-static inline u32 msg_ack(struct tipc_msg *m)
+static inline u16 msg_ack(struct tipc_msg *m)
{
return msg_bits(m, 2, 16, 0xffff);
}
-static inline void msg_set_ack(struct tipc_msg *m, u32 n)
+static inline void msg_set_ack(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 2, 16, 0xffff, n);
}
-static inline u32 msg_seqno(struct tipc_msg *m)
+static inline u16 msg_seqno(struct tipc_msg *m)
{
return msg_bits(m, 2, 0, 0xffff);
}
-static inline void msg_set_seqno(struct tipc_msg *m, u32 n)
+static inline void msg_set_seqno(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 2, 0, 0xffff, n);
}
@@ -352,18 +353,22 @@ static inline void msg_set_seqno(struct tipc_msg *m, u32 n)
*/
static inline u32 msg_importance(struct tipc_msg *m)
{
- if (unlikely(msg_user(m) == MSG_FRAGMENTER))
- return msg_bits(m, 5, 13, 0x7);
- if (likely(msg_isdata(m) && !msg_errcode(m)))
- return msg_user(m);
+ int usr = msg_user(m);
+
+ if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m)))
+ return usr;
+ if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))
+ return msg_bits(m, 9, 0, 0x7);
return TIPC_SYSTEM_IMPORTANCE;
}
static inline void msg_set_importance(struct tipc_msg *m, u32 i)
{
- if (unlikely(msg_user(m) == MSG_FRAGMENTER))
- msg_set_bits(m, 5, 13, 0x7, i);
- else if (likely(i < TIPC_SYSTEM_IMPORTANCE))
+ int usr = msg_user(m);
+
+ if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)))
+ msg_set_bits(m, 9, 0, 0x7, i);
+ else if (i < TIPC_SYSTEM_IMPORTANCE)
msg_set_user(m, i);
else
pr_warn("Trying to set illegal importance in message\n");
@@ -554,15 +559,6 @@ static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n)
msg_set_bits(m, 1, 15, 0x1fff, n);
}
-static inline bool msg_dup(struct tipc_msg *m)
-{
- if (likely(msg_user(m) != TUNNEL_PROTOCOL))
- return false;
- if (msg_type(m) != SYNCH_MSG)
- return false;
- return true;
-}
-
/*
* Word 2
*/
@@ -605,6 +601,11 @@ static inline u32 msg_last_bcast(struct tipc_msg *m)
return msg_bits(m, 4, 16, 0xffff);
}
+static inline u32 msg_bc_snd_nxt(struct tipc_msg *m)
+{
+ return msg_last_bcast(m) + 1;
+}
+
static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n)
{
msg_set_bits(m, 4, 16, 0xffff, n);
@@ -616,12 +617,12 @@ static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n)
}
-static inline u32 msg_next_sent(struct tipc_msg *m)
+static inline u16 msg_next_sent(struct tipc_msg *m)
{
return msg_bits(m, 4, 0, 0xffff);
}
-static inline void msg_set_next_sent(struct tipc_msg *m, u32 n)
+static inline void msg_set_next_sent(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 4, 0, 0xffff, n);
}
@@ -654,12 +655,12 @@ static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
/*
* Word 5
*/
-static inline u32 msg_session(struct tipc_msg *m)
+static inline u16 msg_session(struct tipc_msg *m)
{
return msg_bits(m, 5, 16, 0xffff);
}
-static inline void msg_set_session(struct tipc_msg *m, u32 n)
+static inline void msg_set_session(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 5, 16, 0xffff, n);
}
@@ -722,12 +723,12 @@ static inline char *msg_media_addr(struct tipc_msg *m)
/*
* Word 9
*/
-static inline u32 msg_msgcnt(struct tipc_msg *m)
+static inline u16 msg_msgcnt(struct tipc_msg *m)
{
return msg_bits(m, 9, 16, 0xffff);
}
-static inline void msg_set_msgcnt(struct tipc_msg *m, u32 n)
+static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 9, 16, 0xffff, n);
}
@@ -762,25 +763,46 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
msg_set_bits(m, 9, 0, 0xffff, n);
}
+static inline bool msg_peer_link_is_up(struct tipc_msg *m)
+{
+ if (likely(msg_user(m) != LINK_PROTOCOL))
+ return true;
+ if (msg_type(m) == STATE_MSG)
+ return true;
+ return false;
+}
+
+static inline bool msg_peer_node_is_up(struct tipc_msg *m)
+{
+ if (msg_peer_link_is_up(m))
+ return true;
+ return msg_redundant_link(m);
+}
+
struct sk_buff *tipc_buf_acquire(u32 size);
bool tipc_msg_validate(struct sk_buff *skb);
-bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode,
- int err);
+bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type,
u32 hsize, u32 destnode);
struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz,
uint data_sz, u32 dnode, u32 onode,
u32 dport, u32 oport, int errcode);
int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf);
-bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu);
-
-bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode);
+bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu);
+bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
+ u32 mtu, u32 dnode);
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos);
int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
int offset, int dsz, int mtu, struct sk_buff_head *list);
-bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, u32 *dnode,
- int *err);
-struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list);
+bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
+bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
+void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
+ struct sk_buff *skb);
+
+static inline u16 buf_seqno(struct sk_buff *skb)
+{
+ return msg_seqno(buf_msg(skb));
+}
/* tipc_skb_peek(): peek and reserve first buffer in list
* @list: list to be peeked in
@@ -848,26 +870,33 @@ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list,
return skb;
}
-/* tipc_skb_queue_tail(): add buffer to tail of list;
- * @list: list to be appended to
- * @skb: buffer to append. Always appended
- * @dport: the destination port of the buffer
- * returns true if dport differs from previous destination
+/* tipc_skb_queue_splice_tail - append an skb list to lock protected list
+ * @list: the new list to append. Not lock protected
+ * @head: target list. Lock protected.
*/
-static inline bool tipc_skb_queue_tail(struct sk_buff_head *list,
- struct sk_buff *skb, u32 dport)
+static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list,
+ struct sk_buff_head *head)
{
- struct sk_buff *_skb = NULL;
- bool rv = false;
+ spin_lock_bh(&head->lock);
+ skb_queue_splice_tail(list, head);
+ spin_unlock_bh(&head->lock);
+}
+
+/* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists
+ * @list: the new list to add. Lock protected. Will be reinitialized
+ * @head: target list. Lock protected.
+ */
+static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list,
+ struct sk_buff_head *head)
+{
+ struct sk_buff_head tmp;
+
+ __skb_queue_head_init(&tmp);
spin_lock_bh(&list->lock);
- _skb = skb_peek_tail(list);
- if (!_skb || (msg_destport(buf_msg(_skb)) != dport) ||
- (skb_queue_len(list) > 32))
- rv = true;
- __skb_queue_tail(list, skb);
+ skb_queue_splice_tail_init(list, &tmp);
spin_unlock_bh(&list->lock);
- return rv;
+ tipc_skb_queue_splice_tail(&tmp, head);
}
#endif
diff --git a/kernel/net/tipc/name_distr.c b/kernel/net/tipc/name_distr.c
index 41e7b7e4d..c07612bab 100644
--- a/kernel/net/tipc/name_distr.c
+++ b/kernel/net/tipc/name_distr.c
@@ -96,13 +96,13 @@ void named_cluster_distribute(struct net *net, struct sk_buff *skb)
dnode = node->addr;
if (in_own_node(net, dnode))
continue;
- if (!tipc_node_active_links(node))
+ if (!tipc_node_is_up(node))
continue;
oskb = pskb_copy(skb, GFP_ATOMIC);
if (!oskb)
break;
msg_set_destnode(buf_msg(oskb), dnode);
- tipc_link_xmit_skb(net, oskb, dnode, dnode);
+ tipc_node_xmit_skb(net, oskb, dnode, 0);
}
rcu_read_unlock();
@@ -223,7 +223,7 @@ void tipc_named_node_up(struct net *net, u32 dnode)
&tn->nametbl->publ_list[TIPC_ZONE_SCOPE]);
rcu_read_unlock();
- tipc_link_xmit(net, &head, dnode, dnode);
+ tipc_node_xmit(net, &head, dnode, 0);
}
static void tipc_publ_subscribe(struct net *net, struct publication *publ,
diff --git a/kernel/net/tipc/name_table.c b/kernel/net/tipc/name_table.c
index ab0ac62a1..0f47f08bf 100644
--- a/kernel/net/tipc/name_table.c
+++ b/kernel/net/tipc/name_table.c
@@ -330,13 +330,9 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
/* Any subscriptions waiting for notification? */
list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
- tipc_subscr_report_overlap(s,
- publ->lower,
- publ->upper,
- TIPC_PUBLISHED,
- publ->ref,
- publ->node,
- created_subseq);
+ tipc_subscrp_report_overlap(s, publ->lower, publ->upper,
+ TIPC_PUBLISHED, publ->ref,
+ publ->node, created_subseq);
}
return publ;
}
@@ -404,13 +400,9 @@ found:
/* Notify any waiting subscriptions */
list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
- tipc_subscr_report_overlap(s,
- publ->lower,
- publ->upper,
- TIPC_WITHDRAWN,
- publ->ref,
- publ->node,
- removed_subseq);
+ tipc_subscrp_report_overlap(s, publ->lower, publ->upper,
+ TIPC_WITHDRAWN, publ->ref,
+ publ->node, removed_subseq);
}
return publ;
@@ -432,19 +424,17 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
return;
while (sseq != &nseq->sseqs[nseq->first_free]) {
- if (tipc_subscr_overlap(s, sseq->lower, sseq->upper)) {
+ if (tipc_subscrp_check_overlap(s, sseq->lower, sseq->upper)) {
struct publication *crs;
struct name_info *info = sseq->info;
int must_report = 1;
list_for_each_entry(crs, &info->zone_list, zone_list) {
- tipc_subscr_report_overlap(s,
- sseq->lower,
- sseq->upper,
- TIPC_PUBLISHED,
- crs->ref,
- crs->node,
- must_report);
+ tipc_subscrp_report_overlap(s, sseq->lower,
+ sseq->upper,
+ TIPC_PUBLISHED,
+ crs->ref, crs->node,
+ must_report);
must_report = 0;
}
}
diff --git a/kernel/net/tipc/net.c b/kernel/net/tipc/net.c
index a54f3cbe2..77bf9113c 100644
--- a/kernel/net/tipc/net.c
+++ b/kernel/net/tipc/net.c
@@ -40,6 +40,7 @@
#include "subscr.h"
#include "socket.h"
#include "node.h"
+#include "bcast.h"
static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
[TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC },
@@ -111,14 +112,11 @@ int tipc_net_start(struct net *net, u32 addr)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
char addr_string[16];
- int res;
tn->own_addr = addr;
tipc_named_reinit(net);
tipc_sk_reinit(net);
- res = tipc_bclink_init(net);
- if (res)
- return res;
+ tipc_bcast_reinit(net);
tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr,
TIPC_ZONE_SCOPE, 0, tn->own_addr);
@@ -141,7 +139,6 @@ void tipc_net_stop(struct net *net)
tn->own_addr);
rtnl_lock();
tipc_bearer_stop(net);
- tipc_bclink_stop(net);
tipc_node_stop(net);
rtnl_unlock();
diff --git a/kernel/net/tipc/netlink_compat.c b/kernel/net/tipc/netlink_compat.c
index ce9121e8e..1eadc95e1 100644
--- a/kernel/net/tipc/netlink_compat.c
+++ b/kernel/net/tipc/netlink_compat.c
@@ -55,6 +55,7 @@ struct tipc_nl_compat_msg {
int rep_type;
int rep_size;
int req_type;
+ struct net *net;
struct sk_buff *rep;
struct tlv_desc *req;
struct sock *dst_sk;
@@ -68,7 +69,8 @@ struct tipc_nl_compat_cmd_dump {
struct tipc_nl_compat_cmd_doit {
int (*doit)(struct sk_buff *skb, struct genl_info *info);
- int (*transcode)(struct sk_buff *skb, struct tipc_nl_compat_msg *msg);
+ int (*transcode)(struct tipc_nl_compat_cmd_doit *cmd,
+ struct sk_buff *skb, struct tipc_nl_compat_msg *msg);
};
static int tipc_skb_tailroom(struct sk_buff *skb)
@@ -281,7 +283,7 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd,
if (!trans_buf)
return -ENOMEM;
- err = (*cmd->transcode)(trans_buf, msg);
+ err = (*cmd->transcode)(cmd, trans_buf, msg);
if (err)
goto trans_out;
@@ -353,7 +355,8 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg,
nla_len(bearer[TIPC_NLA_BEARER_NAME]));
}
-static int tipc_nl_compat_bearer_enable(struct sk_buff *skb,
+static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd,
+ struct sk_buff *skb,
struct tipc_nl_compat_msg *msg)
{
struct nlattr *prop;
@@ -385,7 +388,8 @@ static int tipc_nl_compat_bearer_enable(struct sk_buff *skb,
return 0;
}
-static int tipc_nl_compat_bearer_disable(struct sk_buff *skb,
+static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd,
+ struct sk_buff *skb,
struct tipc_nl_compat_msg *msg)
{
char *name;
@@ -576,11 +580,81 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
&link_info, sizeof(link_info));
}
-static int tipc_nl_compat_link_set(struct sk_buff *skb,
- struct tipc_nl_compat_msg *msg)
+static int __tipc_add_link_prop(struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg,
+ struct tipc_link_config *lc)
+{
+ switch (msg->cmd) {
+ case TIPC_CMD_SET_LINK_PRI:
+ return nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value));
+ case TIPC_CMD_SET_LINK_TOL:
+ return nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value));
+ case TIPC_CMD_SET_LINK_WINDOW:
+ return nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value));
+ }
+
+ return -EINVAL;
+}
+
+static int tipc_nl_compat_media_set(struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg)
{
- struct nlattr *link;
struct nlattr *prop;
+ struct nlattr *media;
+ struct tipc_link_config *lc;
+
+ lc = (struct tipc_link_config *)TLV_DATA(msg->req);
+
+ media = nla_nest_start(skb, TIPC_NLA_MEDIA);
+ if (!media)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name))
+ return -EMSGSIZE;
+
+ prop = nla_nest_start(skb, TIPC_NLA_MEDIA_PROP);
+ if (!prop)
+ return -EMSGSIZE;
+
+ __tipc_add_link_prop(skb, msg, lc);
+ nla_nest_end(skb, prop);
+ nla_nest_end(skb, media);
+
+ return 0;
+}
+
+static int tipc_nl_compat_bearer_set(struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg)
+{
+ struct nlattr *prop;
+ struct nlattr *bearer;
+ struct tipc_link_config *lc;
+
+ lc = (struct tipc_link_config *)TLV_DATA(msg->req);
+
+ bearer = nla_nest_start(skb, TIPC_NLA_BEARER);
+ if (!bearer)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name))
+ return -EMSGSIZE;
+
+ prop = nla_nest_start(skb, TIPC_NLA_BEARER_PROP);
+ if (!prop)
+ return -EMSGSIZE;
+
+ __tipc_add_link_prop(skb, msg, lc);
+ nla_nest_end(skb, prop);
+ nla_nest_end(skb, bearer);
+
+ return 0;
+}
+
+static int __tipc_nl_compat_link_set(struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg)
+{
+ struct nlattr *prop;
+ struct nlattr *link;
struct tipc_link_config *lc;
lc = (struct tipc_link_config *)TLV_DATA(msg->req);
@@ -596,24 +670,40 @@ static int tipc_nl_compat_link_set(struct sk_buff *skb,
if (!prop)
return -EMSGSIZE;
- if (msg->cmd == TIPC_CMD_SET_LINK_PRI) {
- if (nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value)))
- return -EMSGSIZE;
- } else if (msg->cmd == TIPC_CMD_SET_LINK_TOL) {
- if (nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value)))
- return -EMSGSIZE;
- } else if (msg->cmd == TIPC_CMD_SET_LINK_WINDOW) {
- if (nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value)))
- return -EMSGSIZE;
- }
-
+ __tipc_add_link_prop(skb, msg, lc);
nla_nest_end(skb, prop);
nla_nest_end(skb, link);
return 0;
}
-static int tipc_nl_compat_link_reset_stats(struct sk_buff *skb,
+static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd,
+ struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg)
+{
+ struct tipc_link_config *lc;
+ struct tipc_bearer *bearer;
+ struct tipc_media *media;
+
+ lc = (struct tipc_link_config *)TLV_DATA(msg->req);
+
+ media = tipc_media_find(lc->name);
+ if (media) {
+ cmd->doit = &tipc_nl_media_set;
+ return tipc_nl_compat_media_set(skb, msg);
+ }
+
+ bearer = tipc_bearer_find(msg->net, lc->name);
+ if (bearer) {
+ cmd->doit = &tipc_nl_bearer_set;
+ return tipc_nl_compat_bearer_set(skb, msg);
+ }
+
+ return __tipc_nl_compat_link_set(skb, msg);
+}
+
+static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd,
+ struct sk_buff *skb,
struct tipc_nl_compat_msg *msg)
{
char *name;
@@ -851,7 +941,8 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg,
sizeof(node_info));
}
-static int tipc_nl_compat_net_set(struct sk_buff *skb,
+static int tipc_nl_compat_net_set(struct tipc_nl_compat_cmd_doit *cmd,
+ struct sk_buff *skb,
struct tipc_nl_compat_msg *msg)
{
u32 val;
@@ -1007,7 +1098,6 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
struct nlmsghdr *req_nlh;
struct nlmsghdr *rep_nlh;
struct tipc_genlmsghdr *req_userhdr = info->userhdr;
- struct net *net = genl_info_net(info);
memset(&msg, 0, sizeof(msg));
@@ -1015,6 +1105,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN;
msg.cmd = req_userhdr->cmd;
msg.dst_sk = info->dst_sk;
+ msg.net = genl_info_net(info);
if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) {
msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN);
@@ -1023,14 +1114,14 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
}
len = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN);
- if (TLV_GET_LEN(msg.req) && !TLV_OK(msg.req, len)) {
+ if (len && !TLV_OK(msg.req, len)) {
msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED);
err = -EOPNOTSUPP;
goto send;
}
err = tipc_nl_compat_handle(&msg);
- if (err == -EOPNOTSUPP)
+ if ((err == -EOPNOTSUPP) || (err == -EPERM))
msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED);
else if (err == -EINVAL)
msg.rep = tipc_get_err_tlv(TIPC_CFG_TLV_ERROR);
@@ -1043,7 +1134,7 @@ send:
rep_nlh = nlmsg_hdr(msg.rep);
memcpy(rep_nlh, info->nlhdr, len);
rep_nlh->nlmsg_len = msg.rep->len;
- genlmsg_unicast(net, msg.rep, NETLINK_CB(skb).portid);
+ genlmsg_unicast(msg.net, msg.rep, NETLINK_CB(skb).portid);
return err;
}
diff --git a/kernel/net/tipc/node.c b/kernel/net/tipc/node.c
index 22c059ad2..3926b561f 100644
--- a/kernel/net/tipc/node.c
+++ b/kernel/net/tipc/node.c
@@ -1,7 +1,7 @@
/*
* net/tipc/node.c: TIPC node management routines
*
- * Copyright (c) 2000-2006, 2012-2014, Ericsson AB
+ * Copyright (c) 2000-2006, 2012-2015, Ericsson AB
* Copyright (c) 2005-2006, 2010-2014, Wind River Systems
* All rights reserved.
*
@@ -39,10 +39,42 @@
#include "node.h"
#include "name_distr.h"
#include "socket.h"
+#include "bcast.h"
+#include "discover.h"
-static void node_lost_contact(struct tipc_node *n_ptr);
-static void node_established_contact(struct tipc_node *n_ptr);
+/* Node FSM states and events:
+ */
+enum {
+ SELF_DOWN_PEER_DOWN = 0xdd,
+ SELF_UP_PEER_UP = 0xaa,
+ SELF_DOWN_PEER_LEAVING = 0xd1,
+ SELF_UP_PEER_COMING = 0xac,
+ SELF_COMING_PEER_UP = 0xca,
+ SELF_LEAVING_PEER_DOWN = 0x1d,
+ NODE_FAILINGOVER = 0xf0,
+ NODE_SYNCHING = 0xcc
+};
+
+enum {
+ SELF_ESTABL_CONTACT_EVT = 0xece,
+ SELF_LOST_CONTACT_EVT = 0x1ce,
+ PEER_ESTABL_CONTACT_EVT = 0x9ece,
+ PEER_LOST_CONTACT_EVT = 0x91ce,
+ NODE_FAILOVER_BEGIN_EVT = 0xfbe,
+ NODE_FAILOVER_END_EVT = 0xfee,
+ NODE_SYNCH_BEGIN_EVT = 0xcbe,
+ NODE_SYNCH_END_EVT = 0xcee
+};
+
+static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr **maddr);
+static void tipc_node_link_down(struct tipc_node *n, int bearer_id,
+ bool delete);
+static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq);
static void tipc_node_delete(struct tipc_node *node);
+static void tipc_node_timeout(unsigned long data);
+static void tipc_node_fsm_evt(struct tipc_node *n, int evt);
struct tipc_sock_conn {
u32 port;
@@ -109,7 +141,7 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr)
return NULL;
}
-struct tipc_node *tipc_node_create(struct net *net, u32 addr)
+struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_node *n_ptr, *temp_node;
@@ -125,31 +157,66 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr)
}
n_ptr->addr = addr;
n_ptr->net = net;
+ n_ptr->capabilities = capabilities;
kref_init(&n_ptr->kref);
spin_lock_init(&n_ptr->lock);
INIT_HLIST_NODE(&n_ptr->hash);
INIT_LIST_HEAD(&n_ptr->list);
INIT_LIST_HEAD(&n_ptr->publ_list);
INIT_LIST_HEAD(&n_ptr->conn_sks);
- __skb_queue_head_init(&n_ptr->bclink.deferdq);
+ skb_queue_head_init(&n_ptr->bc_entry.namedq);
+ skb_queue_head_init(&n_ptr->bc_entry.inputq1);
+ __skb_queue_head_init(&n_ptr->bc_entry.arrvq);
+ skb_queue_head_init(&n_ptr->bc_entry.inputq2);
+ n_ptr->state = SELF_DOWN_PEER_LEAVING;
+ n_ptr->signature = INVALID_NODE_SIG;
+ n_ptr->active_links[0] = INVALID_BEARER_ID;
+ n_ptr->active_links[1] = INVALID_BEARER_ID;
+ if (!tipc_link_bc_create(net, tipc_own_addr(net), n_ptr->addr,
+ U16_MAX, tipc_bc_sndlink(net)->window,
+ n_ptr->capabilities,
+ &n_ptr->bc_entry.inputq1,
+ &n_ptr->bc_entry.namedq,
+ tipc_bc_sndlink(net),
+ &n_ptr->bc_entry.link)) {
+ pr_warn("Broadcast rcv link creation failed, no memory\n");
+ kfree(n_ptr);
+ n_ptr = NULL;
+ goto exit;
+ }
+ tipc_node_get(n_ptr);
+ setup_timer(&n_ptr->timer, tipc_node_timeout, (unsigned long)n_ptr);
+ n_ptr->keepalive_intv = U32_MAX;
hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]);
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
if (n_ptr->addr < temp_node->addr)
break;
}
list_add_tail_rcu(&n_ptr->list, &temp_node->list);
- n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN;
- n_ptr->signature = INVALID_NODE_SIG;
- tipc_node_get(n_ptr);
exit:
spin_unlock_bh(&tn->node_list_lock);
return n_ptr;
}
+static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
+{
+ unsigned long tol = l->tolerance;
+ unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
+ unsigned long keepalive_intv = msecs_to_jiffies(intv);
+
+ /* Link with lowest tolerance determines timer interval */
+ if (keepalive_intv < n->keepalive_intv)
+ n->keepalive_intv = keepalive_intv;
+
+ /* Ensure link's abort limit corresponds to current interval */
+ l->abort_limit = l->tolerance / jiffies_to_msecs(n->keepalive_intv);
+}
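
    The arithmetic above is terse; the following standalone sketch (not part of the
    patch; a userspace stand-in that keeps everything in milliseconds and omits the
    jiffies conversion) shows how a link tolerance maps to the node keepalive
    interval and the per-link abort limit.

	#include <stdio.h>

	static unsigned long node_keepalive = ~0UL;	/* mirrors n->keepalive_intv starting at U32_MAX */

	static unsigned long calc_abort_limit(unsigned long tol_ms)
	{
		/* interval is tolerance/4, capped at 500 ms */
		unsigned long intv = (tol_ms / 4 > 500) ? 500 : tol_ms / 4;

		if (intv < node_keepalive)	/* link with lowest tolerance wins */
			node_keepalive = intv;

		return tol_ms / node_keepalive;	/* probes tolerated before link abort */
	}

	int main(void)
	{
		printf("tol=1500ms -> abort_limit=%lu\n", calc_abort_limit(1500)); /* 4 */
		printf("tol=4000ms -> abort_limit=%lu\n", calc_abort_limit(4000)); /* 10 */
		return 0;
	}
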
+
static void tipc_node_delete(struct tipc_node *node)
{
list_del_rcu(&node->list);
hlist_del_rcu(&node->hash);
+ kfree(node->bc_entry.link);
kfree_rcu(node, rcu);
}
@@ -159,8 +226,11 @@ void tipc_node_stop(struct net *net)
struct tipc_node *node, *t_node;
spin_lock_bh(&tn->node_list_lock);
- list_for_each_entry_safe(node, t_node, &tn->node_list, list)
+ list_for_each_entry_safe(node, t_node, &tn->node_list, list) {
+ if (del_timer(&node->timer))
+ tipc_node_put(node);
tipc_node_put(node);
+ }
spin_unlock_bh(&tn->node_list_lock);
}
@@ -218,212 +288,604 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
tipc_node_put(node);
}
+/* tipc_node_timeout - handle expiration of node timer
+ */
+static void tipc_node_timeout(unsigned long data)
+{
+ struct tipc_node *n = (struct tipc_node *)data;
+ struct tipc_link_entry *le;
+ struct sk_buff_head xmitq;
+ int bearer_id;
+ int rc = 0;
+
+ __skb_queue_head_init(&xmitq);
+
+ for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+ tipc_node_lock(n);
+ le = &n->links[bearer_id];
+ if (le->link) {
+ /* Link tolerance may change asynchronously: */
+ tipc_node_calculate_timer(n, le->link);
+ rc = tipc_link_timeout(le->link, &xmitq);
+ }
+ tipc_node_unlock(n);
+ tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr);
+ if (rc & TIPC_LINK_DOWN_EVT)
+ tipc_node_link_down(n, bearer_id, false);
+ }
+ if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
+ tipc_node_get(n);
+ tipc_node_put(n);
+}
+
/**
- * tipc_node_link_up - handle addition of link
- *
+ * __tipc_node_link_up - handle addition of link
+ * Node lock must be held by caller
* Link becomes active (alone or shared) or standby, depending on its priority.
*/
-void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
+ struct sk_buff_head *xmitq)
{
- struct tipc_link **active = &n_ptr->active_links[0];
+ int *slot0 = &n->active_links[0];
+ int *slot1 = &n->active_links[1];
+ struct tipc_link *ol = node_active_link(n, 0);
+ struct tipc_link *nl = n->links[bearer_id].link;
- n_ptr->working_links++;
- n_ptr->action_flags |= TIPC_NOTIFY_LINK_UP;
- n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id;
+ if (!nl)
+ return;
- pr_debug("Established link <%s> on network plane %c\n",
- l_ptr->name, l_ptr->net_plane);
+ tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT);
+ if (!tipc_link_is_up(nl))
+ return;
- if (!active[0]) {
- active[0] = active[1] = l_ptr;
- node_established_contact(n_ptr);
- goto exit;
- }
- if (l_ptr->priority < active[0]->priority) {
- pr_debug("New link <%s> becomes standby\n", l_ptr->name);
- goto exit;
+ n->working_links++;
+ n->action_flags |= TIPC_NOTIFY_LINK_UP;
+ n->link_id = nl->peer_bearer_id << 16 | bearer_id;
+
+ /* Leave room for tunnel header when returning 'mtu' to users: */
+ n->links[bearer_id].mtu = nl->mtu - INT_H_SIZE;
+
+ tipc_bearer_add_dest(n->net, bearer_id, n->addr);
+ tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id);
+
+ pr_debug("Established link <%s> on network plane %c\n",
+ nl->name, nl->net_plane);
+
+ /* First link? => give it both slots */
+ if (!ol) {
+ *slot0 = bearer_id;
+ *slot1 = bearer_id;
+ tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT);
+ n->action_flags |= TIPC_NOTIFY_NODE_UP;
+ tipc_bcast_add_peer(n->net, nl, xmitq);
+ return;
}
- tipc_link_dup_queue_xmit(active[0], l_ptr);
- if (l_ptr->priority == active[0]->priority) {
- active[0] = l_ptr;
- goto exit;
+
+ /* Second link => redistribute slots */
+ if (nl->priority > ol->priority) {
+ pr_debug("Old link <%s> becomes standby\n", ol->name);
+ *slot0 = bearer_id;
+ *slot1 = bearer_id;
+ tipc_link_set_active(nl, true);
+ tipc_link_set_active(ol, false);
+ } else if (nl->priority == ol->priority) {
+ tipc_link_set_active(nl, true);
+ *slot1 = bearer_id;
+ } else {
+ pr_debug("New link <%s> is standby\n", nl->name);
}
- pr_debug("Old link <%s> becomes standby\n", active[0]->name);
- if (active[1] != active[0])
- pr_debug("Old link <%s> becomes standby\n", active[1]->name);
- active[0] = active[1] = l_ptr;
-exit:
- /* Leave room for changeover header when returning 'mtu' to users: */
- n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
- n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
+
+ /* Prepare synchronization with first link */
+ tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq);
}
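
    For illustration only (not part of the patch): a minimal sketch of how the slot
    redistribution above behaves when a second link comes up, assuming two links
    identified by small integer bearer ids and compared purely by priority.

	#include <stdio.h>

	static void assign_slots(int old_id, int old_prio, int new_id, int new_prio,
				 int *slot0, int *slot1)
	{
		*slot0 = old_id;
		*slot1 = old_id;
		if (new_prio > old_prio) {		/* old link becomes standby */
			*slot0 = new_id;
			*slot1 = new_id;
		} else if (new_prio == old_prio) {	/* load share over both links */
			*slot1 = new_id;
		}
		/* new_prio < old_prio: new link stays standby, slots unchanged */
	}

	int main(void)
	{
		int s0, s1;

		assign_slots(0, 10, 1, 10, &s0, &s1);
		printf("equal priority:  slot0=%d slot1=%d\n", s0, s1);	/* 0 and 1 */
		assign_slots(0, 10, 1, 20, &s0, &s1);
		printf("higher priority: slot0=%d slot1=%d\n", s0, s1);	/* 1 and 1 */
		return 0;
	}
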
/**
- * node_select_active_links - select active link
+ * tipc_node_link_up - handle addition of link
+ *
+ * Link becomes active (alone or shared) or standby, depending on its priority.
*/
-static void node_select_active_links(struct tipc_node *n_ptr)
+static void tipc_node_link_up(struct tipc_node *n, int bearer_id,
+ struct sk_buff_head *xmitq)
{
- struct tipc_link **active = &n_ptr->active_links[0];
- u32 i;
- u32 highest_prio = 0;
+ tipc_node_lock(n);
+ __tipc_node_link_up(n, bearer_id, xmitq);
+ tipc_node_unlock(n);
+}
- active[0] = active[1] = NULL;
+/**
+ * __tipc_node_link_down - handle loss of link
+ */
+static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr **maddr)
+{
+ struct tipc_link_entry *le = &n->links[*bearer_id];
+ int *slot0 = &n->active_links[0];
+ int *slot1 = &n->active_links[1];
+ int i, highest = 0;
+ struct tipc_link *l, *_l, *tnl;
+
+ l = n->links[*bearer_id].link;
+ if (!l || tipc_link_is_reset(l))
+ return;
- for (i = 0; i < MAX_BEARERS; i++) {
- struct tipc_link *l_ptr = n_ptr->links[i];
+ n->working_links--;
+ n->action_flags |= TIPC_NOTIFY_LINK_DOWN;
+ n->link_id = l->peer_bearer_id << 16 | *bearer_id;
- if (!l_ptr || !tipc_link_is_up(l_ptr) ||
- (l_ptr->priority < highest_prio))
- continue;
+ tipc_bearer_remove_dest(n->net, *bearer_id, n->addr);
- if (l_ptr->priority > highest_prio) {
- highest_prio = l_ptr->priority;
- active[0] = active[1] = l_ptr;
- } else {
- active[1] = l_ptr;
+ pr_debug("Lost link <%s> on network plane %c\n",
+ l->name, l->net_plane);
+
+ /* Select new active link if any available */
+ *slot0 = INVALID_BEARER_ID;
+ *slot1 = INVALID_BEARER_ID;
+ for (i = 0; i < MAX_BEARERS; i++) {
+ _l = n->links[i].link;
+ if (!_l || !tipc_link_is_up(_l))
+ continue;
+ if (_l == l)
+ continue;
+ if (_l->priority < highest)
+ continue;
+ if (_l->priority > highest) {
+ highest = _l->priority;
+ *slot0 = i;
+ *slot1 = i;
+ continue;
}
+ *slot1 = i;
}
+
+ if (!tipc_node_is_up(n)) {
+ if (tipc_link_peer_is_down(l))
+ tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT);
+ tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT);
+ tipc_link_fsm_evt(l, LINK_RESET_EVT);
+ tipc_link_reset(l);
+ tipc_link_build_reset_msg(l, xmitq);
+ *maddr = &n->links[*bearer_id].maddr;
+ node_lost_contact(n, &le->inputq);
+ tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id);
+ return;
+ }
+ tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id);
+
+ /* There is still a working link => initiate failover */
+ tnl = node_active_link(n, 0);
+ tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT);
+ tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
+ n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1);
+ tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq);
+ tipc_link_reset(l);
+ tipc_link_fsm_evt(l, LINK_RESET_EVT);
+ tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT);
+ tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT);
+ *maddr = &n->links[tnl->bearer_id].maddr;
+ *bearer_id = tnl->bearer_id;
}
-/**
- * tipc_node_link_down - handle loss of link
- */
-void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
{
- struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
- struct tipc_link **active;
-
- n_ptr->working_links--;
- n_ptr->action_flags |= TIPC_NOTIFY_LINK_DOWN;
- n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id;
+ struct tipc_link_entry *le = &n->links[bearer_id];
+ struct tipc_link *l = le->link;
+ struct tipc_media_addr *maddr;
+ struct sk_buff_head xmitq;
- if (!tipc_link_is_active(l_ptr)) {
- pr_debug("Lost standby link <%s> on network plane %c\n",
- l_ptr->name, l_ptr->net_plane);
+ if (!l)
return;
- }
- pr_debug("Lost link <%s> on network plane %c\n",
- l_ptr->name, l_ptr->net_plane);
-
- active = &n_ptr->active_links[0];
- if (active[0] == l_ptr)
- active[0] = active[1];
- if (active[1] == l_ptr)
- active[1] = active[0];
- if (active[0] == l_ptr)
- node_select_active_links(n_ptr);
- if (tipc_node_is_up(n_ptr))
- tipc_link_failover_send_queue(l_ptr);
- else
- node_lost_contact(n_ptr);
- /* Leave room for changeover header when returning 'mtu' to users: */
- if (active[0]) {
- n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
- n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
- return;
- }
- /* Loopback link went down? No fragmentation needed from now on. */
- if (n_ptr->addr == tn->own_addr) {
- n_ptr->act_mtus[0] = MAX_MSG_SIZE;
- n_ptr->act_mtus[1] = MAX_MSG_SIZE;
+ __skb_queue_head_init(&xmitq);
+
+ tipc_node_lock(n);
+ if (!tipc_link_is_establishing(l)) {
+ __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr);
+ if (delete) {
+ kfree(l);
+ le->link = NULL;
+ n->link_cnt--;
+ }
+ } else {
+ /* Defuse pending tipc_node_link_up() */
+ tipc_link_fsm_evt(l, LINK_RESET_EVT);
}
+ tipc_node_unlock(n);
+ tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);
+ tipc_sk_rcv(n->net, &le->inputq);
}
-int tipc_node_active_links(struct tipc_node *n_ptr)
+bool tipc_node_is_up(struct tipc_node *n)
{
- return n_ptr->active_links[0] != NULL;
+ return n->active_links[0] != INVALID_BEARER_ID;
}
-int tipc_node_is_up(struct tipc_node *n_ptr)
+void tipc_node_check_dest(struct net *net, u32 onode,
+ struct tipc_bearer *b,
+ u16 capabilities, u32 signature,
+ struct tipc_media_addr *maddr,
+ bool *respond, bool *dupl_addr)
{
- return tipc_node_active_links(n_ptr);
+ struct tipc_node *n;
+ struct tipc_link *l;
+ struct tipc_link_entry *le;
+ bool addr_match = false;
+ bool sign_match = false;
+ bool link_up = false;
+ bool accept_addr = false;
+ bool reset = true;
+ char *if_name;
+
+ *dupl_addr = false;
+ *respond = false;
+
+ n = tipc_node_create(net, onode, capabilities);
+ if (!n)
+ return;
+
+ tipc_node_lock(n);
+
+ le = &n->links[b->identity];
+
+ /* Prepare to validate requesting node's signature and media address */
+ l = le->link;
+ link_up = l && tipc_link_is_up(l);
+ addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr));
+ sign_match = (signature == n->signature);
+
+ /* These three flags give us eight permutations: */
+
+ if (sign_match && addr_match && link_up) {
+ /* All is fine. Do nothing. */
+ reset = false;
+ } else if (sign_match && addr_match && !link_up) {
+ /* Respond. The link will come up in due time */
+ *respond = true;
+ } else if (sign_match && !addr_match && link_up) {
+ /* Peer has changed i/f address without rebooting.
+ * If so, the link will reset soon, and the next
+ * discovery will be accepted. So we can ignore it.
+ * It may also be a cloned or malicious peer having
+ * chosen the same node address and signature as an
+ * existing one.
+ * Ignore requests until the link goes down, if ever.
+ */
+ *dupl_addr = true;
+ } else if (sign_match && !addr_match && !link_up) {
+ /* Peer link has changed i/f address without rebooting.
+ * It may also be a cloned or malicious peer; we can't
+ * distinguish between the two.
+ * The signature is correct, so we must accept.
+ */
+ accept_addr = true;
+ *respond = true;
+ } else if (!sign_match && addr_match && link_up) {
+ /* Peer node rebooted. Two possibilities:
+ * - Delayed re-discovery; this link endpoint has already
+ * reset and re-established contact with the peer, before
+ * receiving a discovery message from that node.
+ * (The peer happened to receive one from this node first).
+ * - The peer came back so fast that our side has not
+ * discovered it yet. Probing from this side will soon
+ * reset the link, since there can be no working link
+ * endpoint at the peer end, and the link will re-establish.
+ * Accept the signature, since it comes from a known peer.
+ */
+ n->signature = signature;
+ } else if (!sign_match && addr_match && !link_up) {
+ /* The peer node has rebooted.
+ * Accept signature, since it is a known peer.
+ */
+ n->signature = signature;
+ *respond = true;
+ } else if (!sign_match && !addr_match && link_up) {
+ /* Peer rebooted with new address, or a new/duplicate peer.
+ * Ignore until the link goes down, if ever.
+ */
+ *dupl_addr = true;
+ } else if (!sign_match && !addr_match && !link_up) {
+ /* Peer rebooted with new address, or it is a new peer.
+ * Accept signature and address.
+ */
+ n->signature = signature;
+ accept_addr = true;
+ *respond = true;
+ }
+
+ if (!accept_addr)
+ goto exit;
+
+ /* Now create new link if not already existing */
+ if (!l) {
+ if (n->link_cnt == 2) {
+ pr_warn("Cannot establish 3rd link to %x\n", n->addr);
+ goto exit;
+ }
+ if_name = strchr(b->name, ':') + 1;
+ if (!tipc_link_create(net, if_name, b->identity, b->tolerance,
+ b->net_plane, b->mtu, b->priority,
+ b->window, mod(tipc_net(net)->random),
+ tipc_own_addr(net), onode,
+ n->capabilities,
+ tipc_bc_sndlink(n->net), n->bc_entry.link,
+ &le->inputq,
+ &n->bc_entry.namedq, &l)) {
+ *respond = false;
+ goto exit;
+ }
+ tipc_link_reset(l);
+ tipc_link_fsm_evt(l, LINK_RESET_EVT);
+ if (n->state == NODE_FAILINGOVER)
+ tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT);
+ le->link = l;
+ n->link_cnt++;
+ tipc_node_calculate_timer(n, l);
+ if (n->link_cnt == 1)
+ if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
+ tipc_node_get(n);
+ }
+ memcpy(&le->maddr, maddr, sizeof(*maddr));
+exit:
+ tipc_node_unlock(n);
+ if (reset && !tipc_link_is_reset(l))
+ tipc_node_link_down(n, b->identity, false);
+ tipc_node_put(n);
}
-void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+void tipc_node_delete_links(struct net *net, int bearer_id)
{
- n_ptr->links[l_ptr->bearer_id] = l_ptr;
- n_ptr->link_cnt++;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_node *n;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(n, &tn->node_list, list) {
+ tipc_node_link_down(n, bearer_id, true);
+ }
+ rcu_read_unlock();
}
-void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+static void tipc_node_reset_links(struct tipc_node *n)
{
+ char addr_string[16];
int i;
+ pr_warn("Resetting all links to %s\n",
+ tipc_addr_string_fill(addr_string, n->addr));
+
for (i = 0; i < MAX_BEARERS; i++) {
- if (l_ptr != n_ptr->links[i])
- continue;
- n_ptr->links[i] = NULL;
- n_ptr->link_cnt--;
+ tipc_node_link_down(n, i, false);
+ }
+}
+
+/* tipc_node_fsm_evt - node finite state machine
+ * Determines when contact is allowed with peer node
+ */
+static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
+{
+ int state = n->state;
+
+ switch (state) {
+ case SELF_DOWN_PEER_DOWN:
+ switch (evt) {
+ case SELF_ESTABL_CONTACT_EVT:
+ state = SELF_UP_PEER_COMING;
+ break;
+ case PEER_ESTABL_CONTACT_EVT:
+ state = SELF_COMING_PEER_UP;
+ break;
+ case SELF_LOST_CONTACT_EVT:
+ case PEER_LOST_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_END_EVT:
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_UP_PEER_UP:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case NODE_SYNCH_BEGIN_EVT:
+ state = NODE_SYNCHING;
+ break;
+ case NODE_FAILOVER_BEGIN_EVT:
+ state = NODE_FAILINGOVER;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ case NODE_SYNCH_END_EVT:
+ case NODE_FAILOVER_END_EVT:
+ break;
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_DOWN_PEER_LEAVING:
+ switch (evt) {
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_DOWN;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ case SELF_LOST_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_END_EVT:
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_UP_PEER_COMING:
+ switch (evt) {
+ case PEER_ESTABL_CONTACT_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_LOST_CONTACT_EVT:
+ case NODE_SYNCH_END_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ break;
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_COMING_PEER_UP:
+ switch (evt) {
+ case SELF_ESTABL_CONTACT_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case SELF_LOST_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_END_EVT:
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_LEAVING_PEER_DOWN:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_DOWN;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ case PEER_LOST_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_END_EVT:
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case NODE_FAILINGOVER:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case NODE_FAILOVER_END_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case NODE_FAILOVER_BEGIN_EVT:
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case NODE_SYNCHING:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case NODE_SYNCH_END_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case NODE_FAILOVER_BEGIN_EVT:
+ state = NODE_FAILINGOVER;
+ break;
+ case NODE_SYNCH_BEGIN_EVT:
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ break;
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ default:
+ pr_err("Unknown node fsm state %x\n", state);
+ break;
}
+ n->state = state;
+ return;
+
+illegal_evt:
+ pr_err("Illegal node fsm evt %x in state %x\n", evt, state);
}
-static void node_established_contact(struct tipc_node *n_ptr)
+bool tipc_node_filter_pkt(struct tipc_node *n, struct tipc_msg *hdr)
{
- n_ptr->action_flags |= TIPC_NOTIFY_NODE_UP;
- n_ptr->bclink.oos_state = 0;
- n_ptr->bclink.acked = tipc_bclink_get_last_sent(n_ptr->net);
- tipc_bclink_add_node(n_ptr->net, n_ptr->addr);
+ int state = n->state;
+
+ if (likely(state == SELF_UP_PEER_UP))
+ return true;
+
+ if (state == SELF_LEAVING_PEER_DOWN)
+ return false;
+
+ if (state == SELF_DOWN_PEER_LEAVING) {
+ if (msg_peer_node_is_up(hdr))
+ return false;
+ }
+
+ return true;
}
-static void node_lost_contact(struct tipc_node *n_ptr)
+static void node_lost_contact(struct tipc_node *n,
+ struct sk_buff_head *inputq)
{
char addr_string[16];
struct tipc_sock_conn *conn, *safe;
- struct list_head *conns = &n_ptr->conn_sks;
+ struct tipc_link *l;
+ struct list_head *conns = &n->conn_sks;
struct sk_buff *skb;
- struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
uint i;
pr_debug("Lost contact with %s\n",
- tipc_addr_string_fill(addr_string, n_ptr->addr));
-
- /* Flush broadcast link info associated with lost node */
- if (n_ptr->bclink.recv_permitted) {
- __skb_queue_purge(&n_ptr->bclink.deferdq);
-
- if (n_ptr->bclink.reasm_buf) {
- kfree_skb(n_ptr->bclink.reasm_buf);
- n_ptr->bclink.reasm_buf = NULL;
- }
-
- tipc_bclink_remove_node(n_ptr->net, n_ptr->addr);
- tipc_bclink_acknowledge(n_ptr, INVALID_LINK_SEQ);
+ tipc_addr_string_fill(addr_string, n->addr));
- n_ptr->bclink.recv_permitted = false;
- }
+ /* Clean up broadcast state */
+ tipc_bcast_remove_peer(n->net, n->bc_entry.link);
/* Abort any ongoing link failover */
for (i = 0; i < MAX_BEARERS; i++) {
- struct tipc_link *l_ptr = n_ptr->links[i];
- if (!l_ptr)
- continue;
- l_ptr->flags &= ~LINK_FAILINGOVER;
- l_ptr->failover_checkpt = 0;
- l_ptr->failover_pkts = 0;
- kfree_skb(l_ptr->failover_skb);
- l_ptr->failover_skb = NULL;
- tipc_link_reset_fragments(l_ptr);
+ l = n->links[i].link;
+ if (l)
+ tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT);
}
- n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN;
-
- /* Prevent re-contact with node until cleanup is done */
- n_ptr->action_flags |= TIPC_WAIT_PEER_LINKS_DOWN;
-
/* Notify publications from this node */
- n_ptr->action_flags |= TIPC_NOTIFY_NODE_DOWN;
+ n->action_flags |= TIPC_NOTIFY_NODE_DOWN;
/* Notify sockets connected to node */
list_for_each_entry_safe(conn, safe, conns, list) {
skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG,
- SHORT_H_SIZE, 0, tn->own_addr,
+ SHORT_H_SIZE, 0, tipc_own_addr(n->net),
conn->peer_node, conn->port,
conn->peer_port, TIPC_ERR_NO_NODE);
- if (likely(skb)) {
- skb_queue_tail(n_ptr->inputq, skb);
- n_ptr->action_flags |= TIPC_MSG_EVT;
- }
+ if (likely(skb))
+ skb_queue_tail(inputq, skb);
list_del(&conn->list);
kfree(conn);
}
@@ -452,7 +914,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
goto exit;
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (link) {
strncpy(linkname, link->name, len);
err = 0;
@@ -470,36 +932,24 @@ void tipc_node_unlock(struct tipc_node *node)
u32 flags = node->action_flags;
u32 link_id = 0;
struct list_head *publ_list;
- struct sk_buff_head *inputq = node->inputq;
- struct sk_buff_head *namedq;
- if (likely(!flags || (flags == TIPC_MSG_EVT))) {
- node->action_flags = 0;
+ if (likely(!flags)) {
spin_unlock_bh(&node->lock);
- if (flags == TIPC_MSG_EVT)
- tipc_sk_rcv(net, inputq);
return;
}
addr = node->addr;
link_id = node->link_id;
- namedq = node->namedq;
publ_list = &node->publ_list;
- node->action_flags &= ~(TIPC_MSG_EVT |
- TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
- TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP |
- TIPC_WAKEUP_BCAST_USERS | TIPC_BCAST_MSG_EVT |
- TIPC_NAMED_MSG_EVT | TIPC_BCAST_RESET);
+ node->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
+ TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP);
spin_unlock_bh(&node->lock);
if (flags & TIPC_NOTIFY_NODE_DOWN)
tipc_publ_notify(net, publ_list, addr);
- if (flags & TIPC_WAKEUP_BCAST_USERS)
- tipc_bclink_wakeup_users(net);
-
if (flags & TIPC_NOTIFY_NODE_UP)
tipc_named_node_up(net, addr);
@@ -511,17 +961,6 @@ void tipc_node_unlock(struct tipc_node *node)
tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
link_id, addr);
- if (flags & TIPC_MSG_EVT)
- tipc_sk_rcv(net, inputq);
-
- if (flags & TIPC_NAMED_MSG_EVT)
- tipc_named_rcv(net, namedq);
-
- if (flags & TIPC_BCAST_MSG_EVT)
- tipc_bclink_input(net);
-
- if (flags & TIPC_BCAST_RESET)
- tipc_link_reset_all(node);
}
/* Caller should hold node lock for the passed node */
@@ -558,6 +997,350 @@ msg_full:
return -EMSGSIZE;
}
+static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel,
+ int *bearer_id,
+ struct tipc_media_addr **maddr)
+{
+ int id = n->active_links[sel & 1];
+
+ if (unlikely(id < 0))
+ return NULL;
+
+ *bearer_id = id;
+ *maddr = &n->links[id].maddr;
+ return n->links[id].link;
+}
+
+/**
+ * tipc_node_xmit() is the general link level function for message sending
+ * @net: the applicable net namespace
+ * @list: chain of buffers containing message
+ * @dnode: address of destination node
+ * @selector: a number used for deterministic link selection
+ * Consumes the buffer chain, except when returning -ELINKCONG
+ * Returns 0 on success, otherwise errno: -ELINKCONG, -EHOSTUNREACH, -EMSGSIZE
+ */
+int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
+ u32 dnode, int selector)
+{
+ struct tipc_link *l = NULL;
+ struct tipc_node *n;
+ struct sk_buff_head xmitq;
+ struct tipc_media_addr *maddr;
+ int bearer_id;
+ int rc = -EHOSTUNREACH;
+
+ __skb_queue_head_init(&xmitq);
+ n = tipc_node_find(net, dnode);
+ if (likely(n)) {
+ tipc_node_lock(n);
+ l = tipc_node_select_link(n, selector, &bearer_id, &maddr);
+ if (likely(l))
+ rc = tipc_link_xmit(l, list, &xmitq);
+ tipc_node_unlock(n);
+ if (unlikely(rc == -ENOBUFS))
+ tipc_node_link_down(n, bearer_id, false);
+ tipc_node_put(n);
+ }
+ if (likely(!rc)) {
+ tipc_bearer_xmit(net, bearer_id, &xmitq, maddr);
+ return 0;
+ }
+ if (likely(in_own_node(net, dnode))) {
+ tipc_sk_rcv(net, list);
+ return 0;
+ }
+ return rc;
+}
+
+/* tipc_node_xmit_skb(): send single buffer to destination
+ * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE
+ * messages, which will not be rejected
+ * The only exception is datagram messages rerouted after secondary
+ * lookup, which are rare and safe to dispose of anyway.
+ * TODO: Return real return value, and let callers use
+ * tipc_wait_for_sendpkt() where applicable
+ */
+int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
+ u32 selector)
+{
+ struct sk_buff_head head;
+ int rc;
+
+ skb_queue_head_init(&head);
+ __skb_queue_tail(&head, skb);
+ rc = tipc_node_xmit(net, &head, dnode, selector);
+ if (rc == -ELINKCONG)
+ kfree_skb(skb);
+ return 0;
+}
+
+/**
+ * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node
+ * @net: the applicable net namespace
+ * @skb: TIPC packet
+ * @bearer_id: id of bearer message arrived on
+ *
+ * Invoked with no locks held.
+ */
+static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id)
+{
+ int rc;
+ struct sk_buff_head xmitq;
+ struct tipc_bclink_entry *be;
+ struct tipc_link_entry *le;
+ struct tipc_msg *hdr = buf_msg(skb);
+ int usr = msg_user(hdr);
+ u32 dnode = msg_destnode(hdr);
+ struct tipc_node *n;
+
+ __skb_queue_head_init(&xmitq);
+
+ /* If NACK for other node, let rcv link for that node peek into it */
+ if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net)))
+ n = tipc_node_find(net, dnode);
+ else
+ n = tipc_node_find(net, msg_prevnode(hdr));
+ if (!n) {
+ kfree_skb(skb);
+ return;
+ }
+ be = &n->bc_entry;
+ le = &n->links[bearer_id];
+
+ rc = tipc_bcast_rcv(net, be->link, skb);
+
+ /* Broadcast link reset may happen at reassembly failure */
+ if (rc & TIPC_LINK_DOWN_EVT)
+ tipc_node_reset_links(n);
+
+ /* Broadcast ACKs are sent on a unicast link */
+ if (rc & TIPC_LINK_SND_BC_ACK) {
+ tipc_node_lock(n);
+ tipc_link_build_ack_msg(le->link, &xmitq);
+ tipc_node_unlock(n);
+ }
+
+ if (!skb_queue_empty(&xmitq))
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
+
+ /* Deliver. 'arrvq' is under inputq2's lock protection */
+ if (!skb_queue_empty(&be->inputq1)) {
+ spin_lock_bh(&be->inputq2.lock);
+ spin_lock_bh(&be->inputq1.lock);
+ skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
+ spin_unlock_bh(&be->inputq1.lock);
+ spin_unlock_bh(&be->inputq2.lock);
+ tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2);
+ }
+ tipc_node_put(n);
+}
+
+/**
+ * tipc_node_check_state - check and if necessary update node state
+ * @skb: TIPC packet
+ * @bearer_id: identity of bearer delivering the packet
+ * Returns true if state is ok, otherwise consumes buffer and returns false
+ */
+static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
+ int bearer_id, struct sk_buff_head *xmitq)
+{
+ struct tipc_msg *hdr = buf_msg(skb);
+ int usr = msg_user(hdr);
+ int mtyp = msg_type(hdr);
+ u16 oseqno = msg_seqno(hdr);
+ u16 iseqno = msg_seqno(msg_get_wrapped(hdr));
+ u16 exp_pkts = msg_msgcnt(hdr);
+ u16 rcv_nxt, syncpt, dlv_nxt;
+ int state = n->state;
+ struct tipc_link *l, *tnl, *pl = NULL;
+ struct tipc_media_addr *maddr;
+ int i, pb_id;
+
+ l = n->links[bearer_id].link;
+ if (!l)
+ return false;
+ rcv_nxt = l->rcv_nxt;
+
+
+ if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL)))
+ return true;
+
+ /* Find parallel link, if any */
+ for (i = 0; i < MAX_BEARERS; i++) {
+ if ((i != bearer_id) && n->links[i].link) {
+ pl = n->links[i].link;
+ break;
+ }
+ }
+
+ /* Update node accessibility if applicable */
+ if (state == SELF_UP_PEER_COMING) {
+ if (!tipc_link_is_up(l))
+ return true;
+ if (!msg_peer_link_is_up(hdr))
+ return true;
+ tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT);
+ }
+
+ if (state == SELF_DOWN_PEER_LEAVING) {
+ if (msg_peer_node_is_up(hdr))
+ return false;
+ tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT);
+ }
+
+ /* Ignore duplicate packets */
+ if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt))
+ return true;
+
+ /* Initiate or update failover mode if applicable */
+ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) {
+ syncpt = oseqno + exp_pkts - 1;
+ if (pl && tipc_link_is_up(pl)) {
+ pb_id = pl->bearer_id;
+ __tipc_node_link_down(n, &pb_id, xmitq, &maddr);
+ tipc_skb_queue_splice_tail_init(pl->inputq, l->inputq);
+ }
+ /* If pkts arrive out of order, use lowest calculated syncpt */
+ if (less(syncpt, n->sync_point))
+ n->sync_point = syncpt;
+ }
+
+ /* Open parallel link when tunnel link reaches synch point */
+ if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) {
+ if (!more(rcv_nxt, n->sync_point))
+ return true;
+ tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT);
+ if (pl)
+ tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT);
+ return true;
+ }
+
+ /* No synching needed if only one link */
+ if (!pl || !tipc_link_is_up(pl))
+ return true;
+
+ /* Initiate synch mode if applicable */
+ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) {
+ syncpt = iseqno + exp_pkts - 1;
+ if (!tipc_link_is_up(l)) {
+ tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT);
+ __tipc_node_link_up(n, bearer_id, xmitq);
+ }
+ if (n->state == SELF_UP_PEER_UP) {
+ n->sync_point = syncpt;
+ tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT);
+ tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT);
+ }
+ if (less(syncpt, n->sync_point))
+ n->sync_point = syncpt;
+ }
+
+ /* Open tunnel link when parallel link reaches synch point */
+ if ((n->state == NODE_SYNCHING) && tipc_link_is_synching(l)) {
+ if (tipc_link_is_synching(l)) {
+ tnl = l;
+ } else {
+ tnl = pl;
+ pl = l;
+ }
+ dlv_nxt = pl->rcv_nxt - mod(skb_queue_len(pl->inputq));
+ if (more(dlv_nxt, n->sync_point)) {
+ tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT);
+ tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
+ return true;
+ }
+ if (l == pl)
+ return true;
+ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG))
+ return true;
+ if (usr == LINK_PROTOCOL)
+ return true;
+ return false;
+ }
+ return true;
+}
+
+/**
+ * tipc_rcv - process TIPC packets/messages arriving from off-node
+ * @net: the applicable net namespace
+ * @skb: TIPC packet
+ * @bearer: pointer to bearer message arrived on
+ *
+ * Invoked with no locks held. Bearer pointer must point to a valid bearer
+ * structure (i.e. cannot be NULL), but bearer can be inactive.
+ */
+void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
+{
+ struct sk_buff_head xmitq;
+ struct tipc_node *n;
+ struct tipc_msg *hdr = buf_msg(skb);
+ int usr = msg_user(hdr);
+ int bearer_id = b->identity;
+ struct tipc_link_entry *le;
+ u16 bc_ack = msg_bcast_ack(hdr);
+ int rc = 0;
+
+ __skb_queue_head_init(&xmitq);
+
+ /* Ensure message is well-formed */
+ if (unlikely(!tipc_msg_validate(skb)))
+ goto discard;
+
+ /* Handle arrival of discovery or broadcast packet */
+ if (unlikely(msg_non_seq(hdr))) {
+ if (unlikely(usr == LINK_CONFIG))
+ return tipc_disc_rcv(net, skb, b);
+ else
+ return tipc_node_bc_rcv(net, skb, bearer_id);
+ }
+
+ /* Locate neighboring node that sent packet */
+ n = tipc_node_find(net, msg_prevnode(hdr));
+ if (unlikely(!n))
+ goto discard;
+ le = &n->links[bearer_id];
+
+ /* Ensure broadcast reception is in synch with peer's send state */
+ if (unlikely(usr == LINK_PROTOCOL))
+ tipc_bcast_sync_rcv(net, n->bc_entry.link, hdr);
+ else if (unlikely(n->bc_entry.link->acked != bc_ack))
+ tipc_bcast_ack_rcv(net, n->bc_entry.link, bc_ack);
+
+ tipc_node_lock(n);
+
+ /* Is reception permitted at the moment ? */
+ if (!tipc_node_filter_pkt(n, hdr))
+ goto unlock;
+
+ /* Check and if necessary update node state */
+ if (likely(tipc_node_check_state(n, skb, bearer_id, &xmitq))) {
+ rc = tipc_link_rcv(le->link, skb, &xmitq);
+ skb = NULL;
+ }
+unlock:
+ tipc_node_unlock(n);
+
+ if (unlikely(rc & TIPC_LINK_UP_EVT))
+ tipc_node_link_up(n, bearer_id, &xmitq);
+
+ if (unlikely(rc & TIPC_LINK_DOWN_EVT))
+ tipc_node_link_down(n, bearer_id, false);
+
+ if (unlikely(!skb_queue_empty(&n->bc_entry.namedq)))
+ tipc_named_rcv(net, &n->bc_entry.namedq);
+
+ if (!skb_queue_empty(&le->inputq))
+ tipc_sk_rcv(net, &le->inputq);
+
+ if (!skb_queue_empty(&xmitq))
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
+
+ tipc_node_put(n);
+discard:
+ kfree_skb(skb);
+}
+
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int err;
diff --git a/kernel/net/tipc/node.h b/kernel/net/tipc/node.h
index 02d5c20dc..6734562d3 100644
--- a/kernel/net/tipc/node.h
+++ b/kernel/net/tipc/node.h
@@ -45,50 +45,41 @@
/* Out-of-range value for node signature */
#define INVALID_NODE_SIG 0x10000
-#define NODE_HTABLE_SIZE 512
+#define INVALID_BEARER_ID -1
/* Flags used to take different actions according to flag type
- * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down
- * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down
* TIPC_NOTIFY_NODE_DOWN: notify node is down
* TIPC_NOTIFY_NODE_UP: notify node is up
* TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type
*/
enum {
- TIPC_MSG_EVT = 1,
- TIPC_WAIT_PEER_LINKS_DOWN = (1 << 1),
- TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2),
TIPC_NOTIFY_NODE_DOWN = (1 << 3),
TIPC_NOTIFY_NODE_UP = (1 << 4),
- TIPC_WAKEUP_BCAST_USERS = (1 << 5),
TIPC_NOTIFY_LINK_UP = (1 << 6),
- TIPC_NOTIFY_LINK_DOWN = (1 << 7),
- TIPC_NAMED_MSG_EVT = (1 << 8),
- TIPC_BCAST_MSG_EVT = (1 << 9),
- TIPC_BCAST_RESET = (1 << 10)
+ TIPC_NOTIFY_LINK_DOWN = (1 << 7)
};
-/**
- * struct tipc_node_bclink - TIPC node bclink structure
- * @acked: sequence # of last outbound b'cast message acknowledged by node
- * @last_in: sequence # of last in-sequence b'cast message received from node
- * @last_sent: sequence # of last b'cast message sent by node
- * @oos_state: state tracker for handling OOS b'cast messages
- * @deferred_queue: deferred queue saved OOS b'cast message received from node
- * @reasm_buf: broadcast reassembly queue head from node
- * @inputq_map: bitmap indicating which inqueues should be kicked
- * @recv_permitted: true if node is allowed to receive b'cast messages
+/* Optional capabilities supported by this code version
*/
-struct tipc_node_bclink {
- u32 acked;
- u32 last_in;
- u32 last_sent;
- u32 oos_state;
- u32 deferred_size;
- struct sk_buff_head deferdq;
- struct sk_buff *reasm_buf;
- int inputq_map;
- bool recv_permitted;
+enum {
+ TIPC_BCAST_SYNCH = (1 << 1)
+};
+
+#define TIPC_NODE_CAPABILITIES TIPC_BCAST_SYNCH
+
+struct tipc_link_entry {
+ struct tipc_link *link;
+ u32 mtu;
+ struct sk_buff_head inputq;
+ struct tipc_media_addr maddr;
+};
+
+struct tipc_bclink_entry {
+ struct tipc_link *link;
+ struct sk_buff_head inputq1;
+ struct sk_buff_head arrvq;
+ struct sk_buff_head inputq2;
+ struct sk_buff_head namedq;
};
/**
@@ -100,11 +91,11 @@ struct tipc_node_bclink {
* @hash: links to adjacent nodes in unsorted hash chain
* @inputq: pointer to input queue containing messages for msg event
* @namedq: pointer to name table input queue with name table messages
- * @curr_link: the link holding the node lock, if any
- * @active_links: pointers to active links to node
- * @links: pointers to all links to node
+ * @active_links: bearer ids of active links, used as index into links[] array
+ * @links: array containing references to all links to node
* @action_flags: bit mask of different types of node actions
- * @bclink: broadcast-related info
+ * @state: connectivity state vs peer node
+ * @sync_point: sequence number where synch/failover is finished
* @list: links to adjacent nodes in sorted list of cluster's nodes
* @working_links: number of working links to node (both active and standby)
* @link_cnt: number of links to node
@@ -120,14 +111,13 @@ struct tipc_node {
spinlock_t lock;
struct net *net;
struct hlist_node hash;
- struct sk_buff_head *inputq;
- struct sk_buff_head *namedq;
- struct tipc_link *active_links[2];
- u32 act_mtus[2];
- struct tipc_link *links[MAX_BEARERS];
+ int active_links[2];
+ struct tipc_link_entry links[MAX_BEARERS];
+ struct tipc_bclink_entry bc_entry;
int action_flags;
- struct tipc_node_bclink bclink;
struct list_head list;
+ int state;
+ u16 sync_point;
int link_cnt;
u16 working_links;
u16 capabilities;
@@ -135,25 +125,32 @@ struct tipc_node {
u32 link_id;
struct list_head publ_list;
struct list_head conn_sks;
+ unsigned long keepalive_intv;
+ struct timer_list timer;
struct rcu_head rcu;
};
struct tipc_node *tipc_node_find(struct net *net, u32 addr);
void tipc_node_put(struct tipc_node *node);
-struct tipc_node *tipc_node_create(struct net *net, u32 addr);
void tipc_node_stop(struct net *net);
+void tipc_node_check_dest(struct net *net, u32 onode,
+ struct tipc_bearer *bearer,
+ u16 capabilities, u32 signature,
+ struct tipc_media_addr *maddr,
+ bool *respond, bool *dupl_addr);
+void tipc_node_delete_links(struct net *net, int bearer_id);
void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-int tipc_node_active_links(struct tipc_node *n_ptr);
-int tipc_node_is_up(struct tipc_node *n_ptr);
+bool tipc_node_is_up(struct tipc_node *n);
int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
char *linkname, size_t len);
void tipc_node_unlock(struct tipc_node *node);
+int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
+ int selector);
+int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest,
+ u32 selector);
int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port);
void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port);
-
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb);
static inline void tipc_node_lock(struct tipc_node *node)
@@ -161,26 +158,30 @@ static inline void tipc_node_lock(struct tipc_node *node)
spin_lock_bh(&node->lock);
}
-static inline bool tipc_node_blocked(struct tipc_node *node)
+static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel)
{
- return (node->action_flags & (TIPC_WAIT_PEER_LINKS_DOWN |
- TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN));
-}
+ int bearer_id = n->active_links[sel & 1];
-static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector)
-{
- struct tipc_node *node;
- u32 mtu;
+ if (unlikely(bearer_id == INVALID_BEARER_ID))
+ return NULL;
- node = tipc_node_find(net, addr);
-
- if (likely(node)) {
- mtu = node->act_mtus[selector & 1];
- tipc_node_put(node);
- } else {
- mtu = MAX_MSG_SIZE;
- }
+ return n->links[bearer_id].link;
+}
+static inline unsigned int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
+{
+ struct tipc_node *n;
+ int bearer_id;
+ unsigned int mtu = MAX_MSG_SIZE;
+
+ n = tipc_node_find(net, addr);
+ if (unlikely(!n))
+ return mtu;
+
+ bearer_id = n->active_links[sel & 1];
+ if (likely(bearer_id != INVALID_BEARER_ID))
+ mtu = n->links[bearer_id].mtu;
+ tipc_node_put(n);
return mtu;
}
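
    A small sketch (not part of the patch; MAX_MSG_SIZE and the struct layout are
    placeholders) of the 'sel & 1' slot mapping used by node_active_link() and
    tipc_node_get_mtu() above: any selector, typically a port number, is folded
    to slot 0 or 1, which load-shares traffic over two equal-priority links.

	#include <stdio.h>

	#define MAX_MSG_SIZE 66000	/* placeholder value for the sketch only */

	struct node_sketch {
		int active_links[2];	/* bearer ids, or -1 (INVALID_BEARER_ID) */
		unsigned int mtu[2];	/* per-link MTU, indexed by bearer id */
	};

	static unsigned int get_mtu(const struct node_sketch *n, unsigned int sel)
	{
		int bearer_id = n->active_links[sel & 1];

		if (bearer_id < 0)	/* no active link: fall back to maximum */
			return MAX_MSG_SIZE;
		return n->mtu[bearer_id];
	}

	int main(void)
	{
		struct node_sketch n = { .active_links = { 0, 1 }, .mtu = { 1500, 9000 } };

		printf("port 4001 -> mtu %u\n", get_mtu(&n, 4001));	/* slot 1 -> 9000 */
		printf("port 4002 -> mtu %u\n", get_mtu(&n, 4002));	/* slot 0 -> 1500 */
		return 0;
	}
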
diff --git a/kernel/net/tipc/server.c b/kernel/net/tipc/server.c
index 77ff03ed1..922e04a43 100644
--- a/kernel/net/tipc/server.c
+++ b/kernel/net/tipc/server.c
@@ -309,6 +309,10 @@ static int tipc_accept_from_sock(struct tipc_conn *con)
/* Notify that new connection is incoming */
newcon->usr_data = s->tipc_conn_new(newcon->conid);
+ if (!newcon->usr_data) {
+ sock_release(newsock);
+ return -ENOMEM;
+ }
/* Wake up receive process in case of 'SYN+' message */
newsock->sk->sk_data_ready(newsock->sk);
@@ -321,7 +325,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
struct socket *sock = NULL;
int ret;
- ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1);
+ ret = sock_create_kern(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock);
if (ret < 0)
return NULL;
ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE,
diff --git a/kernel/net/tipc/socket.c b/kernel/net/tipc/socket.c
index 20cc6df07..b53246fb0 100644
--- a/kernel/net/tipc/socket.c
+++ b/kernel/net/tipc/socket.c
@@ -41,6 +41,7 @@
#include "link.h"
#include "name_distr.h"
#include "socket.h"
+#include "bcast.h"
#define SS_LISTENING -1 /* socket is listening */
#define SS_READY -2 /* socket is connectionless */
@@ -104,6 +105,7 @@ struct tipc_sock {
static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb);
static void tipc_data_ready(struct sock *sk);
static void tipc_write_space(struct sock *sk);
+static void tipc_sock_destruct(struct sock *sk);
static int tipc_release(struct socket *sock);
static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags);
static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p);
@@ -247,6 +249,22 @@ static void tsk_advance_rx_queue(struct sock *sk)
kfree_skb(__skb_dequeue(&sk->sk_receive_queue));
}
+/* tipc_sk_respond(): send response message back to sender
+ */
+static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err)
+{
+ u32 selector;
+ u32 dnode;
+ u32 onode = tipc_own_addr(sock_net(sk));
+
+ if (!tipc_msg_reverse(onode, &skb, err))
+ return;
+
+ dnode = msg_destnode(buf_msg(skb));
+ selector = msg_origport(buf_msg(skb));
+ tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector);
+}
+
/**
* tsk_rej_rx_queue - reject all buffers in socket receive queue
*
@@ -255,13 +273,9 @@ static void tsk_advance_rx_queue(struct sock *sk)
static void tsk_rej_rx_queue(struct sock *sk)
{
struct sk_buff *skb;
- u32 dnode;
- u32 own_node = tsk_own_node(tipc_sk(sk));
- while ((skb = __skb_dequeue(&sk->sk_receive_queue))) {
- if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_ERR_NO_PORT))
- tipc_link_xmit_skb(sock_net(sk), skb, dnode, 0);
- }
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)))
+ tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
}
/* tsk_peer_msg - verify if message was sent by connected port's peer
@@ -342,7 +356,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
}
/* Allocate socket's protocol area */
- sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto);
+ sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern);
if (sk == NULL)
return -ENOMEM;
@@ -368,6 +382,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
sk->sk_rcvbuf = sysctl_tipc_rmem[1];
sk->sk_data_ready = tipc_data_ready;
sk->sk_write_space = tipc_write_space;
+ sk->sk_destruct = tipc_sock_destruct;
tsk->conn_timeout = CONN_TIMEOUT_DEFAULT;
tsk->sent_unacked = 0;
atomic_set(&tsk->dupl_rcvcnt, 0);
@@ -409,7 +424,7 @@ static int tipc_release(struct socket *sock)
struct net *net;
struct tipc_sock *tsk;
struct sk_buff *skb;
- u32 dnode, probing_state;
+ u32 dnode;
/*
* Exit if socket isn't fully initialized (occurs when a failed accept()
@@ -440,17 +455,12 @@ static int tipc_release(struct socket *sock)
tsk->connected = 0;
tipc_node_remove_conn(net, dnode, tsk->portid);
}
- if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode,
- TIPC_ERR_NO_PORT))
- tipc_link_xmit_skb(net, skb, dnode, 0);
+ tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
}
}
tipc_sk_withdraw(tsk, 0, NULL);
- probing_state = tsk->probing_state;
- if (del_timer_sync(&sk->sk_timer) &&
- probing_state != TIPC_CONN_PROBING)
- sock_put(sk);
+ sk_stop_timer(sk, &sk->sk_timer);
tipc_sk_remove(tsk);
if (tsk->connected) {
skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
@@ -458,13 +468,10 @@ static int tipc_release(struct socket *sock)
tsk_own_node(tsk), tsk_peer_port(tsk),
tsk->portid, TIPC_ERR_NO_PORT);
if (skb)
- tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
tipc_node_remove_conn(net, dnode, tsk->portid);
}
- /* Discard any remaining (connection-based) messages in receive queue */
- __skb_queue_purge(&sk->sk_receive_queue);
-
/* Reject any messages that accumulated in backlog queue */
sock->state = SS_DISCONNECTING;
release_sock(sk);
@@ -681,28 +688,29 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
new_mtu:
- mtu = tipc_bclink_get_mtu();
+ mtu = tipc_bcast_get_mtu(net);
rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain);
if (unlikely(rc < 0))
return rc;
do {
- rc = tipc_bclink_xmit(net, pktchain);
- if (likely(rc >= 0)) {
- rc = dsz;
- break;
+ rc = tipc_bcast_xmit(net, pktchain);
+ if (likely(!rc))
+ return dsz;
+
+ if (rc == -ELINKCONG) {
+ tsk->link_cong = 1;
+ rc = tipc_wait_for_sndmsg(sock, &timeo);
+ if (!rc)
+ continue;
}
+ __skb_queue_purge(pktchain);
if (rc == -EMSGSIZE) {
msg->msg_iter = save;
goto new_mtu;
}
- if (rc != -ELINKCONG)
- break;
- tipc_sk(sk)->link_cong = 1;
- rc = tipc_wait_for_sndmsg(sock, &timeo);
- if (rc)
- __skb_queue_purge(pktchain);
- } while (!rc);
+ break;
+ } while (1);
return rc;
}
@@ -765,35 +773,35 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
/**
* tipc_sk_proto_rcv - receive a connection mng protocol message
* @tsk: receiving socket
- * @skb: pointer to message buffer. Set to NULL if buffer is consumed.
+ * @skb: pointer to message buffer.
*/
-static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff **skb)
+static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb)
{
- struct tipc_msg *msg = buf_msg(*skb);
+ struct sock *sk = &tsk->sk;
+ struct tipc_msg *hdr = buf_msg(skb);
+ int mtyp = msg_type(hdr);
int conn_cong;
- u32 dnode;
- u32 own_node = tsk_own_node(tsk);
+
/* Ignore if connection cannot be validated: */
- if (!tsk_peer_msg(tsk, msg))
+ if (!tsk_peer_msg(tsk, hdr))
goto exit;
tsk->probing_state = TIPC_CONN_OK;
- if (msg_type(msg) == CONN_ACK) {
+ if (mtyp == CONN_PROBE) {
+ msg_set_type(hdr, CONN_PROBE_REPLY);
+ tipc_sk_respond(sk, skb, TIPC_OK);
+ return;
+ } else if (mtyp == CONN_ACK) {
conn_cong = tsk_conn_cong(tsk);
- tsk->sent_unacked -= msg_msgcnt(msg);
+ tsk->sent_unacked -= msg_msgcnt(hdr);
if (conn_cong)
- tsk->sk.sk_write_space(&tsk->sk);
- } else if (msg_type(msg) == CONN_PROBE) {
- if (tipc_msg_reverse(own_node, *skb, &dnode, TIPC_OK)) {
- msg_set_type(msg, CONN_PROBE_REPLY);
- return;
- }
+ sk->sk_write_space(sk);
+ } else if (mtyp != CONN_PROBE_REPLY) {
+ pr_warn("Received unknown CONN_PROTO msg\n");
}
- /* Do nothing if msg_type() == CONN_PROBE_REPLY */
exit:
- kfree_skb(*skb);
- *skb = NULL;
+ kfree_skb(skb);
}
static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
@@ -926,24 +934,25 @@ new_mtu:
do {
skb = skb_peek(pktchain);
TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
- rc = tipc_link_xmit(net, pktchain, dnode, tsk->portid);
- if (likely(rc >= 0)) {
+ rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid);
+ if (likely(!rc)) {
if (sock->state != SS_READY)
sock->state = SS_CONNECTING;
- rc = dsz;
- break;
+ return dsz;
}
+ if (rc == -ELINKCONG) {
+ tsk->link_cong = 1;
+ rc = tipc_wait_for_sndmsg(sock, &timeo);
+ if (!rc)
+ continue;
+ }
+ __skb_queue_purge(pktchain);
if (rc == -EMSGSIZE) {
m->msg_iter = save;
goto new_mtu;
}
- if (rc != -ELINKCONG)
- break;
- tsk->link_cong = 1;
- rc = tipc_wait_for_sndmsg(sock, &timeo);
- if (rc)
- __skb_queue_purge(pktchain);
- } while (!rc);
+ break;
+ } while (1);
return rc;
}
@@ -1045,15 +1054,16 @@ next:
return rc;
do {
if (likely(!tsk_conn_cong(tsk))) {
- rc = tipc_link_xmit(net, pktchain, dnode, portid);
+ rc = tipc_node_xmit(net, pktchain, dnode, portid);
if (likely(!rc)) {
tsk->sent_unacked++;
sent += send;
if (sent == dsz)
- break;
+ return dsz;
goto next;
}
if (rc == -EMSGSIZE) {
+ __skb_queue_purge(pktchain);
tsk->max_pkt = tipc_node_get_mtu(net, dnode,
portid);
m->msg_iter = save;
@@ -1061,13 +1071,13 @@ next:
}
if (rc != -ELINKCONG)
break;
+
tsk->link_cong = 1;
}
rc = tipc_wait_for_sndpkt(sock, &timeo);
- if (rc)
- __skb_queue_purge(pktchain);
} while (!rc);
+ __skb_queue_purge(pktchain);
return sent ? sent : rc;
}
@@ -1223,7 +1233,7 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack)
return;
msg = buf_msg(skb);
msg_set_msgcnt(msg, ack);
- tipc_link_xmit_skb(net, skb, dnode, msg_link_selector(msg));
+ tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg));
}
static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
@@ -1504,87 +1514,91 @@ static void tipc_data_ready(struct sock *sk)
rcu_read_unlock();
}
+static void tipc_sock_destruct(struct sock *sk)
+{
+ __skb_queue_purge(&sk->sk_receive_queue);
+}
+
/**
* filter_connect - Handle all incoming messages for a connection-based socket
* @tsk: TIPC socket
* @skb: pointer to message buffer. Set to NULL if buffer is consumed
*
- * Returns 0 (TIPC_OK) if everything ok, -TIPC_ERR_NO_PORT otherwise
+ * Returns true if everything ok, false otherwise
*/
-static int filter_connect(struct tipc_sock *tsk, struct sk_buff **skb)
+static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
{
struct sock *sk = &tsk->sk;
struct net *net = sock_net(sk);
struct socket *sock = sk->sk_socket;
- struct tipc_msg *msg = buf_msg(*skb);
- int retval = -TIPC_ERR_NO_PORT;
+ struct tipc_msg *hdr = buf_msg(skb);
- if (msg_mcast(msg))
- return retval;
+ if (unlikely(msg_mcast(hdr)))
+ return false;
switch ((int)sock->state) {
case SS_CONNECTED:
+
/* Accept only connection-based messages sent by peer */
- if (tsk_peer_msg(tsk, msg)) {
- if (unlikely(msg_errcode(msg))) {
- sock->state = SS_DISCONNECTING;
- tsk->connected = 0;
- /* let timer expire on it's own */
- tipc_node_remove_conn(net, tsk_peer_node(tsk),
- tsk->portid);
- }
- retval = TIPC_OK;
+ if (unlikely(!tsk_peer_msg(tsk, hdr)))
+ return false;
+
+ if (unlikely(msg_errcode(hdr))) {
+ sock->state = SS_DISCONNECTING;
+ tsk->connected = 0;
+ /* Let timer expire on its own */
+ tipc_node_remove_conn(net, tsk_peer_node(tsk),
+ tsk->portid);
}
- break;
+ return true;
+
case SS_CONNECTING:
- /* Accept only ACK or NACK message */
- if (unlikely(!msg_connected(msg)))
- break;
+ /* Accept only ACK or NACK message */
+ if (unlikely(!msg_connected(hdr)))
+ return false;
- if (unlikely(msg_errcode(msg))) {
+ if (unlikely(msg_errcode(hdr))) {
sock->state = SS_DISCONNECTING;
sk->sk_err = ECONNREFUSED;
- retval = TIPC_OK;
- break;
+ return true;
}
- if (unlikely(msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE)) {
+ if (unlikely(!msg_isdata(hdr))) {
sock->state = SS_DISCONNECTING;
sk->sk_err = EINVAL;
- retval = TIPC_OK;
- break;
+ return true;
}
- tipc_sk_finish_conn(tsk, msg_origport(msg), msg_orignode(msg));
- msg_set_importance(&tsk->phdr, msg_importance(msg));
+ tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr));
+ msg_set_importance(&tsk->phdr, msg_importance(hdr));
sock->state = SS_CONNECTED;
- /* If an incoming message is an 'ACK-', it should be
- * discarded here because it doesn't contain useful
- * data. In addition, we should try to wake up
- * connect() routine if sleeping.
- */
- if (msg_data_sz(msg) == 0) {
- kfree_skb(*skb);
- *skb = NULL;
- if (waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
- }
- retval = TIPC_OK;
- break;
+ /* If 'ACK+' message, add to socket receive queue */
+ if (msg_data_sz(hdr))
+ return true;
+
+ /* If empty 'ACK-' message, wake up sleeping connect() */
+ if (waitqueue_active(sk_sleep(sk)))
+ wake_up_interruptible(sk_sleep(sk));
+
+ /* 'ACK-' message is neither accepted nor rejected: */
+ msg_set_dest_droppable(hdr, 1);
+ return false;
+
case SS_LISTENING:
case SS_UNCONNECTED:
+
/* Accept only SYN message */
- if (!msg_connected(msg) && !(msg_errcode(msg)))
- retval = TIPC_OK;
+ if (!msg_connected(hdr) && !(msg_errcode(hdr)))
+ return true;
break;
case SS_DISCONNECTING:
break;
default:
pr_err("Unknown socket state %u\n", sock->state);
}
- return retval;
+ return false;
}
/**
@@ -1619,61 +1633,70 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
/**
* filter_rcv - validate incoming message
* @sk: socket
- * @skb: pointer to message. Set to NULL if buffer is consumed.
+ * @skb: pointer to message.
*
* Enqueues message on receive queue if acceptable; optionally handles
* disconnect indication for a connected socket.
*
* Called with socket lock already taken
*
- * Returns 0 (TIPC_OK) if message was ok, -TIPC error code if rejected
+ * Returns true if message was added to socket receive queue, otherwise false
*/
-static int filter_rcv(struct sock *sk, struct sk_buff **skb)
+static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
{
struct socket *sock = sk->sk_socket;
struct tipc_sock *tsk = tipc_sk(sk);
- struct tipc_msg *msg = buf_msg(*skb);
- unsigned int limit = rcvbuf_limit(sk, *skb);
- int rc = TIPC_OK;
+ struct tipc_msg *hdr = buf_msg(skb);
+ unsigned int limit = rcvbuf_limit(sk, skb);
+ int err = TIPC_OK;
+ int usr = msg_user(hdr);
- if (unlikely(msg_user(msg) == CONN_MANAGER)) {
+ if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
tipc_sk_proto_rcv(tsk, skb);
- return TIPC_OK;
+ return false;
}
- if (unlikely(msg_user(msg) == SOCK_WAKEUP)) {
- kfree_skb(*skb);
+ if (unlikely(usr == SOCK_WAKEUP)) {
+ kfree_skb(skb);
tsk->link_cong = 0;
sk->sk_write_space(sk);
- *skb = NULL;
- return TIPC_OK;
+ return false;
}
- /* Reject message if it is wrong sort of message for socket */
- if (msg_type(msg) > TIPC_DIRECT_MSG)
- return -TIPC_ERR_NO_PORT;
+ /* Drop if illegal message type */
+ if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) {
+ kfree_skb(skb);
+ return false;
+ }
- if (sock->state == SS_READY) {
- if (msg_connected(msg))
- return -TIPC_ERR_NO_PORT;
- } else {
- rc = filter_connect(tsk, skb);
- if (rc != TIPC_OK || !*skb)
- return rc;
+ /* Reject if wrong message type for current socket state */
+ if (unlikely(sock->state == SS_READY)) {
+ if (msg_connected(hdr)) {
+ err = TIPC_ERR_NO_PORT;
+ goto reject;
+ }
+ } else if (unlikely(!filter_connect(tsk, skb))) {
+ err = TIPC_ERR_NO_PORT;
+ goto reject;
}
/* Reject message if there isn't room to queue it */
- if (sk_rmem_alloc_get(sk) + (*skb)->truesize >= limit)
- return -TIPC_ERR_OVERLOAD;
+ if (unlikely(sk_rmem_alloc_get(sk) + skb->truesize >= limit)) {
+ err = TIPC_ERR_OVERLOAD;
+ goto reject;
+ }
/* Enqueue message */
- TIPC_SKB_CB(*skb)->handle = NULL;
- __skb_queue_tail(&sk->sk_receive_queue, *skb);
- skb_set_owner_r(*skb, sk);
+ TIPC_SKB_CB(skb)->handle = NULL;
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
sk->sk_data_ready(sk);
- *skb = NULL;
- return TIPC_OK;
+ return true;
+
+reject:
+ tipc_sk_respond(sk, skb, err);
+ return false;
}
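
filter_rcv() above adopts the same boolean contract and funnels every failure that needs a response through a single reject label, where tipc_sk_respond() bounces the buffer. A minimal sketch of that shape, assuming hypothetical enqueue/respond helpers rather than the real TIPC ones:

#include <stdbool.h>
#include <stddef.h>

struct buf_sketch { size_t size; int type; };

static bool enqueue(struct buf_sketch *b)              { (void)b; return true; }  /* add to rx queue */
static void respond_err(struct buf_sketch *b, int err) { (void)b; (void)err; }    /* bounce to sender */

static bool filter_rcv_sketch(struct buf_sketch *b, size_t queued, size_t limit)
{
        int err;

        if (b->type < 0)                        /* illegal type: silently drop */
                return false;
        if (queued + b->size >= limit) {        /* no room: reject back to sender */
                err = -1;
                goto reject;
        }
        return enqueue(b);                      /* true => caller accounts the bytes */

reject:
        respond_err(b, err);
        return false;
}
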
/**
@@ -1687,22 +1710,10 @@ static int filter_rcv(struct sock *sk, struct sk_buff **skb)
*/
static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
- int err;
- atomic_t *dcnt;
- u32 dnode;
- struct tipc_sock *tsk = tipc_sk(sk);
- struct net *net = sock_net(sk);
- uint truesize = skb->truesize;
+ unsigned int truesize = skb->truesize;
- err = filter_rcv(sk, &skb);
- if (likely(!skb)) {
- dcnt = &tsk->dupl_rcvcnt;
- if (atomic_read(dcnt) < TIPC_CONN_OVERLOAD_LIMIT)
- atomic_add(truesize, dcnt);
- return 0;
- }
- if (!err || tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, -err))
- tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ if (likely(filter_rcv(sk, skb)))
+ atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt);
return 0;
}
@@ -1712,45 +1723,43 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
* @inputq: list of incoming buffers with potentially different destinations
* @sk: socket where the buffers should be enqueued
* @dport: port number for the socket
- * @_skb: returned buffer to be forwarded or rejected, if applicable
*
* Caller must hold socket lock
- *
- * Returns TIPC_OK if all buffers enqueued, otherwise -TIPC_ERR_OVERLOAD
- * or -TIPC_ERR_NO_PORT
*/
-static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
- u32 dport, struct sk_buff **_skb)
+static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
+ u32 dport)
{
unsigned int lim;
atomic_t *dcnt;
- int err;
struct sk_buff *skb;
unsigned long time_limit = jiffies + 2;
while (skb_queue_len(inputq)) {
if (unlikely(time_after_eq(jiffies, time_limit)))
- return TIPC_OK;
+ return;
+
skb = tipc_skb_dequeue(inputq, dport);
if (unlikely(!skb))
- return TIPC_OK;
+ return;
+
+ /* Add message directly to receive queue if possible */
if (!sock_owned_by_user(sk)) {
- err = filter_rcv(sk, &skb);
- if (likely(!skb))
- continue;
- *_skb = skb;
- return err;
+ filter_rcv(sk, skb);
+ continue;
}
+
+ /* Try backlog, compensating for double-counted bytes */
dcnt = &tipc_sk(sk)->dupl_rcvcnt;
if (sk->sk_backlog.len)
atomic_set(dcnt, 0);
lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt);
if (likely(!sk_add_backlog(sk, skb, lim)))
continue;
- *_skb = skb;
- return -TIPC_ERR_OVERLOAD;
+
+ /* Overload => reject message back to sender */
+ tipc_sk_respond(sk, skb, TIPC_ERR_OVERLOAD);
+ break;
}
- return TIPC_OK;
}
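
tipc_sk_enqueue() now drains the shared input queue under a small time budget (jiffies + 2), delivers directly when the socket lock is uncontended, and otherwise pushes to the backlog with a limit compensated by dupl_rcvcnt; on overload the buffer is rejected back to the sender. The sketch below models only the budgeted-drain shape in plain userspace C, with a 2 ms budget standing in for the two-jiffy one; the queue and helper types are invented for illustration.

#include <stdbool.h>
#include <time.h>

struct item { struct item *next; };
struct queue { struct item *head; };

static struct item *dequeue(struct queue *q)
{
        struct item *it = q->head;
        if (it)
                q->head = it->next;
        return it;
}

static void deliver(struct item *it) { (void)it; }              /* direct receive path  */
static bool backlog(struct item *it) { (void)it; return true; } /* deferred path        */
static void reject(struct item *it)  { (void)it; }              /* bounce to the sender */

static void drain_sketch(struct queue *q, bool owner_busy)
{
        struct timespec start, now;
        clock_gettime(CLOCK_MONOTONIC, &start);

        while (q->head) {
                clock_gettime(CLOCK_MONOTONIC, &now);
                if ((now.tv_sec - start.tv_sec) * 1000000000L +
                    (now.tv_nsec - start.tv_nsec) > 2000000L)
                        return;                 /* budget spent; leave the rest queued */

                struct item *it = dequeue(q);
                if (!owner_busy) {
                        deliver(it);            /* uncontended: straight to receive queue */
                        continue;
                }
                if (!backlog(it))               /* backlog full => reject back to sender */
                        reject(it);
        }
}
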
/**
@@ -1758,49 +1767,46 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
* @inputq: buffer list containing the buffers
* Consumes all buffers in list until inputq is empty
* Note: may be called in multiple threads referring to the same queue
- * Returns 0 if last buffer was accepted, otherwise -EHOSTUNREACH
- * Only node local calls check the return value, sending single-buffer queues
*/
-int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
+void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
{
u32 dnode, dport = 0;
int err;
- struct sk_buff *skb;
struct tipc_sock *tsk;
- struct tipc_net *tn;
struct sock *sk;
+ struct sk_buff *skb;
while (skb_queue_len(inputq)) {
- err = -TIPC_ERR_NO_PORT;
- skb = NULL;
dport = tipc_skb_peek_port(inputq, dport);
tsk = tipc_sk_lookup(net, dport);
+
if (likely(tsk)) {
sk = &tsk->sk;
if (likely(spin_trylock_bh(&sk->sk_lock.slock))) {
- err = tipc_sk_enqueue(inputq, sk, dport, &skb);
+ tipc_sk_enqueue(inputq, sk, dport);
spin_unlock_bh(&sk->sk_lock.slock);
- dport = 0;
}
sock_put(sk);
- } else {
- skb = tipc_skb_dequeue(inputq, dport);
- }
- if (likely(!skb))
continue;
- if (tipc_msg_lookup_dest(net, skb, &dnode, &err))
- goto xmit;
- if (!err) {
- dnode = msg_destnode(buf_msg(skb));
- goto xmit;
}
- tn = net_generic(net, tipc_net_id);
- if (!tipc_msg_reverse(tn->own_addr, skb, &dnode, -err))
+
+ /* No destination socket => dequeue skb if still there */
+ skb = tipc_skb_dequeue(inputq, dport);
+ if (!skb)
+ return;
+
+ /* Try secondary lookup if unresolved named message */
+ err = TIPC_ERR_NO_PORT;
+ if (tipc_msg_lookup_dest(net, skb, &err))
+ goto xmit;
+
+ /* Prepare for message rejection */
+ if (!tipc_msg_reverse(tipc_own_addr(net), &skb, err))
continue;
xmit:
- tipc_link_xmit_skb(net, skb, dnode, dport);
+ dnode = msg_destnode(buf_msg(skb));
+ tipc_node_xmit_skb(net, skb, dnode, dport);
}
- return err ? -EHOSTUNREACH : 0;
}
static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
@@ -2069,7 +2075,10 @@ static int tipc_shutdown(struct socket *sock, int how)
struct net *net = sock_net(sk);
struct tipc_sock *tsk = tipc_sk(sk);
struct sk_buff *skb;
- u32 dnode;
+ u32 dnode = tsk_peer_node(tsk);
+ u32 dport = tsk_peer_port(tsk);
+ u32 onode = tipc_own_addr(net);
+ u32 oport = tsk->portid;
int res;
if (how != SHUT_RDWR)
@@ -2082,6 +2091,8 @@ static int tipc_shutdown(struct socket *sock, int how)
case SS_CONNECTED:
restart:
+ dnode = tsk_peer_node(tsk);
+
/* Disconnect and send a 'FIN+' or 'FIN-' message to peer */
skb = __skb_dequeue(&sk->sk_receive_queue);
if (skb) {
@@ -2089,19 +2100,13 @@ restart:
kfree_skb(skb);
goto restart;
}
- if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode,
- TIPC_CONN_SHUTDOWN))
- tipc_link_xmit_skb(net, skb, dnode,
- tsk->portid);
+ tipc_sk_respond(sk, skb, TIPC_CONN_SHUTDOWN);
} else {
- dnode = tsk_peer_node(tsk);
-
skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
TIPC_CONN_MSG, SHORT_H_SIZE,
- 0, dnode, tsk_own_node(tsk),
- tsk_peer_port(tsk),
- tsk->portid, TIPC_CONN_SHUTDOWN);
- tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ 0, dnode, onode, dport, oport,
+ TIPC_CONN_SHUTDOWN);
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
}
tsk->connected = 0;
sock->state = SS_DISCONNECTING;
@@ -2163,7 +2168,7 @@ static void tipc_sk_timeout(unsigned long data)
}
bh_unlock_sock(sk);
if (skb)
- tipc_link_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
+ tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
exit:
sock_put(sk);
}
diff --git a/kernel/net/tipc/socket.h b/kernel/net/tipc/socket.h
index bf6551389..4241f2206 100644
--- a/kernel/net/tipc/socket.h
+++ b/kernel/net/tipc/socket.h
@@ -44,7 +44,7 @@
SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE))
int tipc_socket_init(void);
void tipc_socket_stop(void);
-int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq);
+void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq);
void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
struct sk_buff_head *inputq);
void tipc_sk_reinit(struct net *net);
diff --git a/kernel/net/tipc/subscr.c b/kernel/net/tipc/subscr.c
index 1c147c869..69ee2eeef 100644
--- a/kernel/net/tipc/subscr.c
+++ b/kernel/net/tipc/subscr.c
@@ -40,16 +40,21 @@
/**
* struct tipc_subscriber - TIPC network topology subscriber
+ * @kref: reference counter to tipc_subscription object
* @conid: connection identifier to server connecting to subscriber
* @lock: control access to subscriber
- * @subscription_list: list of subscription objects for this subscriber
+ * @subscrp_list: list of subscription objects for this subscriber
*/
struct tipc_subscriber {
+ struct kref kref;
int conid;
spinlock_t lock;
- struct list_head subscription_list;
+ struct list_head subscrp_list;
};
+static void tipc_subscrp_delete(struct tipc_subscription *sub);
+static void tipc_subscrb_put(struct tipc_subscriber *subscriber);
+
/**
* htohl - convert value to endianness used by destination
* @in: value to convert
@@ -62,9 +67,9 @@ static u32 htohl(u32 in, int swap)
return swap ? swab32(in) : in;
}
-static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower,
- u32 found_upper, u32 event, u32 port_ref,
- u32 node)
+static void tipc_subscrp_send_event(struct tipc_subscription *sub,
+ u32 found_lower, u32 found_upper,
+ u32 event, u32 port_ref, u32 node)
{
struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
struct tipc_subscriber *subscriber = sub->subscriber;
@@ -82,12 +87,13 @@ static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower,
}
/**
- * tipc_subscr_overlap - test for subscription overlap with the given values
+ * tipc_subscrp_check_overlap - test for subscription overlap with the
+ * given values
*
* Returns 1 if there is overlap, otherwise 0.
*/
-int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower,
- u32 found_upper)
+int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower,
+ u32 found_upper)
{
if (found_lower < sub->seq.lower)
found_lower = sub->seq.lower;
@@ -98,138 +104,121 @@ int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower,
return 1;
}
-/**
- * tipc_subscr_report_overlap - issue event if there is subscription overlap
- *
- * Protected by nameseq.lock in name_table.c
- */
-void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower,
- u32 found_upper, u32 event, u32 port_ref,
- u32 node, int must)
+void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
+ u32 found_upper, u32 event, u32 port_ref,
+ u32 node, int must)
{
- if (!tipc_subscr_overlap(sub, found_lower, found_upper))
+ if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper))
return;
if (!must && !(sub->filter & TIPC_SUB_PORTS))
return;
- subscr_send_event(sub, found_lower, found_upper, event, port_ref, node);
+ tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref,
+ node);
}
-static void subscr_timeout(unsigned long data)
+static void tipc_subscrp_timeout(unsigned long data)
{
struct tipc_subscription *sub = (struct tipc_subscription *)data;
struct tipc_subscriber *subscriber = sub->subscriber;
- struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
- /* The spin lock per subscriber is used to protect its members */
- spin_lock_bh(&subscriber->lock);
+ /* Notify subscriber of timeout */
+ tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper,
+ TIPC_SUBSCR_TIMEOUT, 0, 0);
- /* Validate timeout (in case subscription is being cancelled) */
- if (sub->timeout == TIPC_WAIT_FOREVER) {
- spin_unlock_bh(&subscriber->lock);
- return;
- }
+ spin_lock_bh(&subscriber->lock);
+ tipc_subscrp_delete(sub);
+ spin_unlock_bh(&subscriber->lock);
- /* Unlink subscription from name table */
- tipc_nametbl_unsubscribe(sub);
+ tipc_subscrb_put(subscriber);
+}
- /* Unlink subscription from subscriber */
- list_del(&sub->subscription_list);
+static void tipc_subscrb_kref_release(struct kref *kref)
+{
+ struct tipc_subscriber *subscriber = container_of(kref,
+ struct tipc_subscriber, kref);
- spin_unlock_bh(&subscriber->lock);
+ kfree(subscriber);
+}
- /* Notify subscriber of timeout */
- subscr_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper,
- TIPC_SUBSCR_TIMEOUT, 0, 0);
+static void tipc_subscrb_put(struct tipc_subscriber *subscriber)
+{
+ kref_put(&subscriber->kref, tipc_subscrb_kref_release);
+}
- /* Now destroy subscription */
- kfree(sub);
- atomic_dec(&tn->subscription_count);
+static void tipc_subscrb_get(struct tipc_subscriber *subscriber)
+{
+ kref_get(&subscriber->kref);
}
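
The subscriber now carries a kref: the creator holds one reference, each successfully armed timer takes another via tipc_subscrb_get(), and tipc_subscrb_put() frees the object once the last reference is dropped. A bare-bones userspace analogue using C11 atomics is shown below; the names are hypothetical, and the real kref additionally provides saturation and debugging semantics this sketch ignores.

#include <stdatomic.h>
#include <stdlib.h>

struct subscriber_sketch {
        atomic_int refs;        /* plays the role of the kref added above */
        int conid;
};

static struct subscriber_sketch *subscriber_new(int conid)
{
        struct subscriber_sketch *s = calloc(1, sizeof(*s));
        if (!s)
                return NULL;
        atomic_init(&s->refs, 1);       /* creator holds the first reference */
        s->conid = conid;
        return s;
}

static void subscriber_get(struct subscriber_sketch *s)
{
        atomic_fetch_add(&s->refs, 1);  /* e.g. taken when a timer is armed */
}

static void subscriber_put(struct subscriber_sketch *s)
{
        if (atomic_fetch_sub(&s->refs, 1) == 1)
                free(s);                /* last reference dropped: release */
}
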
-/**
- * subscr_del - delete a subscription within a subscription list
- *
- * Called with subscriber lock held.
- */
-static void subscr_del(struct tipc_subscription *sub)
+static struct tipc_subscriber *tipc_subscrb_create(int conid)
{
- struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+ struct tipc_subscriber *subscriber;
- tipc_nametbl_unsubscribe(sub);
- list_del(&sub->subscription_list);
- kfree(sub);
- atomic_dec(&tn->subscription_count);
+ subscriber = kzalloc(sizeof(*subscriber), GFP_ATOMIC);
+ if (!subscriber) {
+ pr_warn("Subscriber rejected, no memory\n");
+ return NULL;
+ }
+ kref_init(&subscriber->kref);
+ INIT_LIST_HEAD(&subscriber->subscrp_list);
+ subscriber->conid = conid;
+ spin_lock_init(&subscriber->lock);
+
+ return subscriber;
}
-static void subscr_release(struct tipc_subscriber *subscriber)
+static void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
{
- struct tipc_subscription *sub;
- struct tipc_subscription *sub_temp;
+ struct tipc_subscription *sub, *temp;
spin_lock_bh(&subscriber->lock);
-
/* Destroy any existing subscriptions for subscriber */
- list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
- subscription_list) {
- if (sub->timeout != TIPC_WAIT_FOREVER) {
- spin_unlock_bh(&subscriber->lock);
- del_timer_sync(&sub->timer);
- spin_lock_bh(&subscriber->lock);
+ list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
+ subscrp_list) {
+ if (del_timer(&sub->timer)) {
+ tipc_subscrp_delete(sub);
+ tipc_subscrb_put(subscriber);
}
- subscr_del(sub);
}
spin_unlock_bh(&subscriber->lock);
- /* Now destroy subscriber */
- kfree(subscriber);
+ tipc_subscrb_put(subscriber);
}
-/**
- * subscr_cancel - handle subscription cancellation request
- *
- * Called with subscriber lock held. Routine must temporarily release lock
- * to enable the subscription timeout routine to finish without deadlocking;
- * the lock is then reclaimed to allow caller to release it upon return.
- *
- * Note that fields of 's' use subscriber's endianness!
- */
-static void subscr_cancel(struct tipc_subscr *s,
- struct tipc_subscriber *subscriber)
+static void tipc_subscrp_delete(struct tipc_subscription *sub)
{
- struct tipc_subscription *sub;
- struct tipc_subscription *sub_temp;
- int found = 0;
+ struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+
+ tipc_nametbl_unsubscribe(sub);
+ list_del(&sub->subscrp_list);
+ kfree(sub);
+ atomic_dec(&tn->subscription_count);
+}
+
+static void tipc_subscrp_cancel(struct tipc_subscr *s,
+ struct tipc_subscriber *subscriber)
+{
+ struct tipc_subscription *sub, *temp;
+ spin_lock_bh(&subscriber->lock);
/* Find first matching subscription, exit if not found */
- list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
- subscription_list) {
+ list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
+ subscrp_list) {
if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
- found = 1;
+ if (del_timer(&sub->timer)) {
+ tipc_subscrp_delete(sub);
+ tipc_subscrb_put(subscriber);
+ }
break;
}
}
- if (!found)
- return;
-
- /* Cancel subscription timer (if used), then delete subscription */
- if (sub->timeout != TIPC_WAIT_FOREVER) {
- sub->timeout = TIPC_WAIT_FOREVER;
- spin_unlock_bh(&subscriber->lock);
- del_timer_sync(&sub->timer);
- spin_lock_bh(&subscriber->lock);
- }
- subscr_del(sub);
+ spin_unlock_bh(&subscriber->lock);
}
-/**
- * subscr_subscribe - create subscription for subscriber
- *
- * Called with subscriber lock held.
- */
-static int subscr_subscribe(struct net *net, struct tipc_subscr *s,
- struct tipc_subscriber *subscriber,
- struct tipc_subscription **sub_p)
+static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
+ struct tipc_subscriber *subscriber,
+ struct tipc_subscription **sub_p)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_subscription *sub;
@@ -241,7 +230,7 @@ static int subscr_subscribe(struct net *net, struct tipc_subscr *s,
/* Detect & process a subscription cancellation request */
if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
- subscr_cancel(s, subscriber);
+ tipc_subscrp_cancel(s, subscriber);
return 0;
}
@@ -273,62 +262,50 @@ static int subscr_subscribe(struct net *net, struct tipc_subscr *s,
kfree(sub);
return -EINVAL;
}
- list_add(&sub->subscription_list, &subscriber->subscription_list);
+ spin_lock_bh(&subscriber->lock);
+ list_add(&sub->subscrp_list, &subscriber->subscrp_list);
+ spin_unlock_bh(&subscriber->lock);
sub->subscriber = subscriber;
sub->swap = swap;
- memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr));
+ memcpy(&sub->evt.s, s, sizeof(*s));
atomic_inc(&tn->subscription_count);
- if (sub->timeout != TIPC_WAIT_FOREVER) {
- setup_timer(&sub->timer, subscr_timeout, (unsigned long)sub);
- mod_timer(&sub->timer, jiffies + sub->timeout);
- }
+ setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
+ if (sub->timeout != TIPC_WAIT_FOREVER)
+ sub->timeout += jiffies;
+ if (!mod_timer(&sub->timer, sub->timeout))
+ tipc_subscrb_get(subscriber);
*sub_p = sub;
return 0;
}
/* Handle one termination request for the subscriber */
-static void subscr_conn_shutdown_event(int conid, void *usr_data)
+static void tipc_subscrb_shutdown_cb(int conid, void *usr_data)
{
- subscr_release((struct tipc_subscriber *)usr_data);
+ tipc_subscrb_delete((struct tipc_subscriber *)usr_data);
}
/* Handle one request to create a new subscription for the subscriber */
-static void subscr_conn_msg_event(struct net *net, int conid,
- struct sockaddr_tipc *addr, void *usr_data,
- void *buf, size_t len)
+static void tipc_subscrb_rcv_cb(struct net *net, int conid,
+ struct sockaddr_tipc *addr, void *usr_data,
+ void *buf, size_t len)
{
- struct tipc_subscriber *subscriber = usr_data;
+ struct tipc_subscriber *subscrb = usr_data;
struct tipc_subscription *sub = NULL;
struct tipc_net *tn = net_generic(net, tipc_net_id);
- spin_lock_bh(&subscriber->lock);
- subscr_subscribe(net, (struct tipc_subscr *)buf, subscriber, &sub);
- if (sub)
- tipc_nametbl_subscribe(sub);
- else
- tipc_conn_terminate(tn->topsrv, subscriber->conid);
- spin_unlock_bh(&subscriber->lock);
+ if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub))
+ return tipc_conn_terminate(tn->topsrv, subscrb->conid);
+
+ tipc_nametbl_subscribe(sub);
}
/* Handle one request to establish a new subscriber */
-static void *subscr_named_msg_event(int conid)
+static void *tipc_subscrb_connect_cb(int conid)
{
- struct tipc_subscriber *subscriber;
-
- /* Create subscriber object */
- subscriber = kzalloc(sizeof(struct tipc_subscriber), GFP_ATOMIC);
- if (subscriber == NULL) {
- pr_warn("Subscriber rejected, no memory\n");
- return NULL;
- }
- INIT_LIST_HEAD(&subscriber->subscription_list);
- subscriber->conid = conid;
- spin_lock_init(&subscriber->lock);
-
- return (void *)subscriber;
+ return (void *)tipc_subscrb_create(conid);
}
-int tipc_subscr_start(struct net *net)
+int tipc_topsrv_start(struct net *net)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
const char name[] = "topology_server";
@@ -355,9 +332,9 @@ int tipc_subscr_start(struct net *net)
topsrv->imp = TIPC_CRITICAL_IMPORTANCE;
topsrv->type = SOCK_SEQPACKET;
topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr);
- topsrv->tipc_conn_recvmsg = subscr_conn_msg_event;
- topsrv->tipc_conn_new = subscr_named_msg_event;
- topsrv->tipc_conn_shutdown = subscr_conn_shutdown_event;
+ topsrv->tipc_conn_recvmsg = tipc_subscrb_rcv_cb;
+ topsrv->tipc_conn_new = tipc_subscrb_connect_cb;
+ topsrv->tipc_conn_shutdown = tipc_subscrb_shutdown_cb;
strncpy(topsrv->name, name, strlen(name) + 1);
tn->topsrv = topsrv;
@@ -366,7 +343,7 @@ int tipc_subscr_start(struct net *net)
return tipc_server_start(topsrv);
}
-void tipc_subscr_stop(struct net *net)
+void tipc_topsrv_stop(struct net *net)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_server *topsrv = tn->topsrv;
diff --git a/kernel/net/tipc/subscr.h b/kernel/net/tipc/subscr.h
index 33488bd9f..92ee18cc5 100644
--- a/kernel/net/tipc/subscr.h
+++ b/kernel/net/tipc/subscr.h
@@ -54,7 +54,7 @@ struct tipc_subscriber;
* @filter: event filtering to be done for subscription
* @timer: timer governing subscription duration (optional)
* @nameseq_list: adjacent subscriptions in name sequence's subscription list
- * @subscription_list: adjacent subscriptions in subscriber's subscription list
+ * @subscrp_list: adjacent subscriptions in subscriber's subscription list
* @server_ref: object reference of server port associated with subscription
* @swap: indicates if subscriber uses opposite endianness in its messages
* @evt: template for events generated by subscription
@@ -67,17 +67,17 @@ struct tipc_subscription {
u32 filter;
struct timer_list timer;
struct list_head nameseq_list;
- struct list_head subscription_list;
+ struct list_head subscrp_list;
int swap;
struct tipc_event evt;
};
-int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower,
- u32 found_upper);
-void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower,
- u32 found_upper, u32 event, u32 port_ref,
- u32 node, int must);
-int tipc_subscr_start(struct net *net);
-void tipc_subscr_stop(struct net *net);
+int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower,
+ u32 found_upper);
+void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
+ u32 found_lower, u32 found_upper, u32 event,
+ u32 port_ref, u32 node, int must);
+int tipc_topsrv_start(struct net *net);
+void tipc_topsrv_stop(struct net *net);
#endif
diff --git a/kernel/net/tipc/udp_media.c b/kernel/net/tipc/udp_media.c
index 66deebc66..70c03271b 100644
--- a/kernel/net/tipc/udp_media.c
+++ b/kernel/net/tipc/udp_media.c
@@ -48,10 +48,13 @@
#include <linux/tipc_netlink.h>
#include "core.h"
#include "bearer.h"
+#include "msg.h"
/* IANA assigned UDP port */
#define UDP_PORT_DEFAULT 6118
+#define UDP_MIN_HEADROOM 28
+
static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
[TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC},
[TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY,
@@ -153,11 +156,15 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
struct udp_bearer *ub;
struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value;
struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
- struct sk_buff *clone;
struct rtable *rt;
- clone = skb_clone(skb, GFP_ATOMIC);
- skb_set_inner_protocol(clone, htons(ETH_P_TIPC));
+ if (skb_headroom(skb) < UDP_MIN_HEADROOM) {
+ err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
+ if (err)
+ goto tx_error;
+ }
+
+ skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
ub = rcu_dereference_rtnl(b->media_ptr);
if (!ub) {
err = -ENODEV;
@@ -167,7 +174,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
struct flowi4 fl = {
.daddr = dst->ipv4.s_addr,
.saddr = src->ipv4.s_addr,
- .flowi4_mark = clone->mark,
+ .flowi4_mark = skb->mark,
.flowi4_proto = IPPROTO_UDP
};
rt = ip_route_output_key(net, &fl);
@@ -176,7 +183,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
goto tx_error;
}
ttl = ip4_dst_hoplimit(&rt->dst);
- err = udp_tunnel_xmit_skb(rt, ub->ubsock->sk, clone,
+ err = udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb,
src->ipv4.s_addr,
dst->ipv4.s_addr, 0, ttl, 0,
src->udp_port, dst->udp_port,
@@ -194,11 +201,12 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
.saddr = src->ipv6,
.flowi6_proto = IPPROTO_UDP
};
- err = ipv6_stub->ipv6_dst_lookup(ub->ubsock->sk, &ndst, &fl6);
+ err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, &ndst,
+ &fl6);
if (err)
goto tx_error;
ttl = ip6_dst_hoplimit(ndst);
- err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, clone,
+ err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb,
ndst->dev, &src->ipv6,
&dst->ipv6, 0, ttl, src->udp_port,
dst->udp_port, false);
@@ -207,7 +215,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
return err;
tx_error:
- kfree_skb(clone);
+ kfree_skb(skb);
return err;
}
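
Instead of cloning every outgoing skb, the UDP bearer above now expands headroom in place only when fewer than UDP_MIN_HEADROOM bytes are available, then hands the original buffer to the tunnel xmit path. The following is a loose userspace analogue of that check-then-grow step on a manually managed buffer; it is illustrative only and not how pskb_expand_head() is implemented.

#include <stdlib.h>
#include <string.h>

struct pkt {
        unsigned char *base;    /* start of the allocation                */
        unsigned char *data;    /* start of the payload (after headroom)  */
        size_t len;             /* payload length                         */
};

/* Ensure at least 'need' bytes of headroom exist before 'data',
 * reallocating and shifting the payload only when necessary. */
static int ensure_headroom(struct pkt *p, size_t need)
{
        size_t have = (size_t)(p->data - p->base);

        if (have >= need)
                return 0;                       /* fast path: nothing to do */

        unsigned char *nbase = malloc(need + p->len);
        if (!nbase)
                return -1;
        memcpy(nbase + need, p->data, p->len);  /* payload behind the new headroom */
        free(p->base);
        p->base = nbase;
        p->data = nbase + need;
        return 0;
}
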
@@ -216,6 +224,10 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
{
struct udp_bearer *ub;
struct tipc_bearer *b;
+ int usr = msg_user(buf_msg(skb));
+
+ if ((usr == LINK_PROTOCOL) || (usr == NAME_DISTRIBUTOR))
+ skb_linearize(skb);
ub = rcu_dereference_sk_user_data(sk);
if (!ub) {
@@ -424,7 +436,6 @@ static void tipc_udp_disable(struct tipc_bearer *b)
}
if (ub->ubsock)
sock_set_flag(ub->ubsock->sk, SOCK_DEAD);
- RCU_INIT_POINTER(b->media_ptr, NULL);
RCU_INIT_POINTER(ub->bearer, NULL);
/* sock_release need to be done outside of rtnl lock */
diff --git a/kernel/net/unix/af_unix.c b/kernel/net/unix/af_unix.c
index 06430598c..898a53a56 100644
--- a/kernel/net/unix/af_unix.c
+++ b/kernel/net/unix/af_unix.c
@@ -140,12 +140,17 @@ static struct hlist_head *unix_sockets_unbound(void *addr)
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
- memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
+ UNIXCB(skb).secid = scm->secid;
}
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
- scm->secid = *UNIXSID(skb);
+ scm->secid = UNIXCB(skb).secid;
+}
+
+static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
@@ -153,6 +158,11 @@ static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
+
+static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ return true;
+}
#endif /* CONFIG_SECURITY_NETWORK */
/*
@@ -316,9 +326,122 @@ found:
return s;
}
-static inline int unix_writable(struct sock *sk)
+/* Support code for asymmetrically connected dgram sockets
+ *
+ * If a datagram socket is connected to a socket not itself connected
+ * to the first socket (eg, /dev/log), clients may only enqueue more
+ * messages if the present receive queue of the server socket is not
+ * "too large". This means there's a second writeability condition
+ * poll and sendmsg need to test. The dgram recv code will do a wake
+ * up on the peer_wait wait queue of a socket upon reception of a
+ * datagram which needs to be propagated to sleeping would-be writers
+ * since these might not have sent anything so far. This can't be
+ * accomplished via poll_wait because the lifetime of the server
+ * socket might be less than that of its clients if these break their
+ * association with it or if the server socket is closed while clients
+ * are still connected to it and there's no way to inform "a polling
+ * implementation" that it should let go of a certain wait queue
+ *
+ * In order to propagate a wake up, a wait_queue_t of the client
+ * socket is enqueued on the peer_wait queue of the server socket
+ * whose wake function does a wake_up on the ordinary client socket
+ * wait queue. This connection is established whenever a write (or
+ * poll for write) hits the flow control condition and is broken when the
+ * association to the server socket is dissolved or after a wake up
+ * was relayed.
+ */
+
+static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+ void *key)
+{
+ struct unix_sock *u;
+ wait_queue_head_t *u_sleep;
+
+ u = container_of(q, struct unix_sock, peer_wake);
+
+ __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
+ q);
+ u->peer_wake.private = NULL;
+
+ /* relaying can only happen while the wq still exists */
+ u_sleep = sk_sleep(&u->sk);
+ if (u_sleep)
+ wake_up_interruptible_poll(u_sleep, key);
+
+ return 0;
+}
+
+static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
- return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
+ struct unix_sock *u, *u_other;
+ int rc;
+
+ u = unix_sk(sk);
+ u_other = unix_sk(other);
+ rc = 0;
+ spin_lock(&u_other->peer_wait.lock);
+
+ if (!u->peer_wake.private) {
+ u->peer_wake.private = other;
+ __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
+
+ rc = 1;
+ }
+
+ spin_unlock(&u_other->peer_wait.lock);
+ return rc;
+}
+
+static void unix_dgram_peer_wake_disconnect(struct sock *sk,
+ struct sock *other)
+{
+ struct unix_sock *u, *u_other;
+
+ u = unix_sk(sk);
+ u_other = unix_sk(other);
+ spin_lock(&u_other->peer_wait.lock);
+
+ if (u->peer_wake.private == other) {
+ __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
+ u->peer_wake.private = NULL;
+ }
+
+ spin_unlock(&u_other->peer_wait.lock);
+}
+
+static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
+ struct sock *other)
+{
+ unix_dgram_peer_wake_disconnect(sk, other);
+ wake_up_interruptible_poll(sk_sleep(sk),
+ POLLOUT |
+ POLLWRNORM |
+ POLLWRBAND);
+}
+
+/* preconditions:
+ * - unix_peer(sk) == other
+ * - association is stable
+ */
+static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
+{
+ int connected;
+
+ connected = unix_dgram_peer_wake_connect(sk, other);
+
+ if (unix_recvq_full(other))
+ return 1;
+
+ if (connected)
+ unix_dgram_peer_wake_disconnect(sk, other);
+
+ return 0;
+}
+
+static int unix_writable(const struct sock *sk)
+{
+ return sk->sk_state != TCP_LISTEN &&
+ (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
static void unix_write_space(struct sock *sk)
@@ -420,6 +543,8 @@ static void unix_release_sock(struct sock *sk, int embrion)
skpair->sk_state_change(skpair);
sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
}
+
+ unix_dgram_peer_wake_disconnect(sk, skpair);
sock_put(skpair); /* It may now die */
unix_peer(sk) = NULL;
}
@@ -430,6 +555,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
if (state == TCP_LISTEN)
unix_release_sock(skb->sk, 1);
/* passed fds are erased in the kfree_skb hook */
+ UNIXCB(skb).consumed = skb->len;
kfree_skb(skb);
}
@@ -518,6 +644,11 @@ static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
+static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
+ size_t size, int flags);
+static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
+ struct pipe_inode_info *, size_t size,
+ unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
@@ -558,7 +689,8 @@ static const struct proto_ops unix_stream_ops = {
.sendmsg = unix_stream_sendmsg,
.recvmsg = unix_stream_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
+ .sendpage = unix_stream_sendpage,
+ .splice_read = unix_stream_splice_read,
.set_peek_off = unix_set_peek_off,
};
@@ -620,7 +752,7 @@ static struct proto unix_proto = {
*/
static struct lock_class_key af_unix_sk_receive_queue_lock_key;
-static struct sock *unix_create1(struct net *net, struct socket *sock)
+static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
{
struct sock *sk = NULL;
struct unix_sock *u;
@@ -629,7 +761,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
goto out;
- sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
+ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
if (!sk)
goto out;
@@ -648,6 +780,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
INIT_LIST_HEAD(&u->link);
mutex_init(&u->readlock); /* single task reading lock */
init_waitqueue_head(&u->peer_wait);
+ init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
if (sk == NULL)
@@ -688,7 +821,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol,
return -ESOCKTNOSUPPORT;
}
- return unix_create1(net, sock) ? 0 : -ENOMEM;
+ return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
}
static int unix_release(struct socket *sock)
@@ -820,32 +953,20 @@ fail:
return NULL;
}
-static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
+static int unix_mknod(struct dentry *dentry, struct path *path, umode_t mode,
+ struct path *res)
{
- struct dentry *dentry;
- struct path path;
- int err = 0;
- /*
- * Get the parent directory, calculate the hash for last
- * component.
- */
- dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- return err;
+ int err;
- /*
- * All right, let's create it.
- */
- err = security_path_mknod(&path, dentry, mode, 0);
+ err = security_path_mknod(path, dentry, mode, 0);
if (!err) {
- err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
+ err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0);
if (!err) {
- res->mnt = mntget(path.mnt);
+ res->mnt = mntget(path->mnt);
res->dentry = dget(dentry);
}
}
- done_path_create(&path, dentry);
+
return err;
}
@@ -856,10 +977,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct unix_sock *u = unix_sk(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
char *sun_path = sunaddr->sun_path;
- int err;
+ int err, name_err;
unsigned int hash;
struct unix_address *addr;
struct hlist_head *list;
+ struct path path;
+ struct dentry *dentry;
err = -EINVAL;
if (sunaddr->sun_family != AF_UNIX)
@@ -875,14 +998,34 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
addr_len = err;
+ name_err = 0;
+ dentry = NULL;
+ if (sun_path[0]) {
+ /* Get the parent directory, calculate the hash for last
+ * component.
+ */
+ dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
+
+ if (IS_ERR(dentry)) {
+ /* delay report until after 'already bound' check */
+ name_err = PTR_ERR(dentry);
+ dentry = NULL;
+ }
+ }
+
err = mutex_lock_interruptible(&u->readlock);
if (err)
- goto out;
+ goto out_path;
err = -EINVAL;
if (u->addr)
goto out_up;
+ if (name_err) {
+ err = name_err == -EEXIST ? -EADDRINUSE : name_err;
+ goto out_up;
+ }
+
err = -ENOMEM;
addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
if (!addr)
@@ -893,11 +1036,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
addr->hash = hash ^ sk->sk_type;
atomic_set(&addr->refcnt, 1);
- if (sun_path[0]) {
- struct path path;
+ if (dentry) {
+ struct path u_path;
umode_t mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current_umask());
- err = unix_mknod(sun_path, mode, &path);
+ err = unix_mknod(dentry, &path, mode, &u_path);
if (err) {
if (err == -EEXIST)
err = -EADDRINUSE;
@@ -905,9 +1048,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out_up;
}
addr->hash = UNIX_HASH_SIZE;
- hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
+ hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
- u->path = path;
+ u->path = u_path;
list = &unix_socket_table[hash];
} else {
spin_lock(&unix_table_lock);
@@ -930,6 +1073,10 @@ out_unlock:
spin_unlock(&unix_table_lock);
out_up:
mutex_unlock(&u->readlock);
+out_path:
+ if (dentry)
+ done_path_create(&path, dentry);
+
out:
return err;
}
@@ -1015,6 +1162,8 @@ restart:
if (unix_peer(sk)) {
struct sock *old_peer = unix_peer(sk);
unix_peer(sk) = other;
+ unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
+
unix_state_double_unlock(sk, other);
if (other != old_peer)
@@ -1088,7 +1237,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = -ENOMEM;
/* create new sock for complete connection */
- newsk = unix_create1(sock_net(sk), NULL);
+ newsk = unix_create1(sock_net(sk), NULL, 0);
if (newsk == NULL)
goto out;
@@ -1347,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
UNIXCB(skb).fp = NULL;
for (i = scm->fp->count-1; i >= 0; i--)
- unix_notinflight(scm->fp->fp[i]);
+ unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}
static void unix_destruct_scm(struct sk_buff *skb)
@@ -1364,6 +1513,21 @@ static void unix_destruct_scm(struct sk_buff *skb)
sock_wfree(skb);
}
+/*
+ * The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.
+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+ struct user_struct *user = current_user();
+
+ if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
+ return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+ return false;
+}
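
too_many_unix_fds() caps the number of AF_UNIX descriptors a single user may have in flight at roughly the sending task's RLIMIT_NOFILE, with CAP_SYS_RESOURCE or CAP_SYS_ADMIN bypassing the check, and the counter is deliberately read without locking. A compact sketch of the same decision, with a made-up accounting struct:

#include <stdbool.h>

struct account_sketch {
        unsigned long inflight;   /* AF_UNIX fds this user has in flight */
        unsigned long nofile;     /* RLIMIT_NOFILE of the sending task   */
        bool privileged;          /* CAP_SYS_RESOURCE / CAP_SYS_ADMIN     */
};

/* The soft limit only bites for unprivileged users; a slightly stale
 * read of 'inflight' is tolerated, just as in the hunk above. */
static bool too_many_fds(const struct account_sketch *a)
{
        if (a->inflight > a->nofile)
                return !a->privileged;
        return false;
}
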
+
#define MAX_RECURSION_LEVEL 4
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
@@ -1372,6 +1536,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
unsigned char max_level = 0;
int unix_sock_count = 0;
+ if (too_many_unix_fds(current))
+ return -ETOOMANYREFS;
+
for (i = scm->fp->count - 1; i >= 0; i--) {
struct sock *sk = unix_get_socket(scm->fp->fp[i]);
@@ -1393,10 +1560,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
if (!UNIXCB(skb).fp)
return -ENOMEM;
- if (unix_sock_count) {
- for (i = scm->fp->count - 1; i >= 0; i--)
- unix_inflight(scm->fp->fp[i]);
- }
+ for (i = scm->fp->count - 1; i >= 0; i--)
+ unix_inflight(scm->fp->user, scm->fp->fp[i]);
return max_level;
}
@@ -1408,6 +1573,7 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
UNIXCB(skb).uid = scm->creds.uid;
UNIXCB(skb).gid = scm->creds.gid;
UNIXCB(skb).fp = NULL;
+ unix_get_secdata(scm, skb);
if (scm->fp && send_fds)
err = unix_attach_fds(scm, skb);
@@ -1415,6 +1581,14 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
return err;
}
+static bool unix_passcred_enabled(const struct socket *sock,
+ const struct sock *other)
+{
+ return test_bit(SOCK_PASSCRED, &sock->flags) ||
+ !other->sk_socket ||
+ test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
+}
+
/*
* Some apps rely on write() giving SCM_CREDENTIALS
* We include credentials if source or destination socket
@@ -1425,14 +1599,41 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
{
if (UNIXCB(skb).pid)
return;
- if (test_bit(SOCK_PASSCRED, &sock->flags) ||
- !other->sk_socket ||
- test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
+ if (unix_passcred_enabled(sock, other)) {
UNIXCB(skb).pid = get_pid(task_tgid(current));
current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
}
}
+static int maybe_init_creds(struct scm_cookie *scm,
+ struct socket *socket,
+ const struct sock *other)
+{
+ int err;
+ struct msghdr msg = { .msg_controllen = 0 };
+
+ err = scm_send(socket, &msg, scm, false);
+ if (err)
+ return err;
+
+ if (unix_passcred_enabled(socket, other)) {
+ scm->pid = get_pid(task_tgid(current));
+ current_uid_gid(&scm->creds.uid, &scm->creds.gid);
+ }
+ return err;
+}
+
+static bool unix_skb_scm_eq(struct sk_buff *skb,
+ struct scm_cookie *scm)
+{
+ const struct unix_skb_parms *u = &UNIXCB(skb);
+
+ return u->pid == scm->pid &&
+ uid_eq(u->uid, scm->creds.uid) &&
+ gid_eq(u->gid, scm->creds.gid) &&
+ unix_secdata_eq(scm, skb);
+}
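
unix_skb_scm_eq() is the predicate later used to decide whether new stream data may be glued onto the skb at the tail of the receive queue: pid, uid, gid and (when security modules are enabled) the security ID must all match. A trivial userspace equivalent of that comparison, with a hypothetical credentials struct:

#include <stdbool.h>
#include <sys/types.h>

struct creds_sketch {
        pid_t pid;
        uid_t uid;
        gid_t gid;
        unsigned int secid;
};

/* Fragments from different writers (or different security contexts)
 * must never be merged into one message. */
static bool creds_eq(const struct creds_sketch *a, const struct creds_sketch *b)
{
        return a->pid == b->pid &&
               a->uid == b->uid &&
               a->gid == b->gid &&
               a->secid == b->secid;
}
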
+
/*
* Send AF_UNIX data.
*/
@@ -1453,6 +1654,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
struct scm_cookie scm;
int max_level;
int data_len = 0;
+ int sk_locked;
wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
@@ -1503,7 +1705,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
if (err < 0)
goto out_free;
max_level = err + 1;
- unix_get_secdata(&scm, skb);
skb_put(skb, len - data_len);
skb->data_len = data_len;
@@ -1532,12 +1733,14 @@ restart:
goto out_free;
}
+ sk_locked = 0;
unix_state_lock(other);
+restart_locked:
err = -EPERM;
if (!unix_may_send(sk, other))
goto out_unlock;
- if (sock_flag(other, SOCK_DEAD)) {
+ if (unlikely(sock_flag(other, SOCK_DEAD))) {
/*
* Check with 1003.1g - what should
* datagram error
@@ -1545,10 +1748,14 @@ restart:
unix_state_unlock(other);
sock_put(other);
+ if (!sk_locked)
+ unix_state_lock(sk);
+
err = 0;
- unix_state_lock(sk);
if (unix_peer(sk) == other) {
unix_peer(sk) = NULL;
+ unix_dgram_peer_wake_disconnect_wakeup(sk, other);
+
unix_state_unlock(sk);
unix_dgram_disconnected(sk, other);
@@ -1574,21 +1781,43 @@ restart:
goto out_unlock;
}
- if (unix_peer(other) != sk && unix_recvq_full(other)) {
- if (!timeo) {
- err = -EAGAIN;
- goto out_unlock;
+ /* other == sk && unix_peer(other) != sk if
+ * - unix_peer(sk) == NULL, destination address bound to sk
+ * - unix_peer(sk) == sk by time of get but disconnected before lock
+ */
+ if (other != sk &&
+ unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
+ if (timeo) {
+ timeo = unix_wait_for_peer(other, timeo);
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out_free;
+
+ goto restart;
}
- timeo = unix_wait_for_peer(other, timeo);
+ if (!sk_locked) {
+ unix_state_unlock(other);
+ unix_state_double_lock(sk, other);
+ }
- err = sock_intr_errno(timeo);
- if (signal_pending(current))
- goto out_free;
+ if (unix_peer(sk) != other ||
+ unix_dgram_peer_wake_me(sk, other)) {
+ err = -EAGAIN;
+ sk_locked = 1;
+ goto out_unlock;
+ }
- goto restart;
+ if (!sk_locked) {
+ sk_locked = 1;
+ goto restart_locked;
+ }
}
+ if (unlikely(sk_locked))
+ unix_state_unlock(sk);
+
if (sock_flag(other, SOCK_RCVTSTAMP))
__net_timestamp(skb);
maybe_add_creds(skb, sock, other);
@@ -1602,6 +1831,8 @@ restart:
return len;
out_unlock:
+ if (sk_locked)
+ unix_state_unlock(sk);
unix_state_unlock(other);
out_free:
kfree_skb(skb);
@@ -1720,6 +1951,122 @@ out_err:
return sent ? : err;
}
+static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
+ int offset, size_t size, int flags)
+{
+ int err;
+ bool send_sigpipe = false;
+ bool init_scm = true;
+ struct scm_cookie scm;
+ struct sock *other, *sk = socket->sk;
+ struct sk_buff *skb, *newskb = NULL, *tail = NULL;
+
+ if (flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ other = unix_peer(sk);
+ if (!other || sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+
+ if (false) {
+alloc_skb:
+ unix_state_unlock(other);
+ mutex_unlock(&unix_sk(other)->readlock);
+ newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
+ &err, 0);
+ if (!newskb)
+ goto err;
+ }
+
+ /* we must acquire readlock as we modify already present
+ * skbs in the sk_receive_queue and mess with skb->len
+ */
+ err = mutex_lock_interruptible(&unix_sk(other)->readlock);
+ if (err) {
+ err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
+ goto err;
+ }
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ err = -EPIPE;
+ send_sigpipe = true;
+ goto err_unlock;
+ }
+
+ unix_state_lock(other);
+
+ if (sock_flag(other, SOCK_DEAD) ||
+ other->sk_shutdown & RCV_SHUTDOWN) {
+ err = -EPIPE;
+ send_sigpipe = true;
+ goto err_state_unlock;
+ }
+
+ if (init_scm) {
+ err = maybe_init_creds(&scm, socket, other);
+ if (err)
+ goto err_state_unlock;
+ init_scm = false;
+ }
+
+ skb = skb_peek_tail(&other->sk_receive_queue);
+ if (tail && tail == skb) {
+ skb = newskb;
+ } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
+ if (newskb) {
+ skb = newskb;
+ } else {
+ tail = skb;
+ goto alloc_skb;
+ }
+ } else if (newskb) {
+ /* This is the fast path; the spare newskb allocated earlier
+ * is no longer needed, and consuming it here does no harm.
+ */
+ consume_skb(newskb);
+ newskb = NULL;
+ }
+
+ if (skb_append_pagefrags(skb, page, offset, size)) {
+ tail = skb;
+ goto alloc_skb;
+ }
+
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += size;
+ atomic_add(size, &sk->sk_wmem_alloc);
+
+ if (newskb) {
+ err = unix_scm_to_skb(&scm, skb, false);
+ if (err)
+ goto err_state_unlock;
+ spin_lock(&other->sk_receive_queue.lock);
+ __skb_queue_tail(&other->sk_receive_queue, newskb);
+ spin_unlock(&other->sk_receive_queue.lock);
+ }
+
+ unix_state_unlock(other);
+ mutex_unlock(&unix_sk(other)->readlock);
+
+ other->sk_data_ready(other);
+ scm_destroy(&scm);
+ return size;
+
+err_state_unlock:
+ unix_state_unlock(other);
+err_unlock:
+ mutex_unlock(&unix_sk(other)->readlock);
+err:
+ kfree_skb(newskb);
+ if (send_sigpipe && !(flags & MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
+ if (!init_scm)
+ scm_destroy(&scm);
+ return err;
+}
+
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
@@ -1860,8 +2207,9 @@ out:
* Sleep until more data has arrived. But check for races..
*/
static long unix_stream_data_wait(struct sock *sk, long timeo,
- struct sk_buff *last)
+ struct sk_buff *last, unsigned int last_len)
{
+ struct sk_buff *tail;
DEFINE_WAIT(wait);
unix_state_lock(sk);
@@ -1869,14 +2217,16 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (skb_peek_tail(&sk->sk_receive_queue) != last ||
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (tail != last ||
+ (tail && tail->len != last_len) ||
sk->sk_err ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
signal_pending(current) ||
!timeo)
break;
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
unix_state_unlock(sk);
timeo = freezable_schedule_timeout(timeo);
unix_state_lock(sk);
@@ -1884,7 +2234,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
if (sock_flag(sk, SOCK_DEAD))
break;
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
}
finish_wait(sk_sleep(sk), &wait);
@@ -1897,49 +2247,62 @@ static unsigned int unix_skb_len(const struct sk_buff *skb)
return skb->len - UNIXCB(skb).consumed;
}
-static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
- size_t size, int flags)
+struct unix_stream_read_state {
+ int (*recv_actor)(struct sk_buff *, int, int,
+ struct unix_stream_read_state *);
+ struct socket *socket;
+ struct msghdr *msg;
+ struct pipe_inode_info *pipe;
+ size_t size;
+ int flags;
+ unsigned int splice_flags;
+};
+
+static int unix_stream_read_generic(struct unix_stream_read_state *state)
{
struct scm_cookie scm;
+ struct socket *sock = state->socket;
struct sock *sk = sock->sk;
struct unix_sock *u = unix_sk(sk);
- DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
int copied = 0;
+ int flags = state->flags;
int noblock = flags & MSG_DONTWAIT;
- int check_creds = 0;
+ bool check_creds = false;
int target;
int err = 0;
long timeo;
int skip;
+ size_t size = state->size;
+ unsigned int last_len;
- err = -EINVAL;
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
+ err = -EINVAL;
goto out;
+ }
- err = -EOPNOTSUPP;
- if (flags&MSG_OOB)
+ if (unlikely(flags & MSG_OOB)) {
+ err = -EOPNOTSUPP;
goto out;
+ }
- target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
timeo = sock_rcvtimeo(sk, noblock);
+ memset(&scm, 0, sizeof(scm));
+
/* Lock the socket to prevent queue disordering
* while sleeps in memcpy_tomsg
*/
+ mutex_lock(&u->readlock);
- memset(&scm, 0, sizeof(scm));
-
- err = mutex_lock_interruptible(&u->readlock);
- if (unlikely(err)) {
- /* recvmsg() in non blocking mode is supposed to return -EAGAIN
- * sk_rcvtimeo is not honored by mutex_lock_interruptible()
- */
- err = noblock ? -EAGAIN : -ERESTARTSYS;
- goto out;
- }
+ if (flags & MSG_PEEK)
+ skip = sk_peek_offset(sk, flags);
+ else
+ skip = 0;
do {
int chunk;
+ bool drop_skb;
struct sk_buff *skb, *last;
unix_state_lock(sk);
@@ -1948,6 +2311,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
goto unlock;
}
last = skb = skb_peek(&sk->sk_receive_queue);
+ last_len = last ? last->len : 0;
again:
if (skb == NULL) {
unix_sk(sk)->recursion_level = 0;
@@ -1965,29 +2329,33 @@ again:
goto unlock;
unix_state_unlock(sk);
- err = -EAGAIN;
- if (!timeo)
+ if (!timeo) {
+ err = -EAGAIN;
break;
+ }
+
mutex_unlock(&u->readlock);
- timeo = unix_stream_data_wait(sk, timeo, last);
+ timeo = unix_stream_data_wait(sk, timeo, last,
+ last_len);
- if (signal_pending(current)
- || mutex_lock_interruptible(&u->readlock)) {
+ if (signal_pending(current)) {
err = sock_intr_errno(timeo);
+ scm_destroy(&scm);
goto out;
}
+ mutex_lock(&u->readlock);
continue;
- unlock:
+unlock:
unix_state_unlock(sk);
break;
}
- skip = sk_peek_offset(sk, flags);
while (skip >= unix_skb_len(skb)) {
skip -= unix_skb_len(skb);
last = skb;
+ last_len = skb->len;
skb = skb_peek_next(skb, &sk->sk_receive_queue);
if (!skb)
goto again;
@@ -1997,25 +2365,30 @@ again:
if (check_creds) {
/* Never glue messages from different writers */
- if ((UNIXCB(skb).pid != scm.pid) ||
- !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
- !gid_eq(UNIXCB(skb).gid, scm.creds.gid))
+ if (!unix_skb_scm_eq(skb, &scm))
break;
} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
/* Copy credentials */
scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
- check_creds = 1;
+ unix_set_secdata(&scm, skb);
+ check_creds = true;
}
/* Copy address just once */
- if (sunaddr) {
- unix_copy_addr(msg, skb->sk);
+ if (state->msg && state->msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
+ state->msg->msg_name);
+ unix_copy_addr(state->msg, skb->sk);
sunaddr = NULL;
}
chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
- if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
- msg, chunk)) {
+ skb_get(skb);
+ chunk = state->recv_actor(skb, skip, chunk, state);
+ drop_skb = !unix_skb_len(skb);
+ /* skb is only safe to use if !drop_skb */
+ consume_skb(skb);
+ if (chunk < 0) {
if (copied == 0)
copied = -EFAULT;
break;
@@ -2023,6 +2396,18 @@ again:
copied += chunk;
size -= chunk;
+ if (drop_skb) {
+ /* The skb was consumed by a concurrent reader; nothing more
+ * can be expected from it, and it may already have been
+ * dropped from the socket queue, so treat it as invalid and
+ * report a short read.
+ */
+ err = 0;
+ break;
+ }
+
/* Mark read part of skb as used */
if (!(flags & MSG_PEEK)) {
UNIXCB(skb).consumed += chunk;
@@ -2048,16 +2433,101 @@ again:
sk_peek_offset_fwd(sk, chunk);
+ if (UNIXCB(skb).fp)
+ break;
+
+ skip = 0;
+ last = skb;
+ last_len = skb->len;
+ unix_state_lock(sk);
+ skb = skb_peek_next(skb, &sk->sk_receive_queue);
+ if (skb)
+ goto again;
+ unix_state_unlock(sk);
break;
}
} while (size);
mutex_unlock(&u->readlock);
- scm_recv(sock, msg, &scm, flags);
+ if (state->msg)
+ scm_recv(sock, state->msg, &scm, flags);
+ else
+ scm_destroy(&scm);
out:
return copied ? : err;
}
+static int unix_stream_read_actor(struct sk_buff *skb,
+ int skip, int chunk,
+ struct unix_stream_read_state *state)
+{
+ int ret;
+
+ ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
+ state->msg, chunk);
+ return ret ?: chunk;
+}
+
+static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct unix_stream_read_state state = {
+ .recv_actor = unix_stream_read_actor,
+ .socket = sock,
+ .msg = msg,
+ .size = size,
+ .flags = flags
+ };
+
+ return unix_stream_read_generic(&state);
+}
+
+static ssize_t skb_unix_socket_splice(struct sock *sk,
+ struct pipe_inode_info *pipe,
+ struct splice_pipe_desc *spd)
+{
+ int ret;
+ struct unix_sock *u = unix_sk(sk);
+
+ mutex_unlock(&u->readlock);
+ ret = splice_to_pipe(pipe, spd);
+ mutex_lock(&u->readlock);
+
+ return ret;
+}
+
+static int unix_stream_splice_actor(struct sk_buff *skb,
+ int skip, int chunk,
+ struct unix_stream_read_state *state)
+{
+ return skb_splice_bits(skb, state->socket->sk,
+ UNIXCB(skb).consumed + skip,
+ state->pipe, chunk, state->splice_flags,
+ skb_unix_socket_splice);
+}
+
+static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t size, unsigned int flags)
+{
+ struct unix_stream_read_state state = {
+ .recv_actor = unix_stream_splice_actor,
+ .socket = sock,
+ .pipe = pipe,
+ .size = size,
+ .splice_flags = flags,
+ };
+
+ if (unlikely(*ppos))
+ return -ESPIPE;
+
+ if (sock->file->f_flags & O_NONBLOCK ||
+ flags & SPLICE_F_NONBLOCK)
+ state.flags = MSG_DONTWAIT;
+
+ return unix_stream_read_generic(&state);
+}
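
The recvmsg and splice paths above now share unix_stream_read_generic(), which walks the queued data once and delegates per-chunk handling to a recv_actor callback carried in the read state. The sketch below reproduces that actor pattern in standalone C with two toy actors, one copying and one merely consuming; the names, buffer handling and chunk size are invented for illustration.

#include <stddef.h>
#include <string.h>

struct read_state;
typedef int (*recv_actor_t)(const char *chunk, size_t len, struct read_state *st);

struct read_state {
        recv_actor_t actor;     /* what to do with each chunk          */
        char *userbuf;          /* used by the copy actor              */
        size_t copied;
};

/* One generic loop, two consumers: the same walk over queued data can
 * feed either a recvmsg()-style copy or a splice()-style hand-off. */
static int read_generic(struct read_state *st, const char *data, size_t len)
{
        size_t off = 0;
        while (off < len) {
                size_t chunk = len - off > 64 ? 64 : len - off;
                int done = st->actor(data + off, chunk, st);
                if (done < 0)
                        return done;            /* actor failed: short read */
                off += (size_t)done;
        }
        return (int)len;
}

static int copy_actor(const char *chunk, size_t len, struct read_state *st)
{
        memcpy(st->userbuf + st->copied, chunk, len);   /* recvmsg analogue;
                                                           userbuf assumed large enough */
        st->copied += len;
        return (int)len;
}

static int consume_actor(const char *chunk, size_t len, struct read_state *st)
{
        (void)chunk;                                    /* splice analogue: account the  */
        st->copied += len;                              /* bytes without copying them here */
        return (int)len;
}
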
+
static int unix_shutdown(struct socket *sock, int mode)
{
struct sock *sk = sock->sk;
@@ -2231,20 +2701,22 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
return mask;
writable = unix_writable(sk);
- other = unix_peer_get(sk);
- if (other) {
- if (unix_peer(other) != sk) {
- sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
- if (unix_recvq_full(other))
- writable = 0;
- }
- sock_put(other);
+ if (writable) {
+ unix_state_lock(sk);
+
+ other = unix_peer(sk);
+ if (other && unix_peer(other) != sk &&
+ unix_recvq_full(other) &&
+ unix_dgram_peer_wake_me(sk, other))
+ writable = 0;
+
+ unix_state_unlock(sk);
}
if (writable)
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
diff --git a/kernel/net/unix/diag.c b/kernel/net/unix/diag.c
index c512f64d5..4d9679701 100644
--- a/kernel/net/unix/diag.c
+++ b/kernel/net/unix/diag.c
@@ -220,7 +220,7 @@ done:
return skb->len;
}
-static struct sock *unix_lookup_by_ino(int ino)
+static struct sock *unix_lookup_by_ino(unsigned int ino)
{
int i;
struct sock *sk;
diff --git a/kernel/net/unix/garbage.c b/kernel/net/unix/garbage.c
index a73a226f2..6a0d48525 100644
--- a/kernel/net/unix/garbage.c
+++ b/kernel/net/unix/garbage.c
@@ -116,15 +116,15 @@ struct sock *unix_get_socket(struct file *filp)
* descriptor if it is for an AF_UNIX socket.
*/
-void unix_inflight(struct file *fp)
+void unix_inflight(struct user_struct *user, struct file *fp)
{
struct sock *s = unix_get_socket(fp);
+ spin_lock(&unix_gc_lock);
+
if (s) {
struct unix_sock *u = unix_sk(s);
- spin_lock(&unix_gc_lock);
-
if (atomic_long_inc_return(&u->inflight) == 1) {
BUG_ON(!list_empty(&u->link));
list_add_tail(&u->link, &gc_inflight_list);
@@ -132,25 +132,28 @@ void unix_inflight(struct file *fp)
BUG_ON(list_empty(&u->link));
}
unix_tot_inflight++;
- spin_unlock(&unix_gc_lock);
}
+ user->unix_inflight++;
+ spin_unlock(&unix_gc_lock);
}
-void unix_notinflight(struct file *fp)
+void unix_notinflight(struct user_struct *user, struct file *fp)
{
struct sock *s = unix_get_socket(fp);
+ spin_lock(&unix_gc_lock);
+
if (s) {
struct unix_sock *u = unix_sk(s);
- spin_lock(&unix_gc_lock);
BUG_ON(list_empty(&u->link));
if (atomic_long_dec_and_test(&u->inflight))
list_del_init(&u->link);
unix_tot_inflight--;
- spin_unlock(&unix_gc_lock);
}
+ user->unix_inflight--;
+ spin_unlock(&unix_gc_lock);
}
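
unix_inflight()/unix_notinflight() now take the sending user and bump a per-user counter alongside the global unix-socket total, both updated under the single garbage-collection lock. A minimal pthreads analogue of that bookkeeping, with hypothetical structures:

#include <pthread.h>

static pthread_mutex_t gc_lock = PTHREAD_MUTEX_INITIALIZER;

struct user_acct {
        unsigned long unix_inflight;    /* per-user count added by the hunk above */
};

static unsigned long tot_inflight;      /* global count of in-flight unix sockets */

static void mark_inflight(struct user_acct *u, int is_unix_sock)
{
        pthread_mutex_lock(&gc_lock);   /* one lock now covers both counters */
        if (is_unix_sock)
                tot_inflight++;
        u->unix_inflight++;             /* counted for every passed fd */
        pthread_mutex_unlock(&gc_lock);
}

static void unmark_inflight(struct user_acct *u, int is_unix_sock)
{
        pthread_mutex_lock(&gc_lock);
        if (is_unix_sock)
                tot_inflight--;
        u->unix_inflight--;
        pthread_mutex_unlock(&gc_lock);
}
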
static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
diff --git a/kernel/net/vmw_vsock/af_vsock.c b/kernel/net/vmw_vsock/af_vsock.c
index 2ec86e652..7fd1220fb 100644
--- a/kernel/net/vmw_vsock/af_vsock.c
+++ b/kernel/net/vmw_vsock/af_vsock.c
@@ -36,19 +36,20 @@
* not support simultaneous connects (two "client" sockets connecting).
*
* - "Server" sockets are referred to as listener sockets throughout this
- * implementation because they are in the SS_LISTEN state. When a connection
- * request is received (the second kind of socket mentioned above), we create a
- * new socket and refer to it as a pending socket. These pending sockets are
- * placed on the pending connection list of the listener socket. When future
- * packets are received for the address the listener socket is bound to, we
- * check if the source of the packet is from one that has an existing pending
- * connection. If it does, we process the packet for the pending socket. When
- * that socket reaches the connected state, it is removed from the listener
- * socket's pending list and enqueued in the listener socket's accept queue.
- * Callers of accept(2) will accept connected sockets from the listener socket's
- * accept queue. If the socket cannot be accepted for some reason then it is
- * marked rejected. Once the connection is accepted, it is owned by the user
- * process and the responsibility for cleanup falls with that user process.
+ * implementation because they are in the VSOCK_SS_LISTEN state. When a
+ * connection request is received (the second kind of socket mentioned above),
+ * we create a new socket and refer to it as a pending socket. These pending
+ * sockets are placed on the pending connection list of the listener socket.
+ * When future packets are received for the address the listener socket is
+ * bound to, we check if the source of the packet is from one that has an
+ * existing pending connection. If it does, we process the packet for the
+ * pending socket. When that socket reaches the connected state, it is removed
+ * from the listener socket's pending list and enqueued in the listener
+ * socket's accept queue. Callers of accept(2) will accept connected sockets
+ * from the listener socket's accept queue. If the socket cannot be accepted
+ * for some reason then it is marked rejected. Once the connection is
+ * accepted, it is owned by the user process and the responsibility for cleanup
+ * falls with that user process.
*
* - It is possible that these pending sockets will never reach the connected
* state; in fact, we may never receive another packet after the connection
@@ -114,8 +115,6 @@ static struct proto vsock_proto = {
*/
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
-#define SS_LISTEN 255
-
static const struct vsock_transport *transport;
static DEFINE_MUTEX(vsock_register_mutex);
@@ -581,13 +580,14 @@ struct sock *__vsock_create(struct net *net,
struct socket *sock,
struct sock *parent,
gfp_t priority,
- unsigned short type)
+ unsigned short type,
+ int kern)
{
struct sock *sk;
struct vsock_sock *psk;
struct vsock_sock *vsk;
- sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto);
+ sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern);
if (!sk)
return NULL;
@@ -886,7 +886,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock,
/* Listening sockets that have connections in their accept
* queue can be read.
*/
- if (sk->sk_state == SS_LISTEN
+ if (sk->sk_state == VSOCK_SS_LISTEN
&& !vsock_is_accept_queue_empty(sk))
mask |= POLLIN | POLLRDNORM;
@@ -1143,7 +1143,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
err = -EALREADY;
break;
default:
- if ((sk->sk_state == SS_LISTEN) ||
+ if ((sk->sk_state == VSOCK_SS_LISTEN) ||
vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
err = -EINVAL;
goto out;
@@ -1255,7 +1255,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
goto out;
}
- if (listener->sk_state != SS_LISTEN) {
+ if (listener->sk_state != VSOCK_SS_LISTEN) {
err = -EINVAL;
goto out;
}
@@ -1347,7 +1347,7 @@ static int vsock_listen(struct socket *sock, int backlog)
}
sk->sk_max_ack_backlog = backlog;
- sk->sk_state = SS_LISTEN;
+ sk->sk_state = VSOCK_SS_LISTEN;
err = 0;
@@ -1866,7 +1866,7 @@ static int vsock_create(struct net *net, struct socket *sock,
sock->state = SS_UNCONNECTED;
- return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM;
+ return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM;
}
static const struct net_proto_family vsock_family_ops = {
@@ -1947,13 +1947,13 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
err = misc_register(&vsock_device);
if (err) {
pr_err("Failed to register misc device\n");
- return -ENOENT;
+ goto err_reset_transport;
}
err = proto_register(&vsock_proto, 1); /* we want our slab */
if (err) {
pr_err("Cannot register vsock protocol\n");
- goto err_misc_deregister;
+ goto err_deregister_misc;
}
err = sock_register(&vsock_family_ops);
@@ -1968,8 +1968,9 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
err_unregister_proto:
proto_unregister(&vsock_proto);
-err_misc_deregister:
+err_deregister_misc:
misc_deregister(&vsock_device);
+err_reset_transport:
transport = NULL;
err_busy:
mutex_unlock(&vsock_register_mutex);
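Editor's note on the __vsock_core_init() hunks above: the error path now unwinds exactly what has already been set up, in reverse order, including resetting the transport pointer assigned before misc_register(). A compressed sketch of the intended unwind order (function body heavily elided):

	mutex_lock(&vsock_register_mutex);
	if (transport) {
		err = -EBUSY;
		goto err_busy;
	}
	transport = t;

	err = misc_register(&vsock_device);
	if (err)
		goto err_reset_transport;

	err = proto_register(&vsock_proto, 1);
	if (err)
		goto err_deregister_misc;

	err = sock_register(&vsock_family_ops);
	if (err)
		goto err_unregister_proto;

	mutex_unlock(&vsock_register_mutex);
	return 0;

err_unregister_proto:
	proto_unregister(&vsock_proto);
err_deregister_misc:
	misc_deregister(&vsock_device);
err_reset_transport:
	transport = NULL;
err_busy:
	mutex_unlock(&vsock_register_mutex);
	return err;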
diff --git a/kernel/net/vmw_vsock/vmci_transport.c b/kernel/net/vmw_vsock/vmci_transport.c
index c294da095..0a369bb44 100644
--- a/kernel/net/vmw_vsock/vmci_transport.c
+++ b/kernel/net/vmw_vsock/vmci_transport.c
@@ -40,13 +40,11 @@
static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg);
static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg);
-static void vmci_transport_peer_attach_cb(u32 sub_id,
- const struct vmci_event_data *ed,
- void *client_data);
static void vmci_transport_peer_detach_cb(u32 sub_id,
const struct vmci_event_data *ed,
void *client_data);
static void vmci_transport_recv_pkt_work(struct work_struct *work);
+static void vmci_transport_cleanup(struct work_struct *work);
static int vmci_transport_recv_listen(struct sock *sk,
struct vmci_transport_packet *pkt);
static int vmci_transport_recv_connecting_server(
@@ -75,6 +73,10 @@ struct vmci_transport_recv_pkt_info {
struct vmci_transport_packet pkt;
};
+static LIST_HEAD(vmci_transport_cleanup_list);
+static DEFINE_SPINLOCK(vmci_transport_cleanup_lock);
+static DECLARE_WORK(vmci_transport_cleanup_work, vmci_transport_cleanup);
+
static struct vmci_handle vmci_transport_stream_handle = { VMCI_INVALID_ID,
VMCI_INVALID_ID };
static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
@@ -90,8 +92,6 @@ static int PROTOCOL_OVERRIDE = -1;
*/
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
-#define SS_LISTEN 255
-
/* Helper function to convert from a VMCI error code to a VSock error code. */
static s32 vmci_transport_error_to_vsock_error(s32 vmci_error)
@@ -791,44 +791,6 @@ out:
return err;
}
-static void vmci_transport_peer_attach_cb(u32 sub_id,
- const struct vmci_event_data *e_data,
- void *client_data)
-{
- struct sock *sk = client_data;
- const struct vmci_event_payload_qp *e_payload;
- struct vsock_sock *vsk;
-
- e_payload = vmci_event_data_const_payload(e_data);
-
- vsk = vsock_sk(sk);
-
- /* We don't ask for delayed CBs when we subscribe to this event (we
- * pass 0 as flags to vmci_event_subscribe()). VMCI makes no
- * guarantees in that case about what context we might be running in,
- * so it could be BH or process, blockable or non-blockable. So we
- * need to account for all possible contexts here.
- */
- local_bh_disable();
- bh_lock_sock(sk);
-
- /* XXX This is lame, we should provide a way to lookup sockets by
- * qp_handle.
- */
- if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle,
- e_payload->handle)) {
- /* XXX This doesn't do anything, but in the future we may want
- * to set a flag here to verify the attach really did occur and
- * we weren't just sent a datagram claiming it was.
- */
- goto out;
- }
-
-out:
- bh_unlock_sock(sk);
- local_bh_enable();
-}
-
static void vmci_transport_handle_detach(struct sock *sk)
{
struct vsock_sock *vsk;
@@ -871,28 +833,38 @@ static void vmci_transport_peer_detach_cb(u32 sub_id,
const struct vmci_event_data *e_data,
void *client_data)
{
- struct sock *sk = client_data;
+ struct vmci_transport *trans = client_data;
const struct vmci_event_payload_qp *e_payload;
- struct vsock_sock *vsk;
e_payload = vmci_event_data_const_payload(e_data);
- vsk = vsock_sk(sk);
- if (vmci_handle_is_invalid(e_payload->handle))
- return;
-
- /* Same rules for locking as for peer_attach_cb(). */
- local_bh_disable();
- bh_lock_sock(sk);
/* XXX This is lame, we should provide a way to lookup sockets by
* qp_handle.
*/
- if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle,
- e_payload->handle))
- vmci_transport_handle_detach(sk);
+ if (vmci_handle_is_invalid(e_payload->handle) ||
+ vmci_handle_is_equal(trans->qp_handle, e_payload->handle))
+ return;
- bh_unlock_sock(sk);
- local_bh_enable();
+ /* We don't ask for delayed CBs when we subscribe to this event (we
+ * pass 0 as flags to vmci_event_subscribe()). VMCI makes no
+ * guarantees in that case about what context we might be running in,
+ * so it could be BH or process, blockable or non-blockable. So we
+ * need to account for all possible contexts here.
+ */
+ spin_lock_bh(&trans->lock);
+ if (!trans->sk)
+ goto out;
+
+ /* Apart from here, trans->lock is only grabbed as part of sk destruct,
+ * where trans->sk isn't locked.
+ */
+ bh_lock_sock(trans->sk);
+
+ vmci_transport_handle_detach(trans->sk);
+
+ bh_unlock_sock(trans->sk);
+ out:
+ spin_unlock_bh(&trans->lock);
}
static void vmci_transport_qp_resumed_cb(u32 sub_id,
@@ -919,7 +891,7 @@ static void vmci_transport_recv_pkt_work(struct work_struct *work)
vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context;
switch (sk->sk_state) {
- case SS_LISTEN:
+ case VSOCK_SS_LISTEN:
vmci_transport_recv_listen(sk, pkt);
break;
case SS_CONNECTING:
@@ -1022,7 +994,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
}
pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
- sk->sk_type);
+ sk->sk_type, 0);
if (!pending) {
vmci_transport_send_reset(sk, pkt);
return -ENOMEM;
@@ -1181,7 +1153,7 @@ vmci_transport_recv_connecting_server(struct sock *listener,
*/
err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH,
vmci_transport_peer_detach_cb,
- pending, &detach_sub_id);
+ vmci_trans(vpending), &detach_sub_id);
if (err < VMCI_SUCCESS) {
vmci_transport_send_reset(pending, pkt);
err = vmci_transport_error_to_vsock_error(err);
@@ -1262,7 +1234,7 @@ vmci_transport_recv_connecting_server(struct sock *listener,
	/* Callers of accept() will be waiting on the listening socket, not
	/* Callers of accept() will be waiting on the listening socket, not
* the pending socket.
*/
- listener->sk_state_change(listener);
+ listener->sk_data_ready(listener);
return 0;
@@ -1321,7 +1293,6 @@ vmci_transport_recv_connecting_client(struct sock *sk,
|| vmci_trans(vsk)->qpair
|| vmci_trans(vsk)->produce_size != 0
|| vmci_trans(vsk)->consume_size != 0
- || vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID
|| vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) {
skerr = EPROTO;
err = -EINVAL;
@@ -1389,7 +1360,6 @@ static int vmci_transport_recv_connecting_client_negotiate(
struct vsock_sock *vsk;
struct vmci_handle handle;
struct vmci_qp *qpair;
- u32 attach_sub_id;
u32 detach_sub_id;
bool is_local;
u32 flags;
@@ -1399,7 +1369,6 @@ static int vmci_transport_recv_connecting_client_negotiate(
vsk = vsock_sk(sk);
handle = VMCI_INVALID_HANDLE;
- attach_sub_id = VMCI_INVALID_ID;
detach_sub_id = VMCI_INVALID_ID;
/* If we have gotten here then we should be past the point where old
@@ -1444,23 +1413,15 @@ static int vmci_transport_recv_connecting_client_negotiate(
goto destroy;
}
- /* Subscribe to attach and detach events first.
+ /* Subscribe to detach events first.
*
* XXX We attach once for each queue pair created for now so it is easy
* to find the socket (it's provided), but later we should only
* subscribe once and add a way to lookup sockets by queue pair handle.
*/
- err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_ATTACH,
- vmci_transport_peer_attach_cb,
- sk, &attach_sub_id);
- if (err < VMCI_SUCCESS) {
- err = vmci_transport_error_to_vsock_error(err);
- goto destroy;
- }
-
err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH,
vmci_transport_peer_detach_cb,
- sk, &detach_sub_id);
+ vmci_trans(vsk), &detach_sub_id);
if (err < VMCI_SUCCESS) {
err = vmci_transport_error_to_vsock_error(err);
goto destroy;
@@ -1496,7 +1457,6 @@ static int vmci_transport_recv_connecting_client_negotiate(
vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size =
pkt->u.size;
- vmci_trans(vsk)->attach_sub_id = attach_sub_id;
vmci_trans(vsk)->detach_sub_id = detach_sub_id;
vmci_trans(vsk)->notify_ops->process_negotiate(sk);
@@ -1504,9 +1464,6 @@ static int vmci_transport_recv_connecting_client_negotiate(
return 0;
destroy:
- if (attach_sub_id != VMCI_INVALID_ID)
- vmci_event_unsubscribe(attach_sub_id);
-
if (detach_sub_id != VMCI_INVALID_ID)
vmci_event_unsubscribe(detach_sub_id);
@@ -1607,9 +1564,11 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk,
vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE;
vmci_trans(vsk)->qpair = NULL;
vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = 0;
- vmci_trans(vsk)->attach_sub_id = vmci_trans(vsk)->detach_sub_id =
- VMCI_INVALID_ID;
+ vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID;
vmci_trans(vsk)->notify_ops = NULL;
+ INIT_LIST_HEAD(&vmci_trans(vsk)->elem);
+ vmci_trans(vsk)->sk = &vsk->sk;
+ spin_lock_init(&vmci_trans(vsk)->lock);
if (psk) {
vmci_trans(vsk)->queue_pair_size =
vmci_trans(psk)->queue_pair_size;
@@ -1629,29 +1588,57 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk,
return 0;
}
-static void vmci_transport_destruct(struct vsock_sock *vsk)
+static void vmci_transport_free_resources(struct list_head *transport_list)
{
- if (vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID) {
- vmci_event_unsubscribe(vmci_trans(vsk)->attach_sub_id);
- vmci_trans(vsk)->attach_sub_id = VMCI_INVALID_ID;
- }
+ while (!list_empty(transport_list)) {
+ struct vmci_transport *transport =
+ list_first_entry(transport_list, struct vmci_transport,
+ elem);
+ list_del(&transport->elem);
- if (vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) {
- vmci_event_unsubscribe(vmci_trans(vsk)->detach_sub_id);
- vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID;
- }
+ if (transport->detach_sub_id != VMCI_INVALID_ID) {
+ vmci_event_unsubscribe(transport->detach_sub_id);
+ transport->detach_sub_id = VMCI_INVALID_ID;
+ }
- if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) {
- vmci_qpair_detach(&vmci_trans(vsk)->qpair);
- vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE;
- vmci_trans(vsk)->produce_size = 0;
- vmci_trans(vsk)->consume_size = 0;
+ if (!vmci_handle_is_invalid(transport->qp_handle)) {
+ vmci_qpair_detach(&transport->qpair);
+ transport->qp_handle = VMCI_INVALID_HANDLE;
+ transport->produce_size = 0;
+ transport->consume_size = 0;
+ }
+
+ kfree(transport);
}
+}
+
+static void vmci_transport_cleanup(struct work_struct *work)
+{
+ LIST_HEAD(pending);
+
+ spin_lock_bh(&vmci_transport_cleanup_lock);
+ list_replace_init(&vmci_transport_cleanup_list, &pending);
+ spin_unlock_bh(&vmci_transport_cleanup_lock);
+ vmci_transport_free_resources(&pending);
+}
+
+static void vmci_transport_destruct(struct vsock_sock *vsk)
+{
+ /* Ensure that the detach callback doesn't use the sk/vsk
+ * we are about to destruct.
+ */
+ spin_lock_bh(&vmci_trans(vsk)->lock);
+ vmci_trans(vsk)->sk = NULL;
+ spin_unlock_bh(&vmci_trans(vsk)->lock);
if (vmci_trans(vsk)->notify_ops)
vmci_trans(vsk)->notify_ops->socket_destruct(vsk);
- kfree(vsk->trans);
+ spin_lock_bh(&vmci_transport_cleanup_lock);
+ list_add(&vmci_trans(vsk)->elem, &vmci_transport_cleanup_list);
+ spin_unlock_bh(&vmci_transport_cleanup_lock);
+ schedule_work(&vmci_transport_cleanup_work);
+
vsk->trans = NULL;
}
@@ -2146,6 +2133,9 @@ module_init(vmci_transport_init);
static void __exit vmci_transport_exit(void)
{
+ cancel_work_sync(&vmci_transport_cleanup_work);
+ vmci_transport_free_resources(&vmci_transport_cleanup_list);
+
if (!vmci_handle_is_invalid(vmci_transport_stream_handle)) {
if (vmci_datagram_destroy_handle(
vmci_transport_stream_handle) != VMCI_SUCCESS)
@@ -2164,6 +2154,7 @@ module_exit(vmci_transport_exit);
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMCI transport for Virtual Sockets");
+MODULE_VERSION("1.0.2.0-k");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("vmware_vsock");
MODULE_ALIAS_NETPROTO(PF_VSOCK);
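Editor's note on the vmci_transport.c changes above: the detach callback now receives the transport (not the socket) as its context, so the transport must outlive the socket and cannot be freed from the non-sleeping destructor; freeing is deferred to a workqueue instead. A minimal sketch of that deferral pattern, using the list/lock/work trio introduced above (the destructor also clears trans->sk under trans->lock before queueing, omitted here):

static void example_defer_transport_free(struct vmci_transport *trans)
{
	/* called from the (non-sleeping) socket destructor */
	spin_lock_bh(&vmci_transport_cleanup_lock);
	list_add(&trans->elem, &vmci_transport_cleanup_list);
	spin_unlock_bh(&vmci_transport_cleanup_lock);

	schedule_work(&vmci_transport_cleanup_work);
}

static void example_cleanup_worker(struct work_struct *work)
{
	LIST_HEAD(pending);

	/* detach the whole backlog, then release it in process context */
	spin_lock_bh(&vmci_transport_cleanup_lock);
	list_replace_init(&vmci_transport_cleanup_list, &pending);
	spin_unlock_bh(&vmci_transport_cleanup_lock);

	vmci_transport_free_resources(&pending);
}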
diff --git a/kernel/net/vmw_vsock/vmci_transport.h b/kernel/net/vmw_vsock/vmci_transport.h
index ce6c9623d..2ad46f396 100644
--- a/kernel/net/vmw_vsock/vmci_transport.h
+++ b/kernel/net/vmw_vsock/vmci_transport.h
@@ -119,10 +119,12 @@ struct vmci_transport {
u64 queue_pair_size;
u64 queue_pair_min_size;
u64 queue_pair_max_size;
- u32 attach_sub_id;
u32 detach_sub_id;
union vmci_transport_notify notify;
struct vmci_transport_notify_ops *notify_ops;
+ struct list_head elem;
+ struct sock *sk;
+ spinlock_t lock; /* protects sk. */
};
int vmci_transport_register(void);
diff --git a/kernel/net/wimax/op-rfkill.c b/kernel/net/wimax/op-rfkill.c
index 7d730543f..477364ad7 100644
--- a/kernel/net/wimax/op-rfkill.c
+++ b/kernel/net/wimax/op-rfkill.c
@@ -135,8 +135,7 @@ EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw);
* @state: New state of the RF kill switch. %WIMAX_RF_ON radio on,
* %WIMAX_RF_OFF radio off.
*
- * Reports changes in the software RF switch state to the the WiMAX
- * stack.
+ * Reports changes in the software RF switch state to the WiMAX stack.
*
* The main use is during initialization, so the driver can query the
* device for its current software radio kill switch state and feed it
diff --git a/kernel/net/wireless/Kconfig b/kernel/net/wireless/Kconfig
index 4f5543dd2..da72ed32f 100644
--- a/kernel/net/wireless/Kconfig
+++ b/kernel/net/wireless/Kconfig
@@ -174,6 +174,16 @@ config CFG80211_INTERNAL_REGDB
Most distributions have a CRDA package. So if unsure, say N.
+config CFG80211_CRDA_SUPPORT
+ bool "support CRDA" if CFG80211_INTERNAL_REGDB
+ default y
+ depends on CFG80211
+ help
+ You should enable this option unless you know for sure you have no
+ need for it, for example when using internal regdb (above.)
+
+ If unsure, say Y.
+
config CFG80211_WEXT
bool "cfg80211 wireless extensions compatibility" if !CFG80211_WEXT_EXPORT
depends on CFG80211
diff --git a/kernel/net/wireless/chan.c b/kernel/net/wireless/chan.c
index 7aaf7415d..59cabc9bc 100644
--- a/kernel/net/wireless/chan.c
+++ b/kernel/net/wireless/chan.c
@@ -698,19 +698,20 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
EXPORT_SYMBOL(cfg80211_chandef_usable);
/*
- * For GO only, check if the channel can be used under permissive conditions
- * mandated by the some regulatory bodies, i.e., the channel is marked with
- * IEEE80211_CHAN_GO_CONCURRENT and there is an additional station interface
+ * Check if the channel can be used under permissive conditions mandated by
+ * some regulatory bodies, i.e., the channel is marked with
+ * IEEE80211_CHAN_IR_CONCURRENT and there is an additional station interface
* associated to an AP on the same channel or on the same UNII band
* (assuming that the AP is an authorized master).
- * In addition allow the GO to operate on a channel on which indoor operation is
+ * In addition allow operation on a channel on which indoor operation is
* allowed, iff we are currently operating in an indoor environment.
*/
-static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev,
+static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
+ enum nl80211_iftype iftype,
struct ieee80211_channel *chan)
{
- struct wireless_dev *wdev_iter;
- struct wiphy *wiphy = wiphy_idx_to_wiphy(rdev->wiphy_idx);
+ struct wireless_dev *wdev;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
ASSERT_RTNL();
@@ -718,32 +719,48 @@ static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev,
!(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR))
return false;
+ /* only valid for GO and TDLS off-channel (station/p2p-CL) */
+ if (iftype != NL80211_IFTYPE_P2P_GO &&
+ iftype != NL80211_IFTYPE_STATION &&
+ iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return false;
+
if (regulatory_indoor_allowed() &&
(chan->flags & IEEE80211_CHAN_INDOOR_ONLY))
return true;
- if (!(chan->flags & IEEE80211_CHAN_GO_CONCURRENT))
+ if (!(chan->flags & IEEE80211_CHAN_IR_CONCURRENT))
return false;
/*
* Generally, it is possible to rely on another device/driver to allow
- * the GO concurrent relaxation, however, since the device can further
+ * the IR concurrent relaxation, however, since the device can further
* enforce the relaxation (by doing a similar verifications as this),
* and thus fail the GO instantiation, consider only the interfaces of
* the current registered device.
*/
- list_for_each_entry(wdev_iter, &rdev->wdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wdev_list, list) {
struct ieee80211_channel *other_chan = NULL;
int r1, r2;
- if (wdev_iter->iftype != NL80211_IFTYPE_STATION ||
- !netif_running(wdev_iter->netdev))
- continue;
-
- wdev_lock(wdev_iter);
- if (wdev_iter->current_bss)
- other_chan = wdev_iter->current_bss->pub.channel;
- wdev_unlock(wdev_iter);
+ wdev_lock(wdev);
+ if (wdev->iftype == NL80211_IFTYPE_STATION &&
+ wdev->current_bss)
+ other_chan = wdev->current_bss->pub.channel;
+
+ /*
+ * If a GO already operates on the same GO_CONCURRENT channel,
+ * this one (maybe the same one) can beacon as well. We allow
+ * the operation even if the station we relied on with
+ * GO_CONCURRENT is disconnected now. But then we must make sure
+ * we're not outdoor on an indoor-only channel.
+ */
+ if (iftype == NL80211_IFTYPE_P2P_GO &&
+ wdev->iftype == NL80211_IFTYPE_P2P_GO &&
+ wdev->beacon_interval &&
+ !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY))
+ other_chan = wdev->chandef.chan;
+ wdev_unlock(wdev);
if (!other_chan)
continue;
@@ -780,25 +797,18 @@ static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev,
return false;
}
-bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
- struct cfg80211_chan_def *chandef,
- enum nl80211_iftype iftype)
+static bool _cfg80211_reg_can_beacon(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef,
+ enum nl80211_iftype iftype,
+ bool check_no_ir)
{
- struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
bool res;
u32 prohibited_flags = IEEE80211_CHAN_DISABLED |
IEEE80211_CHAN_RADAR;
- trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype);
+ trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir);
- /*
- * Under certain conditions suggested by the some regulatory bodies
- * a GO can operate on channels marked with IEEE80211_NO_IR
- * so set this flag only if such relaxations are not enabled and
- * the conditions are not met.
- */
- if (iftype != NL80211_IFTYPE_P2P_GO ||
- !cfg80211_go_permissive_chan(rdev, chandef->chan))
+ if (check_no_ir)
prohibited_flags |= IEEE80211_CHAN_NO_IR;
if (cfg80211_chandef_dfs_required(wiphy, chandef, iftype) > 0 &&
@@ -812,8 +822,36 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
trace_cfg80211_return_bool(res);
return res;
}
+
+bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef,
+ enum nl80211_iftype iftype)
+{
+ return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, true);
+}
EXPORT_SYMBOL(cfg80211_reg_can_beacon);
+bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef,
+ enum nl80211_iftype iftype)
+{
+ bool check_no_ir;
+
+ ASSERT_RTNL();
+
+ /*
+ * Under certain conditions suggested by some regulatory bodies a
+ * GO/STA can IR on channels marked with IEEE80211_NO_IR. Set this flag
+ * only if such relaxations are not enabled and the conditions are not
+ * met.
+ */
+ check_no_ir = !cfg80211_ir_permissive_chan(wiphy, iftype,
+ chandef->chan);
+
+ return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir);
+}
+EXPORT_SYMBOL(cfg80211_reg_can_beacon_relax);
+
int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev,
struct cfg80211_chan_def *chandef)
{
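Editor's note on the chan.c rework above: the NO_IR relaxation is split out of cfg80211_reg_can_beacon(); the strict helper always enforces IEEE80211_CHAN_NO_IR, while cfg80211_reg_can_beacon_relax() may waive it for P2P GO, station and P2P client interfaces and must be called with the RTNL held because it walks the rdev's wdev list. A hedged sketch of a caller on the AP/GO start path (hypothetical function, mirroring the nl80211 changes later in this patch):

static int example_check_beacon_chan(struct cfg80211_registered_device *rdev,
				     struct wireless_dev *wdev,
				     struct cfg80211_chan_def *chandef)
{
	ASSERT_RTNL();

	/* may waive NO_IR if an IR-concurrent or indoor relaxation applies */
	if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, chandef,
					   wdev->iftype))
		return -EINVAL;

	return 0;
}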
diff --git a/kernel/net/wireless/core.c b/kernel/net/wireless/core.c
index 2a0bbd228..8f0bac7e0 100644
--- a/kernel/net/wireless/core.c
+++ b/kernel/net/wireless/core.c
@@ -407,6 +407,9 @@ use_default_name:
INIT_LIST_HEAD(&rdev->bss_list);
INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done);
INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results);
+ INIT_LIST_HEAD(&rdev->mlme_unreg);
+ spin_lock_init(&rdev->mlme_unreg_lock);
+ INIT_WORK(&rdev->mlme_unreg_wk, cfg80211_mlme_unreg_wk);
INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk,
cfg80211_dfs_channels_update_work);
#ifdef CONFIG_CFG80211_WEXT
@@ -416,6 +419,7 @@ use_default_name:
device_initialize(&rdev->wiphy.dev);
rdev->wiphy.dev.class = &ieee80211_class;
rdev->wiphy.dev.platform_data = rdev;
+ device_enable_async_suspend(&rdev->wiphy.dev);
INIT_LIST_HEAD(&rdev->destroy_list);
spin_lock_init(&rdev->destroy_list_lock);
@@ -457,6 +461,9 @@ use_default_name:
rdev->wiphy.max_num_csa_counters = 1;
+ rdev->wiphy.max_sched_scan_plans = 1;
+ rdev->wiphy.max_sched_scan_plan_interval = U32_MAX;
+
return &rdev->wiphy;
}
EXPORT_SYMBOL(wiphy_new_nm);
@@ -632,7 +639,7 @@ int wiphy_register(struct wiphy *wiphy)
if (WARN_ON(!sband->n_channels))
return -EINVAL;
/*
- * on 60gHz band, there are no legacy rates, so
+ * on 60GHz band, there are no legacy rates, so
* n_bitrates is 0
*/
if (WARN_ON(band != IEEE80211_BAND_60GHZ &&
@@ -802,6 +809,7 @@ void wiphy_unregister(struct wiphy *wiphy)
cancel_delayed_work_sync(&rdev->dfs_update_channels_wk);
flush_work(&rdev->destroy_work);
flush_work(&rdev->sched_scan_stop_wk);
+ flush_work(&rdev->mlme_unreg_wk);
#ifdef CONFIG_PM
if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup)
@@ -855,6 +863,7 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev)
switch (wdev->iftype) {
case NL80211_IFTYPE_P2P_DEVICE:
+ cfg80211_mlme_purge_registrations(wdev);
cfg80211_stop_p2p_device(rdev, wdev);
break;
default:
@@ -1138,6 +1147,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
return NOTIFY_DONE;
}
+ wireless_nlevent_flush();
+
return NOTIFY_OK;
}
diff --git a/kernel/net/wireless/core.h b/kernel/net/wireless/core.h
index 801cd49c5..a618b4b86 100644
--- a/kernel/net/wireless/core.h
+++ b/kernel/net/wireless/core.h
@@ -59,6 +59,10 @@ struct cfg80211_registered_device {
struct list_head beacon_registrations;
spinlock_t beacon_registrations_lock;
+ struct list_head mlme_unreg;
+ spinlock_t mlme_unreg_lock;
+ struct work_struct mlme_unreg_wk;
+
/* protected by RTNL only */
int num_running_ifaces;
int num_running_monitor_ifaces;
@@ -133,6 +137,7 @@ struct cfg80211_internal_bss {
struct list_head list;
struct list_head hidden_list;
struct rb_node rbn;
+ u64 ts_boottime;
unsigned long ts;
unsigned long refcount;
atomic_t hold;
@@ -222,6 +227,7 @@ struct cfg80211_event {
const u8 *ie;
size_t ie_len;
u16 reason;
+ bool locally_generated;
} dc;
struct {
u8 bssid[ETH_ALEN];
@@ -347,6 +353,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
u16 frame_type, const u8 *match_data,
int match_len);
+void cfg80211_mlme_unreg_wk(struct work_struct *wk);
void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
diff --git a/kernel/net/wireless/mlme.c b/kernel/net/wireless/mlme.c
index 7aae329e2..fb44fa3bf 100644
--- a/kernel/net/wireless/mlme.c
+++ b/kernel/net/wireless/mlme.c
@@ -2,6 +2,7 @@
* cfg80211 MLME SAP interface
*
* Copyright (c) 2009, Jouni Malinen <j@w1.fi>
+ * Copyright (c) 2015 Intel Deutschland GmbH
*/
#include <linux/kernel.h>
@@ -389,6 +390,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
struct cfg80211_mgmt_registration {
struct list_head list;
+ struct wireless_dev *wdev;
u32 nlportid;
@@ -399,6 +401,46 @@ struct cfg80211_mgmt_registration {
u8 match[];
};
+static void
+cfg80211_process_mlme_unregistrations(struct cfg80211_registered_device *rdev)
+{
+ struct cfg80211_mgmt_registration *reg;
+
+ ASSERT_RTNL();
+
+ spin_lock_bh(&rdev->mlme_unreg_lock);
+ while ((reg = list_first_entry_or_null(&rdev->mlme_unreg,
+ struct cfg80211_mgmt_registration,
+ list))) {
+ list_del(&reg->list);
+ spin_unlock_bh(&rdev->mlme_unreg_lock);
+
+ if (rdev->ops->mgmt_frame_register) {
+ u16 frame_type = le16_to_cpu(reg->frame_type);
+
+ rdev_mgmt_frame_register(rdev, reg->wdev,
+ frame_type, false);
+ }
+
+ kfree(reg);
+
+ spin_lock_bh(&rdev->mlme_unreg_lock);
+ }
+ spin_unlock_bh(&rdev->mlme_unreg_lock);
+}
+
+void cfg80211_mlme_unreg_wk(struct work_struct *wk)
+{
+ struct cfg80211_registered_device *rdev;
+
+ rdev = container_of(wk, struct cfg80211_registered_device,
+ mlme_unreg_wk);
+
+ rtnl_lock();
+ cfg80211_process_mlme_unregistrations(rdev);
+ rtnl_unlock();
+}
+
int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
u16 frame_type, const u8 *match_data,
int match_len)
@@ -449,11 +491,18 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
nreg->match_len = match_len;
nreg->nlportid = snd_portid;
nreg->frame_type = cpu_to_le16(frame_type);
+ nreg->wdev = wdev;
list_add(&nreg->list, &wdev->mgmt_registrations);
+ spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
+ /* process all unregistrations to avoid driver confusion */
+ cfg80211_process_mlme_unregistrations(rdev);
if (rdev->ops->mgmt_frame_register)
rdev_mgmt_frame_register(rdev, wdev, frame_type, true);
+ return 0;
+
out:
spin_unlock_bh(&wdev->mgmt_registrations_lock);
@@ -472,15 +521,12 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
if (reg->nlportid != nlportid)
continue;
- if (rdev->ops->mgmt_frame_register) {
- u16 frame_type = le16_to_cpu(reg->frame_type);
-
- rdev_mgmt_frame_register(rdev, wdev,
- frame_type, false);
- }
-
list_del(&reg->list);
- kfree(reg);
+ spin_lock(&rdev->mlme_unreg_lock);
+ list_add_tail(&reg->list, &rdev->mlme_unreg);
+ spin_unlock(&rdev->mlme_unreg_lock);
+
+ schedule_work(&rdev->mlme_unreg_wk);
}
spin_unlock_bh(&wdev->mgmt_registrations_lock);
@@ -496,16 +542,15 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
{
- struct cfg80211_mgmt_registration *reg, *tmp;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
spin_lock_bh(&wdev->mgmt_registrations_lock);
-
- list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
- list_del(&reg->list);
- kfree(reg);
- }
-
+ spin_lock(&rdev->mlme_unreg_lock);
+ list_splice_tail_init(&wdev->mgmt_registrations, &rdev->mlme_unreg);
+ spin_unlock(&rdev->mlme_unreg_lock);
spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
+ cfg80211_process_mlme_unregistrations(rdev);
}
int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
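Editor's note on the mlme.c changes above: the driver's mgmt_frame_register() op is no longer called under wdev->mgmt_registrations_lock; an unregistered entry is merely moved to the rdev-wide mlme_unreg list, and the driver notification happens later with RTNL held in a context that may sleep (matching the new might_sleep() in rdev_mgmt_frame_register()). A hedged sketch of the hand-off (hypothetical helper name):

static void example_defer_unregister(struct cfg80211_registered_device *rdev,
				     struct cfg80211_mgmt_registration *reg)
{
	/* caller holds wdev->mgmt_registrations_lock (BH disabled) */
	list_del(&reg->list);

	spin_lock(&rdev->mlme_unreg_lock);
	list_add_tail(&reg->list, &rdev->mlme_unreg);
	spin_unlock(&rdev->mlme_unreg_lock);

	/* the worker takes RTNL, calls the driver op, then frees reg */
	schedule_work(&rdev->mlme_unreg_wk);
}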
diff --git a/kernel/net/wireless/nl80211.c b/kernel/net/wireless/nl80211.c
index dd78445c7..75b0d23ee 100644
--- a/kernel/net/wireless/nl80211.c
+++ b/kernel/net/wireless/nl80211.c
@@ -3,6 +3,7 @@
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2015 Intel Deutschland GmbH
*/
#include <linux/if.h>
@@ -478,6 +479,12 @@ nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = {
[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 },
};
+static const struct nla_policy
+nl80211_plan_policy[NL80211_SCHED_SCAN_PLAN_MAX + 1] = {
+ [NL80211_SCHED_SCAN_PLAN_INTERVAL] = { .type = NLA_U32 },
+ [NL80211_SCHED_SCAN_PLAN_ITERATIONS] = { .type = NLA_U32 },
+};
+
static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct cfg80211_registered_device **rdev,
@@ -639,8 +646,8 @@ static int nl80211_msg_put_channel(struct sk_buff *msg,
if ((chan->flags & IEEE80211_CHAN_INDOOR_ONLY) &&
nla_put_flag(msg, NL80211_FREQUENCY_ATTR_INDOOR_ONLY))
goto nla_put_failure;
- if ((chan->flags & IEEE80211_CHAN_GO_CONCURRENT) &&
- nla_put_flag(msg, NL80211_FREQUENCY_ATTR_GO_CONCURRENT))
+ if ((chan->flags & IEEE80211_CHAN_IR_CONCURRENT) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_IR_CONCURRENT))
goto nla_put_failure;
if ((chan->flags & IEEE80211_CHAN_NO_20MHZ) &&
nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_20MHZ))
@@ -1303,7 +1310,13 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
nla_put_u16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN,
rdev->wiphy.max_sched_scan_ie_len) ||
nla_put_u8(msg, NL80211_ATTR_MAX_MATCH_SETS,
- rdev->wiphy.max_match_sets))
+ rdev->wiphy.max_match_sets) ||
+ nla_put_u32(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_PLANS,
+ rdev->wiphy.max_sched_scan_plans) ||
+ nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_INTERVAL,
+ rdev->wiphy.max_sched_scan_plan_interval) ||
+ nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS,
+ rdev->wiphy.max_sched_scan_plan_iterations))
goto nla_put_failure;
if ((rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) &&
@@ -2003,7 +2016,8 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
switch (iftype) {
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_P2P_GO:
- if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, iftype)) {
+ if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef,
+ iftype)) {
result = -EINVAL;
break;
}
@@ -2320,6 +2334,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
rdev->wiphy.frag_threshold = old_frag_threshold;
rdev->wiphy.rts_threshold = old_rts_threshold;
rdev->wiphy.coverage_class = old_coverage_class;
+ return result;
}
}
return 0;
@@ -2401,6 +2416,16 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
}
}
+ if (rdev->ops->get_tx_power) {
+ int dbm, ret;
+
+ ret = rdev_get_tx_power(rdev, wdev, &dbm);
+ if (ret == 0 &&
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL,
+ DBM_TO_MBM(dbm)))
+ goto nla_put_failure;
+ }
+
if (wdev->ssid_len) {
if (nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid))
goto nla_put_failure;
@@ -3403,16 +3428,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
} else if (!nl80211_get_ap_channel(rdev, &params))
return -EINVAL;
- if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef,
- wdev->iftype))
+ if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef,
+ wdev->iftype))
return -EINVAL;
- if (info->attrs[NL80211_ATTR_ACL_POLICY]) {
- params.acl = parse_acl_data(&rdev->wiphy, info);
- if (IS_ERR(params.acl))
- return PTR_ERR(params.acl);
- }
-
if (info->attrs[NL80211_ATTR_SMPS_MODE]) {
params.smps_mode =
nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]);
@@ -3436,6 +3455,12 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
params.smps_mode = NL80211_SMPS_OFF;
}
+ if (info->attrs[NL80211_ATTR_ACL_POLICY]) {
+ params.acl = parse_acl_data(&rdev->wiphy, info);
+ if (IS_ERR(params.acl))
+ return PTR_ERR(params.acl);
+ }
+
wdev_lock(wdev);
err = rdev_start_ap(rdev, dev, &params);
if (!err) {
@@ -3943,10 +3968,13 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
struct station_parameters *params,
enum cfg80211_station_type statype)
{
- if (params->listen_interval != -1)
+ if (params->listen_interval != -1 &&
+ statype != CFG80211_STA_AP_CLIENT_UNASSOC)
return -EINVAL;
+
if (params->aid &&
- !(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)))
+ !(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) &&
+ statype != CFG80211_STA_AP_CLIENT_UNASSOC)
return -EINVAL;
/* When you run into this, adjust the code below for the new flag */
@@ -3996,7 +4024,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
params->sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER);
}
- if (statype != CFG80211_STA_TDLS_PEER_SETUP) {
+ if (statype != CFG80211_STA_TDLS_PEER_SETUP &&
+ statype != CFG80211_STA_AP_CLIENT_UNASSOC) {
/* reject other things that can't change */
if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD)
return -EINVAL;
@@ -4008,7 +4037,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
return -EINVAL;
}
- if (statype != CFG80211_STA_AP_CLIENT) {
+ if (statype != CFG80211_STA_AP_CLIENT &&
+ statype != CFG80211_STA_AP_CLIENT_UNASSOC) {
if (params->vlan)
return -EINVAL;
}
@@ -4020,6 +4050,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
return -EOPNOTSUPP;
break;
case CFG80211_STA_AP_CLIENT:
+ case CFG80211_STA_AP_CLIENT_UNASSOC:
/* accept only the listed bits */
if (params->sta_flags_mask &
~(BIT(NL80211_STA_FLAG_AUTHORIZED) |
@@ -4061,7 +4092,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
return -EINVAL;
break;
case CFG80211_STA_MESH_PEER_USER:
- if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION)
+ if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION &&
+ params->plink_action != NL80211_PLINK_ACTION_BLOCK)
return -EINVAL;
break;
}
@@ -4216,13 +4248,22 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
memset(&params, 0, sizeof(params));
- params.listen_interval = -1;
-
if (!rdev->ops->change_station)
return -EOPNOTSUPP;
- if (info->attrs[NL80211_ATTR_STA_AID])
- return -EINVAL;
+ /*
+	 * AID and listen_interval properties can be set only for an
+	 * unassociated station. Accept these parameters here; they are
+	 * validated in cfg80211_check_station_change().
+ */
+ if (info->attrs[NL80211_ATTR_PEER_AID])
+ params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
+
+ if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
+ params.listen_interval =
+ nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
+ else
+ params.listen_interval = -1;
if (!info->attrs[NL80211_ATTR_MAC])
return -EINVAL;
@@ -4249,9 +4290,6 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
nla_len(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]);
}
- if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
- return -EINVAL;
-
if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params))
return -EINVAL;
@@ -4915,56 +4953,6 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
return err;
}
-static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = {
- [NL80211_ATTR_REG_RULE_FLAGS] = { .type = NLA_U32 },
- [NL80211_ATTR_FREQ_RANGE_START] = { .type = NLA_U32 },
- [NL80211_ATTR_FREQ_RANGE_END] = { .type = NLA_U32 },
- [NL80211_ATTR_FREQ_RANGE_MAX_BW] = { .type = NLA_U32 },
- [NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN] = { .type = NLA_U32 },
- [NL80211_ATTR_POWER_RULE_MAX_EIRP] = { .type = NLA_U32 },
- [NL80211_ATTR_DFS_CAC_TIME] = { .type = NLA_U32 },
-};
-
-static int parse_reg_rule(struct nlattr *tb[],
- struct ieee80211_reg_rule *reg_rule)
-{
- struct ieee80211_freq_range *freq_range = &reg_rule->freq_range;
- struct ieee80211_power_rule *power_rule = &reg_rule->power_rule;
-
- if (!tb[NL80211_ATTR_REG_RULE_FLAGS])
- return -EINVAL;
- if (!tb[NL80211_ATTR_FREQ_RANGE_START])
- return -EINVAL;
- if (!tb[NL80211_ATTR_FREQ_RANGE_END])
- return -EINVAL;
- if (!tb[NL80211_ATTR_FREQ_RANGE_MAX_BW])
- return -EINVAL;
- if (!tb[NL80211_ATTR_POWER_RULE_MAX_EIRP])
- return -EINVAL;
-
- reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]);
-
- freq_range->start_freq_khz =
- nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]);
- freq_range->end_freq_khz =
- nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]);
- freq_range->max_bandwidth_khz =
- nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]);
-
- power_rule->max_eirp =
- nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]);
-
- if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN])
- power_rule->max_antenna_gain =
- nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]);
-
- if (tb[NL80211_ATTR_DFS_CAC_TIME])
- reg_rule->dfs_cac_ms =
- nla_get_u32(tb[NL80211_ATTR_DFS_CAC_TIME]);
-
- return 0;
-}
-
static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
{
char *data = NULL;
@@ -5596,6 +5584,57 @@ out_err:
return err;
}
+#ifdef CONFIG_CFG80211_CRDA_SUPPORT
+static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = {
+ [NL80211_ATTR_REG_RULE_FLAGS] = { .type = NLA_U32 },
+ [NL80211_ATTR_FREQ_RANGE_START] = { .type = NLA_U32 },
+ [NL80211_ATTR_FREQ_RANGE_END] = { .type = NLA_U32 },
+ [NL80211_ATTR_FREQ_RANGE_MAX_BW] = { .type = NLA_U32 },
+ [NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN] = { .type = NLA_U32 },
+ [NL80211_ATTR_POWER_RULE_MAX_EIRP] = { .type = NLA_U32 },
+ [NL80211_ATTR_DFS_CAC_TIME] = { .type = NLA_U32 },
+};
+
+static int parse_reg_rule(struct nlattr *tb[],
+ struct ieee80211_reg_rule *reg_rule)
+{
+ struct ieee80211_freq_range *freq_range = &reg_rule->freq_range;
+ struct ieee80211_power_rule *power_rule = &reg_rule->power_rule;
+
+ if (!tb[NL80211_ATTR_REG_RULE_FLAGS])
+ return -EINVAL;
+ if (!tb[NL80211_ATTR_FREQ_RANGE_START])
+ return -EINVAL;
+ if (!tb[NL80211_ATTR_FREQ_RANGE_END])
+ return -EINVAL;
+ if (!tb[NL80211_ATTR_FREQ_RANGE_MAX_BW])
+ return -EINVAL;
+ if (!tb[NL80211_ATTR_POWER_RULE_MAX_EIRP])
+ return -EINVAL;
+
+ reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]);
+
+ freq_range->start_freq_khz =
+ nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]);
+ freq_range->end_freq_khz =
+ nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]);
+ freq_range->max_bandwidth_khz =
+ nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]);
+
+ power_rule->max_eirp =
+ nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]);
+
+ if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN])
+ power_rule->max_antenna_gain =
+ nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]);
+
+ if (tb[NL80211_ATTR_DFS_CAC_TIME])
+ reg_rule->dfs_cac_ms =
+ nla_get_u32(tb[NL80211_ATTR_DFS_CAC_TIME]);
+
+ return 0;
+}
+
static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1];
@@ -5672,6 +5711,7 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
kfree(rd);
return r;
}
+#endif /* CONFIG_CFG80211_CRDA_SUPPORT */
static int validate_scan_freqs(struct nlattr *freqs)
{
@@ -5957,14 +5997,100 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
return err;
}
+static int
+nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
+ struct cfg80211_sched_scan_request *request,
+ struct nlattr **attrs)
+{
+ int tmp, err, i = 0;
+ struct nlattr *attr;
+
+ if (!attrs[NL80211_ATTR_SCHED_SCAN_PLANS]) {
+ u32 interval;
+
+ /*
+ * If scan plans are not specified,
+ * %NL80211_ATTR_SCHED_SCAN_INTERVAL must be specified. In this
+ * case one scan plan will be set with the specified scan
+ * interval and infinite number of iterations.
+ */
+ if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
+ return -EINVAL;
+
+ interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
+ if (!interval)
+ return -EINVAL;
+
+ request->scan_plans[0].interval =
+ DIV_ROUND_UP(interval, MSEC_PER_SEC);
+ if (!request->scan_plans[0].interval)
+ return -EINVAL;
+
+ if (request->scan_plans[0].interval >
+ wiphy->max_sched_scan_plan_interval)
+ request->scan_plans[0].interval =
+ wiphy->max_sched_scan_plan_interval;
+
+ return 0;
+ }
+
+ nla_for_each_nested(attr, attrs[NL80211_ATTR_SCHED_SCAN_PLANS], tmp) {
+ struct nlattr *plan[NL80211_SCHED_SCAN_PLAN_MAX + 1];
+
+ if (WARN_ON(i >= n_plans))
+ return -EINVAL;
+
+ err = nla_parse(plan, NL80211_SCHED_SCAN_PLAN_MAX,
+ nla_data(attr), nla_len(attr),
+ nl80211_plan_policy);
+ if (err)
+ return err;
+
+ if (!plan[NL80211_SCHED_SCAN_PLAN_INTERVAL])
+ return -EINVAL;
+
+ request->scan_plans[i].interval =
+ nla_get_u32(plan[NL80211_SCHED_SCAN_PLAN_INTERVAL]);
+ if (!request->scan_plans[i].interval ||
+ request->scan_plans[i].interval >
+ wiphy->max_sched_scan_plan_interval)
+ return -EINVAL;
+
+ if (plan[NL80211_SCHED_SCAN_PLAN_ITERATIONS]) {
+ request->scan_plans[i].iterations =
+ nla_get_u32(plan[NL80211_SCHED_SCAN_PLAN_ITERATIONS]);
+ if (!request->scan_plans[i].iterations ||
+ (request->scan_plans[i].iterations >
+ wiphy->max_sched_scan_plan_iterations))
+ return -EINVAL;
+ } else if (i < n_plans - 1) {
+ /*
+ * All scan plans but the last one must specify
+ * a finite number of iterations
+ */
+ return -EINVAL;
+ }
+
+ i++;
+ }
+
+ /*
+ * The last scan plan must not specify the number of
+ * iterations, it is supposed to run infinitely
+	 * iterations; it is supposed to run indefinitely
+ if (request->scan_plans[n_plans - 1].iterations)
+ return -EINVAL;
+
+ return 0;
+}
+
static struct cfg80211_sched_scan_request *
nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
struct nlattr **attrs)
{
struct cfg80211_sched_scan_request *request;
struct nlattr *attr;
- int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i;
- u32 interval;
+ int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i, n_plans = 0;
enum ieee80211_band band;
size_t ie_len;
struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1];
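Editor's note on nl80211_parse_sched_scan_plans() above: it accepts either the legacy single interval attribute (turned into one infinite plan, clamped to the wiphy limit) or an explicit list of plans in which every plan except the last must carry a finite iteration count. A hedged illustration of a plan set that passes these checks (hypothetical values):

static const struct cfg80211_sched_scan_plan example_plans[] = {
	{ .interval = 10,  .iterations = 3 },	/* three scans, 10 s apart */
	{ .interval = 300, .iterations = 0 },	/* then every 5 min, forever */
};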
@@ -5973,13 +6099,6 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
if (!is_valid_ie_attr(attrs[NL80211_ATTR_IE]))
return ERR_PTR(-EINVAL);
- if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
- return ERR_PTR(-EINVAL);
-
- interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
- if (interval == 0)
- return ERR_PTR(-EINVAL);
-
if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
n_channels = validate_scan_freqs(
attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
@@ -6043,9 +6162,37 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
if (ie_len > wiphy->max_sched_scan_ie_len)
return ERR_PTR(-EINVAL);
+ if (attrs[NL80211_ATTR_SCHED_SCAN_PLANS]) {
+ /*
+ * NL80211_ATTR_SCHED_SCAN_INTERVAL must not be specified since
+ * each scan plan already specifies its own interval
+ */
+ if (attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
+ return ERR_PTR(-EINVAL);
+
+ nla_for_each_nested(attr,
+ attrs[NL80211_ATTR_SCHED_SCAN_PLANS], tmp)
+ n_plans++;
+ } else {
+ /*
+ * The scan interval attribute is kept for backward
+ * compatibility. If no scan plans are specified and sched scan
+ * interval is specified, one scan plan will be set with this
+ * scan interval and infinite number of iterations.
+ */
+ if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
+ return ERR_PTR(-EINVAL);
+
+ n_plans = 1;
+ }
+
+ if (!n_plans || n_plans > wiphy->max_sched_scan_plans)
+ return ERR_PTR(-EINVAL);
+
request = kzalloc(sizeof(*request)
+ sizeof(*request->ssids) * n_ssids
+ sizeof(*request->match_sets) * n_match_sets
+ + sizeof(*request->scan_plans) * n_plans
+ sizeof(*request->channels) * n_channels
+ ie_len, GFP_KERNEL);
if (!request)
@@ -6073,6 +6220,18 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
}
request->n_match_sets = n_match_sets;
+ if (n_match_sets)
+ request->scan_plans = (void *)(request->match_sets +
+ n_match_sets);
+ else if (request->ie)
+ request->scan_plans = (void *)(request->ie + ie_len);
+ else if (n_ssids)
+ request->scan_plans = (void *)(request->ssids + n_ssids);
+ else
+ request->scan_plans = (void *)(request->channels + n_channels);
+
+ request->n_scan_plans = n_plans;
+
i = 0;
if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
/* user specified, bail out if channel not found */
@@ -6235,7 +6394,10 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
request->delay =
nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]);
- request->interval = interval;
+ err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs);
+ if (err)
+ goto out_free;
+
request->scan_start = jiffies;
return request;
@@ -6491,8 +6653,8 @@ skip_beacons:
if (err)
return err;
- if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef,
- wdev->iftype))
+ if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef,
+ wdev->iftype))
return -EINVAL;
err = cfg80211_chandef_dfs_required(wdev->wiphy,
@@ -6588,6 +6750,11 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
jiffies_to_msecs(jiffies - intbss->ts)))
goto nla_put_failure;
+ if (intbss->ts_boottime &&
+ nla_put_u64(msg, NL80211_BSS_LAST_SEEN_BOOTTIME,
+ intbss->ts_boottime))
+ goto nla_put_failure;
+
switch (rdev->wiphy.signal_type) {
case CFG80211_SIGNAL_TYPE_MBM:
if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM, res->signal))
@@ -7388,7 +7555,8 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info)
int err;
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC &&
- dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB)
return -EOPNOTSUPP;
if (!rdev->ops->set_mcast_rate)
@@ -7773,8 +7941,10 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
if (!(rdev->wiphy.features &
NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
- !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+ !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) {
+ kzfree(connkeys);
return -EINVAL;
+ }
connect.flags |= ASSOC_REQ_USE_RRM;
}
@@ -8827,7 +8997,7 @@ static int nl80211_send_wowlan_tcp(struct sk_buff *msg,
static int nl80211_send_wowlan_nd(struct sk_buff *msg,
struct cfg80211_sched_scan_request *req)
{
- struct nlattr *nd, *freqs, *matches, *match;
+ struct nlattr *nd, *freqs, *matches, *match, *scan_plans, *scan_plan;
int i;
if (!req)
@@ -8837,7 +9007,9 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
if (!nd)
return -ENOBUFS;
- if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_INTERVAL, req->interval))
+ if (req->n_scan_plans == 1 &&
+ nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_INTERVAL,
+ req->scan_plans[0].interval * 1000))
return -ENOBUFS;
if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay))
@@ -8864,6 +9036,23 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
nla_nest_end(msg, matches);
}
+ scan_plans = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_PLANS);
+ if (!scan_plans)
+ return -ENOBUFS;
+
+ for (i = 0; i < req->n_scan_plans; i++) {
+ scan_plan = nla_nest_start(msg, i + 1);
+ if (!scan_plan ||
+ nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL,
+ req->scan_plans[i].interval) ||
+ (req->scan_plans[i].iterations &&
+ nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_ITERATIONS,
+ req->scan_plans[i].iterations)))
+ return -ENOBUFS;
+ nla_nest_end(msg, scan_plan);
+ }
+ nla_nest_end(msg, scan_plans);
+
nla_nest_end(msg, nd);
return 0;
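Editor's note on the net-detect reporting hunk above: the scan-plan information is emitted as a nested attribute set, with one numbered nest per plan. Shape of the nest produced by the loop (values hypothetical; the legacy interval attribute is only added when there is exactly one plan):

/*
 *   NL80211_ATTR_SCHED_SCAN_PLANS
 *     1:
 *       NL80211_SCHED_SCAN_PLAN_INTERVAL   = 10
 *       NL80211_SCHED_SCAN_PLAN_ITERATIONS = 3
 *     2:
 *       NL80211_SCHED_SCAN_PLAN_INTERVAL   = 300   (last plan: no iterations)
 */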
@@ -9316,6 +9505,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
if (new_triggers.tcp && new_triggers.tcp->sock)
sock_release(new_triggers.tcp->sock);
kfree(new_triggers.tcp);
+ kfree(new_triggers.nd_config);
return err;
}
#endif
@@ -9934,6 +10124,9 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
if (!wdev->netdev && !wdev->p2p_started)
return -ENETDOWN;
}
+
+ if (!vcmd->doit)
+ return -EOPNOTSUPP;
} else {
wdev = NULL;
}
@@ -9953,6 +10146,193 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
return -EOPNOTSUPP;
}
+static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct cfg80211_registered_device **rdev,
+ struct wireless_dev **wdev)
+{
+ u32 vid, subcmd;
+ unsigned int i;
+ int vcmd_idx = -1;
+ int err;
+ void *data = NULL;
+ unsigned int data_len = 0;
+
+ rtnl_lock();
+
+ if (cb->args[0]) {
+ /* subtract the 1 again here */
+ struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
+ struct wireless_dev *tmp;
+
+ if (!wiphy) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ *rdev = wiphy_to_rdev(wiphy);
+ *wdev = NULL;
+
+ if (cb->args[1]) {
+ list_for_each_entry(tmp, &(*rdev)->wdev_list, list) {
+ if (tmp->identifier == cb->args[1] - 1) {
+ *wdev = tmp;
+ break;
+ }
+ }
+ }
+
+ /* keep rtnl locked in successful case */
+ return 0;
+ }
+
+ err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
+ nl80211_fam.attrbuf, nl80211_fam.maxattr,
+ nl80211_policy);
+ if (err)
+ goto out_unlock;
+
+ if (!nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID] ||
+ !nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk),
+ nl80211_fam.attrbuf);
+ if (IS_ERR(*wdev))
+ *wdev = NULL;
+
+ *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk),
+ nl80211_fam.attrbuf);
+ if (IS_ERR(*rdev)) {
+ err = PTR_ERR(*rdev);
+ goto out_unlock;
+ }
+
+ vid = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID]);
+ subcmd = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);
+
+ for (i = 0; i < (*rdev)->wiphy.n_vendor_commands; i++) {
+ const struct wiphy_vendor_command *vcmd;
+
+ vcmd = &(*rdev)->wiphy.vendor_commands[i];
+
+ if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd)
+ continue;
+
+ if (!vcmd->dumpit) {
+ err = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ vcmd_idx = i;
+ break;
+ }
+
+ if (vcmd_idx < 0) {
+ err = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ if (nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]) {
+ data = nla_data(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]);
+ data_len = nla_len(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]);
+ }
+
+ /* 0 is the first index - add 1 to parse only once */
+ cb->args[0] = (*rdev)->wiphy_idx + 1;
+ /* add 1 to know if it was NULL */
+ cb->args[1] = *wdev ? (*wdev)->identifier + 1 : 0;
+ cb->args[2] = vcmd_idx;
+ cb->args[3] = (unsigned long)data;
+ cb->args[4] = data_len;
+
+ /* keep rtnl locked in successful case */
+ return 0;
+ out_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+ unsigned int vcmd_idx;
+ const struct wiphy_vendor_command *vcmd;
+ void *data;
+ int data_len;
+ int err;
+ struct nlattr *vendor_data;
+
+ err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev);
+ if (err)
+ return err;
+
+ vcmd_idx = cb->args[2];
+ data = (void *)cb->args[3];
+ data_len = cb->args[4];
+ vcmd = &rdev->wiphy.vendor_commands[vcmd_idx];
+
+ if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV |
+ WIPHY_VENDOR_CMD_NEED_NETDEV)) {
+ if (!wdev)
+ return -EINVAL;
+ if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV &&
+ !wdev->netdev)
+ return -EINVAL;
+
+ if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
+ if (wdev->netdev &&
+ !netif_running(wdev->netdev))
+ return -ENETDOWN;
+ if (!wdev->netdev && !wdev->p2p_started)
+ return -ENETDOWN;
+ }
+ }
+
+ while (1) {
+ void *hdr = nl80211hdr_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ NL80211_CMD_VENDOR);
+ if (!hdr)
+ break;
+
+ if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ (wdev && nla_put_u64(skb, NL80211_ATTR_WDEV,
+ wdev_id(wdev)))) {
+ genlmsg_cancel(skb, hdr);
+ break;
+ }
+
+ vendor_data = nla_nest_start(skb, NL80211_ATTR_VENDOR_DATA);
+ if (!vendor_data) {
+ genlmsg_cancel(skb, hdr);
+ break;
+ }
+
+ err = vcmd->dumpit(&rdev->wiphy, wdev, skb, data, data_len,
+ (unsigned long *)&cb->args[5]);
+ nla_nest_end(skb, vendor_data);
+
+ if (err == -ENOBUFS || err == -ENOENT) {
+ genlmsg_cancel(skb, hdr);
+ break;
+ } else if (err) {
+ genlmsg_cancel(skb, hdr);
+ goto out;
+ }
+
+ genlmsg_end(skb, hdr);
+ }
+
+ err = skb->len;
+ out:
+ rtnl_unlock();
+ return err;
+}
+
struct sk_buff *__cfg80211_alloc_reply_skb(struct wiphy *wiphy,
enum nl80211_commands cmd,
enum nl80211_attrs attr,
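Editor's note on the new NL80211_CMD_VENDOR dump path above: the loop repeatedly calls the command's dumpit handler, passing &cb->args[5] through as persistent per-dump storage; the handler signals "message full" with -ENOBUFS and "nothing left" with -ENOENT. A hedged sketch of a matching driver-side handler (hypothetical command and attribute number):

static int example_vendor_dumpit(struct wiphy *wiphy, struct wireless_dev *wdev,
				 struct sk_buff *skb, const void *data,
				 int data_len, unsigned long *storage)
{
	unsigned long idx = *storage;	/* cursor kept in cb->args[5] */

	if (idx >= 4)
		return -ENOENT;		/* nothing left to dump */

	/* 1 is a hypothetical vendor-specific attribute number */
	if (nla_put_u32(skb, 1, (u32)idx))
		return -ENOBUFS;	/* message full, resume next time */

	*storage = idx + 1;
	return 0;
}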
@@ -10169,7 +10549,8 @@ static int nl80211_tdls_channel_switch(struct sk_buff *skb,
return -EINVAL;
/* we will be active on the TDLS link */
- if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, wdev->iftype))
+ if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef,
+ wdev->iftype))
return -EINVAL;
/* don't allow switching to DFS channels */
@@ -10528,6 +10909,7 @@ static const struct genl_ops nl80211_ops[] = {
.internal_flags = NL80211_FLAG_NEED_RTNL,
/* can be retrieved by unprivileged users */
},
+#ifdef CONFIG_CFG80211_CRDA_SUPPORT
{
.cmd = NL80211_CMD_SET_REG,
.doit = nl80211_set_reg,
@@ -10535,6 +10917,7 @@ static const struct genl_ops nl80211_ops[] = {
.flags = GENL_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_RTNL,
},
+#endif
{
.cmd = NL80211_CMD_REQ_SET_REG,
.doit = nl80211_req_set_reg,
@@ -10989,6 +11372,7 @@ static const struct genl_ops nl80211_ops[] = {
{
.cmd = NL80211_CMD_VENDOR,
.doit = nl80211_vendor_cmd,
+ .dumpit = nl80211_vendor_cmd_dump,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL80211_FLAG_NEED_WIPHY |
diff --git a/kernel/net/wireless/rdev-ops.h b/kernel/net/wireless/rdev-ops.h
index c6e83a746..c23516d0f 100644
--- a/kernel/net/wireless/rdev-ops.h
+++ b/kernel/net/wireless/rdev-ops.h
@@ -733,6 +733,8 @@ static inline void
rdev_mgmt_frame_register(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev, u16 frame_type, bool reg)
{
+ might_sleep();
+
trace_rdev_mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg);
rdev->ops->mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg);
trace_rdev_return_void(&rdev->wiphy);
diff --git a/kernel/net/wireless/reg.c b/kernel/net/wireless/reg.c
index 0e347f888..06d050da0 100644
--- a/kernel/net/wireless/reg.c
+++ b/kernel/net/wireless/reg.c
@@ -135,10 +135,7 @@ static spinlock_t reg_indoor_lock;
/* Used to track the userspace process controlling the indoor setting */
static u32 reg_is_indoor_portid;
-/* Max number of consecutive attempts to communicate with CRDA */
-#define REG_MAX_CRDA_TIMEOUTS 10
-
-static u32 reg_crda_timeouts;
+static void restore_regulatory_settings(bool reset_user);
static const struct ieee80211_regdomain *get_cfg80211_regdom(void)
{
@@ -226,9 +223,6 @@ static DECLARE_DELAYED_WORK(reg_check_chans, reg_check_chans_work);
static void reg_todo(struct work_struct *work);
static DECLARE_WORK(reg_work, reg_todo);
-static void reg_timeout_work(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reg_timeout, reg_timeout_work);
-
/* We keep a static world regulatory domain in case of the absence of CRDA */
static const struct ieee80211_regdomain world_regdom = {
.n_reg_rules = 8,
@@ -262,7 +256,7 @@ static const struct ieee80211_regdomain world_regdom = {
REG_RULE(5745-10, 5825+10, 80, 6, 20,
NL80211_RRF_NO_IR),
- /* IEEE 802.11ad (60gHz), channels 1..3 */
+ /* IEEE 802.11ad (60GHz), channels 1..3 */
REG_RULE(56160+2160*1-1080, 56160+2160*3+1080, 2160, 0, 0, 0),
}
};
@@ -279,6 +273,9 @@ MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
static void reg_free_request(struct regulatory_request *request)
{
+ if (request == &core_request_world)
+ return;
+
if (request != get_last_request())
kfree(request);
}
@@ -453,68 +450,70 @@ reg_copy_regd(const struct ieee80211_regdomain *src_regd)
}
#ifdef CONFIG_CFG80211_INTERNAL_REGDB
-struct reg_regdb_search_request {
- char alpha2[2];
+struct reg_regdb_apply_request {
struct list_head list;
+ const struct ieee80211_regdomain *regdom;
};
-static LIST_HEAD(reg_regdb_search_list);
-static DEFINE_MUTEX(reg_regdb_search_mutex);
+static LIST_HEAD(reg_regdb_apply_list);
+static DEFINE_MUTEX(reg_regdb_apply_mutex);
-static void reg_regdb_search(struct work_struct *work)
+static void reg_regdb_apply(struct work_struct *work)
{
- struct reg_regdb_search_request *request;
- const struct ieee80211_regdomain *curdom, *regdom = NULL;
- int i;
+ struct reg_regdb_apply_request *request;
rtnl_lock();
- mutex_lock(&reg_regdb_search_mutex);
- while (!list_empty(&reg_regdb_search_list)) {
- request = list_first_entry(&reg_regdb_search_list,
- struct reg_regdb_search_request,
+ mutex_lock(&reg_regdb_apply_mutex);
+ while (!list_empty(&reg_regdb_apply_list)) {
+ request = list_first_entry(&reg_regdb_apply_list,
+ struct reg_regdb_apply_request,
list);
list_del(&request->list);
- for (i = 0; i < reg_regdb_size; i++) {
- curdom = reg_regdb[i];
-
- if (alpha2_equal(request->alpha2, curdom->alpha2)) {
- regdom = reg_copy_regd(curdom);
- break;
- }
- }
-
+ set_regdom(request->regdom, REGD_SOURCE_INTERNAL_DB);
kfree(request);
}
- mutex_unlock(&reg_regdb_search_mutex);
-
- if (!IS_ERR_OR_NULL(regdom))
- set_regdom(regdom, REGD_SOURCE_INTERNAL_DB);
+ mutex_unlock(&reg_regdb_apply_mutex);
rtnl_unlock();
}
-static DECLARE_WORK(reg_regdb_work, reg_regdb_search);
+static DECLARE_WORK(reg_regdb_work, reg_regdb_apply);
-static void reg_regdb_query(const char *alpha2)
+static int reg_query_builtin(const char *alpha2)
{
- struct reg_regdb_search_request *request;
+ const struct ieee80211_regdomain *regdom = NULL;
+ struct reg_regdb_apply_request *request;
+ unsigned int i;
- if (!alpha2)
- return;
+ for (i = 0; i < reg_regdb_size; i++) {
+ if (alpha2_equal(alpha2, reg_regdb[i]->alpha2)) {
+ regdom = reg_regdb[i];
+ break;
+ }
+ }
- request = kzalloc(sizeof(struct reg_regdb_search_request), GFP_KERNEL);
+ if (!regdom)
+ return -ENODATA;
+
+ request = kzalloc(sizeof(struct reg_regdb_apply_request), GFP_KERNEL);
if (!request)
- return;
+ return -ENOMEM;
- memcpy(request->alpha2, alpha2, 2);
+ request->regdom = reg_copy_regd(regdom);
+ if (IS_ERR_OR_NULL(request->regdom)) {
+ kfree(request);
+ return -ENOMEM;
+ }
- mutex_lock(&reg_regdb_search_mutex);
- list_add_tail(&request->list, &reg_regdb_search_list);
- mutex_unlock(&reg_regdb_search_mutex);
+ mutex_lock(&reg_regdb_apply_mutex);
+ list_add_tail(&request->list, &reg_regdb_apply_list);
+ mutex_unlock(&reg_regdb_apply_mutex);
schedule_work(&reg_regdb_work);
+
+ return 0;
}
/* Feel free to add any other sanity checks here */
@@ -525,9 +524,45 @@ static void reg_regdb_size_check(void)
}
#else
static inline void reg_regdb_size_check(void) {}
-static inline void reg_regdb_query(const char *alpha2) {}
+static inline int reg_query_builtin(const char *alpha2)
+{
+ return -ENODATA;
+}
#endif /* CONFIG_CFG80211_INTERNAL_REGDB */
+#ifdef CONFIG_CFG80211_CRDA_SUPPORT
+/* Max number of consecutive attempts to communicate with CRDA */
+#define REG_MAX_CRDA_TIMEOUTS 10
+
+static u32 reg_crda_timeouts;
+
+static void crda_timeout_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(crda_timeout, crda_timeout_work);
+
+static void crda_timeout_work(struct work_struct *work)
+{
+ REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n");
+ rtnl_lock();
+ reg_crda_timeouts++;
+ restore_regulatory_settings(true);
+ rtnl_unlock();
+}
+
+static void cancel_crda_timeout(void)
+{
+ cancel_delayed_work(&crda_timeout);
+}
+
+static void cancel_crda_timeout_sync(void)
+{
+ cancel_delayed_work_sync(&crda_timeout);
+}
+
+static void reset_crda_timeouts(void)
+{
+ reg_crda_timeouts = 0;
+}
+
/*
* This lets us keep regulatory code which is updated on a regulatory
* basis in userspace.
@@ -536,36 +571,50 @@ static int call_crda(const char *alpha2)
{
char country[12];
char *env[] = { country, NULL };
+ int ret;
snprintf(country, sizeof(country), "COUNTRY=%c%c",
alpha2[0], alpha2[1]);
- /* query internal regulatory database (if it exists) */
- reg_regdb_query(alpha2);
-
if (reg_crda_timeouts > REG_MAX_CRDA_TIMEOUTS) {
- pr_info("Exceeded CRDA call max attempts. Not calling CRDA\n");
+ pr_debug("Exceeded CRDA call max attempts. Not calling CRDA\n");
return -EINVAL;
}
if (!is_world_regdom((char *) alpha2))
- pr_info("Calling CRDA for country: %c%c\n",
+ pr_debug("Calling CRDA for country: %c%c\n",
alpha2[0], alpha2[1]);
else
- pr_info("Calling CRDA to update world regulatory domain\n");
+ pr_debug("Calling CRDA to update world regulatory domain\n");
- return kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, env);
+ ret = kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, env);
+ if (ret)
+ return ret;
+
+ queue_delayed_work(system_power_efficient_wq,
+ &crda_timeout, msecs_to_jiffies(3142));
+ return 0;
+}
+#else
+static inline void cancel_crda_timeout(void) {}
+static inline void cancel_crda_timeout_sync(void) {}
+static inline void reset_crda_timeouts(void) {}
+static inline int call_crda(const char *alpha2)
+{
+ return -ENODATA;
}
+#endif /* CONFIG_CFG80211_CRDA_SUPPORT */
-static enum reg_request_treatment
-reg_call_crda(struct regulatory_request *request)
+static bool reg_query_database(struct regulatory_request *request)
{
- if (call_crda(request->alpha2))
- return REG_REQ_IGNORE;
+ /* query internal regulatory database (if it exists) */
+ if (reg_query_builtin(request->alpha2) == 0)
+ return true;
- queue_delayed_work(system_power_efficient_wq,
- &reg_timeout, msecs_to_jiffies(3142));
- return REG_REQ_OK;
+ if (call_crda(request->alpha2) == 0)
+ return true;
+
+ return false;
}
bool reg_is_valid_request(const char *alpha2)
@@ -989,8 +1038,8 @@ static u32 map_regdom_flags(u32 rd_flags)
channel_flags |= IEEE80211_CHAN_NO_OFDM;
if (rd_flags & NL80211_RRF_NO_OUTDOOR)
channel_flags |= IEEE80211_CHAN_INDOOR_ONLY;
- if (rd_flags & NL80211_RRF_GO_CONCURRENT)
- channel_flags |= IEEE80211_CHAN_GO_CONCURRENT;
+ if (rd_flags & NL80211_RRF_IR_CONCURRENT)
+ channel_flags |= IEEE80211_CHAN_IR_CONCURRENT;
if (rd_flags & NL80211_RRF_NO_HT40MINUS)
channel_flags |= IEEE80211_CHAN_NO_HT40MINUS;
if (rd_flags & NL80211_RRF_NO_HT40PLUS)
@@ -1004,7 +1053,7 @@ static u32 map_regdom_flags(u32 rd_flags)
static const struct ieee80211_reg_rule *
freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq,
- const struct ieee80211_regdomain *regd)
+ const struct ieee80211_regdomain *regd, u32 bw)
{
int i;
bool band_rule_found = false;
@@ -1028,7 +1077,7 @@ freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq,
if (!band_rule_found)
band_rule_found = freq_in_rule_band(fr, center_freq);
- bw_fits = reg_does_bw_fit(fr, center_freq, MHZ_TO_KHZ(20));
+ bw_fits = reg_does_bw_fit(fr, center_freq, bw);
if (band_rule_found && bw_fits)
return rr;
@@ -1040,14 +1089,26 @@ freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq,
return ERR_PTR(-EINVAL);
}
-const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
- u32 center_freq)
+static const struct ieee80211_reg_rule *
+__freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw)
{
- const struct ieee80211_regdomain *regd;
+ const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy);
+ const struct ieee80211_reg_rule *reg_rule = NULL;
+ u32 bw;
- regd = reg_get_regdomain(wiphy);
+ for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) {
+ reg_rule = freq_reg_info_regd(wiphy, center_freq, regd, bw);
+ if (!IS_ERR(reg_rule))
+ return reg_rule;
+ }
+
+ return reg_rule;
+}
- return freq_reg_info_regd(wiphy, center_freq, regd);
+const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
+ u32 center_freq)
+{
+ return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(20));
}
EXPORT_SYMBOL(freq_reg_info);
@@ -1069,11 +1130,11 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator)
}
EXPORT_SYMBOL(reg_initiator_name);
-#ifdef CONFIG_CFG80211_REG_DEBUG
static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd,
struct ieee80211_channel *chan,
const struct ieee80211_reg_rule *reg_rule)
{
+#ifdef CONFIG_CFG80211_REG_DEBUG
const struct ieee80211_power_rule *power_rule;
const struct ieee80211_freq_range *freq_range;
char max_antenna_gain[32], bw[32];
@@ -1084,7 +1145,7 @@ static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd,
if (!power_rule->max_antenna_gain)
snprintf(max_antenna_gain, sizeof(max_antenna_gain), "N/A");
else
- snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d",
+ snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d mBi",
power_rule->max_antenna_gain);
if (reg_rule->flags & NL80211_RRF_AUTO_BW)
@@ -1098,19 +1159,12 @@ static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd,
REG_DBG_PRINT("Updating information on frequency %d MHz with regulatory rule:\n",
chan->center_freq);
- REG_DBG_PRINT("%d KHz - %d KHz @ %s), (%s mBi, %d mBm)\n",
+ REG_DBG_PRINT("(%d KHz - %d KHz @ %s), (%s, %d mBm)\n",
freq_range->start_freq_khz, freq_range->end_freq_khz,
bw, max_antenna_gain,
power_rule->max_eirp);
-}
-#else
-static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd,
- struct ieee80211_channel *chan,
- const struct ieee80211_reg_rule *reg_rule)
-{
- return;
-}
#endif
+}
/*
* Note that right now we assume the desired channel bandwidth
@@ -1176,8 +1230,20 @@ static void handle_channel(struct wiphy *wiphy,
if (reg_rule->flags & NL80211_RRF_AUTO_BW)
max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
+ /* If we get a reg_rule we can assume that at least 5Mhz fit */
+ if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
+ MHZ_TO_KHZ(10)))
+ bw_flags |= IEEE80211_CHAN_NO_10MHZ;
+ if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
+ MHZ_TO_KHZ(20)))
+ bw_flags |= IEEE80211_CHAN_NO_20MHZ;
+
+ if (max_bandwidth_khz < MHZ_TO_KHZ(10))
+ bw_flags |= IEEE80211_CHAN_NO_10MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(20))
+ bw_flags |= IEEE80211_CHAN_NO_20MHZ;
if (max_bandwidth_khz < MHZ_TO_KHZ(40))
- bw_flags = IEEE80211_CHAN_NO_HT40;
+ bw_flags |= IEEE80211_CHAN_NO_HT40;
if (max_bandwidth_khz < MHZ_TO_KHZ(80))
bw_flags |= IEEE80211_CHAN_NO_80MHZ;
if (max_bandwidth_khz < MHZ_TO_KHZ(160))
@@ -1287,7 +1353,8 @@ static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy)
return !(wiphy->features & NL80211_FEATURE_CELL_BASE_REG_HINTS);
}
#else
-static int reg_ignore_cell_hint(struct regulatory_request *pending_request)
+static enum reg_request_treatment
+reg_ignore_cell_hint(struct regulatory_request *pending_request)
{
return REG_REQ_IGNORE;
}
@@ -1589,7 +1656,7 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_P2P_GO:
case NL80211_IFTYPE_ADHOC:
- return cfg80211_reg_can_beacon(wiphy, &chandef, iftype);
+ return cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype);
case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_P2P_CLIENT:
return cfg80211_chandef_usable(wiphy, &chandef,
@@ -1695,9 +1762,15 @@ static void handle_channel_custom(struct wiphy *wiphy,
const struct ieee80211_power_rule *power_rule = NULL;
const struct ieee80211_freq_range *freq_range = NULL;
u32 max_bandwidth_khz;
+ u32 bw;
- reg_rule = freq_reg_info_regd(wiphy, MHZ_TO_KHZ(chan->center_freq),
- regd);
+ for (bw = MHZ_TO_KHZ(20); bw >= MHZ_TO_KHZ(5); bw = bw / 2) {
+ reg_rule = freq_reg_info_regd(wiphy,
+ MHZ_TO_KHZ(chan->center_freq),
+ regd, bw);
+ if (!IS_ERR(reg_rule))
+ break;
+ }
if (IS_ERR(reg_rule)) {
REG_DBG_PRINT("Disabling freq %d MHz as custom regd has no rule that fits it\n",
@@ -1721,8 +1794,20 @@ static void handle_channel_custom(struct wiphy *wiphy,
if (reg_rule->flags & NL80211_RRF_AUTO_BW)
max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
+ /* If we get a reg_rule we can assume that at least 5Mhz fit */
+ if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
+ MHZ_TO_KHZ(10)))
+ bw_flags |= IEEE80211_CHAN_NO_10MHZ;
+ if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
+ MHZ_TO_KHZ(20)))
+ bw_flags |= IEEE80211_CHAN_NO_20MHZ;
+
+ if (max_bandwidth_khz < MHZ_TO_KHZ(10))
+ bw_flags |= IEEE80211_CHAN_NO_10MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(20))
+ bw_flags |= IEEE80211_CHAN_NO_20MHZ;
if (max_bandwidth_khz < MHZ_TO_KHZ(40))
- bw_flags = IEEE80211_CHAN_NO_HT40;
+ bw_flags |= IEEE80211_CHAN_NO_HT40;
if (max_bandwidth_khz < MHZ_TO_KHZ(80))
bw_flags |= IEEE80211_CHAN_NO_80MHZ;
if (max_bandwidth_khz < MHZ_TO_KHZ(160))
@@ -1804,7 +1889,7 @@ static void reg_set_request_processed(void)
need_more_processing = true;
spin_unlock(&reg_requests_lock);
- cancel_delayed_work(&reg_timeout);
+ cancel_crda_timeout();
if (need_more_processing)
schedule_work(&reg_work);
@@ -1816,19 +1901,18 @@ static void reg_set_request_processed(void)
*
* The wireless subsystem can use this function to process
* a regulatory request issued by the regulatory core.
- *
- * Returns one of the different reg request treatment values.
*/
static enum reg_request_treatment
reg_process_hint_core(struct regulatory_request *core_request)
{
+ if (reg_query_database(core_request)) {
+ core_request->intersect = false;
+ core_request->processed = false;
+ reg_update_last_request(core_request);
+ return REG_REQ_OK;
+ }
- core_request->intersect = false;
- core_request->processed = false;
-
- reg_update_last_request(core_request);
-
- return reg_call_crda(core_request);
+ return REG_REQ_IGNORE;
}
static enum reg_request_treatment
@@ -1873,8 +1957,6 @@ __reg_process_hint_user(struct regulatory_request *user_request)
*
* The wireless subsystem can use this function to process
* a regulatory request initiated by userspace.
- *
- * Returns one of the different reg request treatment values.
*/
static enum reg_request_treatment
reg_process_hint_user(struct regulatory_request *user_request)
@@ -1883,20 +1965,20 @@ reg_process_hint_user(struct regulatory_request *user_request)
treatment = __reg_process_hint_user(user_request);
if (treatment == REG_REQ_IGNORE ||
- treatment == REG_REQ_ALREADY_SET) {
- reg_free_request(user_request);
- return treatment;
- }
+ treatment == REG_REQ_ALREADY_SET)
+ return REG_REQ_IGNORE;
user_request->intersect = treatment == REG_REQ_INTERSECT;
user_request->processed = false;
- reg_update_last_request(user_request);
-
- user_alpha2[0] = user_request->alpha2[0];
- user_alpha2[1] = user_request->alpha2[1];
+ if (reg_query_database(user_request)) {
+ reg_update_last_request(user_request);
+ user_alpha2[0] = user_request->alpha2[0];
+ user_alpha2[1] = user_request->alpha2[1];
+ return REG_REQ_OK;
+ }
- return reg_call_crda(user_request);
+ return REG_REQ_IGNORE;
}
static enum reg_request_treatment
@@ -1944,16 +2026,12 @@ reg_process_hint_driver(struct wiphy *wiphy,
case REG_REQ_OK:
break;
case REG_REQ_IGNORE:
- reg_free_request(driver_request);
- return treatment;
+ return REG_REQ_IGNORE;
case REG_REQ_INTERSECT:
- /* fall through */
case REG_REQ_ALREADY_SET:
regd = reg_copy_regd(get_cfg80211_regdom());
- if (IS_ERR(regd)) {
- reg_free_request(driver_request);
+ if (IS_ERR(regd))
return REG_REQ_IGNORE;
- }
tmp = get_wiphy_regdom(wiphy);
rcu_assign_pointer(wiphy->regd, regd);
@@ -1964,8 +2042,6 @@ reg_process_hint_driver(struct wiphy *wiphy,
driver_request->intersect = treatment == REG_REQ_INTERSECT;
driver_request->processed = false;
- reg_update_last_request(driver_request);
-
/*
* Since CRDA will not be called in this case as we already
* have applied the requested regulatory domain before we just
@@ -1973,11 +2049,17 @@ reg_process_hint_driver(struct wiphy *wiphy,
*/
if (treatment == REG_REQ_ALREADY_SET) {
nl80211_send_reg_change_event(driver_request);
+ reg_update_last_request(driver_request);
reg_set_request_processed();
- return treatment;
+ return REG_REQ_ALREADY_SET;
+ }
+
+ if (reg_query_database(driver_request)) {
+ reg_update_last_request(driver_request);
+ return REG_REQ_OK;
}
- return reg_call_crda(driver_request);
+ return REG_REQ_IGNORE;
}
static enum reg_request_treatment
@@ -2043,12 +2125,11 @@ reg_process_hint_country_ie(struct wiphy *wiphy,
case REG_REQ_OK:
break;
case REG_REQ_IGNORE:
- /* fall through */
+ return REG_REQ_IGNORE;
case REG_REQ_ALREADY_SET:
reg_free_request(country_ie_request);
- return treatment;
+ return REG_REQ_ALREADY_SET;
case REG_REQ_INTERSECT:
- reg_free_request(country_ie_request);
/*
* This doesn't happen yet, not sure we
* ever want to support it for this case.
@@ -2060,9 +2141,12 @@ reg_process_hint_country_ie(struct wiphy *wiphy,
country_ie_request->intersect = false;
country_ie_request->processed = false;
- reg_update_last_request(country_ie_request);
+ if (reg_query_database(country_ie_request)) {
+ reg_update_last_request(country_ie_request);
+ return REG_REQ_OK;
+ }
- return reg_call_crda(country_ie_request);
+ return REG_REQ_IGNORE;
}
/* This processes *all* regulatory hints */
@@ -2076,14 +2160,11 @@ static void reg_process_hint(struct regulatory_request *reg_request)
switch (reg_request->initiator) {
case NL80211_REGDOM_SET_BY_CORE:
- reg_process_hint_core(reg_request);
- return;
+ treatment = reg_process_hint_core(reg_request);
+ break;
case NL80211_REGDOM_SET_BY_USER:
treatment = reg_process_hint_user(reg_request);
- if (treatment == REG_REQ_IGNORE ||
- treatment == REG_REQ_ALREADY_SET)
- return;
- return;
+ break;
case NL80211_REGDOM_SET_BY_DRIVER:
if (!wiphy)
goto out_free;
@@ -2099,7 +2180,15 @@ static void reg_process_hint(struct regulatory_request *reg_request)
goto out_free;
}
- /* This is required so that the orig_* parameters are saved */
+ if (treatment == REG_REQ_IGNORE)
+ goto out_free;
+
+ WARN(treatment != REG_REQ_OK && treatment != REG_REQ_ALREADY_SET,
+ "unexpected treatment value %d\n", treatment);
+
+ /* This is required so that the orig_* parameters are saved.
+ * NOTE: treatment must be set for any case that reaches here!
+ */
if (treatment == REG_REQ_ALREADY_SET && wiphy &&
wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
wiphy_update_regulatory(wiphy, reg_request->initiator);
@@ -2304,7 +2393,7 @@ int regulatory_hint_user(const char *alpha2,
request->user_reg_hint_type = user_reg_hint_type;
/* Allow calling CRDA again */
- reg_crda_timeouts = 0;
+ reset_crda_timeouts();
queue_regulatory_request(request);
@@ -2376,7 +2465,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
request->initiator = NL80211_REGDOM_SET_BY_DRIVER;
/* Allow calling CRDA again */
- reg_crda_timeouts = 0;
+ reset_crda_timeouts();
queue_regulatory_request(request);
@@ -2432,7 +2521,7 @@ void regulatory_hint_country_ie(struct wiphy *wiphy, enum ieee80211_band band,
request->country_ie_env = env;
/* Allow calling CRDA again */
- reg_crda_timeouts = 0;
+ reset_crda_timeouts();
queue_regulatory_request(request);
request = NULL;
@@ -2584,7 +2673,7 @@ static void restore_regulatory_settings(bool reset_user)
* settings, user regulatory settings takes precedence.
*/
if (is_an_alpha2(alpha2))
- regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER);
+ regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER);
spin_lock(&reg_requests_lock);
list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list);
@@ -2833,11 +2922,8 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd,
}
request_wiphy = wiphy_idx_to_wiphy(driver_request->wiphy_idx);
- if (!request_wiphy) {
- queue_delayed_work(system_power_efficient_wq,
- &reg_timeout, 0);
+ if (!request_wiphy)
return -ENODEV;
- }
if (!driver_request->intersect) {
if (request_wiphy->regd)
@@ -2894,11 +2980,8 @@ static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd,
}
request_wiphy = wiphy_idx_to_wiphy(country_ie_request->wiphy_idx);
- if (!request_wiphy) {
- queue_delayed_work(system_power_efficient_wq,
- &reg_timeout, 0);
+ if (!request_wiphy)
return -ENODEV;
- }
if (country_ie_request->intersect)
return -EINVAL;
@@ -2925,7 +3008,7 @@ int set_regdom(const struct ieee80211_regdomain *rd,
}
if (regd_src == REGD_SOURCE_CRDA)
- reg_crda_timeouts = 0;
+ reset_crda_timeouts();
lr = get_last_request();
@@ -2946,6 +3029,7 @@ int set_regdom(const struct ieee80211_regdomain *rd,
break;
default:
WARN(1, "invalid initiator %d\n", lr->initiator);
+ kfree(rd);
return -EINVAL;
}
@@ -3082,15 +3166,6 @@ void wiphy_regulatory_deregister(struct wiphy *wiphy)
lr->country_ie_env = ENVIRON_ANY;
}
-static void reg_timeout_work(struct work_struct *work)
-{
- REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n");
- rtnl_lock();
- reg_crda_timeouts++;
- restore_regulatory_settings(true);
- rtnl_unlock();
-}
-
/*
* See http://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii, for
* UNII band definitions
@@ -3147,8 +3222,10 @@ int __init regulatory_init(void)
/* We always try to get an update for the static regdomain */
err = regulatory_hint_core(cfg80211_world_regdom->alpha2);
if (err) {
- if (err == -ENOMEM)
+ if (err == -ENOMEM) {
+ platform_device_unregister(reg_pdev);
return err;
+ }
/*
* N.B. kobject_uevent_env() can fail mainly for when we're out
* memory which is handled and propagated appropriately above
@@ -3176,7 +3253,7 @@ void regulatory_exit(void)
struct reg_beacon *reg_beacon, *btmp;
cancel_work_sync(&reg_work);
- cancel_delayed_work_sync(&reg_timeout);
+ cancel_crda_timeout_sync();
cancel_delayed_work_sync(&reg_check_chans);
/* Lock to suppress warnings */
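
Two patterns recur in the reg.c changes above: the rule lookup now retries with halving bandwidths down to a minimum, and the channel handlers OR the NO_10MHZ/NO_20MHZ/NO_HT40 flags together instead of overwriting earlier ones. A small standalone sketch of both, using invented flag values rather than the IEEE80211_CHAN_* constants:

#include <stdio.h>

#define CHAN_NO_10MHZ  0x1
#define CHAN_NO_20MHZ  0x2
#define CHAN_NO_HT40   0x4

/* widest width (kHz) that still fits the rule, halving from 20 MHz */
static unsigned int widest_fitting_bw(unsigned int rule_max_bw_khz,
                                      unsigned int min_bw_khz)
{
    unsigned int bw;

    for (bw = 20000; bw >= min_bw_khz; bw /= 2)
        if (bw <= rule_max_bw_khz)
            return bw;
    return 0;   /* nothing fits */
}

/* accumulate "no N MHz" channel flags, OR-ing so none are lost */
static unsigned int bw_flags_for_rule(unsigned int rule_max_bw_khz)
{
    unsigned int flags = 0;

    if (rule_max_bw_khz < 10000)
        flags |= CHAN_NO_10MHZ;
    if (rule_max_bw_khz < 20000)
        flags |= CHAN_NO_20MHZ;
    if (rule_max_bw_khz < 40000)
        flags |= CHAN_NO_HT40;
    return flags;
}

int main(void)
{
    printf("10 MHz rule: widest %u kHz, flags 0x%x\n",
           widest_fitting_bw(10000, 5000), bw_flags_for_rule(10000));
    return 0;
}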
diff --git a/kernel/net/wireless/scan.c b/kernel/net/wireless/scan.c
index 3a50aa255..14d5369eb 100644
--- a/kernel/net/wireless/scan.c
+++ b/kernel/net/wireless/scan.c
@@ -266,8 +266,7 @@ void __cfg80211_sched_scan_results(struct work_struct *wk)
spin_lock_bh(&rdev->bss_lock);
__cfg80211_bss_expire(rdev, request->scan_start);
spin_unlock_bh(&rdev->bss_lock);
- request->scan_start =
- jiffies + msecs_to_jiffies(request->interval);
+ request->scan_start = jiffies;
}
nl80211_send_sched_scan_results(rdev, request->dev);
}
@@ -839,6 +838,7 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
found->pub.signal = tmp->pub.signal;
found->pub.capability = tmp->pub.capability;
found->ts = tmp->ts;
+ found->ts_boottime = tmp->ts_boottime;
} else {
struct cfg80211_internal_bss *new;
struct cfg80211_internal_bss *hidden;
@@ -938,14 +938,13 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
}
/* Returned bss is reference counted and must be cleaned up appropriately. */
-struct cfg80211_bss*
-cfg80211_inform_bss_width(struct wiphy *wiphy,
- struct ieee80211_channel *rx_channel,
- enum nl80211_bss_scan_width scan_width,
- enum cfg80211_bss_frame_type ftype,
- const u8 *bssid, u64 tsf, u16 capability,
- u16 beacon_interval, const u8 *ie, size_t ielen,
- s32 signal, gfp_t gfp)
+struct cfg80211_bss *
+cfg80211_inform_bss_data(struct wiphy *wiphy,
+ struct cfg80211_inform_bss *data,
+ enum cfg80211_bss_frame_type ftype,
+ const u8 *bssid, u64 tsf, u16 capability,
+ u16 beacon_interval, const u8 *ie, size_t ielen,
+ gfp_t gfp)
{
struct cfg80211_bss_ies *ies;
struct ieee80211_channel *channel;
@@ -957,19 +956,21 @@ cfg80211_inform_bss_width(struct wiphy *wiphy,
return NULL;
if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC &&
- (signal < 0 || signal > 100)))
+ (data->signal < 0 || data->signal > 100)))
return NULL;
- channel = cfg80211_get_bss_channel(wiphy, ie, ielen, rx_channel);
+ channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan);
if (!channel)
return NULL;
memcpy(tmp.pub.bssid, bssid, ETH_ALEN);
tmp.pub.channel = channel;
- tmp.pub.scan_width = scan_width;
- tmp.pub.signal = signal;
+ tmp.pub.scan_width = data->scan_width;
+ tmp.pub.signal = data->signal;
tmp.pub.beacon_interval = beacon_interval;
tmp.pub.capability = capability;
+ tmp.ts_boottime = data->boottime_ns;
+
/*
* If we do not know here whether the IEs are from a Beacon or Probe
* Response frame, we need to pick one of the options and only use it
@@ -999,7 +1000,7 @@ cfg80211_inform_bss_width(struct wiphy *wiphy,
}
rcu_assign_pointer(tmp.pub.ies, ies);
- signal_valid = abs(rx_channel->center_freq - channel->center_freq) <=
+ signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
wiphy->max_adj_channel_rssi_comp;
res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
if (!res)
@@ -1019,15 +1020,15 @@ cfg80211_inform_bss_width(struct wiphy *wiphy,
/* cfg80211_bss_update gives us a referenced result */
return &res->pub;
}
-EXPORT_SYMBOL(cfg80211_inform_bss_width);
+EXPORT_SYMBOL(cfg80211_inform_bss_data);
-/* Returned bss is reference counted and must be cleaned up appropriately. */
+/* cfg80211_inform_bss_width_frame helper */
struct cfg80211_bss *
-cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
- struct ieee80211_channel *rx_channel,
- enum nl80211_bss_scan_width scan_width,
- struct ieee80211_mgmt *mgmt, size_t len,
- s32 signal, gfp_t gfp)
+cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
+ struct cfg80211_inform_bss *data,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ gfp_t gfp)
+
{
struct cfg80211_internal_bss tmp = {}, *res;
struct cfg80211_bss_ies *ies;
@@ -1040,8 +1041,7 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) !=
offsetof(struct ieee80211_mgmt, u.beacon.variable));
- trace_cfg80211_inform_bss_width_frame(wiphy, rx_channel, scan_width, mgmt,
- len, signal);
+ trace_cfg80211_inform_bss_frame(wiphy, data, mgmt, len);
if (WARN_ON(!mgmt))
return NULL;
@@ -1050,14 +1050,14 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
return NULL;
if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC &&
- (signal < 0 || signal > 100)))
+ (data->signal < 0 || data->signal > 100)))
return NULL;
if (WARN_ON(len < offsetof(struct ieee80211_mgmt, u.probe_resp.variable)))
return NULL;
channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable,
- ielen, rx_channel);
+ ielen, data->chan);
if (!channel)
return NULL;
@@ -1077,12 +1077,13 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
tmp.pub.channel = channel;
- tmp.pub.scan_width = scan_width;
- tmp.pub.signal = signal;
+ tmp.pub.scan_width = data->scan_width;
+ tmp.pub.signal = data->signal;
tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
+ tmp.ts_boottime = data->boottime_ns;
- signal_valid = abs(rx_channel->center_freq - channel->center_freq) <=
+ signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
wiphy->max_adj_channel_rssi_comp;
res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
if (!res)
@@ -1102,7 +1103,7 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
/* cfg80211_bss_update gives us a referenced result */
return &res->pub;
}
-EXPORT_SYMBOL(cfg80211_inform_bss_width_frame);
+EXPORT_SYMBOL(cfg80211_inform_bss_frame_data);
void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
{
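
The scan.c rework replaces the long positional argument lists of the cfg80211_inform_bss_width*() helpers with a struct cfg80211_inform_bss descriptor, which is what lets the new boottime timestamp ride along without touching every caller. A standalone sketch of that parameter-struct refactor, with invented field names rather than the cfg80211 structures:

#include <stdint.h>
#include <stdio.h>

struct inform_data {
    int chan_mhz;
    int scan_width;
    int32_t signal;
    uint64_t boottime_ns;   /* newly added field, old callers unaffected */
};

static void inform_bss(const struct inform_data *data, const char *bssid)
{
    printf("%s on %d MHz, signal %d, seen at %llu ns\n",
           bssid, data->chan_mhz, data->signal,
           (unsigned long long)data->boottime_ns);
}

int main(void)
{
    struct inform_data data = {
        .chan_mhz = 2412,
        .scan_width = 0,
        .signal = -42,
        .boottime_ns = 123456789ULL,
    };

    inform_bss(&data, "00:11:22:33:44:55");
    return 0;
}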
diff --git a/kernel/net/wireless/sme.c b/kernel/net/wireless/sme.c
index d11454f87..8020b5b09 100644
--- a/kernel/net/wireless/sme.c
+++ b/kernel/net/wireless/sme.c
@@ -938,7 +938,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
}
void cfg80211_disconnected(struct net_device *dev, u16 reason,
- const u8 *ie, size_t ie_len, gfp_t gfp)
+ const u8 *ie, size_t ie_len,
+ bool locally_generated, gfp_t gfp)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -954,6 +955,7 @@ void cfg80211_disconnected(struct net_device *dev, u16 reason,
ev->dc.ie_len = ie_len;
memcpy((void *)ev->dc.ie, ie, ie_len);
ev->dc.reason = reason;
+ ev->dc.locally_generated = locally_generated;
spin_lock_irqsave(&wdev->event_lock, flags);
list_add_tail(&ev->list, &wdev->event_list);
diff --git a/kernel/net/wireless/sysfs.c b/kernel/net/wireless/sysfs.c
index 9ee6bc1a7..9cee02206 100644
--- a/kernel/net/wireless/sysfs.c
+++ b/kernel/net/wireless/sysfs.c
@@ -86,7 +86,7 @@ static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env)
return 0;
}
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
static void cfg80211_leave_all(struct cfg80211_registered_device *rdev)
{
struct wireless_dev *wdev;
@@ -95,7 +95,7 @@ static void cfg80211_leave_all(struct cfg80211_registered_device *rdev)
cfg80211_leave(rdev, wdev);
}
-static int wiphy_suspend(struct device *dev, pm_message_t state)
+static int wiphy_suspend(struct device *dev)
{
struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
int ret = 0;
@@ -136,6 +136,11 @@ static int wiphy_resume(struct device *dev)
return ret;
}
+
+static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume);
+#define WIPHY_PM_OPS (&wiphy_pm_ops)
+#else
+#define WIPHY_PM_OPS NULL
#endif
static const void *wiphy_namespace(struct device *d)
@@ -151,10 +156,7 @@ struct class ieee80211_class = {
.dev_release = wiphy_dev_release,
.dev_groups = ieee80211_groups,
.dev_uevent = wiphy_uevent,
-#ifdef CONFIG_PM
- .suspend = wiphy_suspend,
- .resume = wiphy_resume,
-#endif
+ .pm = WIPHY_PM_OPS,
.ns_type = &net_ns_type_operations,
.namespace = wiphy_namespace,
};
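
The sysfs.c hunk swaps the legacy class suspend/resume hooks for dev_pm_ops selected at compile time: under CONFIG_PM_SLEEP the ops are real, otherwise the pointer collapses to NULL. A compile-time sketch of the same conditional wiring with stand-in names:

#include <stdio.h>

struct pm_ops {
    int (*suspend)(void);
    int (*resume)(void);
};

#ifdef HAVE_SLEEP
static int my_suspend(void) { return 0; }
static int my_resume(void)  { return 0; }

static const struct pm_ops my_pm_ops = { my_suspend, my_resume };
#define MY_PM_OPS (&my_pm_ops)
#else
#define MY_PM_OPS NULL      /* sleep support compiled out */
#endif

struct class_like {
    const char *name;
    const struct pm_ops *pm;
};

static const struct class_like my_class = {
    .name = "demo",
    .pm = MY_PM_OPS,
};

int main(void)
{
    printf("%s has pm ops: %s\n", my_class.name,
           my_class.pm ? "yes" : "no");
    return 0;
}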
diff --git a/kernel/net/wireless/trace.h b/kernel/net/wireless/trace.h
index af3617c98..0c392d367 100644
--- a/kernel/net/wireless/trace.h
+++ b/kernel/net/wireless/trace.h
@@ -2358,20 +2358,23 @@ TRACE_EVENT(cfg80211_cqm_rssi_notify,
TRACE_EVENT(cfg80211_reg_can_beacon,
TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef,
- enum nl80211_iftype iftype),
- TP_ARGS(wiphy, chandef, iftype),
+ enum nl80211_iftype iftype, bool check_no_ir),
+ TP_ARGS(wiphy, chandef, iftype, check_no_ir),
TP_STRUCT__entry(
WIPHY_ENTRY
CHAN_DEF_ENTRY
__field(enum nl80211_iftype, iftype)
+ __field(bool, check_no_ir)
),
TP_fast_assign(
WIPHY_ASSIGN;
CHAN_DEF_ASSIGN(chandef);
__entry->iftype = iftype;
+ __entry->check_no_ir = check_no_ir;
),
- TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d",
- WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype)
+ TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d check_no_ir=%s",
+ WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype,
+ BOOL_TO_STR(__entry->check_no_ir))
);
TRACE_EVENT(cfg80211_chandef_dfs_required,
@@ -2667,30 +2670,30 @@ TRACE_EVENT(cfg80211_get_bss,
__entry->privacy)
);
-TRACE_EVENT(cfg80211_inform_bss_width_frame,
- TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel,
- enum nl80211_bss_scan_width scan_width,
- struct ieee80211_mgmt *mgmt, size_t len,
- s32 signal),
- TP_ARGS(wiphy, channel, scan_width, mgmt, len, signal),
+TRACE_EVENT(cfg80211_inform_bss_frame,
+ TP_PROTO(struct wiphy *wiphy, struct cfg80211_inform_bss *data,
+ struct ieee80211_mgmt *mgmt, size_t len),
+ TP_ARGS(wiphy, data, mgmt, len),
TP_STRUCT__entry(
WIPHY_ENTRY
CHAN_ENTRY
__field(enum nl80211_bss_scan_width, scan_width)
__dynamic_array(u8, mgmt, len)
__field(s32, signal)
+ __field(u64, ts_boottime)
),
TP_fast_assign(
WIPHY_ASSIGN;
- CHAN_ASSIGN(channel);
- __entry->scan_width = scan_width;
+ CHAN_ASSIGN(data->chan);
+ __entry->scan_width = data->scan_width;
if (mgmt)
memcpy(__get_dynamic_array(mgmt), mgmt, len);
- __entry->signal = signal;
+ __entry->signal = data->signal;
+ __entry->ts_boottime = data->boottime_ns;
),
- TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d",
+ TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d, tsb:%llu",
WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
- __entry->signal)
+ __entry->signal, (unsigned long long)__entry->ts_boottime)
);
DECLARE_EVENT_CLASS(cfg80211_bss_evt,
diff --git a/kernel/net/wireless/util.c b/kernel/net/wireless/util.c
index 7e4e3fffe..baf7218ce 100644
--- a/kernel/net/wireless/util.c
+++ b/kernel/net/wireless/util.c
@@ -887,7 +887,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
case EVENT_DISCONNECTED:
__cfg80211_disconnected(wdev->netdev,
ev->dc.ie, ev->dc.ie_len,
- ev->dc.reason, true);
+ ev->dc.reason,
+ !ev->dc.locally_generated);
break;
case EVENT_IBSS_JOINED:
__cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid,
diff --git a/kernel/net/wireless/wext-core.c b/kernel/net/wireless/wext-core.c
index c8717c1d0..b50ee5d62 100644
--- a/kernel/net/wireless/wext-core.c
+++ b/kernel/net/wireless/wext-core.c
@@ -342,6 +342,40 @@ static const int compat_event_type_size[] = {
/* IW event code */
+void wireless_nlevent_flush(void)
+{
+ struct sk_buff *skb;
+ struct net *net;
+
+ ASSERT_RTNL();
+
+ for_each_net(net) {
+ while ((skb = skb_dequeue(&net->wext_nlevents)))
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
+ GFP_KERNEL);
+ }
+}
+EXPORT_SYMBOL_GPL(wireless_nlevent_flush);
+
+static int wext_netdev_notifier_call(struct notifier_block *nb,
+ unsigned long state, void *ptr)
+{
+ /*
+ * When a netdev changes state in any way, flush all pending messages
+ * to avoid them going out in a strange order, e.g. RTM_NEWLINK after
+ * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close()
+ * or similar - all of which could otherwise happen due to delays from
+ * schedule_work().
+ */
+ wireless_nlevent_flush();
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block wext_netdev_notifier = {
+ .notifier_call = wext_netdev_notifier_call,
+};
+
static int __net_init wext_pernet_init(struct net *net)
{
skb_queue_head_init(&net->wext_nlevents);
@@ -360,7 +394,12 @@ static struct pernet_operations wext_pernet_ops = {
static int __init wireless_nlevent_init(void)
{
- return register_pernet_subsys(&wext_pernet_ops);
+ int err = register_pernet_subsys(&wext_pernet_ops);
+
+ if (err)
+ return err;
+
+ return register_netdevice_notifier(&wext_netdev_notifier);
}
subsys_initcall(wireless_nlevent_init);
@@ -368,17 +407,8 @@ subsys_initcall(wireless_nlevent_init);
/* Process events generated by the wireless layer or the driver. */
static void wireless_nlevent_process(struct work_struct *work)
{
- struct sk_buff *skb;
- struct net *net;
-
rtnl_lock();
-
- for_each_net(net) {
- while ((skb = skb_dequeue(&net->wext_nlevents)))
- rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
- GFP_KERNEL);
- }
-
+ wireless_nlevent_flush();
rtnl_unlock();
}
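
wireless_nlevent_flush() is split out above so the queue of pending wext netlink events can be drained from a netdev notifier as well as from the worker, keeping those events from arriving after a later RTM_NEWLINK or RTM_DELLINK. A userspace sketch of the ordering idea, flushing a toy queue before announcing a state change (nothing here is the wext API):

#include <stdio.h>

#define QLEN 8

static const char *queue[QLEN];
static int qhead, qtail;

static void queue_event(const char *msg)
{
    queue[qtail++ % QLEN] = msg;
}

static void flush_events(void)
{
    while (qhead != qtail)
        printf("event: %s\n", queue[qhead++ % QLEN]);
}

static void netdev_state_change(const char *what)
{
    flush_events();                 /* drain first, like the notifier does */
    printf("state change: %s\n", what);
}

int main(void)
{
    queue_event("scan results");
    queue_event("association status");
    netdev_state_change("interface down");
    return 0;
}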
diff --git a/kernel/net/x25/af_x25.c b/kernel/net/x25/af_x25.c
index c3ab230e4..a750f330b 100644
--- a/kernel/net/x25/af_x25.c
+++ b/kernel/net/x25/af_x25.c
@@ -515,10 +515,10 @@ static struct proto x25_proto = {
.obj_size = sizeof(struct x25_sock),
};
-static struct sock *x25_alloc_socket(struct net *net)
+static struct sock *x25_alloc_socket(struct net *net, int kern)
{
struct x25_sock *x25;
- struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto);
+ struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern);
if (!sk)
goto out;
@@ -553,7 +553,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol,
goto out;
rc = -ENOBUFS;
- if ((sk = x25_alloc_socket(net)) == NULL)
+ if ((sk = x25_alloc_socket(net, kern)) == NULL)
goto out;
x25 = x25_sk(sk);
@@ -602,7 +602,7 @@ static struct sock *x25_make_new(struct sock *osk)
if (osk->sk_type != SOCK_SEQPACKET)
goto out;
- if ((sk = x25_alloc_socket(sock_net(osk))) == NULL)
+ if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL)
goto out;
x25 = x25_sk(sk);
diff --git a/kernel/net/xfrm/xfrm_algo.c b/kernel/net/xfrm/xfrm_algo.c
index 12e82a5e4..f07224d8b 100644
--- a/kernel/net/xfrm/xfrm_algo.c
+++ b/kernel/net/xfrm/xfrm_algo.c
@@ -31,6 +31,7 @@ static struct xfrm_algo_desc aead_list[] = {
.uinfo = {
.aead = {
+ .geniv = "seqiv",
.icv_truncbits = 64,
}
},
@@ -49,6 +50,7 @@ static struct xfrm_algo_desc aead_list[] = {
.uinfo = {
.aead = {
+ .geniv = "seqiv",
.icv_truncbits = 96,
}
},
@@ -67,6 +69,7 @@ static struct xfrm_algo_desc aead_list[] = {
.uinfo = {
.aead = {
+ .geniv = "seqiv",
.icv_truncbits = 128,
}
},
@@ -85,6 +88,7 @@ static struct xfrm_algo_desc aead_list[] = {
.uinfo = {
.aead = {
+ .geniv = "seqiv",
.icv_truncbits = 64,
}
},
@@ -103,6 +107,7 @@ static struct xfrm_algo_desc aead_list[] = {
.uinfo = {
.aead = {
+ .geniv = "seqiv",
.icv_truncbits = 96,
}
},
@@ -121,6 +126,7 @@ static struct xfrm_algo_desc aead_list[] = {
.uinfo = {
.aead = {
+ .geniv = "seqiv",
.icv_truncbits = 128,
}
},
@@ -139,6 +145,7 @@ static struct xfrm_algo_desc aead_list[] = {
.uinfo = {
.aead = {
+ .geniv = "seqiv",
.icv_truncbits = 128,
}
},
@@ -152,6 +159,18 @@ static struct xfrm_algo_desc aead_list[] = {
.sadb_alg_maxbits = 256
}
},
+{
+ .name = "rfc7539esp(chacha20,poly1305)",
+
+ .uinfo = {
+ .aead = {
+ .geniv = "seqiv",
+ .icv_truncbits = 128,
+ }
+ },
+
+ .pfkey_supported = 0,
+},
};
static struct xfrm_algo_desc aalg_list[] = {
@@ -353,6 +372,7 @@ static struct xfrm_algo_desc ealg_list[] = {
.uinfo = {
.encr = {
+ .geniv = "echainiv",
.blockbits = 64,
.defkeybits = 64,
}
@@ -373,6 +393,7 @@ static struct xfrm_algo_desc ealg_list[] = {
.uinfo = {
.encr = {
+ .geniv = "echainiv",
.blockbits = 64,
.defkeybits = 192,
}
@@ -393,6 +414,7 @@ static struct xfrm_algo_desc ealg_list[] = {
.uinfo = {
.encr = {
+ .geniv = "echainiv",
.blockbits = 64,
.defkeybits = 128,
}
@@ -413,6 +435,7 @@ static struct xfrm_algo_desc ealg_list[] = {
.uinfo = {
.encr = {
+ .geniv = "echainiv",
.blockbits = 64,
.defkeybits = 128,
}
@@ -433,6 +456,7 @@ static struct xfrm_algo_desc ealg_list[] = {
.uinfo = {
.encr = {
+ .geniv = "echainiv",
.blockbits = 128,
.defkeybits = 128,
}
@@ -453,6 +477,7 @@ static struct xfrm_algo_desc ealg_list[] = {
.uinfo = {
.encr = {
+ .geniv = "echainiv",
.blockbits = 128,
.defkeybits = 128,
}
@@ -473,6 +498,7 @@ static struct xfrm_algo_desc ealg_list[] = {
.uinfo = {
.encr = {
+ .geniv = "echainiv",
.blockbits = 128,
.defkeybits = 128,
}
@@ -493,6 +519,7 @@ static struct xfrm_algo_desc ealg_list[] = {
.uinfo = {
.encr = {
+ .geniv = "echainiv",
.blockbits = 128,
.defkeybits = 128,
}
@@ -512,6 +539,7 @@ static struct xfrm_algo_desc ealg_list[] = {
.uinfo = {
.encr = {
+ .geniv = "seqiv",
.blockbits = 128,
.defkeybits = 160, /* 128-bit key + 32-bit nonce */
}
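
Each algorithm descriptor above gains a .geniv template name so ESP can pick the matching IV generator per algorithm from the table instead of hard-coding it. A minimal lookup sketch with an illustrative table, not the real xfrm descriptor arrays:

#include <stdio.h>
#include <string.h>

struct algo_desc {
    const char *name;
    const char *geniv;
};

static const struct algo_desc algos[] = {
    { "cbc(aes)",                      "echainiv" },
    { "rfc4106(gcm(aes))",             "seqiv"    },
    { "rfc7539esp(chacha20,poly1305)", "seqiv"    },
};

static const char *geniv_for(const char *name)
{
    size_t i;

    for (i = 0; i < sizeof(algos) / sizeof(algos[0]); i++)
        if (!strcmp(algos[i].name, name))
            return algos[i].geniv;
    return NULL;
}

int main(void)
{
    printf("cbc(aes) -> %s\n", geniv_for("cbc(aes)"));
    return 0;
}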
diff --git a/kernel/net/xfrm/xfrm_input.c b/kernel/net/xfrm/xfrm_input.c
index b58286ecd..ad7f5b3f9 100644
--- a/kernel/net/xfrm/xfrm_input.c
+++ b/kernel/net/xfrm/xfrm_input.c
@@ -31,7 +31,7 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
return -EAFNOSUPPORT;
spin_lock_bh(&xfrm_input_afinfo_lock);
if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL))
- err = -ENOBUFS;
+ err = -EEXIST;
else
rcu_assign_pointer(xfrm_input_afinfo[afinfo->family], afinfo);
spin_unlock_bh(&xfrm_input_afinfo_lock);
@@ -254,13 +254,13 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
skb->sp->xvec[skb->sp->len++] = x;
spin_lock(&x->lock);
- if (unlikely(x->km.state == XFRM_STATE_ACQ)) {
- XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR);
- goto drop_unlock;
- }
if (unlikely(x->km.state != XFRM_STATE_VALID)) {
- XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID);
+ if (x->km.state == XFRM_STATE_ACQ)
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR);
+ else
+ XFRM_INC_STATS(net,
+ LINUX_MIB_XFRMINSTATEINVALID);
goto drop_unlock;
}
@@ -330,8 +330,10 @@ resume:
if (x->sel.family == AF_UNSPEC) {
inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
- if (inner_mode == NULL)
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
goto drop;
+ }
}
if (inner_mode->input(x, skb)) {
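
Registering an input afinfo for a family slot that is already taken now returns -EEXIST instead of the misleading -ENOBUFS. A tiny sketch of that slot-registration semantic with a hypothetical table:

#include <errno.h>
#include <stdio.h>

#define NFAMILIES 4

static const void *afinfo[NFAMILIES];

static int register_afinfo(unsigned int family, const void *ops)
{
    if (family >= NFAMILIES)
        return -EAFNOSUPPORT;
    if (afinfo[family])
        return -EEXIST;     /* was -ENOBUFS before the change */
    afinfo[family] = ops;
    return 0;
}

int main(void)
{
    static const int dummy_ops = 1;

    printf("first:  %d\n", register_afinfo(2, &dummy_ops));
    printf("second: %d\n", register_afinfo(2, &dummy_ops));
    return 0;
}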
diff --git a/kernel/net/xfrm/xfrm_output.c b/kernel/net/xfrm/xfrm_output.c
index fbcedbe33..ff4a91fca 100644
--- a/kernel/net/xfrm/xfrm_output.c
+++ b/kernel/net/xfrm/xfrm_output.c
@@ -19,7 +19,7 @@
#include <net/dst.h>
#include <net/xfrm.h>
-static int xfrm_output2(struct sock *sk, struct sk_buff *skb);
+static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb);
static int xfrm_skb_check_space(struct sk_buff *skb)
{
@@ -38,6 +38,18 @@ static int xfrm_skb_check_space(struct sk_buff *skb)
return pskb_expand_head(skb, nhead, ntail, GFP_ATOMIC);
}
+/* Children define the path of the packet through the
+ * Linux networking. Thus, destinations are stackable.
+ */
+
+static struct dst_entry *skb_dst_pop(struct sk_buff *skb)
+{
+ struct dst_entry *child = dst_clone(skb_dst(skb)->child);
+
+ skb_dst_drop(skb);
+ return child;
+}
+
static int xfrm_output_one(struct sk_buff *skb, int err)
{
struct dst_entry *dst = skb_dst(skb);
@@ -119,18 +131,20 @@ out:
int xfrm_output_resume(struct sk_buff *skb, int err)
{
+ struct net *net = xs_net(skb_dst(skb)->xfrm);
+
while (likely((err = xfrm_output_one(skb, err)) == 0)) {
nf_reset(skb);
- err = skb_dst(skb)->ops->local_out(skb);
+ err = skb_dst(skb)->ops->local_out(net, skb->sk, skb);
if (unlikely(err != 1))
goto out;
if (!skb_dst(skb)->xfrm)
- return dst_output(skb);
+ return dst_output(net, skb->sk, skb);
err = nf_hook(skb_dst(skb)->ops->family,
- NF_INET_POST_ROUTING, skb->sk, skb,
+ NF_INET_POST_ROUTING, net, skb->sk, skb,
NULL, skb_dst(skb)->dev, xfrm_output2);
if (unlikely(err != 1))
goto out;
@@ -144,15 +158,17 @@ out:
}
EXPORT_SYMBOL_GPL(xfrm_output_resume);
-static int xfrm_output2(struct sock *sk, struct sk_buff *skb)
+static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return xfrm_output_resume(skb, 1);
}
-static int xfrm_output_gso(struct sock *sk, struct sk_buff *skb)
+static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct sk_buff *segs;
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET);
segs = skb_gso_segment(skb, 0);
kfree_skb(skb);
if (IS_ERR(segs))
@@ -165,7 +181,7 @@ static int xfrm_output_gso(struct sock *sk, struct sk_buff *skb)
int err;
segs->next = NULL;
- err = xfrm_output2(sk, segs);
+ err = xfrm_output2(net, sk, segs);
if (unlikely(err)) {
kfree_skb_list(nskb);
@@ -184,7 +200,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
int err;
if (skb_is_gso(skb))
- return xfrm_output_gso(sk, skb);
+ return xfrm_output_gso(net, sk, skb);
if (skb->ip_summed == CHECKSUM_PARTIAL) {
err = skb_checksum_help(skb);
@@ -195,7 +211,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
}
}
- return xfrm_output2(sk, skb);
+ return xfrm_output2(net, sk, skb);
}
EXPORT_SYMBOL_GPL(xfrm_output);
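
The output path above now threads struct net explicitly, while xfrm_output_gso() keeps its segment-and-emit loop: detach each GSO segment, output it, and drop the remainder if one fails. A toy userspace version of that loop over a singly linked list (plain structs, not sk_buff):

#include <stdio.h>
#include <stdlib.h>

struct seg {
    struct seg *next;
    int id;
};

static int output_one(struct seg *s)
{
    printf("output segment %d\n", s->id);
    return 0;               /* nonzero would mean failure */
}

static void free_list(struct seg *s)
{
    while (s) {
        struct seg *next = s->next;
        free(s);
        s = next;
    }
}

static int output_all(struct seg *segs)
{
    while (segs) {
        struct seg *next = segs->next;
        int err;

        segs->next = NULL;          /* detach, as the kernel loop does */
        err = output_one(segs);
        free(segs);
        if (err) {
            free_list(next);        /* drop the rest on failure */
            return err;
        }
        segs = next;
    }
    return 0;
}

int main(void)
{
    struct seg *head = NULL, **tail = &head;
    int i;

    for (i = 0; i < 3; i++) {
        struct seg *s = calloc(1, sizeof(*s));

        if (!s)
            return 1;
        s->id = i;
        *tail = s;
        tail = &s->next;
    }
    return output_all(head);
}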
diff --git a/kernel/net/xfrm/xfrm_policy.c b/kernel/net/xfrm/xfrm_policy.c
index 638af0655..b5e665b3c 100644
--- a/kernel/net/xfrm/xfrm_policy.c
+++ b/kernel/net/xfrm/xfrm_policy.c
@@ -115,7 +115,8 @@ static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
rcu_read_unlock();
}
-static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
+static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
+ int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr,
int family)
@@ -127,14 +128,15 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
if (unlikely(afinfo == NULL))
return ERR_PTR(-EAFNOSUPPORT);
- dst = afinfo->dst_lookup(net, tos, saddr, daddr);
+ dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
xfrm_policy_put_afinfo(afinfo);
return dst;
}
-static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
+static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
+ int tos, int oif,
xfrm_address_t *prev_saddr,
xfrm_address_t *prev_daddr,
int family)
@@ -153,7 +155,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
daddr = x->coaddr;
}
- dst = __xfrm_dst_lookup(net, tos, saddr, daddr, family);
+ dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family);
if (!IS_ERR(dst)) {
if (prev_saddr != saddr)
@@ -301,6 +303,14 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
}
EXPORT_SYMBOL(xfrm_policy_alloc);
+static void xfrm_policy_destroy_rcu(struct rcu_head *head)
+{
+ struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu);
+
+ security_xfrm_policy_free(policy->security);
+ kfree(policy);
+}
+
/* Destroy xfrm_policy: descendant resources must be released to this moment. */
void xfrm_policy_destroy(struct xfrm_policy *policy)
@@ -310,19 +320,10 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
BUG();
- security_xfrm_policy_free(policy->security);
- kfree(policy);
+ call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
}
EXPORT_SYMBOL(xfrm_policy_destroy);
-static void xfrm_queue_purge(struct sk_buff_head *list)
-{
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(list)) != NULL)
- kfree_skb(skb);
-}
-
/* Rule must be locked. Release descentant resources, announce
* entry dead. The rule must be unlinked from lists to the moment.
*/
@@ -335,7 +336,7 @@ static void xfrm_policy_kill(struct xfrm_policy *policy)
if (del_timer(&policy->polq.hold_timer))
xfrm_pol_put(policy);
- xfrm_queue_purge(&policy->polq.hold_queue);
+ skb_queue_purge(&policy->polq.hold_queue);
if (del_timer(&policy->timer))
xfrm_pol_put(policy);
@@ -708,6 +709,9 @@ static void xfrm_policy_requeue(struct xfrm_policy *old,
struct xfrm_policy_queue *pq = &old->polq;
struct sk_buff_head list;
+ if (skb_queue_empty(&pq->hold_queue))
+ return;
+
__skb_queue_head_init(&list);
spin_lock_bh(&pq->hold_queue.lock);
@@ -716,9 +720,6 @@ static void xfrm_policy_requeue(struct xfrm_policy *old,
xfrm_pol_put(old);
spin_unlock_bh(&pq->hold_queue.lock);
- if (skb_queue_empty(&list))
- return;
-
pq = &new->polq;
spin_lock_bh(&pq->hold_queue.lock);
@@ -1012,7 +1013,9 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
if (list_empty(&walk->walk.all))
x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
else
- x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all);
+ x = list_first_entry(&walk->walk.all,
+ struct xfrm_policy_walk_entry, all);
+
list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
if (x->dead)
continue;
@@ -1120,6 +1123,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
chain = &net->xfrm.policy_inexact[dir];
hlist_for_each_entry(pol, chain, bydst) {
+ if ((pol->priority >= priority) && ret)
+ break;
+
err = xfrm_policy_match(pol, fl, type, family, dir);
if (err) {
if (err == -ESRCH)
@@ -1128,13 +1134,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
ret = ERR_PTR(err);
goto fail;
}
- } else if (pol->priority < priority) {
+ } else {
ret = pol;
break;
}
}
- if (ret)
- xfrm_pol_hold(ret);
+
+ xfrm_pol_hold(ret);
fail:
read_unlock_bh(&net->xfrm.xfrm_policy_lock);
@@ -1209,14 +1215,16 @@ static inline int policy_to_flow_dir(int dir)
}
}
-static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir,
+static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
const struct flowi *fl)
{
struct xfrm_policy *pol;
struct net *net = sock_net(sk);
+ rcu_read_lock();
read_lock_bh(&net->xfrm.xfrm_policy_lock);
- if ((pol = sk->sk_policy[dir]) != NULL) {
+ pol = rcu_dereference(sk->sk_policy[dir]);
+ if (pol != NULL) {
bool match = xfrm_selector_match(&pol->selector, fl,
sk->sk_family);
int err = 0;
@@ -1240,6 +1248,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir,
}
out:
read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ rcu_read_unlock();
return pol;
}
@@ -1308,13 +1317,14 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
#endif
write_lock_bh(&net->xfrm.xfrm_policy_lock);
- old_pol = sk->sk_policy[dir];
- sk->sk_policy[dir] = pol;
+ old_pol = rcu_dereference_protected(sk->sk_policy[dir],
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock));
if (pol) {
pol->curlft.add_time = get_seconds();
pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
xfrm_sk_policy_link(pol, dir);
}
+ rcu_assign_pointer(sk->sk_policy[dir], pol);
if (old_pol) {
if (pol)
xfrm_policy_requeue(old_pol, pol);
@@ -1362,29 +1372,38 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
return newp;
}
-int __xfrm_sk_clone_policy(struct sock *sk)
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
- struct xfrm_policy *p0 = sk->sk_policy[0],
- *p1 = sk->sk_policy[1];
+ const struct xfrm_policy *p;
+ struct xfrm_policy *np;
+ int i, ret = 0;
- sk->sk_policy[0] = sk->sk_policy[1] = NULL;
- if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
- return -ENOMEM;
- if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
- return -ENOMEM;
- return 0;
+ rcu_read_lock();
+ for (i = 0; i < 2; i++) {
+ p = rcu_dereference(osk->sk_policy[i]);
+ if (p) {
+ np = clone_policy(p, i);
+ if (unlikely(!np)) {
+ ret = -ENOMEM;
+ break;
+ }
+ rcu_assign_pointer(sk->sk_policy[i], np);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
}
static int
-xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote,
- unsigned short family)
+xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
+ xfrm_address_t *remote, unsigned short family)
{
int err;
struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
if (unlikely(afinfo == NULL))
return -EINVAL;
- err = afinfo->get_saddr(net, local, remote);
+ err = afinfo->get_saddr(net, oif, local, remote);
xfrm_policy_put_afinfo(afinfo);
return err;
}
@@ -1413,7 +1432,9 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
remote = &tmpl->id.daddr;
local = &tmpl->saddr;
if (xfrm_addr_any(local, tmpl->encap_family)) {
- error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
+ error = xfrm_get_saddr(net, fl->flowi_oif,
+ &tmp, remote,
+ tmpl->encap_family);
if (error)
goto fail;
local = &tmp;
@@ -1582,8 +1603,6 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst));
xdst->flo.ops = &xfrm_bundle_fc_ops;
- if (afinfo->init_dst)
- afinfo->init_dst(net, xdst);
} else
xdst = ERR_PTR(-ENOBUFS);
@@ -1693,8 +1712,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
family = xfrm[i]->props.family;
- dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr,
- family);
+ dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
+ &saddr, &daddr, family);
err = PTR_ERR(dst);
if (IS_ERR(dst))
goto put_states;
@@ -1888,6 +1907,7 @@ static void xfrm_policy_queue_process(unsigned long arg)
struct sock *sk;
struct dst_entry *dst;
struct xfrm_policy *pol = (struct xfrm_policy *)arg;
+ struct net *net = xp_net(pol);
struct xfrm_policy_queue *pq = &pol->polq;
struct flowi fl;
struct sk_buff_head list;
@@ -1904,8 +1924,7 @@ static void xfrm_policy_queue_process(unsigned long arg)
spin_unlock(&pq->hold_queue.lock);
dst_hold(dst->path);
- dst = xfrm_lookup(xp_net(pol), dst->path, &fl,
- sk, 0);
+ dst = xfrm_lookup(net, dst->path, &fl, sk, 0);
if (IS_ERR(dst))
goto purge_queue;
@@ -1935,8 +1954,7 @@ static void xfrm_policy_queue_process(unsigned long arg)
xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
dst_hold(skb_dst(skb)->path);
- dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path,
- &fl, skb->sk, 0);
+ dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0);
if (IS_ERR(dst)) {
kfree_skb(skb);
continue;
@@ -1946,7 +1964,7 @@ static void xfrm_policy_queue_process(unsigned long arg)
skb_dst_drop(skb);
skb_dst_set(skb, dst);
- dst_output(skb);
+ dst_output(net, skb->sk, skb);
}
out:
@@ -1955,11 +1973,11 @@ out:
purge_queue:
pq->timeout = 0;
- xfrm_queue_purge(&pq->hold_queue);
+ skb_queue_purge(&pq->hold_queue);
xfrm_pol_put(pol);
}
-static int xdst_queue_output(struct sock *sk, struct sk_buff *skb)
+static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned long sched_next;
struct dst_entry *dst = skb_dst(skb);
@@ -2186,7 +2204,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family,
*/
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
const struct flowi *fl,
- struct sock *sk, int flags)
+ const struct sock *sk, int flags)
{
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
struct flow_cache_object *flo;
@@ -2200,6 +2218,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
xdst = NULL;
route = NULL;
+ sk = sk_const_to_full_sk(sk);
if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
num_pols = 1;
pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
@@ -2334,7 +2353,7 @@ EXPORT_SYMBOL(xfrm_lookup);
*/
struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
const struct flowi *fl,
- struct sock *sk, int flags)
+ const struct sock *sk, int flags)
{
struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk,
flags | XFRM_LOOKUP_QUEUE |
@@ -2479,6 +2498,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
}
pol = NULL;
+ sk = sk_to_full_sk(sk);
if (sk && sk->sk_policy[dir]) {
pol = xfrm_sk_policy_lookup(sk, dir, &fl);
if (IS_ERR(pol)) {
@@ -2806,7 +2826,6 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
- struct net *net;
int err = 0;
if (unlikely(afinfo == NULL))
return -EINVAL;
@@ -2814,7 +2833,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
return -EAFNOSUPPORT;
spin_lock(&xfrm_policy_afinfo_lock);
if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
- err = -ENOBUFS;
+ err = -EEXIST;
else {
struct dst_ops *dst_ops = afinfo->dst_ops;
if (likely(dst_ops->kmem_cachep == NULL))
@@ -2837,26 +2856,6 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
}
spin_unlock(&xfrm_policy_afinfo_lock);
- rtnl_lock();
- for_each_net(net) {
- struct dst_ops *xfrm_dst_ops;
-
- switch (afinfo->family) {
- case AF_INET:
- xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops;
- break;
-#if IS_ENABLED(CONFIG_IPV6)
- case AF_INET6:
- xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops;
- break;
-#endif
- default:
- BUG();
- }
- *xfrm_dst_ops = *afinfo->dst_ops;
- }
- rtnl_unlock();
-
return err;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);
@@ -2892,22 +2891,6 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
-static void __net_init xfrm_dst_ops_init(struct net *net)
-{
- struct xfrm_policy_afinfo *afinfo;
-
- rcu_read_lock();
- afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]);
- if (afinfo)
- net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops;
-#if IS_ENABLED(CONFIG_IPV6)
- afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]);
- if (afinfo)
- net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops;
-#endif
- rcu_read_unlock();
-}
-
static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
@@ -3056,7 +3039,6 @@ static int __net_init xfrm_net_init(struct net *net)
rv = xfrm_policy_init(net);
if (rv < 0)
goto out_policy;
- xfrm_dst_ops_init(net);
rv = xfrm_sysctl_init(net);
if (rv < 0)
goto out_sysctl;
@@ -3209,16 +3191,17 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
}
chain = &net->xfrm.policy_inexact[dir];
hlist_for_each_entry(pol, chain, bydst) {
+ if ((pol->priority >= priority) && ret)
+ break;
+
if (xfrm_migrate_selector_match(sel, &pol->selector) &&
- pol->type == type &&
- pol->priority < priority) {
+ pol->type == type) {
ret = pol;
break;
}
}
- if (ret)
- xfrm_pol_hold(ret);
+ xfrm_pol_hold(ret);
read_unlock_bh(&net->xfrm.xfrm_policy_lock);
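
The lookup changes above stop walking the priority-ordered inexact chain as soon as the next candidate cannot beat the best match already held, instead of filtering by priority inside the loop. A standalone sketch of that early-exit lookup with toy policy records:

#include <stdio.h>

struct policy {
    unsigned int priority;   /* lower value wins */
    int matches;             /* would the selector match? */
    const char *name;
};

static const struct policy inexact[] = {   /* kept sorted by priority */
    { 10, 0, "inexact-a" },
    { 20, 1, "inexact-b" },
    { 30, 1, "inexact-c" },
};

static const struct policy *best_policy(const struct policy *exact_hit)
{
    const struct policy *ret = exact_hit;
    size_t i;

    for (i = 0; i < sizeof(inexact) / sizeof(inexact[0]); i++) {
        if (ret && inexact[i].priority >= ret->priority)
            break;                          /* cannot improve, stop early */
        if (inexact[i].matches)
            ret = &inexact[i];
    }
    return ret;
}

int main(void)
{
    const struct policy exact_hit = { 25, 1, "exact" };
    const struct policy *best = best_policy(&exact_hit);

    printf("best match: %s (prio %u)\n", best->name, best->priority);
    return 0;
}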
diff --git a/kernel/net/xfrm/xfrm_state.c b/kernel/net/xfrm/xfrm_state.c
index 96688cd0f..9895a8c56 100644
--- a/kernel/net/xfrm/xfrm_state.c
+++ b/kernel/net/xfrm/xfrm_state.c
@@ -1626,7 +1626,7 @@ int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
if (list_empty(&walk->all))
x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all);
else
- x = list_entry(&walk->all, struct xfrm_state_walk, all);
+ x = list_first_entry(&walk->all, struct xfrm_state_walk, all);
list_for_each_entry_from(x, &net->xfrm.state_all, all) {
if (x->state == XFRM_STATE_DEAD)
continue;
@@ -1908,7 +1908,7 @@ int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
return -EAFNOSUPPORT;
spin_lock_bh(&xfrm_state_afinfo_lock);
if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
- err = -ENOBUFS;
+ err = -EEXIST;
else
rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo);
spin_unlock_bh(&xfrm_state_afinfo_lock);
diff --git a/kernel/net/xfrm/xfrm_user.c b/kernel/net/xfrm/xfrm_user.c
index 209166429..805681a7d 100644
--- a/kernel/net/xfrm/xfrm_user.c
+++ b/kernel/net/xfrm/xfrm_user.c
@@ -31,6 +31,7 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#endif
+#include <asm/unaligned.h>
static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
{
@@ -289,6 +290,31 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
return 0;
}
+static int attach_crypt(struct xfrm_state *x, struct nlattr *rta)
+{
+ struct xfrm_algo *p, *ualg;
+ struct xfrm_algo_desc *algo;
+
+ if (!rta)
+ return 0;
+
+ ualg = nla_data(rta);
+
+ algo = xfrm_ealg_get_byname(ualg->alg_name, 1);
+ if (!algo)
+ return -ENOSYS;
+ x->props.ealgo = algo->desc.sadb_alg_id;
+
+ p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ strcpy(p->alg_name, algo->name);
+ x->ealg = p;
+ x->geniv = algo->uinfo.encr.geniv;
+ return 0;
+}
+
static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props,
struct nlattr *rta)
{
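attach_crypt() replaces the generic attach_one_algo() path for XFRMA_ALG_CRYPT (see the call-site hunk further down): it still looks up the descriptor, records props.ealgo and duplicates the user-supplied struct xfrm_algo including its key, but it additionally stores the algorithm's default IV generator in x->geniv; attach_aead() is reworked the same way below, which is why both now take the xfrm_state itself instead of output pointers. The kmemdup() length comes from xfrm_alg_len(), which accounts for the variable-length key that follows the fixed header; approximately, from include/net/xfrm.h:

/* Approximate definition: the algorithm blob is the fixed struct plus
 * alg_key_len bits of key material, rounded up to whole bytes.
 */
static inline int xfrm_alg_len(const struct xfrm_algo *alg)
{
	return sizeof(*alg) + ((alg->alg_key_len + 7) / 8);
}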
@@ -349,8 +375,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props,
return 0;
}
-static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props,
- struct nlattr *rta)
+static int attach_aead(struct xfrm_state *x, struct nlattr *rta)
{
struct xfrm_algo_aead *p, *ualg;
struct xfrm_algo_desc *algo;
@@ -363,14 +388,15 @@ static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props,
algo = xfrm_aead_get_byname(ualg->alg_name, ualg->alg_icv_len, 1);
if (!algo)
return -ENOSYS;
- *props = algo->desc.sadb_alg_id;
+ x->props.ealgo = algo->desc.sadb_alg_id;
p = kmemdup(ualg, aead_len(ualg), GFP_KERNEL);
if (!p)
return -ENOMEM;
strcpy(p->alg_name, algo->name);
- *algpp = p;
+ x->aead = p;
+ x->geniv = algo->uinfo.aead.geniv;
return 0;
}
@@ -515,8 +541,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
if (attrs[XFRMA_SA_EXTRA_FLAGS])
x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]);
- if ((err = attach_aead(&x->aead, &x->props.ealgo,
- attrs[XFRMA_ALG_AEAD])))
+ if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD])))
goto error;
if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo,
attrs[XFRMA_ALG_AUTH_TRUNC])))
@@ -526,9 +551,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
attrs[XFRMA_ALG_AUTH])))
goto error;
}
- if ((err = attach_one_algo(&x->ealg, &x->props.ealgo,
- xfrm_ealg_get_byname,
- attrs[XFRMA_ALG_CRYPT])))
+ if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT])))
goto error;
if ((err = attach_one_algo(&x->calg, &x->props.calgo,
xfrm_calg_get_byname,
@@ -706,7 +729,9 @@ static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
memcpy(&p->sel, &x->sel, sizeof(p->sel));
memcpy(&p->lft, &x->lft, sizeof(p->lft));
memcpy(&p->curlft, &x->curlft, sizeof(p->curlft));
- memcpy(&p->stats, &x->stats, sizeof(p->stats));
+ put_unaligned(x->stats.replay_window, &p->stats.replay_window);
+ put_unaligned(x->stats.replay, &p->stats.replay);
+ put_unaligned(x->stats.integrity_failed, &p->stats.integrity_failed);
memcpy(&p->saddr, &x->props.saddr, sizeof(p->saddr));
p->mode = x->props.mode;
p->replay_window = x->props.replay_window;
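copy_to_user_state() fills a struct xfrm_usersa_info that lives inside a netlink message payload, so the alignment the compiler may assume for the embedded stats structure is not guaranteed on every architecture; copying the three counters with put_unaligned() (hence the new <asm/unaligned.h> include at the top of the file) keeps the stores safe on strict-alignment CPUs. That reading is hedged; the snippet below only illustrates the put_unaligned() idiom, with a made-up helper name:

#include <linux/types.h>
#include <asm/unaligned.h>

/* Minimal illustration (store_u32_unaligned is hypothetical):
 * put_unaligned(val, ptr) writes val at ptr without assuming ptr is
 * naturally aligned for its type; where unaligned stores are cheap it
 * compiles down to a plain assignment.
 */
static void store_u32_unaligned(void *dst, u32 val)
{
	put_unaligned(val, (u32 *)dst);
}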
@@ -903,12 +928,10 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
return err;
if (attrs[XFRMA_ADDRESS_FILTER]) {
- filter = kmalloc(sizeof(*filter), GFP_KERNEL);
+ filter = kmemdup(nla_data(attrs[XFRMA_ADDRESS_FILTER]),
+ sizeof(*filter), GFP_KERNEL);
if (filter == NULL)
return -ENOMEM;
-
- memcpy(filter, nla_data(attrs[XFRMA_ADDRESS_FILTER]),
- sizeof(*filter));
}
if (attrs[XFRMA_PROTO])
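The XFRMA_ADDRESS_FILTER change is a pure cleanup: kmemdup() allocates and copies in one step, which is exactly what the kmalloc()/memcpy() pair did by hand. Simplified sketch (the real function in mm/util.c uses kmalloc_track_caller()):

/* Simplified sketch of kmemdup(): allocate len bytes and copy src into
 * them, returning NULL on allocation failure.
 */
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kmalloc(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}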
@@ -1908,8 +1931,10 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
struct nlattr *rp = attrs[XFRMA_REPLAY_VAL];
struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL];
struct nlattr *lt = attrs[XFRMA_LTIME_VAL];
+ struct nlattr *et = attrs[XFRMA_ETIMER_THRESH];
+ struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH];
- if (!lt && !rp && !re)
+ if (!lt && !rp && !re && !et && !rt)
return err;
/* pedantic mode - thou shalt sayeth replaceth */
@@ -2026,7 +2051,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
xfrm_audit_policy_delete(xp, 1, true);
} else {
// reset the timers here?
- WARN(1, "Dont know what to do with soft policy expire\n");
+ WARN(1, "Don't know what to do with soft policy expire\n");
}
km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid);