summaryrefslogtreecommitdiffstats
path: root/kernel/net/ipv4/ip_output.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/net/ipv4/ip_output.c')
-rw-r--r--kernel/net/ipv4/ip_output.c204
1 files changed, 115 insertions, 89 deletions
diff --git a/kernel/net/ipv4/ip_output.c b/kernel/net/ipv4/ip_output.c
index c65b93a7b..49f028563 100644
--- a/kernel/net/ipv4/ip_output.c
+++ b/kernel/net/ipv4/ip_output.c
@@ -83,6 +83,11 @@
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
+static int
+ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *));
+
/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
{
@@ -91,32 +96,28 @@ void ip_send_check(struct iphdr *iph)
}
EXPORT_SYMBOL(ip_send_check);
-int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
- skb_dst(skb)->dev, dst_output_sk);
-}
-
-int __ip_local_out(struct sk_buff *skb)
-{
- return __ip_local_out_sk(skb->sk, skb);
+ return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
}
-int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int err;
- err = __ip_local_out(skb);
+ err = __ip_local_out(net, sk, skb);
if (likely(err == 1))
- err = dst_output_sk(sk, skb);
+ err = dst_output(net, sk, skb);
return err;
}
-EXPORT_SYMBOL_GPL(ip_local_out_sk);
+EXPORT_SYMBOL_GPL(ip_local_out);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
@@ -131,11 +132,12 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
* Add an ip header to a skbuff and send it out.
*
*/
-int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
struct iphdr *iph;
/* Build the IP header. */
@@ -145,15 +147,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->version = 4;
iph->ihl = 5;
iph->tos = inet->tos;
- if (ip_dont_fragment(sk, &rt->dst))
- iph->frag_off = htons(IP_DF);
- else
- iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- ip_select_ident(sock_net(sk), skb, sk);
+ if (ip_dont_fragment(sk, &rt->dst)) {
+ iph->frag_off = htons(IP_DF);
+ iph->id = 0;
+ } else {
+ iph->frag_off = 0;
+ __ip_select_ident(net, iph, 1);
+ }
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
@@ -164,11 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
skb->mark = sk->sk_mark;
/* Send it out. */
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
-static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
@@ -178,9 +182,9 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
u32 nexthop;
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
@@ -216,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
return -EINVAL;
}
-static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
{
netdev_features_t features;
struct sk_buff *segs;
@@ -224,8 +229,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
/* common case: locally created skb or seglen is <= mtu */
if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
- skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
- return ip_finish_output2(sk, skb);
+ skb_gso_network_seglen(skb) <= mtu)
+ return ip_finish_output2(net, sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
*
@@ -235,6 +240,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
* from host network stack.
*/
features = netif_skb_features(skb);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
@@ -248,7 +254,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
int err;
segs->next = NULL;
- err = ip_fragment(sk, segs, ip_finish_output2);
+ err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
@@ -258,25 +264,28 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
return ret;
}
-static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ unsigned int mtu;
+
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
+ mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
- return ip_finish_output_gso(sk, skb);
+ return ip_finish_output_gso(net, sk, skb, mtu);
- if (skb->len > ip_skb_dst_mtu(skb))
- return ip_fragment(sk, skb, ip_finish_output2);
+ if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+ return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
- return ip_finish_output2(sk, skb);
+ return ip_finish_output2(net, sk, skb);
}
-int ip_mc_output(struct sock *sk, struct sk_buff *skb)
+int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct net_device *dev = rt->dst.dev;
@@ -284,7 +293,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
/*
* If the indicated interface is up and running, send the packet.
*/
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
@@ -312,7 +321,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- sk, newskb, NULL, newskb->dev,
+ net, sk, newskb, NULL, newskb->dev,
dev_loopback_xmit);
}
@@ -327,26 +336,28 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
if (rt->rt_flags&RTCF_BROADCAST) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
- NULL, newskb->dev, dev_loopback_xmit);
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, newskb, NULL, newskb->dev,
+ dev_loopback_xmit);
}
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
- skb->dev, ip_finish_output,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb->dev,
+ ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_output(struct sock *sk, struct sk_buff *skb)
+int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
- NULL, dev,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -369,6 +380,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
@@ -399,7 +411,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
@@ -436,20 +448,20 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_segs(sock_net(sk), skb, sk,
+ ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- res = ip_local_out(skb);
+ res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
@@ -478,6 +490,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
+static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ if ((iph->frag_off & htons(IP_DF)) == 0)
+ return ip_do_fragment(net, sk, skb, output);
+
+ if (unlikely(!skb->ignore_df ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return ip_do_fragment(net, sk, skb, output);
+}
+
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
@@ -485,8 +519,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
* single device frame, and queue such a frame for sending.
*/
-int ip_fragment(struct sock *sk, struct sk_buff *skb,
- int (*output)(struct sock *, struct sk_buff *))
+int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
int ptr;
@@ -500,6 +534,11 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
dev = rt->dst.dev;
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto fail;
+
/*
* Point into the IP datagram header.
*/
@@ -507,15 +546,8 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(skb);
- if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
- (IPCB(skb)->frag_max_size &&
- IPCB(skb)->frag_max_size > mtu))) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- kfree_skb(skb);
- return -EMSGSIZE;
- }
+ if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+ mtu = IPCB(skb)->frag_max_size;
/*
* Setup starting values.
@@ -523,10 +555,6 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
hlen = iph->ihl * 4;
mtu = mtu - hlen; /* Size of data space */
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge)
- mtu -= nf_bridge_mtu_reduction(skb);
-#endif
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
@@ -599,10 +627,10 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
ip_send_check(iph);
}
- err = output(sk, skb);
+ err = output(net, sk, skb);
if (!err)
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
if (err || !frag)
break;
@@ -612,7 +640,7 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
}
if (err == 0) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return 0;
}
@@ -621,7 +649,7 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
kfree_skb(frag);
frag = skb;
}
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
slow_path_clean:
@@ -635,9 +663,6 @@ slow_path_clean:
}
slow_path:
- /* for offloaded checksums cleanup checksum before fragmentation */
- if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
- goto fail;
iph = ip_hdr(skb);
left = skb->len - hlen; /* Space per frame */
@@ -711,6 +736,9 @@ slow_path:
iph = ip_hdr(skb2);
iph->frag_off = htons((offset >> 3));
+ if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+ iph->frag_off |= htons(IP_DF);
+
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
@@ -736,22 +764,22 @@ slow_path:
ip_send_check(iph);
- err = output(sk, skb2);
+ err = output(net, sk, skb2);
if (err)
goto fail;
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
}
consume_skb(skb);
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return err;
fail:
kfree_skb(skb);
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
}
-EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_do_fragment);
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
@@ -886,6 +914,7 @@ static int __ip_append_data(struct sock *sk,
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & NETIF_F_V4_CSUM &&
+ !(flags & MSG_MORE) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
@@ -893,7 +922,7 @@ static int __ip_append_data(struct sock *sk,
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
- (sk->sk_type == SOCK_DGRAM)) {
+ (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
maxfraglen, flags);
@@ -1217,11 +1246,9 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
}
while (size > 0) {
- int i;
-
- if (skb_is_gso(skb))
+ if (skb_is_gso(skb)) {
len = size;
- else {
+ } else {
/* Check if the remaining data fits into current packet. */
len = mtu - skb->len;
@@ -1273,15 +1300,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
continue;
}
- i = skb_shinfo(skb)->nr_frags;
if (len > size)
len = size;
- if (skb_can_coalesce(skb, i, page, offset)) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
- } else if (i < MAX_SKB_FRAGS) {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, len);
- } else {
+
+ if (skb_append_pagefrags(skb, page, offset, len)) {
err = -EMSGSIZE;
goto error;
}
@@ -1416,7 +1438,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb)
{
int err;
- err = ip_local_out(skb);
+ err = ip_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
@@ -1524,6 +1546,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
struct net *net = sock_net(sk);
struct sk_buff *nskb;
int err;
+ int oif;
if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
return;
@@ -1541,7 +1564,11 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
daddr = replyopts.opt.opt.faddr;
}
- flowi4_init_output(&fl4, arg->bound_dev_if,
+ oif = arg->bound_dev_if;
+ if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+ oif = skb->skb_iif;
+
+ flowi4_init_output(&fl4, oif,
IP4_REPLY_MARK(net, skb->mark),
RT_TOS(arg->tos),
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
@@ -1573,7 +1600,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4);
}
out: