summaryrefslogtreecommitdiffstats
path: root/kernel/net/ipv6/route.c
diff options
context:
space:
mode:
authorJosé Pekkarinen <jose.pekkarinen@nokia.com>2016-04-11 10:41:07 +0300
committerJosé Pekkarinen <jose.pekkarinen@nokia.com>2016-04-13 08:17:18 +0300
commite09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
treed10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/net/ipv6/route.c
parentf93b97fd65072de626c074dbe099a1fff05ce060 (diff)
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and rt patch from the rt wiki download page. During the rebasing, the following patch collided: Force tick interrupt and get rid of softirq magic(I70131fb85). Collisions have been removed because its logic was found on the source already. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/net/ipv6/route.c')
-rw-r--r--kernel/net/ipv6/route.c932
1 files changed, 624 insertions, 308 deletions
diff --git a/kernel/net/ipv6/route.c b/kernel/net/ipv6/route.c
index f371fefa7..3f164d3aa 100644
--- a/kernel/net/ipv6/route.c
+++ b/kernel/net/ipv6/route.c
@@ -54,10 +54,14 @@
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
+#include <net/lwtunnel.h>
+#include <net/ip_tunnels.h>
+#include <net/l3mdev.h>
#include <asm/uaccess.h>
@@ -72,8 +76,7 @@ enum rt6_nud_state {
RT6_NUD_SUCCEED = 1
};
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
- const struct in6_addr *dest);
+static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
@@ -84,14 +87,15 @@ static void ip6_dst_ifdown(struct dst_entry *,
static int ip6_dst_gc(struct dst_ops *ops);
static int ip6_pkt_discard(struct sk_buff *skb);
-static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
+static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
-static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
+static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
+static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -104,65 +108,83 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *gwaddr, int ifindex);
#endif
-static void rt6_bind_peer(struct rt6_info *rt, int create)
+struct uncached_list {
+ spinlock_t lock;
+ struct list_head head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
+
+static void rt6_uncached_list_add(struct rt6_info *rt)
{
- struct inet_peer_base *base;
- struct inet_peer *peer;
+ struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
- base = inetpeer_base_ptr(rt->_rt6i_peer);
- if (!base)
- return;
+ rt->dst.flags |= DST_NOCACHE;
+ rt->rt6i_uncached_list = ul;
- peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
- if (peer) {
- if (!rt6_set_peer(rt, peer))
- inet_putpeer(peer);
- }
+ spin_lock_bh(&ul->lock);
+ list_add_tail(&rt->rt6i_uncached, &ul->head);
+ spin_unlock_bh(&ul->lock);
}
-static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create)
+static void rt6_uncached_list_del(struct rt6_info *rt)
{
- if (rt6_has_peer(rt))
- return rt6_peer_ptr(rt);
+ if (!list_empty(&rt->rt6i_uncached)) {
+ struct uncached_list *ul = rt->rt6i_uncached_list;
- rt6_bind_peer(rt, create);
- return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL);
+ spin_lock_bh(&ul->lock);
+ list_del(&rt->rt6i_uncached);
+ spin_unlock_bh(&ul->lock);
+ }
}
-static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt)
+static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
- return __rt6_get_peer(rt, 1);
-}
+ struct net_device *loopback_dev = net->loopback_dev;
+ int cpu;
-static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
-{
- struct rt6_info *rt = (struct rt6_info *) dst;
- struct inet_peer *peer;
- u32 *p = NULL;
-
- if (!(rt->dst.flags & DST_HOST))
- return dst_cow_metrics_generic(dst, old);
+ if (dev == loopback_dev)
+ return;
- peer = rt6_get_peer_create(rt);
- if (peer) {
- u32 *old_p = __DST_METRICS_PTR(old);
- unsigned long prev, new;
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+ struct rt6_info *rt;
- p = peer->metrics;
- if (inet_metrics_new(peer) ||
- (old & DST_METRICS_FORCE_OVERWRITE))
- memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+ spin_lock_bh(&ul->lock);
+ list_for_each_entry(rt, &ul->head, rt6i_uncached) {
+ struct inet6_dev *rt_idev = rt->rt6i_idev;
+ struct net_device *rt_dev = rt->dst.dev;
- new = (unsigned long) p;
- prev = cmpxchg(&dst->_metrics, old, new);
+ if (rt_idev->dev == dev) {
+ rt->rt6i_idev = in6_dev_get(loopback_dev);
+ in6_dev_put(rt_idev);
+ }
- if (prev != old) {
- p = __DST_METRICS_PTR(prev);
- if (prev & DST_METRICS_READ_ONLY)
- p = NULL;
+ if (rt_dev == dev) {
+ rt->dst.dev = loopback_dev;
+ dev_hold(rt->dst.dev);
+ dev_put(rt_dev);
+ }
}
+ spin_unlock_bh(&ul->lock);
}
- return p;
+}
+
+static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
+{
+ return dst_metrics_write_ptr(rt->dst.from);
+}
+
+static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ struct rt6_info *rt = (struct rt6_info *)dst;
+
+ if (rt->rt6i_flags & RTF_PCPU)
+ return rt6_pcpu_cow_metrics(rt);
+ else if (rt->rt6i_flags & RTF_CACHE)
+ return NULL;
+ else
+ return dst_cow_metrics_generic(dst, old);
}
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
@@ -227,12 +249,6 @@ static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
{
}
-static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
- unsigned long old)
-{
- return NULL;
-}
-
static struct dst_ops ip6_dst_blackhole_ops = {
.family = AF_INET6,
.destroy = ip6_dst_destroy,
@@ -241,7 +257,7 @@ static struct dst_ops ip6_dst_blackhole_ops = {
.default_advmss = ip6_default_advmss,
.update_pmtu = ip6_rt_blackhole_update_pmtu,
.redirect = ip6_rt_blackhole_redirect,
- .cow_metrics = ip6_rt_blackhole_cow_metrics,
+ .cow_metrics = dst_cow_metrics_generic,
.neigh_lookup = ip6_neigh_lookup,
};
@@ -288,7 +304,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EINVAL,
.input = dst_discard,
- .output = dst_discard_sk,
+ .output = dst_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
.rt6i_protocol = RTPROT_KERNEL,
@@ -298,34 +314,67 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
#endif
+static void rt6_info_init(struct rt6_info *rt)
+{
+ struct dst_entry *dst = &rt->dst;
+
+ memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
+ INIT_LIST_HEAD(&rt->rt6i_siblings);
+ INIT_LIST_HEAD(&rt->rt6i_uncached);
+}
+
/* allocate dst with ip6_dst_ops */
-static inline struct rt6_info *ip6_dst_alloc(struct net *net,
- struct net_device *dev,
- int flags,
- struct fib6_table *table)
+static struct rt6_info *__ip6_dst_alloc(struct net *net,
+ struct net_device *dev,
+ int flags)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
0, DST_OBSOLETE_FORCE_CHK, flags);
+ if (rt)
+ rt6_info_init(rt);
+
+ return rt;
+}
+
+static struct rt6_info *ip6_dst_alloc(struct net *net,
+ struct net_device *dev,
+ int flags)
+{
+ struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
+
if (rt) {
- struct dst_entry *dst = &rt->dst;
+ rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
+ if (rt->rt6i_pcpu) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **p;
- memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
- rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
- INIT_LIST_HEAD(&rt->rt6i_siblings);
+ p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+ /* no one shares rt */
+ *p = NULL;
+ }
+ } else {
+ dst_destroy((struct dst_entry *)rt);
+ return NULL;
+ }
}
+
return rt;
}
static void ip6_dst_destroy(struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *)dst;
- struct inet6_dev *idev = rt->rt6i_idev;
struct dst_entry *from = dst->from;
+ struct inet6_dev *idev;
- if (!(rt->dst.flags & DST_HOST))
- dst_destroy_metrics_generic(dst);
+ dst_destroy_metrics_generic(dst);
+ free_percpu(rt->rt6i_pcpu);
+ rt6_uncached_list_del(rt);
+ idev = rt->rt6i_idev;
if (idev) {
rt->rt6i_idev = NULL;
in6_dev_put(idev);
@@ -333,11 +382,6 @@ static void ip6_dst_destroy(struct dst_entry *dst)
dst->from = NULL;
dst_release(from);
-
- if (rt6_has_peer(rt)) {
- struct inet_peer *peer = rt6_peer_ptr(rt);
- inet_putpeer(peer);
- }
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -360,6 +404,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
}
}
+static bool __rt6_check_expired(const struct rt6_info *rt)
+{
+ if (rt->rt6i_flags & RTF_EXPIRES)
+ return time_after(jiffies, rt->dst.expires);
+ else
+ return false;
+}
+
static bool rt6_check_expired(const struct rt6_info *rt)
{
if (rt->rt6i_flags & RTF_EXPIRES) {
@@ -378,31 +430,7 @@ static bool rt6_check_expired(const struct rt6_info *rt)
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
const struct flowi6 *fl6)
{
- unsigned int val = fl6->flowi6_proto;
-
- val ^= ipv6_addr_hash(&fl6->daddr);
- val ^= ipv6_addr_hash(&fl6->saddr);
-
- /* Work only if this not encapsulated */
- switch (fl6->flowi6_proto) {
- case IPPROTO_UDP:
- case IPPROTO_TCP:
- case IPPROTO_SCTP:
- val ^= (__force u16)fl6->fl6_sport;
- val ^= (__force u16)fl6->fl6_dport;
- break;
-
- case IPPROTO_ICMPV6:
- val ^= (__force u16)fl6->fl6_icmp_type;
- val ^= (__force u16)fl6->fl6_icmp_code;
- break;
- }
- /* RFC6438 recommands to use flowlabel */
- val ^= (__force u32)fl6->flowlabel;
-
- /* Perhaps, we need to tune, this function? */
- val = val ^ (val >> 7) ^ (val >> 12);
- return val % candidate_count;
+ return get_hash_from_flowi6(fl6) % candidate_count;
}
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
@@ -455,10 +483,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
if (dev->flags & IFF_LOOPBACK) {
if (!sprt->rt6i_idev ||
sprt->rt6i_idev->dev->ifindex != oif) {
- if (flags & RT6_LOOKUP_F_IFACE && oif)
+ if (flags & RT6_LOOKUP_F_IFACE)
continue;
- if (local && (!oif ||
- local->rt6i_idev->dev->ifindex == oif))
+ if (local &&
+ local->rt6i_idev->dev->ifindex == oif)
continue;
}
local = sprt;
@@ -495,13 +523,14 @@ static void rt6_probe_deferred(struct work_struct *w)
container_of(w, struct __rt6_probe_work, work);
addrconf_addr_solict_mult(&work->target, &mcaddr);
- ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
+ ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
dev_put(work->dev);
kfree(work);
}
static void rt6_probe(struct rt6_info *rt)
{
+ struct __rt6_probe_work *work;
struct neighbour *neigh;
/*
* Okay, this does not seem to be appropriate
@@ -516,34 +545,33 @@ static void rt6_probe(struct rt6_info *rt)
rcu_read_lock_bh();
neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
if (neigh) {
- write_lock(&neigh->lock);
if (neigh->nud_state & NUD_VALID)
goto out;
- }
-
- if (!neigh ||
- time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
- struct __rt6_probe_work *work;
+ work = NULL;
+ write_lock(&neigh->lock);
+ if (!(neigh->nud_state & NUD_VALID) &&
+ time_after(jiffies,
+ neigh->updated +
+ rt->rt6i_idev->cnf.rtr_probe_interval)) {
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work)
+ __neigh_set_probe_once(neigh);
+ }
+ write_unlock(&neigh->lock);
+ } else {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ }
- if (neigh && work)
- __neigh_set_probe_once(neigh);
-
- if (neigh)
- write_unlock(&neigh->lock);
+ if (work) {
+ INIT_WORK(&work->work, rt6_probe_deferred);
+ work->target = rt->rt6i_gateway;
+ dev_hold(rt->dst.dev);
+ work->dev = rt->dst.dev;
+ schedule_work(&work->work);
+ }
- if (work) {
- INIT_WORK(&work->work, rt6_probe_deferred);
- work->target = rt->rt6i_gateway;
- dev_hold(rt->dst.dev);
- work->dev = rt->dst.dev;
- schedule_work(&work->work);
- }
- } else {
out:
- write_unlock(&neigh->lock);
- }
rcu_read_unlock_bh();
}
#else
@@ -622,6 +650,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
{
int m;
bool match_do_rr = false;
+ struct inet6_dev *idev = rt->rt6i_idev;
+ struct net_device *dev = rt->dst.dev;
+
+ if (dev && !netif_carrier_ok(dev) &&
+ idev->cnf.ignore_routes_with_linkdown)
+ goto out;
if (rt6_check_expired(rt))
goto out;
@@ -652,15 +686,33 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
u32 metric, int oif, int strict,
bool *do_rr)
{
- struct rt6_info *rt, *match;
+ struct rt6_info *rt, *match, *cont;
int mpri = -1;
match = NULL;
- for (rt = rr_head; rt && rt->rt6i_metric == metric;
- rt = rt->dst.rt6_next)
+ cont = NULL;
+ for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
+ if (rt->rt6i_metric != metric) {
+ cont = rt;
+ break;
+ }
+
match = find_match(rt, oif, strict, &mpri, match, do_rr);
- for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
- rt = rt->dst.rt6_next)
+ }
+
+ for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
+ if (rt->rt6i_metric != metric) {
+ cont = rt;
+ break;
+ }
+
+ match = find_match(rt, oif, strict, &mpri, match, do_rr);
+ }
+
+ if (match || !cont)
+ return match;
+
+ for (rt = cont; rt; rt = rt->dst.rt6_next)
match = find_match(rt, oif, strict, &mpri, match, do_rr);
return match;
@@ -694,6 +746,11 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
return match ? match : net->ipv6.ip6_null_entry;
}
+static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
+{
+ return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+}
+
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
const struct in6_addr *gwaddr)
@@ -872,9 +929,9 @@ int ip6_ins_rt(struct rt6_info *rt)
return __ip6_ins_rt(rt, &info, &mxc);
}
-static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct rt6_info *rt;
@@ -882,15 +939,25 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
* Clone the route.
*/
- rt = ip6_rt_copy(ort, daddr);
+ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
+ ort = (struct rt6_info *)ort->dst.from;
- if (rt) {
+ rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
+
+ if (!rt)
+ return NULL;
+
+ ip6_rt_copy_init(rt, ort);
+ rt->rt6i_flags |= RTF_CACHE;
+ rt->rt6i_metric = 0;
+ rt->dst.flags |= DST_HOST;
+ rt->rt6i_dst.addr = *daddr;
+ rt->rt6i_dst.plen = 128;
+
+ if (!rt6_is_gw_or_nonexthop(ort)) {
if (ort->rt6i_dst.plen != 128 &&
ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
rt->rt6i_flags |= RTF_ANYCAST;
-
- rt->rt6i_flags |= RTF_CACHE;
-
#ifdef CONFIG_IPV6_SUBTREES
if (rt->rt6i_src.plen && saddr) {
rt->rt6i_src.addr = *saddr;
@@ -902,35 +969,93 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
return rt;
}
-static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
- const struct in6_addr *daddr)
+static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
- struct rt6_info *rt = ip6_rt_copy(ort, daddr);
+ struct rt6_info *pcpu_rt;
- if (rt)
- rt->rt6i_flags |= RTF_CACHE;
- return rt;
+ pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
+ rt->dst.dev, rt->dst.flags);
+
+ if (!pcpu_rt)
+ return NULL;
+ ip6_rt_copy_init(pcpu_rt, rt);
+ pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
+ pcpu_rt->rt6i_flags |= RTF_PCPU;
+ return pcpu_rt;
+}
+
+/* It should be called with read_lock_bh(&tb6_lock) acquired */
+static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
+{
+ struct rt6_info *pcpu_rt, **p;
+
+ p = this_cpu_ptr(rt->rt6i_pcpu);
+ pcpu_rt = *p;
+
+ if (pcpu_rt) {
+ dst_hold(&pcpu_rt->dst);
+ rt6_dst_from_metrics_check(pcpu_rt);
+ }
+ return pcpu_rt;
+}
+
+static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
+{
+ struct fib6_table *table = rt->rt6i_table;
+ struct rt6_info *pcpu_rt, *prev, **p;
+
+ pcpu_rt = ip6_rt_pcpu_alloc(rt);
+ if (!pcpu_rt) {
+ struct net *net = dev_net(rt->dst.dev);
+
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
+ return net->ipv6.ip6_null_entry;
+ }
+
+ read_lock_bh(&table->tb6_lock);
+ if (rt->rt6i_pcpu) {
+ p = this_cpu_ptr(rt->rt6i_pcpu);
+ prev = cmpxchg(p, NULL, pcpu_rt);
+ if (prev) {
+ /* If someone did it before us, return prev instead */
+ dst_destroy(&pcpu_rt->dst);
+ pcpu_rt = prev;
+ }
+ } else {
+ /* rt has been removed from the fib6 tree
+ * before we have a chance to acquire the read_lock.
+ * In this case, don't brother to create a pcpu rt
+ * since rt is going away anyway. The next
+ * dst_check() will trigger a re-lookup.
+ */
+ dst_destroy(&pcpu_rt->dst);
+ pcpu_rt = rt;
+ }
+ dst_hold(&pcpu_rt->dst);
+ rt6_dst_from_metrics_check(pcpu_rt);
+ read_unlock_bh(&table->tb6_lock);
+ return pcpu_rt;
}
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
struct flowi6 *fl6, int flags)
{
struct fib6_node *fn, *saved_fn;
- struct rt6_info *rt, *nrt;
+ struct rt6_info *rt;
int strict = 0;
- int attempts = 3;
- int err;
strict |= flags & RT6_LOOKUP_F_IFACE;
if (net->ipv6.devconf_all->forwarding == 0)
strict |= RT6_LOOKUP_F_REACHABLE;
-redo_fib6_lookup_lock:
read_lock_bh(&table->tb6_lock);
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
+ if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
+ oif = 0;
+
redo_rt6_select:
rt = rt6_select(fn, oif, strict);
if (rt->rt6i_nsiblings)
@@ -944,51 +1069,65 @@ redo_rt6_select:
strict &= ~RT6_LOOKUP_F_REACHABLE;
fn = saved_fn;
goto redo_rt6_select;
- } else {
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- goto out2;
}
}
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- if (rt->rt6i_flags & RTF_CACHE)
- goto out2;
+ if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
+ dst_use(&rt->dst, jiffies);
+ read_unlock_bh(&table->tb6_lock);
- if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
- nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
- else if (!(rt->dst.flags & DST_HOST))
- nrt = rt6_alloc_clone(rt, &fl6->daddr);
- else
- goto out2;
+ rt6_dst_from_metrics_check(rt);
+ return rt;
+ } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
+ !(rt->rt6i_flags & RTF_GATEWAY))) {
+ /* Create a RTF_CACHE clone which will not be
+ * owned by the fib6 tree. It is for the special case where
+ * the daddr in the skb during the neighbor look-up is different
+ * from the fl6->daddr used to look-up route here.
+ */
- ip6_rt_put(rt);
- rt = nrt ? : net->ipv6.ip6_null_entry;
+ struct rt6_info *uncached_rt;
- dst_hold(&rt->dst);
- if (nrt) {
- err = ip6_ins_rt(nrt);
- if (!err)
- goto out2;
- }
+ dst_use(&rt->dst, jiffies);
+ read_unlock_bh(&table->tb6_lock);
- if (--attempts <= 0)
- goto out2;
+ uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
+ dst_release(&rt->dst);
- /*
- * Race condition! In the gap, when table->tb6_lock was
- * released someone could insert this route. Relookup.
- */
- ip6_rt_put(rt);
- goto redo_fib6_lookup_lock;
+ if (uncached_rt)
+ rt6_uncached_list_add(uncached_rt);
+ else
+ uncached_rt = net->ipv6.ip6_null_entry;
-out2:
- rt->dst.lastuse = jiffies;
- rt->dst.__use++;
+ dst_hold(&uncached_rt->dst);
+ return uncached_rt;
- return rt;
+ } else {
+ /* Get a percpu copy */
+
+ struct rt6_info *pcpu_rt;
+
+ rt->dst.lastuse = jiffies;
+ rt->dst.__use++;
+ pcpu_rt = rt6_get_pcpu_route(rt);
+
+ if (pcpu_rt) {
+ read_unlock_bh(&table->tb6_lock);
+ } else {
+ /* We have to do the read_unlock first
+ * because rt6_make_pcpu_route() may trigger
+ * ip6_dst_gc() which will take the write_lock.
+ */
+ dst_hold(&rt->dst);
+ read_unlock_bh(&table->tb6_lock);
+ pcpu_rt = rt6_make_pcpu_route(rt);
+ dst_release(&rt->dst);
+ }
+
+ return pcpu_rt;
+
+ }
}
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
@@ -1012,8 +1151,9 @@ void ip6_route_input(struct sk_buff *skb)
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
int flags = RT6_LOOKUP_F_HAS_SADDR;
+ struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
- .flowi6_iif = skb->dev->ifindex,
+ .flowi6_iif = l3mdev_fib_oif(skb->dev),
.daddr = iph->daddr,
.saddr = iph->saddr,
.flowlabel = ip6_flowinfo(iph),
@@ -1021,6 +1161,10 @@ void ip6_route_input(struct sk_buff *skb)
.flowi6_proto = iph->nexthdr,
};
+ tun_info = skb_tunnel_info(skb);
+ if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
+ fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+ skb_dst_drop(skb);
skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
@@ -1030,24 +1174,31 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table
return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
-struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
- struct flowi6 *fl6)
+struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
+ struct flowi6 *fl6, int flags)
{
- int flags = 0;
+ struct dst_entry *dst;
+ bool any_src;
+
+ dst = l3mdev_rt6_dst_by_oif(net, fl6);
+ if (dst)
+ return dst;
fl6->flowi6_iif = LOOPBACK_IFINDEX;
- if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
+ any_src = ipv6_addr_any(&fl6->saddr);
+ if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
+ (fl6->flowi6_oif && any_src))
flags |= RT6_LOOKUP_F_IFACE;
- if (!ipv6_addr_any(&fl6->saddr))
+ if (!any_src)
flags |= RT6_LOOKUP_F_HAS_SADDR;
else if (sk)
flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
-EXPORT_SYMBOL(ip6_route_output);
+EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
@@ -1056,25 +1207,20 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
if (rt) {
- new = &rt->dst;
-
- memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
- rt6_init_peer(rt, net->ipv6.peers);
+ rt6_info_init(rt);
+ new = &rt->dst;
new->__use = 1;
new->input = dst_discard;
- new->output = dst_discard_sk;
+ new->output = dst_discard_out;
- if (dst_metrics_read_only(&ort->dst))
- new->_metrics = ort->dst._metrics;
- else
- dst_copy_metrics(new, &ort->dst);
+ dst_copy_metrics(new, &ort->dst);
rt->rt6i_idev = ort->rt6i_idev;
if (rt->rt6i_idev)
in6_dev_hold(rt->rt6i_idev);
rt->rt6i_gateway = ort->rt6i_gateway;
- rt->rt6i_flags = ort->rt6i_flags;
+ rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
rt->rt6i_metric = 0;
memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
@@ -1093,6 +1239,34 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
* Destination cache support functions
*/
+static void rt6_dst_from_metrics_check(struct rt6_info *rt)
+{
+ if (rt->dst.from &&
+ dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
+ dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
+}
+
+static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
+{
+ if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+ return NULL;
+
+ if (rt6_check_expired(rt))
+ return NULL;
+
+ return &rt->dst;
+}
+
+static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
+{
+ if (!__rt6_check_expired(rt) &&
+ rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ rt6_check((struct rt6_info *)(rt->dst.from), cookie))
+ return &rt->dst;
+ else
+ return NULL;
+}
+
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
struct rt6_info *rt;
@@ -1103,13 +1277,14 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
* DST_OBSOLETE_FORCE_CHK which forces validation calls down
* into this function always.
*/
- if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
- return NULL;
- if (rt6_check_expired(rt))
- return NULL;
+ rt6_dst_from_metrics_check(rt);
- return dst;
+ if (rt->rt6i_flags & RTF_PCPU ||
+ (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
+ return rt6_dst_from_check(rt, cookie);
+ else
+ return rt6_check(rt, cookie);
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
@@ -1140,32 +1315,76 @@ static void ip6_link_failure(struct sk_buff *skb)
if (rt) {
if (rt->rt6i_flags & RTF_CACHE) {
dst_hold(&rt->dst);
- if (ip6_del_rt(rt))
- dst_free(&rt->dst);
+ ip6_del_rt(rt);
} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
rt->rt6i_node->fn_sernum = -1;
}
}
}
-static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
+{
+ struct net *net = dev_net(rt->dst.dev);
+
+ rt->rt6i_flags |= RTF_MODIFIED;
+ rt->rt6i_pmtu = mtu;
+ rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
+}
+
+static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
+{
+ return !(rt->rt6i_flags & RTF_CACHE) &&
+ (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
+}
+
+static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
+ const struct ipv6hdr *iph, u32 mtu)
{
struct rt6_info *rt6 = (struct rt6_info *)dst;
- dst_confirm(dst);
- if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
- struct net *net = dev_net(dst->dev);
+ if (rt6->rt6i_flags & RTF_LOCAL)
+ return;
- rt6->rt6i_flags |= RTF_MODIFIED;
- if (mtu < IPV6_MIN_MTU)
- mtu = IPV6_MIN_MTU;
+ dst_confirm(dst);
+ mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+ if (mtu >= dst_mtu(dst))
+ return;
- dst_metric_set(dst, RTAX_MTU, mtu);
- rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+ if (!rt6_cache_allowed_for_pmtu(rt6)) {
+ rt6_do_update_pmtu(rt6, mtu);
+ } else {
+ const struct in6_addr *daddr, *saddr;
+ struct rt6_info *nrt6;
+
+ if (iph) {
+ daddr = &iph->daddr;
+ saddr = &iph->saddr;
+ } else if (sk) {
+ daddr = &sk->sk_v6_daddr;
+ saddr = &inet6_sk(sk)->saddr;
+ } else {
+ return;
+ }
+ nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
+ if (nrt6) {
+ rt6_do_update_pmtu(nrt6, mtu);
+
+ /* ip6_ins_rt(nrt6) will bump the
+ * rt6->rt6i_node->fn_sernum
+ * which will fail the next rt6_check() and
+ * invalidate the sk->sk_dst_cache.
+ */
+ ip6_ins_rt(nrt6);
+ }
}
}
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
+}
+
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
int oif, u32 mark)
{
@@ -1182,7 +1401,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
- ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
+ __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
@@ -1341,12 +1560,17 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
+ const struct rt6_info *rt = (const struct rt6_info *)dst;
+ unsigned int mtu = rt->rt6i_pmtu;
struct inet6_dev *idev;
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
if (mtu)
goto out;
+ mtu = dst_metric_raw(dst, RTAX_MTU);
+ if (mtu)
+ goto out;
+
mtu = IPV6_MIN_MTU;
rcu_read_lock();
@@ -1373,7 +1597,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
if (unlikely(!idev))
return ERR_PTR(-ENODEV);
- rt = ip6_dst_alloc(net, dev, 0, NULL);
+ rt = ip6_dst_alloc(net, dev, 0);
if (unlikely(!rt)) {
in6_dev_put(idev);
dst = ERR_PTR(-ENOMEM);
@@ -1472,6 +1696,7 @@ out:
static int ip6_convert_metrics(struct mx6_config *mxc,
const struct fib6_config *cfg)
{
+ bool ecn_ca = false;
struct nlattr *nla;
int remaining;
u32 *mp;
@@ -1485,51 +1710,57 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
int type = nla_type(nla);
+ u32 val;
+
+ if (!type)
+ continue;
+ if (unlikely(type > RTAX_MAX))
+ goto err;
- if (type) {
- u32 val;
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
- if (unlikely(type > RTAX_MAX))
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ if (val == TCP_CA_UNSPEC)
goto err;
- if (type == RTAX_CC_ALGO) {
- char tmp[TCP_CA_NAME_MAX];
+ } else {
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ goto err;
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(tmp);
- if (val == TCP_CA_UNSPEC)
- goto err;
- } else {
- val = nla_get_u32(nla);
- }
+ mp[type - 1] = val;
+ __set_bit(type - 1, mxc->mx_valid);
+ }
- mp[type - 1] = val;
- __set_bit(type - 1, mxc->mx_valid);
- }
+ if (ecn_ca) {
+ __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
+ mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
}
mxc->mx = mp;
-
return 0;
err:
kfree(mp);
return -EINVAL;
}
-int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
+static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
{
- int err;
struct net *net = cfg->fc_nlinfo.nl_net;
struct rt6_info *rt = NULL;
struct net_device *dev = NULL;
struct inet6_dev *idev = NULL;
struct fib6_table *table;
int addr_type;
+ int err = -EINVAL;
if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
- return -EINVAL;
+ goto out;
#ifndef CONFIG_IPV6_SUBTREES
if (cfg->fc_src_len)
- return -EINVAL;
+ goto out;
#endif
if (cfg->fc_ifindex) {
err = -ENODEV;
@@ -1559,7 +1790,8 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
if (!table)
goto out;
- rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
+ rt = ip6_dst_alloc(net, NULL,
+ (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
if (!rt) {
err = -ENOMEM;
@@ -1587,12 +1819,29 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
rt->dst.output = ip6_output;
+ if (cfg->fc_encap) {
+ struct lwtunnel_state *lwtstate;
+
+ err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ cfg->fc_encap, AF_INET6, cfg,
+ &lwtstate);
+ if (err)
+ goto out;
+ rt->dst.lwtstate = lwtstate_get(lwtstate);
+ if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
+ rt->dst.lwtstate->orig_output = rt->dst.output;
+ rt->dst.output = lwtunnel_output;
+ }
+ if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
+ rt->dst.lwtstate->orig_input = rt->dst.input;
+ rt->dst.input = lwtunnel_input;
+ }
+ }
+
ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
rt->rt6i_dst.plen = cfg->fc_dst_len;
- if (rt->rt6i_dst.plen == 128) {
+ if (rt->rt6i_dst.plen == 128)
rt->dst.flags |= DST_HOST;
- dst_metrics_set_force_overwrite(&rt->dst);
- }
#ifdef CONFIG_IPV6_SUBTREES
ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
@@ -1626,7 +1875,7 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
switch (cfg->fc_type) {
case RTN_BLACKHOLE:
rt->dst.error = -EINVAL;
- rt->dst.output = dst_discard_sk;
+ rt->dst.output = dst_discard_out;
rt->dst.input = dst_discard;
break;
case RTN_PROHIBIT:
@@ -1635,9 +1884,11 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
rt->dst.input = ip6_pkt_prohibit;
break;
case RTN_THROW:
+ case RTN_UNREACHABLE:
default:
rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
- : -ENETUNREACH;
+ : (cfg->fc_type == RTN_UNREACHABLE)
+ ? -EHOSTUNREACH : -ENETUNREACH;
rt->dst.output = ip6_pkt_discard_out;
rt->dst.input = ip6_pkt_discard;
break;
@@ -1650,9 +1901,21 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
int gwa_type;
gw_addr = &cfg->fc_gateway;
- rt->rt6i_gateway = *gw_addr;
gwa_type = ipv6_addr_type(gw_addr);
+ /* if gw_addr is local we will fail to detect this in case
+ * address is still TENTATIVE (DAD in progress). rt6_lookup()
+ * will return already-added prefix route via interface that
+ * prefix route was assigned to, which might be non-loopback.
+ */
+ err = -EINVAL;
+ if (ipv6_chk_addr_and_flags(net, gw_addr,
+ gwa_type & IPV6_ADDR_LINKLOCAL ?
+ dev : NULL, 0, 0))
+ goto out;
+
+ rt->rt6i_gateway = *gw_addr;
+
if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
struct rt6_info *grt;
@@ -1663,7 +1926,6 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
(SIT, PtP, NBMA NOARP links) it is handy to allow
some exceptions. --ANK
*/
- err = -EINVAL;
if (!(gwa_type & IPV6_ADDR_UNICAST))
goto out;
@@ -1718,9 +1980,7 @@ install_route:
cfg->fc_nlinfo.nl_net = dev_net(dev);
- *rt_ret = rt;
-
- return 0;
+ return rt;
out:
if (dev)
dev_put(dev);
@@ -1729,20 +1989,21 @@ out:
if (rt)
dst_free(&rt->dst);
- *rt_ret = NULL;
-
- return err;
+ return ERR_PTR(err);
}
int ip6_route_add(struct fib6_config *cfg)
{
struct mx6_config mxc = { .mx = NULL, };
- struct rt6_info *rt = NULL;
+ struct rt6_info *rt;
int err;
- err = ip6_route_info_create(cfg, &rt);
- if (err)
+ rt = ip6_route_info_create(cfg);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto out;
+ }
err = ip6_convert_metrics(&mxc, cfg);
if (err)
@@ -1766,7 +2027,8 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
struct fib6_table *table;
struct net *net = dev_net(rt->dst.dev);
- if (rt == net->ipv6.ip6_null_entry) {
+ if (rt == net->ipv6.ip6_null_entry ||
+ rt->dst.flags & DST_NOCACHE) {
err = -ENOENT;
goto out;
}
@@ -1808,6 +2070,9 @@ static int ip6_route_del(struct fib6_config *cfg)
if (fn) {
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+ if ((rt->rt6i_flags & RTF_CACHE) &&
+ !(cfg->fc_flags & RTF_CACHE))
+ continue;
if (cfg->fc_ifindex &&
(!rt->dst.dev ||
rt->dst.dev->ifindex != cfg->fc_ifindex))
@@ -1830,7 +2095,6 @@ static int ip6_route_del(struct fib6_config *cfg)
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
struct netevent_redirect netevent;
struct rt6_info *rt, *nrt = NULL;
struct ndisc_options ndopts;
@@ -1891,7 +2155,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
}
rt = (struct rt6_info *) dst;
- if (rt == net->ipv6.ip6_null_entry) {
+ if (rt->rt6i_flags & RTF_REJECT) {
net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
return;
}
@@ -1917,7 +2181,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
NEIGH_UPDATE_F_ISROUTER))
);
- nrt = ip6_rt_copy(rt, &msg->dest);
+ nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
if (!nrt)
goto out;
@@ -1949,42 +2213,36 @@ out:
* Misc support functions
*/
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
- const struct in6_addr *dest)
+static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
- struct net *net = dev_net(ort->dst.dev);
- struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
- ort->rt6i_table);
+ BUG_ON(from->dst.from);
- if (rt) {
- rt->dst.input = ort->dst.input;
- rt->dst.output = ort->dst.output;
- rt->dst.flags |= DST_HOST;
-
- rt->rt6i_dst.addr = *dest;
- rt->rt6i_dst.plen = 128;
- dst_copy_metrics(&rt->dst, &ort->dst);
- rt->dst.error = ort->dst.error;
- rt->rt6i_idev = ort->rt6i_idev;
- if (rt->rt6i_idev)
- in6_dev_hold(rt->rt6i_idev);
- rt->dst.lastuse = jiffies;
-
- if (ort->rt6i_flags & RTF_GATEWAY)
- rt->rt6i_gateway = ort->rt6i_gateway;
- else
- rt->rt6i_gateway = *dest;
- rt->rt6i_flags = ort->rt6i_flags;
- rt6_set_from(rt, ort);
- rt->rt6i_metric = 0;
+ rt->rt6i_flags &= ~RTF_EXPIRES;
+ dst_hold(&from->dst);
+ rt->dst.from = &from->dst;
+ dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
+}
+static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
+{
+ rt->dst.input = ort->dst.input;
+ rt->dst.output = ort->dst.output;
+ rt->rt6i_dst = ort->rt6i_dst;
+ rt->dst.error = ort->dst.error;
+ rt->rt6i_idev = ort->rt6i_idev;
+ if (rt->rt6i_idev)
+ in6_dev_hold(rt->rt6i_idev);
+ rt->dst.lastuse = jiffies;
+ rt->rt6i_gateway = ort->rt6i_gateway;
+ rt->rt6i_flags = ort->rt6i_flags;
+ rt6_set_from(rt, ort);
+ rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
- memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+ rt->rt6i_src = ort->rt6i_src;
#endif
- memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
- rt->rt6i_table = ort->rt6i_table;
- }
- return rt;
+ rt->rt6i_prefsrc = ort->rt6i_prefsrc;
+ rt->rt6i_table = ort->rt6i_table;
+ rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -2026,7 +2284,6 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
unsigned int pref)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_INFO,
.fc_metric = IP6_RT_PRIO_USER,
.fc_ifindex = ifindex,
.fc_dst_len = prefixlen,
@@ -2037,6 +2294,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
.fc_nlinfo.nl_net = net,
};
+ cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
cfg.fc_dst = *prefix;
cfg.fc_gateway = *gwaddr;
@@ -2077,7 +2335,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
unsigned int pref)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_DFLT,
+ .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
.fc_metric = IP6_RT_PRIO_USER,
.fc_ifindex = dev->ifindex,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
@@ -2124,7 +2382,8 @@ static void rtmsg_to_fib6_config(struct net *net,
{
memset(cfg, 0, sizeof(*cfg));
- cfg->fc_table = RT6_TABLE_MAIN;
+ cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
+ : RT6_TABLE_MAIN;
cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
cfg->fc_metric = rtmsg->rtmsg_metric;
cfg->fc_expires = rtmsg->rtmsg_info;
@@ -2208,7 +2467,7 @@ static int ip6_pkt_discard(struct sk_buff *skb)
return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
-static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
+static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
skb->dev = skb_dst(skb)->dev;
return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
@@ -2219,7 +2478,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb)
return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
-static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
+static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
skb->dev = skb_dst(skb)->dev;
return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
@@ -2233,9 +2492,10 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
const struct in6_addr *addr,
bool anycast)
{
+ u32 tb_id;
struct net *net = dev_net(idev->dev);
struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
- DST_NOCOUNT, NULL);
+ DST_NOCOUNT);
if (!rt)
return ERR_PTR(-ENOMEM);
@@ -2255,7 +2515,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
rt->rt6i_gateway = *addr;
rt->rt6i_dst.addr = *addr;
rt->rt6i_dst.plen = 128;
- rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
+ tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
+ rt->rt6i_table = fib6_get_table(net, tb_id);
+ rt->dst.flags |= DST_NOCACHE;
atomic_set(&rt->dst.__refcnt, 1);
@@ -2359,6 +2621,8 @@ void rt6_ifdown(struct net *net, struct net_device *dev)
fib6_clean_all(net, fib6_ifdown, &adn);
icmp6_clean_all(fib6_ifdown, &adn);
+ if (dev)
+ rt6_uncached_list_flush_dev(net, dev);
}
struct rt6_mtu_change_arg {
@@ -2396,11 +2660,20 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
PMTU discouvery.
*/
if (rt->dst.dev == arg->dev &&
- !dst_metric_locked(&rt->dst, RTAX_MTU) &&
- (dst_mtu(&rt->dst) >= arg->mtu ||
- (dst_mtu(&rt->dst) < arg->mtu &&
- dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
- dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+ !dst_metric_locked(&rt->dst, RTAX_MTU)) {
+ if (rt->rt6i_flags & RTF_CACHE) {
+ /* For RTF_CACHE with rt6i_pmtu == 0
+ * (i.e. a redirected route),
+ * the metrics of its rt->dst.from has already
+ * been updated.
+ */
+ if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
+ rt->rt6i_pmtu = arg->mtu;
+ } else if (dst_mtu(&rt->dst) >= arg->mtu ||
+ (dst_mtu(&rt->dst) < arg->mtu &&
+ dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
+ dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+ }
}
return 0;
}
@@ -2423,6 +2696,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_PREF] = { .type = NLA_U8 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2457,6 +2732,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (rtm->rtm_type == RTN_LOCAL)
cfg->fc_flags |= RTF_LOCAL;
+ if (rtm->rtm_flags & RTM_F_CLONED)
+ cfg->fc_flags |= RTF_CACHE;
+
cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
cfg->fc_nlinfo.nlh = nlh;
cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
@@ -2514,6 +2792,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags |= RTF_PREF(pref);
}
+ if (tb[RTA_ENCAP])
+ cfg->fc_encap = tb[RTA_ENCAP];
+
+ if (tb[RTA_ENCAP_TYPE])
+ cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+
err = 0;
errout:
return err;
@@ -2605,11 +2889,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
+ r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+ if (nla)
+ r_cfg.fc_encap_type = nla_get_u16(nla);
}
- err = ip6_route_info_create(&r_cfg, &rt);
- if (err)
+ rt = ip6_route_info_create(&r_cfg);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto cleanup;
+ }
err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
if (err) {
@@ -2658,8 +2949,7 @@ cleanup:
list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
if (nh->rt6_info)
dst_free(&nh->rt6_info->dst);
- if (nh->mxc.mx)
- kfree(nh->mxc.mx);
+ kfree(nh->mxc.mx);
list_del(&nh->next);
kfree(nh);
}
@@ -2734,7 +3024,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
return ip6_route_add(&cfg);
}
-static inline size_t rt6_nlmsg_size(void)
+static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
{
return NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(16) /* RTA_SRC */
@@ -2748,7 +3038,8 @@ static inline size_t rt6_nlmsg_size(void)
+ RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
- + nla_total_size(1); /* RTA_PREF */
+ + nla_total_size(1) /* RTA_PREF */
+ + lwtunnel_get_encap_size(rt->dst.lwtstate);
}
static int rt6_fill_node(struct net *net,
@@ -2757,6 +3048,7 @@ static int rt6_fill_node(struct net *net,
int iif, int type, u32 portid, u32 seq,
int prefix, int nowait, unsigned int flags)
{
+ u32 metrics[RTAX_MAX];
struct rtmsg *rtm;
struct nlmsghdr *nlh;
long expires;
@@ -2808,6 +3100,11 @@ static int rt6_fill_node(struct net *net,
else
rtm->rtm_type = RTN_UNICAST;
rtm->rtm_flags = 0;
+ if (!netif_carrier_ok(rt->dst.dev)) {
+ rtm->rtm_flags |= RTNH_F_LINKDOWN;
+ if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+ rtm->rtm_flags |= RTNH_F_DEAD;
+ }
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
rtm->rtm_protocol = rt->rt6i_protocol;
if (rt->rt6i_flags & RTF_DYNAMIC)
@@ -2870,7 +3167,10 @@ static int rt6_fill_node(struct net *net,
goto nla_put_failure;
}
- if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+ memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+ if (rt->rt6i_pmtu)
+ metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
+ if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
if (rt->rt6i_flags & RTF_GATEWAY) {
@@ -2892,6 +3192,8 @@ static int rt6_fill_node(struct net *net,
if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
goto nla_put_failure;
+ lwtunnel_fill_encap(skb, rt->dst.lwtstate);
+
nlmsg_end(skb, nlh);
return 0;
@@ -2977,6 +3279,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
} else {
fl6.flowi6_oif = oif;
+ if (netif_index_is_l3_master(net, oif)) {
+ fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
+ FLOWI_FLAG_SKIP_NH_OIF;
+ }
+
rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
}
@@ -3008,7 +3315,8 @@ errout:
return err;
}
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+ unsigned int nlm_flags)
{
struct sk_buff *skb;
struct net *net = info->nl_net;
@@ -3018,12 +3326,12 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
- skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
if (!skb)
goto errout;
err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
- event, info->portid, seq, 0, 0, 0);
+ event, info->portid, seq, 0, 0, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -3365,6 +3673,7 @@ static struct notifier_block ip6_route_dev_notifier = {
int __init ip6_route_init(void)
{
int ret;
+ int cpu;
ret = -ENOMEM;
ip6_dst_ops_template.kmem_cachep =
@@ -3424,6 +3733,13 @@ int __init ip6_route_init(void)
if (ret)
goto out_register_late_subsys;
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+
+ INIT_LIST_HEAD(&ul->head);
+ spin_lock_init(&ul->lock);
+ }
+
out:
return ret;