diff options
Diffstat (limited to 'kernel/net/ipv4/fib_semantics.c')
-rw-r--r-- | kernel/net/ipv4/fib_semantics.c | 520 |
1 files changed, 396 insertions, 124 deletions
diff --git a/kernel/net/ipv4/fib_semantics.c b/kernel/net/ipv4/fib_semantics.c index 8d695b665..d97268e8f 100644 --- a/kernel/net/ipv4/fib_semantics.c +++ b/kernel/net/ipv4/fib_semantics.c @@ -42,6 +42,7 @@ #include <net/ip_fib.h> #include <net/netlink.h> #include <net/nexthop.h> +#include <net/lwtunnel.h> #include "fib_lookup.h" @@ -56,8 +57,7 @@ static unsigned int fib_info_cnt; static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; #ifdef CONFIG_IP_ROUTE_MULTIPATH - -static DEFINE_SPINLOCK(fib_multipath_lock); +u32 fib_multipath_secret __read_mostly; #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ @@ -208,6 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); + lwtstate_put(nexthop_nh->nh_lwtstate); free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); @@ -266,7 +267,8 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) + lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) || + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; } endfor_nexthops(fi); @@ -318,7 +320,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_type == fi->fib_type && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && - ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && + !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -366,6 +368,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->fib_nhs) { + size_t nh_encapsize = 0; /* Also handles the special case fib_nhs == 1 */ /* each nexthop is packed in an attribute */ @@ -374,8 +377,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); + /* grab encap info */ + for_nexthops(fi) { + if (nh->nh_lwtstate) { + /* RTA_ENCAP_TYPE */ + nh_encapsize += lwtunnel_get_encap_size( + nh->nh_lwtstate); + /* RTA_ENCAP */ + nh_encapsize += nla_total_size(2); + } + } endfor_nexthops(fi); + /* all nexthops are packed in a nested attribute */ - payload += nla_total_size(fi->fib_nhs * nhsize); + payload += nla_total_size((fi->fib_nhs * nhsize) + + nh_encapsize); + } return payload; @@ -421,13 +437,15 @@ static int fib_detect_death(struct fib_info *fi, int order, if (n) { state = n->nud_state; neigh_release(n); + } else { + return 0; } if (state == NUD_REACHABLE) return 0; if ((state & NUD_VALID) && order != dflt) return 0; if ((state & NUD_VALID) || - (*last_idx < 0 && order > dflt)) { + (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) { *last_resort = fi; *last_idx = order; } @@ -452,6 +470,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; + int ret; + change_nexthops(fi) { int attrlen; @@ -475,18 +496,130 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (nexthop_nh->nh_tclassid) fi->fib_net->ipv4.fib_num_tclassid_users++; #endif + nla = nla_find(attrs, attrlen, RTA_ENCAP); + if (nla) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + struct nlattr *nla_entype; + + nla_entype = nla_find(attrs, attrlen, + RTA_ENCAP_TYPE); + if (!nla_entype) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + ret = lwtunnel_build_state(dev, nla_get_u16( + nla_entype), + nla, AF_INET, cfg, + &lwtstate); + if (ret) + goto errout; + nexthop_nh->nh_lwtstate = + lwtstate_get(lwtstate); + } } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); return 0; + +err_inval: + ret = -EINVAL; + +errout: + return ret; } -#endif +static void fib_rebalance(struct fib_info *fi) +{ + int total; + int w; + struct in_device *in_dev; + + if (fi->fib_nhs < 2) + return; + + total = 0; + for_nexthops(fi) { + if (nh->nh_flags & RTNH_F_DEAD) + continue; + + in_dev = __in_dev_get_rtnl(nh->nh_dev); + + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN) + continue; + + total += nh->nh_weight; + } endfor_nexthops(fi); + + w = 0; + change_nexthops(fi) { + int upper_bound; + + in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev); + + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + upper_bound = -1; + } else if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) { + upper_bound = -1; + } else { + w += nexthop_nh->nh_weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, + total) - 1; + } + + atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); + } endfor_nexthops(fi); + + net_get_random_once(&fib_multipath_secret, + sizeof(fib_multipath_secret)); +} + +static inline void fib_add_weight(struct fib_info *fi, + const struct fib_nh *nh) +{ + fi->fib_weight += nh->nh_weight; +} + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define fib_rebalance(fi) do { } while (0) +#define fib_add_weight(fi, nh) do { } while (0) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + +static int fib_encap_match(struct net *net, u16 encap_type, + struct nlattr *encap, + int oif, const struct fib_nh *nh, + const struct fib_config *cfg) +{ + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + int ret, result = 0; + + if (encap_type == LWTUNNEL_ENCAP_NONE) + return 0; + + if (oif) + dev = __dev_get_by_index(net, oif); + ret = lwtunnel_build_state(dev, encap_type, encap, + AF_INET, cfg, &lwtstate); + if (!ret) { + result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + lwtstate_free(lwtstate); + } + + return result; +} int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) { + struct net *net = cfg->fc_nlinfo.nl_net; #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; @@ -496,6 +629,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) return 1; if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_encap) { + if (fib_encap_match(net, cfg->fc_encap_type, + cfg->fc_encap, cfg->fc_oif, + fi->fib_nh, cfg)) + return 1; + } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) return 0; @@ -585,7 +724,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh) { - int err; + int err = 0; struct net *net; struct net_device *dev; @@ -594,16 +733,20 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_result res; if (nh->nh_flags & RTNH_F_ONLINK) { + unsigned int addr_type; if (cfg->fc_scope >= RT_SCOPE_LINK) return -EINVAL; - if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) - return -EINVAL; dev = __dev_get_by_index(net, nh->nh_oif); if (!dev) return -ENODEV; if (!(dev->flags & IFF_UP)) return -ENETDOWN; + addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw); + if (addr_type != RTN_UNICAST) + return -EINVAL; + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; dev_hold(dev); nh->nh_scope = RT_SCOPE_LINK; @@ -611,6 +754,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } rcu_read_lock(); { + struct fib_table *tbl = NULL; struct flowi4 fl4 = { .daddr = nh->nh_gw, .flowi4_scope = cfg->fc_scope + 1, @@ -621,7 +765,24 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl4, &res); + + if (cfg->fc_table) + tbl = fib_get_table(net, cfg->fc_table); + + if (tbl) + err = fib_table_lookup(tbl, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE | + FIB_LOOKUP_NOREF); + + /* on error or if no table given do full lookup. This + * is needed for example when nexthops are in the local + * table rather than the given table + */ + if (!tbl || err) { + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); + } + if (err) { rcu_read_unlock(); return err; @@ -636,6 +797,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (!dev) goto out; dev_hold(dev); + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; } else { struct in_device *in_dev; @@ -654,6 +817,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->nh_dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = 0; } out: @@ -713,8 +878,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *dest; unsigned int new_hash; - hlist_del(&fi->fib_hash); - new_hash = fib_info_hashfn(fi); dest = &new_info_hash[new_hash]; hlist_add_head(&fi->fib_hash, dest); @@ -731,8 +894,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *ldest; unsigned int new_hash; - hlist_del(&fi->fib_lhash); - new_hash = fib_laddr_hashfn(fi->fib_prefsrc); ldest = &new_laddrhash[new_hash]; hlist_add_head(&fi->fib_lhash, ldest); @@ -757,6 +918,74 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) return nh->nh_saddr; } +static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) +{ + if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || + fib_prefsrc != cfg->fc_dst) { + u32 tb_id = cfg->fc_table; + int rc; + + if (tb_id == RT_TABLE_MAIN) + tb_id = RT_TABLE_LOCAL; + + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, tb_id); + + if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) { + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, RT_TABLE_LOCAL); + } + + if (rc != RTN_LOCAL) + return false; + } + return true; +} + +static int +fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) +{ + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + + if (!cfg->fc_mx) + return 0; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; + fi->fib_metrics[type - 1] = val; + } + + if (ecn_ca) + fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + + return 0; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -829,36 +1058,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } endfor_nexthops(fi) - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - u32 val; - - if (type > RTAX_MAX) - goto err_inval; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err_inval; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - fi->fib_metrics[type - 1] = val; - } - } - } + err = fib_convert_metrics(fi, cfg); + if (err) + goto failure; if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -879,6 +1081,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } else { struct fib_nh *nh = fi->fib_nh; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, AF_INET, cfg, + &lwtstate); + if (err) + goto failure; + + nh->nh_lwtstate = lwtstate_get(lwtstate); + } nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; @@ -924,24 +1142,29 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!nh->nh_dev) goto failure; } else { + int linkdown = 0; + change_nexthops(fi) { err = fib_check_nh(cfg, fi, nexthop_nh); if (err != 0) goto failure; + if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + linkdown++; } endfor_nexthops(fi) + if (linkdown == fi->fib_nhs) + fi->fib_flags |= RTNH_F_LINKDOWN; } - if (fi->fib_prefsrc) { - if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || - fi->fib_prefsrc != cfg->fc_dst) - if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL) - goto err_inval; - } + if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) + goto err_inval; change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) + fib_rebalance(fi); + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -1027,17 +1250,27 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { + struct in_device *in_dev; + if (fi->fib_nh->nh_gw && nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) goto nla_put_failure; if (fi->fib_nh->nh_oif && nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) goto nla_put_failure; + if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtm->rtm_flags |= RTNH_F_DEAD; + } #ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid && nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) goto nla_put_failure; #endif + if (fi->fib_nh->nh_lwtstate) + lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate); } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { @@ -1049,11 +1282,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; for_nexthops(fi) { + struct in_device *in_dev; + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_flags = nh->nh_flags & 0xFF; + if (nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rtnl(nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtnh->rtnh_flags |= RTNH_F_DEAD; + } rtnh->rtnh_hops = nh->nh_weight - 1; rtnh->rtnh_ifindex = nh->nh_oif; @@ -1065,6 +1306,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif + if (nh->nh_lwtstate) + lwtunnel_fill_encap(skb, nh->nh_lwtstate); /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; } endfor_nexthops(fi); @@ -1107,7 +1350,13 @@ int fib_sync_down_addr(struct net *net, __be32 local) return ret; } -int fib_sync_down_dev(struct net_device *dev, int force) +/* Event force Flags Description + * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host + * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host + * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed + * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed + */ +int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { int ret = 0; int scope = RT_SCOPE_NOWHERE; @@ -1133,49 +1382,79 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { - nexthop_nh->nh_flags |= RTNH_F_DEAD; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nexthop_nh->nh_power; - nexthop_nh->nh_power = 0; - spin_unlock_bh(&fib_multipath_lock); -#endif + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_nh->nh_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; + break; + } dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nexthop_nh->nh_dev == dev) { + if (event == NETDEV_UNREGISTER && + nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { - fi->fib_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + fi->fib_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + fi->fib_flags |= RTNH_F_LINKDOWN; + break; + } ret++; } + + fib_rebalance(fi); } return ret; } /* Must be invoked inside of an RCU protected region. */ -void fib_select_default(struct fib_result *res) +void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; struct hlist_head *fa_head = res->fa_head; struct fib_table *tb = res->table; + u8 slen = 32 - res->prefixlen; int order = -1, last_idx = -1; - struct fib_alias *fa; + struct fib_alias *fa, *fa1 = NULL; + u32 last_prio = res->fi->fib_priority; + u8 last_tos = 0; hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; + if (fa->fa_slen != slen) + continue; + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fa->tb_id != tb->tb_id) + continue; + if (next_fi->fib_priority > last_prio && + fa->fa_tos == last_tos) { + if (last_tos) + continue; + break; + } + if (next_fi->fib_flags & RTNH_F_DEAD) + continue; + last_tos = fa->fa_tos; + last_prio = next_fi->fib_priority; + if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; @@ -1185,10 +1464,11 @@ void fib_select_default(struct fib_result *res) if (!fi) { if (next_fi != res->fi) break; + fa1 = fa; } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { + &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); - tb->tb_default = order; + fa1->fa_default = order; goto out; } fi = next_fi; @@ -1196,31 +1476,30 @@ void fib_select_default(struct fib_result *res) } if (order <= 0 || !fi) { - tb->tb_default = -1; + if (fa1) + fa1->fa_default = -1; goto out; } if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { + fa1->fa_default)) { fib_result_assign(res, fi); - tb->tb_default = order; + fa1->fa_default = order; goto out; } if (last_idx >= 0) fib_result_assign(res, last_resort); - tb->tb_default = last_idx; + fa1->fa_default = last_idx; out: return; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev) +int fib_sync_up(struct net_device *dev, unsigned int nh_flags) { struct fib_info *prev_fi; unsigned int hash; @@ -1231,6 +1510,13 @@ int fib_sync_up(struct net_device *dev) if (!(dev->flags & IFF_UP)) return 0; + if (nh_flags & RTNH_F_DEAD) { + unsigned int flags = dev_get_flags(dev); + + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + nh_flags |= RTNH_F_LINKDOWN; + } + prev_fi = NULL; hash = fib_devindex_hashfn(dev->ifindex); head = &fib_info_devhash[hash]; @@ -1247,7 +1533,7 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & nh_flags)) { alive++; continue; } @@ -1258,71 +1544,57 @@ int fib_sync_up(struct net_device *dev) !__in_dev_get_rtnl(dev)) continue; alive++; - spin_lock_bh(&fib_multipath_lock); - nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~RTNH_F_DEAD; - spin_unlock_bh(&fib_multipath_lock); + nexthop_nh->nh_flags &= ~nh_flags; } endfor_nexthops(fi) if (alive > 0) { - fi->fib_flags &= ~RTNH_F_DEAD; + fi->fib_flags &= ~nh_flags; ret++; } + + fib_rebalance(fi); } return ret; } -/* - * The algorithm is suboptimal, but it provides really - * fair weighted route distribution. - */ -void fib_select_multipath(struct fib_result *res) +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +void fib_select_multipath(struct fib_result *res, int hash) { struct fib_info *fi = res->fi; - int w; - spin_lock_bh(&fib_multipath_lock); - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } - } endfor_nexthops(fi); - fi->fib_power = power; - if (power <= 0) { - spin_unlock_bh(&fib_multipath_lock); - /* Race condition: route has just become dead. */ - res->nh_sel = 0; - return; - } - } - - - /* w should be random number [0..fi->fib_power-1], - * it is pretty bad approximation. - */ - - w = jiffies % fi->fib_power; + for_nexthops(fi) { + if (hash > atomic_read(&nh->nh_upper_bound)) + continue; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && - nexthop_nh->nh_power) { - w -= nexthop_nh->nh_power; - if (w <= 0) { - nexthop_nh->nh_power--; - fi->fib_power--; - res->nh_sel = nhsel; - spin_unlock_bh(&fib_multipath_lock); - return; - } - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); /* Race condition: route has just become dead. */ res->nh_sel = 0; - spin_unlock_bh(&fib_multipath_lock); } #endif + +void fib_select_path(struct net *net, struct fib_result *res, + struct flowi4 *fl4, int mp_hash) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (mp_hash < 0) + mp_hash = get_hash_from_flowi4(fl4) >> 1; + + fib_select_multipath(res, mp_hash); + } + else +#endif + if (!res->prefixlen && + res->table->tb_num_default > 1 && + res->type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(fl4, res); + + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, *res); +} +EXPORT_SYMBOL_GPL(fib_select_path); |