diff options
Diffstat (limited to 'kernel/net/sched')
48 files changed, 1416 insertions, 602 deletions
diff --git a/kernel/net/sched/Kconfig b/kernel/net/sched/Kconfig index 2274e723a..daa33432b 100644 --- a/kernel/net/sched/Kconfig +++ b/kernel/net/sched/Kconfig @@ -312,6 +312,7 @@ config NET_SCH_PIE config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT + select NET_INGRESS ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. @@ -477,6 +478,16 @@ config NET_CLS_BPF To compile this code as a module, choose M here: the module will be called cls_bpf. +config NET_CLS_FLOWER + tristate "Flower classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + a configurable combination of packet keys and masks. + + To compile this code as a module, choose M here: the module will + be called cls_flower. + config NET_EMATCH bool "Extended Matches" select NET_CLS diff --git a/kernel/net/sched/Makefile b/kernel/net/sched/Makefile index 7ca7f4c1b..690c1689e 100644 --- a/kernel/net/sched/Makefile +++ b/kernel/net/sched/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o +obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o diff --git a/kernel/net/sched/act_api.c b/kernel/net/sched/act_api.c index f8d9c2a2c..06e7c4a37 100644 --- a/kernel/net/sched/act_api.c +++ b/kernel/net/sched/act_api.c @@ -27,7 +27,16 @@ #include <net/act_api.h> #include <net/netlink.h> -void tcf_hash_destroy(struct tc_action *a) +static void free_tcf(struct rcu_head *head) +{ + struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu); + + free_percpu(p->cpu_bstats); + free_percpu(p->cpu_qstats); + kfree(p); +} + +static void tcf_hash_destroy(struct tc_action *a) { struct tcf_common *p = a->priv; struct tcf_hashinfo *hinfo = a->ops->hinfo; @@ -41,9 +50,8 @@ void tcf_hash_destroy(struct tc_action *a) * gen_estimator est_timer() might access p->tcfc_lock * or bstats, wait a RCU grace period before freeing p */ - kfree_rcu(p, tcfc_rcu); + call_rcu(&p->tcfc_rcu, free_tcf); } -EXPORT_SYMBOL(tcf_hash_destroy); int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) { @@ -231,15 +239,16 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) if (est) gen_kill_estimator(&pc->tcfc_bstats, &pc->tcfc_rate_est); - kfree_rcu(pc, tcfc_rcu); + call_rcu(&pc->tcfc_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_cleanup); int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind) + int size, int bind, bool cpustats) { struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = kzalloc(size, GFP_KERNEL); + int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; @@ -247,18 +256,32 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, if (bind) p->tcfc_bindcnt = 1; + if (cpustats) { + p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!p->cpu_bstats) { +err1: + kfree(p); + return err; + } + p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!p->cpu_qstats) { +err2: + free_percpu(p->cpu_bstats); + goto err1; + } + } spin_lock_init(&p->tcfc_lock); INIT_HLIST_NODE(&p->tcfc_head); p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { - int err = gen_new_estimator(&p->tcfc_bstats, NULL, - &p->tcfc_rate_est, - &p->tcfc_lock, est); + err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, + &p->tcfc_rate_est, + &p->tcfc_lock, est); if (err) { - kfree(p); - return err; + free_percpu(p->cpu_qstats); + goto err2; } } @@ -393,11 +416,6 @@ int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, list_for_each_entry(a, actions, list) { repeat: ret = a->ops->act(skb, a, res); - if (TC_MUNGED & skb->tc_verd) { - /* copied already, allow trampling */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - } if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ if (ret != TC_ACT_PIPE) @@ -621,10 +639,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || + if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || - gnet_stats_copy_queue(&d, NULL, + gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfc_qstats, p->tcfc_qstats.qlen) < 0) goto errout; diff --git a/kernel/net/sched/act_bpf.c b/kernel/net/sched/act_bpf.c index 521ffca91..0bc6f912f 100644 --- a/kernel/net/sched/act_bpf.c +++ b/kernel/net/sched/act_bpf.c @@ -37,19 +37,25 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { struct tcf_bpf *prog = act->priv; + struct bpf_prog *filter; int action, filter_res; + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; if (unlikely(!skb_mac_header_was_set(skb))) return TC_ACT_UNSPEC; - spin_lock(&prog->tcf_lock); - - prog->tcf_tm.lastuse = jiffies; - bstats_update(&prog->tcf_bstats, skb); + tcf_lastuse_update(&prog->tcf_tm); + bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); - /* Needed here for accessing maps. */ rcu_read_lock(); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter = rcu_dereference(prog->filter); + if (at_ingress) { + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(filter, skb); + } rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. @@ -66,11 +72,12 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, case TC_ACT_PIPE: case TC_ACT_RECLASSIFY: case TC_ACT_OK: + case TC_ACT_REDIRECT: action = filter_res; break; case TC_ACT_SHOT: action = filter_res; - prog->tcf_qstats.drops++; + qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats)); break; case TC_ACT_UNSPEC: action = prog->tcf_action; @@ -80,7 +87,6 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, break; } - spin_unlock(&prog->tcf_lock); return action; } @@ -256,7 +262,10 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, struct tcf_bpf_cfg *cfg) { cfg->is_ebpf = tcf_bpf_is_ebpf(prog); - cfg->filter = prog->filter; + /* updates to prog->filter are prevented, since it's called either + * with rtnl lock or during final cleanup in rcu callback + */ + cfg->filter = rcu_dereference_protected(prog->filter, 1); cfg->bpf_ops = prog->bpf_ops; cfg->bpf_name = prog->bpf_name; @@ -271,7 +280,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct tc_act_bpf *parm; struct tcf_bpf *prog; bool is_bpf, is_ebpf; - int ret; + int ret, res = 0; if (!nla) return -EINVAL; @@ -280,45 +289,47 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (ret < 0) return ret; - is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; - is_ebpf = tb[TCA_ACT_BPF_FD]; - - if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || - !tb[TCA_ACT_BPF_PARMS]) + if (!tb[TCA_ACT_BPF_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - memset(&cfg, 0, sizeof(cfg)); - - ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : - tcf_bpf_init_from_efd(tb, &cfg); - if (ret < 0) - return ret; - if (!tcf_hash_check(parm->index, act, bind)) { ret = tcf_hash_create(parm->index, est, act, - sizeof(*prog), bind); + sizeof(*prog), bind, true); if (ret < 0) - goto destroy_fp; + return ret; - ret = ACT_P_CREATED; + res = ACT_P_CREATED; } else { /* Don't override defaults. */ if (bind) - goto destroy_fp; + return 0; tcf_hash_release(act, bind); - if (!replace) { - ret = -EEXIST; - goto destroy_fp; - } + if (!replace) + return -EEXIST; } + is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; + is_ebpf = tb[TCA_ACT_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) { + ret = -EINVAL; + goto out; + } + + memset(&cfg, 0, sizeof(cfg)); + + ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : + tcf_bpf_init_from_efd(tb, &cfg); + if (ret < 0) + goto out; + prog = to_bpf(act); - spin_lock_bh(&prog->tcf_lock); + ASSERT_RTNL(); - if (ret != ACT_P_CREATED) + if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); prog->bpf_ops = cfg.bpf_ops; @@ -330,19 +341,21 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog->bpf_fd = cfg.bpf_fd; prog->tcf_action = parm->action; - prog->filter = cfg.filter; - - spin_unlock_bh(&prog->tcf_lock); + rcu_assign_pointer(prog->filter, cfg.filter); - if (ret == ACT_P_CREATED) + if (res == ACT_P_CREATED) { tcf_hash_insert(act); - else + } else { + /* make sure the program being replaced is no longer executing */ + synchronize_rcu(); tcf_bpf_cfg_cleanup(&old); + } - return ret; + return res; +out: + if (res == ACT_P_CREATED) + tcf_hash_cleanup(act, est); -destroy_fp: - tcf_bpf_cfg_cleanup(&cfg); return ret; } diff --git a/kernel/net/sched/act_connmark.c b/kernel/net/sched/act_connmark.c index 295d14bd6..bb41699c6 100644 --- a/kernel/net/sched/act_connmark.c +++ b/kernel/net/sched/act_connmark.c @@ -37,6 +37,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct tcf_connmark_info *ca = a->priv; + struct nf_conntrack_zone zone; struct nf_conn *c; int proto; @@ -67,10 +68,13 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, } if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - proto, &tuple)) + proto, ca->net, &tuple)) goto out; - thash = nf_conntrack_find_get(dev_net(skb->dev), ca->zone, &tuple); + zone.id = ca->zone; + zone.dir = NF_CT_DEFAULT_ZONE_DIR; + + thash = nf_conntrack_find_get(ca->net, &zone, &tuple); if (!thash) goto out; @@ -108,12 +112,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), + bind, false); if (ret) return ret; ci = to_connmark(a); ci->tcf_action = parm->action; + ci->net = net; ci->zone = parm->zone; tcf_hash_insert(a); diff --git a/kernel/net/sched/act_csum.c b/kernel/net/sched/act_csum.c index 4cd5cf1ae..b07c535ba 100644 --- a/kernel/net/sched/act_csum.c +++ b/kernel/net/sched/act_csum.c @@ -62,7 +62,8 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_CSUM_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/kernel/net/sched/act_gact.c b/kernel/net/sched/act_gact.c index 7fffc2272..5c1b05170 100644 --- a/kernel/net/sched/act_gact.c +++ b/kernel/net/sched/act_gact.c @@ -28,14 +28,18 @@ #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { - if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval) + smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ + if (prandom_u32() % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } static int gact_determ(struct tcf_gact *gact) { - if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval) + u32 pack = atomic_inc_return(&gact->packets); + + smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ + if (pack % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } @@ -85,7 +89,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, #endif if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), + bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -99,16 +104,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact = to_gact(a); - spin_lock_bh(&gact->tcf_lock); + ASSERT_RTNL(); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB if (p_parm) { gact->tcfg_paction = p_parm->paction; - gact->tcfg_pval = p_parm->pval; + gact->tcfg_pval = max_t(u16, 1, p_parm->pval); + /* Make sure tcfg_pval is written before tcfg_ptype + * coupled with smp_rmb() in gact_net_rand() & gact_determ() + */ + smp_wmb(); gact->tcfg_ptype = p_parm->ptype; } #endif - spin_unlock_bh(&gact->tcf_lock); if (ret == ACT_P_CREATED) tcf_hash_insert(a); return ret; @@ -118,23 +126,21 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gact *gact = a->priv; - int action = TC_ACT_SHOT; + int action = READ_ONCE(gact->tcf_action); - spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (gact->tcfg_ptype) - action = gact_rand[gact->tcfg_ptype](gact); - else - action = gact->tcf_action; -#else - action = gact->tcf_action; + { + u32 ptype = READ_ONCE(gact->tcfg_ptype); + + if (ptype) + action = gact_rand[ptype](gact); + } #endif - gact->tcf_bstats.bytes += qdisc_pkt_len(skb); - gact->tcf_bstats.packets++; + bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb); if (action == TC_ACT_SHOT) - gact->tcf_qstats.drops++; - gact->tcf_tm.lastuse = jiffies; - spin_unlock(&gact->tcf_lock); + qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats)); + + tcf_lastuse_update(&gact->tcf_tm); return action; } diff --git a/kernel/net/sched/act_ipt.c b/kernel/net/sched/act_ipt.c index cbc8dd7dd..d05869646 100644 --- a/kernel/net/sched/act_ipt.c +++ b/kernel/net/sched/act_ipt.c @@ -114,7 +114,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, index = nla_get_u32(tb[TCA_IPT_INDEX]); if (!tcf_hash_check(index, a, bind) ) { - ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); + ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -189,6 +189,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, * worry later - danger - this API seems to have changed * from earlier kernels */ + par.net = dev_net(skb->dev); par.in = skb->dev; par.out = NULL; par.hooknum = ipt->tcfi_hook; diff --git a/kernel/net/sched/act_mirred.c b/kernel/net/sched/act_mirred.c index 3f63ceac8..32fcdecdb 100644 --- a/kernel/net/sched/act_mirred.c +++ b/kernel/net/sched/act_mirred.c @@ -31,13 +31,19 @@ #define MIRRED_TAB_MASK 7 static LIST_HEAD(mirred_list); +static DEFINE_SPINLOCK(mirred_list_lock); static void tcf_mirred_release(struct tc_action *a, int bind) { struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1); + + /* We could be called either in a RCU callback or with RTNL lock held. */ + spin_lock_bh(&mirred_list_lock); list_del(&m->tcfm_list); - if (m->tcfm_dev) - dev_put(m->tcfm_dev); + spin_unlock_bh(&mirred_list_lock); + if (dev) + dev_put(dev); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -93,32 +99,37 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, a, bind)) { if (dev == NULL) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*m), + bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_hash_release(a, bind); + if (bind) + return 0; + + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } m = to_mirred(a); - spin_lock_bh(&m->tcf_lock); + ASSERT_RTNL(); m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; if (dev != NULL) { m->tcfm_ifindex = parm->ifindex; if (ret != ACT_P_CREATED) - dev_put(m->tcfm_dev); + dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); dev_hold(dev); - m->tcfm_dev = dev; + rcu_assign_pointer(m->tcfm_dev, dev); m->tcfm_ok_push = ok_push; } - spin_unlock_bh(&m->tcf_lock); + if (ret == ACT_P_CREATED) { + spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); + spin_unlock_bh(&mirred_list_lock); tcf_hash_insert(a); } @@ -131,28 +142,30 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_mirred *m = a->priv; struct net_device *dev; struct sk_buff *skb2; + int retval, err; u32 at; - int retval, err = 1; - spin_lock(&m->tcf_lock); - m->tcf_tm.lastuse = jiffies; - bstats_update(&m->tcf_bstats, skb); + tcf_lastuse_update(&m->tcf_tm); + + bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); - dev = m->tcfm_dev; - if (!dev) { - printk_once(KERN_NOTICE "tc mirred: target device is gone\n"); + rcu_read_lock(); + retval = READ_ONCE(m->tcf_action); + dev = rcu_dereference(m->tcfm_dev); + if (unlikely(!dev)) { + pr_notice_once("tc mirred: target device is gone\n"); goto out; } - if (!(dev->flags & IFF_UP)) { + if (unlikely(!(dev->flags & IFF_UP))) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); goto out; } at = G_TC_AT(skb->tc_verd); - skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action); - if (skb2 == NULL) + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) goto out; if (!(at & AT_EGRESS)) { @@ -166,18 +179,16 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; + skb_sender_cpu_clear(skb2); err = dev_queue_xmit(skb2); -out: if (err) { - m->tcf_qstats.overlimits++; +out: + qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); if (m->tcfm_eaction != TCA_EGRESS_MIRROR) retval = TC_ACT_SHOT; - else - retval = m->tcf_action; - } else - retval = m->tcf_action; - spin_unlock(&m->tcf_lock); + } + rcu_read_unlock(); return retval; } @@ -216,15 +227,20 @@ static int mirred_device_event(struct notifier_block *unused, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; - if (event == NETDEV_UNREGISTER) + ASSERT_RTNL(); + if (event == NETDEV_UNREGISTER) { + spin_lock_bh(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { - spin_lock_bh(&m->tcf_lock); - if (m->tcfm_dev == dev) { + if (rcu_access_pointer(m->tcfm_dev) == dev) { dev_put(dev); - m->tcfm_dev = NULL; + /* Note : no rcu grace period necessary, as + * net_device are already rcu protected. + */ + RCU_INIT_POINTER(m->tcfm_dev, NULL); } - spin_unlock_bh(&m->tcf_lock); } + spin_unlock_bh(&mirred_list_lock); + } return NOTIFY_DONE; } diff --git a/kernel/net/sched/act_nat.c b/kernel/net/sched/act_nat.c index 270a030d5..b7c4ead8b 100644 --- a/kernel/net/sched/act_nat.c +++ b/kernel/net/sched/act_nat.c @@ -55,7 +55,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_NAT_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -161,7 +162,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, goto drop; tcph = (void *)(skb_network_header(skb) + ihl); - inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, + true); break; } case IPPROTO_UDP: @@ -177,7 +179,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, udph = (void *)(skb_network_header(skb) + ihl); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&udph->check, skb, addr, - new_addr, 1); + new_addr, true); if (!udph->check) udph->check = CSUM_MANGLED_0; } @@ -230,7 +232,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, iph->saddr = new_addr; inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr, - 0); + false); break; } default: diff --git a/kernel/net/sched/act_pedit.c b/kernel/net/sched/act_pedit.c index 59649d588..e38a7701f 100644 --- a/kernel/net/sched/act_pedit.c +++ b/kernel/net/sched/act_pedit.c @@ -57,7 +57,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; p = to_pedit(a); @@ -68,13 +69,12 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; } else { - p = to_pedit(a); - tcf_hash_release(a, bind); if (bind) return 0; + tcf_hash_release(a, bind); if (!ovr) return -EEXIST; - + p = to_pedit(a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) @@ -108,7 +108,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = a->priv; - int i, munged = 0; + int i; unsigned int off; if (skb_unclone(skb, GFP_ATOMIC)) @@ -156,11 +156,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, *ptr = ((*ptr & tkey->mask) ^ tkey->val); if (ptr == &_data) skb_store_bits(skb, off + offset, ptr, 4); - munged++; } - if (munged) - skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); goto done; } else WARN(1, "pedit BUG: index %d\n", p->tcf_index); diff --git a/kernel/net/sched/act_simple.c b/kernel/net/sched/act_simple.c index 6a8d94886..d6b708d6a 100644 --- a/kernel/net/sched/act_simple.c +++ b/kernel/net/sched/act_simple.c @@ -103,7 +103,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, defdata = nla_data(tb[TCA_DEF_DATA]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), + bind, false); if (ret) return ret; diff --git a/kernel/net/sched/act_skbedit.c b/kernel/net/sched/act_skbedit.c index fcfeeaf83..6751b5f8c 100644 --- a/kernel/net/sched/act_skbedit.c +++ b/kernel/net/sched/act_skbedit.c @@ -99,7 +99,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), + bind, false); if (ret) return ret; diff --git a/kernel/net/sched/act_vlan.c b/kernel/net/sched/act_vlan.c index d735ecf0b..796785e0b 100644 --- a/kernel/net/sched/act_vlan.c +++ b/kernel/net/sched/act_vlan.c @@ -116,7 +116,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, action = parm->v_action; if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*v), + bind, false); if (ret) return ret; diff --git a/kernel/net/sched/cls_bpf.c b/kernel/net/sched/cls_bpf.c index c0b86f2bf..5faaa5425 100644 --- a/kernel/net/sched/cls_bpf.c +++ b/kernel/net/sched/cls_bpf.c @@ -38,6 +38,7 @@ struct cls_bpf_prog { struct bpf_prog *filter; struct list_head link; struct tcf_result res; + bool exts_integrated; struct tcf_exts exts; u32 handle; union { @@ -52,6 +53,7 @@ struct cls_bpf_prog { static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_FLAGS] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, @@ -59,11 +61,30 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; +static int cls_bpf_exec_opcode(int code) +{ + switch (code) { + case TC_ACT_OK: + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + case TC_ACT_REDIRECT: + case TC_ACT_UNSPEC: + return code; + default: + return TC_ACT_UNSPEC; + } +} + static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); struct cls_bpf_prog *prog; +#ifdef CONFIG_NET_CLS_ACT + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; +#else + bool at_ingress = false; +#endif int ret = -1; if (unlikely(!skb_mac_header_was_set(skb))) @@ -72,7 +93,28 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* Needed here for accessing maps. */ rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { - int filter_res = BPF_PROG_RUN(prog->filter, skb); + int filter_res; + + qdisc_skb_cb(skb)->tc_classid = prog->res.classid; + + if (at_ingress) { + /* It is safe to push/pull even if skb_shared() */ + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } + + if (prog->exts_integrated) { + res->class = prog->res.class; + res->classid = qdisc_skb_cb(skb)->tc_classid; + + ret = cls_bpf_exec_opcode(filter_res); + if (ret == TC_ACT_UNSPEC) + continue; + break; + } if (filter_res == 0) continue; @@ -181,8 +223,7 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) return ret; } -static int cls_bpf_prog_from_ops(struct nlattr **tb, - struct cls_bpf_prog *prog, u32 classid) +static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) { struct sock_filter *bpf_ops; struct sock_fprog_kern fprog_tmp; @@ -216,15 +257,13 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, prog->bpf_ops = bpf_ops; prog->bpf_num_ops = bpf_num_ops; prog->bpf_name = NULL; - prog->filter = fp; - prog->res.classid = classid; return 0; } -static int cls_bpf_prog_from_efd(struct nlattr **tb, - struct cls_bpf_prog *prog, u32 classid) +static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, + const struct tcf_proto *tp) { struct bpf_prog *fp; char *name = NULL; @@ -254,9 +293,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, prog->bpf_ops = NULL; prog->bpf_fd = bpf_fd; prog->bpf_name = name; - prog->filter = fp; - prog->res.classid = classid; + + if (fp->dst_needed) + netif_keep_dst(qdisc_dev(tp->q)); return 0; } @@ -266,16 +306,13 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr) { + bool is_bpf, is_ebpf, have_exts = false; struct tcf_exts exts; - bool is_bpf, is_ebpf; - u32 classid; int ret; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; is_ebpf = tb[TCA_BPF_FD]; - - if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || - !tb[TCA_BPF_CLASSID]) + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); @@ -283,18 +320,32 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, if (ret < 0) return ret; - classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + if (tb[TCA_BPF_FLAGS]) { + u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); + + if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { + tcf_exts_destroy(&exts); + return -EINVAL; + } + + have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; + } + + prog->exts_integrated = have_exts; - ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) : - cls_bpf_prog_from_efd(tb, prog, classid); + ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : + cls_bpf_prog_from_efd(tb, prog, tp); if (ret < 0) { tcf_exts_destroy(&exts); return ret; } - tcf_bind_filter(tp, &prog->res, base); - tcf_exts_change(tp, &prog->exts, &exts); + if (tb[TCA_BPF_CLASSID]) { + prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + tcf_bind_filter(tp, &prog->res, base); + } + tcf_exts_change(tp, &prog->exts, &exts); return 0; } @@ -415,6 +466,7 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; struct nlattr *nest; + u32 bpf_flags = 0; int ret; if (prog == NULL) @@ -426,7 +478,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) + if (prog->res.classid && + nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; if (cls_bpf_is_ebpf(prog)) @@ -439,6 +492,11 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; + if (prog->exts_integrated) + bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; + if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) + goto nla_put_failure; + nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &prog->exts) < 0) diff --git a/kernel/net/sched/cls_cgroup.c b/kernel/net/sched/cls_cgroup.c index ea611b216..4c85bd3a7 100644 --- a/kernel/net/sched/cls_cgroup.c +++ b/kernel/net/sched/cls_cgroup.c @@ -30,35 +30,16 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); - u32 classid; - - classid = task_cls_state(current)->classid; - - /* - * Due to the nature of the classifier it is required to ignore all - * packets originating from softirq context as accessing `current' - * would lead to false results. - * - * This test assumes that all callers of dev_queue_xmit() explicitely - * disable bh. Knowing this, it is possible to detect softirq based - * calls by looking at the number of nested bh disable calls because - * softirqs always disables bh. - */ - if (in_serving_softirq()) { - /* If there is an sk_classid we'll use that. */ - if (!skb->sk) - return -1; - classid = skb->sk->sk_classid; - } + u32 classid = task_get_classid(skb); if (!classid) return -1; - if (!tcf_em_tree_match(skb, &head->ematches, NULL)) return -1; res->classid = classid; res->class = 0; + return tcf_exts_exec(skb, &head->exts, res); } diff --git a/kernel/net/sched/cls_flow.c b/kernel/net/sched/cls_flow.c index 75df923f5..fbfec6a18 100644 --- a/kernel/net/sched/cls_flow.c +++ b/kernel/net/sched/cls_flow.c @@ -22,11 +22,12 @@ #include <linux/if_vlan.h> #include <linux/slab.h> #include <linux/module.h> +#include <net/inet_sock.h> #include <net/pkt_cls.h> #include <net/ip.h> #include <net/route.h> -#include <net/flow_keys.h> +#include <net/flow_dissector.h> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netfilter/nf_conntrack.h> @@ -68,35 +69,41 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->src) - return ntohl(flow->src); + __be32 src = flow_get_u32_src(flow); + + if (src) + return ntohl(src); + return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->dst) - return ntohl(flow->dst); + __be32 dst = flow_get_u32_dst(flow); + + if (dst) + return ntohl(dst); + return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { - return flow->ip_proto; + return flow->basic.ip_proto; } static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[0]); + if (flow->ports.ports) + return ntohs(flow->ports.src); return addr_fold(skb->sk); } static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[1]); + if (flow->ports.ports) + return ntohs(flow->ports.dst); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } @@ -191,8 +198,11 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb) static u32 flow_get_skuid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { - kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid; + struct sock *sk = skb_to_full_sk(skb); + + if (sk && sk->sk_socket && sk->sk_socket->file) { + kuid_t skuid = sk->sk_socket->file->f_cred->fsuid; + return from_kuid(&init_user_ns, skuid); } return 0; @@ -200,8 +210,11 @@ static u32 flow_get_skuid(const struct sk_buff *skb) static u32 flow_get_skgid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { - kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid; + struct sock *sk = skb_to_full_sk(skb); + + if (sk && sk->sk_socket && sk->sk_socket->file) { + kgid_t skgid = sk->sk_socket->file->f_cred->fsgid; + return from_kgid(&init_user_ns, skgid); } return 0; @@ -295,7 +308,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) - skb_flow_dissect(skb, &flow_keys); + skb_flow_dissect_flow_keys(skb, &flow_keys, 0); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; diff --git a/kernel/net/sched/cls_flower.c b/kernel/net/sched/cls_flower.c new file mode 100644 index 000000000..95b021243 --- /dev/null +++ b/kernel/net/sched/cls_flower.c @@ -0,0 +1,697 @@ +/* + * net/sched/cls_flower.c Flower classifier + * + * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/rhashtable.h> + +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/ip.h> + +#include <net/sch_generic.h> +#include <net/pkt_cls.h> +#include <net/ip.h> +#include <net/flow_dissector.h> + +struct fl_flow_key { + int indev_ifindex; + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; + struct flow_dissector_key_eth_addrs eth; + struct flow_dissector_key_addrs ipaddrs; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_ports tp; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct fl_flow_mask_range { + unsigned short int start; + unsigned short int end; +}; + +struct fl_flow_mask { + struct fl_flow_key key; + struct fl_flow_mask_range range; + struct rcu_head rcu; +}; + +struct cls_fl_head { + struct rhashtable ht; + struct fl_flow_mask mask; + struct flow_dissector dissector; + u32 hgen; + bool mask_assigned; + struct list_head filters; + struct rhashtable_params ht_params; + struct rcu_head rcu; +}; + +struct cls_fl_filter { + struct rhash_head ht_node; + struct fl_flow_key mkey; + struct tcf_exts exts; + struct tcf_result res; + struct fl_flow_key key; + struct list_head list; + u32 handle; + struct rcu_head rcu; +}; + +static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) +{ + return mask->range.end - mask->range.start; +} + +static void fl_mask_update_range(struct fl_flow_mask *mask) +{ + const u8 *bytes = (const u8 *) &mask->key; + size_t size = sizeof(mask->key); + size_t i, first = 0, last = size - 1; + + for (i = 0; i < sizeof(mask->key); i++) { + if (bytes[i]) { + if (!first && i) + first = i; + last = i; + } + } + mask->range.start = rounddown(first, sizeof(long)); + mask->range.end = roundup(last + 1, sizeof(long)); +} + +static void *fl_key_get_start(struct fl_flow_key *key, + const struct fl_flow_mask *mask) +{ + return (u8 *) key + mask->range.start; +} + +static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key, + struct fl_flow_mask *mask) +{ + const long *lkey = fl_key_get_start(key, mask); + const long *lmask = fl_key_get_start(&mask->key, mask); + long *lmkey = fl_key_get_start(mkey, mask); + int i; + + for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) + *lmkey++ = *lkey++ & *lmask++; +} + +static void fl_clear_masked_range(struct fl_flow_key *key, + struct fl_flow_mask *mask) +{ + memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); +} + +static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_fl_head *head = rcu_dereference_bh(tp->root); + struct cls_fl_filter *f; + struct fl_flow_key skb_key; + struct fl_flow_key skb_mkey; + + fl_clear_masked_range(&skb_key, &head->mask); + skb_key.indev_ifindex = skb->skb_iif; + /* skb_flow_dissect() does not set n_proto in case an unknown protocol, + * so do it rather here. + */ + skb_key.basic.n_proto = skb->protocol; + skb_flow_dissect(skb, &head->dissector, &skb_key, 0); + + fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); + + f = rhashtable_lookup_fast(&head->ht, + fl_key_get_start(&skb_mkey, &head->mask), + head->ht_params); + if (f) { + *res = f->res; + return tcf_exts_exec(skb, &f->exts, res); + } + return -1; +} + +static int fl_init(struct tcf_proto *tp) +{ + struct cls_fl_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + + INIT_LIST_HEAD_RCU(&head->filters); + rcu_assign_pointer(tp->root, head); + + return 0; +} + +static void fl_destroy_filter(struct rcu_head *head) +{ + struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); + + tcf_exts_destroy(&f->exts); + kfree(f); +} + +static bool fl_destroy(struct tcf_proto *tp, bool force) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f, *next; + + if (!force && !list_empty(&head->filters)) + return false; + + list_for_each_entry_safe(f, next, &head->filters, list) { + list_del_rcu(&f->list); + call_rcu(&f->rcu, fl_destroy_filter); + } + RCU_INIT_POINTER(tp->root, NULL); + if (head->mask_assigned) + rhashtable_destroy(&head->ht); + kfree_rcu(head, rcu); + return true; +} + +static unsigned long fl_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry(f, &head->filters, list) + if (f->handle == handle) + return (unsigned long) f; + return 0; +} + +static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { + [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC }, + [TCA_FLOWER_CLASSID] = { .type = NLA_U32 }, + [TCA_FLOWER_INDEV] = { .type = NLA_STRING, + .len = IFNAMSIZ }, + [TCA_FLOWER_KEY_ETH_DST] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_DST_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_IP_PROTO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_SRC_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_TCP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 }, +}; + +static void fl_set_key_val(struct nlattr **tb, + void *val, int val_type, + void *mask, int mask_type, int len) +{ + if (!tb[val_type]) + return; + memcpy(val, nla_data(tb[val_type]), len); + if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type]) + memset(mask, 0xff, len); + else + memcpy(mask, nla_data(tb[mask_type]), len); +} + +static int fl_set_key(struct net *net, struct nlattr **tb, + struct fl_flow_key *key, struct fl_flow_key *mask) +{ +#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FLOWER_INDEV]) { + int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); + if (err < 0) + return err; + key->indev_ifindex = err; + mask->indev_ifindex = 0xffffffff; + } +#endif + + fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)); + fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)); + + fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + + if (key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) { + fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto)); + } + + if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)); + fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)); + } else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) { + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)); + fl_set_key_val(tb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)); + } + + if (key->basic.ip_proto == IPPROTO_TCP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } else if (key->basic.ip_proto == IPPROTO_UDP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } + + return 0; +} + +static bool fl_mask_eq(struct fl_flow_mask *mask1, + struct fl_flow_mask *mask2) +{ + const long *lmask1 = fl_key_get_start(&mask1->key, mask1); + const long *lmask2 = fl_key_get_start(&mask2->key, mask2); + + return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) && + !memcmp(lmask1, lmask2, fl_mask_range(mask1)); +} + +static const struct rhashtable_params fl_ht_params = { + .key_offset = offsetof(struct cls_fl_filter, mkey), /* base offset */ + .head_offset = offsetof(struct cls_fl_filter, ht_node), + .automatic_shrinking = true, +}; + +static int fl_init_hashtable(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + head->ht_params = fl_ht_params; + head->ht_params.key_len = fl_mask_range(mask); + head->ht_params.key_offset += mask->range.start; + + return rhashtable_init(&head->ht, &head->ht_params); +} + +#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) +#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) +#define FL_KEY_MEMBER_END_OFFSET(member) \ + (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member)) + +#define FL_KEY_IN_RANGE(mask, member) \ + (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \ + FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start) + +#define FL_KEY_SET(keys, cnt, id, member) \ + do { \ + keys[cnt].key_id = id; \ + keys[cnt].offset = FL_KEY_MEMBER_OFFSET(member); \ + cnt++; \ + } while(0); + +#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \ + do { \ + if (FL_KEY_IN_RANGE(mask, member)) \ + FL_KEY_SET(keys, cnt, id, member); \ + } while(0); + +static void fl_init_dissector(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; + size_t cnt = 0; + + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); + + skb_flow_dissector_init(&head->dissector, keys, cnt); +} + +static int fl_check_assign_mask(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + int err; + + if (head->mask_assigned) { + if (!fl_mask_eq(&head->mask, mask)) + return -EINVAL; + else + return 0; + } + + /* Mask is not assigned yet. So assign it and init hashtable + * according to that. + */ + err = fl_init_hashtable(head, mask); + if (err) + return err; + memcpy(&head->mask, mask, sizeof(head->mask)); + head->mask_assigned = true; + + fl_init_dissector(head, mask); + + return 0; +} + +static int fl_set_parms(struct net *net, struct tcf_proto *tp, + struct cls_fl_filter *f, struct fl_flow_mask *mask, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts e; + int err; + + tcf_exts_init(&e, TCA_FLOWER_ACT, 0); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + return err; + + if (tb[TCA_FLOWER_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + + err = fl_set_key(net, tb, &f->key, &mask->key); + if (err) + goto errout; + + fl_mask_update_range(mask); + fl_set_masked_key(&f->mkey, &f->key, mask); + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(&e); + return err; +} + +static u32 fl_grab_new_handle(struct tcf_proto *tp, + struct cls_fl_head *head) +{ + unsigned int i = 0x80000000; + u32 handle; + + do { + if (++head->hgen == 0x7FFFFFFF) + head->hgen = 1; + } while (--i > 0 && fl_get(tp, head->hgen)); + + if (unlikely(i == 0)) { + pr_err("Insufficient number of handles\n"); + handle = 0; + } else { + handle = head->hgen; + } + + return handle; +} + +static int fl_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg; + struct cls_fl_filter *fnew; + struct nlattr *tb[TCA_FLOWER_MAX + 1]; + struct fl_flow_mask mask = {}; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy); + if (err < 0) + return err; + + if (fold && handle && fold->handle != handle) + return -EINVAL; + + fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + if (!fnew) + return -ENOBUFS; + + tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + + if (!handle) { + handle = fl_grab_new_handle(tp, head); + if (!handle) { + err = -EINVAL; + goto errout; + } + } + fnew->handle = handle; + + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); + if (err) + goto errout; + + err = fl_check_assign_mask(head, &mask); + if (err) + goto errout; + + err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, + head->ht_params); + if (err) + goto errout; + if (fold) + rhashtable_remove_fast(&head->ht, &fold->ht_node, + head->ht_params); + + *arg = (unsigned long) fnew; + + if (fold) { + list_replace_rcu(&fold->list, &fnew->list); + tcf_unbind_filter(tp, &fold->res); + call_rcu(&fold->rcu, fl_destroy_filter); + } else { + list_add_tail_rcu(&fnew->list, &head->filters); + } + + return 0; + +errout: + kfree(fnew); + return err; +} + +static int fl_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f = (struct cls_fl_filter *) arg; + + rhashtable_remove_fast(&head->ht, &f->ht_node, + head->ht_params); + list_del_rcu(&f->list); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, fl_destroy_filter); + return 0; +} + +static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry_rcu(f, &head->filters, list) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static int fl_dump_key_val(struct sk_buff *skb, + void *val, int val_type, + void *mask, int mask_type, int len) +{ + int err; + + if (!memchr_inv(mask, 0, len)) + return 0; + err = nla_put(skb, val_type, len, val); + if (err) + return err; + if (mask_type != TCA_FLOWER_UNSPEC) { + err = nla_put(skb, mask_type, len, mask); + if (err) + return err; + } + return 0; +} + +static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f = (struct cls_fl_filter *) fh; + struct nlattr *nest; + struct fl_flow_key *key, *mask; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) + goto nla_put_failure; + + key = &f->key; + mask = &head->mask.key; + + if (mask->indev_ifindex) { + struct net_device *dev; + + dev = __dev_get_by_index(net, key->indev_ifindex); + if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name)) + goto nla_put_failure; + } + + if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)) || + fl_dump_key_val(skb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)) || + fl_dump_key_val(skb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto))) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) && + fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto))) + goto nla_put_failure; + + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && + (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)) || + fl_dump_key_val(skb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)))) + goto nla_put_failure; + else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && + (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)) || + fl_dump_key_val(skb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)))) + goto nla_put_failure; + + if (key->basic.ip_proto == IPPROTO_TCP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + else if (key->basic.ip_proto == IPPROTO_UDP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct tcf_proto_ops cls_fl_ops __read_mostly = { + .kind = "flower", + .classify = fl_classify, + .init = fl_init, + .destroy = fl_destroy, + .get = fl_get, + .change = fl_change, + .delete = fl_delete, + .walk = fl_walk, + .dump = fl_dump, + .owner = THIS_MODULE, +}; + +static int __init cls_fl_init(void) +{ + return register_tcf_proto_ops(&cls_fl_ops); +} + +static void __exit cls_fl_exit(void) +{ + unregister_tcf_proto_ops(&cls_fl_ops); +} + +module_init(cls_fl_init); +module_exit(cls_fl_exit); + +MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>"); +MODULE_DESCRIPTION("Flower classifier"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/net/sched/cls_rsvp.h b/kernel/net/sched/cls_rsvp.h index 02fa82792..f9c9fc075 100644 --- a/kernel/net/sched/cls_rsvp.h +++ b/kernel/net/sched/cls_rsvp.h @@ -283,12 +283,22 @@ static int rsvp_init(struct tcf_proto *tp) return -ENOBUFS; } -static void -rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) +static void rsvp_delete_filter_rcu(struct rcu_head *head) { - tcf_unbind_filter(tp, &f->res); + struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu); + tcf_exts_destroy(&f->exts); - kfree_rcu(f, rcu); + kfree(f); +} + +static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + /* all classifiers are required to call tcf_exts_destroy() after rcu + * grace period, since converted-to-rcu actions are relying on that + * in cleanup() callback + */ + call_rcu(&f->rcu, rsvp_delete_filter_rcu); } static bool rsvp_destroy(struct tcf_proto *tp, bool force) diff --git a/kernel/net/sched/cls_tcindex.c b/kernel/net/sched/cls_tcindex.c index a557dbaf5..944c8ff45 100644 --- a/kernel/net/sched/cls_tcindex.c +++ b/kernel/net/sched/cls_tcindex.c @@ -27,6 +27,7 @@ struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; + struct rcu_head rcu; }; struct tcindex_filter { @@ -133,8 +134,23 @@ static int tcindex_init(struct tcf_proto *tp) return 0; } -static int -tcindex_delete(struct tcf_proto *tp, unsigned long arg) +static void tcindex_destroy_rexts(struct rcu_head *head) +{ + struct tcindex_filter_result *r; + + r = container_of(head, struct tcindex_filter_result, rcu); + tcf_exts_destroy(&r->exts); +} + +static void tcindex_destroy_fexts(struct rcu_head *head) +{ + struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu); + + tcf_exts_destroy(&f->result.exts); + kfree(f); +} + +static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; @@ -162,9 +178,14 @@ found: rcu_assign_pointer(*walk, rtnl_dereference(f->next)); } tcf_unbind_filter(tp, &r->res); - tcf_exts_destroy(&r->exts); + /* all classifiers are required to call tcf_exts_destroy() after rcu + * grace period, since converted-to-rcu actions are relying on that + * in cleanup() callback + */ if (f) - kfree_rcu(f, rcu); + call_rcu(&f->rcu, tcindex_destroy_fexts); + else + call_rcu(&r->rcu, tcindex_destroy_rexts); return 0; } diff --git a/kernel/net/sched/em_ipset.c b/kernel/net/sched/em_ipset.c index a3d79c8bf..c66ca9400 100644 --- a/kernel/net/sched/em_ipset.c +++ b/kernel/net/sched/em_ipset.c @@ -92,9 +92,10 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, rcu_read_lock(); - if (dev && skb->skb_iif) - indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + if (skb->skb_iif) + indev = dev_get_by_index_rcu(em->net, skb->skb_iif); + acpar.net = em->net; acpar.in = indev ? indev : dev; acpar.out = dev; diff --git a/kernel/net/sched/em_meta.c b/kernel/net/sched/em_meta.c index b5294ce20..f2aabc008 100644 --- a/kernel/net/sched/em_meta.c +++ b/kernel/net/sched/em_meta.c @@ -343,119 +343,145 @@ META_COLLECTOR(int_sk_refcnt) META_COLLECTOR(int_sk_rcvbuf) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_rcvbuf; + dst->value = sk->sk_rcvbuf; } META_COLLECTOR(int_sk_shutdown) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_shutdown; + dst->value = sk->sk_shutdown; } META_COLLECTOR(int_sk_proto) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_protocol; + dst->value = sk->sk_protocol; } META_COLLECTOR(int_sk_type) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_type; + dst->value = sk->sk_type; } META_COLLECTOR(int_sk_rmem_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = sk_rmem_alloc_get(skb->sk); + dst->value = sk_rmem_alloc_get(sk); } META_COLLECTOR(int_sk_wmem_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = sk_wmem_alloc_get(skb->sk); + dst->value = sk_wmem_alloc_get(sk); } META_COLLECTOR(int_sk_omem_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = atomic_read(&skb->sk->sk_omem_alloc); + dst->value = atomic_read(&sk->sk_omem_alloc); } META_COLLECTOR(int_sk_rcv_qlen) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_receive_queue.qlen; + dst->value = sk->sk_receive_queue.qlen; } META_COLLECTOR(int_sk_snd_qlen) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_write_queue.qlen; + dst->value = sk->sk_write_queue.qlen; } META_COLLECTOR(int_sk_wmem_queued) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_wmem_queued; + dst->value = sk->sk_wmem_queued; } META_COLLECTOR(int_sk_fwd_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_forward_alloc; + dst->value = sk->sk_forward_alloc; } META_COLLECTOR(int_sk_sndbuf) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_sndbuf; + dst->value = sk->sk_sndbuf; } META_COLLECTOR(int_sk_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = (__force int) skb->sk->sk_allocation; + dst->value = (__force int) sk->sk_allocation; } META_COLLECTOR(int_sk_hash) @@ -469,92 +495,112 @@ META_COLLECTOR(int_sk_hash) META_COLLECTOR(int_sk_lingertime) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_lingertime / HZ; + dst->value = sk->sk_lingertime / HZ; } META_COLLECTOR(int_sk_err_qlen) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_error_queue.qlen; + dst->value = sk->sk_error_queue.qlen; } META_COLLECTOR(int_sk_ack_bl) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_ack_backlog; + dst->value = sk->sk_ack_backlog; } META_COLLECTOR(int_sk_max_ack_bl) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_max_ack_backlog; + dst->value = sk->sk_max_ack_backlog; } META_COLLECTOR(int_sk_prio) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_priority; + dst->value = sk->sk_priority; } META_COLLECTOR(int_sk_rcvlowat) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_rcvlowat; + dst->value = sk->sk_rcvlowat; } META_COLLECTOR(int_sk_rcvtimeo) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_rcvtimeo / HZ; + dst->value = sk->sk_rcvtimeo / HZ; } META_COLLECTOR(int_sk_sndtimeo) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_sndtimeo / HZ; + dst->value = sk->sk_sndtimeo / HZ; } META_COLLECTOR(int_sk_sendmsg_off) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_frag.offset; + dst->value = sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_write_pending; + dst->value = sk->sk_write_pending; } /************************************************************************** diff --git a/kernel/net/sched/sch_api.c b/kernel/net/sched/sch_api.c index 1e1c89e51..af1acf009 100644 --- a/kernel/net/sched/sch_api.c +++ b/kernel/net/sched/sch_api.c @@ -253,7 +253,8 @@ int qdisc_set_default(const char *name) } /* We know handle. Find qdisc among all qdisc's attached to device - (root qdisc, all its children, children of children etc.) + * (root qdisc, all its children, children of children etc.) + * Note: caller either uses rtnl or rcu_read_lock() */ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) @@ -264,7 +265,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) root->handle == handle) return root; - list_for_each_entry(q, &root->list, list) { + list_for_each_entry_rcu(q, &root->list, list) { if (q->handle == handle) return q; } @@ -277,15 +278,18 @@ void qdisc_list_add(struct Qdisc *q) struct Qdisc *root = qdisc_dev(q)->qdisc; WARN_ON_ONCE(root == &noop_qdisc); - list_add_tail(&q->list, &root->list); + ASSERT_RTNL(); + list_add_tail_rcu(&q->list, &root->list); } } EXPORT_SYMBOL(qdisc_list_add); void qdisc_list_del(struct Qdisc *q) { - if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) - list_del(&q->list); + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + ASSERT_RTNL(); + list_del_rcu(&q->list); + } } EXPORT_SYMBOL(qdisc_list_del); @@ -750,14 +754,18 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) if (n == 0) return; drops = max_t(int, n, 0); + rcu_read_lock(); while ((parentid = sch->parent)) { if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) - return; + break; + if (sch->flags & TCQ_F_NOPARENT) + break; + /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { - WARN_ON(parentid != TC_H_ROOT); - return; + WARN_ON_ONCE(parentid != TC_H_ROOT); + break; } cops = sch->ops->cl_ops; if (cops->qlen_notify) { @@ -768,6 +776,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) sch->q.qlen -= n; __qdisc_qstats_drop(sch, drops); } + rcu_read_unlock(); } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); @@ -1806,57 +1815,46 @@ done: * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. */ -int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) { __be16 protocol = tc_skb_protocol(skb); - int err; +#ifdef CONFIG_NET_CLS_ACT + const struct tcf_proto *old_tp = tp; + int limit = 0; +reclassify: +#endif for (; tp; tp = rcu_dereference_bh(tp->next)) { + int err; + if (tp->protocol != protocol && tp->protocol != htons(ETH_P_ALL)) continue; - err = tp->classify(skb, tp, res); - if (err >= 0) { + err = tp->classify(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT - if (err != TC_ACT_RECLASSIFY && skb->tc_verd) - skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); + if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) + goto reset; #endif + if (err >= 0) return err; - } } - return -1; -} -EXPORT_SYMBOL(tc_classify_compat); - -int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - int err = 0; -#ifdef CONFIG_NET_CLS_ACT - const struct tcf_proto *otp = tp; -reclassify: -#endif - err = tc_classify_compat(skb, tp, res); + return -1; #ifdef CONFIG_NET_CLS_ACT - if (err == TC_ACT_RECLASSIFY) { - u32 verd = G_TC_VERD(skb->tc_verd); - tp = otp; - - if (verd++ >= MAX_REC_LOOP) { - net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", - tp->q->ops->id, - tp->prio & 0xffff, - ntohs(tp->protocol)); - return TC_ACT_SHOT; - } - skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); - goto reclassify; +reset: + if (unlikely(limit++ >= MAX_REC_LOOP)) { + net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", + tp->q->ops->id, tp->prio & 0xffff, + ntohs(tp->protocol)); + return TC_ACT_SHOT; } + + tp = old_tp; + protocol = tc_skb_protocol(skb); + goto reclassify; #endif - return err; } EXPORT_SYMBOL(tc_classify); @@ -1885,13 +1883,10 @@ EXPORT_SYMBOL(tcf_destroy_chain); #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { - struct timespec ts; - - hrtimer_get_res(CLOCK_MONOTONIC, &ts); seq_printf(seq, "%08x %08x %08x %08x\n", (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), 1000000, - (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts))); + (u32)NSEC_PER_SEC / hrtimer_resolution); return 0; } @@ -1956,6 +1951,7 @@ static int __init pktsched_init(void) register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); register_qdisc(&mq_qdisc_ops); + register_qdisc(&noqueue_qdisc_ops); rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); diff --git a/kernel/net/sched/sch_atm.c b/kernel/net/sched/sch_atm.c index e3e2cc5fd..1911af3ca 100644 --- a/kernel/net/sched/sch_atm.c +++ b/kernel/net/sched/sch_atm.c @@ -375,7 +375,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) list_for_each_entry(flow, &p->flows, list) { fl = rcu_dereference_bh(flow->filter_list); if (fl) { - result = tc_classify_compat(skb, fl, &res); + result = tc_classify(skb, fl, &res, true); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; diff --git a/kernel/net/sched/sch_blackhole.c b/kernel/net/sched/sch_blackhole.c index 094a874b4..3fee70d98 100644 --- a/kernel/net/sched/sch_blackhole.c +++ b/kernel/net/sched/sch_blackhole.c @@ -11,7 +11,7 @@ * Note: Quantum tunneling is not supported. */ -#include <linux/module.h> +#include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> @@ -37,17 +37,8 @@ static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; -static int __init blackhole_module_init(void) +static int __init blackhole_init(void) { return register_qdisc(&blackhole_qdisc_ops); } - -static void __exit blackhole_module_exit(void) -{ - unregister_qdisc(&blackhole_qdisc_ops); -} - -module_init(blackhole_module_init) -module_exit(blackhole_module_exit) - -MODULE_LICENSE("GPL"); +device_initcall(blackhole_init) diff --git a/kernel/net/sched/sch_cbq.c b/kernel/net/sched/sch_cbq.c index beeb75f80..c538d9e4a 100644 --- a/kernel/net/sched/sch_cbq.c +++ b/kernel/net/sched/sch_cbq.c @@ -240,7 +240,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. */ - result = tc_classify_compat(skb, fl, &res); + result = tc_classify(skb, fl, &res, true); if (!fl || result < 0) goto fallback; diff --git a/kernel/net/sched/sch_choke.c b/kernel/net/sched/sch_choke.c index c009eb904..5ffb8b833 100644 --- a/kernel/net/sched/sch_choke.c +++ b/kernel/net/sched/sch_choke.c @@ -18,7 +18,7 @@ #include <net/pkt_sched.h> #include <net/inet_ecn.h> #include <net/red.h> -#include <net/flow_keys.h> +#include <net/flow_dissector.h> /* CHOKe stateless AQM for fair bandwidth allocation @@ -133,16 +133,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) --sch->q.qlen; } -/* private part of skb->cb[] that a qdisc is allowed to use - * is limited to QDISC_CB_PRIV_LEN bytes. - * As a flow key might be too large, we store a part of it only. - */ -#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3) - struct choke_skb_cb { u16 classid; u8 keys_valid; - u8 keys[QDISC_CB_PRIV_LEN - 3]; + struct flow_keys_digest keys; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) @@ -176,19 +170,19 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect(skb1, &temp); - memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN); + skb_flow_dissect_flow_keys(skb1, &temp, 0); + make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect(skb2, &temp); - memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN); + skb_flow_dissect_flow_keys(skb2, &temp, 0); + make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } return !memcmp(&choke_skb_cb(skb1)->keys, &choke_skb_cb(skb2)->keys, - CHOKE_K_LEN); + sizeof(choke_skb_cb(skb1)->keys)); } /* @@ -207,7 +201,7 @@ static bool choke_classify(struct sk_buff *skb, int result; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -391,6 +385,19 @@ static void choke_reset(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); + while (q->head != q->tail) { + struct sk_buff *skb = q->tab[q->head]; + + q->head = (q->head + 1) & q->tab_mask; + if (!skb) + continue; + qdisc_qstats_backlog_dec(sch, skb); + --sch->q.qlen; + qdisc_drop(skb, sch); + } + + memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); + q->head = q->tail = 0; red_restart(&q->vars); } @@ -546,65 +553,6 @@ static void choke_destroy(struct Qdisc *sch) choke_free(q->tab); } -static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg) -{ - return NULL; -} - -static unsigned long choke_get(struct Qdisc *sch, u32 classid) -{ - return 0; -} - -static void choke_put(struct Qdisc *q, unsigned long cl) -{ -} - -static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, - u32 classid) -{ - return 0; -} - -static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch, - unsigned long cl) -{ - struct choke_sched_data *q = qdisc_priv(sch); - - if (cl) - return NULL; - return &q->filter_list; -} - -static int choke_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) -{ - tcm->tcm_handle |= TC_H_MIN(cl); - return 0; -} - -static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg) -{ - if (!arg->stop) { - if (arg->fn(sch, 1, arg) < 0) { - arg->stop = 1; - return; - } - arg->count++; - } -} - -static const struct Qdisc_class_ops choke_class_ops = { - .leaf = choke_leaf, - .get = choke_get, - .put = choke_put, - .tcf_chain = choke_find_tcf, - .bind_tcf = choke_bind, - .unbind_tcf = choke_put, - .dump = choke_dump_class, - .walk = choke_walk, -}; - static struct sk_buff *choke_peek_head(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); diff --git a/kernel/net/sched/sch_codel.c b/kernel/net/sched/sch_codel.c index 7a0bdb16a..535007d5f 100644 --- a/kernel/net/sched/sch_codel.c +++ b/kernel/net/sched/sch_codel.c @@ -6,7 +6,7 @@ * * Implemented on linux by : * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> - * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -109,6 +109,7 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, [TCA_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 }, }; static int codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -133,6 +134,12 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_CODEL_CE_THRESHOLD]) { + u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]); + + q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_CODEL_INTERVAL]) { u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); @@ -201,7 +208,10 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_CODEL_ECN, q->params.ecn)) goto nla_put_failure; - + if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD, + codel_time_to_us(q->params.ce_threshold))) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: @@ -220,6 +230,7 @@ static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .ldelay = codel_time_to_us(q->vars.ldelay), .dropping = q->vars.dropping, .ecn_mark = q->stats.ecn_mark, + .ce_mark = q->stats.ce_mark, }; if (q->vars.dropping) { diff --git a/kernel/net/sched/sch_drr.c b/kernel/net/sched/sch_drr.c index 338706092..f26bdea87 100644 --- a/kernel/net/sched/sch_drr.c +++ b/kernel/net/sched/sch_drr.c @@ -331,7 +331,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/kernel/net/sched/sch_dsmark.c b/kernel/net/sched/sch_dsmark.c index 66700a611..f357f34d0 100644 --- a/kernel/net/sched/sch_dsmark.c +++ b/kernel/net/sched/sch_dsmark.c @@ -35,14 +35,20 @@ #define NO_DEFAULT_INDEX (1 << 16) +struct mask_value { + u8 mask; + u8 value; +}; + struct dsmark_qdisc_data { struct Qdisc *q; struct tcf_proto __rcu *filter_list; - u8 *mask; /* "owns" the array */ - u8 *value; + struct mask_value *mv; u16 indices; + u8 set_tc_index; u32 default_index; /* index range is 0...0xffff */ - int set_tc_index; +#define DSMARK_EMBEDDED_SZ 16 + struct mask_value embedded[DSMARK_EMBEDDED_SZ]; }; static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) @@ -116,7 +122,6 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_DSMARK_MAX + 1]; int err = -EINVAL; - u8 mask = 0; pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", __func__, sch, p, classid, parent, *arg); @@ -133,14 +138,11 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, if (err < 0) goto errout; - if (tb[TCA_DSMARK_MASK]) - mask = nla_get_u8(tb[TCA_DSMARK_MASK]); - if (tb[TCA_DSMARK_VALUE]) - p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); + p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]); if (tb[TCA_DSMARK_MASK]) - p->mask[*arg - 1] = mask; + p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]); err = 0; @@ -155,8 +157,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg) if (!dsmark_valid_index(p, arg)) return -EINVAL; - p->mask[arg - 1] = 0xff; - p->value[arg - 1] = 0; + p->mv[arg - 1].mask = 0xff; + p->mv[arg - 1].value = 0; return 0; } @@ -173,7 +175,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) return; for (i = 0; i < p->indices; i++) { - if (p->mask[i] == 0xff && !p->value[i]) + if (p->mv[i].mask == 0xff && !p->mv[i].value) goto ignore; if (walker->count >= walker->skip) { if (walker->fn(sch, i + 1, walker) < 0) { @@ -230,7 +232,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) else { struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tc_classify(skb, fl, &res); + int result = tc_classify(skb, fl, &res, false); pr_debug("result %d class 0x%04x\n", result, res.classid); @@ -291,12 +293,12 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): - ipv4_change_dsfield(ip_hdr(skb), p->mask[index], - p->value[index]); + ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, + p->mv[index].value); break; case htons(ETH_P_IPV6): - ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], - p->value[index]); + ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask, + p->mv[index].value); break; default: /* @@ -304,7 +306,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) * This way, we can send non-IP traffic through dsmark * and don't need yet another qdisc as a bypass. */ - if (p->mask[index] != 0xff || p->value[index]) + if (p->mv[index].mask != 0xff || p->mv[index].value) pr_warn("%s: unsupported protocol %d\n", __func__, ntohs(tc_skb_protocol(skb))); break; @@ -346,7 +348,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) int err = -EINVAL; u32 default_index = NO_DEFAULT_INDEX; u16 indices; - u8 *mask; + int i; pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); @@ -366,18 +368,18 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_DSMARK_DEFAULT_INDEX]) default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); - mask = kmalloc(indices * 2, GFP_KERNEL); - if (mask == NULL) { + if (indices <= DSMARK_EMBEDDED_SZ) + p->mv = p->embedded; + else + p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL); + if (!p->mv) { err = -ENOMEM; goto errout; } - - p->mask = mask; - memset(p->mask, 0xff, indices); - - p->value = p->mask + indices; - memset(p->value, 0, indices); - + for (i = 0; i < indices; i++) { + p->mv[i].mask = 0xff; + p->mv[i].value = 0; + } p->indices = indices; p->default_index = default_index; p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); @@ -410,7 +412,8 @@ static void dsmark_destroy(struct Qdisc *sch) tcf_destroy_chain(&p->filter_list); qdisc_destroy(p->q); - kfree(p->mask); + if (p->mv != p->embedded) + kfree(p->mv); } static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, @@ -430,8 +433,8 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) || - nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1])) + if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) || + nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value)) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/kernel/net/sched/sch_fifo.c b/kernel/net/sched/sch_fifo.c index 2e2398cfc..2177eac0a 100644 --- a/kernel/net/sched/sch_fifo.c +++ b/kernel/net/sched/sch_fifo.c @@ -54,7 +54,7 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt) bool is_bfifo = sch->ops == &bfifo_qdisc_ops; if (opt == NULL) { - u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1; + u32 limit = qdisc_dev(sch)->tx_queue_len; if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); diff --git a/kernel/net/sched/sch_fq.c b/kernel/net/sched/sch_fq.c index f377702d4..109b23227 100644 --- a/kernel/net/sched/sch_fq.c +++ b/kernel/net/sched/sch_fq.c @@ -224,13 +224,16 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) return &q->internal; - /* SYNACK messages are attached to a listener socket. - * 1) They are not part of a 'flow' yet - * 2) We do not want to rate limit them (eg SYNFLOOD attack), + /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket + * or a listener (SYNCOOKIE mode) + * 1) request sockets are not full blown, + * they do not contain sk_pacing_rate + * 2) They are not part of a 'flow' yet + * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE - * 3) We pretend they are orphaned + * 4) We pretend they are orphaned */ - if (!sk || sk->sk_state == TCP_LISTEN) { + if (!sk || sk_listener(sk)) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not diff --git a/kernel/net/sched/sch_fq_codel.c b/kernel/net/sched/sch_fq_codel.c index 9291598b5..4c834e93d 100644 --- a/kernel/net/sched/sch_fq_codel.c +++ b/kernel/net/sched/sch_fq_codel.c @@ -6,7 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> */ #include <linux/module.h> @@ -23,7 +23,6 @@ #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> -#include <net/flow_keys.h> #include <net/codel.h> /* Fair Queue CoDel. @@ -68,15 +67,9 @@ struct fq_codel_sched_data { }; static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, - const struct sk_buff *skb) + struct sk_buff *skb) { - struct flow_keys keys; - unsigned int hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); + u32 hash = skb_get_hash_perturb(skb, q->perturbation); return reciprocal_scale(hash, q->flows_cnt); } @@ -99,7 +92,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, filter, &res); + result = tc_classify(skb, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -170,6 +163,15 @@ static unsigned int fq_codel_drop(struct Qdisc *sch) return idx; } +static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) +{ + unsigned int prev_backlog; + + prev_backlog = sch->qstats.backlog; + fq_codel_drop(sch); + return prev_backlog - sch->qstats.backlog; +} + static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -286,10 +288,26 @@ begin: static void fq_codel_reset(struct Qdisc *sch) { - struct sk_buff *skb; + struct fq_codel_sched_data *q = qdisc_priv(sch); + int i; - while ((skb = fq_codel_dequeue(sch)) != NULL) - kfree_skb(skb); + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + for (i = 0; i < q->flows_cnt; i++) { + struct fq_codel_flow *flow = q->flows + i; + + while (flow->head) { + struct sk_buff *skb = dequeue_head(flow); + + qdisc_qstats_backlog_dec(sch, skb); + kfree_skb(skb); + } + + INIT_LIST_HEAD(&flow->flowchain); + codel_vars_init(&flow->cvars); + } + memset(q->backlogs, 0, q->flows_cnt * sizeof(u32)); + sch->q.qlen = 0; } static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { @@ -299,6 +317,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -329,6 +348,12 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { + u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); + + q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); @@ -448,6 +473,11 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->flows_cnt)) goto nla_put_failure; + if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, + codel_time_to_us(q->cparams.ce_threshold))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: @@ -466,6 +496,7 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.drop_overlimit = q->drop_overlimit; st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; st.qdisc_stats.new_flow_count = q->new_flow_count; + st.qdisc_stats.ce_mark = q->cstats.ce_mark; list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; @@ -598,7 +629,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .enqueue = fq_codel_enqueue, .dequeue = fq_codel_dequeue, .peek = qdisc_peek_dequeued, - .drop = fq_codel_drop, + .drop = fq_codel_qdisc_drop, .init = fq_codel_init, .reset = fq_codel_reset, .destroy = fq_codel_destroy, diff --git a/kernel/net/sched/sch_generic.c b/kernel/net/sched/sch_generic.c index 1e346523f..47ef1b11b 100644 --- a/kernel/net/sched/sch_generic.c +++ b/kernel/net/sched/sch_generic.c @@ -416,33 +416,25 @@ struct Qdisc noop_qdisc = { }; EXPORT_SYMBOL(noop_qdisc); -static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { +static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt) +{ + /* register_qdisc() assigns a default of noop_enqueue if unset, + * but __dev_queue_xmit() treats noqueue only as such + * if this is NULL - so clear it here. */ + qdisc->enqueue = NULL; + return 0; +} + +struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, + .init = noqueue_init, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; -static struct Qdisc noqueue_qdisc; -static struct netdev_queue noqueue_netdev_queue = { - .qdisc = &noqueue_qdisc, - .qdisc_sleeping = &noqueue_qdisc, -}; - -static struct Qdisc noqueue_qdisc = { - .enqueue = NULL, - .dequeue = noop_dequeue, - .flags = TCQ_F_BUILTIN, - .ops = &noqueue_qdisc_ops, - .list = LIST_HEAD_INIT(noqueue_qdisc.list), - .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), - .dev_queue = &noqueue_netdev_queue, - .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock), -}; - - static const u8 prio2band[TC_PRIO_MAX + 1] = { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; @@ -666,8 +658,10 @@ static void qdisc_rcu_free(struct rcu_head *head) { struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); - if (qdisc_is_percpu_stats(qdisc)) + if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); + free_percpu(qdisc->cpu_qstats); + } kfree((char *) qdisc - qdisc->padded); } @@ -733,18 +727,19 @@ static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { - struct Qdisc *qdisc = &noqueue_qdisc; + struct Qdisc *qdisc; + const struct Qdisc_ops *ops = default_qdisc_ops; - if (dev->tx_queue_len) { - qdisc = qdisc_create_dflt(dev_queue, - default_qdisc_ops, TC_H_ROOT); - if (!qdisc) { - netdev_info(dev, "activation failed\n"); - return; - } - if (!netif_is_multiqueue(dev)) - qdisc->flags |= TCQ_F_ONETXQUEUE; + if (dev->priv_flags & IFF_NO_QUEUE) + ops = &noqueue_qdisc_ops; + + qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT); + if (!qdisc) { + netdev_info(dev, "activation failed\n"); + return; } + if (!netif_is_multiqueue(dev)) + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue->qdisc_sleeping = qdisc; } @@ -755,7 +750,8 @@ static void attach_default_qdiscs(struct net_device *dev) txq = netdev_get_tx_queue(dev, 0); - if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) { + if (!netif_is_multiqueue(dev) || + dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); dev->qdisc = txq->qdisc_sleeping; atomic_inc(&dev->qdisc->refcnt); @@ -779,7 +775,7 @@ static void transition_one_qdisc(struct net_device *dev, clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); - if (need_watchdog_p && new_qdisc != &noqueue_qdisc) { + if (need_watchdog_p) { dev_queue->trans_start = 0; *need_watchdog_p = 1; } diff --git a/kernel/net/sched/sch_gred.c b/kernel/net/sched/sch_gred.c index 634529e0c..80105109f 100644 --- a/kernel/net/sched/sch_gred.c +++ b/kernel/net/sched/sch_gred.c @@ -165,7 +165,8 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) * if no default DP has been configured. This * allows for DP flows to be left untouched. */ - if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len) + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= + sch->limit)) return qdisc_enqueue_tail(skb, sch); else goto drop; @@ -397,7 +398,10 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, q->DP = dp; q->prio = prio; - q->limit = ctl->limit; + if (ctl->limit > sch->limit) + q->limit = sch->limit; + else + q->limit = ctl->limit; if (q->backlog == 0) red_end_of_idle_period(&q->vars); @@ -414,6 +418,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, + [TCA_GRED_LIMIT] = { .type = NLA_U32 }, }; static int gred_change(struct Qdisc *sch, struct nlattr *opt) @@ -433,11 +438,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) if (err < 0) return err; - if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) + if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { + if (tb[TCA_GRED_LIMIT] != NULL) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); return gred_change_table_def(sch, opt); + } if (tb[TCA_GRED_PARMS] == NULL || - tb[TCA_GRED_STAB] == NULL) + tb[TCA_GRED_STAB] == NULL || + tb[TCA_GRED_LIMIT] != NULL) return -EINVAL; max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; @@ -501,6 +510,12 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) return -EINVAL; + if (tb[TCA_GRED_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); + else + sch->limit = qdisc_dev(sch)->tx_queue_len + * psched_mtu(qdisc_dev(sch)); + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } @@ -531,6 +546,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit)) + goto nla_put_failure; + parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; diff --git a/kernel/net/sched/sch_hfsc.c b/kernel/net/sched/sch_hfsc.c index e6c7416d0..b7ebe2c87 100644 --- a/kernel/net/sched/sch_hfsc.c +++ b/kernel/net/sched/sch_hfsc.c @@ -1165,7 +1165,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); - while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { + while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/kernel/net/sched/sch_hhf.c b/kernel/net/sched/sch_hhf.c index 15d3aabfe..86b04e31e 100644 --- a/kernel/net/sched/sch_hhf.c +++ b/kernel/net/sched/sch_hhf.c @@ -9,7 +9,6 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> -#include <net/flow_keys.h> #include <net/pkt_sched.h> #include <net/sock.h> @@ -176,22 +175,6 @@ static u32 hhf_time_stamp(void) return jiffies; } -static unsigned int skb_hash(const struct hhf_sched_data *q, - const struct sk_buff *skb) -{ - struct flow_keys keys; - unsigned int hash; - - if (skb->sk && skb->sk->sk_hash) - return skb->sk->sk_hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); - return hash; -} - /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, @@ -280,7 +263,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) } /* Get hashed flow-id of the skb. */ - hash = skb_hash(q, skb); + hash = skb_get_hash_perturb(skb, q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; @@ -385,6 +368,15 @@ static unsigned int hhf_drop(struct Qdisc *sch) return bucket - q->buckets; } +static unsigned int hhf_qdisc_drop(struct Qdisc *sch) +{ + unsigned int prev_backlog; + + prev_backlog = sch->qstats.backlog; + hhf_drop(sch); + return prev_backlog - sch->qstats.backlog; +} + static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); @@ -713,7 +705,7 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, - .drop = hhf_drop, + .drop = hhf_qdisc_drop, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, diff --git a/kernel/net/sched/sch_htb.c b/kernel/net/sched/sch_htb.c index f1acb0f60..15ccd7f8f 100644 --- a/kernel/net/sched/sch_htb.c +++ b/kernel/net/sched/sch_htb.c @@ -229,7 +229,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { + while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: @@ -1048,11 +1048,9 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); - else { + else q->direct_qlen = qdisc_dev(sch)->tx_queue_len; - if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ - q->direct_qlen = 2; - } + if ((q->rate2quantum = gopt->rate2quantum) < 1) q->rate2quantum = 1; q->defcls = gopt->defcls; diff --git a/kernel/net/sched/sch_ingress.c b/kernel/net/sched/sch_ingress.c index 4cdbfb856..e7c648fa9 100644 --- a/kernel/net/sched/sch_ingress.c +++ b/kernel/net/sched/sch_ingress.c @@ -12,16 +12,10 @@ #include <linux/list.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> + #include <net/netlink.h> #include <net/pkt_sched.h> - -struct ingress_qdisc_data { - struct tcf_proto __rcu *filter_list; -}; - -/* ------------------------- Class/flow operations ------------------------- */ - static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; @@ -49,57 +43,24 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct ingress_qdisc_data *p = qdisc_priv(sch); - - return &p->filter_list; -} - -/* --------------------------- Qdisc operations ---------------------------- */ + struct net_device *dev = qdisc_dev(sch); -static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct ingress_qdisc_data *p = qdisc_priv(sch); - struct tcf_result res; - struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result; - - result = tc_classify(skb, fl, &res); - - qdisc_bstats_update(sch, skb); - switch (result) { - case TC_ACT_SHOT: - result = TC_ACT_SHOT; - qdisc_qstats_drop(sch); - break; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - result = TC_ACT_STOLEN; - break; - case TC_ACT_RECLASSIFY: - case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - default: - result = TC_ACT_OK; - break; - } - - return result; + return &dev->ingress_cl_list; } -/* ------------------------------------------------------------- */ - static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { net_inc_ingress_queue(); + sch->flags |= TCQ_F_CPUSTATS; return 0; } static void ingress_destroy(struct Qdisc *sch) { - struct ingress_qdisc_data *p = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - tcf_destroy_chain(&p->filter_list); + tcf_destroy_chain(&dev->ingress_cl_list); net_dec_ingress_queue(); } @@ -110,6 +71,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; + return nla_nest_end(skb, nest); nla_put_failure: @@ -130,8 +92,6 @@ static const struct Qdisc_class_ops ingress_class_ops = { static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", - .priv_size = sizeof(struct ingress_qdisc_data), - .enqueue = ingress_enqueue, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -148,6 +108,7 @@ static void __exit ingress_module_exit(void) unregister_qdisc(&ingress_qdisc_ops); } -module_init(ingress_module_init) -module_exit(ingress_module_exit) +module_init(ingress_module_init); +module_exit(ingress_module_exit); + MODULE_LICENSE("GPL"); diff --git a/kernel/net/sched/sch_mq.c b/kernel/net/sched/sch_mq.c index f3cbaecd2..3e82f047c 100644 --- a/kernel/net/sched/sch_mq.c +++ b/kernel/net/sched/sch_mq.c @@ -63,7 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) if (qdisc == NULL) goto err; priv->qdiscs[ntx] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->flags |= TCQ_F_MQROOT; @@ -156,7 +156,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; diff --git a/kernel/net/sched/sch_mqprio.c b/kernel/net/sched/sch_mqprio.c index 3811a7454..ad70ecf57 100644 --- a/kernel/net/sched/sch_mqprio.c +++ b/kernel/net/sched/sch_mqprio.c @@ -132,7 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) goto err; } priv->qdiscs[i] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } /* If the mqprio options indicate that hardware should own @@ -209,7 +209,7 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); diff --git a/kernel/net/sched/sch_multiq.c b/kernel/net/sched/sch_multiq.c index 42dd21887..4e904ca0a 100644 --- a/kernel/net/sched/sch_multiq.c +++ b/kernel/net/sched/sch_multiq.c @@ -46,7 +46,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tc_classify(skb, fl, &res); + err = tc_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/kernel/net/sched/sch_netem.c b/kernel/net/sched/sch_netem.c index 956ead2ca..5abd1d9de 100644 --- a/kernel/net/sched/sch_netem.c +++ b/kernel/net/sched/sch_netem.c @@ -440,9 +440,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { struct Qdisc *rootq = qdisc_root(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ - q->duplicate = 0; - qdisc_enqueue_root(skb2, rootq); + q->duplicate = 0; + rootq->enqueue(skb2, rootq); q->duplicate = dupsave; } diff --git a/kernel/net/sched/sch_plug.c b/kernel/net/sched/sch_plug.c index 89f8fcf73..5abfe4467 100644 --- a/kernel/net/sched/sch_plug.c +++ b/kernel/net/sched/sch_plug.c @@ -130,12 +130,8 @@ static int plug_init(struct Qdisc *sch, struct nlattr *opt) q->unplug_indefinite = false; if (opt == NULL) { - /* We will set a default limit of 100 pkts (~150kB) - * in case tx_queue_len is not available. The - * default value is completely arbitrary. - */ - u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; - q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); + q->limit = qdisc_dev(sch)->tx_queue_len + * psched_mtu(qdisc_dev(sch)); } else { struct tc_plug_qopt *ctl = nla_data(opt); @@ -216,6 +212,7 @@ static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .peek = qdisc_peek_head, .init = plug_init, .change = plug_change, + .reset = qdisc_reset_queue, .owner = THIS_MODULE, }; diff --git a/kernel/net/sched/sch_prio.c b/kernel/net/sched/sch_prio.c index 8e5cd34aa..ba6487f27 100644 --- a/kernel/net/sched/sch_prio.c +++ b/kernel/net/sched/sch_prio.c @@ -42,7 +42,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tc_classify(skb, fl, &res); + err = tc_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/kernel/net/sched/sch_qfq.c b/kernel/net/sched/sch_qfq.c index 3ec7e88a4..3dc3a6e56 100644 --- a/kernel/net/sched/sch_qfq.c +++ b/kernel/net/sched/sch_qfq.c @@ -186,7 +186,6 @@ struct qfq_sched { u64 oldV, V; /* Precise virtual times. */ struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ - u32 num_active_agg; /* Num. of active aggregates */ u32 wsum; /* weight sum */ u32 iwsum; /* inverse weight sum */ @@ -339,8 +338,7 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - if (!hlist_unhashed(&agg->nonfull_next)) - hlist_del_init(&agg->nonfull_next); + hlist_del_init(&agg->nonfull_next); q->wsum -= agg->class_weight; if (q->wsum != 0) q->iwsum = ONE_FP / q->wsum; @@ -719,7 +717,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/kernel/net/sched/sch_sfb.c b/kernel/net/sched/sch_sfb.c index 5819dd826..5bbb6332e 100644 --- a/kernel/net/sched/sch_sfb.c +++ b/kernel/net/sched/sch_sfb.c @@ -26,7 +26,6 @@ #include <net/ip.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> -#include <net/flow_keys.h> /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) @@ -259,7 +258,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, struct tcf_result res; int result; - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -285,9 +284,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) int i; u32 p_min = ~0; u32 minqlen = ~0; - u32 r, slot, salt, sfbhash; + u32 r, sfbhash; + u32 slot = q->slot; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { qdisc_qstats_overlimit(sch); @@ -309,22 +308,17 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) fl = rcu_dereference_bh(q->filter_list); if (fl) { + u32 salt; + /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; - keys.src = salt; - keys.dst = 0; - keys.ports = 0; + sfbhash = jhash_1word(salt, q->bins[slot].perturbation); } else { - skb_flow_dissect(skb, &keys); + sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation); } - slot = q->slot; - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -356,10 +350,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); + sfbhash = skb_get_hash_perturb(skb, + q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -510,7 +502,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) limit = ctl->limit; if (limit == 0) - limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); + limit = qdisc_dev(sch)->tx_queue_len; child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); if (IS_ERR(child)) diff --git a/kernel/net/sched/sch_sfq.c b/kernel/net/sched/sch_sfq.c index b877140be..3abab534e 100644 --- a/kernel/net/sched/sch_sfq.c +++ b/kernel/net/sched/sch_sfq.c @@ -23,7 +23,6 @@ #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> -#include <net/flow_keys.h> #include <net/red.h> @@ -156,30 +155,10 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index return &q->dep[val - SFQ_MAX_FLOWS]; } -/* - * In order to be able to quickly rehash our queue when timer changes - * q->perturbation, we store flow_keys in skb->cb[] - */ -struct sfq_skb_cb { - struct flow_keys keys; -}; - -static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) -{ - qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); - return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; -} - static unsigned int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { - const struct flow_keys *keys = &sfq_skb_cb(skb)->keys; - unsigned int hash; - - hash = jhash_3words((__force u32)keys->dst, - (__force u32)keys->src ^ keys->ip_proto, - (__force u32)keys->ports, q->perturbation); - return hash & (q->divisor - 1); + return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -196,13 +175,11 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return TC_H_MIN(skb->priority); fl = rcu_dereference_bh(q->filter_list); - if (!fl) { - skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); + if (!fl) return sfq_hash(q, skb) + 1; - } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -329,10 +306,10 @@ drop: len = qdisc_pkt_len(skb); slot->backlog -= len; sfq_dec(q, x); - kfree_skb(skb); sch->q.qlen--; qdisc_qstats_drop(sch); qdisc_qstats_backlog_dec(sch, skb); + kfree_skb(skb); return len; } |